├── unitspeech ├── textlesslib │ ├── pytest.ini │ ├── CHANGELOG.md │ ├── examples │ │ ├── __init__.py │ │ ├── gslm │ │ │ ├── __init__.py │ │ │ ├── README.md │ │ │ ├── sampler.py │ │ │ └── sample.py │ │ ├── resynthesis │ │ │ ├── README.md │ │ │ └── resynth.py │ │ └── speaker_probing │ │ │ ├── README.md │ │ │ ├── probes.py │ │ │ └── train.py │ ├── textless │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── collater_utils.py │ │ │ ├── kmeans_quantizer.py │ │ │ ├── hubert_feature_reader.py │ │ │ ├── f0_preprocess.py │ │ │ └── cpc_feature_reader.py │ │ ├── vocoders │ │ │ └── tacotron2 │ │ │ │ ├── tts_data.py │ │ │ │ ├── symbols.py │ │ │ │ ├── __init__.py │ │ │ │ ├── cmudict.py │ │ │ │ ├── waveglow_denoiser.py │ │ │ │ ├── numbers.py │ │ │ │ ├── cleaners.py │ │ │ │ ├── text.py │ │ │ │ ├── audio_processing.py │ │ │ │ ├── vocoder.py │ │ │ │ ├── layers.py │ │ │ │ ├── stft.py │ │ │ │ └── utils.py │ │ ├── __init__.py │ │ └── checkpoint_manager │ │ │ └── manager.py │ ├── tools │ │ └── distributed_transcribe │ │ │ ├── __init__.py │ │ │ ├── single.sh │ │ │ ├── local.sh │ │ │ ├── slurm.sbatch │ │ │ ├── data_handler.py │ │ │ ├── distributed.py │ │ │ ├── README.md │ │ │ └── transcribe.py │ ├── requirements.txt │ ├── CITATION.bib │ ├── setup.cfg │ ├── setup.py │ ├── LICENSE │ ├── tests │ │ ├── test_checkpoint_manager.py │ │ ├── test_quantized_dataset.py │ │ └── test_model_handling.py │ ├── CONTRIBUTING.md │ ├── CODE_OF_CONDUCT.md │ └── README.md ├── vocoder │ ├── alias_free_torch │ │ ├── __init__.py │ │ ├── act.py │ │ ├── resample.py │ │ └── filter.py │ ├── env.py │ ├── incl_licenses │ │ ├── LICENSE_5 │ │ ├── LICENSE_1 │ │ ├── LICENSE_2 │ │ └── LICENSE_4 │ ├── LICENSE │ ├── xutils.py │ ├── README.md │ ├── activations.py │ └── meldataset.py ├── text │ ├── symbols.py │ ├── __init__.py │ ├── LICENSE │ └── cleaners.py ├── checkpoints │ ├── finetune.json │ ├── voice-conversion.json │ └── text-to-speech.json ├── base.py ├── duration_predictor.py ├── speaker_encoder │ └── utils.py └── util.py ├── .gitignore ├── setup.py ├── scripts ├── voice_conversion.py └── text_to_speech.py └── README.md /unitspeech/textlesslib/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --doctest-modules -------------------------------------------------------------------------------- /unitspeech/textlesslib/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | ## v0.1.0 3 | 4 | Initial version -------------------------------------------------------------------------------- /unitspeech/textlesslib/examples/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | 3 | .ipynb_checkpoints 4 | notebooks/.ipynb_checkpoints 5 | 6 | unitspeech/checkpoints 7 | unitspeech/outputs 8 | unitspeech/speaker_encoder/checkpts 9 | unitspeech/vocoder/checkpts 10 | 11 | unitspeech.egg-info -------------------------------------------------------------------------------- /unitspeech/textlesslib/examples/gslm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/tools/distributed_transcribe/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.1.0 2 | torchaudio 3 | pytest 4 | AMFM_decompy 5 | librosa 6 | threadpoolctl==3.0.0 7 | numpy==1.22.0 8 | numba==0.53.0 9 | joblib 10 | scikit-learn 11 | npy-append-array 12 | unidecode 13 | inflect 14 | -------------------------------------------------------------------------------- /unitspeech/vocoder/alias_free_torch/__init__.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 
3 | 4 | from .filter import * 5 | from .resample import * 6 | from .act import * -------------------------------------------------------------------------------- /unitspeech/textlesslib/CITATION.bib: -------------------------------------------------------------------------------- 1 | @article{Kharitonov2022, 2 | title={textless-lib: a Library for Textless Spoken Language Processing}, 3 | author={Eugene Kharitonov and Jade Copet and Kushal Lakhotia and Tu Anh Nguyen and Paden Tomasello and Ann Lee and Ali Elkahky and Wei-Ning Hsu and Abdelrahman Mohamed and Emmanuel Dupoux and Yossi Adi}, 4 | year={2022}, 5 | eprint={2202.07359}, 6 | archivePrefix={arXiv}, 7 | primaryClass={cs.CL} 8 | } -------------------------------------------------------------------------------- /unitspeech/textlesslib/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | license_file = LICENSE 3 | 4 | 5 | [flake8] 6 | exclude = 7 | .tox, 8 | .git, 9 | __pycache__, 10 | build, 11 | dist, 12 | *.md, 13 | *.pyc, 14 | *.egg-info, 15 | .cache, 16 | .eggs, 17 | max-line-length = 120 18 | 19 | 20 | [isort] 21 | multi_line_output = 3 22 | include_trailing_comma = True 23 | force_grid_wrap = 0 24 | use_parentheses = True 25 | ensure_newline_before_comments = True 26 | line_length = 88 27 | 28 | 29 | [black] 30 | -------------------------------------------------------------------------------- /unitspeech/vocoder/env.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import os 4 | import shutil 5 | 6 | 7 | class AttrDict(dict): 8 | def __init__(self, *args, **kwargs): 9 | super(AttrDict, self).__init__(*args, **kwargs) 10 | self.__dict__ = self 11 | 12 | 13 | def build_env(config, config_name, path): 14 | t_path = os.path.join(path, config_name) 15 | if config != t_path: 16 | os.makedirs(path, exist_ok=True) 17 | shutil.copyfile(config, os.path.join(path, config_name)) 18 | -------------------------------------------------------------------------------- /unitspeech/text/symbols.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Defines the set of symbols used in text input to the model. 
5 | ''' 6 | _pad = '_' 7 | _punctuation = ';:,.!?¡¿—…"«»“” ' 8 | _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' 9 | _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ̃" 10 | 11 | 12 | # Export all symbols: 13 | symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) 14 | 15 | # Special symbol ids 16 | SPACE_ID = symbols.index(" ") 17 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name="unitspeech", 5 | py_modules=["unitspeech"], 6 | install_requires=[ 7 | "amfm_decompy==1.0.11", 8 | "einops==0.6.1", 9 | "fairseq==0.12.2", 10 | "inflect==7.0.0", 11 | "joblib==1.2.0", 12 | "librosa==0.10.0.post2", 13 | "matplotlib==3.7.1", 14 | "packaging==23.1", 15 | "phonemizer==3.2.1", 16 | "torch==2.0.1", 17 | "torchvision==0.15.2", 18 | "torchaudio==2.0.2", 19 | "transformers==4.30.2", 20 | "unidecode==1.3.6", 21 | ], 22 | ) 23 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/tools/distributed_transcribe/single.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | 7 | DENSE_NAME=hubert-base-ls960 8 | QUANTIZER_NAME=kmeans 9 | VOCAB_SIZE=50 10 | MANIFEST=manifest.tsv 11 | TRANSCRIPT=transcript 12 | 13 | python transcribe.py \ 14 | --manifest $MANIFEST \ 15 | --output=$TRANSCRIPT \ 16 | --dense_model=$DENSE_NAME \ 17 | --quantizer_model=$QUANTIZER_NAME \ 18 | --vocab_size=$VOCAB_SIZE \ 19 | --durations --deduplicate 20 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/tools/distributed_transcribe/local.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | WORKERS_PER_NODE=4 7 | DENSE_NAME=hubert-base-ls960 8 | QUANTIZER_NAME=kmeans 9 | VOCAB_SIZE=50 10 | MANIFEST=manifest.tsv 11 | TRANSCRIPT=transcript 12 | 13 | python -m torch.distributed.run --nproc_per_node=$WORKERS_PER_NODE transcribe.py \ 14 | --manifest $MANIFEST \ 15 | --output=$TRANSCRIPT \ 16 | --dense_model=$DENSE_NAME \ 17 | --quantizer_model=$QUANTIZER_NAME \ 18 | --vocab_size=$VOCAB_SIZE \ 19 | --durations --deduplicate 20 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | 6 | from setuptools import find_packages, setup 7 | 8 | with open("requirements.txt") as f: 9 | requirements = f.read().splitlines() 10 | 11 | setup( 12 | name="textless", 13 | version="0.1.0", 14 | url="https://github.com/facebookresearch/textlesslib", 15 | author="Textless NLP team at Facebook AI Research", 16 | author_email="kharitonov@fb.com", 17 | description="Tools for Textless NLP Research", 18 | packages=find_packages(), 19 | install_requires=requirements, 20 | ) 21 | -------------------------------------------------------------------------------- /unitspeech/checkpoints/finetune.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "n_units": 1000, 4 | "n_feats": 80, 5 | "n_fft": 1024, 6 | "hop_length": 256, 7 | "win_length": 1024, 8 | "sampling_rate": 22050, 9 | "mel_fmin": 0.0, 10 | "mel_fmax": 8000.0 11 | }, 12 | "encoder": { 13 | "n_channels": 192, 14 | "filter_channels": 768, 15 | "n_layers": 6, 16 | "kernel_size": 3, 17 | "p_dropout": 0.1, 18 | "n_heads": 2, 19 | "window_size": 4 20 | }, 21 | "decoder": { 22 | "dim": 128, 23 | "dim_mults": [1, 2, 4, 8], 24 | "pe_scale": 1000, 25 | "beta_min": 0.05, 26 | "beta_max": 20.0, 27 | "spk_emb_dim": 256 28 | }, 29 | "train": { 30 | "out_size_second": 2, 31 | "vocoder_config_path": "unitspeech/vocoder/checkpts/bigvgan-config.json", 32 | "vocoder_ckpt_path": "unitspeech/vocoder/checkpts/bigvgan.pt" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /unitspeech/checkpoints/voice-conversion.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "n_feats": 80, 4 | "n_fft": 1024, 5 | "hop_length": 256, 6 | "win_length": 1024, 7 | "sampling_rate": 22050, 8 | "mel_fmin": 0.0, 9 | "mel_fmax": 8000.0 10 | }, 11 | "encoder": { 12 | "n_channels": 192, 13 | "filter_channels": 768, 14 | "n_layers": 6, 15 | "kernel_size": 3, 16 | "p_dropout": 0.1, 17 | "n_heads": 2, 18 | "window_size": 4, 19 | "n_contentvec": 768 20 | }, 21 | "decoder": { 22 | "dim": 128, 23 | "dim_mults": [1, 2, 4, 8], 24 | "pe_scale": 1000, 25 | "beta_min": 0.05, 26 | "beta_max": 20.0, 27 | "spk_emb_dim": 256 28 | }, 29 | "train": { 30 | "out_size_second": 2, 31 | "vocoder_config_path": "unitspeech/vocoder/checkpts/bigvgan-config.json", 32 | "vocoder_ckpt_path": "unitspeech/vocoder/checkpts/bigvgan.pt" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /unitspeech/vocoder/alias_free_torch/act.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 
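A minimal sketch (not a file in this repository) of loading one of the JSON configs above; it assumes the interpreter is started from the repository root and reuses the `AttrDict` helper from `unitspeech/vocoder/env.py` shown earlier. Only the top-level sections become attributes; nested sections remain plain dicts.

```python
# Sketch only: load unitspeech/checkpoints/finetune.json into an AttrDict.
# Assumes the current working directory is the repository root.
import json

from unitspeech.vocoder.env import AttrDict

with open("unitspeech/checkpoints/finetune.json") as f:
    config = AttrDict(json.load(f))

print(config.data["sampling_rate"])  # nested sections stay plain dicts -> 22050
print(config.decoder["dim_mults"])   # -> [1, 2, 4, 8]
```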
3 | 4 | import torch.nn as nn 5 | from .resample import UpSample1d, DownSample1d 6 | 7 | 8 | class Activation1d(nn.Module): 9 | def __init__(self, 10 | activation, 11 | up_ratio: int = 2, 12 | down_ratio: int = 2, 13 | up_kernel_size: int = 12, 14 | down_kernel_size: int = 12): 15 | super().__init__() 16 | self.up_ratio = up_ratio 17 | self.down_ratio = down_ratio 18 | self.act = activation 19 | self.upsample = UpSample1d(up_ratio, up_kernel_size) 20 | self.downsample = DownSample1d(down_ratio, down_kernel_size) 21 | 22 | # x: [B,C,T] 23 | def forward(self, x): 24 | x = self.upsample(x) 25 | x = self.act(x) 26 | x = self.downsample(x) 27 | 28 | return x -------------------------------------------------------------------------------- /unitspeech/base.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/huawei-noah/Speech-Backbones/tree/main/Grad-TTS """ 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | class BaseModule(torch.nn.Module): 8 | def __init__(self): 9 | super(BaseModule, self).__init__() 10 | 11 | @property 12 | def nparams(self): 13 | """ 14 | Returns number of trainable parameters of the module. 15 | """ 16 | num_params = 0 17 | for name, param in self.named_parameters(): 18 | if param.requires_grad: 19 | num_params += np.prod(param.detach().cpu().numpy().shape) 20 | return num_params 21 | 22 | 23 | def relocate_input(self, x: list): 24 | """ 25 | Relocates provided tensors to the same device set for the module. 26 | """ 27 | device = next(self.parameters()).device 28 | for i in range(len(x)): 29 | if isinstance(x[i], torch.Tensor) and x[i].device != device: 30 | x[i] = x[i].to(device) 31 | return x 32 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/examples/resynthesis/README.md: -------------------------------------------------------------------------------- 1 | # Discrete Resynthesis example 2 | 3 | In `resynth.py` we showcase a simple demonstration of the audio resynthesis done via HuBERT-based discrete pseudo-units. The code closely 4 | follows the [unit2speech module](https://github.com/pytorch/fairseq/tree/main/examples/textless_nlp/gslm/unit2speech) of GSLM. 5 | 6 | # How to run 7 | Below is an example of running the script: 8 | ```bash 9 | python resynth.py --input test_input.wav --output=test_output.wav --vocab_size=100 --decoder_steps=500 10 | ``` 11 | 12 | `resynth.py` supports the following command-line arguments: 13 | * `--dense_model_name`: name of the dense representation model to be used (supported: `hubert-base-ls960` and `cpc-big-ll6k`); 14 | * `--input`: the input audio file (must have a sample rate of 16 kHz); 15 | * `--output`: the output file name; 16 | * `--vocab_size`: the size of the quantization vocabulary to be used (one of 50, 100, 200); 17 | * `--decoder_steps`: determines the maximal duration of the produced audio.
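Besides the CLI above, the encoding half can be sketched programmatically. The snippet below is illustrative rather than a reproduction of `resynth.py`; it only uses calls that appear in the library's own tests elsewhere in this tree (`SpeechEncoder.by_name` and invoking the encoder on a 1-D 16 kHz waveform), and `test_input.wav` is the same placeholder file name as in the command above.

```python
# Illustrative sketch, not the actual resynth.py: extract discrete units with
# the vendored textless-lib, mirroring usage from tests/test_model_handling.py
# and tests/test_quantized_dataset.py.
import torchaudio

from unitspeech.textlesslib.textless.data.speech_encoder import SpeechEncoder

encoder = SpeechEncoder.by_name(
    dense_model_name="hubert-base-ls960",
    quantizer_model_name="kmeans",
    vocab_size=100,
    need_f0=False,
    deduplicate=True,
    f0_normalizer=None,
    f0_quantizer=None,
)

waveform, sample_rate = torchaudio.load("test_input.wav")  # expected to be 16 kHz
encoded = encoder(waveform.squeeze(0))                     # 1-D waveform in

print(encoded["units"])      # deduplicated pseudo-unit ids
print(encoded["durations"])  # run-lengths matching the units
```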
18 | -------------------------------------------------------------------------------- /unitspeech/checkpoints/text-to-speech.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "n_feats": 80, 4 | "n_fft": 1024, 5 | "hop_length": 256, 6 | "win_length": 1024, 7 | "sampling_rate": 22050, 8 | "mel_fmin": 0.0, 9 | "mel_fmax": 8000.0 10 | }, 11 | "encoder": { 12 | "n_channels": 192, 13 | "filter_channels": 768, 14 | "n_layers": 6, 15 | "kernel_size": 3, 16 | "p_dropout": 0.1, 17 | "n_heads": 2, 18 | "window_size": 4 19 | }, 20 | "duration_predictor": { 21 | "in_channels": 192, 22 | "filter_channels": 256, 23 | "kernel_size": 3, 24 | "p_dropout": 0.1, 25 | "spk_emb_dim": 256 26 | }, 27 | "decoder": { 28 | "dim": 128, 29 | "dim_mults": [1, 2, 4, 8], 30 | "pe_scale": 1000, 31 | "beta_min": 0.05, 32 | "beta_max": 20.0, 33 | "spk_emb_dim": 256 34 | }, 35 | "train": { 36 | "out_size_second": 2, 37 | "vocoder_config_path": "unitspeech/vocoder/checkpts/bigvgan-config.json", 38 | "vocoder_ckpt_path": "unitspeech/vocoder/checkpts/bigvgan.pt" 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/tools/distributed_transcribe/slurm.sbatch: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | 7 | ## Set output and job name 8 | #SBATCH --job-name=transcribe 9 | #SBATCH --output=log.out 10 | #SBATCH --error=log.err 11 | ## partition name 12 | #SBATCH --partition=devlab 13 | 14 | ## number of nodes, tasks per nodes, etc 15 | #SBATCH --nodes=1 16 | #SBATCH --gpus-per-node=8 17 | #SBATCH --ntasks-per-node=32 18 | #SBATCH --time 4320 19 | #SBATCH --mem 400G 20 | #SBATCH --gres=gpu:8 21 | #SBATCH --cpus-per-task=2 22 | 23 | DENSE_NAME=hubert-base-ls960 24 | QUANTIZER_NAME=kmeans 25 | VOCAB_SIZE=50 26 | MANIFEST=manifest.tsv 27 | TRANSCRIPT=transcript 28 | 29 | srun -u python transcribe.py \ 30 | --manifest $MANIFEST \ 31 | --output=$TRANSCRIPT \ 32 | --dense_model=$DENSE_NAME \ 33 | --quantizer_model=$QUANTIZER_NAME \ 34 | --vocab_size=$VOCAB_SIZE \ 35 | --durations --deduplicate 36 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/tools/distributed_transcribe/data_handler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
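For context on the `MANIFEST=manifest.tsv` file consumed by the scripts above and read by the `ManifestDataset` below: the first line holds the root directory, and every following line starts with an audio path relative to that root (only the first whitespace-separated field is used, and the audio must be sampled at 16 kHz). A hypothetical manifest with made-up paths might look like:

```
/datasets/LibriSpeech/dev-clean
84/121123/84-121123-0000.flac   93680
84/121123/84-121123-0001.flac   77360
```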
5 | 6 | import torchaudio 7 | 8 | import pathlib 9 | 10 | import logging 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class ManifestDataset: 16 | def __init__(self, manifest): 17 | with open(manifest, "r") as fin: 18 | self.root = pathlib.Path(fin.readline().strip()) 19 | self.files = [x.strip().split()[0] for x in fin.readlines()] 20 | 21 | logger.info( 22 | f"Init dataset with root in {self.root}, containing {len(self.files)} files" 23 | ) 24 | 25 | def __len__(self): 26 | return len(self.files) 27 | 28 | def __getitem__(self, k): 29 | path = self.root / self.files[k] 30 | data, sr = torchaudio.load(str(path)) 31 | 32 | assert sr == 16_000 33 | return data.squeeze(0), path.with_suffix("").name 34 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/examples/gslm/README.md: -------------------------------------------------------------------------------- 1 | # Generative Spoken Language Modeling pipeline 2 | 3 | ## Retrieve a language model 4 | 5 | Assume you want to experiment with a pre-trained language model that is trained on HuBERT representations, quantized with a codebook of size 100. 6 | Firstly, you need to download and unpack the model itself: 7 | ```bash 8 | mkdir LM/ 9 | wget https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/lm_km100/hubert100_lm.tgz -O LM/hubert100_lm.tgz 10 | cd LM/ && tar -xvf hubert100_lm.tgz 11 | ``` 12 | (other checkpoints can be found in the [Textless NLP GSLM release](https://github.com/pytorch/fairseq/tree/main/examples/textless_nlp/gslm/ulm).) 13 | 14 | ## Run Speech Continuation on a file 15 | To run the speech continuation pipeline with the previously downloaded models, you can use the following command: 16 | ```bash 17 | python sample.py \ 18 | --language-model-data-dir=LM/hubert100_lm \ 19 | --input-file 174-84280-0004.flac \ 20 | --output-file output_new.wav \ 21 | --prompt-duration-sec=3 \ 22 | --temperature=0.7 \ 23 | --vocab-size=100 24 | ``` 25 | -------------------------------------------------------------------------------- /unitspeech/text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | from unitspeech.text import cleaners 3 | from unitspeech.text.symbols import symbols 4 | 5 | 6 | # Mappings from symbol to numeric ID and vice versa: 7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 9 | 10 | 11 | def phonemize(text, global_phonemizer): 12 | text = cleaners.convert_to_ascii(text) 13 | text = cleaners.lowercase(text) 14 | text = cleaners.expand_abbreviations(text) 15 | phonemes = global_phonemizer.phonemize([text], strip=True)[0] 16 | phonemes = cleaners.collapse_whitespace(phonemes) 17 | return phonemes 18 | 19 | 20 | def cleaned_text_to_sequence(cleaned_text): 21 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
22 | Args: 23 | text: string to convert to a sequence 24 | Returns: 25 | List of integers corresponding to the symbols in the text 26 | ''' 27 | sequence = [_symbol_to_id[symbol] for symbol in cleaned_text] 28 | return sequence 29 | -------------------------------------------------------------------------------- /unitspeech/text/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /unitspeech/vocoder/incl_licenses/LICENSE_5: -------------------------------------------------------------------------------- 1 | Copyright 2020 Alexandre Défossez 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 4 | associated documentation files (the "Software"), to deal in the Software without restriction, 5 | including without limitation the rights to use, copy, modify, merge, publish, distribute, 6 | sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is 7 | furnished to do so, subject to the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be included in all copies or 10 | substantial portions of the Software. 11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT 13 | NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 14 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 15 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /unitspeech/vocoder/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 NVIDIA CORPORATION. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /unitspeech/vocoder/incl_licenses/LICENSE_1: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jungil Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /unitspeech/vocoder/incl_licenses/LICENSE_2: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Edward Dixon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /unitspeech/textlesslib/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Facebook, Inc. and its affiliates. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/tests/test_checkpoint_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | 6 | from unitspeech.textlesslib.textless.checkpoint_manager import CHECKPOINT_MANAGER 7 | import tempfile 8 | import pathlib 9 | import pytest 10 | 11 | 12 | def test_checkpoint_manager(): 13 | codes = CHECKPOINT_MANAGER.get_by_name( 14 | "hubert-base-ls960-kmeans-50-tacotron-codes", download_if_needed=True 15 | ) 16 | assert pathlib.Path(codes).exists() 17 | 18 | with pytest.raises(KeyError): 19 | codes = CHECKPOINT_MANAGER.get_by_name("123", download_if_needed=True) 20 | 21 | 22 | def test_changing_root(): 23 | name = "hubert-base-ls960-kmeans-50-tacotron-codes" 24 | 25 | with tempfile.TemporaryDirectory() as tmpdir: 26 | CHECKPOINT_MANAGER.set_root(tmpdir) 27 | with pytest.raises(FileNotFoundError): 28 | CHECKPOINT_MANAGER.get_by_name(name, download_if_needed=False) 29 | 30 | CHECKPOINT_MANAGER.get_by_name(name, download_if_needed=True) 31 | assert (pathlib.Path(tmpdir) / CHECKPOINT_MANAGER.storage[name].fname).exists() 32 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/examples/speaker_probing/README.md: -------------------------------------------------------------------------------- 1 | # Speaker probing example 2 | 3 | This directory contains a short example that illustrates the speaker probing task. Specifically, we investigate whether an anonymised speaker id can 4 | be predicted based on their utterances represented as (potentially quantized) HuBERT representations. This example uses LibriSpeech dev-clean as a dataset. 5 | 6 | ## Running the example 7 | To train a simple speaker classifier and get its accuracy on validation data, it is enough to run a single command: 8 | ```python train.py --model_type=discrete --seed=0 --epochs=5 --vocab_size=50``` 9 | This will train a small Transformer model on HuBERT representations, quantized into a vocabulary of 50 pseudo-units. 10 | 11 | ## Command-line arguments 12 | * `--dense_model_name`: dense model to be used. Must be either `hubert-base-ls960` or `cpc-big-ll6k`; 13 | * `--seed`: sets the random seed; 14 | * `--epochs`: sets the number of training epochs; 15 | * `--vocab_size`: sets the size of the codebook. The example uses pre-trained codebooks and supports vocabulary sizes of 50, 100, and 200; 16 | * `--model_type`: selects the model/representation to be used. Must be one of [`discrete`, `continuous`, `baseline` (default)]. 17 | 18 | 19 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/tests/test_quantized_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree.
5 | 6 | import pathlib 7 | from unitspeech.textlesslib.textless.data.quantized_datasets import QuantizedLibriSpeech 8 | from unitspeech.textlesslib.textless.data.speech_encoder import SpeechEncoder 9 | 10 | 11 | def test_quantized_librispeech(): 12 | url = "dev-clean" 13 | root = "./data" 14 | 15 | pathlib.Path(root).mkdir(exist_ok=True) 16 | 17 | dense_model_name = "hubert-base-ls960" 18 | quantizer_name = "kmeans" 19 | vocab_size = 100 20 | 21 | encoder = SpeechEncoder.by_name( 22 | dense_model_name=dense_model_name, 23 | quantizer_model_name=quantizer_name, 24 | vocab_size=vocab_size, 25 | need_f0=True, 26 | deduplicate=True, 27 | f0_normalizer=None, 28 | f0_quantizer=None, 29 | ) 30 | 31 | quantized_dataset = QuantizedLibriSpeech( 32 | root=root, speech_encoder=encoder, url=url, download=True 33 | ) 34 | item = quantized_dataset[0] 35 | 36 | # checking a few invariants 37 | assert item["units"].size(0) == item["durations"].size(0) == item["f0"].size(0) 38 | assert item["durations"].sum().item() == item["dense"].size(0) 39 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/data/collater_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | 7 | import torch 8 | 9 | 10 | def collate_tensors(stream, pad): 11 | """ 12 | >>> tensors = [torch.tensor(x) for x in [[1,2,3], [1]]] 13 | >>> pad = 0 14 | >>> collate_tensors(tensors, pad) 15 | tensor([[1, 2, 3], 16 | [1, 0, 0]]) 17 | """ 18 | assert len(stream) > 0 19 | 20 | length = max(v.size(0) for v in stream) 21 | n_samples = len(stream) 22 | 23 | collated = stream[0].new_full((n_samples, length), pad) 24 | 25 | for i, v in enumerate(stream): 26 | collated[i, : v.size(0)] = v 27 | 28 | return collated 29 | 30 | 31 | def wrap_bos_eos(units, durations, f0, dense, bos, eos): 32 | assert units.size(0) == durations.size(0) == dense.size(0) 33 | if f0 is not None: 34 | assert units.size(0) == f0.size(0) 35 | 36 | units = torch.cat([bos, units, eos]) 37 | z = torch.zeros_like(durations[0:1]) 38 | durations = torch.cat([z, durations, z]) 39 | if f0 is not None: 40 | z = torch.zeros_like(f0[0:1]) 41 | f0 = torch.cat([z, f0, z]) 42 | z = torch.zeros_like(dense[0:1, :]) 43 | dense = torch.cat([z, dense, z]) 44 | 45 | return units, durations, f0, dense 46 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to textless-lib 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Our Development Process 6 | We develop on GitHub. 7 | 8 | ## Pull Requests 9 | We actively welcome your pull requests. 10 | 11 | 1. Fork the repo and create your branch from `main`. 12 | 2. If you've added code that should be tested, add tests. 13 | 3. If you've changed APIs, update the documentation. 14 | 4. Ensure the test suite passes and add new tests if this is relevant. 15 | 5. Run `black` code-style formatter. 16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 17 | 18 | ## Contributor License Agreement ("CLA") 19 | In order to accept your pull request, we need you to submit a CLA. 
You only need 20 | to do this once to work on any of Meta's open source projects. 21 | 22 | Complete your CLA here: 23 | 24 | ## Issues 25 | We use GitHub issues to track public bugs. Please ensure your description is 26 | clear and has sufficient instructions to be able to reproduce the issue. 27 | 28 | Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe 29 | disclosure of security bugs. In those cases, please go through the process 30 | outlined on that page and do not file a public issue. 31 | 32 | ## Coding Style 33 | We use `black` to enforce a uniform codestyle. 34 | 35 | ## License 36 | By contributing to textless-lib, you agree that your contributions will be licensed 37 | under the LICENSE file in the root directory of this source tree. -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/data/kmeans_quantizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | import joblib 8 | import warnings 9 | 10 | 11 | class KMeansQuantizer(torch.nn.Module): 12 | def __init__(self, checkpoint_path): 13 | super().__init__() 14 | self.register_buffer("_float_tensor", torch.tensor([0], dtype=torch.float)) 15 | self.kmeans_model = self.load_kmeans_model(checkpoint_path) 16 | 17 | def forward(self, x): 18 | return torch.from_numpy(self.kmeans_model.predict(x.cpu().numpy())).to( 19 | self.device 20 | ) 21 | 22 | @property 23 | def vocab_size(self) -> int: 24 | return self.kmeans_model.n_clusters 25 | 26 | @property 27 | def device(self): 28 | return self._float_tensor.device 29 | 30 | @staticmethod 31 | def load_kmeans_model(checkpoint_path: str): 32 | with open(checkpoint_path, "rb") as fd: 33 | with warnings.catch_warnings(): 34 | # produces lots of version warnings which can be annoying when we have many workers 35 | warnings.simplefilter("ignore") 36 | kmeans_model = joblib.load(fd) 37 | # some of the GSLM checkpoints (CPC) were saved under a different scikit version 38 | if not hasattr(kmeans_model, "_n_threads"): 39 | kmeans_model._n_threads = 40 40 | 41 | kmeans_model.verbose = False 42 | return kmeans_model 43 | -------------------------------------------------------------------------------- /unitspeech/vocoder/incl_licenses/LICENSE_4: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Seungwon Park 박승원 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /unitspeech/vocoder/xutils.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import glob 4 | import os 5 | import matplotlib 6 | import torch 7 | from torch.nn.utils import weight_norm 8 | matplotlib.use("Agg") 9 | import matplotlib.pylab as plt 10 | 11 | 12 | def plot_spectrogram(spectrogram): 13 | fig, ax = plt.subplots(figsize=(10, 2)) 14 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", 15 | interpolation='none') 16 | plt.colorbar(im, ax=ax) 17 | 18 | fig.canvas.draw() 19 | plt.close() 20 | 21 | return fig 22 | 23 | 24 | def init_weights(m, mean=0.0, std=0.01): 25 | classname = m.__class__.__name__ 26 | if classname.find("Conv") != -1: 27 | m.weight.data.normal_(mean, std) 28 | 29 | 30 | def apply_weight_norm(m): 31 | classname = m.__class__.__name__ 32 | if classname.find("Conv") != -1: 33 | weight_norm(m) 34 | 35 | 36 | def get_padding(kernel_size, dilation=1): 37 | return int((kernel_size*dilation - dilation)/2) 38 | 39 | 40 | def load_checkpoint(filepath, device): 41 | assert os.path.isfile(filepath) 42 | print("Loading '{}'".format(filepath)) 43 | checkpoint_dict = torch.load(filepath, map_location=device) 44 | print("Complete.") 45 | return checkpoint_dict 46 | 47 | 48 | def save_checkpoint(filepath, obj): 49 | print("Saving checkpoint to {}".format(filepath)) 50 | torch.save(obj, filepath) 51 | print("Complete.") 52 | 53 | 54 | def scan_checkpoint(cp_dir, prefix): 55 | pattern = os.path.join(cp_dir, prefix + '????????') 56 | cp_list = glob.glob(pattern) 57 | if len(cp_list) == 0: 58 | return None 59 | return sorted(cp_list)[-1] 60 | 61 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/tts_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
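The helpers in `unitspeech/vocoder/xutils.py` above are thin wrappers around `glob` and `torch.load`/`torch.save`. A minimal sketch of combining them to resume from the newest checkpoint follows; the `g_` filename prefix and the `checkpoints/` directory are assumed, HiFi-GAN-style conventions rather than something defined in this repository.

```python
# Sketch only: pick up the newest checkpoint whose name matches "g_????????"
# and load it. The directory and prefix are assumptions for illustration.
import torch

from unitspeech.vocoder.xutils import scan_checkpoint, load_checkpoint

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

latest = scan_checkpoint("checkpoints/", "g_")    # returns None if nothing matches
if latest is not None:
    state_dict = load_checkpoint(latest, device)  # plain torch.load with map_location
```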
5 | 6 | 7 | import torch 8 | import numpy as np 9 | from .text import ( 10 | EOS_TOK, 11 | SOS_TOK, 12 | code_to_sequence, 13 | text_to_sequence, 14 | ) 15 | from .utils import ( 16 | load_code_dict, 17 | ) 18 | 19 | 20 | class TacotronInputDataset: 21 | def __init__(self, hparams, append_str=""): 22 | self.is_text = getattr(hparams, "text_or_code", "text") == "text" 23 | if not self.is_text: 24 | self.code_dict = load_code_dict(hparams.code_dict) 25 | self.code_key = hparams.code_key 26 | self.add_sos = hparams.add_sos 27 | self.add_eos = hparams.add_eos 28 | self.collapse_code = hparams.collapse_code 29 | self.append_str = append_str 30 | 31 | def process_code(self, inp_str): 32 | inp_toks = inp_str.split() 33 | if self.add_sos: 34 | inp_toks = [SOS_TOK] + inp_toks 35 | if self.add_eos: 36 | inp_toks = inp_toks + [EOS_TOK] 37 | return code_to_sequence(inp_toks, self.code_dict, self.collapse_code) 38 | 39 | def process_text(self, inp_str): 40 | return text_to_sequence(inp_str, ["english_cleaners"]) 41 | 42 | def get_tensor(self, inp_str): 43 | # uid, txt, inp_str = self._get_data(idx) 44 | inp_str = inp_str + self.append_str 45 | if self.is_text: 46 | inp_toks = self.process_text(inp_str) 47 | else: 48 | inp_toks = self.process_code(inp_str) 49 | return torch.from_numpy(np.array(inp_toks)).long() 50 | 51 | def __len__(self): 52 | return len(self.data) 53 | -------------------------------------------------------------------------------- /unitspeech/text/cleaners.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 13 | ''' 14 | 15 | import re 16 | from unidecode import unidecode 17 | 18 | 19 | # Regular expression matching whitespace: 20 | _whitespace_re = re.compile(r'\s+') 21 | 22 | # List of (regular expression, replacement) pairs for abbreviations: 23 | _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 24 | ('mrs', 'misess'), 25 | ('mr', 'mister'), 26 | ('dr', 'doctor'), 27 | ('st', 'saint'), 28 | ('co', 'company'), 29 | ('jr', 'junior'), 30 | ('maj', 'major'), 31 | ('gen', 'general'), 32 | ('drs', 'doctors'), 33 | ('rev', 'reverend'), 34 | ('lt', 'lieutenant'), 35 | ('hon', 'honorable'), 36 | ('sgt', 'sergeant'), 37 | ('capt', 'captain'), 38 | ('esq', 'esquire'), 39 | ('ltd', 'limited'), 40 | ('col', 'colonel'), 41 | ('ft', 'fort'), 42 | ]] 43 | 44 | 45 | def expand_abbreviations(text): 46 | for regex, replacement in _abbreviations: 47 | text = re.sub(regex, replacement, text) 48 | return text 49 | 50 | 51 | def lowercase(text): 52 | return text.lower() 53 | 54 | 55 | def collapse_whitespace(text): 56 | return re.sub(_whitespace_re, ' ', text) 57 | 58 | 59 | def convert_to_ascii(text): 60 | return unidecode(text) 61 | 62 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | from unitspeech.textlesslib.textless.data.cpc_feature_reader import CpcFeatureReader 7 | from unitspeech.textlesslib.textless.data.hubert_feature_reader import HubertFeatureReader 8 | from unitspeech.textlesslib.textless.data.kmeans_quantizer import KMeansQuantizer 9 | from unitspeech.textlesslib.textless.checkpoint_manager import CHECKPOINT_MANAGER 10 | from unitspeech.textlesslib.textless.vocoders.tacotron2.vocoder import TacotronVocoder 11 | 12 | DENSE_MODELS = { 13 | "hubert-base-ls960": HubertFeatureReader, 14 | "mhubert-base-vp_en_es_fr": HubertFeatureReader, 15 | "cpc-big-ll6k": CpcFeatureReader, 16 | } 17 | 18 | 19 | QUANTIZER_MODELS = { 20 | "kmeans": KMeansQuantizer, 21 | } 22 | 23 | 24 | def dispatch_dense_model(name: str, **kwargs): 25 | model_class = DENSE_MODELS[name] 26 | checkpoint_path = CHECKPOINT_MANAGER.get_by_name(name) 27 | return model_class(checkpoint_path, **kwargs) 28 | 29 | 30 | def dispatch_quantizer(dense_model_name: str, quantizer_name: str, vocab_size: int): 31 | quantizer_checkpoint_name = f"{dense_model_name}-{quantizer_name}-{vocab_size}" 32 | checkpoint_path = CHECKPOINT_MANAGER.get_by_name(quantizer_checkpoint_name) 33 | quantizer = QUANTIZER_MODELS[quantizer_name](checkpoint_path) 34 | return quantizer 35 | 36 | 37 | def dispatch_vocoder( 38 | dense_model_name: str, 39 | quantizer_name: str, 40 | vocoder_name: str, 41 | vocab_size: int, 42 | ): 43 | if vocoder_name == "tacotron": 44 | vocoder = TacotronVocoder.by_name( 45 | dense_model_name, 46 | quantizer_name, 47 | vocab_size, 48 | ) 49 | else: 50 | assert False, "Unsupported vocoder name" 51 | return vocoder 52 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/symbols.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017 Keith Ito 2 | 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit 
persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | # THE SOFTWARE. 20 | 21 | """ from MIT-licensed https://github.com/keithito/tacotron """ 22 | 23 | ''' 24 | Defines the set of symbols used in text input to the model. 25 | 26 | The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' 27 | from . import cmudict 28 | 29 | _pad = '_' 30 | _punctuation = '!\'(),.:;? ' 31 | _special = '-' 32 | _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' 33 | 34 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 35 | _arpabet = ['@' + s for s in cmudict.valid_symbols] 36 | 37 | # Export all symbols: 38 | symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet 39 | -------------------------------------------------------------------------------- /unitspeech/vocoder/alias_free_torch/resample.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 
3 | 4 | import torch.nn as nn 5 | from torch.nn import functional as F 6 | from .filter import LowPassFilter1d 7 | from .filter import kaiser_sinc_filter1d 8 | 9 | 10 | class UpSample1d(nn.Module): 11 | def __init__(self, ratio=2, kernel_size=None): 12 | super().__init__() 13 | self.ratio = ratio 14 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 15 | self.stride = ratio 16 | self.pad = self.kernel_size // ratio - 1 17 | self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2 18 | self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 19 | filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, 20 | half_width=0.6 / ratio, 21 | kernel_size=self.kernel_size) 22 | self.register_buffer("filter", filter) 23 | 24 | # x: [B, C, T] 25 | def forward(self, x): 26 | _, C, _ = x.shape 27 | 28 | x = F.pad(x, (self.pad, self.pad), mode='replicate') 29 | x = self.ratio * F.conv_transpose1d( 30 | x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C) 31 | x = x[..., self.pad_left:-self.pad_right] 32 | 33 | return x 34 | 35 | 36 | class DownSample1d(nn.Module): 37 | def __init__(self, ratio=2, kernel_size=None): 38 | super().__init__() 39 | self.ratio = ratio 40 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 41 | self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio, 42 | half_width=0.6 / ratio, 43 | stride=ratio, 44 | kernel_size=self.kernel_size) 45 | 46 | def forward(self, x): 47 | xx = self.lowpass(x) 48 | 49 | return xx -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/checkpoint_manager/manager.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
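Tying the `alias_free_torch` pieces above together: `Activation1d` upsamples, applies a pointwise nonlinearity, then low-pass filters and downsamples, so aliasing introduced by the nonlinearity is suppressed. A small sketch follows; `torch.nn.SiLU` is only a stand-in activation chosen for illustration (the BigVGAN-style vocoder here is expected to pair this wrapper with the Snake-type activations in `unitspeech/vocoder/activations.py`).

```python
# Sketch only: wrap an arbitrary pointwise activation with the anti-aliased
# up/downsampling defined in unitspeech/vocoder/alias_free_torch.
import torch
import torch.nn as nn

from unitspeech.vocoder.alias_free_torch.act import Activation1d

anti_aliased_act = Activation1d(activation=nn.SiLU())  # default 2x up/down ratios

x = torch.randn(1, 8, 256)   # [batch, channels, time]
y = anti_aliased_act(x)      # nonlinearity applied at the upsampled rate
print(y.shape)               # expected to match x.shape ([1, 8, 256])
```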
5 | 6 | from typing import Union 7 | 8 | from dataclasses import dataclass 9 | # Modified (UnitSpeech) 10 | # from torchaudio.datasets.utils import download_url 11 | from torch.hub import download_url_to_file 12 | import pathlib 13 | 14 | 15 | @dataclass 16 | class Checkpoint: 17 | name: str 18 | remote_path: str 19 | fname: str 20 | sha256: str 21 | 22 | 23 | class CheckpointManager: 24 | def __init__(self, disk_root: Union[str, pathlib.Path] = "~/.textless/"): 25 | self.disk_root = pathlib.Path(disk_root).expanduser().resolve() 26 | if not self.disk_root.exists(): 27 | self.disk_root.mkdir() 28 | 29 | self.storage: dict[str, Checkpoint] = {} 30 | 31 | def add_checkpoint(self, checkpoint: Checkpoint) -> None: 32 | name = checkpoint.name 33 | assert name not in self.storage 34 | self.storage[name] = checkpoint 35 | 36 | def download_by_name(self, name: str) -> None: 37 | checkpoint = self.storage[name] 38 | # Modified (UnitSpeech) 39 | # download_url( 40 | # checkpoint.remote_path, 41 | # self.disk_root, 42 | # hash_value=checkpoint.sha256, 43 | # hash_type="sha256", 44 | # filename=checkpoint.fname, 45 | # ) 46 | download_url_to_file( 47 | checkpoint.remote_path, 48 | self.disk_root / checkpoint.fname 49 | ) 50 | 51 | def get_by_name(self, name: str, download_if_needed: bool = True) -> pathlib.Path: 52 | checkpoint = self.storage[name] 53 | disk_name = self.disk_root / checkpoint.fname 54 | 55 | if not disk_name.exists(): 56 | if download_if_needed: 57 | self.download_by_name(name) 58 | else: 59 | raise FileNotFoundError( 60 | f"Checkpoint {checkpoint} was not found locally at {disk_name}, please set `allow_download` flag" 61 | ) 62 | return disk_name 63 | 64 | def set_root(self, new_root: Union[str, pathlib.Path]) -> None: 65 | self.disk_root = pathlib.Path(new_root) 66 | -------------------------------------------------------------------------------- /unitspeech/duration_predictor.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jaywalnut310/glow-tts """ 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | from unitspeech.base import BaseModule 7 | 8 | 9 | class LayerNorm(nn.Module): 10 | def __init__(self, channels, eps=1e-5): 11 | super().__init__() 12 | self.channels = channels 13 | self.eps = eps 14 | 15 | self.gamma = nn.Parameter(torch.ones(channels)) 16 | self.beta = nn.Parameter(torch.zeros(channels)) 17 | 18 | def forward(self, x): 19 | x = x.transpose(1, -1).contiguous() 20 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 21 | return x.transpose(1, -1).contiguous() 22 | 23 | 24 | class DurationPredictor(BaseModule): 25 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, spk_emb_dim=0): 26 | super(DurationPredictor, self).__init__() 27 | in_channels = in_channels + spk_emb_dim 28 | 29 | self.in_channels = in_channels 30 | self.filter_channels = filter_channels 31 | self.p_dropout = p_dropout 32 | 33 | self.drop = torch.nn.Dropout(p_dropout) 34 | self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, 35 | kernel_size, padding=kernel_size//2) 36 | self.norm_1 = LayerNorm(filter_channels) 37 | self.conv_2 = torch.nn.Conv1d(filter_channels, filter_channels, 38 | kernel_size, padding=kernel_size//2) 39 | self.norm_2 = LayerNorm(filter_channels) 40 | self.proj = torch.nn.Conv1d(filter_channels, 1, 1) 41 | 42 | def forward(self, x, x_mask, w=None, g=None, reverse=False): 43 | x = torch.detach(x) 44 | if g is not None: 45 | x = 
torch.cat([x, g.transpose(1, 2).repeat(1, 1, x.shape[-1])], dim=1) 46 | x = self.conv_1(x * x_mask) 47 | x = torch.relu(x) 48 | x = self.norm_1(x) 49 | x = self.drop(x) 50 | x = self.conv_2(x * x_mask) 51 | x = torch.relu(x) 52 | x = self.norm_2(x) 53 | x = self.drop(x) 54 | logw = self.proj(x * x_mask) * x_mask 55 | if not reverse: 56 | logw_ = torch.log(w + 1e-6) * x_mask 57 | return torch.sum((logw - logw_) ** 2) / torch.sum(x_mask) # for averaging 58 | else: 59 | return logw -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | 7 | import torch 8 | import unitspeech.textlesslib.textless.vocoders.tacotron2 as tacotron2 9 | import sys 10 | from unitspeech.textlesslib.textless.checkpoint_manager import CHECKPOINT_MANAGER 11 | from .waveglow_denoiser import Denoiser 12 | from .model import Tacotron2 13 | from .tts_data import TacotronInputDataset 14 | 15 | 16 | def get_waveglow(download_if_needed=True): 17 | 18 | waveglow_path = CHECKPOINT_MANAGER.get_by_name( 19 | "waveglow", download_if_needed=download_if_needed 20 | ) 21 | 22 | sys.path.append(tacotron2.__path__[0]) 23 | waveglow = torch.load(waveglow_path)["model"] 24 | sys.path.pop() 25 | 26 | waveglow = waveglow.cuda().eval() 27 | denoiser = Denoiser(waveglow) 28 | return waveglow, denoiser 29 | 30 | 31 | def load_tacotron(model_name, max_decoder_steps, download_if_needed=True): 32 | tacotron_path = CHECKPOINT_MANAGER.get_by_name( 33 | model_name, download_if_needed=download_if_needed 34 | ) 35 | ckpt_dict = torch.load(tacotron_path) 36 | 37 | hparams = ckpt_dict["hparams"] 38 | codes_path = CHECKPOINT_MANAGER.get_by_name( 39 | f"{model_name}-codes", download_if_needed=download_if_needed 40 | ) 41 | hparams.code_dict = codes_path 42 | 43 | hparams.max_decoder_steps = max_decoder_steps 44 | model = Tacotron2(hparams) 45 | model.load_state_dict(ckpt_dict["model_dict"]) 46 | model = model.cuda().eval().half() 47 | 48 | tts_dataset = TacotronInputDataset(hparams) 49 | 50 | return model, tts_dataset 51 | 52 | 53 | def synthesize_audio( 54 | units, model, tts_dataset, waveglow, denoiser, lab=None, denoiser_strength=0.0 55 | ): 56 | quantized_units_str = " ".join(map(str, units.tolist())) 57 | tokens = tts_dataset.get_tensor(quantized_units_str).cuda().unsqueeze(0) 58 | 59 | if lab is not None: 60 | lab = torch.LongTensor(1).cuda().fill_(lab) 61 | 62 | with torch.no_grad(): 63 | _, mel, _, ali, has_eos = model.inference(tokens, lab, ret_has_eos=True) 64 | mel = mel.float() 65 | audio = waveglow.infer(mel, sigma=0.666) 66 | denoised_audio = denoiser(audio, strength=denoiser_strength).squeeze(1) 67 | return mel, audio, denoised_audio, has_eos 68 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/tests/test_model_handling.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
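Before the test module below, a hedged sketch of how the three Tacotron2 helpers above (`get_waveglow`, `load_tacotron`, `synthesize_audio`) are typically chained; the checkpoint name is a placeholder (real names live in the textless checkpoint registry) and a CUDA device is required:

```
# Illustrative only: "<tacotron-checkpoint-name>" is a placeholder, not a
# registered checkpoint; the call signatures follow the code above.
import torch
from unitspeech.textlesslib.textless.vocoders.tacotron2 import (
    get_waveglow,
    load_tacotron,
    synthesize_audio,
)

waveglow, denoiser = get_waveglow(download_if_needed=True)
tacotron, tts_dataset = load_tacotron("<tacotron-checkpoint-name>", max_decoder_steps=500)

units = torch.randint(0, 100, (150,))   # fake pseudo-unit stream, for illustration
mel, audio, denoised, reached_eos = synthesize_audio(
    units, tacotron, tts_dataset, waveglow, denoiser, denoiser_strength=0.1
)
```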
5 | 6 | import pytest 7 | from unitspeech.textlesslib.textless import dispatch_dense_model, dispatch_quantizer 8 | from unitspeech.textlesslib.textless.data.speech_encoder import SpeechEncoder 9 | import torch 10 | from itertools import product 11 | 12 | from unitspeech.textlesslib.textless.vocoders.tacotron2.vocoder import TacotronVocoder 13 | 14 | 15 | def test_model_dispatch(): 16 | dense_model_name = "hubert-base-ls960" 17 | quantizer_name = "kmeans" 18 | vocab_size = 100 19 | 20 | # getting dense model 21 | dense_model = dispatch_dense_model(dense_model_name) 22 | assert isinstance(dense_model, torch.nn.Module) 23 | 24 | # getting a quantizer for it 25 | assert ( 26 | dispatch_quantizer(dense_model_name, quantizer_name, vocab_size=vocab_size) 27 | is not None 28 | ) 29 | 30 | with pytest.raises(KeyError): 31 | assert dispatch_quantizer(dense_model_name, quantizer_name, vocab_size=101) 32 | 33 | # getting a vocoder for it 34 | assert ( 35 | TacotronVocoder.by_name( 36 | dense_model_name=dense_model_name, 37 | quantizer_model_name=quantizer_name, 38 | vocab_size=vocab_size, 39 | ) 40 | is not None 41 | ) 42 | 43 | 44 | densename_vocabsize = list(product(["hubert-base-ls960", "cpc-big-ll6k"], [50, 100, 200])) 45 | 46 | 47 | @pytest.mark.parametrize("dense_name,vocab_size", densename_vocabsize) 48 | def test_speech_encoder(dense_name, vocab_size): 49 | quantizer_name = "kmeans" 50 | 51 | encoder = SpeechEncoder.by_name( 52 | dense_model_name=dense_name, 53 | quantizer_model_name=quantizer_name, 54 | vocab_size=vocab_size, 55 | need_f0=False, 56 | deduplicate=True, 57 | f0_normalizer=None, 58 | f0_quantizer=None, 59 | ) 60 | 61 | assert encoder is not None 62 | 63 | # let's pass 0.5s of silence thru it 64 | waveform = torch.zeros(encoder.expected_sample_rate // 2) 65 | encoded = encoder(waveform) 66 | 67 | assert encoded 68 | 69 | 70 | @pytest.mark.parametrize("dense_name,vocab_size", densename_vocabsize) 71 | def test_vocoder_lookup(dense_name, vocab_size): 72 | quantizer_name = "kmeans" 73 | 74 | vocoder = TacotronVocoder.by_name( 75 | dense_model_name=dense_name, 76 | quantizer_model_name=quantizer_name, 77 | vocab_size=vocab_size, 78 | ) 79 | assert vocoder is not None 80 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/data/hubert_feature_reader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
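The tests above also double as a usage reference. Before the HuBERT feature reader below, here is the same flow condensed into a hedged sketch (model names are the ones the tests use; the `units` / `durations` keys are the ones consumed by `unitspeech/util.py`):

```
# Condensed from test_speech_encoder above; assumes the same registered names.
import torch
from unitspeech.textlesslib.textless.data.speech_encoder import SpeechEncoder

encoder = SpeechEncoder.by_name(
    dense_model_name="hubert-base-ls960",
    quantizer_model_name="kmeans",
    vocab_size=100,
    need_f0=False,
    deduplicate=True,
    f0_normalizer=None,
    f0_quantizer=None,
)

waveform = torch.zeros(encoder.expected_sample_rate)   # one second of silence
encoded = encoder(waveform)
# "units" / "durations" are the keys unitspeech/util.py reads from this output.
print(encoded["units"][:10], encoded["durations"][:10])
```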
5 | 6 | 7 | import torch 8 | import fairseq 9 | import torch.nn.functional as F 10 | 11 | 12 | class HubertFeatureReader(torch.nn.Module): 13 | def __init__( 14 | self, checkpoint_path, layer=6, max_chunk=100 * 16_000, lazy_load=False 15 | ): 16 | super().__init__() 17 | # NB: fairseq doesn't support pathlib.Path 18 | self.checkpoint_path = str(checkpoint_path) 19 | self.should_normalize = False 20 | self.lazy_load = lazy_load 21 | self.model = None 22 | self.layer = layer 23 | self.max_chunk = max_chunk 24 | # this is useful for determining the device 25 | self.register_buffer("_float_tensor", torch.tensor([0], dtype=torch.float)) 26 | if not self.lazy_load: 27 | self.load_checkpoint_() 28 | 29 | @torch.no_grad() # otherwise some non-leaf nodes appear which breaks serialization 30 | def load_checkpoint_(self): 31 | model, _, task = fairseq.checkpoint_utils.load_model_ensemble_and_task( 32 | [self.checkpoint_path] 33 | ) 34 | self.model = model[0].eval() 35 | self.model = self.model.to(self.device) 36 | for parameter in self.model.parameters(): 37 | parameter.requires_grad_(False) 38 | 39 | self.should_normalize = task.cfg.normalize 40 | 41 | @property 42 | def device(self): 43 | return self._float_tensor.device 44 | 45 | @property 46 | def code_hop_size(self) -> int: 47 | return 320 48 | 49 | @property 50 | def expected_sample_rate(self) -> int: 51 | return 16_000 52 | 53 | def forward(self, x): 54 | if self.lazy_load and self.model is None: 55 | self.load_checkpoint_() 56 | 57 | return self.get_features(x) 58 | 59 | @torch.inference_mode() 60 | def get_features(self, x): 61 | x = x.to(self.device) 62 | if self.should_normalize: 63 | x = F.layer_norm(x, x.shape) 64 | x = x.view(1, -1) 65 | 66 | feat = [] 67 | for start in range(0, x.size(1), self.max_chunk): 68 | x_chunk = x[:, start : start + self.max_chunk] 69 | feat_chunk, _ = self.model.extract_features( 70 | source=x_chunk, 71 | padding_mask=None, 72 | mask=False, 73 | output_layer=self.layer, 74 | ) 75 | feat.append(feat_chunk) 76 | return torch.cat(feat, 1).squeeze(0).cpu() 77 | -------------------------------------------------------------------------------- /unitspeech/speaker_encoder/utils.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/microsoft/UniSpeech/tree/main/downstreams/speaker_verification """ 2 | 3 | import torch 4 | import fairseq 5 | from packaging import version 6 | import torch.nn.functional as F 7 | from fairseq import tasks 8 | from fairseq.checkpoint_utils import load_checkpoint_to_cpu 9 | from fairseq.dataclass.utils import convert_namespace_to_omegaconf 10 | from omegaconf import OmegaConf 11 | from s3prl.upstream.interfaces import UpstreamBase 12 | from torch.nn.utils.rnn import pad_sequence 13 | 14 | def load_model(filepath): 15 | state = torch.load(filepath, map_location=lambda storage, loc: storage) 16 | # state = load_checkpoint_to_cpu(filepath) 17 | state["cfg"] = OmegaConf.create(state["cfg"]) 18 | 19 | if "args" in state and state["args"] is not None: 20 | cfg = convert_namespace_to_omegaconf(state["args"]) 21 | elif "cfg" in state and state["cfg"] is not None: 22 | cfg = state["cfg"] 23 | else: 24 | raise RuntimeError( 25 | f"Neither args nor cfg exist in state keys = {state.keys()}" 26 | ) 27 | 28 | task = tasks.setup_task(cfg.task) 29 | if "task_state" in state: 30 | task.load_state_dict(state["task_state"]) 31 | 32 | model = task.build_model(cfg.model) 33 | 34 | return model, cfg, task 35 | 36 | 37 | ################### 38 | # 
UPSTREAM EXPERT # 39 | ################### 40 | class UpstreamExpert(UpstreamBase): 41 | def __init__(self, ckpt, **kwargs): 42 | super().__init__(**kwargs) 43 | assert version.parse(fairseq.__version__) > version.parse( 44 | "0.10.2" 45 | ), "Please install the fairseq master branch." 46 | 47 | model, cfg, task = load_model(ckpt) 48 | self.model = model 49 | self.task = task 50 | 51 | if len(self.hooks) == 0: 52 | module_name = "self.model.encoder.layers" 53 | for module_id in range(len(eval(module_name))): 54 | self.add_hook( 55 | f"{module_name}[{module_id}]", 56 | lambda input, output: input[0].transpose(0, 1), 57 | ) 58 | self.add_hook("self.model.encoder", lambda input, output: output[0]) 59 | 60 | def forward(self, wavs): 61 | if self.task.cfg.normalize: 62 | wavs = [F.layer_norm(wav, wav.shape) for wav in wavs] 63 | 64 | device = wavs[0].device 65 | wav_lengths = torch.LongTensor([len(wav) for wav in wavs]).to(device) 66 | wav_padding_mask = ~torch.lt( 67 | torch.arange(max(wav_lengths)).unsqueeze(0).to(device), 68 | wav_lengths.unsqueeze(1), 69 | ) 70 | padded_wav = pad_sequence(wavs, batch_first=True) 71 | 72 | features, feat_padding_mask = self.model.extract_features( 73 | padded_wav, 74 | padding_mask=wav_padding_mask, 75 | mask=None, 76 | ) 77 | return { 78 | "default": features, 79 | } 80 | 81 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/tools/distributed_transcribe/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | import os 7 | import subprocess 8 | from dataclasses import dataclass 9 | import torch.distributed as dist 10 | 11 | 12 | @dataclass(frozen=True, repr=True, eq=True, unsafe_hash=True) 13 | class DistributedContext: 14 | is_distributed: bool 15 | rank: int 16 | local_rank: int 17 | world_size: int 18 | mode: str 19 | 20 | @property 21 | def is_leader(self) -> bool: 22 | return self.rank == 0 23 | 24 | 25 | def init_distributed_context(port: int) -> DistributedContext: 26 | # Sometimes the nccl backend hangs on the barrier op (https://github.com/pytorch/pytorch/issues/53658). 27 | # Since it is the only op we care about here, we'd use the gloo backend. 28 | BACKEND = "gloo" 29 | 30 | # default, non-distributed context 31 | context = DistributedContext( 32 | is_distributed=False, rank=0, local_rank=0, world_size=1, mode="none" 33 | ) 34 | 35 | launch_keys = ["MASTER_ADDR", "MASTER_PORT", "WORLD_SIZE", "RANK", "LOCAL_RANK"] 36 | slurm_keys = [ 37 | "SLURM_LOCALID", 38 | "SLURM_PROCID", 39 | "SLURM_NTASKS", 40 | "SLURM_NODEID", 41 | "SLURM_JOB_NODELIST", 42 | ] 43 | 44 | # is it torch.distributed.launch? 45 | if all(key in os.environ for key in launch_keys): 46 | init_method = "env://" 47 | world_size = int(os.environ["WORLD_SIZE"]) 48 | rank = int(os.environ["RANK"]) 49 | local_rank = int(os.environ["LOCAL_RANK"]) 50 | context = DistributedContext( 51 | is_distributed=True, 52 | rank=rank, 53 | world_size=world_size, 54 | local_rank=local_rank, 55 | mode="launch", 56 | ) 57 | dist.init_process_group( 58 | backend=BACKEND, init_method=init_method, world_size=world_size, rank=rank 59 | ) 60 | # is it slurm? 
61 | elif all(key in os.environ for key in slurm_keys): 62 | init_method = "env://" 63 | local_rank = int(os.environ["SLURM_LOCALID"]) 64 | rank = int(os.environ["SLURM_PROCID"]) 65 | world_size = int(os.environ["SLURM_NTASKS"]) 66 | 67 | hostnames = subprocess.check_output( 68 | ["scontrol", "show", "hostnames", os.environ["SLURM_JOB_NODELIST"]] 69 | ) 70 | leader_addr = hostnames.split()[0].decode("utf-8") 71 | 72 | os.environ["MASTER_ADDR"] = leader_addr 73 | os.environ["MASTER_PORT"] = str(port) 74 | os.environ["WORLD_SIZE"] = str(world_size) 75 | os.environ["RANK"] = str(rank) 76 | 77 | context = DistributedContext( 78 | is_distributed=True, 79 | rank=rank, 80 | local_rank=local_rank, 81 | world_size=world_size, 82 | mode="slurm", 83 | ) 84 | dist.init_process_group( 85 | backend=BACKEND, 86 | init_method=init_method, 87 | world_size=world_size, 88 | rank=rank, 89 | ) 90 | 91 | return context 92 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/examples/gslm/sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | import typing as tp 7 | 8 | from fairseq import hub_utils, utils 9 | from fairseq.hub_utils import GeneratorHubInterface 10 | 11 | 12 | class UnitLanguageModelSampler(GeneratorHubInterface): 13 | """ 14 | A simple PyTorch interface for ULM 15 | """ 16 | 17 | def __init__(self, cfg, task, models): 18 | super().__init__(cfg, task, models) 19 | self.model = self.models[0] 20 | self.model.eval() 21 | 22 | def encode(self, unit_str): 23 | tokens = self.task.source_dictionary.encode_line( 24 | unit_str, add_if_not_exist=False 25 | ).long() 26 | return tokens 27 | 28 | def get_prefix_size(self): 29 | return self.cfg.generation.prefix_size 30 | 31 | def post_process_predictions(self, src_tokens, hypos): 32 | src_tokens = utils.strip_pad(src_tokens, self.tgt_dict.pad()) 33 | src_str = None 34 | if self.task.source_dictionary is not None: 35 | src_str = self.task.source_dictionary.string( 36 | src_tokens, self.cfg.common_eval.post_process 37 | ) 38 | return [ 39 | utils.post_process_prediction( 40 | hypo_tokens=hypo["tokens"].int().cpu(), 41 | src_str=src_str, 42 | alignment=hypo["alignment"], 43 | align_dict=self.align_dict, 44 | tgt_dict=self.tgt_dict, 45 | remove_bpe=self.cfg.common_eval.post_process, 46 | )[1] 47 | for hypo in hypos 48 | ] 49 | 50 | def sample( 51 | self, sentences: tp.List[str], beam: int = 1, verbose: bool = False, **kwargs 52 | ): 53 | hypotheses = self.sample_top_hypotheses(sentences, beam, verbose, **kwargs) 54 | return [hypos[0] for hypos in hypotheses] 55 | 56 | def sample_top_hypotheses( 57 | self, sentences: tp.List[str], beam: int = 1, verbose: bool = False, **kwargs 58 | ) -> tp.List[str]: 59 | if isinstance(sentences, str): 60 | return self.sample_top_hypotheses( 61 | [sentences], beam=beam, verbose=verbose, **kwargs 62 | )[0] 63 | tokenized_sentences = [self.encode(sentence) for sentence in sentences] 64 | batched_hypos = self.generate(tokenized_sentences, beam, verbose, **kwargs) 65 | 66 | return [ 67 | self.post_process_predictions(src_tokens, hypos) 68 | for src_tokens, hypos in zip(tokenized_sentences, batched_hypos) 69 | ] 70 | 71 | @classmethod 72 | def from_pretrained( 73 | cls, 74 | model_name_or_path, 75 | checkpoint_file="checkpoint_best.pt", 76 | 
data_name_or_path=".", 77 | **kwargs, 78 | ): 79 | x = hub_utils.from_pretrained( 80 | model_name_or_path, 81 | checkpoint_file, 82 | data_name_or_path, 83 | archive_map=None, 84 | bpe=None, 85 | load_checkpoint_heads=True, 86 | sample_break_mode="eos", 87 | **kwargs, 88 | ) 89 | return cls(x["args"], x["task"], x["models"]) 90 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/cmudict.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017 Keith Ito 2 | 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | # THE SOFTWARE. 20 | 21 | """ from MIT-licensed https://github.com/keithito/tacotron """ 22 | 23 | import re 24 | 25 | 26 | valid_symbols = [ 27 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 28 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 29 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 30 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 31 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 32 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 33 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 34 | ] 35 | 36 | _valid_symbol_set = set(valid_symbols) 37 | 38 | 39 | class CMUDict: 40 | '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 41 | def __init__(self, file_or_path, keep_ambiguous=True): 42 | if isinstance(file_or_path, str): 43 | with open(file_or_path, encoding='latin-1') as f: 44 | entries = _parse_cmudict(f) 45 | else: 46 | entries = _parse_cmudict(file_or_path) 47 | if not keep_ambiguous: 48 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 49 | self._entries = entries 50 | 51 | 52 | def __len__(self): 53 | return len(self._entries) 54 | 55 | 56 | def lookup(self, word): 57 | '''Returns list of ARPAbet pronunciations of the given word.''' 58 | return self._entries.get(word.upper()) 59 | 60 | 61 | 62 | _alt_re = re.compile(r'\([0-9]+\)') 63 | 64 | 65 | def _parse_cmudict(file): 66 | cmudict = {} 67 | for line in file: 68 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 69 | parts = line.split(' ') 70 | word = re.sub(_alt_re, '', parts[0]) 71 | pronunciation = _get_pronunciation(parts[1]) 72 | if pronunciation: 73 | if word in cmudict: 74 | cmudict[word].append(pronunciation) 75 | else: 76 | cmudict[word] = [pronunciation] 77 | return cmudict 78 | 79 | 80 | def _get_pronunciation(s): 81 | parts = s.strip().split(' ') 82 | for part in parts: 83 | if part not in _valid_symbol_set: 84 | return None 85 | return ' '.join(parts) 86 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/waveglow_denoiser.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | # 26 | # ***************************************************************************** 27 | 28 | import torch 29 | from .layers import STFT 30 | 31 | 32 | class Denoiser(torch.nn.Module): 33 | """ Removes model bias from audio produced with waveglow """ 34 | 35 | def __init__(self, waveglow, filter_length=1024, n_overlap=4, 36 | win_length=1024, mode='zeros'): 37 | super(Denoiser, self).__init__() 38 | self.stft = STFT(filter_length=filter_length, 39 | hop_length=int(filter_length/n_overlap), 40 | win_length=win_length) 41 | if mode == 'zeros': 42 | mel_input = torch.zeros( 43 | (1, 80, 88), 44 | dtype=waveglow.upsample.weight.dtype, 45 | device=waveglow.upsample.weight.device) 46 | elif mode == 'normal': 47 | mel_input = torch.randn( 48 | (1, 80, 88), 49 | dtype=waveglow.upsample.weight.dtype, 50 | device=waveglow.upsample.weight.device) 51 | else: 52 | raise Exception("Mode {} if not supported".format(mode)) 53 | 54 | with torch.no_grad(): 55 | bias_audio = waveglow.infer(mel_input, sigma=0.0).float() 56 | bias_spec, _ = self.stft.transform(bias_audio) 57 | 58 | self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None]) 59 | 60 | def forward(self, audio, strength=0.1): 61 | audio_spec, audio_angles = self.stft.transform(audio.cuda().float()) 62 | audio_spec_denoised = audio_spec - self.bias_spec * strength 63 | audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) 64 | audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles) 65 | return audio_denoised 66 | -------------------------------------------------------------------------------- /unitspeech/util.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/huawei-noah/Speech-Backbones/tree/main/Grad-TTS """ 2 | 3 | import torch 4 | 5 | 6 | def sequence_mask(length, max_length=None): 7 | if max_length is None: 8 | max_length = length.max() 9 | x = torch.arange(int(max_length), dtype=length.dtype, device=length.device) 10 | return x.unsqueeze(0) < length.unsqueeze(1) 11 | 12 | 13 | def generate_path(duration, mask): 14 | device = duration.device 15 | 16 | b, t_x, t_y = mask.shape 17 | cum_duration = torch.cumsum(duration, 1) 18 | path = torch.zeros(b, t_x, t_y, dtype=mask.dtype).to(device=device) 19 | 20 | cum_duration_flat = cum_duration.view(b * t_x) 21 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 22 | path = path.view(b, t_x, t_y) 23 | path = path - torch.nn.functional.pad(path, convert_pad_shape([[0, 0], 24 | [1, 0], [0, 0]]))[:, :-1] 25 | path = path * mask 26 | return path 27 | 28 | 29 | def convert_pad_shape(pad_shape): 30 | l = pad_shape[::-1] 31 | pad_shape = [item for sublist in l for item in sublist] 32 | return pad_shape 33 | 34 | 35 | def fix_len_compatibility(length, num_downsamplings_in_unet=3): 36 | while True: 37 | if length % (2**num_downsamplings_in_unet) == 0: 38 | return int(length) 39 | length += 1 40 | 41 | 42 | def intersperse(lst, item): 43 | # Adds blank symbol 44 | result = [item] * (len(lst) * 2 + 1) 45 | result[1::2] = lst 46 | return result 47 | 48 | 49 | def process_unit(encoded, sampling_rate, hop_length): 50 | # A method that aligns units and durations (50Hz) extracted from 16kHz audio with 51 | # mel-spectrograms extracted from 22,050Hz audio. 
52 | 53 | unit = encoded["units"].cpu().tolist() 54 | duration = encoded["durations"].cpu().tolist() 55 | 56 | duration = [int(i) * (sampling_rate // 50) for i in duration] 57 | 58 | expand_unit = [] 59 | 60 | for u, d in zip(unit, duration): 61 | for _ in range(d): 62 | expand_unit.append(u) 63 | 64 | new_length = len(expand_unit) // hop_length * hop_length 65 | 66 | unit = torch.LongTensor(expand_unit)[:new_length].reshape(-1, hop_length).mode(1)[0].tolist() 67 | 68 | squeezed_unit = [unit[0]] 69 | squeezed_duration = [1] 70 | 71 | for u in unit[1:]: 72 | if u == squeezed_unit[-1]: 73 | squeezed_duration[-1] += 1 74 | else: 75 | squeezed_unit.append(u) 76 | squeezed_duration.append(1) 77 | 78 | unit = torch.LongTensor(squeezed_unit) 79 | duration = torch.LongTensor(squeezed_duration) 80 | 81 | return unit, duration 82 | 83 | 84 | class HParams(): 85 | def __init__(self, **kwargs): 86 | for k, v in kwargs.items(): 87 | if type(v) == dict: 88 | v = HParams(**v) 89 | self[k] = v 90 | 91 | def keys(self): 92 | return self.__dict__.keys() 93 | 94 | def items(self): 95 | return self.__dict__.items() 96 | 97 | def values(self): 98 | return self.__dict__.values() 99 | 100 | def __len__(self): 101 | return len(self.__dict__) 102 | 103 | def __getitem__(self, key): 104 | return getattr(self, key) 105 | 106 | def __setitem__(self, key, value): 107 | return setattr(self, key, value) 108 | 109 | def __contains__(self, key): 110 | return key in self.__dict__ 111 | 112 | def __repr__(self): 113 | return self.__dict__.__repr__() -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/numbers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017 Keith Ito 2 | 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | # THE SOFTWARE. 
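Returning to `process_unit` in `unitspeech/util.py` above: the alignment is easiest to follow with concrete numbers. At 22,050 Hz, each 50 Hz unit frame covers 22050 // 50 = 441 samples; the expanded unit stream is truncated to a whole number of hops, majority-voted per hop, and run-length encoded again. A small illustrative example (values chosen for clarity, not taken from real data):

```
# Worked example for process_unit above; inputs are made up for illustration.
import torch
from unitspeech.util import process_unit

encoded = {
    "units": torch.tensor([7, 3, 7]),       # deduplicated 50 Hz units
    "durations": torch.tensor([2, 1, 4]),   # frames per unit
}

# 7 frames * 441 samples = 3087 samples -> 12 hops of 256 samples after truncation.
unit, duration = process_unit(encoded, sampling_rate=22050, hop_length=256)
print(unit, duration)   # expected: tensor([7, 3, 7]) tensor([3, 2, 7]), durations summing to 12
```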
20 | 21 | """ from MIT-licensed https://github.com/keithito/tacotron """ 22 | 23 | import inflect 24 | import re 25 | 26 | 27 | _inflect = inflect.engine() 28 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 29 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 30 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 31 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 32 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 33 | _number_re = re.compile(r'[0-9]+') 34 | 35 | 36 | def _remove_commas(m): 37 | return m.group(1).replace(',', '') 38 | 39 | 40 | def _expand_decimal_point(m): 41 | return m.group(1).replace('.', ' point ') 42 | 43 | 44 | def _expand_dollars(m): 45 | match = m.group(1) 46 | parts = match.split('.') 47 | if len(parts) > 2: 48 | return match + ' dollars' # Unexpected format 49 | dollars = int(parts[0]) if parts[0] else 0 50 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 51 | if dollars and cents: 52 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 53 | cent_unit = 'cent' if cents == 1 else 'cents' 54 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 55 | elif dollars: 56 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 57 | return '%s %s' % (dollars, dollar_unit) 58 | elif cents: 59 | cent_unit = 'cent' if cents == 1 else 'cents' 60 | return '%s %s' % (cents, cent_unit) 61 | else: 62 | return 'zero dollars' 63 | 64 | 65 | def _expand_ordinal(m): 66 | return _inflect.number_to_words(m.group(0)) 67 | 68 | 69 | def _expand_number(m): 70 | num = int(m.group(0)) 71 | if num > 1000 and num < 3000: 72 | if num == 2000: 73 | return 'two thousand' 74 | elif num > 2000 and num < 2010: 75 | return 'two thousand ' + _inflect.number_to_words(num % 100) 76 | elif num % 100 == 0: 77 | return _inflect.number_to_words(num // 100) + ' hundred' 78 | else: 79 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 80 | else: 81 | return _inflect.number_to_words(num, andword='') 82 | 83 | 84 | def normalize_numbers(text): 85 | text = re.sub(_comma_number_re, _remove_commas, text) 86 | text = re.sub(_pounds_re, r'\1 pounds', text) 87 | text = re.sub(_dollars_re, _expand_dollars, text) 88 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 89 | text = re.sub(_ordinal_re, _expand_ordinal, text) 90 | text = re.sub(_number_re, _expand_number, text) 91 | return text 92 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 
71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq -------------------------------------------------------------------------------- /unitspeech/vocoder/alias_free_torch/filter.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import math 8 | 9 | if 'sinc' in dir(torch): 10 | sinc = torch.sinc 11 | else: 12 | # This code is adopted from adefossez's julius.core.sinc under the MIT License 13 | # https://adefossez.github.io/julius/julius/core.html 14 | # LICENSE is in incl_licenses directory. 15 | def sinc(x: torch.Tensor): 16 | """ 17 | Implementation of sinc, i.e. sin(pi * x) / (pi * x) 18 | __Warning__: Different to julius.sinc, the input is multiplied by `pi`! 19 | """ 20 | return torch.where(x == 0, 21 | torch.tensor(1., device=x.device, dtype=x.dtype), 22 | torch.sin(math.pi * x) / math.pi / x) 23 | 24 | 25 | # This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License 26 | # https://adefossez.github.io/julius/julius/lowpass.html 27 | # LICENSE is in incl_licenses directory. 28 | def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size] 29 | even = (kernel_size % 2 == 0) 30 | half_size = kernel_size // 2 31 | 32 | #For kaiser window 33 | delta_f = 4 * half_width 34 | A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95 35 | if A > 50.: 36 | beta = 0.1102 * (A - 8.7) 37 | elif A >= 21.: 38 | beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.) 39 | else: 40 | beta = 0. 41 | window = torch.kaiser_window(kernel_size, beta=beta, periodic=False) 42 | 43 | # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio 44 | if even: 45 | time = (torch.arange(-half_size, half_size) + 0.5) 46 | else: 47 | time = torch.arange(kernel_size) - half_size 48 | if cutoff == 0: 49 | filter_ = torch.zeros_like(time) 50 | else: 51 | filter_ = 2 * cutoff * window * sinc(2 * cutoff * time) 52 | # Normalize filter to have sum = 1, otherwise we will have a small leakage 53 | # of the constant component in the input signal. 54 | filter_ /= filter_.sum() 55 | filter = filter_.view(1, 1, kernel_size) 56 | 57 | return filter 58 | 59 | 60 | class LowPassFilter1d(nn.Module): 61 | def __init__(self, 62 | cutoff=0.5, 63 | half_width=0.6, 64 | stride: int = 1, 65 | padding: bool = True, 66 | padding_mode: str = 'replicate', 67 | kernel_size: int = 12): 68 | # kernel_size should be even number for stylegan3 setup, 69 | # in this implementation, odd number is also possible. 
70 | super().__init__() 71 | if cutoff < -0.: 72 | raise ValueError("Minimum cutoff must be larger than zero.") 73 | if cutoff > 0.5: 74 | raise ValueError("A cutoff above 0.5 does not make sense.") 75 | self.kernel_size = kernel_size 76 | self.even = (kernel_size % 2 == 0) 77 | self.pad_left = kernel_size // 2 - int(self.even) 78 | self.pad_right = kernel_size // 2 79 | self.stride = stride 80 | self.padding = padding 81 | self.padding_mode = padding_mode 82 | filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size) 83 | self.register_buffer("filter", filter) 84 | 85 | #input [B, C, T] 86 | def forward(self, x): 87 | _, C, _ = x.shape 88 | 89 | if self.padding: 90 | x = F.pad(x, (self.pad_left, self.pad_right), 91 | mode=self.padding_mode) 92 | out = F.conv1d(x, self.filter.expand(C, -1, -1), 93 | stride=self.stride, groups=C) 94 | 95 | return out -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/cleaners.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017 Keith Ito 2 | 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | # THE SOFTWARE. 20 | 21 | """ from MIT-licensed https://github.com/keithito/tacotron """ 22 | ''' 23 | Cleaners are transformations that run over the input text at both training and eval time. 24 | 25 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 26 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 27 | 1. "english_cleaners" for English text 28 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 29 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 30 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 31 | the symbols in symbols.py to match your data). 32 | ''' 33 | 34 | import re 35 | from unidecode import unidecode 36 | from .numbers import normalize_numbers 37 | 38 | 39 | # Regular expression matching whitespace: 40 | _whitespace_re = re.compile(r'\s+') 41 | 42 | # List of (regular expression, replacement) pairs for abbreviations: 43 | _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 44 | ('mrs', 'misess'), 45 | ('mr', 'mister'), 46 | ('dr', 'doctor'), 47 | ('st', 'saint'), 48 | ('co', 'company'), 49 | ('jr', 'junior'), 50 | ('maj', 'major'), 51 | ('gen', 'general'), 52 | ('drs', 'doctors'), 53 | ('rev', 'reverend'), 54 | ('lt', 'lieutenant'), 55 | ('hon', 'honorable'), 56 | ('sgt', 'sergeant'), 57 | ('capt', 'captain'), 58 | ('esq', 'esquire'), 59 | ('ltd', 'limited'), 60 | ('col', 'colonel'), 61 | ('ft', 'fort'), 62 | ]] 63 | 64 | 65 | def expand_abbreviations(text): 66 | for regex, replacement in _abbreviations: 67 | text = re.sub(regex, replacement, text) 68 | return text 69 | 70 | 71 | def expand_numbers(text): 72 | return normalize_numbers(text) 73 | 74 | 75 | def lowercase(text): 76 | return text.lower() 77 | 78 | 79 | def collapse_whitespace(text): 80 | return re.sub(_whitespace_re, ' ', text) 81 | 82 | 83 | def convert_to_ascii(text): 84 | return unidecode(text) 85 | 86 | 87 | def basic_cleaners(text): 88 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 89 | text = lowercase(text) 90 | text = collapse_whitespace(text) 91 | return text 92 | 93 | 94 | def transliteration_cleaners(text): 95 | '''Pipeline for non-English text that transliterates to ASCII.''' 96 | text = convert_to_ascii(text) 97 | text = lowercase(text) 98 | text = collapse_whitespace(text) 99 | return text 100 | 101 | 102 | def english_cleaners(text): 103 | '''Pipeline for English text, including number and abbreviation expansion.''' 104 | text = convert_to_ascii(text) 105 | text = lowercase(text) 106 | text = expand_numbers(text) 107 | text = expand_abbreviations(text) 108 | text = collapse_whitespace(text) 109 | return text 110 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/tools/distributed_transcribe/README.md: -------------------------------------------------------------------------------- 1 | # Distributed pseudo-units transcription 2 | 3 | If you ever tried to transcribe large-scale audio datasets (e.g. the [LibriLight](https://github.com/facebookresearch/libri-light) dataset with 60k hours) into discrete pseudo-units such as those used by the [Generative Spoken Language Modeling (GSLM)](https://github.com/pytorch/fairseq/tree/main/examples/textless_nlp/gslm) system, you might have noticed that this task is computationally intensive and might be impractical to do in a non-distributed fashion. 4 | 5 | This tool provides a convenient script that can leverage multiple GPUs (on multiple nodes!) to speed up and parallelize pseudo-unit transcription. 6 | We provide recipes for two scenarios: (a) single-node, multiprocess/multi-GPU transcription that leverages the `torch.distributed.run` mechanism of PyTorch, and (b) multi-node, multi-GPU transcription that can be run on a SLURM-managed cluster. 7 | 8 | ## Example scripts 9 | 10 | * `local.sh` provides an example command to transcribe a dataset in local parallel mode (a sketch of such a launch is shown below); 11 | * `slurm.sbatch` is an example of a SLURM sbatch script for distributed pseudo-unit transcription. 
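For concreteness, here is a sketch of what a single-node, multi-GPU launch can look like with `torch.distributed.run`; the bundled `local.sh` may differ in its exact flags, and the manifest/model names are placeholders (the individual `transcribe.py` flags are documented in the next section):

```
python -m torch.distributed.run --nproc_per_node=8 transcribe.py \
    --manifest manifest.tsv \
    --output transcript \
    --dense_model hubert-base-ls960 \
    --kmeans_model hubert-base-ls960-kmeans-100 \
    --deduplicate \
    --durations \
    --distributed_port 58554
```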
12 | 13 | Finally, `transcribe.py` can be run directly as a single process (see single.sh): 14 | ``` 15 | DENSE_NAME=hubert-base-ls960 16 | KMEANS_NAME=hubert-base-ls960-kmeans-100 17 | MANIFEST=manifest.tsv 18 | TRANSCRIPT=transcript 19 | 20 | python transcribe.py \ 21 | --manifest $MANIFEST \ 22 | --output=$TRANSCRIPT \ 23 | --dense_model=$DENSE_NAME \ 24 | --kmeans_model=$KMEANS_NAME 25 | ``` 26 | 27 | ## Command line arguments 28 | 29 | The transcription script, `transcribe.py`, has a few command-line arguments: 30 | * `--dense_model`: sets the dense Hubert model to be used (by its name, e.g. `hubert-base-ls960`); 31 | * `--kmeans_model`: sets the k-means quantizer to be used, e.g. `hubert-base-ls960-kmeans-100`; 32 | * `--manifest`: specifies the manifest file describing the dataset; 33 | * `--output`: path to the output transcript file. The unit stream is stored in a `.units` file, durations (if requested) in `.durations`, and F0 values (again, if requested) in `.f0s`; 34 | * `--deduplicate`: if set, consecutive repeats of the same pseudo-unit are collapsed (as is done in GSLM); 35 | * `--durations`: if set, the duration of each token is reported in a `.durations` file (note that if `--deduplicate` is not set, all durations will be equal to 1); 36 | * `--f0s`: if set, the mean F0 value corresponding to each token is reported in a `.f0s` file (note: F0 extraction is slow). F0 values are rounded to the closest integer and are measured in Hz; 37 | * `--preserve_name`: if set, the transcript contains names of the original audio files; 38 | * `--separator`: a separator between pseudo-unit tokens in the outputs; 39 | * `--distributed_port`: a unique port, required for distributed transcription (defaults to 58554). 40 | 41 | 42 | ## Input format 43 | `transcribe.py` takes a manifest file describing an input dataset. A manifest is a tab-separated file with a simple format: (a) the first line is the root of the dataset folder, and (b) each subsequent line specifies a relative path to an audio file and its size in frames. Here is an example of a manifest corresponding to LibriSpeech dev-clean: 44 | ``` 45 | /datasets/librispeech/dev-clean 46 | 1272/128104/1272-128104-0000.flac 93680 47 | 1272/128104/1272-128104-0001.flac 77040 48 | 1272/128104/1272-128104-0002.flac 199760 49 | 1272/128104/1272-128104-0003.flac 158400 50 | 1272/128104/1272-128104-0004.flac 470400 51 | 1272/128104/1272-128104-0005.flac 144160 52 | ``` 53 | (`transcribe.py` ignores the duration field.) 54 | 55 | **NB**: fairseq has [a utility](https://github.com/pytorch/fairseq/blob/main/examples/wav2vec/wav2vec_manifest.py) for creating manifest files. 56 | 57 | ## Output format 58 | 59 | `transcribe.py` outputs one line per file, with pseudo-units separated by spaces (by default). Hence the output would look something like: 60 | ``` 61 | 71 12 56 57 40 63 40 63 93 50 76 53 62 ... 55 20 62 | ... 63 | 71 12 56 57 56 57 40 57 86 58 9 1 27 31 23 69 44 26 ... 64 | ``` 65 | 66 | This format is directly compatible with fairseq-preprocessing. However, if there is a need to link a particular line to its original file, please use the `--preserve_name` flag. 67 | -------------------------------------------------------------------------------- /unitspeech/vocoder/README.md: -------------------------------------------------------------------------------- 1 | ## BigVGAN: A Universal Neural Vocoder with Large-Scale Training 2 | #### Sang-gil Lee, Wei Ping, Boris Ginsburg, Bryan Catanzaro, Sungroh Yoon 3 | 4 | 
5 | 6 | 7 | ### [Paper](https://arxiv.org/abs/2206.04658) 8 | ### [Audio demo](https://bigvgan-demo.github.io/) 9 | 10 | ## Installation 11 | Clone the repository and install dependencies. 12 | ```shell 13 | # the codebase has been tested on Python 3.8 / 3.10 with PyTorch 1.12.1 / 1.13 conda binaries 14 | git clone https://github.com/NVIDIA/BigVGAN 15 | pip install -r requirements.txt 16 | ``` 17 | 18 | Create symbolic link to the root of the dataset. The codebase uses filelist with the relative path from the dataset. Below are the example commands for LibriTTS dataset. 19 | ``` shell 20 | cd LibriTTS && \ 21 | ln -s /path/to/your/LibriTTS/train-clean-100 train-clean-100 && \ 22 | ln -s /path/to/your/LibriTTS/train-clean-360 train-clean-360 && \ 23 | ln -s /path/to/your/LibriTTS/train-other-500 train-other-500 && \ 24 | ln -s /path/to/your/LibriTTS/dev-clean dev-clean && \ 25 | ln -s /path/to/your/LibriTTS/dev-other dev-other && \ 26 | ln -s /path/to/your/LibriTTS/test-clean test-clean && \ 27 | ln -s /path/to/your/LibriTTS/test-other test-other && \ 28 | cd .. 29 | ``` 30 | 31 | ## Training 32 | Train BigVGAN model. Below is an example command for training BigVGAN using LibriTTS dataset at 24kHz with a full 100-band mel spectrogram as input. 33 | ```shell 34 | python train.py \ 35 | --config configs/bigvgan_24khz_100band.json \ 36 | --input_wavs_dir LibriTTS \ 37 | --input_training_file LibriTTS/train-full.txt \ 38 | --input_validation_file LibriTTS/val-full.txt \ 39 | --list_input_unseen_wavs_dir LibriTTS LibriTTS \ 40 | --list_input_unseen_validation_file LibriTTS/dev-clean.txt LibriTTS/dev-other.txt \ 41 | --checkpoint_path exp/bigvgan 42 | ``` 43 | 44 | ## Synthesis 45 | Synthesize from BigVGAN model. Below is an example command for generating audio from the model. 46 | It computes mel spectrograms using wav files from `--input_wavs_dir` and saves the generated audio to `--output_dir`. 47 | ```shell 48 | python inference.py \ 49 | --checkpoint_file exp/bigvgan/g_05000000 \ 50 | --input_wavs_dir /path/to/your/input_wav \ 51 | --output_dir /path/to/your/output_wav 52 | ``` 53 | 54 | `inference_e2e.py` supports synthesis directly from the mel spectrogram saved in `.npy` format, with shapes `[1, channel, frame]` or `[channel, frame]`. 55 | It loads mel spectrograms from `--input_mels_dir` and saves the generated audio to `--output_dir`. 56 | 57 | Make sure that the STFT hyperparameters for mel spectrogram are the same as the model, which are defined in `config.json` of the corresponding model. 58 | ```shell 59 | python inference_e2e.py \ 60 | --checkpoint_file exp/bigvgan/g_05000000 \ 61 | --input_mels_dir /path/to/your/input_mel \ 62 | --output_dir /path/to/your/output_wav 63 | ``` 64 | 65 | ## Pretrained Models 66 | We provide the [pretrained models](https://drive.google.com/drive/folders/1e9wdM29d-t3EHUpBb8T4dcHrkYGAXTgq). 67 | One can download the checkpoints of generator (e.g., g_05000000) and discriminator (e.g., do_05000000) within the listed folders. 68 | 69 | |Folder Name|Sampling Rate|Mel band|fmax|Params.|Dataset|Fine-Tuned| 70 | |------|---|---|---|---|------|---| 71 | |bigvgan_24khz_100band|24 kHz|100|12000|112M|LibriTTS|No| 72 | |bigvgan_base_24khz_100band|24 kHz|100|12000|14M|LibriTTS|No| 73 | |bigvgan_22khz_80band|22 kHz|80|8000|112M|LibriTTS + VCTK + LJSpeech|No| 74 | |bigvgan_base_22khz_80band|22 kHz|80|8000|14M|LibriTTS + VCTK + LJSpeech|No| 75 | 76 | The paper results are based on 24kHz BigVGAN models trained on LibriTTS dataset. 
77 | We also provide 22kHz BigVGAN models with band-limited setup (i.e., fmax=8000) for TTS applications. 78 | Note that, the latest checkpoints use ``snakebeta`` activation with log scale parameterization, which have the best overall quality. 79 | 80 | 81 | ## TODO 82 | 83 | Current codebase only provides a plain PyTorch implementation for the filtered nonlinearity. We are working on a fast CUDA kernel implementation, which will be released in the future. 84 | 85 | 86 | ## References 87 | * [HiFi-GAN](https://github.com/jik876/hifi-gan) (for generator and multi-period discriminator) 88 | 89 | * [Snake](https://github.com/EdwardDixon/snake) (for periodic activation) 90 | 91 | * [Alias-free-torch](https://github.com/junjun3518/alias-free-torch) (for anti-aliasing) 92 | 93 | * [Julius](https://github.com/adefossez/julius) (for low-pass filter) 94 | 95 | * [UnivNet](https://github.com/mindslab-ai/univnet) (for multi-resolution discriminator) 96 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/data/f0_preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | import amfm_decompy.basic_tools as basic 8 | import amfm_decompy.pYAAPT as pYAAPT 9 | from librosa.util import normalize 10 | import numpy as np 11 | from scipy.interpolate import interp1d 12 | 13 | F0_FRAME_SPACE = 0.005 # sec 14 | 15 | 16 | def get_f0(audio, rate=16_000): 17 | assert audio.ndim == 1 18 | frame_length = 20.0 # ms 19 | to_pad = int(frame_length / 1000 * rate) // 2 20 | 21 | audio = normalize(audio) * 0.95 22 | audio = np.pad(audio, (to_pad, to_pad), "constant", constant_values=0) 23 | audio = basic.SignalObj(audio, rate) 24 | pitch = pYAAPT.yaapt( 25 | audio, 26 | frame_length=frame_length, 27 | frame_space=F0_FRAME_SPACE * 1000, 28 | nccf_thresh1=0.25, 29 | tda_frame_length=25.0, 30 | ) 31 | f0 = pitch.samp_values 32 | return f0 33 | 34 | 35 | def align_f0_to_durations(f0, durations, f0_code_ratio, tol=1): 36 | code_len = durations.sum() 37 | targ_len = int(f0_code_ratio * code_len) 38 | diff = f0.size(0) - targ_len 39 | assert abs(diff) <= tol, ( 40 | f"Cannot subsample F0: |{f0.size(0)} - {f0_code_ratio}*{code_len}|" 41 | f" > {tol} (dur=\n{durations})" 42 | ) 43 | if diff > 0: 44 | f0 = f0[:targ_len] 45 | elif diff < 0: 46 | f0 = torch.cat((f0, f0.new_full((-diff,), f0[-1])), 0) 47 | 48 | f0_offset = 0.0 49 | seg_f0s = [] 50 | for dur in durations: 51 | f0_dur = dur.item() * f0_code_ratio 52 | seg_f0 = f0[int(f0_offset) : int(f0_offset + f0_dur)] 53 | seg_f0 = seg_f0[seg_f0 != 0] 54 | if len(seg_f0) == 0: 55 | seg_f0 = torch.tensor(0).type(seg_f0.type()) 56 | else: 57 | seg_f0 = seg_f0.mean() 58 | seg_f0s.append(seg_f0) 59 | f0_offset += f0_dur 60 | 61 | assert int(f0_offset) == f0.size(0), f"{f0_offset} {f0.size()} {durations.sum()}" 62 | return torch.tensor(seg_f0s) 63 | 64 | 65 | class SpeakerMeanNormalize: 66 | def __init__(self, path_to_stats, center=True, scale=False, log=True): 67 | self.stats = torch.load(path_to_stats) 68 | self.center = center 69 | self.scale = scale 70 | self.log = log 71 | 72 | def __call__(self, f0, speaker): 73 | f0 = f0.clone() 74 | mask = f0 != 0.0 75 | if self.log: 76 | f0[mask] = f0[mask].log() 77 | 78 | mean = ( 79 | self.stats[speaker]["logf0_mean"] 80 | if self.log 
81 | else self.stats[speaker]["f0_mean"] 82 | ) 83 | std = ( 84 | self.stats[speaker]["logf0_std"] 85 | if self.log 86 | else self.stats[speaker]["f0_std"] 87 | ) 88 | 89 | if self.center: 90 | f0[mask] -= mean 91 | if self.scale: 92 | f0[mask] /= std 93 | 94 | return f0 95 | 96 | 97 | class PromptNormalize: 98 | def __init__(self, center=True, scale=False, log=True): 99 | self.center = center 100 | self.scale = scale 101 | self.log = log 102 | 103 | def __call__(self, f0, _speaker=None): 104 | f0 = f0.clone() 105 | mask = f0 != 0.0 106 | if self.log: 107 | f0[mask] = f0[mask].log() 108 | 109 | if self.center: 110 | f0[mask] -= f0[mask].mean() 111 | if self.scale: 112 | f0[mask] /= f0[mask].std() 113 | 114 | return f0 115 | 116 | 117 | class F0BinQuantizer: 118 | def __init__(self, bins_path): 119 | self.bins = torch.load(bins_path) 120 | 121 | def __call__(self, f0: torch.Tensor): 122 | bin_idx = (f0.view(-1, 1) > self.bins.view(1, -1)).long().sum(dim=1) 123 | return bin_idx 124 | 125 | 126 | def trailing_silence_mask(f0): 127 | """ 128 | >>> f0 = torch.tensor([1.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0]) 129 | >>> trailing_silence_mask(f0) 130 | tensor([False, False, False, False, True, True, True]) 131 | """ 132 | assert f0.ndim == 1 133 | mask = ((f0.flip(0) != 0.0).cumsum(0) == 0).flip(0) 134 | return mask 135 | 136 | 137 | def interpolate_f0(f0): 138 | orig_t = np.arange(f0.shape[0]) 139 | f0_interp = f0[:] 140 | ii = f0_interp != 0 141 | if ii.sum() > 1: 142 | f0_interp = interp1d( 143 | orig_t[ii], f0_interp[ii], bounds_error=False, kind="linear", fill_value=0 144 | )(orig_t) 145 | # f0_interp = torch.Tensor(f0_interp).type_as(f0).to(f0.device) 146 | return f0_interp 147 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/text.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017 Keith Ito 2 | 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | # THE SOFTWARE. 20 | 21 | """ from MIT-licensed https://github.com/keithito/tacotron """ 22 | import numpy as np 23 | import re 24 | from . 
import cleaners 25 | from .symbols import symbols 26 | 27 | 28 | # Mappings from symbol to numeric ID and vice versa: 29 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 30 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 31 | 32 | # Regular expression matching text enclosed in curly braces: 33 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 34 | 35 | # Special symbols 36 | SOS_TOK = '' 37 | EOS_TOK = '' 38 | 39 | def text_to_sequence(text, cleaner_names): 40 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 41 | 42 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 43 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 44 | 45 | Args: 46 | text: string to convert to a sequence 47 | cleaner_names: names of the cleaner functions to run the text through 48 | 49 | Returns: 50 | List of integers corresponding to the symbols in the text 51 | ''' 52 | sequence = [] 53 | 54 | # Check for curly braces and treat their contents as ARPAbet: 55 | while len(text): 56 | m = _curly_re.match(text) 57 | if not m: 58 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 59 | break 60 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 61 | sequence += _arpabet_to_sequence(m.group(2)) 62 | text = m.group(3) 63 | 64 | return sequence 65 | 66 | 67 | def sample_code_chunk(code, size): 68 | assert(size > 0 and size <= len(code)) 69 | start = np.random.randint(len(code) - size + 1) 70 | end = start + size 71 | return code[start:end], start, end 72 | 73 | 74 | def code_to_sequence(code, code_dict, collapse_code): 75 | if collapse_code: 76 | prev_c = None 77 | sequence = [] 78 | for c in code: 79 | if c in code_dict and c != prev_c: 80 | sequence.append(code_dict[c]) 81 | prev_c = c 82 | else: 83 | sequence = [code_dict[c] for c in code if c in code_dict] 84 | if len(sequence) < 0.95 * len(code): 85 | print('WARNING : over 5%% codes are OOV') 86 | 87 | return sequence 88 | 89 | 90 | def sequence_to_text(sequence): 91 | '''Converts a sequence of IDs back to a string''' 92 | result = '' 93 | for symbol_id in sequence: 94 | if symbol_id in _id_to_symbol: 95 | s = _id_to_symbol[symbol_id] 96 | # Enclose ARPAbet back in curly braces: 97 | if len(s) > 1 and s[0] == '@': 98 | s = '{%s}' % s[1:] 99 | result += s 100 | return result.replace('}{', ' ') 101 | 102 | 103 | def sequence_to_code(sequence, code_dict): 104 | '''Analogous to sequence_to_text''' 105 | id_to_code = {i: c for c, i in code_dict.items()} 106 | return ' '.join([id_to_code[i] for i in sequence]) 107 | 108 | 109 | def _clean_text(text, cleaner_names): 110 | for name in cleaner_names: 111 | cleaner = getattr(cleaners, name) 112 | if not cleaner: 113 | raise Exception('Unknown cleaner: %s' % name) 114 | text = cleaner(text) 115 | return text 116 | 117 | 118 | def _symbols_to_sequence(symbols): 119 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 120 | 121 | 122 | def _arpabet_to_sequence(text): 123 | return _symbols_to_sequence(['@' + s for s in text.split()]) 124 | 125 | 126 | def _should_keep_symbol(s): 127 | return s in _symbol_to_id and s != '_' and s != '~' 128 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/audio_processing.py: -------------------------------------------------------------------------------- 1 | # BSD 3-Clause License 2 | 3 | # Copyright (c) 2018, NVIDIA Corporation 4 | # All rights 
reserved. 5 | 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | 9 | # * Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | 12 | # * Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | 16 | # * Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 19 | 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | """https://github.com/NVIDIA/tacotron2""" 32 | 33 | import torch 34 | import numpy as np 35 | from scipy.signal import get_window 36 | import librosa.util as librosa_util 37 | 38 | 39 | def window_sumsquare(window, n_frames, hop_length=200, win_length=800, 40 | n_fft=800, dtype=np.float32, norm=None): 41 | """ 42 | # from librosa 0.6 43 | Compute the sum-square envelope of a window function at a given hop length. 44 | 45 | This is used to estimate modulation effects induced by windowing 46 | observations in short-time fourier transforms. 47 | 48 | Parameters 49 | ---------- 50 | window : string, tuple, number, callable, or list-like 51 | Window specification, as in `get_window` 52 | 53 | n_frames : int > 0 54 | The number of analysis frames 55 | 56 | hop_length : int > 0 57 | The number of samples to advance between frames 58 | 59 | win_length : [optional] 60 | The length of the window function. By default, this matches `n_fft`. 61 | 62 | n_fft : int > 0 63 | The length of each analysis frame. 
64 | 65 | dtype : np.dtype 66 | The data type of the output 67 | 68 | Returns 69 | ------- 70 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 71 | The sum-squared envelope of the window function 72 | """ 73 | if win_length is None: 74 | win_length = n_fft 75 | 76 | n = n_fft + hop_length * (n_frames - 1) 77 | x = np.zeros(n, dtype=dtype) 78 | 79 | # Compute the squared window at the desired length 80 | win_sq = get_window(window, win_length, fftbins=True) 81 | win_sq = librosa_util.normalize(win_sq, norm=norm)**2 82 | win_sq = librosa_util.pad_center(win_sq, n_fft) 83 | 84 | # Fill the envelope 85 | for i in range(n_frames): 86 | sample = i * hop_length 87 | x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] 88 | return x 89 | 90 | 91 | def griffin_lim(magnitudes, stft_fn, n_iters=30): 92 | """ 93 | PARAMS 94 | ------ 95 | magnitudes: spectrogram magnitudes 96 | stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods 97 | """ 98 | 99 | angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) 100 | angles = angles.astype(np.float32) 101 | angles = torch.autograd.Variable(torch.from_numpy(angles)) 102 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 103 | 104 | for i in range(n_iters): 105 | _, angles = stft_fn.transform(signal) 106 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 107 | return signal 108 | 109 | 110 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 111 | """ 112 | PARAMS 113 | ------ 114 | C: compression factor 115 | """ 116 | return torch.log(torch.clamp(x, min=clip_val) * C) 117 | 118 | 119 | def dynamic_range_decompression(x, C=1): 120 | """ 121 | PARAMS 122 | ------ 123 | C: compression factor used to compress 124 | """ 125 | return torch.exp(x) / C 126 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/examples/speaker_probing/probes.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
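# ---------------------------------------------------------------------------
# (Editor's sketch; not part of the original files. It refers to the
# dynamic-range helpers in tacotron2/audio_processing.py shown just above.)
# dynamic_range_compression/decompression are exact inverses for inputs above
# clip_val, since exp(log(clamp(x, min=clip_val) * C)) / C == clamp(x, min=clip_val).
# A quick check, assuming the import path mirrors the repository layout:
import torch
from unitspeech.textlesslib.textless.vocoders.tacotron2.audio_processing import (
    dynamic_range_compression,
    dynamic_range_decompression,
)

mag = torch.rand(80, 100) + 1e-3  # fake magnitudes, all above clip_val=1e-5
roundtrip = dynamic_range_decompression(dynamic_range_compression(mag))
assert torch.allclose(roundtrip, mag, atol=1e-5)
# ---------------------------------------------------------------------------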
5 | 6 | 7 | import math 8 | import torch 9 | import torch.nn as nn 10 | from torch.nn import TransformerEncoder, TransformerEncoderLayer 11 | import torch.nn.functional as F 12 | 13 | 14 | class ConstantBaseline(torch.nn.Module): 15 | def __init__(self, total_speakers): 16 | super().__init__() 17 | self.logits = torch.nn.parameter.Parameter(torch.zeros(total_speakers).float()) 18 | 19 | def forward(self, batch): 20 | bsz = batch["units"].size(0) 21 | return ( 22 | F.log_softmax(self.logits, dim=-1) 23 | .unsqueeze(0) 24 | .expand(bsz, self.logits.size(0)) 25 | ) 26 | 27 | 28 | class DiscreteClassifier(torch.nn.Module): 29 | def __init__( 30 | self, 31 | vocab_size, 32 | embedding_size, 33 | n_heads, 34 | hidden_size, 35 | n_layers, 36 | dropout, 37 | pad_value, 38 | total_speakers, 39 | ): 40 | super().__init__() 41 | self.pad_value = pad_value.item() if torch.is_tensor(pad_value) else pad_value 42 | 43 | self.embedding = torch.nn.Embedding(vocab_size, embedding_size) 44 | self.embedding_size = embedding_size 45 | torch.nn.init.normal_( 46 | self.embedding.weight, mean=0, std=self.embedding_size ** -0.5 47 | ) 48 | 49 | self.encoder_classifier = Classifier( 50 | embedding_size, n_heads, hidden_size, n_layers, dropout, total_speakers 51 | ) 52 | 53 | def forward(self, batch): 54 | src = batch["units"] 55 | padding_mask = src == self.pad_value 56 | 57 | src = src.transpose(1, 0) 58 | x = self.embedding(src) * math.sqrt(self.embedding_size) 59 | return self.encoder_classifier(x, padding_mask) 60 | 61 | 62 | class ContinuousClassifier(torch.nn.Module): 63 | def __init__( 64 | self, 65 | input_size, 66 | embedding_size, 67 | n_heads, 68 | hidden_size, 69 | n_layers, 70 | dropout, 71 | pad_value, 72 | total_speakers, 73 | ): 74 | super().__init__() 75 | 76 | self.pad_value = pad_value.item() if torch.is_tensor(pad_value) else pad_value 77 | self.embedding = torch.nn.Linear(input_size, embedding_size) 78 | 79 | self.encoder_classifier = Classifier( 80 | embedding_size, n_heads, hidden_size, n_layers, dropout, total_speakers 81 | ) 82 | 83 | def forward(self, batch): 84 | src = batch["dense"] 85 | padding_mask = batch["units"] == self.pad_value 86 | 87 | src = src.transpose(1, 0) 88 | x = self.embedding(src) # * math.sqrt(self.embedding_size) 89 | return self.encoder_classifier(x, padding_mask) 90 | 91 | 92 | class Classifier(torch.nn.Module): 93 | def __init__( 94 | self, embedding_size, n_heads, hidden_size, n_layers, dropout, total_speakers 95 | ): 96 | super().__init__() 97 | self.pos_encoder = PositionalEncoding(embedding_size, dropout=0.0) 98 | encoder_layers = TransformerEncoderLayer( 99 | embedding_size, n_heads, hidden_size, dropout=dropout 100 | ) 101 | self.transformer_encoder = TransformerEncoder(encoder_layers, n_layers) 102 | 103 | self.embedding_size = embedding_size 104 | self.classifier = torch.nn.Linear(embedding_size, total_speakers) 105 | 106 | def forward(self, x, padding_mask): 107 | x = self.pos_encoder(x) 108 | x = self.transformer_encoder(x, src_key_padding_mask=padding_mask) 109 | 110 | bos_embedding = x[0, :] 111 | logits = self.classifier(bos_embedding) 112 | return F.log_softmax(logits, dim=-1) 113 | 114 | 115 | class PositionalEncoding(nn.Module): 116 | def __init__(self, d_model, dropout=0.1, max_len=5000): 117 | super(PositionalEncoding, self).__init__() 118 | self.dropout = nn.Dropout(p=dropout) 119 | 120 | pe = torch.zeros(max_len, d_model) 121 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 122 | div_term = torch.exp( 123 | torch.arange(0, 
d_model, 2).float() * (-math.log(10000.0) / d_model) 124 | ) 125 | pe[:, 0::2] = torch.sin(position * div_term) 126 | pe[:, 1::2] = torch.cos(position * div_term) 127 | pe = pe.unsqueeze(0).transpose(0, 1) 128 | self.register_buffer("pe", pe) 129 | 130 | def forward(self, x): 131 | assert x.size(0) < self.pe.size(0), f"{x.size()=} {self.pe.size()=}" 132 | 133 | x = x + self.pe[: x.size(0), :] 134 | return self.dropout(x) 135 | -------------------------------------------------------------------------------- /unitspeech/vocoder/activations.py: -------------------------------------------------------------------------------- 1 | # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch 5 | from torch import nn, sin, pow 6 | from torch.nn import Parameter 7 | 8 | 9 | class Snake(nn.Module): 10 | ''' 11 | Implementation of a sine-based periodic activation function 12 | Shape: 13 | - Input: (B, C, T) 14 | - Output: (B, C, T), same shape as the input 15 | Parameters: 16 | - alpha - trainable parameter 17 | References: 18 | - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: 19 | https://arxiv.org/abs/2006.08195 20 | Examples: 21 | >>> a1 = snake(256) 22 | >>> x = torch.randn(256) 23 | >>> x = a1(x) 24 | ''' 25 | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): 26 | ''' 27 | Initialization. 28 | INPUT: 29 | - in_features: shape of the input 30 | - alpha: trainable parameter 31 | alpha is initialized to 1 by default, higher values = higher-frequency. 32 | alpha will be trained along with the rest of your model. 33 | ''' 34 | super(Snake, self).__init__() 35 | self.in_features = in_features 36 | 37 | # initialize alpha 38 | self.alpha_logscale = alpha_logscale 39 | if self.alpha_logscale: # log scale alphas initialized to zeros 40 | self.alpha = Parameter(torch.zeros(in_features) * alpha) 41 | else: # linear scale alphas initialized to ones 42 | self.alpha = Parameter(torch.ones(in_features) * alpha) 43 | 44 | self.alpha.requires_grad = alpha_trainable 45 | 46 | self.no_div_by_zero = 0.000000001 47 | 48 | def forward(self, x): 49 | ''' 50 | Forward pass of the function. 51 | Applies the function to the input elementwise. 52 | Snake ∶= x + 1/a * sin^2 (xa) 53 | ''' 54 | alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] 55 | if self.alpha_logscale: 56 | alpha = torch.exp(alpha) 57 | x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) 58 | 59 | return x 60 | 61 | 62 | class SnakeBeta(nn.Module): 63 | ''' 64 | A modified Snake function which uses separate parameters for the magnitude of the periodic components 65 | Shape: 66 | - Input: (B, C, T) 67 | - Output: (B, C, T), same shape as the input 68 | Parameters: 69 | - alpha - trainable parameter that controls frequency 70 | - beta - trainable parameter that controls magnitude 71 | References: 72 | - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: 73 | https://arxiv.org/abs/2006.08195 74 | Examples: 75 | >>> a1 = snakebeta(256) 76 | >>> x = torch.randn(256) 77 | >>> x = a1(x) 78 | ''' 79 | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): 80 | ''' 81 | Initialization. 
82 | INPUT: 83 | - in_features: shape of the input 84 | - alpha - trainable parameter that controls frequency 85 | - beta - trainable parameter that controls magnitude 86 | alpha is initialized to 1 by default, higher values = higher-frequency. 87 | beta is initialized to 1 by default, higher values = higher-magnitude. 88 | alpha will be trained along with the rest of your model. 89 | ''' 90 | super(SnakeBeta, self).__init__() 91 | self.in_features = in_features 92 | 93 | # initialize alpha 94 | self.alpha_logscale = alpha_logscale 95 | if self.alpha_logscale: # log scale alphas initialized to zeros 96 | self.alpha = Parameter(torch.zeros(in_features) * alpha) 97 | self.beta = Parameter(torch.zeros(in_features) * alpha) 98 | else: # linear scale alphas initialized to ones 99 | self.alpha = Parameter(torch.ones(in_features) * alpha) 100 | self.beta = Parameter(torch.ones(in_features) * alpha) 101 | 102 | self.alpha.requires_grad = alpha_trainable 103 | self.beta.requires_grad = alpha_trainable 104 | 105 | self.no_div_by_zero = 0.000000001 106 | 107 | def forward(self, x): 108 | ''' 109 | Forward pass of the function. 110 | Applies the function to the input elementwise. 111 | SnakeBeta ∶= x + 1/b * sin^2 (xa) 112 | ''' 113 | alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] 114 | beta = self.beta.unsqueeze(0).unsqueeze(-1) 115 | if self.alpha_logscale: 116 | alpha = torch.exp(alpha) 117 | beta = torch.exp(beta) 118 | x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2) 119 | 120 | return x -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/vocoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
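# ---------------------------------------------------------------------------
# (Editor's sketch; not part of the original files. It refers to the Snake and
# SnakeBeta activations in unitspeech/vocoder/activations.py shown above.)
# Both modules act elementwise on (B, C, T) tensors with one alpha (and, for
# SnakeBeta, one beta) parameter per channel; with alpha_logscale=True the
# parameters are stored in log space and exponentiated in forward(). A minimal
# smoke test, assuming the module is importable under this path:
import torch
from unitspeech.vocoder.activations import Snake, SnakeBeta

x = torch.randn(4, 256, 100)                         # (batch, channels, time)
y1 = Snake(in_features=256)(x)                       # x + (1/alpha) * sin^2(alpha * x)
y2 = SnakeBeta(in_features=256, alpha_logscale=True)(x)
assert y1.shape == x.shape and y2.shape == x.shape   # shape-preserving
# ---------------------------------------------------------------------------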
5 | 6 | import torch.nn as nn 7 | import torch 8 | 9 | from .tts_data import TacotronInputDataset 10 | from .model import Tacotron2 11 | from .glow import WaveGlow 12 | from .waveglow_denoiser import Denoiser 13 | from unitspeech.textlesslib.textless.checkpoint_manager import CHECKPOINT_MANAGER 14 | 15 | from typing import Union 16 | 17 | 18 | class TacotronVocoder(nn.Module): 19 | def __init__( 20 | self, 21 | tacotron_model_path: str, 22 | tacotron_dict_path: str, 23 | waveglow_path: str, 24 | max_decoder_steps: int = 2000, 25 | denoiser_strength: float = 0.1, 26 | ): 27 | super().__init__() 28 | self.max_decoder_steps = max_decoder_steps 29 | self.denoiser_strength = denoiser_strength 30 | ( 31 | self.tacotron_model, 32 | self.tacotron_sample_rate, 33 | self.tacotron_hparams, 34 | ) = load_tacotron( 35 | tacotron_model_path=tacotron_model_path, 36 | code_dict_path=tacotron_dict_path, 37 | max_decoder_steps=self.max_decoder_steps, 38 | ) 39 | self.waveglow_model, self.denoiser_model = load_waveglow_standalone( 40 | waveglow_path=waveglow_path, 41 | ) 42 | self.tts_dataset = TacotronInputDataset(self.tacotron_hparams) 43 | self.register_buffer("_float_tensor", torch.tensor([0], dtype=torch.float)) 44 | 45 | def forward(self, units: Union[str, torch.Tensor]) -> torch.Tensor: 46 | if isinstance(units, torch.Tensor): 47 | units_str = " ".join([str(x) for x in units.cpu().tolist()]) 48 | else: 49 | units_str = units 50 | tts_input = self.tts_dataset.get_tensor(units_str) 51 | tts_input = tts_input.to(self.device) 52 | _, _, aud_dn, _ = synthesize_audio( 53 | self.tacotron_model, 54 | self.waveglow_model, 55 | self.denoiser_model, 56 | tts_input.unsqueeze(0), 57 | strength=self.denoiser_strength, 58 | ) 59 | out_audio = aud_dn[0] 60 | return out_audio 61 | 62 | @classmethod 63 | def by_name( 64 | cls, 65 | dense_model_name: str, 66 | quantizer_model_name: str, 67 | vocab_size: int, 68 | max_decoder_steps: int = 2000, 69 | denoiser_strength: float = 0.1, 70 | ): 71 | waveglow_path = CHECKPOINT_MANAGER.get_by_name("waveglow") 72 | 73 | tacotron_checkpoint_name = ( 74 | f"{dense_model_name}-{quantizer_model_name}-{vocab_size}-tacotron" 75 | ) 76 | tacotron_checkpoint_path = CHECKPOINT_MANAGER.get_by_name( 77 | tacotron_checkpoint_name 78 | ) 79 | 80 | checkpoint_codes_name = f"{tacotron_checkpoint_name}-codes" 81 | tacotron_codes_path = CHECKPOINT_MANAGER.get_by_name(checkpoint_codes_name) 82 | 83 | return cls( 84 | tacotron_checkpoint_path, 85 | tacotron_codes_path, 86 | waveglow_path, 87 | max_decoder_steps, 88 | denoiser_strength, 89 | ) 90 | 91 | @property 92 | def device(self) -> torch.device: 93 | return self._float_tensor.device 94 | 95 | @property 96 | def output_sample_rate(self) -> int: 97 | return self.tacotron_sample_rate 98 | 99 | 100 | def synthesize_audio(model, waveglow, denoiser, inp, lab=None, strength=0.0): 101 | assert inp.size(0) == 1 102 | if lab is not None: 103 | lab = torch.LongTensor(1).fill_(lab) 104 | 105 | with torch.inference_mode(): 106 | model_device = next(model.parameters()).device 107 | _, mel, _, ali, has_eos = model.inference( 108 | inp.to(model_device), 109 | lab.to(model_device) if lab is not None else None, 110 | ret_has_eos=True, 111 | ) 112 | aud = waveglow.infer(mel.float(), sigma=0.666) 113 | aud_dn = denoiser(aud.half(), strength=strength).squeeze(1) 114 | return mel, aud, aud_dn, has_eos 115 | 116 | 117 | def load_tacotron(tacotron_model_path, code_dict_path, max_decoder_steps): 118 | ckpt_dict = torch.load(tacotron_model_path, 
map_location=torch.device("cpu")) 119 | hparams = ckpt_dict["hparams"] 120 | hparams.code_dict = code_dict_path 121 | hparams.max_decoder_steps = max_decoder_steps 122 | sr = hparams.sampling_rate 123 | model = Tacotron2(hparams) 124 | model.load_state_dict(ckpt_dict["model_dict"]) 125 | model = model.half() 126 | model = model.eval() 127 | return model, sr, hparams 128 | 129 | 130 | def load_waveglow_standalone(waveglow_path, device="cpu"): 131 | ckpt_dict = torch.load(waveglow_path, map_location=torch.device("cpu")) 132 | hparams = ckpt_dict["hparams"] 133 | waveglow = WaveGlow(**hparams) 134 | waveglow.load_state_dict(ckpt_dict["model_dict"]) 135 | waveglow = waveglow.eval() 136 | waveglow = waveglow.to(device) 137 | denoiser = Denoiser(waveglow) 138 | denoiser = denoiser.eval() 139 | return waveglow, denoiser 140 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/examples/gslm/sample.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | import logging 7 | import random 8 | from typing import Optional 9 | 10 | from fairseq import utils 11 | import numpy as np 12 | import torch 13 | import torchaudio 14 | 15 | from unitspeech.textlesslib.textless.data.speech_encoder import SpeechEncoder 16 | from unitspeech.textlesslib.textless.vocoders.tacotron2.vocoder import TacotronVocoder 17 | from sampler import UnitLanguageModelSampler 18 | 19 | log_format = "[%(asctime)s] [%(levelname)s]: %(message)s" 20 | logging.basicConfig(format=log_format, level=logging.INFO) 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | class GslmPipeline: 25 | def __init__(self, args): 26 | logger.info("Initializing the GSLM pipeline.") 27 | self.device = torch.device("cuda") 28 | if args.seed is not None: 29 | random.seed(args.seed) 30 | np.random.seed(args.seed) 31 | utils.set_torch_seed(args.seed) 32 | 33 | self.temperature = args.temperature 34 | self.tokens_framerate = 0.02 # HuBERT framerate 35 | self.max_length = 1000 36 | self.trim_trailing_audio_frames = 200 37 | self.sampling_kwargs = { 38 | "temperature": self.temperature, 39 | "sampling": True, 40 | "beam": 1, 41 | "prefix_size": -1, 42 | "max_len_a": 0.0, 43 | "max_len_b": self.max_length, 44 | } 45 | logger.info("... Loading the language model") 46 | self.sampler = UnitLanguageModelSampler.from_pretrained( 47 | args.language_model_data_dir, 48 | ) 49 | logger.info("=> Done!") 50 | logger.info("... Loading the encoder") 51 | 52 | self.speech_encoder = SpeechEncoder.by_name( 53 | dense_model_name="hubert-base-ls960", 54 | quantizer_model_name="kmeans", 55 | vocab_size=args.vocab_size, 56 | need_f0=False, 57 | deduplicate=True, 58 | f0_normalizer=None, 59 | f0_quantizer=None, 60 | ).cuda() 61 | 62 | logger.info("=> Done!") 63 | logger.info("... 
Loading the vocoder") 64 | self.resynthesizer = TacotronVocoder.by_name( 65 | dense_model_name="hubert-base-ls960", 66 | quantizer_model_name="kmeans", 67 | vocab_size=args.vocab_size, 68 | ).cuda() 69 | 70 | logger.info("=> Done!") 71 | logger.info("Pipeline initialized!") 72 | 73 | def __call__(self, raw_audio, sample_rate): 74 | raw_audio = self.speech_encoder.maybe_resample(raw_audio, sample_rate) 75 | 76 | sample = self.speech_encoder(raw_audio) 77 | units = sample["units"] 78 | duration = sample["durations"].sum().item() 79 | prefix_duration = self.tokens_framerate * duration 80 | target_duration = self.tokens_framerate * ( 81 | self.max_length - self.trim_trailing_audio_frames 82 | ) 83 | 84 | unit_str = " ".join(list(map(str, units.tolist()))) 85 | sampled_unit_str = self.sampler.sample([unit_str], **self.sampling_kwargs)[0] 86 | 87 | audio = self.resynthesizer(sampled_unit_str) 88 | audio = audio[ 89 | : int( 90 | self.resynthesizer.output_sample_rate 91 | * (prefix_duration + target_duration) 92 | ) 93 | ] 94 | 95 | return audio 96 | 97 | @property 98 | def output_sample_rate(self) -> int: 99 | return self.resynthesizer.output_sample_rate 100 | 101 | 102 | def main(args): 103 | pipeline = GslmPipeline(args) 104 | 105 | audio, sample_rate = torchaudio.load(args.input_file) 106 | 107 | if audio.ndim == 2: 108 | audio = audio.mean(0) 109 | 110 | if args.prompt_duration_sec: 111 | prompt = int(args.prompt_duration_sec * sample_rate) 112 | audio = audio[:prompt] 113 | 114 | generated_audio = pipeline(audio, sample_rate) 115 | 116 | torchaudio.save( 117 | args.output_file, 118 | generated_audio.cpu().unsqueeze(0), 119 | pipeline.output_sample_rate, 120 | ) 121 | 122 | 123 | def cli_main(): 124 | import argparse 125 | 126 | parser = argparse.ArgumentParser() 127 | parser.add_argument( 128 | "--input-file", 129 | type=str, 130 | required=True, 131 | help="Input filepath", 132 | ) 133 | parser.add_argument( 134 | "--language-model-data-dir", 135 | type=str, 136 | required=True, 137 | help="Path to language model dataset config path", 138 | ) 139 | parser.add_argument( 140 | "--temperature", 141 | type=float, 142 | default=0.7, 143 | help="Temperature: should be above 0.0", 144 | ) 145 | parser.add_argument( 146 | "--prompt-duration-sec", 147 | type=float, 148 | default=None, 149 | help="Cutting prompts to a maximum duration", 150 | ) 151 | parser.add_argument( 152 | "--output-file", type=str, help="Path where generated metadata is saved" 153 | ) 154 | parser.add_argument("--seed", type=int, default=0) 155 | parser.add_argument( 156 | "--vocab-size", 157 | type=int, 158 | choices=[50, 100, 200], 159 | default=100, 160 | help="Vocabulary size used", 161 | ) 162 | 163 | args = parser.parse_args() 164 | 165 | main(args) 166 | 167 | 168 | if __name__ == "__main__": 169 | cli_main() 170 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/tools/distributed_transcribe/transcribe.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
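# ---------------------------------------------------------------------------
# (Editor's usage sketch; not part of the original files. It refers to
# examples/gslm/sample.py shown above, not to transcribe.py.)
# Based on that script's argparse definitions, a speech-continuation run could
# be launched roughly as follows; the file paths are placeholders, and the
# exact working directory / PYTHONPATH needed for its imports is not pinned
# down here:
import subprocess

subprocess.run(
    [
        "python", "unitspeech/textlesslib/examples/gslm/sample.py",
        "--input-file", "prompt.wav",
        "--language-model-data-dir", "/path/to/ulm_data_dir",
        "--output-file", "continuation.wav",
        "--vocab-size", "100",           # must be one of 50 / 100 / 200
        "--prompt-duration-sec", "3.0",  # optional: trim the prompt to 3 s
        "--temperature", "0.7",
    ],
    check=True,
)
# ---------------------------------------------------------------------------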
5 | 6 | 7 | import torch.distributed as distr 8 | import torch 9 | import pathlib 10 | from data_handler import ManifestDataset 11 | from distributed import init_distributed_context 12 | 13 | import logging 14 | 15 | from unitspeech.textlesslib.textless.data.speech_encoder import SpeechEncoder 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | SAMPLING_RATE = 16_000 20 | 21 | 22 | def get_args(): 23 | import argparse 24 | 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument( 27 | "--vocab_size", 28 | default=100, 29 | type=int, 30 | help="Quantization codebook vocabulary size", 31 | ) 32 | parser.add_argument( 33 | "--dense_model", default="hubert-base-ls960", help="Dense model to be used" 34 | ) 35 | parser.add_argument( 36 | "--quantizer_model", default="kmeans", help="Quantizer model to be used" 37 | ) 38 | 39 | parser.add_argument( 40 | "--manifest", required=True, help="Path to the dataset manifest file" 41 | ) 42 | parser.add_argument( 43 | "--output", 44 | required=True, 45 | help="Path to the output files. Pseudo-units and duration (if requested) streams will be stored in files with .units and .durations suffixes, respectively", 46 | ) 47 | parser.add_argument( 48 | "--deduplicate", 49 | action="store_true", 50 | help="if set, consecutive repeats of the same pseudo-unit are collapsed ('1 2 2 2 3' becomes '1 2 3')", 51 | ) 52 | parser.add_argument( 53 | "--durations", 54 | action="store_true", 55 | help="if set, the token durations stream is produced", 56 | ) 57 | parser.add_argument( 58 | "--f0s", 59 | action="store_true", 60 | help="if set, the F0 stream is produced", 61 | ) 62 | parser.add_argument( 63 | "--preserve_name", 64 | action="store_true", 65 | help="If set, the transcript contains names of the audio files", 66 | ) 67 | parser.add_argument( 68 | "--separator", 69 | type=str, 70 | default=" ", 71 | help="Separator between pseudo-unit tokens", 72 | ) 73 | 74 | parser.add_argument("--distributed_port", type=int, default=58554) 75 | 76 | args = parser.parse_args() 77 | logger.info(f"Launched with args: {args}") 78 | 79 | return args 80 | 81 | 82 | def worker_shard_path(fname, suffix, worker_id) -> pathlib.Path: 83 | return pathlib.Path(fname).with_suffix(f".{suffix}_partial_{worker_id}") 84 | 85 | 86 | def transcribe(args, rank, world_size): 87 | dataset = ManifestDataset(args.manifest) 88 | 89 | speech_encoder = SpeechEncoder.by_name( 90 | dense_model_name=args.dense_model, 91 | quantizer_model_name=args.quantizer_model, 92 | vocab_size=args.vocab_size, 93 | deduplicate=args.deduplicate, 94 | need_f0=args.f0s, 95 | ).cuda() 96 | 97 | output_files = { 98 | "units": open(worker_shard_path(args.output, "units", rank), "w"), 99 | "durations": None 100 | if not args.durations 101 | else open(worker_shard_path(args.output, "durations", rank), "w"), 102 | "f0s": None 103 | if not args.f0s 104 | else open(worker_shard_path(args.output, "f0s", rank), "w"), 105 | } 106 | 107 | # DistributedSampler will pad the dataloader to be divisible 108 | # by the number of workers, which we do not want so we iterate directly 109 | for i in range(rank, len(dataset), world_size): 110 | waveform, name = dataset[i] 111 | encoded = speech_encoder(waveform) 112 | 113 | stream_names = ["units", "durations"] 114 | if args.f0s: 115 | stream_names += ["f0s"] 116 | 117 | for stream_name in stream_names: 118 | stream = encoded[stream_name] 119 | stream = [str(int(x)) for x in stream.tolist()] 120 | stream = args.separator.join(stream) 121 | 122 | stream = f"{name}\t{stream}" if 
args.preserve_name else stream 123 | print(stream, file=output_files[stream_name]) 124 | 125 | for fout in output_files.values(): 126 | if fout: 127 | fout.close() 128 | 129 | 130 | def main(args): 131 | context = init_distributed_context(args.distributed_port) 132 | logger.info(f"Distributed context {context}") 133 | 134 | n_gpus = torch.cuda.device_count() 135 | with torch.cuda.device(context.local_rank % n_gpus): 136 | transcribe(args, context.rank, context.world_size) 137 | 138 | if context.world_size > 1: 139 | distr.barrier() 140 | 141 | if context.is_leader: 142 | generated_streams = ["units"] 143 | if args.durations: 144 | generated_streams += ["durations"] 145 | if args.f0s: 146 | generated_streams += ["f0s"] 147 | 148 | for stream_name in generated_streams: 149 | merge_files(args.output, stream_name, context.world_size) 150 | 151 | 152 | def merge_files(full_output, suffix, n_workers): 153 | output = full_output + f".{suffix}" 154 | with open(output, "w") as full: 155 | for worker_id in range(n_workers): 156 | partial_path = worker_shard_path(full_output, suffix, worker_id) 157 | partial = open(partial_path, "r") 158 | for line in partial: 159 | print(line.strip(), file=full) 160 | partial_path.unlink() 161 | 162 | 163 | if __name__ == "__main__": 164 | args = get_args() 165 | main(args) 166 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/examples/resynthesis/resynth.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | import torchaudio 7 | from unitspeech.textlesslib.textless import dispatch_dense_model, dispatch_quantizer 8 | from unitspeech.textlesslib.textless.data.speech_encoder import SpeechEncoder 9 | from unitspeech.textlesslib.textless.vocoders.tacotron2.vocoder import TacotronVocoder 10 | 11 | 12 | def get_args(): 13 | import argparse 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument( 17 | "--dense_model_name", 18 | type=str, 19 | default="hubert-base-ls960", 20 | choices=["hubert-base-ls960", "cpc-big-ll6k"], 21 | help="Dense representation model", 22 | ) 23 | parser.add_argument( 24 | "--vocab_size", 25 | type=int, 26 | default=50, 27 | help="Vocabulary size used for resynthesis", 28 | ) 29 | parser.add_argument( 30 | "--input", 31 | required=True, 32 | help="Path to the input audio file", 33 | ) 34 | parser.add_argument( 35 | "--output", 36 | required=True, 37 | help="Path to the output audio file.", 38 | ) 39 | parser.add_argument( 40 | "--decoder_steps", 41 | type=int, 42 | default=100, 43 | help="Maximal number of decoder steps", 44 | ) 45 | 46 | args = parser.parse_args() 47 | return args 48 | 49 | 50 | def get_compression_rate(dense_model, units, wave, vocab_size, sample_rate): 51 | import numpy as np 52 | 53 | assert units.ndim == 1 54 | assert wave.ndim == 1 55 | 56 | time_in_seconds = wave.numel() / sample_rate 57 | 58 | uniform_token_entropy = np.log2(vocab_size) 59 | # calculated on LL-6k train 60 | unigram_token_entropy = { 61 | "hubert-base-ls960": { 62 | 50: 5.458528917634601, 63 | 100: 6.44513268276806, 64 | 200: 7.477069233162813, 65 | }, 66 | "cpc-big-ll6k": { 67 | 50: 5.428271158461133, 68 | 100: 6.413083187885448, 69 | 200: 7.44253841579776, 70 | }, 71 | }[dense_model][vocab_size] 72 | 73 | uniform_bps = uniform_token_entropy * units.size(0) / 
time_in_seconds 74 | unigram_entropy = unigram_token_entropy * units.size(0) / time_in_seconds 75 | 76 | return uniform_bps, unigram_entropy 77 | 78 | 79 | def main(args): 80 | dense_model_name = args.dense_model_name 81 | quantizer_name = "kmeans" 82 | 83 | # We can build a speech encoder module using names of pre-trained dense and quantizer models. 84 | # The call below will download appropriate checkpoints as needed behind the scenes 85 | encoder = SpeechEncoder.by_name( 86 | dense_model_name=dense_model_name, 87 | quantizer_model_name=quantizer_name, 88 | vocab_size=args.vocab_size, 89 | need_f0=False, 90 | deduplicate=True, 91 | f0_normalizer=None, 92 | f0_quantizer=None, 93 | ).cuda() 94 | 95 | # Alternatively, we can pass dense/quantizer models directly. 96 | # Here, we'll look up the same models as above, but generally those 97 | # could be any other models. 98 | dense_model = dispatch_dense_model(dense_model_name) 99 | quantizer_model = dispatch_quantizer( 100 | dense_model_name, quantizer_name, args.vocab_size 101 | ) 102 | 103 | # .. and use them when initializing the encoder. Same constructor can be used to when we want 104 | # to use models other than pre-defined. 105 | encoder = SpeechEncoder( 106 | dense_model=dense_model, 107 | quantizer_model=quantizer_model, 108 | need_f0=False, 109 | deduplicate=True, 110 | f0_normalizer=None, 111 | f0_quantizer=None, 112 | ).cuda() 113 | 114 | # now let's load an audio example 115 | waveform, input_sample_rate = torchaudio.load(args.input) 116 | if waveform.ndim == 2: 117 | waveform = waveform.mean(dim=0) 118 | 119 | waveform = encoder.maybe_resample(waveform, input_sample_rate) 120 | 121 | # now and convert it in a stream of deduplicated units (as in GSLM) 122 | encoded = encoder(waveform.cuda()) 123 | # encoded is a dict with keys ('dense', 'units', 'durations'). It can also contain 'f0' if SpeechEncoder 124 | # was initialized with need_f0=True flag. 125 | units = encoded[ 126 | "units" 127 | ] # tensor([71, 12, 57, 12, 57, 12, 57, 12, ...], device='cuda:0', dtype=torch.int32) 128 | 129 | # as with encoder, we can setup vocoder by specifying names of pretrained models 130 | # or by passing checkpoint paths directly. The dense/quantizer models are not invokes, 131 | # we just use their names as an index. 132 | vocoder = TacotronVocoder.by_name( 133 | dense_model_name, 134 | quantizer_name, 135 | args.vocab_size, 136 | ).cuda() 137 | 138 | # now we turn those units back into the audio. 
139 | audio = vocoder(units) 140 | 141 | # save the audio 142 | torchaudio.save( 143 | args.output, audio.cpu().float().unsqueeze(0), vocoder.output_sample_rate 144 | ) 145 | 146 | uniform_bps, learned_bps = get_compression_rate( 147 | dense_model_name, units, waveform, args.vocab_size, encoder.expected_sample_rate 148 | ) 149 | 150 | print( 151 | f"Audio of length {round(waveform.size(0) / 16_000, 1)} seconds represented as {units.numel()} tokens" 152 | ) 153 | print( 154 | f"\tAssuming uniform token distribution: {round(uniform_bps, 1)} bits per second" 155 | ) 156 | print( 157 | f"\tAssuming unigram token distribution estimated on LL-6K train: {round(learned_bps, 1)} bits per second" 158 | ) 159 | 160 | 161 | if __name__ == "__main__": 162 | args = get_args() 163 | main(args) 164 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/layers.py: -------------------------------------------------------------------------------- 1 | # BSD 3-Clause License 2 | 3 | # Copyright (c) 2018, NVIDIA Corporation 4 | # All rights reserved. 5 | 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | 9 | # * Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | 12 | # * Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | 16 | # * Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 19 | 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
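# ---------------------------------------------------------------------------
# (Editor's note; not part of the original files. It makes the bitrate math in
# examples/resynthesis/resynth.py above concrete.) get_compression_rate()
# computes bits/s as token_entropy * num_units / duration. For vocab_size=100
# the uniform entropy is log2(100) ~= 6.64 bits/token, so a hypothetical
# 10-second clip encoded as 300 deduplicated units would give roughly
#   uniform:  6.64  * 300 / 10 ~= 199 bits/s
#   unigram:  6.445 * 300 / 10 ~= 193 bits/s   (hubert-base-ls960 table value)
# The 300-unit count is illustrative only; real counts depend on the audio and
# on deduplication.
# ---------------------------------------------------------------------------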
30 | 31 | """https://github.com/NVIDIA/tacotron2""" 32 | 33 | import torch 34 | from librosa.filters import mel as librosa_mel_fn 35 | from .audio_processing import dynamic_range_compression 36 | from .audio_processing import dynamic_range_decompression 37 | from .stft import STFT 38 | from .utils import get_mask_from_lengths 39 | 40 | 41 | class LinearNorm(torch.nn.Module): 42 | def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'): 43 | super(LinearNorm, self).__init__() 44 | self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias) 45 | 46 | torch.nn.init.xavier_uniform_( 47 | self.linear_layer.weight, 48 | gain=torch.nn.init.calculate_gain(w_init_gain)) 49 | 50 | def forward(self, x): 51 | return self.linear_layer(x) 52 | 53 | 54 | class ConvNorm(torch.nn.Module): 55 | def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, 56 | padding=None, dilation=1, bias=True, w_init_gain='linear'): 57 | super(ConvNorm, self).__init__() 58 | if padding is None: 59 | assert(kernel_size % 2 == 1) 60 | padding = int(dilation * (kernel_size - 1) / 2) 61 | 62 | self.conv = torch.nn.Conv1d(in_channels, out_channels, 63 | kernel_size=kernel_size, stride=stride, 64 | padding=padding, dilation=dilation, 65 | bias=bias) 66 | 67 | torch.nn.init.xavier_uniform_( 68 | self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) 69 | 70 | def forward(self, signal): 71 | conv_signal = self.conv(signal) 72 | return conv_signal 73 | 74 | 75 | class GlobalAvgPool(torch.nn.Module): 76 | def __init__(self): 77 | super(GlobalAvgPool, self).__init__() 78 | 79 | def forward(self, x, lengths=None): 80 | """Average pooling across time steps (dim=1) with optionally lengths. 81 | Args: 82 | x: torch.Tensor of shape (N, T, ...) 83 | lengths: None or torch.Tensor of shape (N,) 84 | dim: dimension to pool 85 | """ 86 | if lengths is None: 87 | return x.mean(dim=1, keepdim=False) 88 | else: 89 | mask = get_mask_from_lengths(lengths).type(x.type()).to(x.device) 90 | mask_shape = list(mask.size()) + [1 for _ in range(x.ndimension()-2)] 91 | mask = mask.reshape(*mask_shape) 92 | numer = (x * mask).sum(dim=1, keepdim=False) 93 | denom = mask.sum(dim=1, keepdim=False) 94 | return numer / denom 95 | 96 | 97 | class TacotronSTFT(torch.nn.Module): 98 | def __init__(self, filter_length=1024, hop_length=256, win_length=1024, 99 | n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0, 100 | mel_fmax=8000.0): 101 | super(TacotronSTFT, self).__init__() 102 | self.n_mel_channels = n_mel_channels 103 | self.sampling_rate = sampling_rate 104 | self.stft_fn = STFT(filter_length, hop_length, win_length) 105 | mel_basis = librosa_mel_fn( 106 | sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax) 107 | mel_basis = torch.from_numpy(mel_basis).float() 108 | self.register_buffer('mel_basis', mel_basis) 109 | 110 | def spectral_normalize(self, magnitudes): 111 | output = dynamic_range_compression(magnitudes) 112 | return output 113 | 114 | def spectral_de_normalize(self, magnitudes): 115 | output = dynamic_range_decompression(magnitudes) 116 | return output 117 | 118 | def mel_spectrogram(self, y): 119 | """Computes mel-spectrograms from a batch of waves 120 | PARAMS 121 | ------ 122 | y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] 123 | 124 | RETURNS 125 | ------- 126 | mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) 127 | """ 128 | assert(torch.min(y.data) >= -1) 129 | assert(torch.max(y.data) <= 1) 130 | 131 | magnitudes, phases = self.stft_fn.transform(y) 132 | 
magnitudes = magnitudes.data 133 | mel_output = torch.matmul(self.mel_basis, magnitudes) 134 | mel_output = self.spectral_normalize(mel_output) 135 | return mel_output 136 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/stft.py: -------------------------------------------------------------------------------- 1 | """ 2 | BSD 3-Clause License 3 | 4 | Copyright (c) 2017, Prem Seetharaman 5 | All rights reserved. 6 | 7 | * Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, this 14 | list of conditions and the following disclaimer in the 15 | documentation and/or other materials provided with the distribution. 16 | 17 | * Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived from this 19 | software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
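# ---------------------------------------------------------------------------
# (Editor's sketch; not part of the original files. It refers to TacotronSTFT
# in tacotron2/layers.py shown above.) mel_spectrogram() expects a (B, T)
# waveform scaled to [-1, 1] and returns a log-compressed (B, 80, n_frames)
# mel. The import path and the librosa version (old enough to accept the
# positional mel-filter call in layers.py) are assumptions:
import torch
from unitspeech.textlesslib.textless.vocoders.tacotron2.layers import TacotronSTFT

stft = TacotronSTFT()                 # defaults: 1024/256/1024, 80 mels, 22.05 kHz
wav = torch.rand(1, 22050) * 2 - 1    # one second of fake audio in [-1, 1)
mel = stft.mel_spectrogram(wav)       # -> (1, 80, n_frames)
# ---------------------------------------------------------------------------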
31 | """ 32 | 33 | import torch 34 | import numpy as np 35 | import torch.nn.functional as F 36 | from torch.autograd import Variable 37 | from scipy.signal import get_window 38 | from librosa.util import pad_center, tiny 39 | from .audio_processing import window_sumsquare 40 | 41 | 42 | class STFT(torch.nn.Module): 43 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" 44 | def __init__(self, filter_length=800, hop_length=200, win_length=800, 45 | window='hann'): 46 | super(STFT, self).__init__() 47 | self.filter_length = filter_length 48 | self.hop_length = hop_length 49 | self.win_length = win_length 50 | self.window = window 51 | self.forward_transform = None 52 | scale = self.filter_length / self.hop_length 53 | fourier_basis = np.fft.fft(np.eye(self.filter_length)) 54 | 55 | cutoff = int((self.filter_length / 2 + 1)) 56 | fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), 57 | np.imag(fourier_basis[:cutoff, :])]) 58 | 59 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) 60 | inverse_basis = torch.FloatTensor( 61 | np.linalg.pinv(scale * fourier_basis).T[:, None, :]) 62 | 63 | if window is not None: 64 | assert(filter_length >= win_length) 65 | # get window and zero center pad it to filter_length 66 | fft_window = get_window(window, win_length, fftbins=True) 67 | fft_window = pad_center(fft_window, filter_length) 68 | fft_window = torch.from_numpy(fft_window).float() 69 | 70 | # window the bases 71 | forward_basis *= fft_window 72 | inverse_basis *= fft_window 73 | 74 | self.register_buffer('forward_basis', forward_basis.float()) 75 | self.register_buffer('inverse_basis', inverse_basis.float()) 76 | 77 | def transform(self, input_data): 78 | num_batches = input_data.size(0) 79 | num_samples = input_data.size(1) 80 | 81 | self.num_samples = num_samples 82 | 83 | # similar to librosa, reflect-pad the input 84 | input_data = input_data.view(num_batches, 1, num_samples) 85 | input_data = F.pad( 86 | input_data.unsqueeze(1), 87 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), 88 | mode='reflect') 89 | input_data = input_data.squeeze(1) 90 | 91 | forward_transform = F.conv1d( 92 | input_data, 93 | Variable(self.forward_basis, requires_grad=False), 94 | stride=self.hop_length, 95 | padding=0) 96 | 97 | cutoff = int((self.filter_length / 2) + 1) 98 | real_part = forward_transform[:, :cutoff, :] 99 | imag_part = forward_transform[:, cutoff:, :] 100 | 101 | magnitude = torch.sqrt(real_part**2 + imag_part**2) 102 | phase = torch.autograd.Variable( 103 | torch.atan2(imag_part.data, real_part.data)) 104 | 105 | return magnitude, phase 106 | 107 | def inverse(self, magnitude, phase): 108 | recombine_magnitude_phase = torch.cat( 109 | [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) 110 | 111 | inverse_transform = F.conv_transpose1d( 112 | recombine_magnitude_phase, 113 | Variable(self.inverse_basis, requires_grad=False), 114 | stride=self.hop_length, 115 | padding=0) 116 | 117 | if self.window is not None: 118 | window_sum = window_sumsquare( 119 | self.window, magnitude.size(-1), hop_length=self.hop_length, 120 | win_length=self.win_length, n_fft=self.filter_length, 121 | dtype=np.float32) 122 | # remove modulation effects 123 | approx_nonzero_indices = torch.from_numpy( 124 | np.where(window_sum > tiny(window_sum))[0]) 125 | window_sum = torch.autograd.Variable( 126 | torch.from_numpy(window_sum), requires_grad=False) 127 | window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum 128 | 
inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices] 129 | 130 | # scale by hop ratio 131 | inverse_transform *= float(self.filter_length) / self.hop_length 132 | 133 | inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] 134 | inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):] 135 | 136 | return inverse_transform 137 | 138 | def forward(self, input_data): 139 | self.magnitude, self.phase = self.transform(input_data) 140 | reconstruction = self.inverse(self.magnitude, self.phase) 141 | return reconstruction 142 | -------------------------------------------------------------------------------- /scripts/voice_conversion.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import librosa 4 | import os 5 | from scipy.io.wavfile import write 6 | import torch 7 | import torchaudio 8 | from transformers import HubertModel 9 | 10 | from unitspeech.unitspeech import UnitSpeech 11 | from unitspeech.encoder import Encoder 12 | from unitspeech.text import symbols 13 | from unitspeech.util import HParams, fix_len_compatibility, sequence_mask 14 | from unitspeech.vocoder.env import AttrDict 15 | from unitspeech.vocoder.models import BigVGAN 16 | 17 | 18 | @torch.no_grad() 19 | def voice_conversion( 20 | args, contentvec_encoder, decoder, contentvec, contentvec_length, mel_length, spk_emb, num_downsamplings_in_unet 21 | ): 22 | cond_x, x, x_mask = contentvec_encoder(contentvec, contentvec_length) 23 | cond_y = cond_x 24 | y_lengths = torch.LongTensor([contentvec_length]).to(contentvec.device) 25 | 26 | encoder_outputs = torch.nn.functional.interpolate( 27 | cond_y, size=mel_length, mode='linear' 28 | ) 29 | y_max_length = mel_length 30 | y_max_length_ = fix_len_compatibility(mel_length, num_downsamplings_in_unet) 31 | cond_y = torch.cat([encoder_outputs, torch.zeros_like(encoder_outputs)[:, :, :y_max_length_ - mel_length]], dim=-1) 32 | y_mask = sequence_mask(torch.LongTensor([mel_length]).to(y_lengths.device), y_max_length_)\ 33 | .unsqueeze(1).to(x_mask.dtype) 34 | 35 | z = torch.randn_like(cond_y, device=cond_y.device) 36 | 37 | # Generate sample by performing reverse dynamics 38 | decoder_outputs = decoder( 39 | z, y_mask, cond_y, spk_emb, args.diffusion_step, 40 | text_gradient_scale=args.text_gradient_scale, spk_gradient_scale=args.spk_gradient_scale 41 | ) 42 | decoder_outputs = decoder_outputs[:, :, :y_max_length] 43 | return decoder_outputs 44 | 45 | 46 | class HubertModelWithFinalProj(HubertModel): 47 | def __init__(self, config): 48 | super().__init__(config) 49 | 50 | # The final projection layer is only used for backward compatibility. 51 | # Following https://github.com/auspicious3000/contentvec/issues/6 52 | # Remove this layer is necessary to achieve the desired outcome. 53 | self.final_proj = torch.nn.Linear(config.hidden_size, config.classifier_proj_size) 54 | 55 | 56 | def main(args, hps): 57 | # Load the source audio and extract the contentvec. 
58 | contentvec_extractor = HubertModelWithFinalProj.from_pretrained("lengyue233/content-vec-best") 59 | _ = contentvec_extractor.cuda().eval() 60 | 61 | wav, sr = librosa.load(args.source_path) 62 | wav = torch.FloatTensor(wav).unsqueeze(0) 63 | resample_fn = torchaudio.transforms.Resample(sr, 16000).to("cuda") 64 | wav = wav.cuda() 65 | mel_length = wav.shape[-1] // hps.data.hop_length 66 | 67 | wav = resample_fn(wav) 68 | contentvec = contentvec_extractor(wav)["last_hidden_state"] 69 | 70 | # Initialize & load model 71 | contentvec_encoder = Encoder( 72 | n_vocab=len(symbols) + 1, 73 | n_feats=hps.data.n_feats, 74 | **hps.encoder 75 | ) 76 | 77 | contentvec_encoder_dict = torch.load(args.encoder_path, map_location=lambda loc, storage: loc) 78 | contentvec_encoder.load_state_dict(contentvec_encoder_dict['model']) 79 | _ = contentvec_encoder.cuda().eval() 80 | 81 | unitspeech = UnitSpeech( 82 | n_feats=hps.data.n_feats, 83 | **hps.decoder 84 | ) 85 | 86 | decoder_dict = torch.load(args.decoder_path, map_location=lambda loc, storage: loc) 87 | unitspeech.load_state_dict(decoder_dict['model']) 88 | _ = unitspeech.cuda().train() 89 | 90 | # Initialize & load vocoder. 91 | with open(hps.train.vocoder_config_path) as f: 92 | h = AttrDict(json.load(f)) 93 | vocoder = BigVGAN(h) 94 | vocoder.load_state_dict(torch.load(hps.train.vocoder_ckpt_path, map_location=lambda loc, storage: loc)['generator']) 95 | _ = vocoder.cuda().eval() 96 | vocoder.remove_weight_norm() 97 | 98 | # Prepare input 99 | contentvec = contentvec.cuda() 100 | contentvec_length = torch.LongTensor([contentvec.shape[1]]).cuda() 101 | 102 | spk_emb = decoder_dict['spk_emb'].cuda() 103 | 104 | # Load the normalization parameters for mel-spectrogram normalization. 105 | mel_min = decoder_dict['mel_min'].cuda() 106 | mel_max = decoder_dict['mel_max'].cuda() 107 | 108 | with torch.no_grad(): 109 | mel_generated = voice_conversion( 110 | args, contentvec_encoder, unitspeech, 111 | contentvec, contentvec_length, mel_length, spk_emb, len(hps.decoder.dim_mults) - 1 112 | ) 113 | 114 | mel_generated = ((mel_generated + 1) / 2 * (mel_max.to(mel_generated.device) - mel_min.to(mel_generated.device)) 115 | + mel_min.to(mel_generated.device)) 116 | 117 | audio_generated = vocoder.forward(mel_generated).cpu().squeeze().clamp(-1, 1).numpy() 118 | 119 | if "/" in args.generated_sample_path: 120 | os.makedirs(os.path.dirname(args.generated_sample_path), exist_ok=True) 121 | write(args.generated_sample_path, hps.data.sampling_rate, audio_generated) 122 | 123 | 124 | if __name__ == "__main__": 125 | parser = argparse.ArgumentParser() 126 | parser.add_argument('--encoder_path', type=str, default="unitspeech/checkpoints/contentvec_encoder.pt", 127 | help='Path of the text encoder checkpoint.') 128 | parser.add_argument('--decoder_path', type=str, default="unitspeech/outputs/finetuned_decoder.pt", 129 | help='Path of the finetuned decoder checkpoint.') 130 | parser.add_argument('--config_path', type=str, default="unitspeech/checkpoints/voice-conversion.json", 131 | help='Path to the configuration file for voice conversion.') 132 | parser.add_argument('--generated_sample_path', type=str, default="unitspeech/outputs/output_vc.wav", 133 | help='The path to save the generated audio.') 134 | 135 | parser.add_argument('--source_path', type=str, required=True, 136 | help='The source audio file path for voice conversion.') 137 | parser.add_argument('--text_gradient_scale', type=float, default=1.0, 138 | help='Gradient scale of classifier-free guidance (cfg) 
for text condition. (0.0: wo cfg)') 139 | parser.add_argument('--spk_gradient_scale', type=float, default=1.0, 140 | help='Gradient scale of classifier-free guidance (cfg) for speaker condition. (0.0: wo cfg)') 141 | parser.add_argument('--diffusion_step', type=int, default=50, 142 | help='The number of iterations for sampling in the diffusion model.') 143 | args = parser.parse_args() 144 | 145 | with open(args.config_path, "r") as f: 146 | data = f.read() 147 | config = json.loads(data) 148 | 149 | hps = HParams(**config) 150 | 151 | main(args, hps) -------------------------------------------------------------------------------- /unitspeech/vocoder/meldataset.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import math 4 | import os 5 | import random 6 | import torch 7 | import torch.utils.data 8 | import numpy as np 9 | from librosa.util import normalize 10 | from scipy.io.wavfile import read 11 | from librosa.filters import mel as librosa_mel_fn 12 | 13 | MAX_WAV_VALUE = 32768.0 14 | 15 | 16 | def load_wav(full_path): 17 | sampling_rate, data = read(full_path) 18 | return data, sampling_rate 19 | 20 | 21 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 22 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 23 | 24 | 25 | def dynamic_range_decompression(x, C=1): 26 | return np.exp(x) / C 27 | 28 | 29 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 30 | return torch.log(torch.clamp(x, min=clip_val) * C) 31 | 32 | 33 | def dynamic_range_decompression_torch(x, C=1): 34 | return torch.exp(x) / C 35 | 36 | 37 | def spectral_normalize_torch(magnitudes): 38 | output = dynamic_range_compression_torch(magnitudes) 39 | return output 40 | 41 | 42 | def spectral_de_normalize_torch(magnitudes): 43 | output = dynamic_range_decompression_torch(magnitudes) 44 | return output 45 | 46 | 47 | mel_basis = {} 48 | hann_window = {} 49 | 50 | 51 | def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 52 | if torch.min(y) < -1.: 53 | print('min value is ', torch.min(y)) 54 | if torch.max(y) > 1.: 55 | print('max value is ', torch.max(y)) 56 | 57 | global mel_basis, hann_window 58 | if fmax not in mel_basis: 59 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) 60 | mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device) 61 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 62 | 63 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 64 | y = y.squeeze(1) 65 | 66 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)], 67 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True) 68 | 69 | spec = torch.sqrt(torch.real(spec * spec.conj() + 1e-9)) 70 | 71 | spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec) 72 | spec = spectral_normalize_torch(spec) 73 | 74 | return spec 75 | 76 | 77 | def get_dataset_filelist(a): 78 | with open(a.input_training_file, 'r', encoding='utf-8') as fi: 79 | training_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav') 80 | for x in fi.read().split('\n') if len(x) > 0] 81 | 82 | with open(a.input_validation_file, 'r', encoding='utf-8') as fi: 83 | validation_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav') 84 | for x in fi.read().split('\n') 
if len(x) > 0] 85 | return training_files, validation_files 86 | 87 | 88 | class MelDataset(torch.utils.data.Dataset): 89 | def __init__(self, training_files, segment_size, n_fft, num_mels, 90 | hop_size, win_size, sampling_rate, fmin, fmax, split=True, shuffle=True, n_cache_reuse=1, 91 | device=None, fmax_loss=None, fine_tuning=False, base_mels_path=None): 92 | self.audio_files = training_files 93 | random.seed(1234) 94 | if shuffle: 95 | random.shuffle(self.audio_files) 96 | self.segment_size = segment_size 97 | self.sampling_rate = sampling_rate 98 | self.split = split 99 | self.n_fft = n_fft 100 | self.num_mels = num_mels 101 | self.hop_size = hop_size 102 | self.win_size = win_size 103 | self.fmin = fmin 104 | self.fmax = fmax 105 | self.fmax_loss = fmax_loss 106 | self.cached_wav = None 107 | self.n_cache_reuse = n_cache_reuse 108 | self._cache_ref_count = 0 109 | self.device = device 110 | self.fine_tuning = fine_tuning 111 | self.base_mels_path = base_mels_path 112 | 113 | def __getitem__(self, index): 114 | filename = self.audio_files[index] 115 | if self._cache_ref_count == 0: 116 | audio, sampling_rate = load_wav(filename) 117 | audio = audio / MAX_WAV_VALUE 118 | if not self.fine_tuning: 119 | audio = normalize(audio) * 0.95 120 | self.cached_wav = audio 121 | if sampling_rate != self.sampling_rate: 122 | raise ValueError("{} SR doesn't match target {} SR".format( 123 | sampling_rate, self.sampling_rate)) 124 | self._cache_ref_count = self.n_cache_reuse 125 | else: 126 | audio = self.cached_wav 127 | self._cache_ref_count -= 1 128 | 129 | audio = torch.FloatTensor(audio) 130 | audio = audio.unsqueeze(0) 131 | 132 | if not self.fine_tuning: 133 | if self.split: 134 | if audio.size(1) >= self.segment_size: 135 | max_audio_start = audio.size(1) - self.segment_size 136 | audio_start = random.randint(0, max_audio_start) 137 | audio = audio[:, audio_start:audio_start+self.segment_size] 138 | else: 139 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant') 140 | 141 | mel = mel_spectrogram(audio, self.n_fft, self.num_mels, 142 | self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax, 143 | center=False) 144 | else: 145 | mel = np.load( 146 | os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + '.npy')) 147 | mel = torch.from_numpy(mel) 148 | 149 | if len(mel.shape) < 3: 150 | mel = mel.unsqueeze(0) 151 | 152 | if self.split: 153 | frames_per_seg = math.ceil(self.segment_size / self.hop_size) 154 | 155 | if audio.size(1) >= self.segment_size: 156 | mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1) 157 | mel = mel[:, :, mel_start:mel_start + frames_per_seg] 158 | audio = audio[:, mel_start * self.hop_size:(mel_start + frames_per_seg) * self.hop_size] 159 | else: 160 | mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), 'constant') 161 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant') 162 | 163 | mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels, 164 | self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax_loss, 165 | center=False) 166 | 167 | return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze()) 168 | 169 | def __len__(self): 170 | return len(self.audio_files) 171 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/data/cpc_feature_reader.py: -------------------------------------------------------------------------------- 1 
| # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | 11 | class CpcFeatureReader(torch.nn.Module): 12 | def __init__( 13 | self, 14 | checkpoint_path, 15 | layer=2, 16 | use_encoder_layer=False, 17 | norm_features=False, 18 | max_chunk=64000, 19 | **kwargs, 20 | ): 21 | super().__init__() 22 | 23 | self.model = self.load_cpc_model(checkpoint_path, layer).eval() 24 | self.max_chunk = max_chunk 25 | self.norm_features = norm_features 26 | self.use_encoder_layer = use_encoder_layer 27 | 28 | def forward(self, x: torch.Tensor) -> torch.Tensor: 29 | return self.get_features(x) 30 | 31 | @torch.inference_mode() 32 | def get_features(self, x: torch.Tensor) -> torch.Tensor: 33 | x = x.view(1, 1, -1) 34 | size = x.size(2) 35 | feat = [] 36 | start = 0 37 | while start < size: 38 | if start + self.max_chunk > size: 39 | break 40 | x_chunk = x[..., start : start + self.max_chunk] 41 | feat_chunk = self.model.extract_features( 42 | source=x_chunk, 43 | get_encoded=self.use_encoder_layer, 44 | norm_output=self.norm_features, 45 | ) 46 | feat.append(feat_chunk) 47 | start += self.max_chunk 48 | 49 | if start < size: 50 | x_chunk = x[:, -self.max_chunk :] 51 | feat_chunk = self.model.extract_features( 52 | source=x_chunk, 53 | get_encoded=self.use_encoder_layer, 54 | norm_output=self.norm_features, 55 | ) 56 | df = x_chunk.size(2) // feat_chunk.size(1) 57 | delta = (size - start) // df 58 | feat.append(feat_chunk[:, -delta:]) 59 | return torch.cat(feat, 1).squeeze(0) 60 | 61 | @property 62 | def code_hop_size(self) -> int: 63 | return 160 64 | 65 | @property 66 | def expected_sample_rate(self) -> int: 67 | return 16_000 68 | 69 | @staticmethod 70 | def load_cpc_model(checkpoint_path: str, layer: int = 2) -> torch.nn.Module: 71 | state_dict = torch.load(checkpoint_path) 72 | weights = state_dict["weights"] 73 | config = state_dict["config"] 74 | if layer is not None: 75 | config["nLevelsGRU"] = layer 76 | 77 | encoder = CPCEncoder(config["hiddenEncoder"]) 78 | ar_net = CPCAR( 79 | config["hiddenEncoder"], config["hiddenGar"], False, config["nLevelsGRU"] 80 | ) 81 | 82 | model = CPCModel(encoder, ar_net) 83 | model.load_state_dict(weights, strict=False) 84 | model.config = config 85 | 86 | return model 87 | 88 | 89 | class ChannelNorm(nn.Module): 90 | def __init__(self, num_features, epsilon=1e-05, affine=True): 91 | super(ChannelNorm, self).__init__() 92 | if affine: 93 | self.weight = nn.parameter.Parameter(torch.Tensor(1, num_features, 1)) 94 | self.bias = nn.parameter.Parameter(torch.Tensor(1, num_features, 1)) 95 | else: 96 | self.weight = None 97 | self.bias = None 98 | self.epsilon = epsilon 99 | self.p = 0 100 | self.affine = affine 101 | self.reset_parameters() 102 | 103 | def reset_parameters(self): 104 | if self.affine: 105 | torch.nn.init.ones_(self.weight) 106 | torch.nn.init.zeros_(self.bias) 107 | 108 | def forward(self, x): 109 | cum_mean = x.mean(dim=1, keepdim=True) 110 | cum_var = x.var(dim=1, keepdim=True) 111 | x = (x - cum_mean) * torch.rsqrt(cum_var + self.epsilon) 112 | if self.weight is not None: 113 | x = x * self.weight + self.bias 114 | return x 115 | 116 | 117 | class CPCEncoder(nn.Module): 118 | def __init__(self, hidden_dim=512): 119 | super(CPCEncoder, self).__init__() 120 | self.conv0 = nn.Conv1d(1, hidden_dim, 10, stride=5, padding=3) 121 | 
self.batchNorm0 = ChannelNorm(hidden_dim) 122 | self.conv1 = nn.Conv1d(hidden_dim, hidden_dim, 8, stride=4, padding=2) 123 | self.batchNorm1 = ChannelNorm(hidden_dim) 124 | self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, 4, stride=2, padding=1) 125 | self.batchNorm2 = ChannelNorm(hidden_dim) 126 | self.conv3 = nn.Conv1d(hidden_dim, hidden_dim, 4, stride=2, padding=1) 127 | self.batchNorm3 = ChannelNorm(hidden_dim) 128 | self.conv4 = nn.Conv1d(hidden_dim, hidden_dim, 4, stride=2, padding=1) 129 | self.batchNorm4 = ChannelNorm(hidden_dim) 130 | self.DOWNSAMPLING = 160 131 | 132 | def get_output_dim(self): 133 | return self.conv4.out_channels 134 | 135 | def forward(self, x): 136 | x = F.relu(self.batchNorm0(self.conv0(x))) 137 | x = F.relu(self.batchNorm1(self.conv1(x))) 138 | x = F.relu(self.batchNorm2(self.conv2(x))) 139 | x = F.relu(self.batchNorm3(self.conv3(x))) 140 | x = F.relu(self.batchNorm4(self.conv4(x))) 141 | return x 142 | 143 | 144 | class CPCAR(nn.Module): 145 | def __init__(self, dim_encoded, dim_output, keep_hidden, num_layers): 146 | super(CPCAR, self).__init__() 147 | self.baseNet = nn.LSTM( 148 | dim_encoded, dim_output, num_layers=num_layers, batch_first=True 149 | ) 150 | self.hidden = None 151 | self.keep_hidden = keep_hidden 152 | 153 | def get_output_dim(self): 154 | return self.baseNet.hidden_size 155 | 156 | def forward(self, x): 157 | try: 158 | self.baseNet.flatten_parameters() 159 | except RuntimeError: 160 | pass 161 | x, h = self.baseNet(x, self.hidden) 162 | if self.keep_hidden: 163 | if isinstance(h, tuple): 164 | self.hidden = tuple(x.detach() for x in h) 165 | else: 166 | self.hidden = h.detach() 167 | return x 168 | 169 | 170 | class CPCModel(nn.Module): 171 | def __init__(self, encoder, ar_net): 172 | super(CPCModel, self).__init__() 173 | self.gEncoder = encoder 174 | self.gAR = ar_net 175 | self.config = None 176 | 177 | def forward(self, x, label): 178 | encoded = self.gEncoder(x).permute(0, 2, 1) 179 | cpc_feature = self.gAR(encoded) 180 | return cpc_feature, encoded, label 181 | 182 | def extract_features(self, source, get_encoded=False, norm_output=False): 183 | cpc_feature, encoded, _ = self.forward(source, None) 184 | if get_encoded: 185 | cpc_feature = encoded 186 | if norm_output: 187 | mean = cpc_feature.mean(dim=1, keepdim=True) 188 | var = cpc_feature.var(dim=1, keepdim=True) 189 | cpc_feature = (cpc_feature - mean) / torch.sqrt(var + 1e-08) 190 | return cpc_feature 191 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/utils.py: -------------------------------------------------------------------------------- 1 | # BSD 3-Clause License 2 | 3 | # Copyright (c) 2018, NVIDIA Corporation 4 | # All rights reserved. 5 | 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | 9 | # * Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | 12 | # * Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | 16 | # * Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 
19 | 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | """https://github.com/NVIDIA/tacotron2""" 32 | 33 | import collections 34 | import io 35 | import json 36 | import librosa 37 | import numpy as np 38 | import soundfile as sf 39 | import time 40 | import torch 41 | from scipy.io.wavfile import read 42 | from .text import SOS_TOK, EOS_TOK 43 | 44 | 45 | def get_mask_from_lengths(lengths): 46 | max_len = torch.max(lengths).item() 47 | ids = torch.arange(0, max_len, out=torch.cuda.LongTensor(max_len)) 48 | mask = (ids < lengths.unsqueeze(1)) 49 | return mask 50 | 51 | 52 | def load_wav_to_torch(full_path, sr=None): 53 | data, sr = librosa.load(full_path, sr=sr) 54 | data = np.clip(data, -1, 1) # potentially out of [-1, 1] due to resampling 55 | data = data * 32768.0 # match values loaded by scipy 56 | return torch.FloatTensor(data.astype(np.float32)), sr 57 | 58 | 59 | def read_binary_audio(bin_data, tar_sr=None): 60 | """ 61 | read binary audio (`bytes` or `uint8` `numpy.ndarray`) to `float32` 62 | `numpy.ndarray` 63 | 64 | RETURNS: 65 | data (np.ndarray) : audio of shape (n,) or (2, n) 66 | tar_sr (int) : sample rate 67 | """ 68 | data, ori_sr = sf.read(io.BytesIO(bin_data), dtype='float32') 69 | data = data.T 70 | if (tar_sr is not None) and (ori_sr != tar_sr): 71 | data = librosa.resample(data, ori_sr, tar_sr) 72 | else: 73 | tar_sr = ori_sr 74 | data = np.clip(data, -1, 1) 75 | data = data * 32768.0 76 | return torch.FloatTensor(data.astype(np.float32)), tar_sr 77 | 78 | 79 | def load_filepaths_and_text(filename): 80 | with open(filename, encoding='utf-8') as f: 81 | data = [json.loads(line.rstrip()) for line in f] 82 | return data 83 | 84 | 85 | def to_gpu(x): 86 | x = x.contiguous() 87 | 88 | if torch.cuda.is_available(): 89 | x = x.cuda(non_blocking=True) 90 | return torch.autograd.Variable(x) 91 | 92 | 93 | def load_code_dict(path, add_sos=False, add_eos=False): 94 | if not path: 95 | return {} 96 | 97 | with open(path, 'r') as f: 98 | codes = ['_'] + [line.rstrip() for line in f] # '_' for pad 99 | code_dict = {c: i for i, c in enumerate(codes)} 100 | 101 | if add_sos: 102 | code_dict[SOS_TOK] = len(code_dict) 103 | if add_eos: 104 | code_dict[EOS_TOK] = len(code_dict) 105 | assert(set(code_dict.values()) == set(range(len(code_dict)))) 106 | 107 | return code_dict 108 | 109 | 110 | def load_obs_label_dict(path): 111 | if not path: 112 | return {} 113 | with open(path, 'r') as f: 114 | obs_labels = [line.rstrip() for line in f] 115 | return {c: i for i, c in enumerate(obs_labels)} 116 | 117 | 118 | # A simple timer class inspired from `tnt.TimeMeter` 119 | class CudaTimer: 120 | def __init__(self, keys): 121 | self.keys = keys 122 | self.reset() 123 | 124 | def start(self, key): 125 | s = 
torch.cuda.Event(enable_timing=True) 126 | s.record() 127 | self.start_events[key].append(s) 128 | return self 129 | 130 | def stop(self, key): 131 | e = torch.cuda.Event(enable_timing=True) 132 | e.record() 133 | self.end_events[key].append(e) 134 | return self 135 | 136 | def reset(self): 137 | self.start_events = collections.defaultdict(list) 138 | self.end_events = collections.defaultdict(list) 139 | self.running_times = collections.defaultdict(float) 140 | self.n = collections.defaultdict(int) 141 | return self 142 | 143 | def value(self): 144 | self._synchronize() 145 | return {k: self.running_times[k] / self.n[k] for k in self.keys} 146 | 147 | def _synchronize(self): 148 | torch.cuda.synchronize() 149 | for k in self.keys: 150 | starts = self.start_events[k] 151 | ends = self.end_events[k] 152 | if len(starts) == 0: 153 | raise ValueError("Trying to divide by zero in TimeMeter") 154 | if len(ends) != len(starts): 155 | raise ValueError("Call stop before checking value!") 156 | time = 0 157 | for start, end in zip(starts, ends): 158 | time += start.elapsed_time(end) 159 | self.running_times[k] += time * 1e-3 160 | self.n[k] += len(starts) 161 | self.start_events = collections.defaultdict(list) 162 | self.end_events = collections.defaultdict(list) 163 | 164 | 165 | # Used to measure the time taken for multiple events 166 | class Timer: 167 | def __init__(self, keys): 168 | self.keys = keys 169 | self.n = {} 170 | self.running_time = {} 171 | self.total_time = {} 172 | self.reset() 173 | 174 | def start(self, key): 175 | self.running_time[key] = time.time() 176 | return self 177 | 178 | def stop(self, key): 179 | self.total_time[key] = time.time() - self.running_time[key] 180 | self.n[key] += 1 181 | self.running_time[key] = None 182 | return self 183 | 184 | def reset(self): 185 | for k in self.keys: 186 | self.total_time[k] = 0 187 | self.running_time[k] = None 188 | self.n[k] = 0 189 | return self 190 | 191 | def value(self): 192 | vals = {} 193 | for k in self.keys: 194 | if self.n[k] == 0: 195 | raise ValueError("Trying to divide by zero in TimeMeter") 196 | else: 197 | vals[k] = self.total_time[k] / self.n[k] 198 | return vals 199 | 200 | -------------------------------------------------------------------------------- /scripts/text_to_speech.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import phonemizer 5 | from scipy.io.wavfile import write 6 | import torch 7 | 8 | from unitspeech.unitspeech import UnitSpeech 9 | from unitspeech.duration_predictor import DurationPredictor 10 | from unitspeech.encoder import Encoder 11 | from unitspeech.text import cleaned_text_to_sequence, phonemize, symbols 12 | from unitspeech.util import HParams, intersperse, fix_len_compatibility, sequence_mask, generate_path 13 | from unitspeech.vocoder.env import AttrDict 14 | from unitspeech.vocoder.models import BigVGAN 15 | 16 | 17 | @torch.no_grad() 18 | def text_to_speech( 19 | args, text_encoder, duration_predictor, decoder, phoneme, phoneme_lengths, spk_emb, num_downsamplings_in_unet 20 | ): 21 | cond_x, x, x_mask = text_encoder(phoneme, phoneme_lengths) 22 | logw = duration_predictor(x, x_mask, w=None, g=spk_emb, reverse=True) 23 | w = torch.exp(logw) * x_mask 24 | w_ceil = torch.ceil(w) * args.length_scale 25 | 26 | y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() 27 | y_max_length = int(y_lengths.max()) 28 | y_max_length_ = fix_len_compatibility(y_max_length, num_downsamplings_in_unet) 
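# Note (assumption based on usage here): fix_len_compatibility is expected to round y_max_length up
# to a multiple of 2 ** num_downsamplings_in_unet so the diffusion U-Net can downsample and upsample
# without shape mismatches; the generated mel is trimmed back to y_max_length at the end of this function.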
29 | 30 | # Using obtained durations `w` construct alignment map `attn` 31 | y_mask = sequence_mask(y_lengths, y_max_length_).unsqueeze(1).to(x_mask.dtype) 32 | attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2) 33 | attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1) 34 | 35 | # Align encoded text and get mu_y 36 | cond_y = torch.matmul(attn.squeeze(1).transpose(1, 2).contiguous(), cond_x.transpose(1, 2).contiguous()) 37 | cond_y = cond_y.transpose(1, 2).contiguous() 38 | 39 | z = torch.randn_like(cond_y, device=cond_y.device) 40 | 41 | # Generate sample by performing reverse dynamics 42 | decoder_outputs = decoder( 43 | z, y_mask, cond_y, spk_emb, args.diffusion_step, 44 | text_gradient_scale=args.text_gradient_scale, spk_gradient_scale=args.spk_gradient_scale 45 | ) 46 | decoder_outputs = decoder_outputs[:, :, :y_max_length] 47 | return decoder_outputs 48 | 49 | 50 | def main(args, hps): 51 | global_phonemizer = phonemizer.backend.EspeakBackend( 52 | language='en-us', preserve_punctuation=True, with_stress=True, language_switch="remove-flags" 53 | ) 54 | 55 | # Initialize & load model 56 | text_encoder = Encoder( 57 | n_vocab=len(symbols) + 1, 58 | n_feats=hps.data.n_feats, 59 | **hps.encoder 60 | ) 61 | 62 | text_encoder_dict = torch.load(args.encoder_path, map_location=lambda loc, storage: loc) 63 | text_encoder.load_state_dict(text_encoder_dict['model']) 64 | _ = text_encoder.cuda().eval() 65 | 66 | duration_predictor = DurationPredictor( 67 | **hps.duration_predictor 68 | ) 69 | 70 | duration_predictor_dict = torch.load(args.duration_predictor_path, map_location=lambda loc, storage: loc) 71 | duration_predictor.load_state_dict(duration_predictor_dict['model']) 72 | _ = duration_predictor.cuda().eval() 73 | 74 | unitspeech = UnitSpeech( 75 | n_feats=hps.data.n_feats, 76 | **hps.decoder 77 | ) 78 | 79 | decoder_dict = torch.load(args.decoder_path, map_location=lambda loc, storage: loc) 80 | unitspeech.load_state_dict(decoder_dict['model']) 81 | _ = unitspeech.cuda().train() 82 | 83 | # Initialize & load vocoder. 84 | with open(hps.train.vocoder_config_path) as f: 85 | h = AttrDict(json.load(f)) 86 | vocoder = BigVGAN(h) 87 | vocoder.load_state_dict(torch.load(hps.train.vocoder_ckpt_path, map_location=lambda loc, storage: loc)['generator']) 88 | _ = vocoder.cuda().eval() 89 | vocoder.remove_weight_norm() 90 | 91 | # Prepare input 92 | phoneme = phonemize(args.text, global_phonemizer) 93 | phoneme = cleaned_text_to_sequence(phoneme) 94 | phoneme = intersperse(phoneme, len(symbols)) # add a blank token, whose id number is len(symbols) 95 | phoneme = torch.LongTensor(phoneme).cuda().unsqueeze(0) 96 | phoneme_lengths = torch.LongTensor([phoneme.shape[-1]]).cuda() 97 | 98 | spk_emb = decoder_dict['spk_emb'].cuda() 99 | 100 | # Load the normalization parameters for mel-spectrogram normalization. 
101 | mel_min = decoder_dict['mel_min'].cuda() 102 | mel_max = decoder_dict['mel_max'].cuda() 103 | 104 | with torch.no_grad(): 105 | mel_generated = text_to_speech( 106 | args, text_encoder, duration_predictor, unitspeech, 107 | phoneme, phoneme_lengths, spk_emb, len(hps.decoder.dim_mults) - 1 108 | ) 109 | 110 | mel_generated = ((mel_generated + 1) / 2 * (mel_max.to(mel_generated.device) - mel_min.to(mel_generated.device)) 111 | + mel_min.to(mel_generated.device)) 112 | audio_generated = vocoder.forward(mel_generated).cpu().squeeze().clamp(-1, 1).numpy() 113 | 114 | if "/" in args.generated_sample_path: 115 | os.makedirs(os.path.dirname(args.generated_sample_path), exist_ok=True) 116 | write(args.generated_sample_path, hps.data.sampling_rate, audio_generated) 117 | 118 | 119 | if __name__ == "__main__": 120 | parser = argparse.ArgumentParser() 121 | parser.add_argument('--encoder_path', type=str, default="unitspeech/checkpoints/text_encoder.pt", 122 | help='Path of the text encoder checkpoint.') 123 | parser.add_argument('--decoder_path', type=str, default="unitspeech/outputs/finetuned_decoder.pt", 124 | help='Path of the finetuned decoder checkpoint.') 125 | parser.add_argument('--duration_predictor_path', type=str, default="unitspeech/checkpoints/duration_predictor.pt", 126 | help='Path of the duration predictor checkpoint.') 127 | parser.add_argument('--config_path', type=str, default="unitspeech/checkpoints/text-to-speech.json", 128 | help='Path to the configuration file for text-to-speech.') 129 | parser.add_argument('--generated_sample_path', type=str, default="unitspeech/outputs/output_tts.wav", 130 | help='The path to save the generated audio.') 131 | 132 | parser.add_argument('--text', type=str, required=True, 133 | help='The desired transcript to be generated.') 134 | parser.add_argument('--text_gradient_scale', type=float, default=1.0, 135 | help='Gradient scale of classifier-free guidance (cfg) for text condition. (0.0: wo cfg)') 136 | parser.add_argument('--spk_gradient_scale', type=float, default=1.0, 137 | help='Gradient scale of classifier-free guidance (cfg) for speaker condition. (0.0: wo cfg)') 138 | parser.add_argument('--length_scale', type=float, default=1.0, 139 | help='The parameter for adjusting speech speed. The smaller it is compared to 1, the faster the speech becomes.') 140 | parser.add_argument('--diffusion_step', type=int, default=50, 141 | help='The number of iterations for sampling in the diffusion model.') 142 | args = parser.parse_args() 143 | 144 | with open(args.config_path, "r") as f: 145 | data = f.read() 146 | config = json.loads(data) 147 | 148 | hps = HParams(**config) 149 | 150 | main(args, hps) -------------------------------------------------------------------------------- /unitspeech/textlesslib/README.md: -------------------------------------------------------------------------------- 1 | # textlesslib 2 | 3 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Python 3.8](https://img.shields.io/badge/python-3.8-blue.svg)](https://www.python.org/downloads/release/python-380/) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 4 | 5 | Textless NLP is an active area of research that aims to extend NLP techniques (and tools!) to work directly on spoken language. 
By using self-supervisedly 6 | learnt discrete speech representations, the area promises to unlock interesting NLP applications on languages without written form or on facets of spoken 7 | language that are unaccessable for text-based approaches, e.g. prosody. To learn more, please check some of the [papers](https://speechbot.github.io/). 8 | 9 | **textlesslib** is a library aimed to facilitate research in Textless NLP. The goal of the library is to speed up the research cycle and 10 | lower the learning curve for those who want to start. We provide highly configurable, off-the-shelf available tools to encode speech 11 | as sequences of discrete values and tools to decode such streams back into the audio domain. A high-level description of the library can also be 12 | found in our paper [[arxiv]](https://arxiv.org/abs/2202.07359). 13 | 14 | 15 | Table of Contents 16 | ================= 17 | 18 | * [Installation](#installation) 19 | * [Usage examples](#usage-examples) 20 | * [Encoding speech](#encoding-speech) 21 | * [Dataset helpers](#dataset-helpers) 22 | * [Data preprocessing](#data-preprocessing) 23 | * [Provided models](#provided-models) 24 | * [Testing](#testing) 25 | * [Citing textless-lib](#citing-textless-lib) 26 | 27 | 28 | ## Installation 29 | ```bash 30 | git clone git@github.com:facebookresearch/textlesslib.git 31 | cd textlesslib 32 | pip install -e . 33 | pip install git+git://github.com:pytorch/fairseq.git@dd106d9534b22e7db859a6b87ffd7780c38341f8 34 | ``` 35 | 36 | ## Usage examples 37 | We include a set of examples in the [examples](./examples) folder: 38 | * [Discrete speech resynthesis (& compression)](./examples/resynthesis/) 39 | * [Probing for speaker information in the representations](./examples/speaker_probing/) 40 | * [Generative Spoken Language Modeling (aka Speech Continuation)](./examples/gslm/) 41 | 42 | There is also a [[Jupyter notebook]](./examples/resynthesis_and_continuation.ipynb) and a [[Google Colab]](https://colab.research.google.com/github/facebookresearch/textlesslib/blob/main/examples/resynthesis_and_continuation.ipynb) that combine discrete resynthesis and speech continuation examples in a step-by-step mini-tutorial. 43 | 44 | We believe those examples can serve both as illustrations for the provided components and provide 45 | a starting point for tinkering in interesting directions. 46 | 47 | ### Encoding speech 48 | Below is an example on loading an audio example and encoding it as a sequence of HuBERT-based discrete tokens (aka pseudo-units). 49 | Downloading of the required checkpoints is handled by textlesslib itself (by default they are stored in `~/.textless`): 50 | 51 | ```python 52 | import torchaudio 53 | from unitspeech.textlesslib.textless.data.speech_encoder import SpeechEncoder 54 | 55 | dense_model_name = "hubert-base-ls960" 56 | quantizer_name, vocab_size = "kmeans", 100 57 | input_file = "input.wav" 58 | 59 | # now let's load an audio example 60 | waveform, sample_rate = torchaudio.load(input_file) 61 | 62 | # We can build a speech encoder module using names of pre-trained 63 | # dense and quantizer models. The call below will download 64 | # appropriate checkpoints as needed behind the scenes. 
We can 65 | # also construct an encoder by directly passing model instances 66 | encoder = SpeechEncoder.by_name( 67 | dense_model_name=dense_model_name, 68 | quantizer_model_name=quantizer_name, 69 | vocab_size=vocab_size, 70 | deduplicate=True, 71 | ).cuda() 72 | 73 | 74 | # now convert it in a stream of deduplicated units (as in GSLM) 75 | encoded = encoder(waveform.cuda()) 76 | # encoded is a dict with keys ('dense', 'units', 'durations'). 77 | # It can also contain 'f0' if SpeechEncoder was initialized 78 | # with need_f0=True flag. 79 | units = encoded["units"] # tensor([71, 12, 57, ...], ...) 80 | ``` 81 | Now it can be casted back into the audio domain: 82 | 83 | ```python 84 | # as with encoder, we can setup vocoder by passing checkpoints 85 | # directly or by specifying the expected format by the names 86 | # of dense and quantizer models (these models themselves 87 | # won't be loaded) 88 | vocoder = TacotronVocoder.by_name( 89 | dense_model_name, 90 | quantizer_name, 91 | vocab_size, 92 | ).cuda() 93 | 94 | # now we turn those units back into the audio. 95 | audio = vocoder(units) 96 | 97 | # save the audio 98 | torchaudio.save(output_file, audio.cpu().float().unsqueeze(0), vocoder.output_sample_rate) 99 | ``` 100 | ### Dataset helpers 101 | Below is an example on using `textless` view on the LibriSpeech dataset: 102 | ```python 103 | encoder = SpeechEncoder.by_name( 104 | dense_model_name=dense_model_name, 105 | quantizer_model_name=quantizer_name, 106 | vocab_size=vocab_size, 107 | deduplicate=True, 108 | ).cuda() 109 | 110 | quantized_dataset = QuantizedLibriSpeech( 111 | root=existing_root, speech_encoder=encoder, url=url) 112 | 113 | datum = quantized_dataset[0] 114 | sample_rate, utterance, speaker_id, chapter_id, utterance_id = datum['rest'] 115 | # datum['units'] = tensor([71, 12, 63, ...]) 116 | ``` 117 | In the [probing example](./examples/speaker_probing/) we illustrate how such a dataset 118 | can be used with a standard Pytorch dataloader in a scalable manner. 119 | 120 | ### Data preprocessing 121 | We also provide a [multi-GPU/multi-node preprocessing tool](tools/distributed_transcribe/) 122 | for the cases where on-the-fly processing of audio should be avoided. 123 | 124 | ## Provided models 125 | We provide implementations and pre-trained checkpoints for the following models: 126 | 127 | * Dense representations: HuBERT-base (trained on LibriSpeech 960h) and CPC (trained on 6Kh subset of LibriLight); 128 | * Quantizers: k-means quantizers with vocabulary sizes of 50, 100, 200 for both the dense models (trained on LibriSpeech 960h); 129 | * Decoders: Tacotron2 models for all (dense model x quantizer) combinations (trained on LJSpeech). 130 | 131 | Finally, the pitch extraction is done via YAAPT. 132 | 133 | ## Testing 134 | We use pytest (`pip install pytest pytest-xdist `). 
Our unit tests are located in the `tests` directory: 135 | ```bash 136 | cd tests && pytest -n 8 137 | ``` 138 | 139 | ## Citing textless-lib 140 | If you find textless-lib useful in your research, please consider citing our work: 141 | ``` 142 | @article{Kharitonov2022, 143 | title={textless-lib: a Library for Textless Spoken Language Processing}, 144 | author={Eugene Kharitonov and Jade Copet and Kushal Lakhotia and Tu Anh Nguyen and Paden Tomasello and Ann Lee and Ali Elkahky and Wei-Ning Hsu and Abdelrahman Mohamed and Emmanuel Dupoux and Yossi Adi}, 145 | year={2022}, 146 | eprint={2202.07359}, 147 | archivePrefix={arXiv}, 148 | primaryClass={cs.CL} 149 | } 150 | ``` 151 | 152 | ## Licence 153 | textlesslib is licensed under MIT, the text of the license can be found [here](LICENSE). 154 | Internally, it uses 155 | * [WaveGlow](https://github.com/NVIDIA/waveglow) - licensed under BSD-3-Clause license; 156 | * [tacotron implementation](https://github.com/keithito/tacotron) - licensed under MIT license; 157 | * [tacotron2 implementation](https://github.com/NVIDIA/tacotron2) - licensed under BSD-3-Clause license; 158 | * [STFT implementation](https://github.com/pseeth/torch-stft) - licensed under BSD-3-Clause license. 159 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## UnitSpeech: Speaker-adaptive Speech Synthesis with Untranscribed Data (INTERSPEECH 2023, Oral) 2 | #### Heeseung Kim, Sungwon Kim, Jiheum Yeom, Sungroh Yoon 3 | ![model-1](https://github.com/gmltmd789/UnitSpeech/assets/49265950/44cb4991-abb0-44b2-81fd-fce92cc1f3f1) 4 |

5 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1jAglTVrBNeEQbOAJ3T_YRotoKqCBPNn9?usp=sharing) 6 | ### [Paper](https://arxiv.org/abs/2306.16083) 7 | ### [Audio demo](https://unitspeech.github.io/) 8 | 9 | ## Updates 10 | ### 2023.07.04 : We changed the normalization method for better speaker similarity. 11 | - **We normalized the mel-spectrogram of the reference audio during fine-tuning using the min and max values of the reference audio's mel-spectrogram, rather than the min and max values obtained from the entire LibriTTS train set.** 12 | - **We observed that this modification helped improve speaker similarity.** 13 | 14 | ### 2023.06.29 : We update our code and checkpoints for better pronunciation. 15 | - **Extract reference speaker embeddings using the [WavLM](https://github.com/microsoft/UniSpeech/tree/main/downstreams/speaker_verification#pre-trained-models)-based speaker encoder.** 16 | - **Modeling normalized mel-spectrogram (-1 ~ 1).** 17 | 18 | ### 2023.06.28 : Updated components compared to the version of INTERSPEECH. 19 | - **Change in vocoder (from HiFi-GAN to BigVGAN).** 20 | - **Support for speaker classifier-free guidance (advantageous for adapting to more unique voices.)** 21 | - **Change "training-free text classifier-free guidance" to "text classifier-free guidance" (learning text unconditional embedding).** 22 | - **Ensure compatibility with various recent works on unit-based speech synthesis (number of clusters of unit (K): 200 → 1000)** 23 | - **Substantial improvement in pronunciation accuracy** 24 | - **To improve TTS (Text-to-Speech) pronunciation, an IPA-based phonemizer is used.** 25 | - **To improve VC (Voice Conversion) pronunciation, a contentvec encoder is introduced.** 26 | 27 | 28 | # Warning: Ethical & Legal Considerations 29 | 1. **UnitSpeech was created with the primary objective of facilitating research endeavors.** 30 | 2. **When utilizing samples generated using this model, it is crucial to clearly disclose that the samples were generated using AI technology. Additionally, it is necessary to provide the sources of the audio used in the generation process.** 31 | 3. **We notify that users take full responsibility for any possible negative outcomes and legal & ethical issues that may arise due to their misuse of the model.** 32 | 4. **As a precautionary measure against possible misapplication, we intend to introduce a classification model capable of discerning samples generated through the utilization of this model.** 33 | 34 | ## TO DO 35 | - [ ] Release a classification model to distinguish samples from UnitSpeech 36 | 37 | ## Installation 38 | **Tested on Ubuntu 20.04.5 LTS, Python 3.8, Anaconda (2023.03-1) environment** 39 | First, install the necessary package for the IPA phonemizer. 40 | ```shell 41 | sudo apt-get install espeak=1.48.04+dfsg-8build1 espeak-ng=1.50+dfsg-6 42 | ``` 43 | If you are unable to install the specific versions of espeak and espeak-ng on Ubuntu 18.04 or earlier, please install the available versions of each package.
44 | Note: If you have a different version of espeak-ng, the output of phonemizing text may vary, which can affect pronunciation accuracy. 45 | 46 | After that, create a conda environment and install the unitspeech package and the package required for extracting speaker embeddings. 47 | ```shell 48 | conda create -n unitspeech python=3.8 49 | conda activate unitspeech 50 | git clone https://github.com/gmltmd789/UnitSpeech.git 51 | cd UnitSpeech 52 | pip install -e . 53 | pip install --no-deps s3prl==0.4.10 54 | ``` 55 | 56 | ## Pretrained Models 57 | **We provide the [pretrained models](https://drive.google.com/drive/folders/1yFkb2TAYB_zMmoTuUOXu-zXb3UI9pVJ9?usp=sharing).** 58 | |File Name|Usage| 59 | |------|---| 60 | |contentvec_encoder.pt|Used for any-to-any voice conversion tasks.| 61 | |unit_encoder.pt|Used for fine-tuning and unit-based speech synthesis tasks.
(e.g., Adaptive Speech Synthesis for Speech-to-Unit Translation)| 62 | |text_encoder.pt|Used for adaptive text-to-speech tasks.| 63 | |duration_predictor.pt|Used for adaptive text-to-speech tasks.| 64 | |pretrained_decoder.pt|Used for all adaptive speech synthesis tasks.| 65 | |speaker_encoder.pt|Used for extracting [speaker embeddings](https://github.com/microsoft/UniSpeech/tree/main/downstreams/speaker_verification#pre-trained-models).| 66 | |bigvgan.pt|[Vocoder](https://github.com/NVIDIA/BigVGAN) checkpoint.| 67 | |bigvgan-config.json|Configuration for the vocoder.| 68 | 69 | **After downloading the files, please arrange them in the following structure.** 70 | ```buildoutcfg 71 | UnitSpeech/... 72 | unitspeech/... 73 | checkpoints/... 74 | contentvec_encoder.pt 75 | duration_predictor.pt 76 | pretrained_decoder.pt 77 | text_encoder.pt 78 | unit_encoder.pt 79 | ... 80 | speaker_encoder/... 81 | checkpts/... 82 | speaker_encoder.pt 83 | ... 84 | vocoder/... 85 | checkpts/... 86 | bigvgan.pt 87 | bigvgan-config.json 88 | ... 89 | ... 90 | ... 91 | ``` 92 | 93 | ## Fine-tuning 94 | The decoder is fine-tuned using the target speaker's voice, employing the unit encoder. **It is recommended to use a reference English speech with a duration of at least 5~10 seconds.** 95 | 96 | ```shell 97 | python scripts/finetune.py \ 98 | --reference_path REFERENCE_SPEECH_PATH \ 99 | --output_decoder_path FILEPATH1/FINETUNED_DECODER.pt 100 | ``` 101 | 102 | By executing the code, your personalized decoder will be saved as "FILEPATH1/FINETUNED_DECODER.pt".
103 | With the fine-tuned decoder, you can perform adaptive text-to-speech and any-to-any voice conversion, as described below.

104 | By default, fine-tuning is conducted in fp32 using the Adam optimizer with a learning rate of 2e-5 for 500 iterations.
105 | You can adjust these settings through the provided arguments (--fp16_run, --learning_rate, --n_iters), as in the example below.
106 | **For speakers with unique voices, increasing the number of fine-tuning iterations can help achieve better results.**
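For example, a run that doubles the default number of iterations for a more unique voice might look like the sketch below. The values are illustrative only; check `scripts/finetune.py` for the exact definition of each argument (including whether `--fp16_run` is a boolean switch or takes a value).

```shell
# Illustrative fine-tuning run: only --reference_path and --output_decoder_path are required,
# the remaining flags override the defaults described above.
python scripts/finetune.py \
    --reference_path REFERENCE_SPEECH_PATH \
    --output_decoder_path FILEPATH1/FINETUNED_DECODER.pt \
    --learning_rate 2e-5 \
    --n_iters 1000
```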
107 | 108 | ## Inference 109 | ```shell 110 | # script for adaptive text-to-speech 111 | python scripts/text_to_speech.py \ 112 | --text "TEXT_TO_GENERATE" \ 113 | --decoder_path FILEPATH1/FINETUNED_DECODER.pt \ 114 | --generated_sample_path FILEPATH2/PATH_TO_SAVE_SYNTHESIZED_SPEECH.wav 115 | 116 | 117 | # script for any-to-any voice conversion 118 | python scripts/voice_conversion.py \ 119 | --source_path SOURCE_SPEECH_PATH_TO_CONVERT.wav \ 120 | --decoder_path FILEPATH1/FINETUNED_DECODER.pt \ 121 | --generated_sample_path FILEPATH2/PATH_TO_SAVE_SYNTHESIZED_SPEECH.wav 122 | ``` 123 | You can adjust the number of diffusion steps, text gradient scale, and speaker gradient scale as arguments.
124 | - text_gradient_scale : responsible for pronunciation accuracy and audio quality. Increasing its value makes the pronunciation of the samples more accurate.
125 | - spk_gradient_scale : responsible for speaker similarity. Increasing its value generates voices that are closer to the reference speech.
126 | 127 | By default, both the text gradient scale and the speaker gradient scale are set to 1.0.
128 | **If you want better pronunciation and audio quality, please increase the value of "text_gradient_scale." This will slightly reduce speaker similarity.**
129 | **If you want better speaker similarity, please increase the value of "spk_gradient_scale." This will slightly degrade pronunciation accuracy and audio quality.**
130 | 131 | You can also adjust the speaking speed through the following argument (default: 1.0).
132 | - length_scale : Increasing its value (> 1.0) slows the speech down, while decreasing it (< 1.0) speeds the speech up (see the example below).
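For instance, a text-to-speech call that nudges the balance toward clearer pronunciation and slightly slower speech could look like the sketch below. The scale values are illustrative, not recommendations; all of the flags are defined in `scripts/text_to_speech.py`.

```shell
# Illustrative settings: a larger text_gradient_scale favors pronunciation and audio quality,
# spk_gradient_scale controls speaker similarity, and length_scale > 1.0 slows the speech down.
python scripts/text_to_speech.py \
    --text "TEXT_TO_GENERATE" \
    --decoder_path FILEPATH1/FINETUNED_DECODER.pt \
    --generated_sample_path FILEPATH2/PATH_TO_SAVE_SYNTHESIZED_SPEECH.wav \
    --text_gradient_scale 2.0 \
    --spk_gradient_scale 1.0 \
    --length_scale 1.1 \
    --diffusion_step 50
```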
133 | 134 | **Note: Using excessively large gradient scales can degrade the audio quality.** 135 | 136 | ## License 137 | 138 | The code and model weights of UnitSpeech are released under the CC BY-NC-SA 4.0 license. 139 | 140 | ## References 141 | * [BigVGAN](https://github.com/NVIDIA/BigVGAN) (for vocoder) 142 | * [textlesslib](https://github.com/facebookresearch/textlesslib) (for unit extraction) 143 | * [ContentVec](https://github.com/auspicious3000/contentvec) (for contentvec extraction) 144 | * [VITS](https://github.com/jaywalnut310/vits) (for text & IPA phoneme sequence processing) 145 | * [Grad-TTS](https://github.com/huawei-noah/Speech-Backbones/tree/main/Grad-TTS) (for overall architecture and code) 146 | * [denoising-diffusion-pytorch](https://github.com/rosinality/denoising-diffusion-pytorch) (for diffusion-based sampler) 147 | * [WavLM](https://github.com/microsoft/UniSpeech/tree/main/downstreams/speaker_verification) (for speaker embedding extraction) 148 | 149 | ## Citation 150 | ``` 151 | @misc{kim2023unitspeech, 152 | title={UnitSpeech: Speaker-adaptive Speech Synthesis with Untranscribed Data}, 153 | author={Heeseung Kim and Sungwon Kim and Jiheum Yeom and Sungroh Yoon}, 154 | year={2023}, 155 | eprint={2306.16083}, 156 | archivePrefix={arXiv}, 157 | primaryClass={cs.SD} 158 | } 159 | ``` 160 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/examples/speaker_probing/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | 7 | import torch 8 | import argparse 9 | from unitspeech.textlesslib.textless.data.quantized_datasets import QuantizedLibriSpeech 10 | from torch.utils.data import DataLoader 11 | import torch.nn.functional as F 12 | from probes import ContinuousClassifier, DiscreteClassifier, ConstantBaseline 13 | from unitspeech.textlesslib.textless.data.speech_encoder import SpeechEncoder 14 | from unitspeech.textlesslib.textless import dispatch_dense_model, dispatch_quantizer 15 | 16 | 17 | def set_seed_(seed): 18 | torch.manual_seed(seed) 19 | torch.cuda.manual_seed(seed) 20 | 21 | 22 | def move_to(x, device: torch.device): 23 | if hasattr(x, "to"): 24 | return x.to(device) 25 | if isinstance(x, list) or isinstance(x, tuple): 26 | return [move_to(i, device) for i in x] 27 | if isinstance(x, dict): 28 | return {k: move_to(v, device) for k, v in x.items()} 29 | return x 30 | 31 | 32 | def get_args(): 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument( 35 | "--dense_model_name", 36 | type=str, 37 | help="Dense model to be used", 38 | default="hubert-base-ls960", 39 | choices=["hubert-base-ls960", "cpc-big-ll6k"], 40 | ) 41 | parser.add_argument("--vocab_size", type=int, help="Unit vocab size", default=50) 42 | parser.add_argument("--epochs", type=int, default=150) 43 | parser.add_argument( 44 | "--batch_size", type=int, help="Batch size for K-means training", default=32 45 | ) 46 | parser.add_argument("--seed", type=int, default=13) 47 | parser.add_argument( 48 | "--model_type", 49 | choices=["baseline", "discrete", "continuous"], 50 | default="baseline", 51 | ) 52 | 53 | args = parser.parse_args() 54 | 55 | return args 56 | 57 | 58 | def train(model, train_dataloader, valid_dataloader, args): 59 | model.train() 60 | optimizer = torch.optim.Adam(model.parameters()) 61 | 
62 | for epoch in range(args.epochs): 63 | train_epoch(model, train_dataloader, optimizer, epoch) 64 | evaluate_model(model, valid_dataloader) 65 | 66 | 67 | def train_epoch(model, dataloader, optimizer, e): 68 | model.train() 69 | n_examples = 0.0 70 | accumulated = torch.zeros(1, dtype=torch.float64).cuda() 71 | 72 | for batch in dataloader: 73 | batch = move_to(batch, torch.cuda.current_device()) 74 | speakers = torch.tensor(batch["rest"][2]).cuda() 75 | 76 | speaker_logprobs = model(batch) 77 | loss = F.nll_loss(speaker_logprobs, speakers) 78 | 79 | optimizer.zero_grad() 80 | loss.backward() 81 | optimizer.step() 82 | 83 | accumulated += loss.detach().sum() 84 | n_examples += speakers.size(0) 85 | 86 | train_loss = (accumulated / n_examples).item() 87 | print(f"Epoch {e} | sliding mean train loss {train_loss}") 88 | 89 | 90 | @torch.no_grad() 91 | def evaluate_model(model, dataloader): 92 | model.eval() 93 | n_examples = 0 94 | accumulated_loss = torch.zeros(1, dtype=torch.float64).cuda() 95 | accuracy = torch.zeros(1, dtype=torch.float64).cuda() 96 | 97 | for batch in dataloader: 98 | batch = move_to(batch, torch.cuda.current_device()) 99 | speakers = torch.tensor(batch["rest"][2]).cuda() 100 | 101 | speaker_logprobs = model(batch) 102 | loss = F.nll_loss(speaker_logprobs, speakers) 103 | accumulated_loss += loss 104 | 105 | accuracy += (speaker_logprobs.argmax(dim=-1) == speakers).sum() 106 | n_examples += speakers.size(0) 107 | 108 | accumulated_loss /= n_examples 109 | accuracy /= n_examples 110 | 111 | print(f"Valid loss: {accumulated_loss.item()}, accuracy: {accuracy.item()}") 112 | 113 | 114 | class SpeakerDatasetWrapper: 115 | def __init__(self, quantized_data, speaker_mapping=None): 116 | self.quantized_data = quantized_data 117 | self.speaker_mapping = ( 118 | speaker_mapping 119 | if speaker_mapping is not None 120 | else self.get_speaker_ids(quantized_data.dataset._walker) 121 | ) 122 | self.collater = self.quantized_data.collater 123 | self.max_length = ( 124 | 10 * 16_000 // self.quantized_data.speech_encoder.code_hop_size 125 | ) 126 | 127 | @staticmethod 128 | def get_speaker_ids(walker): 129 | speaker_mapping = {} 130 | for fileid in walker: 131 | speaker_id, *_ = fileid.split("-") 132 | speaker_id = int(speaker_id) 133 | if speaker_id not in speaker_mapping: 134 | speaker_mapping[speaker_id] = len(speaker_mapping) 135 | return speaker_mapping 136 | 137 | def __getitem__(self, k): 138 | item = self.quantized_data[k] 139 | speaker = item["rest"][2] 140 | item["rest"][2] = self.speaker_mapping[speaker] 141 | 142 | if self.max_length < item["dense"].size(0): 143 | item["dense"] = item["dense"][: self.max_length, :] 144 | item["units"] = item["units"][: self.max_length] 145 | item["durations"] = item["durations"][: self.max_length] 146 | 147 | return item 148 | 149 | def __len__(self): 150 | return len(self.quantized_data) 151 | 152 | 153 | def main(): 154 | args = get_args() 155 | set_seed_(args.seed) 156 | 157 | dense_model_name = args.dense_model_name 158 | quantizer_model_name = "kmeans" 159 | vocab_size = args.vocab_size 160 | 161 | # NB: Hubert is not serializable as-is, so to have a multi-worker dataloader 162 | # we have a worker-around: load the actual checkpoint on the first call - which 163 | # will happen in a worker process already. This behavior is enabled with 164 | # the `lazy_load` flag. 
165 | dense_model = dispatch_dense_model(dense_model_name, lazy_load=True) 166 | quantizer_model = dispatch_quantizer( 167 | dense_model_name, quantizer_model_name, vocab_size 168 | ) 169 | 170 | speech_encoder = SpeechEncoder( 171 | dense_model, 172 | quantizer_model, 173 | deduplicate=False, 174 | need_f0=False, 175 | add_bos_eos=True, 176 | ) 177 | 178 | dataset = QuantizedLibriSpeech( 179 | speech_encoder, 180 | root="datasets", 181 | url="dev-clean", 182 | download=True, 183 | device="auto" 184 | # when we set `device` to auto, the dataset instance will check if it is 185 | # running within a worker process of a dataloader. If it is the case, 186 | # it will move SpeechEncoder to one of the available GPUs, depending on the 187 | # worker id. This way we can pack quite a few (GPU-hungry) Hubert instances running across 188 | # all GPUs in parallel, within the same standard DataLoader. 189 | ) 190 | 191 | speaker_mapping = SpeakerDatasetWrapper.get_speaker_ids(dataset.dataset._walker) 192 | max_speaker_id = max(speaker_mapping.values()) 193 | dataset = SpeakerDatasetWrapper(dataset, speaker_mapping) 194 | 195 | valid_size = int(0.1 * len(dataset)) 196 | train_size = len(dataset) - valid_size 197 | train_data, valid_data = torch.utils.data.random_split( 198 | dataset, [train_size, valid_size], generator=torch.Generator().manual_seed(42) 199 | ) 200 | 201 | train_loader = DataLoader( 202 | train_data, 203 | batch_size=args.batch_size, 204 | shuffle=True, 205 | collate_fn=dataset.collater, 206 | num_workers=4, 207 | ) 208 | valid_loader = DataLoader( 209 | valid_data, 210 | batch_size=args.batch_size, 211 | shuffle=False, 212 | collate_fn=dataset.collater, 213 | num_workers=4, 214 | ) 215 | 216 | if args.model_type == "baseline": 217 | model = ConstantBaseline(total_speakers=max_speaker_id + 1) 218 | elif args.model_type == "discrete": 219 | model = DiscreteClassifier( 220 | vocab_size=args.vocab_size + 3, # accounting for bos, pad, eos 221 | embedding_size=32, 222 | n_heads=4, 223 | hidden_size=128, 224 | n_layers=2, 225 | dropout=0.1, 226 | pad_value=dataset.quantized_data.unit_pad, 227 | total_speakers=max_speaker_id + 1, 228 | ) 229 | elif args.model_type == "continuous": 230 | input_size = { 231 | "hubert-base-ls960": 768, 232 | "cpc-big-ll6k": 512, 233 | }[dense_model_name] 234 | 235 | model = ContinuousClassifier( 236 | embedding_size=32, 237 | input_size=input_size, 238 | n_heads=4, 239 | hidden_size=128, 240 | n_layers=2, 241 | dropout=0.1, 242 | pad_value=dataset.quantized_data.unit_pad, 243 | total_speakers=max_speaker_id + 1, 244 | ) 245 | else: 246 | assert False, "unknown model type" 247 | 248 | model.cuda() 249 | train(model, train_loader, valid_loader, args) 250 | 251 | 252 | if __name__ == "__main__": 253 | from torch.multiprocessing import set_start_method 254 | 255 | set_start_method("spawn", force=True) 256 | main() 257 | --------------------------------------------------------------------------------