├── unitspeech ├── textlesslib │ ├── pytest.ini │ ├── CHANGELOG.md │ ├── examples │ │ ├── __init__.py │ │ ├── gslm │ │ │ ├── __init__.py │ │ │ ├── README.md │ │ │ ├── sampler.py │ │ │ └── sample.py │ │ ├── resynthesis │ │ │ ├── README.md │ │ │ └── resynth.py │ │ └── speaker_probing │ │ │ ├── README.md │ │ │ ├── probes.py │ │ │ └── train.py │ ├── textless │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── collater_utils.py │ │ │ ├── kmeans_quantizer.py │ │ │ ├── hubert_feature_reader.py │ │ │ ├── f0_preprocess.py │ │ │ └── cpc_feature_reader.py │ │ ├── vocoders │ │ │ └── tacotron2 │ │ │ │ ├── tts_data.py │ │ │ │ ├── symbols.py │ │ │ │ ├── __init__.py │ │ │ │ ├── cmudict.py │ │ │ │ ├── waveglow_denoiser.py │ │ │ │ ├── numbers.py │ │ │ │ ├── cleaners.py │ │ │ │ ├── text.py │ │ │ │ ├── audio_processing.py │ │ │ │ ├── vocoder.py │ │ │ │ ├── layers.py │ │ │ │ ├── stft.py │ │ │ │ └── utils.py │ │ ├── __init__.py │ │ └── checkpoint_manager │ │ │ └── manager.py │ ├── tools │ │ └── distributed_transcribe │ │ │ ├── __init__.py │ │ │ ├── single.sh │ │ │ ├── local.sh │ │ │ ├── slurm.sbatch │ │ │ ├── data_handler.py │ │ │ ├── distributed.py │ │ │ ├── README.md │ │ │ └── transcribe.py │ ├── requirements.txt │ ├── CITATION.bib │ ├── setup.cfg │ ├── setup.py │ ├── LICENSE │ ├── tests │ │ ├── test_checkpoint_manager.py │ │ ├── test_quantized_dataset.py │ │ └── test_model_handling.py │ ├── CONTRIBUTING.md │ ├── CODE_OF_CONDUCT.md │ └── README.md ├── vocoder │ ├── alias_free_torch │ │ ├── __init__.py │ │ ├── act.py │ │ ├── resample.py │ │ └── filter.py │ ├── env.py │ ├── incl_licenses │ │ ├── LICENSE_5 │ │ ├── LICENSE_1 │ │ ├── LICENSE_2 │ │ └── LICENSE_4 │ ├── LICENSE │ ├── xutils.py │ ├── README.md │ ├── activations.py │ └── meldataset.py ├── text │ ├── symbols.py │ ├── __init__.py │ ├── LICENSE │ └── cleaners.py ├── checkpoints │ ├── finetune.json │ ├── voice-conversion.json │ └── text-to-speech.json ├── base.py ├── duration_predictor.py ├── speaker_encoder │ └── utils.py └── util.py ├── .gitignore ├── setup.py ├── scripts ├── voice_conversion.py └── text_to_speech.py └── README.md /unitspeech/textlesslib/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --doctest-modules -------------------------------------------------------------------------------- /unitspeech/textlesslib/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | ## v0.1.0 3 | 4 | Initial version -------------------------------------------------------------------------------- /unitspeech/textlesslib/examples/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | 3 | .ipynb_checkpoints 4 | notebooks/.ipynb_checkpoints 5 | 6 | unitspeech/checkpoints 7 | unitspeech/outputs 8 | unitspeech/speaker_encoder/checkpts 9 | unitspeech/vocoder/checkpts 10 | 11 | unitspeech.egg-info -------------------------------------------------------------------------------- /unitspeech/textlesslib/examples/gslm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/tools/distributed_transcribe/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.1.0 2 | torchaudio 3 | pytest 4 | AMFM_decompy 5 | librosa 6 | threadpoolctl==3.0.0 7 | numpy==1.22.0 8 | numba==0.53.0 9 | joblib 10 | scikit-learn 11 | npy-append-array 12 | unidecode 13 | inflect 14 | -------------------------------------------------------------------------------- /unitspeech/vocoder/alias_free_torch/__init__.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 
3 | 4 | from .filter import * 5 | from .resample import * 6 | from .act import * -------------------------------------------------------------------------------- /unitspeech/textlesslib/CITATION.bib: -------------------------------------------------------------------------------- 1 | @article{Kharitonov2022, 2 | title={textless-lib: a Library for Textless Spoken Language Processing}, 3 | author={Eugene Kharitonov and Jade Copet and Kushal Lakhotia and Tu Anh Nguyen and Paden Tomasello and Ann Lee and Ali Elkahky and Wei-Ning Hsu and Abdelrahman Mohamed and Emmanuel Dupoux and Yossi Adi}, 4 | year={2022}, 5 | eprint={2202.07359}, 6 | archivePrefix={arXiv}, 7 | primaryClass={cs.CL} 8 | } -------------------------------------------------------------------------------- /unitspeech/textlesslib/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | license_file = LICENSE 3 | 4 | 5 | [flake8] 6 | exclude = 7 | .tox, 8 | .git, 9 | __pycache__, 10 | build, 11 | dist, 12 | *.md, 13 | *.pyc, 14 | *.egg-info, 15 | .cache, 16 | .eggs, 17 | max-line-length = 120 18 | 19 | 20 | [isort] 21 | multi_line_output = 3 22 | include_trailing_comma = True 23 | force_grid_wrap = 0 24 | use_parentheses = True 25 | ensure_newline_before_comments = True 26 | line_length = 88 27 | 28 | 29 | [black] 30 | -------------------------------------------------------------------------------- /unitspeech/vocoder/env.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import os 4 | import shutil 5 | 6 | 7 | class AttrDict(dict): 8 | def __init__(self, *args, **kwargs): 9 | super(AttrDict, self).__init__(*args, **kwargs) 10 | self.__dict__ = self 11 | 12 | 13 | def build_env(config, config_name, path): 14 | t_path = os.path.join(path, config_name) 15 | if config != t_path: 16 | os.makedirs(path, exist_ok=True) 17 | shutil.copyfile(config, os.path.join(path, config_name)) 18 | -------------------------------------------------------------------------------- /unitspeech/text/symbols.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Defines the set of symbols used in text input to the model. 
5 | ''' 6 | _pad = '_' 7 | _punctuation = ';:,.!?¡¿—…"«»“” ' 8 | _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' 9 | _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ̃" 10 | 11 | 12 | # Export all symbols: 13 | symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) 14 | 15 | # Special symbol ids 16 | SPACE_ID = symbols.index(" ") 17 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name="unitspeech", 5 | py_modules=["unitspeech"], 6 | install_requires=[ 7 | "amfm_decompy==1.0.11", 8 | "einops==0.6.1", 9 | "fairseq==0.12.2", 10 | "inflect==7.0.0", 11 | "joblib==1.2.0", 12 | "librosa==0.10.0.post2", 13 | "matplotlib==3.7.1", 14 | "packaging==23.1", 15 | "phonemizer==3.2.1", 16 | "torch==2.0.1", 17 | "torchvision==0.15.2", 18 | "torchaudio==2.0.2", 19 | "transformers==4.30.2", 20 | "unidecode==1.3.6", 21 | ], 22 | ) 23 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/tools/distributed_transcribe/single.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | 7 | DENSE_NAME=hubert-base-ls960 8 | QUANTIZER_NAME=kmeans 9 | VOCAB_SIZE=50 10 | MANIFEST=manifest.tsv 11 | TRANSCRIPT=transcript 12 | 13 | python transcribe.py \ 14 | --manifest $MANIFEST \ 15 | --output=$TRANSCRIPT \ 16 | --dense_model=$DENSE_NAME \ 17 | --quantizer_model=$QUANTIZER_NAME \ 18 | --vocab_size=$VOCAB_SIZE \ 19 | --durations --deduplicate 20 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/tools/distributed_transcribe/local.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | WORKERS_PER_NODE=4 7 | DENSE_NAME=hubert-base-ls960 8 | QUANTIZER_NAME=kmeans 9 | VOCAB_SIZE=50 10 | MANIFEST=manifest.tsv 11 | TRANSCRIPT=transcript 12 | 13 | python -m torch.distributed.run --nproc_per_node=$WORKERS_PER_NODE transcribe.py \ 14 | --manifest $MANIFEST \ 15 | --output=$TRANSCRIPT \ 16 | --dense_model=$DENSE_NAME \ 17 | --quantizer_model=$QUANTIZER_NAME \ 18 | --vocab_size=$VOCAB_SIZE \ 19 | --durations --deduplicate 20 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | 6 | from setuptools import find_packages, setup 7 | 8 | with open("requirements.txt") as f: 9 | requirements = f.read().splitlines() 10 | 11 | setup( 12 | name="textless", 13 | version="0.1.0", 14 | url="https://github.com/facebookresearch/textlesslib", 15 | author="Textless NLP team at Facebook AI Research", 16 | author_email="kharitonov@fb.com", 17 | description="Tools for Textless NLP Research", 18 | packages=find_packages(), 19 | install_requires=requirements, 20 | ) 21 | -------------------------------------------------------------------------------- /unitspeech/checkpoints/finetune.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "n_units": 1000, 4 | "n_feats": 80, 5 | "n_fft": 1024, 6 | "hop_length": 256, 7 | "win_length": 1024, 8 | "sampling_rate": 22050, 9 | "mel_fmin": 0.0, 10 | "mel_fmax": 8000.0 11 | }, 12 | "encoder": { 13 | "n_channels": 192, 14 | "filter_channels": 768, 15 | "n_layers": 6, 16 | "kernel_size": 3, 17 | "p_dropout": 0.1, 18 | "n_heads": 2, 19 | "window_size": 4 20 | }, 21 | "decoder": { 22 | "dim": 128, 23 | "dim_mults": [1, 2, 4, 8], 24 | "pe_scale": 1000, 25 | "beta_min": 0.05, 26 | "beta_max": 20.0, 27 | "spk_emb_dim": 256 28 | }, 29 | "train": { 30 | "out_size_second": 2, 31 | "vocoder_config_path": "unitspeech/vocoder/checkpts/bigvgan-config.json", 32 | "vocoder_ckpt_path": "unitspeech/vocoder/checkpts/bigvgan.pt" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /unitspeech/checkpoints/voice-conversion.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "n_feats": 80, 4 | "n_fft": 1024, 5 | "hop_length": 256, 6 | "win_length": 1024, 7 | "sampling_rate": 22050, 8 | "mel_fmin": 0.0, 9 | "mel_fmax": 8000.0 10 | }, 11 | "encoder": { 12 | "n_channels": 192, 13 | "filter_channels": 768, 14 | "n_layers": 6, 15 | "kernel_size": 3, 16 | "p_dropout": 0.1, 17 | "n_heads": 2, 18 | "window_size": 4, 19 | "n_contentvec": 768 20 | }, 21 | "decoder": { 22 | "dim": 128, 23 | "dim_mults": [1, 2, 4, 8], 24 | "pe_scale": 1000, 25 | "beta_min": 0.05, 26 | "beta_max": 20.0, 27 | "spk_emb_dim": 256 28 | }, 29 | "train": { 30 | "out_size_second": 2, 31 | "vocoder_config_path": "unitspeech/vocoder/checkpts/bigvgan-config.json", 32 | "vocoder_ckpt_path": "unitspeech/vocoder/checkpts/bigvgan.pt" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /unitspeech/vocoder/alias_free_torch/act.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 
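A minimal sketch (not a file in this repository) of loading one of the JSON configs above; it assumes the interpreter is started from the repository root and reuses the `AttrDict` helper from `unitspeech/vocoder/env.py` shown earlier. Only the top-level sections become attributes; nested sections remain plain dicts.

```python
# Sketch only: load unitspeech/checkpoints/finetune.json into an AttrDict.
# Assumes the current working directory is the repository root.
import json

from unitspeech.vocoder.env import AttrDict

with open("unitspeech/checkpoints/finetune.json") as f:
    config = AttrDict(json.load(f))

print(config.data["sampling_rate"])  # nested sections stay plain dicts -> 22050
print(config.decoder["dim_mults"])   # -> [1, 2, 4, 8]
```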
3 | 4 | import torch.nn as nn 5 | from .resample import UpSample1d, DownSample1d 6 | 7 | 8 | class Activation1d(nn.Module): 9 | def __init__(self, 10 | activation, 11 | up_ratio: int = 2, 12 | down_ratio: int = 2, 13 | up_kernel_size: int = 12, 14 | down_kernel_size: int = 12): 15 | super().__init__() 16 | self.up_ratio = up_ratio 17 | self.down_ratio = down_ratio 18 | self.act = activation 19 | self.upsample = UpSample1d(up_ratio, up_kernel_size) 20 | self.downsample = DownSample1d(down_ratio, down_kernel_size) 21 | 22 | # x: [B,C,T] 23 | def forward(self, x): 24 | x = self.upsample(x) 25 | x = self.act(x) 26 | x = self.downsample(x) 27 | 28 | return x -------------------------------------------------------------------------------- /unitspeech/base.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/huawei-noah/Speech-Backbones/tree/main/Grad-TTS """ 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | class BaseModule(torch.nn.Module): 8 | def __init__(self): 9 | super(BaseModule, self).__init__() 10 | 11 | @property 12 | def nparams(self): 13 | """ 14 | Returns number of trainable parameters of the module. 15 | """ 16 | num_params = 0 17 | for name, param in self.named_parameters(): 18 | if param.requires_grad: 19 | num_params += np.prod(param.detach().cpu().numpy().shape) 20 | return num_params 21 | 22 | 23 | def relocate_input(self, x: list): 24 | """ 25 | Relocates provided tensors to the same device set for the module. 26 | """ 27 | device = next(self.parameters()).device 28 | for i in range(len(x)): 29 | if isinstance(x[i], torch.Tensor) and x[i].device != device: 30 | x[i] = x[i].to(device) 31 | return x 32 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/examples/resynthesis/README.md: -------------------------------------------------------------------------------- 1 | # Discrete Resynthesis example 2 | 3 | In `resynth.py` we showcase a simple demonstration of the audio resynthesis done via HuBERT-based discrete pseudo-units. The code closely 4 | follows the [unit2speech module](https://github.com/pytorch/fairseq/tree/main/examples/textless_nlp/gslm/unit2speech) of GSLM. 5 | 6 | # How to run 7 | Below is an example of running the script: 8 | ```bash 9 | python resynth.py --input test_input.wav --output=test_output.wav --vocab_size=100 --decoder_steps=500 10 | ``` 11 | 12 | `resynth.py` supports the following command-line arguments: 13 | * `--dense_model_name`: name of the dense representation model to be used (supported: `hubert-base-ls960` and `cpc-big-ll6k`); 14 | * `--input`: the input audio file (must have a sample rate of 16 kHz); 15 | * `--output`: the output file name; 16 | * `--vocab_size`: the size of the quantization vocabulary to be used (one of 50, 100, 200); 17 | * `--decoder_steps`: determines the maximal duration of the produced audio.
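Besides the CLI above, the encoding half can be sketched programmatically. The snippet below is illustrative rather than a reproduction of `resynth.py`; it only uses calls that appear in the library's own tests elsewhere in this tree (`SpeechEncoder.by_name` and invoking the encoder on a 1-D 16 kHz waveform), and `test_input.wav` is the same placeholder file name as in the command above.

```python
# Illustrative sketch, not the actual resynth.py: extract discrete units with
# the vendored textless-lib, mirroring usage from tests/test_model_handling.py
# and tests/test_quantized_dataset.py.
import torchaudio

from unitspeech.textlesslib.textless.data.speech_encoder import SpeechEncoder

encoder = SpeechEncoder.by_name(
    dense_model_name="hubert-base-ls960",
    quantizer_model_name="kmeans",
    vocab_size=100,
    need_f0=False,
    deduplicate=True,
    f0_normalizer=None,
    f0_quantizer=None,
)

waveform, sample_rate = torchaudio.load("test_input.wav")  # expected to be 16 kHz
encoded = encoder(waveform.squeeze(0))                     # 1-D waveform in

print(encoded["units"])      # deduplicated pseudo-unit ids
print(encoded["durations"])  # run-lengths matching the units
```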
18 | -------------------------------------------------------------------------------- /unitspeech/checkpoints/text-to-speech.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "n_feats": 80, 4 | "n_fft": 1024, 5 | "hop_length": 256, 6 | "win_length": 1024, 7 | "sampling_rate": 22050, 8 | "mel_fmin": 0.0, 9 | "mel_fmax": 8000.0 10 | }, 11 | "encoder": { 12 | "n_channels": 192, 13 | "filter_channels": 768, 14 | "n_layers": 6, 15 | "kernel_size": 3, 16 | "p_dropout": 0.1, 17 | "n_heads": 2, 18 | "window_size": 4 19 | }, 20 | "duration_predictor": { 21 | "in_channels": 192, 22 | "filter_channels": 256, 23 | "kernel_size": 3, 24 | "p_dropout": 0.1, 25 | "spk_emb_dim": 256 26 | }, 27 | "decoder": { 28 | "dim": 128, 29 | "dim_mults": [1, 2, 4, 8], 30 | "pe_scale": 1000, 31 | "beta_min": 0.05, 32 | "beta_max": 20.0, 33 | "spk_emb_dim": 256 34 | }, 35 | "train": { 36 | "out_size_second": 2, 37 | "vocoder_config_path": "unitspeech/vocoder/checkpts/bigvgan-config.json", 38 | "vocoder_ckpt_path": "unitspeech/vocoder/checkpts/bigvgan.pt" 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/tools/distributed_transcribe/slurm.sbatch: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | 7 | ## Set output and job name 8 | #SBATCH --job-name=transcribe 9 | #SBATCH --output=log.out 10 | #SBATCH --error=log.err 11 | ## partition name 12 | #SBATCH --partition=devlab 13 | 14 | ## number of nodes, tasks per nodes, etc 15 | #SBATCH --nodes=1 16 | #SBATCH --gpus-per-node=8 17 | #SBATCH --ntasks-per-node=32 18 | #SBATCH --time 4320 19 | #SBATCH --mem 400G 20 | #SBATCH --gres=gpu:8 21 | #SBATCH --cpus-per-task=2 22 | 23 | DENSE_NAME=hubert-base-ls960 24 | QUANTIZER_NAME=kmeans 25 | VOCAB_SIZE=50 26 | MANIFEST=manifest.tsv 27 | TRANSCRIPT=transcript 28 | 29 | srun -u python transcribe.py \ 30 | --manifest $MANIFEST \ 31 | --output=$TRANSCRIPT \ 32 | --dense_model=$DENSE_NAME \ 33 | --quantizer_model=$QUANTIZER_NAME \ 34 | --vocab_size=$VOCAB_SIZE \ 35 | --durations --deduplicate 36 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/tools/distributed_transcribe/data_handler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
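For context on the `MANIFEST=manifest.tsv` file consumed by the scripts above and read by the `ManifestDataset` below: the first line holds the root directory, and every following line starts with an audio path relative to that root (only the first whitespace-separated field is used, and the audio must be sampled at 16 kHz). A hypothetical manifest with made-up paths might look like:

```
/datasets/LibriSpeech/dev-clean
84/121123/84-121123-0000.flac   93680
84/121123/84-121123-0001.flac   77360
```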
5 | 6 | import torchaudio 7 | 8 | import pathlib 9 | 10 | import logging 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class ManifestDataset: 16 | def __init__(self, manifest): 17 | with open(manifest, "r") as fin: 18 | self.root = pathlib.Path(fin.readline().strip()) 19 | self.files = [x.strip().split()[0] for x in fin.readlines()] 20 | 21 | logger.info( 22 | f"Init dataset with root in {self.root}, containing {len(self.files)} files" 23 | ) 24 | 25 | def __len__(self): 26 | return len(self.files) 27 | 28 | def __getitem__(self, k): 29 | path = self.root / self.files[k] 30 | data, sr = torchaudio.load(str(path)) 31 | 32 | assert sr == 16_000 33 | return data.squeeze(0), path.with_suffix("").name 34 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/examples/gslm/README.md: -------------------------------------------------------------------------------- 1 | # Generative Spoken Language Modeling pipeline 2 | 3 | ## Retrieve a language model 4 | 5 | Assume you want to experiment with a pre-trained language model that is trained on HuBERT representations, quantized with a codebook of size 100. 6 | Firstly, you need to download and unpack the model itself: 7 | ```bash 8 | mkdir LM/ 9 | wget https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/lm_km100/hubert100_lm.tgz -O LM/hubert100_lm.tgz 10 | cd LM/ && tar -xvf hubert100_lm.tgz 11 | ``` 12 | (other checkpoints can be found in the [Textless NLP GSLM release](https://github.com/pytorch/fairseq/tree/main/examples/textless_nlp/gslm/ulm).) 13 | 14 | ## Run Speech Continuation on a file 15 | To run the speech continuation pipeline with the previously downloaded models, you can use the following command: 16 | ```bash 17 | python sample.py \ 18 | --language-model-data-dir=LM/hubert100_lm \ 19 | --input-file 174-84280-0004.flac \ 20 | --output-file output_new.wav \ 21 | --prompt-duration-sec=3 \ 22 | --temperature=0.7 \ 23 | --vocab-size=100 24 | ``` 25 | -------------------------------------------------------------------------------- /unitspeech/text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | from unitspeech.text import cleaners 3 | from unitspeech.text.symbols import symbols 4 | 5 | 6 | # Mappings from symbol to numeric ID and vice versa: 7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 9 | 10 | 11 | def phonemize(text, global_phonemizer): 12 | text = cleaners.convert_to_ascii(text) 13 | text = cleaners.lowercase(text) 14 | text = cleaners.expand_abbreviations(text) 15 | phonemes = global_phonemizer.phonemize([text], strip=True)[0] 16 | phonemes = cleaners.collapse_whitespace(phonemes) 17 | return phonemes 18 | 19 | 20 | def cleaned_text_to_sequence(cleaned_text): 21 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
22 | Args: 23 | text: string to convert to a sequence 24 | Returns: 25 | List of integers corresponding to the symbols in the text 26 | ''' 27 | sequence = [_symbol_to_id[symbol] for symbol in cleaned_text] 28 | return sequence 29 | -------------------------------------------------------------------------------- /unitspeech/text/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /unitspeech/vocoder/incl_licenses/LICENSE_5: -------------------------------------------------------------------------------- 1 | Copyright 2020 Alexandre Défossez 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 4 | associated documentation files (the "Software"), to deal in the Software without restriction, 5 | including without limitation the rights to use, copy, modify, merge, publish, distribute, 6 | sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is 7 | furnished to do so, subject to the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be included in all copies or 10 | substantial portions of the Software. 11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT 13 | NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 14 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 15 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /unitspeech/vocoder/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 NVIDIA CORPORATION. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /unitspeech/vocoder/incl_licenses/LICENSE_1: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jungil Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /unitspeech/vocoder/incl_licenses/LICENSE_2: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Edward Dixon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /unitspeech/textlesslib/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Facebook, Inc. and its affiliates. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/tests/test_checkpoint_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | 6 | from unitspeech.textlesslib.textless.checkpoint_manager import CHECKPOINT_MANAGER 7 | import tempfile 8 | import pathlib 9 | import pytest 10 | 11 | 12 | def test_checkpoint_manager(): 13 | codes = CHECKPOINT_MANAGER.get_by_name( 14 | "hubert-base-ls960-kmeans-50-tacotron-codes", download_if_needed=True 15 | ) 16 | assert pathlib.Path(codes).exists() 17 | 18 | with pytest.raises(KeyError): 19 | codes = CHECKPOINT_MANAGER.get_by_name("123", download_if_needed=True) 20 | 21 | 22 | def test_changing_root(): 23 | name = "hubert-base-ls960-kmeans-50-tacotron-codes" 24 | 25 | with tempfile.TemporaryDirectory() as tmpdir: 26 | CHECKPOINT_MANAGER.set_root(tmpdir) 27 | with pytest.raises(FileNotFoundError): 28 | CHECKPOINT_MANAGER.get_by_name(name, download_if_needed=False) 29 | 30 | CHECKPOINT_MANAGER.get_by_name(name, download_if_needed=True) 31 | assert (pathlib.Path(tmpdir) / CHECKPOINT_MANAGER.storage[name].fname).exists() 32 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/examples/speaker_probing/README.md: -------------------------------------------------------------------------------- 1 | # Speaker probing example 2 | 3 | This directory contains a short example that illustrates the speaker probing task. Specifically, we investigate whether an anonymised speaker id can 4 | be predicted based on their utterances represented as (potentially quantized) HuBERT representations. This example uses LibriSpeech dev-clean as a dataset. 5 | 6 | ## Running the example 7 | To train a simple speaker classifier and get its accuracy on validation data, it is enough to run a single command: 8 | ```python train.py --model_type=discrete --seed=0 --epochs=5 --vocab_size=50``` 9 | This will train a small Transformer model on HuBERT representations, quantized into a vocabulary of 50 pseudo-units. 10 | 11 | ## Command-line arguments 12 | * `--dense_model_name`: dense model to be used. Must be either `hubert-base-ls960` or `cpc-big-ll6k`; 13 | * `--seed`: sets the random seed; 14 | * `--epochs`: sets the number of training epochs; 15 | * `--vocab_size`: sets the size of the codebook. The example uses pre-trained codebooks and supports vocabulary sizes of 50, 100, and 200; 16 | * `--model_type`: selects the model/representation to be used. Must be one of [`discrete`, `continuous`, `baseline` (default)]. 17 | 18 | 19 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/tests/test_quantized_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree.
5 | 6 | import pathlib 7 | from unitspeech.textlesslib.textless.data.quantized_datasets import QuantizedLibriSpeech 8 | from unitspeech.textlesslib.textless.data.speech_encoder import SpeechEncoder 9 | 10 | 11 | def test_quantized_librispeech(): 12 | url = "dev-clean" 13 | root = "./data" 14 | 15 | pathlib.Path(root).mkdir(exist_ok=True) 16 | 17 | dense_model_name = "hubert-base-ls960" 18 | quantizer_name = "kmeans" 19 | vocab_size = 100 20 | 21 | encoder = SpeechEncoder.by_name( 22 | dense_model_name=dense_model_name, 23 | quantizer_model_name=quantizer_name, 24 | vocab_size=vocab_size, 25 | need_f0=True, 26 | deduplicate=True, 27 | f0_normalizer=None, 28 | f0_quantizer=None, 29 | ) 30 | 31 | quantized_dataset = QuantizedLibriSpeech( 32 | root=root, speech_encoder=encoder, url=url, download=True 33 | ) 34 | item = quantized_dataset[0] 35 | 36 | # checking a few invariants 37 | assert item["units"].size(0) == item["durations"].size(0) == item["f0"].size(0) 38 | assert item["durations"].sum().item() == item["dense"].size(0) 39 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/data/collater_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | 7 | import torch 8 | 9 | 10 | def collate_tensors(stream, pad): 11 | """ 12 | >>> tensors = [torch.tensor(x) for x in [[1,2,3], [1]]] 13 | >>> pad = 0 14 | >>> collate_tensors(tensors, pad) 15 | tensor([[1, 2, 3], 16 | [1, 0, 0]]) 17 | """ 18 | assert len(stream) > 0 19 | 20 | length = max(v.size(0) for v in stream) 21 | n_samples = len(stream) 22 | 23 | collated = stream[0].new_full((n_samples, length), pad) 24 | 25 | for i, v in enumerate(stream): 26 | collated[i, : v.size(0)] = v 27 | 28 | return collated 29 | 30 | 31 | def wrap_bos_eos(units, durations, f0, dense, bos, eos): 32 | assert units.size(0) == durations.size(0) == dense.size(0) 33 | if f0 is not None: 34 | assert units.size(0) == f0.size(0) 35 | 36 | units = torch.cat([bos, units, eos]) 37 | z = torch.zeros_like(durations[0:1]) 38 | durations = torch.cat([z, durations, z]) 39 | if f0 is not None: 40 | z = torch.zeros_like(f0[0:1]) 41 | f0 = torch.cat([z, f0, z]) 42 | z = torch.zeros_like(dense[0:1, :]) 43 | dense = torch.cat([z, dense, z]) 44 | 45 | return units, durations, f0, dense 46 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to textless-lib 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Our Development Process 6 | We develop on GitHub. 7 | 8 | ## Pull Requests 9 | We actively welcome your pull requests. 10 | 11 | 1. Fork the repo and create your branch from `main`. 12 | 2. If you've added code that should be tested, add tests. 13 | 3. If you've changed APIs, update the documentation. 14 | 4. Ensure the test suite passes and add new tests if this is relevant. 15 | 5. Run `black` code-style formatter. 16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 17 | 18 | ## Contributor License Agreement ("CLA") 19 | In order to accept your pull request, we need you to submit a CLA. 
You only need 20 | to do this once to work on any of Meta's open source projects. 21 | 22 | Complete your CLA here: 23 | 24 | ## Issues 25 | We use GitHub issues to track public bugs. Please ensure your description is 26 | clear and has sufficient instructions to be able to reproduce the issue. 27 | 28 | Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe 29 | disclosure of security bugs. In those cases, please go through the process 30 | outlined on that page and do not file a public issue. 31 | 32 | ## Coding Style 33 | We use `black` to enforce a uniform codestyle. 34 | 35 | ## License 36 | By contributing to textless-lib, you agree that your contributions will be licensed 37 | under the LICENSE file in the root directory of this source tree. -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/data/kmeans_quantizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | import joblib 8 | import warnings 9 | 10 | 11 | class KMeansQuantizer(torch.nn.Module): 12 | def __init__(self, checkpoint_path): 13 | super().__init__() 14 | self.register_buffer("_float_tensor", torch.tensor([0], dtype=torch.float)) 15 | self.kmeans_model = self.load_kmeans_model(checkpoint_path) 16 | 17 | def forward(self, x): 18 | return torch.from_numpy(self.kmeans_model.predict(x.cpu().numpy())).to( 19 | self.device 20 | ) 21 | 22 | @property 23 | def vocab_size(self) -> int: 24 | return self.kmeans_model.n_clusters 25 | 26 | @property 27 | def device(self): 28 | return self._float_tensor.device 29 | 30 | @staticmethod 31 | def load_kmeans_model(checkpoint_path: str): 32 | with open(checkpoint_path, "rb") as fd: 33 | with warnings.catch_warnings(): 34 | # produces lots of version warnings which can be annoying when we have many workers 35 | warnings.simplefilter("ignore") 36 | kmeans_model = joblib.load(fd) 37 | # some of the GSLM checkpoints (CPC) were saved under a different scikit version 38 | if not hasattr(kmeans_model, "_n_threads"): 39 | kmeans_model._n_threads = 40 40 | 41 | kmeans_model.verbose = False 42 | return kmeans_model 43 | -------------------------------------------------------------------------------- /unitspeech/vocoder/incl_licenses/LICENSE_4: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Seungwon Park 박승원 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /unitspeech/vocoder/xutils.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import glob 4 | import os 5 | import matplotlib 6 | import torch 7 | from torch.nn.utils import weight_norm 8 | matplotlib.use("Agg") 9 | import matplotlib.pylab as plt 10 | 11 | 12 | def plot_spectrogram(spectrogram): 13 | fig, ax = plt.subplots(figsize=(10, 2)) 14 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", 15 | interpolation='none') 16 | plt.colorbar(im, ax=ax) 17 | 18 | fig.canvas.draw() 19 | plt.close() 20 | 21 | return fig 22 | 23 | 24 | def init_weights(m, mean=0.0, std=0.01): 25 | classname = m.__class__.__name__ 26 | if classname.find("Conv") != -1: 27 | m.weight.data.normal_(mean, std) 28 | 29 | 30 | def apply_weight_norm(m): 31 | classname = m.__class__.__name__ 32 | if classname.find("Conv") != -1: 33 | weight_norm(m) 34 | 35 | 36 | def get_padding(kernel_size, dilation=1): 37 | return int((kernel_size*dilation - dilation)/2) 38 | 39 | 40 | def load_checkpoint(filepath, device): 41 | assert os.path.isfile(filepath) 42 | print("Loading '{}'".format(filepath)) 43 | checkpoint_dict = torch.load(filepath, map_location=device) 44 | print("Complete.") 45 | return checkpoint_dict 46 | 47 | 48 | def save_checkpoint(filepath, obj): 49 | print("Saving checkpoint to {}".format(filepath)) 50 | torch.save(obj, filepath) 51 | print("Complete.") 52 | 53 | 54 | def scan_checkpoint(cp_dir, prefix): 55 | pattern = os.path.join(cp_dir, prefix + '????????') 56 | cp_list = glob.glob(pattern) 57 | if len(cp_list) == 0: 58 | return None 59 | return sorted(cp_list)[-1] 60 | 61 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/tts_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
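The helpers in `unitspeech/vocoder/xutils.py` above are thin wrappers around `glob` and `torch.load`/`torch.save`. A minimal sketch of combining them to resume from the newest checkpoint follows; the `g_` filename prefix and the `checkpoints/` directory are assumed, HiFi-GAN-style conventions rather than something defined in this repository.

```python
# Sketch only: pick up the newest checkpoint whose name matches "g_????????"
# and load it. The directory and prefix are assumptions for illustration.
import torch

from unitspeech.vocoder.xutils import scan_checkpoint, load_checkpoint

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

latest = scan_checkpoint("checkpoints/", "g_")    # returns None if nothing matches
if latest is not None:
    state_dict = load_checkpoint(latest, device)  # plain torch.load with map_location
```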
5 | 6 | 7 | import torch 8 | import numpy as np 9 | from .text import ( 10 | EOS_TOK, 11 | SOS_TOK, 12 | code_to_sequence, 13 | text_to_sequence, 14 | ) 15 | from .utils import ( 16 | load_code_dict, 17 | ) 18 | 19 | 20 | class TacotronInputDataset: 21 | def __init__(self, hparams, append_str=""): 22 | self.is_text = getattr(hparams, "text_or_code", "text") == "text" 23 | if not self.is_text: 24 | self.code_dict = load_code_dict(hparams.code_dict) 25 | self.code_key = hparams.code_key 26 | self.add_sos = hparams.add_sos 27 | self.add_eos = hparams.add_eos 28 | self.collapse_code = hparams.collapse_code 29 | self.append_str = append_str 30 | 31 | def process_code(self, inp_str): 32 | inp_toks = inp_str.split() 33 | if self.add_sos: 34 | inp_toks = [SOS_TOK] + inp_toks 35 | if self.add_eos: 36 | inp_toks = inp_toks + [EOS_TOK] 37 | return code_to_sequence(inp_toks, self.code_dict, self.collapse_code) 38 | 39 | def process_text(self, inp_str): 40 | return text_to_sequence(inp_str, ["english_cleaners"]) 41 | 42 | def get_tensor(self, inp_str): 43 | # uid, txt, inp_str = self._get_data(idx) 44 | inp_str = inp_str + self.append_str 45 | if self.is_text: 46 | inp_toks = self.process_text(inp_str) 47 | else: 48 | inp_toks = self.process_code(inp_str) 49 | return torch.from_numpy(np.array(inp_toks)).long() 50 | 51 | def __len__(self): 52 | return len(self.data) 53 | -------------------------------------------------------------------------------- /unitspeech/text/cleaners.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 13 | ''' 14 | 15 | import re 16 | from unidecode import unidecode 17 | 18 | 19 | # Regular expression matching whitespace: 20 | _whitespace_re = re.compile(r'\s+') 21 | 22 | # List of (regular expression, replacement) pairs for abbreviations: 23 | _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 24 | ('mrs', 'misess'), 25 | ('mr', 'mister'), 26 | ('dr', 'doctor'), 27 | ('st', 'saint'), 28 | ('co', 'company'), 29 | ('jr', 'junior'), 30 | ('maj', 'major'), 31 | ('gen', 'general'), 32 | ('drs', 'doctors'), 33 | ('rev', 'reverend'), 34 | ('lt', 'lieutenant'), 35 | ('hon', 'honorable'), 36 | ('sgt', 'sergeant'), 37 | ('capt', 'captain'), 38 | ('esq', 'esquire'), 39 | ('ltd', 'limited'), 40 | ('col', 'colonel'), 41 | ('ft', 'fort'), 42 | ]] 43 | 44 | 45 | def expand_abbreviations(text): 46 | for regex, replacement in _abbreviations: 47 | text = re.sub(regex, replacement, text) 48 | return text 49 | 50 | 51 | def lowercase(text): 52 | return text.lower() 53 | 54 | 55 | def collapse_whitespace(text): 56 | return re.sub(_whitespace_re, ' ', text) 57 | 58 | 59 | def convert_to_ascii(text): 60 | return unidecode(text) 61 | 62 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | from unitspeech.textlesslib.textless.data.cpc_feature_reader import CpcFeatureReader 7 | from unitspeech.textlesslib.textless.data.hubert_feature_reader import HubertFeatureReader 8 | from unitspeech.textlesslib.textless.data.kmeans_quantizer import KMeansQuantizer 9 | from unitspeech.textlesslib.textless.checkpoint_manager import CHECKPOINT_MANAGER 10 | from unitspeech.textlesslib.textless.vocoders.tacotron2.vocoder import TacotronVocoder 11 | 12 | DENSE_MODELS = { 13 | "hubert-base-ls960": HubertFeatureReader, 14 | "mhubert-base-vp_en_es_fr": HubertFeatureReader, 15 | "cpc-big-ll6k": CpcFeatureReader, 16 | } 17 | 18 | 19 | QUANTIZER_MODELS = { 20 | "kmeans": KMeansQuantizer, 21 | } 22 | 23 | 24 | def dispatch_dense_model(name: str, **kwargs): 25 | model_class = DENSE_MODELS[name] 26 | checkpoint_path = CHECKPOINT_MANAGER.get_by_name(name) 27 | return model_class(checkpoint_path, **kwargs) 28 | 29 | 30 | def dispatch_quantizer(dense_model_name: str, quantizer_name: str, vocab_size: int): 31 | quantizer_checkpoint_name = f"{dense_model_name}-{quantizer_name}-{vocab_size}" 32 | checkpoint_path = CHECKPOINT_MANAGER.get_by_name(quantizer_checkpoint_name) 33 | quantizer = QUANTIZER_MODELS[quantizer_name](checkpoint_path) 34 | return quantizer 35 | 36 | 37 | def dispatch_vocoder( 38 | dense_model_name: str, 39 | quantizer_name: str, 40 | vocoder_name: str, 41 | vocab_size: int, 42 | ): 43 | if vocoder_name == "tacotron": 44 | vocoder = TacotronVocoder.by_name( 45 | dense_model_name, 46 | quantizer_name, 47 | vocab_size, 48 | ) 49 | else: 50 | assert False, "Unsupported vocoder name" 51 | return vocoder 52 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/symbols.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017 Keith Ito 2 | 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit 
persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | # THE SOFTWARE. 20 | 21 | """ from MIT-licensed https://github.com/keithito/tacotron """ 22 | 23 | ''' 24 | Defines the set of symbols used in text input to the model. 25 | 26 | The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' 27 | from . import cmudict 28 | 29 | _pad = '_' 30 | _punctuation = '!\'(),.:;? ' 31 | _special = '-' 32 | _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' 33 | 34 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 35 | _arpabet = ['@' + s for s in cmudict.valid_symbols] 36 | 37 | # Export all symbols: 38 | symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet 39 | -------------------------------------------------------------------------------- /unitspeech/vocoder/alias_free_torch/resample.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 
3 | 4 | import torch.nn as nn 5 | from torch.nn import functional as F 6 | from .filter import LowPassFilter1d 7 | from .filter import kaiser_sinc_filter1d 8 | 9 | 10 | class UpSample1d(nn.Module): 11 | def __init__(self, ratio=2, kernel_size=None): 12 | super().__init__() 13 | self.ratio = ratio 14 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 15 | self.stride = ratio 16 | self.pad = self.kernel_size // ratio - 1 17 | self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2 18 | self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 19 | filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, 20 | half_width=0.6 / ratio, 21 | kernel_size=self.kernel_size) 22 | self.register_buffer("filter", filter) 23 | 24 | # x: [B, C, T] 25 | def forward(self, x): 26 | _, C, _ = x.shape 27 | 28 | x = F.pad(x, (self.pad, self.pad), mode='replicate') 29 | x = self.ratio * F.conv_transpose1d( 30 | x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C) 31 | x = x[..., self.pad_left:-self.pad_right] 32 | 33 | return x 34 | 35 | 36 | class DownSample1d(nn.Module): 37 | def __init__(self, ratio=2, kernel_size=None): 38 | super().__init__() 39 | self.ratio = ratio 40 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 41 | self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio, 42 | half_width=0.6 / ratio, 43 | stride=ratio, 44 | kernel_size=self.kernel_size) 45 | 46 | def forward(self, x): 47 | xx = self.lowpass(x) 48 | 49 | return xx -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/checkpoint_manager/manager.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
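Tying the `alias_free_torch` pieces above together: `Activation1d` upsamples, applies a pointwise nonlinearity, then low-pass filters and downsamples, so aliasing introduced by the nonlinearity is suppressed. A small sketch follows; `torch.nn.SiLU` is only a stand-in activation chosen for illustration (the BigVGAN-style vocoder here is expected to pair this wrapper with the Snake-type activations in `unitspeech/vocoder/activations.py`).

```python
# Sketch only: wrap an arbitrary pointwise activation with the anti-aliased
# up/downsampling defined in unitspeech/vocoder/alias_free_torch.
import torch
import torch.nn as nn

from unitspeech.vocoder.alias_free_torch.act import Activation1d

anti_aliased_act = Activation1d(activation=nn.SiLU())  # default 2x up/down ratios

x = torch.randn(1, 8, 256)   # [batch, channels, time]
y = anti_aliased_act(x)      # nonlinearity applied at the upsampled rate
print(y.shape)               # expected to match x.shape ([1, 8, 256])
```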
5 | 6 | from typing import Union 7 | 8 | from dataclasses import dataclass 9 | # Modified (UnitSpeech) 10 | # from torchaudio.datasets.utils import download_url 11 | from torch.hub import download_url_to_file 12 | import pathlib 13 | 14 | 15 | @dataclass 16 | class Checkpoint: 17 | name: str 18 | remote_path: str 19 | fname: str 20 | sha256: str 21 | 22 | 23 | class CheckpointManager: 24 | def __init__(self, disk_root: Union[str, pathlib.Path] = "~/.textless/"): 25 | self.disk_root = pathlib.Path(disk_root).expanduser().resolve() 26 | if not self.disk_root.exists(): 27 | self.disk_root.mkdir() 28 | 29 | self.storage: dict[str, Checkpoint] = {} 30 | 31 | def add_checkpoint(self, checkpoint: Checkpoint) -> None: 32 | name = checkpoint.name 33 | assert name not in self.storage 34 | self.storage[name] = checkpoint 35 | 36 | def download_by_name(self, name: str) -> None: 37 | checkpoint = self.storage[name] 38 | # Modified (UnitSpeech) 39 | # download_url( 40 | # checkpoint.remote_path, 41 | # self.disk_root, 42 | # hash_value=checkpoint.sha256, 43 | # hash_type="sha256", 44 | # filename=checkpoint.fname, 45 | # ) 46 | download_url_to_file( 47 | checkpoint.remote_path, 48 | self.disk_root / checkpoint.fname 49 | ) 50 | 51 | def get_by_name(self, name: str, download_if_needed: bool = True) -> pathlib.Path: 52 | checkpoint = self.storage[name] 53 | disk_name = self.disk_root / checkpoint.fname 54 | 55 | if not disk_name.exists(): 56 | if download_if_needed: 57 | self.download_by_name(name) 58 | else: 59 | raise FileNotFoundError( 60 | f"Checkpoint {checkpoint} was not found locally at {disk_name}, please set `allow_download` flag" 61 | ) 62 | return disk_name 63 | 64 | def set_root(self, new_root: Union[str, pathlib.Path]) -> None: 65 | self.disk_root = pathlib.Path(new_root) 66 | -------------------------------------------------------------------------------- /unitspeech/duration_predictor.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jaywalnut310/glow-tts """ 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | from unitspeech.base import BaseModule 7 | 8 | 9 | class LayerNorm(nn.Module): 10 | def __init__(self, channels, eps=1e-5): 11 | super().__init__() 12 | self.channels = channels 13 | self.eps = eps 14 | 15 | self.gamma = nn.Parameter(torch.ones(channels)) 16 | self.beta = nn.Parameter(torch.zeros(channels)) 17 | 18 | def forward(self, x): 19 | x = x.transpose(1, -1).contiguous() 20 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 21 | return x.transpose(1, -1).contiguous() 22 | 23 | 24 | class DurationPredictor(BaseModule): 25 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, spk_emb_dim=0): 26 | super(DurationPredictor, self).__init__() 27 | in_channels = in_channels + spk_emb_dim 28 | 29 | self.in_channels = in_channels 30 | self.filter_channels = filter_channels 31 | self.p_dropout = p_dropout 32 | 33 | self.drop = torch.nn.Dropout(p_dropout) 34 | self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, 35 | kernel_size, padding=kernel_size//2) 36 | self.norm_1 = LayerNorm(filter_channels) 37 | self.conv_2 = torch.nn.Conv1d(filter_channels, filter_channels, 38 | kernel_size, padding=kernel_size//2) 39 | self.norm_2 = LayerNorm(filter_channels) 40 | self.proj = torch.nn.Conv1d(filter_channels, 1, 1) 41 | 42 | def forward(self, x, x_mask, w=None, g=None, reverse=False): 43 | x = torch.detach(x) 44 | if g is not None: 45 | x = 
torch.cat([x, g.transpose(1, 2).repeat(1, 1, x.shape[-1])], dim=1) 46 | x = self.conv_1(x * x_mask) 47 | x = torch.relu(x) 48 | x = self.norm_1(x) 49 | x = self.drop(x) 50 | x = self.conv_2(x * x_mask) 51 | x = torch.relu(x) 52 | x = self.norm_2(x) 53 | x = self.drop(x) 54 | logw = self.proj(x * x_mask) * x_mask 55 | if not reverse: 56 | logw_ = torch.log(w + 1e-6) * x_mask 57 | return torch.sum((logw - logw_) ** 2) / torch.sum(x_mask) # for averaging 58 | else: 59 | return logw -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | 7 | import torch 8 | import unitspeech.textlesslib.textless.vocoders.tacotron2 as tacotron2 9 | import sys 10 | from unitspeech.textlesslib.textless.checkpoint_manager import CHECKPOINT_MANAGER 11 | from .waveglow_denoiser import Denoiser 12 | from .model import Tacotron2 13 | from .tts_data import TacotronInputDataset 14 | 15 | 16 | def get_waveglow(download_if_needed=True): 17 | 18 | waveglow_path = CHECKPOINT_MANAGER.get_by_name( 19 | "waveglow", download_if_needed=download_if_needed 20 | ) 21 | 22 | sys.path.append(tacotron2.__path__[0]) 23 | waveglow = torch.load(waveglow_path)["model"] 24 | sys.path.pop() 25 | 26 | waveglow = waveglow.cuda().eval() 27 | denoiser = Denoiser(waveglow) 28 | return waveglow, denoiser 29 | 30 | 31 | def load_tacotron(model_name, max_decoder_steps, download_if_needed=True): 32 | tacotron_path = CHECKPOINT_MANAGER.get_by_name( 33 | model_name, download_if_needed=download_if_needed 34 | ) 35 | ckpt_dict = torch.load(tacotron_path) 36 | 37 | hparams = ckpt_dict["hparams"] 38 | codes_path = CHECKPOINT_MANAGER.get_by_name( 39 | f"{model_name}-codes", download_if_needed=download_if_needed 40 | ) 41 | hparams.code_dict = codes_path 42 | 43 | hparams.max_decoder_steps = max_decoder_steps 44 | model = Tacotron2(hparams) 45 | model.load_state_dict(ckpt_dict["model_dict"]) 46 | model = model.cuda().eval().half() 47 | 48 | tts_dataset = TacotronInputDataset(hparams) 49 | 50 | return model, tts_dataset 51 | 52 | 53 | def synthesize_audio( 54 | units, model, tts_dataset, waveglow, denoiser, lab=None, denoiser_strength=0.0 55 | ): 56 | quantized_units_str = " ".join(map(str, units.tolist())) 57 | tokens = tts_dataset.get_tensor(quantized_units_str).cuda().unsqueeze(0) 58 | 59 | if lab is not None: 60 | lab = torch.LongTensor(1).cuda().fill_(lab) 61 | 62 | with torch.no_grad(): 63 | _, mel, _, ali, has_eos = model.inference(tokens, lab, ret_has_eos=True) 64 | mel = mel.float() 65 | audio = waveglow.infer(mel, sigma=0.666) 66 | denoised_audio = denoiser(audio, strength=denoiser_strength).squeeze(1) 67 | return mel, audio, denoised_audio, has_eos 68 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/tests/test_model_handling.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
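Before the test module below, a hedged sketch of how the three Tacotron2 helpers above (`get_waveglow`, `load_tacotron`, `synthesize_audio`) are typically chained; the checkpoint name is a placeholder (real names live in the textless checkpoint registry) and a CUDA device is required:

```
# Illustrative only: "<tacotron-checkpoint-name>" is a placeholder, not a
# registered checkpoint; the call signatures follow the code above.
import torch
from unitspeech.textlesslib.textless.vocoders.tacotron2 import (
    get_waveglow,
    load_tacotron,
    synthesize_audio,
)

waveglow, denoiser = get_waveglow(download_if_needed=True)
tacotron, tts_dataset = load_tacotron("<tacotron-checkpoint-name>", max_decoder_steps=500)

units = torch.randint(0, 100, (150,))   # fake pseudo-unit stream, for illustration
mel, audio, denoised, reached_eos = synthesize_audio(
    units, tacotron, tts_dataset, waveglow, denoiser, denoiser_strength=0.1
)
```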
5 | 6 | import pytest 7 | from unitspeech.textlesslib.textless import dispatch_dense_model, dispatch_quantizer 8 | from unitspeech.textlesslib.textless.data.speech_encoder import SpeechEncoder 9 | import torch 10 | from itertools import product 11 | 12 | from unitspeech.textlesslib.textless.vocoders.tacotron2.vocoder import TacotronVocoder 13 | 14 | 15 | def test_model_dispatch(): 16 | dense_model_name = "hubert-base-ls960" 17 | quantizer_name = "kmeans" 18 | vocab_size = 100 19 | 20 | # getting dense model 21 | dense_model = dispatch_dense_model(dense_model_name) 22 | assert isinstance(dense_model, torch.nn.Module) 23 | 24 | # getting a quantizer for it 25 | assert ( 26 | dispatch_quantizer(dense_model_name, quantizer_name, vocab_size=vocab_size) 27 | is not None 28 | ) 29 | 30 | with pytest.raises(KeyError): 31 | assert dispatch_quantizer(dense_model_name, quantizer_name, vocab_size=101) 32 | 33 | # getting a vocoder for it 34 | assert ( 35 | TacotronVocoder.by_name( 36 | dense_model_name=dense_model_name, 37 | quantizer_model_name=quantizer_name, 38 | vocab_size=vocab_size, 39 | ) 40 | is not None 41 | ) 42 | 43 | 44 | densename_vocabsize = list(product(["hubert-base-ls960", "cpc-big-ll6k"], [50, 100, 200])) 45 | 46 | 47 | @pytest.mark.parametrize("dense_name,vocab_size", densename_vocabsize) 48 | def test_speech_encoder(dense_name, vocab_size): 49 | quantizer_name = "kmeans" 50 | 51 | encoder = SpeechEncoder.by_name( 52 | dense_model_name=dense_name, 53 | quantizer_model_name=quantizer_name, 54 | vocab_size=vocab_size, 55 | need_f0=False, 56 | deduplicate=True, 57 | f0_normalizer=None, 58 | f0_quantizer=None, 59 | ) 60 | 61 | assert encoder is not None 62 | 63 | # let's pass 0.5s of silence thru it 64 | waveform = torch.zeros(encoder.expected_sample_rate // 2) 65 | encoded = encoder(waveform) 66 | 67 | assert encoded 68 | 69 | 70 | @pytest.mark.parametrize("dense_name,vocab_size", densename_vocabsize) 71 | def test_vocoder_lookup(dense_name, vocab_size): 72 | quantizer_name = "kmeans" 73 | 74 | vocoder = TacotronVocoder.by_name( 75 | dense_model_name=dense_name, 76 | quantizer_model_name=quantizer_name, 77 | vocab_size=vocab_size, 78 | ) 79 | assert vocoder is not None 80 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/data/hubert_feature_reader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
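The tests above also double as a usage reference. Before the HuBERT feature reader below, here is the same flow condensed into a hedged sketch (model names are the ones the tests use; the `units` / `durations` keys are the ones consumed by `unitspeech/util.py`):

```
# Condensed from test_speech_encoder above; assumes the same registered names.
import torch
from unitspeech.textlesslib.textless.data.speech_encoder import SpeechEncoder

encoder = SpeechEncoder.by_name(
    dense_model_name="hubert-base-ls960",
    quantizer_model_name="kmeans",
    vocab_size=100,
    need_f0=False,
    deduplicate=True,
    f0_normalizer=None,
    f0_quantizer=None,
)

waveform = torch.zeros(encoder.expected_sample_rate)   # one second of silence
encoded = encoder(waveform)
# "units" / "durations" are the keys unitspeech/util.py reads from this output.
print(encoded["units"][:10], encoded["durations"][:10])
```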
5 | 6 | 7 | import torch 8 | import fairseq 9 | import torch.nn.functional as F 10 | 11 | 12 | class HubertFeatureReader(torch.nn.Module): 13 | def __init__( 14 | self, checkpoint_path, layer=6, max_chunk=100 * 16_000, lazy_load=False 15 | ): 16 | super().__init__() 17 | # NB: fairseq doesn't support pathlib.Path 18 | self.checkpoint_path = str(checkpoint_path) 19 | self.should_normalize = False 20 | self.lazy_load = lazy_load 21 | self.model = None 22 | self.layer = layer 23 | self.max_chunk = max_chunk 24 | # this is useful for determining the device 25 | self.register_buffer("_float_tensor", torch.tensor([0], dtype=torch.float)) 26 | if not self.lazy_load: 27 | self.load_checkpoint_() 28 | 29 | @torch.no_grad() # otherwise some non-leaf nodes appear which breaks serialization 30 | def load_checkpoint_(self): 31 | model, _, task = fairseq.checkpoint_utils.load_model_ensemble_and_task( 32 | [self.checkpoint_path] 33 | ) 34 | self.model = model[0].eval() 35 | self.model = self.model.to(self.device) 36 | for parameter in self.model.parameters(): 37 | parameter.requires_grad_(False) 38 | 39 | self.should_normalize = task.cfg.normalize 40 | 41 | @property 42 | def device(self): 43 | return self._float_tensor.device 44 | 45 | @property 46 | def code_hop_size(self) -> int: 47 | return 320 48 | 49 | @property 50 | def expected_sample_rate(self) -> int: 51 | return 16_000 52 | 53 | def forward(self, x): 54 | if self.lazy_load and self.model is None: 55 | self.load_checkpoint_() 56 | 57 | return self.get_features(x) 58 | 59 | @torch.inference_mode() 60 | def get_features(self, x): 61 | x = x.to(self.device) 62 | if self.should_normalize: 63 | x = F.layer_norm(x, x.shape) 64 | x = x.view(1, -1) 65 | 66 | feat = [] 67 | for start in range(0, x.size(1), self.max_chunk): 68 | x_chunk = x[:, start : start + self.max_chunk] 69 | feat_chunk, _ = self.model.extract_features( 70 | source=x_chunk, 71 | padding_mask=None, 72 | mask=False, 73 | output_layer=self.layer, 74 | ) 75 | feat.append(feat_chunk) 76 | return torch.cat(feat, 1).squeeze(0).cpu() 77 | -------------------------------------------------------------------------------- /unitspeech/speaker_encoder/utils.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/microsoft/UniSpeech/tree/main/downstreams/speaker_verification """ 2 | 3 | import torch 4 | import fairseq 5 | from packaging import version 6 | import torch.nn.functional as F 7 | from fairseq import tasks 8 | from fairseq.checkpoint_utils import load_checkpoint_to_cpu 9 | from fairseq.dataclass.utils import convert_namespace_to_omegaconf 10 | from omegaconf import OmegaConf 11 | from s3prl.upstream.interfaces import UpstreamBase 12 | from torch.nn.utils.rnn import pad_sequence 13 | 14 | def load_model(filepath): 15 | state = torch.load(filepath, map_location=lambda storage, loc: storage) 16 | # state = load_checkpoint_to_cpu(filepath) 17 | state["cfg"] = OmegaConf.create(state["cfg"]) 18 | 19 | if "args" in state and state["args"] is not None: 20 | cfg = convert_namespace_to_omegaconf(state["args"]) 21 | elif "cfg" in state and state["cfg"] is not None: 22 | cfg = state["cfg"] 23 | else: 24 | raise RuntimeError( 25 | f"Neither args nor cfg exist in state keys = {state.keys()}" 26 | ) 27 | 28 | task = tasks.setup_task(cfg.task) 29 | if "task_state" in state: 30 | task.load_state_dict(state["task_state"]) 31 | 32 | model = task.build_model(cfg.model) 33 | 34 | return model, cfg, task 35 | 36 | 37 | ################### 38 | # 
UPSTREAM EXPERT # 39 | ################### 40 | class UpstreamExpert(UpstreamBase): 41 | def __init__(self, ckpt, **kwargs): 42 | super().__init__(**kwargs) 43 | assert version.parse(fairseq.__version__) > version.parse( 44 | "0.10.2" 45 | ), "Please install the fairseq master branch." 46 | 47 | model, cfg, task = load_model(ckpt) 48 | self.model = model 49 | self.task = task 50 | 51 | if len(self.hooks) == 0: 52 | module_name = "self.model.encoder.layers" 53 | for module_id in range(len(eval(module_name))): 54 | self.add_hook( 55 | f"{module_name}[{module_id}]", 56 | lambda input, output: input[0].transpose(0, 1), 57 | ) 58 | self.add_hook("self.model.encoder", lambda input, output: output[0]) 59 | 60 | def forward(self, wavs): 61 | if self.task.cfg.normalize: 62 | wavs = [F.layer_norm(wav, wav.shape) for wav in wavs] 63 | 64 | device = wavs[0].device 65 | wav_lengths = torch.LongTensor([len(wav) for wav in wavs]).to(device) 66 | wav_padding_mask = ~torch.lt( 67 | torch.arange(max(wav_lengths)).unsqueeze(0).to(device), 68 | wav_lengths.unsqueeze(1), 69 | ) 70 | padded_wav = pad_sequence(wavs, batch_first=True) 71 | 72 | features, feat_padding_mask = self.model.extract_features( 73 | padded_wav, 74 | padding_mask=wav_padding_mask, 75 | mask=None, 76 | ) 77 | return { 78 | "default": features, 79 | } 80 | 81 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/tools/distributed_transcribe/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | import os 7 | import subprocess 8 | from dataclasses import dataclass 9 | import torch.distributed as dist 10 | 11 | 12 | @dataclass(frozen=True, repr=True, eq=True, unsafe_hash=True) 13 | class DistributedContext: 14 | is_distributed: bool 15 | rank: int 16 | local_rank: int 17 | world_size: int 18 | mode: str 19 | 20 | @property 21 | def is_leader(self) -> bool: 22 | return self.rank == 0 23 | 24 | 25 | def init_distributed_context(port: int) -> DistributedContext: 26 | # Sometimes the nccl backend hangs on the barrier op (https://github.com/pytorch/pytorch/issues/53658). 27 | # Since it is the only op we care about here, we'd use the gloo backend. 28 | BACKEND = "gloo" 29 | 30 | # default, non-distributed context 31 | context = DistributedContext( 32 | is_distributed=False, rank=0, local_rank=0, world_size=1, mode="none" 33 | ) 34 | 35 | launch_keys = ["MASTER_ADDR", "MASTER_PORT", "WORLD_SIZE", "RANK", "LOCAL_RANK"] 36 | slurm_keys = [ 37 | "SLURM_LOCALID", 38 | "SLURM_PROCID", 39 | "SLURM_NTASKS", 40 | "SLURM_NODEID", 41 | "SLURM_JOB_NODELIST", 42 | ] 43 | 44 | # is it torch.distributed.launch? 45 | if all(key in os.environ for key in launch_keys): 46 | init_method = "env://" 47 | world_size = int(os.environ["WORLD_SIZE"]) 48 | rank = int(os.environ["RANK"]) 49 | local_rank = int(os.environ["LOCAL_RANK"]) 50 | context = DistributedContext( 51 | is_distributed=True, 52 | rank=rank, 53 | world_size=world_size, 54 | local_rank=local_rank, 55 | mode="launch", 56 | ) 57 | dist.init_process_group( 58 | backend=BACKEND, init_method=init_method, world_size=world_size, rank=rank 59 | ) 60 | # is it slurm? 
61 | elif all(key in os.environ for key in slurm_keys): 62 | init_method = "env://" 63 | local_rank = int(os.environ["SLURM_LOCALID"]) 64 | rank = int(os.environ["SLURM_PROCID"]) 65 | world_size = int(os.environ["SLURM_NTASKS"]) 66 | 67 | hostnames = subprocess.check_output( 68 | ["scontrol", "show", "hostnames", os.environ["SLURM_JOB_NODELIST"]] 69 | ) 70 | leader_addr = hostnames.split()[0].decode("utf-8") 71 | 72 | os.environ["MASTER_ADDR"] = leader_addr 73 | os.environ["MASTER_PORT"] = str(port) 74 | os.environ["WORLD_SIZE"] = str(world_size) 75 | os.environ["RANK"] = str(rank) 76 | 77 | context = DistributedContext( 78 | is_distributed=True, 79 | rank=rank, 80 | local_rank=local_rank, 81 | world_size=world_size, 82 | mode="slurm", 83 | ) 84 | dist.init_process_group( 85 | backend=BACKEND, 86 | init_method=init_method, 87 | world_size=world_size, 88 | rank=rank, 89 | ) 90 | 91 | return context 92 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/examples/gslm/sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | import typing as tp 7 | 8 | from fairseq import hub_utils, utils 9 | from fairseq.hub_utils import GeneratorHubInterface 10 | 11 | 12 | class UnitLanguageModelSampler(GeneratorHubInterface): 13 | """ 14 | A simple PyTorch interface for ULM 15 | """ 16 | 17 | def __init__(self, cfg, task, models): 18 | super().__init__(cfg, task, models) 19 | self.model = self.models[0] 20 | self.model.eval() 21 | 22 | def encode(self, unit_str): 23 | tokens = self.task.source_dictionary.encode_line( 24 | unit_str, add_if_not_exist=False 25 | ).long() 26 | return tokens 27 | 28 | def get_prefix_size(self): 29 | return self.cfg.generation.prefix_size 30 | 31 | def post_process_predictions(self, src_tokens, hypos): 32 | src_tokens = utils.strip_pad(src_tokens, self.tgt_dict.pad()) 33 | src_str = None 34 | if self.task.source_dictionary is not None: 35 | src_str = self.task.source_dictionary.string( 36 | src_tokens, self.cfg.common_eval.post_process 37 | ) 38 | return [ 39 | utils.post_process_prediction( 40 | hypo_tokens=hypo["tokens"].int().cpu(), 41 | src_str=src_str, 42 | alignment=hypo["alignment"], 43 | align_dict=self.align_dict, 44 | tgt_dict=self.tgt_dict, 45 | remove_bpe=self.cfg.common_eval.post_process, 46 | )[1] 47 | for hypo in hypos 48 | ] 49 | 50 | def sample( 51 | self, sentences: tp.List[str], beam: int = 1, verbose: bool = False, **kwargs 52 | ): 53 | hypotheses = self.sample_top_hypotheses(sentences, beam, verbose, **kwargs) 54 | return [hypos[0] for hypos in hypotheses] 55 | 56 | def sample_top_hypotheses( 57 | self, sentences: tp.List[str], beam: int = 1, verbose: bool = False, **kwargs 58 | ) -> tp.List[str]: 59 | if isinstance(sentences, str): 60 | return self.sample_top_hypotheses( 61 | [sentences], beam=beam, verbose=verbose, **kwargs 62 | )[0] 63 | tokenized_sentences = [self.encode(sentence) for sentence in sentences] 64 | batched_hypos = self.generate(tokenized_sentences, beam, verbose, **kwargs) 65 | 66 | return [ 67 | self.post_process_predictions(src_tokens, hypos) 68 | for src_tokens, hypos in zip(tokenized_sentences, batched_hypos) 69 | ] 70 | 71 | @classmethod 72 | def from_pretrained( 73 | cls, 74 | model_name_or_path, 75 | checkpoint_file="checkpoint_best.pt", 76 | 
data_name_or_path=".", 77 | **kwargs, 78 | ): 79 | x = hub_utils.from_pretrained( 80 | model_name_or_path, 81 | checkpoint_file, 82 | data_name_or_path, 83 | archive_map=None, 84 | bpe=None, 85 | load_checkpoint_heads=True, 86 | sample_break_mode="eos", 87 | **kwargs, 88 | ) 89 | return cls(x["args"], x["task"], x["models"]) 90 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/cmudict.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017 Keith Ito 2 | 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | # THE SOFTWARE. 20 | 21 | """ from MIT-licensed https://github.com/keithito/tacotron """ 22 | 23 | import re 24 | 25 | 26 | valid_symbols = [ 27 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 28 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 29 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 30 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 31 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 32 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 33 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 34 | ] 35 | 36 | _valid_symbol_set = set(valid_symbols) 37 | 38 | 39 | class CMUDict: 40 | '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 41 | def __init__(self, file_or_path, keep_ambiguous=True): 42 | if isinstance(file_or_path, str): 43 | with open(file_or_path, encoding='latin-1') as f: 44 | entries = _parse_cmudict(f) 45 | else: 46 | entries = _parse_cmudict(file_or_path) 47 | if not keep_ambiguous: 48 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 49 | self._entries = entries 50 | 51 | 52 | def __len__(self): 53 | return len(self._entries) 54 | 55 | 56 | def lookup(self, word): 57 | '''Returns list of ARPAbet pronunciations of the given word.''' 58 | return self._entries.get(word.upper()) 59 | 60 | 61 | 62 | _alt_re = re.compile(r'\([0-9]+\)') 63 | 64 | 65 | def _parse_cmudict(file): 66 | cmudict = {} 67 | for line in file: 68 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 69 | parts = line.split(' ') 70 | word = re.sub(_alt_re, '', parts[0]) 71 | pronunciation = _get_pronunciation(parts[1]) 72 | if pronunciation: 73 | if word in cmudict: 74 | cmudict[word].append(pronunciation) 75 | else: 76 | cmudict[word] = [pronunciation] 77 | return cmudict 78 | 79 | 80 | def _get_pronunciation(s): 81 | parts = s.strip().split(' ') 82 | for part in parts: 83 | if part not in _valid_symbol_set: 84 | return None 85 | return ' '.join(parts) 86 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/waveglow_denoiser.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | # 26 | # ***************************************************************************** 27 | 28 | import torch 29 | from .layers import STFT 30 | 31 | 32 | class Denoiser(torch.nn.Module): 33 | """ Removes model bias from audio produced with waveglow """ 34 | 35 | def __init__(self, waveglow, filter_length=1024, n_overlap=4, 36 | win_length=1024, mode='zeros'): 37 | super(Denoiser, self).__init__() 38 | self.stft = STFT(filter_length=filter_length, 39 | hop_length=int(filter_length/n_overlap), 40 | win_length=win_length) 41 | if mode == 'zeros': 42 | mel_input = torch.zeros( 43 | (1, 80, 88), 44 | dtype=waveglow.upsample.weight.dtype, 45 | device=waveglow.upsample.weight.device) 46 | elif mode == 'normal': 47 | mel_input = torch.randn( 48 | (1, 80, 88), 49 | dtype=waveglow.upsample.weight.dtype, 50 | device=waveglow.upsample.weight.device) 51 | else: 52 | raise Exception("Mode {} if not supported".format(mode)) 53 | 54 | with torch.no_grad(): 55 | bias_audio = waveglow.infer(mel_input, sigma=0.0).float() 56 | bias_spec, _ = self.stft.transform(bias_audio) 57 | 58 | self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None]) 59 | 60 | def forward(self, audio, strength=0.1): 61 | audio_spec, audio_angles = self.stft.transform(audio.cuda().float()) 62 | audio_spec_denoised = audio_spec - self.bias_spec * strength 63 | audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) 64 | audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles) 65 | return audio_denoised 66 | -------------------------------------------------------------------------------- /unitspeech/util.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/huawei-noah/Speech-Backbones/tree/main/Grad-TTS """ 2 | 3 | import torch 4 | 5 | 6 | def sequence_mask(length, max_length=None): 7 | if max_length is None: 8 | max_length = length.max() 9 | x = torch.arange(int(max_length), dtype=length.dtype, device=length.device) 10 | return x.unsqueeze(0) < length.unsqueeze(1) 11 | 12 | 13 | def generate_path(duration, mask): 14 | device = duration.device 15 | 16 | b, t_x, t_y = mask.shape 17 | cum_duration = torch.cumsum(duration, 1) 18 | path = torch.zeros(b, t_x, t_y, dtype=mask.dtype).to(device=device) 19 | 20 | cum_duration_flat = cum_duration.view(b * t_x) 21 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 22 | path = path.view(b, t_x, t_y) 23 | path = path - torch.nn.functional.pad(path, convert_pad_shape([[0, 0], 24 | [1, 0], [0, 0]]))[:, :-1] 25 | path = path * mask 26 | return path 27 | 28 | 29 | def convert_pad_shape(pad_shape): 30 | l = pad_shape[::-1] 31 | pad_shape = [item for sublist in l for item in sublist] 32 | return pad_shape 33 | 34 | 35 | def fix_len_compatibility(length, num_downsamplings_in_unet=3): 36 | while True: 37 | if length % (2**num_downsamplings_in_unet) == 0: 38 | return int(length) 39 | length += 1 40 | 41 | 42 | def intersperse(lst, item): 43 | # Adds blank symbol 44 | result = [item] * (len(lst) * 2 + 1) 45 | result[1::2] = lst 46 | return result 47 | 48 | 49 | def process_unit(encoded, sampling_rate, hop_length): 50 | # A method that aligns units and durations (50Hz) extracted from 16kHz audio with 51 | # mel-spectrograms extracted from 22,050Hz audio. 
52 | 53 | unit = encoded["units"].cpu().tolist() 54 | duration = encoded["durations"].cpu().tolist() 55 | 56 | duration = [int(i) * (sampling_rate // 50) for i in duration] 57 | 58 | expand_unit = [] 59 | 60 | for u, d in zip(unit, duration): 61 | for _ in range(d): 62 | expand_unit.append(u) 63 | 64 | new_length = len(expand_unit) // hop_length * hop_length 65 | 66 | unit = torch.LongTensor(expand_unit)[:new_length].reshape(-1, hop_length).mode(1)[0].tolist() 67 | 68 | squeezed_unit = [unit[0]] 69 | squeezed_duration = [1] 70 | 71 | for u in unit[1:]: 72 | if u == squeezed_unit[-1]: 73 | squeezed_duration[-1] += 1 74 | else: 75 | squeezed_unit.append(u) 76 | squeezed_duration.append(1) 77 | 78 | unit = torch.LongTensor(squeezed_unit) 79 | duration = torch.LongTensor(squeezed_duration) 80 | 81 | return unit, duration 82 | 83 | 84 | class HParams(): 85 | def __init__(self, **kwargs): 86 | for k, v in kwargs.items(): 87 | if type(v) == dict: 88 | v = HParams(**v) 89 | self[k] = v 90 | 91 | def keys(self): 92 | return self.__dict__.keys() 93 | 94 | def items(self): 95 | return self.__dict__.items() 96 | 97 | def values(self): 98 | return self.__dict__.values() 99 | 100 | def __len__(self): 101 | return len(self.__dict__) 102 | 103 | def __getitem__(self, key): 104 | return getattr(self, key) 105 | 106 | def __setitem__(self, key, value): 107 | return setattr(self, key, value) 108 | 109 | def __contains__(self, key): 110 | return key in self.__dict__ 111 | 112 | def __repr__(self): 113 | return self.__dict__.__repr__() -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/numbers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017 Keith Ito 2 | 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | # THE SOFTWARE. 
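Returning to `process_unit` in `unitspeech/util.py` above: the alignment is easiest to follow with concrete numbers. At 22,050 Hz, each 50 Hz unit frame covers 22050 // 50 = 441 samples; the expanded unit stream is truncated to a whole number of hops, majority-voted per hop, and run-length encoded again. A small illustrative example (values chosen for clarity, not taken from real data):

```
# Worked example for process_unit above; inputs are made up for illustration.
import torch
from unitspeech.util import process_unit

encoded = {
    "units": torch.tensor([7, 3, 7]),       # deduplicated 50 Hz units
    "durations": torch.tensor([2, 1, 4]),   # frames per unit
}

# 7 frames * 441 samples = 3087 samples -> 12 hops of 256 samples after truncation.
unit, duration = process_unit(encoded, sampling_rate=22050, hop_length=256)
print(unit, duration)   # expected: tensor([7, 3, 7]) tensor([3, 2, 7]), durations summing to 12
```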
20 | 21 | """ from MIT-licensed https://github.com/keithito/tacotron """ 22 | 23 | import inflect 24 | import re 25 | 26 | 27 | _inflect = inflect.engine() 28 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 29 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 30 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 31 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 32 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 33 | _number_re = re.compile(r'[0-9]+') 34 | 35 | 36 | def _remove_commas(m): 37 | return m.group(1).replace(',', '') 38 | 39 | 40 | def _expand_decimal_point(m): 41 | return m.group(1).replace('.', ' point ') 42 | 43 | 44 | def _expand_dollars(m): 45 | match = m.group(1) 46 | parts = match.split('.') 47 | if len(parts) > 2: 48 | return match + ' dollars' # Unexpected format 49 | dollars = int(parts[0]) if parts[0] else 0 50 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 51 | if dollars and cents: 52 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 53 | cent_unit = 'cent' if cents == 1 else 'cents' 54 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 55 | elif dollars: 56 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 57 | return '%s %s' % (dollars, dollar_unit) 58 | elif cents: 59 | cent_unit = 'cent' if cents == 1 else 'cents' 60 | return '%s %s' % (cents, cent_unit) 61 | else: 62 | return 'zero dollars' 63 | 64 | 65 | def _expand_ordinal(m): 66 | return _inflect.number_to_words(m.group(0)) 67 | 68 | 69 | def _expand_number(m): 70 | num = int(m.group(0)) 71 | if num > 1000 and num < 3000: 72 | if num == 2000: 73 | return 'two thousand' 74 | elif num > 2000 and num < 2010: 75 | return 'two thousand ' + _inflect.number_to_words(num % 100) 76 | elif num % 100 == 0: 77 | return _inflect.number_to_words(num // 100) + ' hundred' 78 | else: 79 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 80 | else: 81 | return _inflect.number_to_words(num, andword='') 82 | 83 | 84 | def normalize_numbers(text): 85 | text = re.sub(_comma_number_re, _remove_commas, text) 86 | text = re.sub(_pounds_re, r'\1 pounds', text) 87 | text = re.sub(_dollars_re, _expand_dollars, text) 88 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 89 | text = re.sub(_ordinal_re, _expand_ordinal, text) 90 | text = re.sub(_number_re, _expand_number, text) 91 | return text 92 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 
71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq -------------------------------------------------------------------------------- /unitspeech/vocoder/alias_free_torch/filter.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import math 8 | 9 | if 'sinc' in dir(torch): 10 | sinc = torch.sinc 11 | else: 12 | # This code is adopted from adefossez's julius.core.sinc under the MIT License 13 | # https://adefossez.github.io/julius/julius/core.html 14 | # LICENSE is in incl_licenses directory. 15 | def sinc(x: torch.Tensor): 16 | """ 17 | Implementation of sinc, i.e. sin(pi * x) / (pi * x) 18 | __Warning__: Different to julius.sinc, the input is multiplied by `pi`! 19 | """ 20 | return torch.where(x == 0, 21 | torch.tensor(1., device=x.device, dtype=x.dtype), 22 | torch.sin(math.pi * x) / math.pi / x) 23 | 24 | 25 | # This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License 26 | # https://adefossez.github.io/julius/julius/lowpass.html 27 | # LICENSE is in incl_licenses directory. 28 | def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size] 29 | even = (kernel_size % 2 == 0) 30 | half_size = kernel_size // 2 31 | 32 | #For kaiser window 33 | delta_f = 4 * half_width 34 | A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95 35 | if A > 50.: 36 | beta = 0.1102 * (A - 8.7) 37 | elif A >= 21.: 38 | beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.) 39 | else: 40 | beta = 0. 41 | window = torch.kaiser_window(kernel_size, beta=beta, periodic=False) 42 | 43 | # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio 44 | if even: 45 | time = (torch.arange(-half_size, half_size) + 0.5) 46 | else: 47 | time = torch.arange(kernel_size) - half_size 48 | if cutoff == 0: 49 | filter_ = torch.zeros_like(time) 50 | else: 51 | filter_ = 2 * cutoff * window * sinc(2 * cutoff * time) 52 | # Normalize filter to have sum = 1, otherwise we will have a small leakage 53 | # of the constant component in the input signal. 54 | filter_ /= filter_.sum() 55 | filter = filter_.view(1, 1, kernel_size) 56 | 57 | return filter 58 | 59 | 60 | class LowPassFilter1d(nn.Module): 61 | def __init__(self, 62 | cutoff=0.5, 63 | half_width=0.6, 64 | stride: int = 1, 65 | padding: bool = True, 66 | padding_mode: str = 'replicate', 67 | kernel_size: int = 12): 68 | # kernel_size should be even number for stylegan3 setup, 69 | # in this implementation, odd number is also possible. 
70 | super().__init__() 71 | if cutoff < -0.: 72 | raise ValueError("Minimum cutoff must be larger than zero.") 73 | if cutoff > 0.5: 74 | raise ValueError("A cutoff above 0.5 does not make sense.") 75 | self.kernel_size = kernel_size 76 | self.even = (kernel_size % 2 == 0) 77 | self.pad_left = kernel_size // 2 - int(self.even) 78 | self.pad_right = kernel_size // 2 79 | self.stride = stride 80 | self.padding = padding 81 | self.padding_mode = padding_mode 82 | filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size) 83 | self.register_buffer("filter", filter) 84 | 85 | #input [B, C, T] 86 | def forward(self, x): 87 | _, C, _ = x.shape 88 | 89 | if self.padding: 90 | x = F.pad(x, (self.pad_left, self.pad_right), 91 | mode=self.padding_mode) 92 | out = F.conv1d(x, self.filter.expand(C, -1, -1), 93 | stride=self.stride, groups=C) 94 | 95 | return out -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/cleaners.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017 Keith Ito 2 | 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | # THE SOFTWARE. 20 | 21 | """ from MIT-licensed https://github.com/keithito/tacotron """ 22 | ''' 23 | Cleaners are transformations that run over the input text at both training and eval time. 24 | 25 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 26 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 27 | 1. "english_cleaners" for English text 28 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 29 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 30 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 31 | the symbols in symbols.py to match your data). 32 | ''' 33 | 34 | import re 35 | from unidecode import unidecode 36 | from .numbers import normalize_numbers 37 | 38 | 39 | # Regular expression matching whitespace: 40 | _whitespace_re = re.compile(r'\s+') 41 | 42 | # List of (regular expression, replacement) pairs for abbreviations: 43 | _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 44 | ('mrs', 'misess'), 45 | ('mr', 'mister'), 46 | ('dr', 'doctor'), 47 | ('st', 'saint'), 48 | ('co', 'company'), 49 | ('jr', 'junior'), 50 | ('maj', 'major'), 51 | ('gen', 'general'), 52 | ('drs', 'doctors'), 53 | ('rev', 'reverend'), 54 | ('lt', 'lieutenant'), 55 | ('hon', 'honorable'), 56 | ('sgt', 'sergeant'), 57 | ('capt', 'captain'), 58 | ('esq', 'esquire'), 59 | ('ltd', 'limited'), 60 | ('col', 'colonel'), 61 | ('ft', 'fort'), 62 | ]] 63 | 64 | 65 | def expand_abbreviations(text): 66 | for regex, replacement in _abbreviations: 67 | text = re.sub(regex, replacement, text) 68 | return text 69 | 70 | 71 | def expand_numbers(text): 72 | return normalize_numbers(text) 73 | 74 | 75 | def lowercase(text): 76 | return text.lower() 77 | 78 | 79 | def collapse_whitespace(text): 80 | return re.sub(_whitespace_re, ' ', text) 81 | 82 | 83 | def convert_to_ascii(text): 84 | return unidecode(text) 85 | 86 | 87 | def basic_cleaners(text): 88 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 89 | text = lowercase(text) 90 | text = collapse_whitespace(text) 91 | return text 92 | 93 | 94 | def transliteration_cleaners(text): 95 | '''Pipeline for non-English text that transliterates to ASCII.''' 96 | text = convert_to_ascii(text) 97 | text = lowercase(text) 98 | text = collapse_whitespace(text) 99 | return text 100 | 101 | 102 | def english_cleaners(text): 103 | '''Pipeline for English text, including number and abbreviation expansion.''' 104 | text = convert_to_ascii(text) 105 | text = lowercase(text) 106 | text = expand_numbers(text) 107 | text = expand_abbreviations(text) 108 | text = collapse_whitespace(text) 109 | return text 110 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/tools/distributed_transcribe/README.md: -------------------------------------------------------------------------------- 1 | # Distributed pseudo-units transcription 2 | 3 | If you ever tried to transcribe large-scale audio datasets (e.g. the [LibriLight](https://github.com/facebookresearch/libri-light) dataset with 60k hours) into discrete pseudo-units such as those used by the [Generative Spoken Language Modeling (GSLM)](https://github.com/pytorch/fairseq/tree/main/examples/textless_nlp/gslm) system, you might have noticed that this task is computationally intensive and might be impractical to do in a non-distributed fashion. 4 | 5 | This tool provides a convenient script that can leverage multiple GPUs (on multiple nodes!) to speed up and parallelize pseudo-unit transcription. 6 | We provide recipes for two scenarios: (a) single-node, multiprocess/multi-GPU transcription that leverages the `torch.distributed.run` mechanism of PyTorch, and (b) multi-node, multi-GPU transcription that can be run on a SLURM-managed cluster. 7 | 8 | ## Example scripts 9 | 10 | * `local.sh` provides an example command to transcribe a dataset in local parallel mode (a sketch of such a launch is shown below); 11 | * `slurm.sbatch` is an example of a SLURM sbatch script for distributed pseudo-unit transcription. 
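For concreteness, here is a sketch of what a single-node, multi-GPU launch can look like with `torch.distributed.run`; the bundled `local.sh` may differ in its exact flags, and the manifest/model names are placeholders (the individual `transcribe.py` flags are documented in the next section):

```
python -m torch.distributed.run --nproc_per_node=8 transcribe.py \
    --manifest manifest.tsv \
    --output transcript \
    --dense_model hubert-base-ls960 \
    --kmeans_model hubert-base-ls960-kmeans-100 \
    --deduplicate \
    --durations \
    --distributed_port 58554
```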
12 | 13 | Finally, `transcribe.py` can be run directly as a single process (see single.sh): 14 | ``` 15 | DENSE_NAME=hubert-base-ls960 16 | KMEANS_NAME=hubert-base-ls960-kmeans-100 17 | MANIFEST=manifest.tsv 18 | TRANSCRIPT=transcript 19 | 20 | python transcribe.py \ 21 | --manifest $MANIFEST \ 22 | --output=$TRANSCRIPT \ 23 | --dense_model=$DENSE_NAME \ 24 | --kmeans_model=$KMEANS_NAME 25 | ``` 26 | 27 | ## Command line arguments 28 | 29 | The transcription script, `transcribe.py`, has a few command-line arguments: 30 | * `--dense_model`: sets the dense Hubert model to be used (by its name, e.g. `hubert-base-ls960`); 31 | * `--kmeans_model`: sets the k-means quantizer to be used, e.g. `hubert-base-ls960-kmeans-100`; 32 | * `--manifest`: specifies the manifest file describing the dataset; 33 | * `--output`: path to the output transcript file. The unit stream is stored in a `.units` file, durations (if requested) in `.durations`, and F0 values (again, if requested) in `.f0s`; 34 | * `--deduplicate`: if set, consecutive repeats of the same pseudo-unit are collapsed (as is done in GSLM); 35 | * `--durations`: if set, the duration of each token is reported in a `.durations` file (note that if `--deduplicate` is not set, all durations will be equal to 1); 36 | * `--f0s`: if set, the mean F0 value corresponding to each token is reported in a `.f0s` file (note: F0 extraction is slow). F0 values are rounded to the closest integer and are measured in Hz; 37 | * `--preserve_name`: if set, the transcript contains names of the original audio files; 38 | * `--separator`: a separator between pseudo-unit tokens in the outputs; 39 | * `--distributed_port`: a unique port, required for distributed transcription (defaults to 58554). 40 | 41 | 42 | ## Input format 43 | `transcribe.py` takes a manifest file describing an input dataset. A manifest is a tab-separated file with a simple format: (a) the first line is the root of the dataset folder, and (b) each subsequent line specifies a relative path to an audio file and its size in frames. Here is an example of a manifest corresponding to LibriSpeech dev-clean: 44 | ``` 45 | /datasets/librispeech/dev-clean 46 | 1272/128104/1272-128104-0000.flac 93680 47 | 1272/128104/1272-128104-0001.flac 77040 48 | 1272/128104/1272-128104-0002.flac 199760 49 | 1272/128104/1272-128104-0003.flac 158400 50 | 1272/128104/1272-128104-0004.flac 470400 51 | 1272/128104/1272-128104-0005.flac 144160 52 | ``` 53 | (`transcribe.py` ignores the duration field.) 54 | 55 | **NB**: fairseq has [a utility](https://github.com/pytorch/fairseq/blob/main/examples/wav2vec/wav2vec_manifest.py) for creating manifest files. 56 | 57 | ## Output format 58 | 59 | `transcribe.py` outputs one line per file, with pseudo-units separated by spaces (by default). Hence the output would look something like: 60 | ``` 61 | 71 12 56 57 40 63 40 63 93 50 76 53 62 ... 55 20 62 | ... 63 | 71 12 56 57 56 57 40 57 86 58 9 1 27 31 23 69 44 26 ... 64 | ``` 65 | 66 | This format is directly compatible with fairseq-preprocessing. However, if there is a need to link a particular line to its original file, please use the `--preserve_name` flag. 67 | -------------------------------------------------------------------------------- /unitspeech/vocoder/README.md: -------------------------------------------------------------------------------- 1 | ## BigVGAN: A Universal Neural Vocoder with Large-Scale Training 2 | #### Sang-gil Lee, Wei Ping, Boris Ginsburg, Bryan Catanzaro, Sungroh Yoon 3 | 4 | 
5 | 6 | 7 | ### [Paper](https://arxiv.org/abs/2206.04658) 8 | ### [Audio demo](https://bigvgan-demo.github.io/) 9 | 10 | ## Installation 11 | Clone the repository and install dependencies. 12 | ```shell 13 | # the codebase has been tested on Python 3.8 / 3.10 with PyTorch 1.12.1 / 1.13 conda binaries 14 | git clone https://github.com/NVIDIA/BigVGAN 15 | pip install -r requirements.txt 16 | ``` 17 | 18 | Create symbolic link to the root of the dataset. The codebase uses filelist with the relative path from the dataset. Below are the example commands for LibriTTS dataset. 19 | ``` shell 20 | cd LibriTTS && \ 21 | ln -s /path/to/your/LibriTTS/train-clean-100 train-clean-100 && \ 22 | ln -s /path/to/your/LibriTTS/train-clean-360 train-clean-360 && \ 23 | ln -s /path/to/your/LibriTTS/train-other-500 train-other-500 && \ 24 | ln -s /path/to/your/LibriTTS/dev-clean dev-clean && \ 25 | ln -s /path/to/your/LibriTTS/dev-other dev-other && \ 26 | ln -s /path/to/your/LibriTTS/test-clean test-clean && \ 27 | ln -s /path/to/your/LibriTTS/test-other test-other && \ 28 | cd .. 29 | ``` 30 | 31 | ## Training 32 | Train BigVGAN model. Below is an example command for training BigVGAN using LibriTTS dataset at 24kHz with a full 100-band mel spectrogram as input. 33 | ```shell 34 | python train.py \ 35 | --config configs/bigvgan_24khz_100band.json \ 36 | --input_wavs_dir LibriTTS \ 37 | --input_training_file LibriTTS/train-full.txt \ 38 | --input_validation_file LibriTTS/val-full.txt \ 39 | --list_input_unseen_wavs_dir LibriTTS LibriTTS \ 40 | --list_input_unseen_validation_file LibriTTS/dev-clean.txt LibriTTS/dev-other.txt \ 41 | --checkpoint_path exp/bigvgan 42 | ``` 43 | 44 | ## Synthesis 45 | Synthesize from BigVGAN model. Below is an example command for generating audio from the model. 46 | It computes mel spectrograms using wav files from `--input_wavs_dir` and saves the generated audio to `--output_dir`. 47 | ```shell 48 | python inference.py \ 49 | --checkpoint_file exp/bigvgan/g_05000000 \ 50 | --input_wavs_dir /path/to/your/input_wav \ 51 | --output_dir /path/to/your/output_wav 52 | ``` 53 | 54 | `inference_e2e.py` supports synthesis directly from the mel spectrogram saved in `.npy` format, with shapes `[1, channel, frame]` or `[channel, frame]`. 55 | It loads mel spectrograms from `--input_mels_dir` and saves the generated audio to `--output_dir`. 56 | 57 | Make sure that the STFT hyperparameters for mel spectrogram are the same as the model, which are defined in `config.json` of the corresponding model. 58 | ```shell 59 | python inference_e2e.py \ 60 | --checkpoint_file exp/bigvgan/g_05000000 \ 61 | --input_mels_dir /path/to/your/input_mel \ 62 | --output_dir /path/to/your/output_wav 63 | ``` 64 | 65 | ## Pretrained Models 66 | We provide the [pretrained models](https://drive.google.com/drive/folders/1e9wdM29d-t3EHUpBb8T4dcHrkYGAXTgq). 67 | One can download the checkpoints of generator (e.g., g_05000000) and discriminator (e.g., do_05000000) within the listed folders. 68 | 69 | |Folder Name|Sampling Rate|Mel band|fmax|Params.|Dataset|Fine-Tuned| 70 | |------|---|---|---|---|------|---| 71 | |bigvgan_24khz_100band|24 kHz|100|12000|112M|LibriTTS|No| 72 | |bigvgan_base_24khz_100band|24 kHz|100|12000|14M|LibriTTS|No| 73 | |bigvgan_22khz_80band|22 kHz|80|8000|112M|LibriTTS + VCTK + LJSpeech|No| 74 | |bigvgan_base_22khz_80band|22 kHz|80|8000|14M|LibriTTS + VCTK + LJSpeech|No| 75 | 76 | The paper results are based on 24kHz BigVGAN models trained on LibriTTS dataset. 
77 | We also provide 22kHz BigVGAN models with band-limited setup (i.e., fmax=8000) for TTS applications. 78 | Note that, the latest checkpoints use ``snakebeta`` activation with log scale parameterization, which have the best overall quality. 79 | 80 | 81 | ## TODO 82 | 83 | Current codebase only provides a plain PyTorch implementation for the filtered nonlinearity. We are working on a fast CUDA kernel implementation, which will be released in the future. 84 | 85 | 86 | ## References 87 | * [HiFi-GAN](https://github.com/jik876/hifi-gan) (for generator and multi-period discriminator) 88 | 89 | * [Snake](https://github.com/EdwardDixon/snake) (for periodic activation) 90 | 91 | * [Alias-free-torch](https://github.com/junjun3518/alias-free-torch) (for anti-aliasing) 92 | 93 | * [Julius](https://github.com/adefossez/julius) (for low-pass filter) 94 | 95 | * [UnivNet](https://github.com/mindslab-ai/univnet) (for multi-resolution discriminator) 96 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/data/f0_preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | import amfm_decompy.basic_tools as basic 8 | import amfm_decompy.pYAAPT as pYAAPT 9 | from librosa.util import normalize 10 | import numpy as np 11 | from scipy.interpolate import interp1d 12 | 13 | F0_FRAME_SPACE = 0.005 # sec 14 | 15 | 16 | def get_f0(audio, rate=16_000): 17 | assert audio.ndim == 1 18 | frame_length = 20.0 # ms 19 | to_pad = int(frame_length / 1000 * rate) // 2 20 | 21 | audio = normalize(audio) * 0.95 22 | audio = np.pad(audio, (to_pad, to_pad), "constant", constant_values=0) 23 | audio = basic.SignalObj(audio, rate) 24 | pitch = pYAAPT.yaapt( 25 | audio, 26 | frame_length=frame_length, 27 | frame_space=F0_FRAME_SPACE * 1000, 28 | nccf_thresh1=0.25, 29 | tda_frame_length=25.0, 30 | ) 31 | f0 = pitch.samp_values 32 | return f0 33 | 34 | 35 | def align_f0_to_durations(f0, durations, f0_code_ratio, tol=1): 36 | code_len = durations.sum() 37 | targ_len = int(f0_code_ratio * code_len) 38 | diff = f0.size(0) - targ_len 39 | assert abs(diff) <= tol, ( 40 | f"Cannot subsample F0: |{f0.size(0)} - {f0_code_ratio}*{code_len}|" 41 | f" > {tol} (dur=\n{durations})" 42 | ) 43 | if diff > 0: 44 | f0 = f0[:targ_len] 45 | elif diff < 0: 46 | f0 = torch.cat((f0, f0.new_full((-diff,), f0[-1])), 0) 47 | 48 | f0_offset = 0.0 49 | seg_f0s = [] 50 | for dur in durations: 51 | f0_dur = dur.item() * f0_code_ratio 52 | seg_f0 = f0[int(f0_offset) : int(f0_offset + f0_dur)] 53 | seg_f0 = seg_f0[seg_f0 != 0] 54 | if len(seg_f0) == 0: 55 | seg_f0 = torch.tensor(0).type(seg_f0.type()) 56 | else: 57 | seg_f0 = seg_f0.mean() 58 | seg_f0s.append(seg_f0) 59 | f0_offset += f0_dur 60 | 61 | assert int(f0_offset) == f0.size(0), f"{f0_offset} {f0.size()} {durations.sum()}" 62 | return torch.tensor(seg_f0s) 63 | 64 | 65 | class SpeakerMeanNormalize: 66 | def __init__(self, path_to_stats, center=True, scale=False, log=True): 67 | self.stats = torch.load(path_to_stats) 68 | self.center = center 69 | self.scale = scale 70 | self.log = log 71 | 72 | def __call__(self, f0, speaker): 73 | f0 = f0.clone() 74 | mask = f0 != 0.0 75 | if self.log: 76 | f0[mask] = f0[mask].log() 77 | 78 | mean = ( 79 | self.stats[speaker]["logf0_mean"] 80 | if self.log 
81 | else self.stats[speaker]["f0_mean"] 82 | ) 83 | std = ( 84 | self.stats[speaker]["logf0_std"] 85 | if self.log 86 | else self.stats[speaker]["f0_std"] 87 | ) 88 | 89 | if self.center: 90 | f0[mask] -= mean 91 | if self.scale: 92 | f0[mask] /= std 93 | 94 | return f0 95 | 96 | 97 | class PromptNormalize: 98 | def __init__(self, center=True, scale=False, log=True): 99 | self.center = center 100 | self.scale = scale 101 | self.log = log 102 | 103 | def __call__(self, f0, _speaker=None): 104 | f0 = f0.clone() 105 | mask = f0 != 0.0 106 | if self.log: 107 | f0[mask] = f0[mask].log() 108 | 109 | if self.center: 110 | f0[mask] -= f0[mask].mean() 111 | if self.scale: 112 | f0[mask] /= f0[mask].std() 113 | 114 | return f0 115 | 116 | 117 | class F0BinQuantizer: 118 | def __init__(self, bins_path): 119 | self.bins = torch.load(bins_path) 120 | 121 | def __call__(self, f0: torch.Tensor): 122 | bin_idx = (f0.view(-1, 1) > self.bins.view(1, -1)).long().sum(dim=1) 123 | return bin_idx 124 | 125 | 126 | def trailing_silence_mask(f0): 127 | """ 128 | >>> f0 = torch.tensor([1.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0]) 129 | >>> trailing_silence_mask(f0) 130 | tensor([False, False, False, False, True, True, True]) 131 | """ 132 | assert f0.ndim == 1 133 | mask = ((f0.flip(0) != 0.0).cumsum(0) == 0).flip(0) 134 | return mask 135 | 136 | 137 | def interpolate_f0(f0): 138 | orig_t = np.arange(f0.shape[0]) 139 | f0_interp = f0[:] 140 | ii = f0_interp != 0 141 | if ii.sum() > 1: 142 | f0_interp = interp1d( 143 | orig_t[ii], f0_interp[ii], bounds_error=False, kind="linear", fill_value=0 144 | )(orig_t) 145 | # f0_interp = torch.Tensor(f0_interp).type_as(f0).to(f0.device) 146 | return f0_interp 147 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/text.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017 Keith Ito 2 | 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | # THE SOFTWARE. 20 | 21 | """ from MIT-licensed https://github.com/keithito/tacotron """ 22 | import numpy as np 23 | import re 24 | from . 
import cleaners 25 | from .symbols import symbols 26 | 27 | 28 | # Mappings from symbol to numeric ID and vice versa: 29 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 30 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 31 | 32 | # Regular expression matching text enclosed in curly braces: 33 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 34 | 35 | # Special symbols 36 | SOS_TOK = '' 37 | EOS_TOK = '' 38 | 39 | def text_to_sequence(text, cleaner_names): 40 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 41 | 42 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 43 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 44 | 45 | Args: 46 | text: string to convert to a sequence 47 | cleaner_names: names of the cleaner functions to run the text through 48 | 49 | Returns: 50 | List of integers corresponding to the symbols in the text 51 | ''' 52 | sequence = [] 53 | 54 | # Check for curly braces and treat their contents as ARPAbet: 55 | while len(text): 56 | m = _curly_re.match(text) 57 | if not m: 58 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 59 | break 60 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 61 | sequence += _arpabet_to_sequence(m.group(2)) 62 | text = m.group(3) 63 | 64 | return sequence 65 | 66 | 67 | def sample_code_chunk(code, size): 68 | assert(size > 0 and size <= len(code)) 69 | start = np.random.randint(len(code) - size + 1) 70 | end = start + size 71 | return code[start:end], start, end 72 | 73 | 74 | def code_to_sequence(code, code_dict, collapse_code): 75 | if collapse_code: 76 | prev_c = None 77 | sequence = [] 78 | for c in code: 79 | if c in code_dict and c != prev_c: 80 | sequence.append(code_dict[c]) 81 | prev_c = c 82 | else: 83 | sequence = [code_dict[c] for c in code if c in code_dict] 84 | if len(sequence) < 0.95 * len(code): 85 | print('WARNING : over 5%% codes are OOV') 86 | 87 | return sequence 88 | 89 | 90 | def sequence_to_text(sequence): 91 | '''Converts a sequence of IDs back to a string''' 92 | result = '' 93 | for symbol_id in sequence: 94 | if symbol_id in _id_to_symbol: 95 | s = _id_to_symbol[symbol_id] 96 | # Enclose ARPAbet back in curly braces: 97 | if len(s) > 1 and s[0] == '@': 98 | s = '{%s}' % s[1:] 99 | result += s 100 | return result.replace('}{', ' ') 101 | 102 | 103 | def sequence_to_code(sequence, code_dict): 104 | '''Analogous to sequence_to_text''' 105 | id_to_code = {i: c for c, i in code_dict.items()} 106 | return ' '.join([id_to_code[i] for i in sequence]) 107 | 108 | 109 | def _clean_text(text, cleaner_names): 110 | for name in cleaner_names: 111 | cleaner = getattr(cleaners, name) 112 | if not cleaner: 113 | raise Exception('Unknown cleaner: %s' % name) 114 | text = cleaner(text) 115 | return text 116 | 117 | 118 | def _symbols_to_sequence(symbols): 119 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 120 | 121 | 122 | def _arpabet_to_sequence(text): 123 | return _symbols_to_sequence(['@' + s for s in text.split()]) 124 | 125 | 126 | def _should_keep_symbol(s): 127 | return s in _symbol_to_id and s != '_' and s != '~' 128 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/audio_processing.py: -------------------------------------------------------------------------------- 1 | # BSD 3-Clause License 2 | 3 | # Copyright (c) 2018, NVIDIA Corporation 4 | # All rights 
reserved. 5 | 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | 9 | # * Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | 12 | # * Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | 16 | # * Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 19 | 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | """https://github.com/NVIDIA/tacotron2""" 32 | 33 | import torch 34 | import numpy as np 35 | from scipy.signal import get_window 36 | import librosa.util as librosa_util 37 | 38 | 39 | def window_sumsquare(window, n_frames, hop_length=200, win_length=800, 40 | n_fft=800, dtype=np.float32, norm=None): 41 | """ 42 | # from librosa 0.6 43 | Compute the sum-square envelope of a window function at a given hop length. 44 | 45 | This is used to estimate modulation effects induced by windowing 46 | observations in short-time fourier transforms. 47 | 48 | Parameters 49 | ---------- 50 | window : string, tuple, number, callable, or list-like 51 | Window specification, as in `get_window` 52 | 53 | n_frames : int > 0 54 | The number of analysis frames 55 | 56 | hop_length : int > 0 57 | The number of samples to advance between frames 58 | 59 | win_length : [optional] 60 | The length of the window function. By default, this matches `n_fft`. 61 | 62 | n_fft : int > 0 63 | The length of each analysis frame. 
64 | 65 | dtype : np.dtype 66 | The data type of the output 67 | 68 | Returns 69 | ------- 70 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 71 | The sum-squared envelope of the window function 72 | """ 73 | if win_length is None: 74 | win_length = n_fft 75 | 76 | n = n_fft + hop_length * (n_frames - 1) 77 | x = np.zeros(n, dtype=dtype) 78 | 79 | # Compute the squared window at the desired length 80 | win_sq = get_window(window, win_length, fftbins=True) 81 | win_sq = librosa_util.normalize(win_sq, norm=norm)**2 82 | win_sq = librosa_util.pad_center(win_sq, n_fft) 83 | 84 | # Fill the envelope 85 | for i in range(n_frames): 86 | sample = i * hop_length 87 | x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] 88 | return x 89 | 90 | 91 | def griffin_lim(magnitudes, stft_fn, n_iters=30): 92 | """ 93 | PARAMS 94 | ------ 95 | magnitudes: spectrogram magnitudes 96 | stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods 97 | """ 98 | 99 | angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) 100 | angles = angles.astype(np.float32) 101 | angles = torch.autograd.Variable(torch.from_numpy(angles)) 102 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 103 | 104 | for i in range(n_iters): 105 | _, angles = stft_fn.transform(signal) 106 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 107 | return signal 108 | 109 | 110 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 111 | """ 112 | PARAMS 113 | ------ 114 | C: compression factor 115 | """ 116 | return torch.log(torch.clamp(x, min=clip_val) * C) 117 | 118 | 119 | def dynamic_range_decompression(x, C=1): 120 | """ 121 | PARAMS 122 | ------ 123 | C: compression factor used to compress 124 | """ 125 | return torch.exp(x) / C 126 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/examples/speaker_probing/probes.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
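# ---------------------------------------------------------------------------
# (Editor's sketch; not part of the original files. It refers to the
# dynamic-range helpers in tacotron2/audio_processing.py shown just above.)
# dynamic_range_compression/decompression are exact inverses for inputs above
# clip_val, since exp(log(clamp(x, min=clip_val) * C)) / C == clamp(x, min=clip_val).
# A quick check, assuming the import path mirrors the repository layout:
import torch
from unitspeech.textlesslib.textless.vocoders.tacotron2.audio_processing import (
    dynamic_range_compression,
    dynamic_range_decompression,
)

mag = torch.rand(80, 100) + 1e-3  # fake magnitudes, all above clip_val=1e-5
roundtrip = dynamic_range_decompression(dynamic_range_compression(mag))
assert torch.allclose(roundtrip, mag, atol=1e-5)
# ---------------------------------------------------------------------------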
5 | 6 | 7 | import math 8 | import torch 9 | import torch.nn as nn 10 | from torch.nn import TransformerEncoder, TransformerEncoderLayer 11 | import torch.nn.functional as F 12 | 13 | 14 | class ConstantBaseline(torch.nn.Module): 15 | def __init__(self, total_speakers): 16 | super().__init__() 17 | self.logits = torch.nn.parameter.Parameter(torch.zeros(total_speakers).float()) 18 | 19 | def forward(self, batch): 20 | bsz = batch["units"].size(0) 21 | return ( 22 | F.log_softmax(self.logits, dim=-1) 23 | .unsqueeze(0) 24 | .expand(bsz, self.logits.size(0)) 25 | ) 26 | 27 | 28 | class DiscreteClassifier(torch.nn.Module): 29 | def __init__( 30 | self, 31 | vocab_size, 32 | embedding_size, 33 | n_heads, 34 | hidden_size, 35 | n_layers, 36 | dropout, 37 | pad_value, 38 | total_speakers, 39 | ): 40 | super().__init__() 41 | self.pad_value = pad_value.item() if torch.is_tensor(pad_value) else pad_value 42 | 43 | self.embedding = torch.nn.Embedding(vocab_size, embedding_size) 44 | self.embedding_size = embedding_size 45 | torch.nn.init.normal_( 46 | self.embedding.weight, mean=0, std=self.embedding_size ** -0.5 47 | ) 48 | 49 | self.encoder_classifier = Classifier( 50 | embedding_size, n_heads, hidden_size, n_layers, dropout, total_speakers 51 | ) 52 | 53 | def forward(self, batch): 54 | src = batch["units"] 55 | padding_mask = src == self.pad_value 56 | 57 | src = src.transpose(1, 0) 58 | x = self.embedding(src) * math.sqrt(self.embedding_size) 59 | return self.encoder_classifier(x, padding_mask) 60 | 61 | 62 | class ContinuousClassifier(torch.nn.Module): 63 | def __init__( 64 | self, 65 | input_size, 66 | embedding_size, 67 | n_heads, 68 | hidden_size, 69 | n_layers, 70 | dropout, 71 | pad_value, 72 | total_speakers, 73 | ): 74 | super().__init__() 75 | 76 | self.pad_value = pad_value.item() if torch.is_tensor(pad_value) else pad_value 77 | self.embedding = torch.nn.Linear(input_size, embedding_size) 78 | 79 | self.encoder_classifier = Classifier( 80 | embedding_size, n_heads, hidden_size, n_layers, dropout, total_speakers 81 | ) 82 | 83 | def forward(self, batch): 84 | src = batch["dense"] 85 | padding_mask = batch["units"] == self.pad_value 86 | 87 | src = src.transpose(1, 0) 88 | x = self.embedding(src) # * math.sqrt(self.embedding_size) 89 | return self.encoder_classifier(x, padding_mask) 90 | 91 | 92 | class Classifier(torch.nn.Module): 93 | def __init__( 94 | self, embedding_size, n_heads, hidden_size, n_layers, dropout, total_speakers 95 | ): 96 | super().__init__() 97 | self.pos_encoder = PositionalEncoding(embedding_size, dropout=0.0) 98 | encoder_layers = TransformerEncoderLayer( 99 | embedding_size, n_heads, hidden_size, dropout=dropout 100 | ) 101 | self.transformer_encoder = TransformerEncoder(encoder_layers, n_layers) 102 | 103 | self.embedding_size = embedding_size 104 | self.classifier = torch.nn.Linear(embedding_size, total_speakers) 105 | 106 | def forward(self, x, padding_mask): 107 | x = self.pos_encoder(x) 108 | x = self.transformer_encoder(x, src_key_padding_mask=padding_mask) 109 | 110 | bos_embedding = x[0, :] 111 | logits = self.classifier(bos_embedding) 112 | return F.log_softmax(logits, dim=-1) 113 | 114 | 115 | class PositionalEncoding(nn.Module): 116 | def __init__(self, d_model, dropout=0.1, max_len=5000): 117 | super(PositionalEncoding, self).__init__() 118 | self.dropout = nn.Dropout(p=dropout) 119 | 120 | pe = torch.zeros(max_len, d_model) 121 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 122 | div_term = torch.exp( 123 | torch.arange(0, 
d_model, 2).float() * (-math.log(10000.0) / d_model) 124 | ) 125 | pe[:, 0::2] = torch.sin(position * div_term) 126 | pe[:, 1::2] = torch.cos(position * div_term) 127 | pe = pe.unsqueeze(0).transpose(0, 1) 128 | self.register_buffer("pe", pe) 129 | 130 | def forward(self, x): 131 | assert x.size(0) < self.pe.size(0), f"{x.size()=} {self.pe.size()=}" 132 | 133 | x = x + self.pe[: x.size(0), :] 134 | return self.dropout(x) 135 | -------------------------------------------------------------------------------- /unitspeech/vocoder/activations.py: -------------------------------------------------------------------------------- 1 | # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch 5 | from torch import nn, sin, pow 6 | from torch.nn import Parameter 7 | 8 | 9 | class Snake(nn.Module): 10 | ''' 11 | Implementation of a sine-based periodic activation function 12 | Shape: 13 | - Input: (B, C, T) 14 | - Output: (B, C, T), same shape as the input 15 | Parameters: 16 | - alpha - trainable parameter 17 | References: 18 | - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: 19 | https://arxiv.org/abs/2006.08195 20 | Examples: 21 | >>> a1 = snake(256) 22 | >>> x = torch.randn(256) 23 | >>> x = a1(x) 24 | ''' 25 | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): 26 | ''' 27 | Initialization. 28 | INPUT: 29 | - in_features: shape of the input 30 | - alpha: trainable parameter 31 | alpha is initialized to 1 by default, higher values = higher-frequency. 32 | alpha will be trained along with the rest of your model. 33 | ''' 34 | super(Snake, self).__init__() 35 | self.in_features = in_features 36 | 37 | # initialize alpha 38 | self.alpha_logscale = alpha_logscale 39 | if self.alpha_logscale: # log scale alphas initialized to zeros 40 | self.alpha = Parameter(torch.zeros(in_features) * alpha) 41 | else: # linear scale alphas initialized to ones 42 | self.alpha = Parameter(torch.ones(in_features) * alpha) 43 | 44 | self.alpha.requires_grad = alpha_trainable 45 | 46 | self.no_div_by_zero = 0.000000001 47 | 48 | def forward(self, x): 49 | ''' 50 | Forward pass of the function. 51 | Applies the function to the input elementwise. 52 | Snake ∶= x + 1/a * sin^2 (xa) 53 | ''' 54 | alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] 55 | if self.alpha_logscale: 56 | alpha = torch.exp(alpha) 57 | x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) 58 | 59 | return x 60 | 61 | 62 | class SnakeBeta(nn.Module): 63 | ''' 64 | A modified Snake function which uses separate parameters for the magnitude of the periodic components 65 | Shape: 66 | - Input: (B, C, T) 67 | - Output: (B, C, T), same shape as the input 68 | Parameters: 69 | - alpha - trainable parameter that controls frequency 70 | - beta - trainable parameter that controls magnitude 71 | References: 72 | - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: 73 | https://arxiv.org/abs/2006.08195 74 | Examples: 75 | >>> a1 = snakebeta(256) 76 | >>> x = torch.randn(256) 77 | >>> x = a1(x) 78 | ''' 79 | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): 80 | ''' 81 | Initialization. 
82 | INPUT: 83 | - in_features: shape of the input 84 | - alpha - trainable parameter that controls frequency 85 | - beta - trainable parameter that controls magnitude 86 | alpha is initialized to 1 by default, higher values = higher-frequency. 87 | beta is initialized to 1 by default, higher values = higher-magnitude. 88 | alpha will be trained along with the rest of your model. 89 | ''' 90 | super(SnakeBeta, self).__init__() 91 | self.in_features = in_features 92 | 93 | # initialize alpha 94 | self.alpha_logscale = alpha_logscale 95 | if self.alpha_logscale: # log scale alphas initialized to zeros 96 | self.alpha = Parameter(torch.zeros(in_features) * alpha) 97 | self.beta = Parameter(torch.zeros(in_features) * alpha) 98 | else: # linear scale alphas initialized to ones 99 | self.alpha = Parameter(torch.ones(in_features) * alpha) 100 | self.beta = Parameter(torch.ones(in_features) * alpha) 101 | 102 | self.alpha.requires_grad = alpha_trainable 103 | self.beta.requires_grad = alpha_trainable 104 | 105 | self.no_div_by_zero = 0.000000001 106 | 107 | def forward(self, x): 108 | ''' 109 | Forward pass of the function. 110 | Applies the function to the input elementwise. 111 | SnakeBeta ∶= x + 1/b * sin^2 (xa) 112 | ''' 113 | alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] 114 | beta = self.beta.unsqueeze(0).unsqueeze(-1) 115 | if self.alpha_logscale: 116 | alpha = torch.exp(alpha) 117 | beta = torch.exp(beta) 118 | x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2) 119 | 120 | return x -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/vocoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
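# ---------------------------------------------------------------------------
# (Editor's sketch; not part of the original files. It refers to the Snake and
# SnakeBeta activations in unitspeech/vocoder/activations.py shown above.)
# Both modules act elementwise on (B, C, T) tensors with one alpha (and, for
# SnakeBeta, one beta) parameter per channel; with alpha_logscale=True the
# parameters are stored in log space and exponentiated in forward(). A minimal
# smoke test, assuming the module is importable under this path:
import torch
from unitspeech.vocoder.activations import Snake, SnakeBeta

x = torch.randn(4, 256, 100)                         # (batch, channels, time)
y1 = Snake(in_features=256)(x)                       # x + (1/alpha) * sin^2(alpha * x)
y2 = SnakeBeta(in_features=256, alpha_logscale=True)(x)
assert y1.shape == x.shape and y2.shape == x.shape   # shape-preserving
# ---------------------------------------------------------------------------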
5 | 6 | import torch.nn as nn 7 | import torch 8 | 9 | from .tts_data import TacotronInputDataset 10 | from .model import Tacotron2 11 | from .glow import WaveGlow 12 | from .waveglow_denoiser import Denoiser 13 | from unitspeech.textlesslib.textless.checkpoint_manager import CHECKPOINT_MANAGER 14 | 15 | from typing import Union 16 | 17 | 18 | class TacotronVocoder(nn.Module): 19 | def __init__( 20 | self, 21 | tacotron_model_path: str, 22 | tacotron_dict_path: str, 23 | waveglow_path: str, 24 | max_decoder_steps: int = 2000, 25 | denoiser_strength: float = 0.1, 26 | ): 27 | super().__init__() 28 | self.max_decoder_steps = max_decoder_steps 29 | self.denoiser_strength = denoiser_strength 30 | ( 31 | self.tacotron_model, 32 | self.tacotron_sample_rate, 33 | self.tacotron_hparams, 34 | ) = load_tacotron( 35 | tacotron_model_path=tacotron_model_path, 36 | code_dict_path=tacotron_dict_path, 37 | max_decoder_steps=self.max_decoder_steps, 38 | ) 39 | self.waveglow_model, self.denoiser_model = load_waveglow_standalone( 40 | waveglow_path=waveglow_path, 41 | ) 42 | self.tts_dataset = TacotronInputDataset(self.tacotron_hparams) 43 | self.register_buffer("_float_tensor", torch.tensor([0], dtype=torch.float)) 44 | 45 | def forward(self, units: Union[str, torch.Tensor]) -> torch.Tensor: 46 | if isinstance(units, torch.Tensor): 47 | units_str = " ".join([str(x) for x in units.cpu().tolist()]) 48 | else: 49 | units_str = units 50 | tts_input = self.tts_dataset.get_tensor(units_str) 51 | tts_input = tts_input.to(self.device) 52 | _, _, aud_dn, _ = synthesize_audio( 53 | self.tacotron_model, 54 | self.waveglow_model, 55 | self.denoiser_model, 56 | tts_input.unsqueeze(0), 57 | strength=self.denoiser_strength, 58 | ) 59 | out_audio = aud_dn[0] 60 | return out_audio 61 | 62 | @classmethod 63 | def by_name( 64 | cls, 65 | dense_model_name: str, 66 | quantizer_model_name: str, 67 | vocab_size: int, 68 | max_decoder_steps: int = 2000, 69 | denoiser_strength: float = 0.1, 70 | ): 71 | waveglow_path = CHECKPOINT_MANAGER.get_by_name("waveglow") 72 | 73 | tacotron_checkpoint_name = ( 74 | f"{dense_model_name}-{quantizer_model_name}-{vocab_size}-tacotron" 75 | ) 76 | tacotron_checkpoint_path = CHECKPOINT_MANAGER.get_by_name( 77 | tacotron_checkpoint_name 78 | ) 79 | 80 | checkpoint_codes_name = f"{tacotron_checkpoint_name}-codes" 81 | tacotron_codes_path = CHECKPOINT_MANAGER.get_by_name(checkpoint_codes_name) 82 | 83 | return cls( 84 | tacotron_checkpoint_path, 85 | tacotron_codes_path, 86 | waveglow_path, 87 | max_decoder_steps, 88 | denoiser_strength, 89 | ) 90 | 91 | @property 92 | def device(self) -> torch.device: 93 | return self._float_tensor.device 94 | 95 | @property 96 | def output_sample_rate(self) -> int: 97 | return self.tacotron_sample_rate 98 | 99 | 100 | def synthesize_audio(model, waveglow, denoiser, inp, lab=None, strength=0.0): 101 | assert inp.size(0) == 1 102 | if lab is not None: 103 | lab = torch.LongTensor(1).fill_(lab) 104 | 105 | with torch.inference_mode(): 106 | model_device = next(model.parameters()).device 107 | _, mel, _, ali, has_eos = model.inference( 108 | inp.to(model_device), 109 | lab.to(model_device) if lab is not None else None, 110 | ret_has_eos=True, 111 | ) 112 | aud = waveglow.infer(mel.float(), sigma=0.666) 113 | aud_dn = denoiser(aud.half(), strength=strength).squeeze(1) 114 | return mel, aud, aud_dn, has_eos 115 | 116 | 117 | def load_tacotron(tacotron_model_path, code_dict_path, max_decoder_steps): 118 | ckpt_dict = torch.load(tacotron_model_path, 
map_location=torch.device("cpu")) 119 | hparams = ckpt_dict["hparams"] 120 | hparams.code_dict = code_dict_path 121 | hparams.max_decoder_steps = max_decoder_steps 122 | sr = hparams.sampling_rate 123 | model = Tacotron2(hparams) 124 | model.load_state_dict(ckpt_dict["model_dict"]) 125 | model = model.half() 126 | model = model.eval() 127 | return model, sr, hparams 128 | 129 | 130 | def load_waveglow_standalone(waveglow_path, device="cpu"): 131 | ckpt_dict = torch.load(waveglow_path, map_location=torch.device("cpu")) 132 | hparams = ckpt_dict["hparams"] 133 | waveglow = WaveGlow(**hparams) 134 | waveglow.load_state_dict(ckpt_dict["model_dict"]) 135 | waveglow = waveglow.eval() 136 | waveglow = waveglow.to(device) 137 | denoiser = Denoiser(waveglow) 138 | denoiser = denoiser.eval() 139 | return waveglow, denoiser 140 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/examples/gslm/sample.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | import logging 7 | import random 8 | from typing import Optional 9 | 10 | from fairseq import utils 11 | import numpy as np 12 | import torch 13 | import torchaudio 14 | 15 | from unitspeech.textlesslib.textless.data.speech_encoder import SpeechEncoder 16 | from unitspeech.textlesslib.textless.vocoders.tacotron2.vocoder import TacotronVocoder 17 | from sampler import UnitLanguageModelSampler 18 | 19 | log_format = "[%(asctime)s] [%(levelname)s]: %(message)s" 20 | logging.basicConfig(format=log_format, level=logging.INFO) 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | class GslmPipeline: 25 | def __init__(self, args): 26 | logger.info("Initializing the GSLM pipeline.") 27 | self.device = torch.device("cuda") 28 | if args.seed is not None: 29 | random.seed(args.seed) 30 | np.random.seed(args.seed) 31 | utils.set_torch_seed(args.seed) 32 | 33 | self.temperature = args.temperature 34 | self.tokens_framerate = 0.02 # HuBERT framerate 35 | self.max_length = 1000 36 | self.trim_trailing_audio_frames = 200 37 | self.sampling_kwargs = { 38 | "temperature": self.temperature, 39 | "sampling": True, 40 | "beam": 1, 41 | "prefix_size": -1, 42 | "max_len_a": 0.0, 43 | "max_len_b": self.max_length, 44 | } 45 | logger.info("... Loading the language model") 46 | self.sampler = UnitLanguageModelSampler.from_pretrained( 47 | args.language_model_data_dir, 48 | ) 49 | logger.info("=> Done!") 50 | logger.info("... Loading the encoder") 51 | 52 | self.speech_encoder = SpeechEncoder.by_name( 53 | dense_model_name="hubert-base-ls960", 54 | quantizer_model_name="kmeans", 55 | vocab_size=args.vocab_size, 56 | need_f0=False, 57 | deduplicate=True, 58 | f0_normalizer=None, 59 | f0_quantizer=None, 60 | ).cuda() 61 | 62 | logger.info("=> Done!") 63 | logger.info("... 
Loading the vocoder") 64 | self.resynthesizer = TacotronVocoder.by_name( 65 | dense_model_name="hubert-base-ls960", 66 | quantizer_model_name="kmeans", 67 | vocab_size=args.vocab_size, 68 | ).cuda() 69 | 70 | logger.info("=> Done!") 71 | logger.info("Pipeline initialized!") 72 | 73 | def __call__(self, raw_audio, sample_rate): 74 | raw_audio = self.speech_encoder.maybe_resample(raw_audio, sample_rate) 75 | 76 | sample = self.speech_encoder(raw_audio) 77 | units = sample["units"] 78 | duration = sample["durations"].sum().item() 79 | prefix_duration = self.tokens_framerate * duration 80 | target_duration = self.tokens_framerate * ( 81 | self.max_length - self.trim_trailing_audio_frames 82 | ) 83 | 84 | unit_str = " ".join(list(map(str, units.tolist()))) 85 | sampled_unit_str = self.sampler.sample([unit_str], **self.sampling_kwargs)[0] 86 | 87 | audio = self.resynthesizer(sampled_unit_str) 88 | audio = audio[ 89 | : int( 90 | self.resynthesizer.output_sample_rate 91 | * (prefix_duration + target_duration) 92 | ) 93 | ] 94 | 95 | return audio 96 | 97 | @property 98 | def output_sample_rate(self) -> int: 99 | return self.resynthesizer.output_sample_rate 100 | 101 | 102 | def main(args): 103 | pipeline = GslmPipeline(args) 104 | 105 | audio, sample_rate = torchaudio.load(args.input_file) 106 | 107 | if audio.ndim == 2: 108 | audio = audio.mean(0) 109 | 110 | if args.prompt_duration_sec: 111 | prompt = int(args.prompt_duration_sec * sample_rate) 112 | audio = audio[:prompt] 113 | 114 | generated_audio = pipeline(audio, sample_rate) 115 | 116 | torchaudio.save( 117 | args.output_file, 118 | generated_audio.cpu().unsqueeze(0), 119 | pipeline.output_sample_rate, 120 | ) 121 | 122 | 123 | def cli_main(): 124 | import argparse 125 | 126 | parser = argparse.ArgumentParser() 127 | parser.add_argument( 128 | "--input-file", 129 | type=str, 130 | required=True, 131 | help="Input filepath", 132 | ) 133 | parser.add_argument( 134 | "--language-model-data-dir", 135 | type=str, 136 | required=True, 137 | help="Path to language model dataset config path", 138 | ) 139 | parser.add_argument( 140 | "--temperature", 141 | type=float, 142 | default=0.7, 143 | help="Temperature: should be above 0.0", 144 | ) 145 | parser.add_argument( 146 | "--prompt-duration-sec", 147 | type=float, 148 | default=None, 149 | help="Cutting prompts to a maximum duration", 150 | ) 151 | parser.add_argument( 152 | "--output-file", type=str, help="Path where generated metadata is saved" 153 | ) 154 | parser.add_argument("--seed", type=int, default=0) 155 | parser.add_argument( 156 | "--vocab-size", 157 | type=int, 158 | choices=[50, 100, 200], 159 | default=100, 160 | help="Vocabulary size used", 161 | ) 162 | 163 | args = parser.parse_args() 164 | 165 | main(args) 166 | 167 | 168 | if __name__ == "__main__": 169 | cli_main() 170 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/tools/distributed_transcribe/transcribe.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
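# ---------------------------------------------------------------------------
# (Editor's usage sketch; not part of the original files. It refers to
# examples/gslm/sample.py shown above, not to transcribe.py.)
# Based on that script's argparse definitions, a speech-continuation run could
# be launched roughly as follows; the file paths are placeholders, and the
# exact working directory / PYTHONPATH needed for its imports is not pinned
# down here:
import subprocess

subprocess.run(
    [
        "python", "unitspeech/textlesslib/examples/gslm/sample.py",
        "--input-file", "prompt.wav",
        "--language-model-data-dir", "/path/to/ulm_data_dir",
        "--output-file", "continuation.wav",
        "--vocab-size", "100",           # must be one of 50 / 100 / 200
        "--prompt-duration-sec", "3.0",  # optional: trim the prompt to 3 s
        "--temperature", "0.7",
    ],
    check=True,
)
# ---------------------------------------------------------------------------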
5 | 6 | 7 | import torch.distributed as distr 8 | import torch 9 | import pathlib 10 | from data_handler import ManifestDataset 11 | from distributed import init_distributed_context 12 | 13 | import logging 14 | 15 | from unitspeech.textlesslib.textless.data.speech_encoder import SpeechEncoder 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | SAMPLING_RATE = 16_000 20 | 21 | 22 | def get_args(): 23 | import argparse 24 | 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument( 27 | "--vocab_size", 28 | default=100, 29 | type=int, 30 | help="Quantization codebook vocabulary size", 31 | ) 32 | parser.add_argument( 33 | "--dense_model", default="hubert-base-ls960", help="Dense model to be used" 34 | ) 35 | parser.add_argument( 36 | "--quantizer_model", default="kmeans", help="Quantizer model to be used" 37 | ) 38 | 39 | parser.add_argument( 40 | "--manifest", required=True, help="Path to the dataset manifest file" 41 | ) 42 | parser.add_argument( 43 | "--output", 44 | required=True, 45 | help="Path to the output files. Pseudo-units and duration (if requested) streams will be stored in files with .units and .durations suffixes, respectively", 46 | ) 47 | parser.add_argument( 48 | "--deduplicate", 49 | action="store_true", 50 | help="if set, consecutive repeats of the same pseudo-unit are collapsed ('1 2 2 2 3' becomes '1 2 3')", 51 | ) 52 | parser.add_argument( 53 | "--durations", 54 | action="store_true", 55 | help="if set, the token durations stream is produced", 56 | ) 57 | parser.add_argument( 58 | "--f0s", 59 | action="store_true", 60 | help="if set, the F0 stream is produced", 61 | ) 62 | parser.add_argument( 63 | "--preserve_name", 64 | action="store_true", 65 | help="If set, the transcript contains names of the audio files", 66 | ) 67 | parser.add_argument( 68 | "--separator", 69 | type=str, 70 | default=" ", 71 | help="Separator between pseudo-unit tokens", 72 | ) 73 | 74 | parser.add_argument("--distributed_port", type=int, default=58554) 75 | 76 | args = parser.parse_args() 77 | logger.info(f"Launched with args: {args}") 78 | 79 | return args 80 | 81 | 82 | def worker_shard_path(fname, suffix, worker_id) -> pathlib.Path: 83 | return pathlib.Path(fname).with_suffix(f".{suffix}_partial_{worker_id}") 84 | 85 | 86 | def transcribe(args, rank, world_size): 87 | dataset = ManifestDataset(args.manifest) 88 | 89 | speech_encoder = SpeechEncoder.by_name( 90 | dense_model_name=args.dense_model, 91 | quantizer_model_name=args.quantizer_model, 92 | vocab_size=args.vocab_size, 93 | deduplicate=args.deduplicate, 94 | need_f0=args.f0s, 95 | ).cuda() 96 | 97 | output_files = { 98 | "units": open(worker_shard_path(args.output, "units", rank), "w"), 99 | "durations": None 100 | if not args.durations 101 | else open(worker_shard_path(args.output, "durations", rank), "w"), 102 | "f0s": None 103 | if not args.f0s 104 | else open(worker_shard_path(args.output, "f0s", rank), "w"), 105 | } 106 | 107 | # DistributedSampler will pad the dataloader to be divisible 108 | # by the number of workers, which we do not want so we iterate directly 109 | for i in range(rank, len(dataset), world_size): 110 | waveform, name = dataset[i] 111 | encoded = speech_encoder(waveform) 112 | 113 | stream_names = ["units", "durations"] 114 | if args.f0s: 115 | stream_names += ["f0s"] 116 | 117 | for stream_name in stream_names: 118 | stream = encoded[stream_name] 119 | stream = [str(int(x)) for x in stream.tolist()] 120 | stream = args.separator.join(stream) 121 | 122 | stream = f"{name}\t{stream}" if 
args.preserve_name else stream 123 | print(stream, file=output_files[stream_name]) 124 | 125 | for fout in output_files.values(): 126 | if fout: 127 | fout.close() 128 | 129 | 130 | def main(args): 131 | context = init_distributed_context(args.distributed_port) 132 | logger.info(f"Distributed context {context}") 133 | 134 | n_gpus = torch.cuda.device_count() 135 | with torch.cuda.device(context.local_rank % n_gpus): 136 | transcribe(args, context.rank, context.world_size) 137 | 138 | if context.world_size > 1: 139 | distr.barrier() 140 | 141 | if context.is_leader: 142 | generated_streams = ["units"] 143 | if args.durations: 144 | generated_streams += ["durations"] 145 | if args.f0s: 146 | generated_streams += ["f0s"] 147 | 148 | for stream_name in generated_streams: 149 | merge_files(args.output, stream_name, context.world_size) 150 | 151 | 152 | def merge_files(full_output, suffix, n_workers): 153 | output = full_output + f".{suffix}" 154 | with open(output, "w") as full: 155 | for worker_id in range(n_workers): 156 | partial_path = worker_shard_path(full_output, suffix, worker_id) 157 | partial = open(partial_path, "r") 158 | for line in partial: 159 | print(line.strip(), file=full) 160 | partial_path.unlink() 161 | 162 | 163 | if __name__ == "__main__": 164 | args = get_args() 165 | main(args) 166 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/examples/resynthesis/resynth.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | import torchaudio 7 | from unitspeech.textlesslib.textless import dispatch_dense_model, dispatch_quantizer 8 | from unitspeech.textlesslib.textless.data.speech_encoder import SpeechEncoder 9 | from unitspeech.textlesslib.textless.vocoders.tacotron2.vocoder import TacotronVocoder 10 | 11 | 12 | def get_args(): 13 | import argparse 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument( 17 | "--dense_model_name", 18 | type=str, 19 | default="hubert-base-ls960", 20 | choices=["hubert-base-ls960", "cpc-big-ll6k"], 21 | help="Dense representation model", 22 | ) 23 | parser.add_argument( 24 | "--vocab_size", 25 | type=int, 26 | default=50, 27 | help="Vocabulary size used for resynthesis", 28 | ) 29 | parser.add_argument( 30 | "--input", 31 | required=True, 32 | help="Path to the input audio file", 33 | ) 34 | parser.add_argument( 35 | "--output", 36 | required=True, 37 | help="Path to the output audio file.", 38 | ) 39 | parser.add_argument( 40 | "--decoder_steps", 41 | type=int, 42 | default=100, 43 | help="Maximal number of decoder steps", 44 | ) 45 | 46 | args = parser.parse_args() 47 | return args 48 | 49 | 50 | def get_compression_rate(dense_model, units, wave, vocab_size, sample_rate): 51 | import numpy as np 52 | 53 | assert units.ndim == 1 54 | assert wave.ndim == 1 55 | 56 | time_in_seconds = wave.numel() / sample_rate 57 | 58 | uniform_token_entropy = np.log2(vocab_size) 59 | # calculated on LL-6k train 60 | unigram_token_entropy = { 61 | "hubert-base-ls960": { 62 | 50: 5.458528917634601, 63 | 100: 6.44513268276806, 64 | 200: 7.477069233162813, 65 | }, 66 | "cpc-big-ll6k": { 67 | 50: 5.428271158461133, 68 | 100: 6.413083187885448, 69 | 200: 7.44253841579776, 70 | }, 71 | }[dense_model][vocab_size] 72 | 73 | uniform_bps = uniform_token_entropy * units.size(0) / 
time_in_seconds 74 | unigram_entropy = unigram_token_entropy * units.size(0) / time_in_seconds 75 | 76 | return uniform_bps, unigram_entropy 77 | 78 | 79 | def main(args): 80 | dense_model_name = args.dense_model_name 81 | quantizer_name = "kmeans" 82 | 83 | # We can build a speech encoder module using names of pre-trained dense and quantizer models. 84 | # The call below will download appropriate checkpoints as needed behind the scenes 85 | encoder = SpeechEncoder.by_name( 86 | dense_model_name=dense_model_name, 87 | quantizer_model_name=quantizer_name, 88 | vocab_size=args.vocab_size, 89 | need_f0=False, 90 | deduplicate=True, 91 | f0_normalizer=None, 92 | f0_quantizer=None, 93 | ).cuda() 94 | 95 | # Alternatively, we can pass dense/quantizer models directly. 96 | # Here, we'll look up the same models as above, but generally those 97 | # could be any other models. 98 | dense_model = dispatch_dense_model(dense_model_name) 99 | quantizer_model = dispatch_quantizer( 100 | dense_model_name, quantizer_name, args.vocab_size 101 | ) 102 | 103 | # .. and use them when initializing the encoder. Same constructor can be used to when we want 104 | # to use models other than pre-defined. 105 | encoder = SpeechEncoder( 106 | dense_model=dense_model, 107 | quantizer_model=quantizer_model, 108 | need_f0=False, 109 | deduplicate=True, 110 | f0_normalizer=None, 111 | f0_quantizer=None, 112 | ).cuda() 113 | 114 | # now let's load an audio example 115 | waveform, input_sample_rate = torchaudio.load(args.input) 116 | if waveform.ndim == 2: 117 | waveform = waveform.mean(dim=0) 118 | 119 | waveform = encoder.maybe_resample(waveform, input_sample_rate) 120 | 121 | # now and convert it in a stream of deduplicated units (as in GSLM) 122 | encoded = encoder(waveform.cuda()) 123 | # encoded is a dict with keys ('dense', 'units', 'durations'). It can also contain 'f0' if SpeechEncoder 124 | # was initialized with need_f0=True flag. 125 | units = encoded[ 126 | "units" 127 | ] # tensor([71, 12, 57, 12, 57, 12, 57, 12, ...], device='cuda:0', dtype=torch.int32) 128 | 129 | # as with encoder, we can setup vocoder by specifying names of pretrained models 130 | # or by passing checkpoint paths directly. The dense/quantizer models are not invokes, 131 | # we just use their names as an index. 132 | vocoder = TacotronVocoder.by_name( 133 | dense_model_name, 134 | quantizer_name, 135 | args.vocab_size, 136 | ).cuda() 137 | 138 | # now we turn those units back into the audio. 
139 | audio = vocoder(units) 140 | 141 | # save the audio 142 | torchaudio.save( 143 | args.output, audio.cpu().float().unsqueeze(0), vocoder.output_sample_rate 144 | ) 145 | 146 | uniform_bps, learned_bps = get_compression_rate( 147 | dense_model_name, units, waveform, args.vocab_size, encoder.expected_sample_rate 148 | ) 149 | 150 | print( 151 | f"Audio of length {round(waveform.size(0) / 16_000, 1)} seconds represented as {units.numel()} tokens" 152 | ) 153 | print( 154 | f"\tAssuming uniform token distribution: {round(uniform_bps, 1)} bits per second" 155 | ) 156 | print( 157 | f"\tAssuming unigram token distribution estimated on LL-6K train: {round(learned_bps, 1)} bits per second" 158 | ) 159 | 160 | 161 | if __name__ == "__main__": 162 | args = get_args() 163 | main(args) 164 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/layers.py: -------------------------------------------------------------------------------- 1 | # BSD 3-Clause License 2 | 3 | # Copyright (c) 2018, NVIDIA Corporation 4 | # All rights reserved. 5 | 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | 9 | # * Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | 12 | # * Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | 16 | # * Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 19 | 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
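# ---------------------------------------------------------------------------
# (Editor's note; not part of the original files. It makes the bitrate math in
# examples/resynthesis/resynth.py above concrete.) get_compression_rate()
# computes bits/s as token_entropy * num_units / duration. For vocab_size=100
# the uniform entropy is log2(100) ~= 6.64 bits/token, so a hypothetical
# 10-second clip encoded as 300 deduplicated units would give roughly
#   uniform:  6.64  * 300 / 10 ~= 199 bits/s
#   unigram:  6.445 * 300 / 10 ~= 193 bits/s   (hubert-base-ls960 table value)
# The 300-unit count is illustrative only; real counts depend on the audio and
# on deduplication.
# ---------------------------------------------------------------------------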
30 | 31 | """https://github.com/NVIDIA/tacotron2""" 32 | 33 | import torch 34 | from librosa.filters import mel as librosa_mel_fn 35 | from .audio_processing import dynamic_range_compression 36 | from .audio_processing import dynamic_range_decompression 37 | from .stft import STFT 38 | from .utils import get_mask_from_lengths 39 | 40 | 41 | class LinearNorm(torch.nn.Module): 42 | def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'): 43 | super(LinearNorm, self).__init__() 44 | self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias) 45 | 46 | torch.nn.init.xavier_uniform_( 47 | self.linear_layer.weight, 48 | gain=torch.nn.init.calculate_gain(w_init_gain)) 49 | 50 | def forward(self, x): 51 | return self.linear_layer(x) 52 | 53 | 54 | class ConvNorm(torch.nn.Module): 55 | def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, 56 | padding=None, dilation=1, bias=True, w_init_gain='linear'): 57 | super(ConvNorm, self).__init__() 58 | if padding is None: 59 | assert(kernel_size % 2 == 1) 60 | padding = int(dilation * (kernel_size - 1) / 2) 61 | 62 | self.conv = torch.nn.Conv1d(in_channels, out_channels, 63 | kernel_size=kernel_size, stride=stride, 64 | padding=padding, dilation=dilation, 65 | bias=bias) 66 | 67 | torch.nn.init.xavier_uniform_( 68 | self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) 69 | 70 | def forward(self, signal): 71 | conv_signal = self.conv(signal) 72 | return conv_signal 73 | 74 | 75 | class GlobalAvgPool(torch.nn.Module): 76 | def __init__(self): 77 | super(GlobalAvgPool, self).__init__() 78 | 79 | def forward(self, x, lengths=None): 80 | """Average pooling across time steps (dim=1) with optionally lengths. 81 | Args: 82 | x: torch.Tensor of shape (N, T, ...) 83 | lengths: None or torch.Tensor of shape (N,) 84 | dim: dimension to pool 85 | """ 86 | if lengths is None: 87 | return x.mean(dim=1, keepdim=False) 88 | else: 89 | mask = get_mask_from_lengths(lengths).type(x.type()).to(x.device) 90 | mask_shape = list(mask.size()) + [1 for _ in range(x.ndimension()-2)] 91 | mask = mask.reshape(*mask_shape) 92 | numer = (x * mask).sum(dim=1, keepdim=False) 93 | denom = mask.sum(dim=1, keepdim=False) 94 | return numer / denom 95 | 96 | 97 | class TacotronSTFT(torch.nn.Module): 98 | def __init__(self, filter_length=1024, hop_length=256, win_length=1024, 99 | n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0, 100 | mel_fmax=8000.0): 101 | super(TacotronSTFT, self).__init__() 102 | self.n_mel_channels = n_mel_channels 103 | self.sampling_rate = sampling_rate 104 | self.stft_fn = STFT(filter_length, hop_length, win_length) 105 | mel_basis = librosa_mel_fn( 106 | sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax) 107 | mel_basis = torch.from_numpy(mel_basis).float() 108 | self.register_buffer('mel_basis', mel_basis) 109 | 110 | def spectral_normalize(self, magnitudes): 111 | output = dynamic_range_compression(magnitudes) 112 | return output 113 | 114 | def spectral_de_normalize(self, magnitudes): 115 | output = dynamic_range_decompression(magnitudes) 116 | return output 117 | 118 | def mel_spectrogram(self, y): 119 | """Computes mel-spectrograms from a batch of waves 120 | PARAMS 121 | ------ 122 | y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] 123 | 124 | RETURNS 125 | ------- 126 | mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) 127 | """ 128 | assert(torch.min(y.data) >= -1) 129 | assert(torch.max(y.data) <= 1) 130 | 131 | magnitudes, phases = self.stft_fn.transform(y) 132 | 
magnitudes = magnitudes.data 133 | mel_output = torch.matmul(self.mel_basis, magnitudes) 134 | mel_output = self.spectral_normalize(mel_output) 135 | return mel_output 136 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/stft.py: -------------------------------------------------------------------------------- 1 | """ 2 | BSD 3-Clause License 3 | 4 | Copyright (c) 2017, Prem Seetharaman 5 | All rights reserved. 6 | 7 | * Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, this 14 | list of conditions and the following disclaimer in the 15 | documentation and/or other materials provided with the distribution. 16 | 17 | * Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived from this 19 | software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
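# ---------------------------------------------------------------------------
# (Editor's sketch; not part of the original files. It refers to TacotronSTFT
# in tacotron2/layers.py shown above.) mel_spectrogram() expects a (B, T)
# waveform scaled to [-1, 1] and returns a log-compressed (B, 80, n_frames)
# mel. The import path and the librosa version (old enough to accept the
# positional mel-filter call in layers.py) are assumptions:
import torch
from unitspeech.textlesslib.textless.vocoders.tacotron2.layers import TacotronSTFT

stft = TacotronSTFT()                 # defaults: 1024/256/1024, 80 mels, 22.05 kHz
wav = torch.rand(1, 22050) * 2 - 1    # one second of fake audio in [-1, 1)
mel = stft.mel_spectrogram(wav)       # -> (1, 80, n_frames)
# ---------------------------------------------------------------------------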
31 | """ 32 | 33 | import torch 34 | import numpy as np 35 | import torch.nn.functional as F 36 | from torch.autograd import Variable 37 | from scipy.signal import get_window 38 | from librosa.util import pad_center, tiny 39 | from .audio_processing import window_sumsquare 40 | 41 | 42 | class STFT(torch.nn.Module): 43 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" 44 | def __init__(self, filter_length=800, hop_length=200, win_length=800, 45 | window='hann'): 46 | super(STFT, self).__init__() 47 | self.filter_length = filter_length 48 | self.hop_length = hop_length 49 | self.win_length = win_length 50 | self.window = window 51 | self.forward_transform = None 52 | scale = self.filter_length / self.hop_length 53 | fourier_basis = np.fft.fft(np.eye(self.filter_length)) 54 | 55 | cutoff = int((self.filter_length / 2 + 1)) 56 | fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), 57 | np.imag(fourier_basis[:cutoff, :])]) 58 | 59 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) 60 | inverse_basis = torch.FloatTensor( 61 | np.linalg.pinv(scale * fourier_basis).T[:, None, :]) 62 | 63 | if window is not None: 64 | assert(filter_length >= win_length) 65 | # get window and zero center pad it to filter_length 66 | fft_window = get_window(window, win_length, fftbins=True) 67 | fft_window = pad_center(fft_window, filter_length) 68 | fft_window = torch.from_numpy(fft_window).float() 69 | 70 | # window the bases 71 | forward_basis *= fft_window 72 | inverse_basis *= fft_window 73 | 74 | self.register_buffer('forward_basis', forward_basis.float()) 75 | self.register_buffer('inverse_basis', inverse_basis.float()) 76 | 77 | def transform(self, input_data): 78 | num_batches = input_data.size(0) 79 | num_samples = input_data.size(1) 80 | 81 | self.num_samples = num_samples 82 | 83 | # similar to librosa, reflect-pad the input 84 | input_data = input_data.view(num_batches, 1, num_samples) 85 | input_data = F.pad( 86 | input_data.unsqueeze(1), 87 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), 88 | mode='reflect') 89 | input_data = input_data.squeeze(1) 90 | 91 | forward_transform = F.conv1d( 92 | input_data, 93 | Variable(self.forward_basis, requires_grad=False), 94 | stride=self.hop_length, 95 | padding=0) 96 | 97 | cutoff = int((self.filter_length / 2) + 1) 98 | real_part = forward_transform[:, :cutoff, :] 99 | imag_part = forward_transform[:, cutoff:, :] 100 | 101 | magnitude = torch.sqrt(real_part**2 + imag_part**2) 102 | phase = torch.autograd.Variable( 103 | torch.atan2(imag_part.data, real_part.data)) 104 | 105 | return magnitude, phase 106 | 107 | def inverse(self, magnitude, phase): 108 | recombine_magnitude_phase = torch.cat( 109 | [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) 110 | 111 | inverse_transform = F.conv_transpose1d( 112 | recombine_magnitude_phase, 113 | Variable(self.inverse_basis, requires_grad=False), 114 | stride=self.hop_length, 115 | padding=0) 116 | 117 | if self.window is not None: 118 | window_sum = window_sumsquare( 119 | self.window, magnitude.size(-1), hop_length=self.hop_length, 120 | win_length=self.win_length, n_fft=self.filter_length, 121 | dtype=np.float32) 122 | # remove modulation effects 123 | approx_nonzero_indices = torch.from_numpy( 124 | np.where(window_sum > tiny(window_sum))[0]) 125 | window_sum = torch.autograd.Variable( 126 | torch.from_numpy(window_sum), requires_grad=False) 127 | window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum 128 | 
inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices] 129 | 130 | # scale by hop ratio 131 | inverse_transform *= float(self.filter_length) / self.hop_length 132 | 133 | inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] 134 | inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):] 135 | 136 | return inverse_transform 137 | 138 | def forward(self, input_data): 139 | self.magnitude, self.phase = self.transform(input_data) 140 | reconstruction = self.inverse(self.magnitude, self.phase) 141 | return reconstruction 142 | -------------------------------------------------------------------------------- /scripts/voice_conversion.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import librosa 4 | import os 5 | from scipy.io.wavfile import write 6 | import torch 7 | import torchaudio 8 | from transformers import HubertModel 9 | 10 | from unitspeech.unitspeech import UnitSpeech 11 | from unitspeech.encoder import Encoder 12 | from unitspeech.text import symbols 13 | from unitspeech.util import HParams, fix_len_compatibility, sequence_mask 14 | from unitspeech.vocoder.env import AttrDict 15 | from unitspeech.vocoder.models import BigVGAN 16 | 17 | 18 | @torch.no_grad() 19 | def voice_conversion( 20 | args, contentvec_encoder, decoder, contentvec, contentvec_length, mel_length, spk_emb, num_downsamplings_in_unet 21 | ): 22 | cond_x, x, x_mask = contentvec_encoder(contentvec, contentvec_length) 23 | cond_y = cond_x 24 | y_lengths = torch.LongTensor([contentvec_length]).to(contentvec.device) 25 | 26 | encoder_outputs = torch.nn.functional.interpolate( 27 | cond_y, size=mel_length, mode='linear' 28 | ) 29 | y_max_length = mel_length 30 | y_max_length_ = fix_len_compatibility(mel_length, num_downsamplings_in_unet) 31 | cond_y = torch.cat([encoder_outputs, torch.zeros_like(encoder_outputs)[:, :, :y_max_length_ - mel_length]], dim=-1) 32 | y_mask = sequence_mask(torch.LongTensor([mel_length]).to(y_lengths.device), y_max_length_)\ 33 | .unsqueeze(1).to(x_mask.dtype) 34 | 35 | z = torch.randn_like(cond_y, device=cond_y.device) 36 | 37 | # Generate sample by performing reverse dynamics 38 | decoder_outputs = decoder( 39 | z, y_mask, cond_y, spk_emb, args.diffusion_step, 40 | text_gradient_scale=args.text_gradient_scale, spk_gradient_scale=args.spk_gradient_scale 41 | ) 42 | decoder_outputs = decoder_outputs[:, :, :y_max_length] 43 | return decoder_outputs 44 | 45 | 46 | class HubertModelWithFinalProj(HubertModel): 47 | def __init__(self, config): 48 | super().__init__(config) 49 | 50 | # The final projection layer is only used for backward compatibility. 51 | # Following https://github.com/auspicious3000/contentvec/issues/6 52 | # Remove this layer is necessary to achieve the desired outcome. 53 | self.final_proj = torch.nn.Linear(config.hidden_size, config.classifier_proj_size) 54 | 55 | 56 | def main(args, hps): 57 | # Load the source audio and extract the contentvec. 
58 | contentvec_extractor = HubertModelWithFinalProj.from_pretrained("lengyue233/content-vec-best") 59 | _ = contentvec_extractor.cuda().eval() 60 | 61 | wav, sr = librosa.load(args.source_path) 62 | wav = torch.FloatTensor(wav).unsqueeze(0) 63 | resample_fn = torchaudio.transforms.Resample(sr, 16000).to("cuda") 64 | wav = wav.cuda() 65 | mel_length = wav.shape[-1] // hps.data.hop_length 66 | 67 | wav = resample_fn(wav) 68 | contentvec = contentvec_extractor(wav)["last_hidden_state"] 69 | 70 | # Initialize & load model 71 | contentvec_encoder = Encoder( 72 | n_vocab=len(symbols) + 1, 73 | n_feats=hps.data.n_feats, 74 | **hps.encoder 75 | ) 76 | 77 | contentvec_encoder_dict = torch.load(args.encoder_path, map_location=lambda loc, storage: loc) 78 | contentvec_encoder.load_state_dict(contentvec_encoder_dict['model']) 79 | _ = contentvec_encoder.cuda().eval() 80 | 81 | unitspeech = UnitSpeech( 82 | n_feats=hps.data.n_feats, 83 | **hps.decoder 84 | ) 85 | 86 | decoder_dict = torch.load(args.decoder_path, map_location=lambda loc, storage: loc) 87 | unitspeech.load_state_dict(decoder_dict['model']) 88 | _ = unitspeech.cuda().train() 89 | 90 | # Initialize & load vocoder. 91 | with open(hps.train.vocoder_config_path) as f: 92 | h = AttrDict(json.load(f)) 93 | vocoder = BigVGAN(h) 94 | vocoder.load_state_dict(torch.load(hps.train.vocoder_ckpt_path, map_location=lambda loc, storage: loc)['generator']) 95 | _ = vocoder.cuda().eval() 96 | vocoder.remove_weight_norm() 97 | 98 | # Prepare input 99 | contentvec = contentvec.cuda() 100 | contentvec_length = torch.LongTensor([contentvec.shape[1]]).cuda() 101 | 102 | spk_emb = decoder_dict['spk_emb'].cuda() 103 | 104 | # Load the normalization parameters for mel-spectrogram normalization. 105 | mel_min = decoder_dict['mel_min'].cuda() 106 | mel_max = decoder_dict['mel_max'].cuda() 107 | 108 | with torch.no_grad(): 109 | mel_generated = voice_conversion( 110 | args, contentvec_encoder, unitspeech, 111 | contentvec, contentvec_length, mel_length, spk_emb, len(hps.decoder.dim_mults) - 1 112 | ) 113 | 114 | mel_generated = ((mel_generated + 1) / 2 * (mel_max.to(mel_generated.device) - mel_min.to(mel_generated.device)) 115 | + mel_min.to(mel_generated.device)) 116 | 117 | audio_generated = vocoder.forward(mel_generated).cpu().squeeze().clamp(-1, 1).numpy() 118 | 119 | if "/" in args.generated_sample_path: 120 | os.makedirs(os.path.dirname(args.generated_sample_path), exist_ok=True) 121 | write(args.generated_sample_path, hps.data.sampling_rate, audio_generated) 122 | 123 | 124 | if __name__ == "__main__": 125 | parser = argparse.ArgumentParser() 126 | parser.add_argument('--encoder_path', type=str, default="unitspeech/checkpoints/contentvec_encoder.pt", 127 | help='Path of the text encoder checkpoint.') 128 | parser.add_argument('--decoder_path', type=str, default="unitspeech/outputs/finetuned_decoder.pt", 129 | help='Path of the finetuned decoder checkpoint.') 130 | parser.add_argument('--config_path', type=str, default="unitspeech/checkpoints/voice-conversion.json", 131 | help='Path to the configuration file for voice conversion.') 132 | parser.add_argument('--generated_sample_path', type=str, default="unitspeech/outputs/output_vc.wav", 133 | help='The path to save the generated audio.') 134 | 135 | parser.add_argument('--source_path', type=str, required=True, 136 | help='The source audio file path for voice conversion.') 137 | parser.add_argument('--text_gradient_scale', type=float, default=1.0, 138 | help='Gradient scale of classifier-free guidance (cfg) 
for text condition. (0.0: wo cfg)') 139 | parser.add_argument('--spk_gradient_scale', type=float, default=1.0, 140 | help='Gradient scale of classifier-free guidance (cfg) for speaker condition. (0.0: wo cfg)') 141 | parser.add_argument('--diffusion_step', type=int, default=50, 142 | help='The number of iterations for sampling in the diffusion model.') 143 | args = parser.parse_args() 144 | 145 | with open(args.config_path, "r") as f: 146 | data = f.read() 147 | config = json.loads(data) 148 | 149 | hps = HParams(**config) 150 | 151 | main(args, hps) -------------------------------------------------------------------------------- /unitspeech/vocoder/meldataset.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import math 4 | import os 5 | import random 6 | import torch 7 | import torch.utils.data 8 | import numpy as np 9 | from librosa.util import normalize 10 | from scipy.io.wavfile import read 11 | from librosa.filters import mel as librosa_mel_fn 12 | 13 | MAX_WAV_VALUE = 32768.0 14 | 15 | 16 | def load_wav(full_path): 17 | sampling_rate, data = read(full_path) 18 | return data, sampling_rate 19 | 20 | 21 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 22 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 23 | 24 | 25 | def dynamic_range_decompression(x, C=1): 26 | return np.exp(x) / C 27 | 28 | 29 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 30 | return torch.log(torch.clamp(x, min=clip_val) * C) 31 | 32 | 33 | def dynamic_range_decompression_torch(x, C=1): 34 | return torch.exp(x) / C 35 | 36 | 37 | def spectral_normalize_torch(magnitudes): 38 | output = dynamic_range_compression_torch(magnitudes) 39 | return output 40 | 41 | 42 | def spectral_de_normalize_torch(magnitudes): 43 | output = dynamic_range_decompression_torch(magnitudes) 44 | return output 45 | 46 | 47 | mel_basis = {} 48 | hann_window = {} 49 | 50 | 51 | def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 52 | if torch.min(y) < -1.: 53 | print('min value is ', torch.min(y)) 54 | if torch.max(y) > 1.: 55 | print('max value is ', torch.max(y)) 56 | 57 | global mel_basis, hann_window 58 | if fmax not in mel_basis: 59 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) 60 | mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device) 61 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 62 | 63 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 64 | y = y.squeeze(1) 65 | 66 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)], 67 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True) 68 | 69 | spec = torch.sqrt(torch.real(spec * spec.conj() + 1e-9)) 70 | 71 | spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec) 72 | spec = spectral_normalize_torch(spec) 73 | 74 | return spec 75 | 76 | 77 | def get_dataset_filelist(a): 78 | with open(a.input_training_file, 'r', encoding='utf-8') as fi: 79 | training_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav') 80 | for x in fi.read().split('\n') if len(x) > 0] 81 | 82 | with open(a.input_validation_file, 'r', encoding='utf-8') as fi: 83 | validation_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav') 84 | for x in fi.read().split('\n') 
if len(x) > 0] 85 | return training_files, validation_files 86 | 87 | 88 | class MelDataset(torch.utils.data.Dataset): 89 | def __init__(self, training_files, segment_size, n_fft, num_mels, 90 | hop_size, win_size, sampling_rate, fmin, fmax, split=True, shuffle=True, n_cache_reuse=1, 91 | device=None, fmax_loss=None, fine_tuning=False, base_mels_path=None): 92 | self.audio_files = training_files 93 | random.seed(1234) 94 | if shuffle: 95 | random.shuffle(self.audio_files) 96 | self.segment_size = segment_size 97 | self.sampling_rate = sampling_rate 98 | self.split = split 99 | self.n_fft = n_fft 100 | self.num_mels = num_mels 101 | self.hop_size = hop_size 102 | self.win_size = win_size 103 | self.fmin = fmin 104 | self.fmax = fmax 105 | self.fmax_loss = fmax_loss 106 | self.cached_wav = None 107 | self.n_cache_reuse = n_cache_reuse 108 | self._cache_ref_count = 0 109 | self.device = device 110 | self.fine_tuning = fine_tuning 111 | self.base_mels_path = base_mels_path 112 | 113 | def __getitem__(self, index): 114 | filename = self.audio_files[index] 115 | if self._cache_ref_count == 0: 116 | audio, sampling_rate = load_wav(filename) 117 | audio = audio / MAX_WAV_VALUE 118 | if not self.fine_tuning: 119 | audio = normalize(audio) * 0.95 120 | self.cached_wav = audio 121 | if sampling_rate != self.sampling_rate: 122 | raise ValueError("{} SR doesn't match target {} SR".format( 123 | sampling_rate, self.sampling_rate)) 124 | self._cache_ref_count = self.n_cache_reuse 125 | else: 126 | audio = self.cached_wav 127 | self._cache_ref_count -= 1 128 | 129 | audio = torch.FloatTensor(audio) 130 | audio = audio.unsqueeze(0) 131 | 132 | if not self.fine_tuning: 133 | if self.split: 134 | if audio.size(1) >= self.segment_size: 135 | max_audio_start = audio.size(1) - self.segment_size 136 | audio_start = random.randint(0, max_audio_start) 137 | audio = audio[:, audio_start:audio_start+self.segment_size] 138 | else: 139 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant') 140 | 141 | mel = mel_spectrogram(audio, self.n_fft, self.num_mels, 142 | self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax, 143 | center=False) 144 | else: 145 | mel = np.load( 146 | os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + '.npy')) 147 | mel = torch.from_numpy(mel) 148 | 149 | if len(mel.shape) < 3: 150 | mel = mel.unsqueeze(0) 151 | 152 | if self.split: 153 | frames_per_seg = math.ceil(self.segment_size / self.hop_size) 154 | 155 | if audio.size(1) >= self.segment_size: 156 | mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1) 157 | mel = mel[:, :, mel_start:mel_start + frames_per_seg] 158 | audio = audio[:, mel_start * self.hop_size:(mel_start + frames_per_seg) * self.hop_size] 159 | else: 160 | mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), 'constant') 161 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant') 162 | 163 | mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels, 164 | self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax_loss, 165 | center=False) 166 | 167 | return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze()) 168 | 169 | def __len__(self): 170 | return len(self.audio_files) 171 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/data/cpc_feature_reader.py: -------------------------------------------------------------------------------- 1 
| # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | 11 | class CpcFeatureReader(torch.nn.Module): 12 | def __init__( 13 | self, 14 | checkpoint_path, 15 | layer=2, 16 | use_encoder_layer=False, 17 | norm_features=False, 18 | max_chunk=64000, 19 | **kwargs, 20 | ): 21 | super().__init__() 22 | 23 | self.model = self.load_cpc_model(checkpoint_path, layer).eval() 24 | self.max_chunk = max_chunk 25 | self.norm_features = norm_features 26 | self.use_encoder_layer = use_encoder_layer 27 | 28 | def forward(self, x: torch.Tensor) -> torch.Tensor: 29 | return self.get_features(x) 30 | 31 | @torch.inference_mode() 32 | def get_features(self, x: torch.Tensor) -> torch.Tensor: 33 | x = x.view(1, 1, -1) 34 | size = x.size(2) 35 | feat = [] 36 | start = 0 37 | while start < size: 38 | if start + self.max_chunk > size: 39 | break 40 | x_chunk = x[..., start : start + self.max_chunk] 41 | feat_chunk = self.model.extract_features( 42 | source=x_chunk, 43 | get_encoded=self.use_encoder_layer, 44 | norm_output=self.norm_features, 45 | ) 46 | feat.append(feat_chunk) 47 | start += self.max_chunk 48 | 49 | if start < size: 50 | x_chunk = x[:, -self.max_chunk :] 51 | feat_chunk = self.model.extract_features( 52 | source=x_chunk, 53 | get_encoded=self.use_encoder_layer, 54 | norm_output=self.norm_features, 55 | ) 56 | df = x_chunk.size(2) // feat_chunk.size(1) 57 | delta = (size - start) // df 58 | feat.append(feat_chunk[:, -delta:]) 59 | return torch.cat(feat, 1).squeeze(0) 60 | 61 | @property 62 | def code_hop_size(self) -> int: 63 | return 160 64 | 65 | @property 66 | def expected_sample_rate(self) -> int: 67 | return 16_000 68 | 69 | @staticmethod 70 | def load_cpc_model(checkpoint_path: str, layer: int = 2) -> torch.nn.Module: 71 | state_dict = torch.load(checkpoint_path) 72 | weights = state_dict["weights"] 73 | config = state_dict["config"] 74 | if layer is not None: 75 | config["nLevelsGRU"] = layer 76 | 77 | encoder = CPCEncoder(config["hiddenEncoder"]) 78 | ar_net = CPCAR( 79 | config["hiddenEncoder"], config["hiddenGar"], False, config["nLevelsGRU"] 80 | ) 81 | 82 | model = CPCModel(encoder, ar_net) 83 | model.load_state_dict(weights, strict=False) 84 | model.config = config 85 | 86 | return model 87 | 88 | 89 | class ChannelNorm(nn.Module): 90 | def __init__(self, num_features, epsilon=1e-05, affine=True): 91 | super(ChannelNorm, self).__init__() 92 | if affine: 93 | self.weight = nn.parameter.Parameter(torch.Tensor(1, num_features, 1)) 94 | self.bias = nn.parameter.Parameter(torch.Tensor(1, num_features, 1)) 95 | else: 96 | self.weight = None 97 | self.bias = None 98 | self.epsilon = epsilon 99 | self.p = 0 100 | self.affine = affine 101 | self.reset_parameters() 102 | 103 | def reset_parameters(self): 104 | if self.affine: 105 | torch.nn.init.ones_(self.weight) 106 | torch.nn.init.zeros_(self.bias) 107 | 108 | def forward(self, x): 109 | cum_mean = x.mean(dim=1, keepdim=True) 110 | cum_var = x.var(dim=1, keepdim=True) 111 | x = (x - cum_mean) * torch.rsqrt(cum_var + self.epsilon) 112 | if self.weight is not None: 113 | x = x * self.weight + self.bias 114 | return x 115 | 116 | 117 | class CPCEncoder(nn.Module): 118 | def __init__(self, hidden_dim=512): 119 | super(CPCEncoder, self).__init__() 120 | self.conv0 = nn.Conv1d(1, hidden_dim, 10, stride=5, padding=3) 121 | 
self.batchNorm0 = ChannelNorm(hidden_dim) 122 | self.conv1 = nn.Conv1d(hidden_dim, hidden_dim, 8, stride=4, padding=2) 123 | self.batchNorm1 = ChannelNorm(hidden_dim) 124 | self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, 4, stride=2, padding=1) 125 | self.batchNorm2 = ChannelNorm(hidden_dim) 126 | self.conv3 = nn.Conv1d(hidden_dim, hidden_dim, 4, stride=2, padding=1) 127 | self.batchNorm3 = ChannelNorm(hidden_dim) 128 | self.conv4 = nn.Conv1d(hidden_dim, hidden_dim, 4, stride=2, padding=1) 129 | self.batchNorm4 = ChannelNorm(hidden_dim) 130 | self.DOWNSAMPLING = 160 131 | 132 | def get_output_dim(self): 133 | return self.conv4.out_channels 134 | 135 | def forward(self, x): 136 | x = F.relu(self.batchNorm0(self.conv0(x))) 137 | x = F.relu(self.batchNorm1(self.conv1(x))) 138 | x = F.relu(self.batchNorm2(self.conv2(x))) 139 | x = F.relu(self.batchNorm3(self.conv3(x))) 140 | x = F.relu(self.batchNorm4(self.conv4(x))) 141 | return x 142 | 143 | 144 | class CPCAR(nn.Module): 145 | def __init__(self, dim_encoded, dim_output, keep_hidden, num_layers): 146 | super(CPCAR, self).__init__() 147 | self.baseNet = nn.LSTM( 148 | dim_encoded, dim_output, num_layers=num_layers, batch_first=True 149 | ) 150 | self.hidden = None 151 | self.keep_hidden = keep_hidden 152 | 153 | def get_output_dim(self): 154 | return self.baseNet.hidden_size 155 | 156 | def forward(self, x): 157 | try: 158 | self.baseNet.flatten_parameters() 159 | except RuntimeError: 160 | pass 161 | x, h = self.baseNet(x, self.hidden) 162 | if self.keep_hidden: 163 | if isinstance(h, tuple): 164 | self.hidden = tuple(x.detach() for x in h) 165 | else: 166 | self.hidden = h.detach() 167 | return x 168 | 169 | 170 | class CPCModel(nn.Module): 171 | def __init__(self, encoder, ar_net): 172 | super(CPCModel, self).__init__() 173 | self.gEncoder = encoder 174 | self.gAR = ar_net 175 | self.config = None 176 | 177 | def forward(self, x, label): 178 | encoded = self.gEncoder(x).permute(0, 2, 1) 179 | cpc_feature = self.gAR(encoded) 180 | return cpc_feature, encoded, label 181 | 182 | def extract_features(self, source, get_encoded=False, norm_output=False): 183 | cpc_feature, encoded, _ = self.forward(source, None) 184 | if get_encoded: 185 | cpc_feature = encoded 186 | if norm_output: 187 | mean = cpc_feature.mean(dim=1, keepdim=True) 188 | var = cpc_feature.var(dim=1, keepdim=True) 189 | cpc_feature = (cpc_feature - mean) / torch.sqrt(var + 1e-08) 190 | return cpc_feature 191 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/textless/vocoders/tacotron2/utils.py: -------------------------------------------------------------------------------- 1 | # BSD 3-Clause License 2 | 3 | # Copyright (c) 2018, NVIDIA Corporation 4 | # All rights reserved. 5 | 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | 9 | # * Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | 12 | # * Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | 16 | # * Neither the name of the copyright holder nor the names of its 17 | # contributors may be used to endorse or promote products derived from 18 | # this software without specific prior written permission. 
19 | 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | """https://github.com/NVIDIA/tacotron2""" 32 | 33 | import collections 34 | import io 35 | import json 36 | import librosa 37 | import numpy as np 38 | import soundfile as sf 39 | import time 40 | import torch 41 | from scipy.io.wavfile import read 42 | from .text import SOS_TOK, EOS_TOK 43 | 44 | 45 | def get_mask_from_lengths(lengths): 46 | max_len = torch.max(lengths).item() 47 | ids = torch.arange(0, max_len, out=torch.cuda.LongTensor(max_len)) 48 | mask = (ids < lengths.unsqueeze(1)) 49 | return mask 50 | 51 | 52 | def load_wav_to_torch(full_path, sr=None): 53 | data, sr = librosa.load(full_path, sr=sr) 54 | data = np.clip(data, -1, 1) # potentially out of [-1, 1] due to resampling 55 | data = data * 32768.0 # match values loaded by scipy 56 | return torch.FloatTensor(data.astype(np.float32)), sr 57 | 58 | 59 | def read_binary_audio(bin_data, tar_sr=None): 60 | """ 61 | read binary audio (`bytes` or `uint8` `numpy.ndarray`) to `float32` 62 | `numpy.ndarray` 63 | 64 | RETURNS: 65 | data (np.ndarray) : audio of shape (n,) or (2, n) 66 | tar_sr (int) : sample rate 67 | """ 68 | data, ori_sr = sf.read(io.BytesIO(bin_data), dtype='float32') 69 | data = data.T 70 | if (tar_sr is not None) and (ori_sr != tar_sr): 71 | data = librosa.resample(data, ori_sr, tar_sr) 72 | else: 73 | tar_sr = ori_sr 74 | data = np.clip(data, -1, 1) 75 | data = data * 32768.0 76 | return torch.FloatTensor(data.astype(np.float32)), tar_sr 77 | 78 | 79 | def load_filepaths_and_text(filename): 80 | with open(filename, encoding='utf-8') as f: 81 | data = [json.loads(line.rstrip()) for line in f] 82 | return data 83 | 84 | 85 | def to_gpu(x): 86 | x = x.contiguous() 87 | 88 | if torch.cuda.is_available(): 89 | x = x.cuda(non_blocking=True) 90 | return torch.autograd.Variable(x) 91 | 92 | 93 | def load_code_dict(path, add_sos=False, add_eos=False): 94 | if not path: 95 | return {} 96 | 97 | with open(path, 'r') as f: 98 | codes = ['_'] + [line.rstrip() for line in f] # '_' for pad 99 | code_dict = {c: i for i, c in enumerate(codes)} 100 | 101 | if add_sos: 102 | code_dict[SOS_TOK] = len(code_dict) 103 | if add_eos: 104 | code_dict[EOS_TOK] = len(code_dict) 105 | assert(set(code_dict.values()) == set(range(len(code_dict)))) 106 | 107 | return code_dict 108 | 109 | 110 | def load_obs_label_dict(path): 111 | if not path: 112 | return {} 113 | with open(path, 'r') as f: 114 | obs_labels = [line.rstrip() for line in f] 115 | return {c: i for i, c in enumerate(obs_labels)} 116 | 117 | 118 | # A simple timer class inspired from `tnt.TimeMeter` 119 | class CudaTimer: 120 | def __init__(self, keys): 121 | self.keys = keys 122 | self.reset() 123 | 124 | def start(self, key): 125 | s = 
torch.cuda.Event(enable_timing=True) 126 | s.record() 127 | self.start_events[key].append(s) 128 | return self 129 | 130 | def stop(self, key): 131 | e = torch.cuda.Event(enable_timing=True) 132 | e.record() 133 | self.end_events[key].append(e) 134 | return self 135 | 136 | def reset(self): 137 | self.start_events = collections.defaultdict(list) 138 | self.end_events = collections.defaultdict(list) 139 | self.running_times = collections.defaultdict(float) 140 | self.n = collections.defaultdict(int) 141 | return self 142 | 143 | def value(self): 144 | self._synchronize() 145 | return {k: self.running_times[k] / self.n[k] for k in self.keys} 146 | 147 | def _synchronize(self): 148 | torch.cuda.synchronize() 149 | for k in self.keys: 150 | starts = self.start_events[k] 151 | ends = self.end_events[k] 152 | if len(starts) == 0: 153 | raise ValueError("Trying to divide by zero in TimeMeter") 154 | if len(ends) != len(starts): 155 | raise ValueError("Call stop before checking value!") 156 | time = 0 157 | for start, end in zip(starts, ends): 158 | time += start.elapsed_time(end) 159 | self.running_times[k] += time * 1e-3 160 | self.n[k] += len(starts) 161 | self.start_events = collections.defaultdict(list) 162 | self.end_events = collections.defaultdict(list) 163 | 164 | 165 | # Used to measure the time taken for multiple events 166 | class Timer: 167 | def __init__(self, keys): 168 | self.keys = keys 169 | self.n = {} 170 | self.running_time = {} 171 | self.total_time = {} 172 | self.reset() 173 | 174 | def start(self, key): 175 | self.running_time[key] = time.time() 176 | return self 177 | 178 | def stop(self, key): 179 | self.total_time[key] = time.time() - self.running_time[key] 180 | self.n[key] += 1 181 | self.running_time[key] = None 182 | return self 183 | 184 | def reset(self): 185 | for k in self.keys: 186 | self.total_time[k] = 0 187 | self.running_time[k] = None 188 | self.n[k] = 0 189 | return self 190 | 191 | def value(self): 192 | vals = {} 193 | for k in self.keys: 194 | if self.n[k] == 0: 195 | raise ValueError("Trying to divide by zero in TimeMeter") 196 | else: 197 | vals[k] = self.total_time[k] / self.n[k] 198 | return vals 199 | 200 | -------------------------------------------------------------------------------- /scripts/text_to_speech.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import phonemizer 5 | from scipy.io.wavfile import write 6 | import torch 7 | 8 | from unitspeech.unitspeech import UnitSpeech 9 | from unitspeech.duration_predictor import DurationPredictor 10 | from unitspeech.encoder import Encoder 11 | from unitspeech.text import cleaned_text_to_sequence, phonemize, symbols 12 | from unitspeech.util import HParams, intersperse, fix_len_compatibility, sequence_mask, generate_path 13 | from unitspeech.vocoder.env import AttrDict 14 | from unitspeech.vocoder.models import BigVGAN 15 | 16 | 17 | @torch.no_grad() 18 | def text_to_speech( 19 | args, text_encoder, duration_predictor, decoder, phoneme, phoneme_lengths, spk_emb, num_downsamplings_in_unet 20 | ): 21 | cond_x, x, x_mask = text_encoder(phoneme, phoneme_lengths) 22 | logw = duration_predictor(x, x_mask, w=None, g=spk_emb, reverse=True) 23 | w = torch.exp(logw) * x_mask 24 | w_ceil = torch.ceil(w) * args.length_scale 25 | 26 | y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() 27 | y_max_length = int(y_lengths.max()) 28 | y_max_length_ = fix_len_compatibility(y_max_length, num_downsamplings_in_unet) 
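# Note (assumption based on usage here): fix_len_compatibility is expected to round y_max_length up
# to a multiple of 2 ** num_downsamplings_in_unet so the diffusion U-Net can downsample and upsample
# without shape mismatches; the generated mel is trimmed back to y_max_length at the end of this function.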
29 | 30 | # Using obtained durations `w` construct alignment map `attn` 31 | y_mask = sequence_mask(y_lengths, y_max_length_).unsqueeze(1).to(x_mask.dtype) 32 | attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2) 33 | attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1) 34 | 35 | # Align encoded text and get mu_y 36 | cond_y = torch.matmul(attn.squeeze(1).transpose(1, 2).contiguous(), cond_x.transpose(1, 2).contiguous()) 37 | cond_y = cond_y.transpose(1, 2).contiguous() 38 | 39 | z = torch.randn_like(cond_y, device=cond_y.device) 40 | 41 | # Generate sample by performing reverse dynamics 42 | decoder_outputs = decoder( 43 | z, y_mask, cond_y, spk_emb, args.diffusion_step, 44 | text_gradient_scale=args.text_gradient_scale, spk_gradient_scale=args.spk_gradient_scale 45 | ) 46 | decoder_outputs = decoder_outputs[:, :, :y_max_length] 47 | return decoder_outputs 48 | 49 | 50 | def main(args, hps): 51 | global_phonemizer = phonemizer.backend.EspeakBackend( 52 | language='en-us', preserve_punctuation=True, with_stress=True, language_switch="remove-flags" 53 | ) 54 | 55 | # Initialize & load model 56 | text_encoder = Encoder( 57 | n_vocab=len(symbols) + 1, 58 | n_feats=hps.data.n_feats, 59 | **hps.encoder 60 | ) 61 | 62 | text_encoder_dict = torch.load(args.encoder_path, map_location=lambda loc, storage: loc) 63 | text_encoder.load_state_dict(text_encoder_dict['model']) 64 | _ = text_encoder.cuda().eval() 65 | 66 | duration_predictor = DurationPredictor( 67 | **hps.duration_predictor 68 | ) 69 | 70 | duration_predictor_dict = torch.load(args.duration_predictor_path, map_location=lambda loc, storage: loc) 71 | duration_predictor.load_state_dict(duration_predictor_dict['model']) 72 | _ = duration_predictor.cuda().eval() 73 | 74 | unitspeech = UnitSpeech( 75 | n_feats=hps.data.n_feats, 76 | **hps.decoder 77 | ) 78 | 79 | decoder_dict = torch.load(args.decoder_path, map_location=lambda loc, storage: loc) 80 | unitspeech.load_state_dict(decoder_dict['model']) 81 | _ = unitspeech.cuda().train() 82 | 83 | # Initialize & load vocoder. 84 | with open(hps.train.vocoder_config_path) as f: 85 | h = AttrDict(json.load(f)) 86 | vocoder = BigVGAN(h) 87 | vocoder.load_state_dict(torch.load(hps.train.vocoder_ckpt_path, map_location=lambda loc, storage: loc)['generator']) 88 | _ = vocoder.cuda().eval() 89 | vocoder.remove_weight_norm() 90 | 91 | # Prepare input 92 | phoneme = phonemize(args.text, global_phonemizer) 93 | phoneme = cleaned_text_to_sequence(phoneme) 94 | phoneme = intersperse(phoneme, len(symbols)) # add a blank token, whose id number is len(symbols) 95 | phoneme = torch.LongTensor(phoneme).cuda().unsqueeze(0) 96 | phoneme_lengths = torch.LongTensor([phoneme.shape[-1]]).cuda() 97 | 98 | spk_emb = decoder_dict['spk_emb'].cuda() 99 | 100 | # Load the normalization parameters for mel-spectrogram normalization. 
101 | mel_min = decoder_dict['mel_min'].cuda() 102 | mel_max = decoder_dict['mel_max'].cuda() 103 | 104 | with torch.no_grad(): 105 | mel_generated = text_to_speech( 106 | args, text_encoder, duration_predictor, unitspeech, 107 | phoneme, phoneme_lengths, spk_emb, len(hps.decoder.dim_mults) - 1 108 | ) 109 | 110 | mel_generated = ((mel_generated + 1) / 2 * (mel_max.to(mel_generated.device) - mel_min.to(mel_generated.device)) 111 | + mel_min.to(mel_generated.device)) 112 | audio_generated = vocoder.forward(mel_generated).cpu().squeeze().clamp(-1, 1).numpy() 113 | 114 | if "/" in args.generated_sample_path: 115 | os.makedirs(os.path.dirname(args.generated_sample_path), exist_ok=True) 116 | write(args.generated_sample_path, hps.data.sampling_rate, audio_generated) 117 | 118 | 119 | if __name__ == "__main__": 120 | parser = argparse.ArgumentParser() 121 | parser.add_argument('--encoder_path', type=str, default="unitspeech/checkpoints/text_encoder.pt", 122 | help='Path of the text encoder checkpoint.') 123 | parser.add_argument('--decoder_path', type=str, default="unitspeech/outputs/finetuned_decoder.pt", 124 | help='Path of the finetuned decoder checkpoint.') 125 | parser.add_argument('--duration_predictor_path', type=str, default="unitspeech/checkpoints/duration_predictor.pt", 126 | help='Path of the duration predictor checkpoint.') 127 | parser.add_argument('--config_path', type=str, default="unitspeech/checkpoints/text-to-speech.json", 128 | help='Path to the configuration file for text-to-speech.') 129 | parser.add_argument('--generated_sample_path', type=str, default="unitspeech/outputs/output_tts.wav", 130 | help='The path to save the generated audio.') 131 | 132 | parser.add_argument('--text', type=str, required=True, 133 | help='The desired transcript to be generated.') 134 | parser.add_argument('--text_gradient_scale', type=float, default=1.0, 135 | help='Gradient scale of classifier-free guidance (cfg) for text condition. (0.0: wo cfg)') 136 | parser.add_argument('--spk_gradient_scale', type=float, default=1.0, 137 | help='Gradient scale of classifier-free guidance (cfg) for speaker condition. (0.0: wo cfg)') 138 | parser.add_argument('--length_scale', type=float, default=1.0, 139 | help='The parameter for adjusting speech speed. The smaller it is compared to 1, the faster the speech becomes.') 140 | parser.add_argument('--diffusion_step', type=int, default=50, 141 | help='The number of iterations for sampling in the diffusion model.') 142 | args = parser.parse_args() 143 | 144 | with open(args.config_path, "r") as f: 145 | data = f.read() 146 | config = json.loads(data) 147 | 148 | hps = HParams(**config) 149 | 150 | main(args, hps) -------------------------------------------------------------------------------- /unitspeech/textlesslib/README.md: -------------------------------------------------------------------------------- 1 | # textlesslib 2 | 3 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Python 3.8](https://img.shields.io/badge/python-3.8-blue.svg)](https://www.python.org/downloads/release/python-380/) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 4 | 5 | Textless NLP is an active area of research that aims to extend NLP techniques (and tools!) to work directly on spoken language. 
By using self-supervisedly 6 | learnt discrete speech representations, the area promises to unlock interesting NLP applications on languages without written form or on facets of spoken 7 | language that are unaccessable for text-based approaches, e.g. prosody. To learn more, please check some of the [papers](https://speechbot.github.io/). 8 | 9 | **textlesslib** is a library aimed to facilitate research in Textless NLP. The goal of the library is to speed up the research cycle and 10 | lower the learning curve for those who want to start. We provide highly configurable, off-the-shelf available tools to encode speech 11 | as sequences of discrete values and tools to decode such streams back into the audio domain. A high-level description of the library can also be 12 | found in our paper [[arxiv]](https://arxiv.org/abs/2202.07359). 13 | 14 | 15 | Table of Contents 16 | ================= 17 | 18 | * [Installation](#installation) 19 | * [Usage examples](#usage-examples) 20 | * [Encoding speech](#encoding-speech) 21 | * [Dataset helpers](#dataset-helpers) 22 | * [Data preprocessing](#data-preprocessing) 23 | * [Provided models](#provided-models) 24 | * [Testing](#testing) 25 | * [Citing textless-lib](#citing-textless-lib) 26 | 27 | 28 | ## Installation 29 | ```bash 30 | git clone git@github.com:facebookresearch/textlesslib.git 31 | cd textlesslib 32 | pip install -e . 33 | pip install git+git://github.com:pytorch/fairseq.git@dd106d9534b22e7db859a6b87ffd7780c38341f8 34 | ``` 35 | 36 | ## Usage examples 37 | We include a set of examples in the [examples](./examples) folder: 38 | * [Discrete speech resynthesis (& compression)](./examples/resynthesis/) 39 | * [Probing for speaker information in the representations](./examples/speaker_probing/) 40 | * [Generative Spoken Language Modeling (aka Speech Continuation)](./examples/gslm/) 41 | 42 | There is also a [[Jupyter notebook]](./examples/resynthesis_and_continuation.ipynb) and a [[Google Colab]](https://colab.research.google.com/github/facebookresearch/textlesslib/blob/main/examples/resynthesis_and_continuation.ipynb) that combine discrete resynthesis and speech continuation examples in a step-by-step mini-tutorial. 43 | 44 | We believe those examples can serve both as illustrations for the provided components and provide 45 | a starting point for tinkering in interesting directions. 46 | 47 | ### Encoding speech 48 | Below is an example on loading an audio example and encoding it as a sequence of HuBERT-based discrete tokens (aka pseudo-units). 49 | Downloading of the required checkpoints is handled by textlesslib itself (by default they are stored in `~/.textless`): 50 | 51 | ```python 52 | import torchaudio 53 | from unitspeech.textlesslib.textless.data.speech_encoder import SpeechEncoder 54 | 55 | dense_model_name = "hubert-base-ls960" 56 | quantizer_name, vocab_size = "kmeans", 100 57 | input_file = "input.wav" 58 | 59 | # now let's load an audio example 60 | waveform, sample_rate = torchaudio.load(input_file) 61 | 62 | # We can build a speech encoder module using names of pre-trained 63 | # dense and quantizer models. The call below will download 64 | # appropriate checkpoints as needed behind the scenes. 
We can 65 | # also construct an encoder by directly passing model instances 66 | encoder = SpeechEncoder.by_name( 67 | dense_model_name=dense_model_name, 68 | quantizer_model_name=quantizer_name, 69 | vocab_size=vocab_size, 70 | deduplicate=True, 71 | ).cuda() 72 | 73 | 74 | # now convert it in a stream of deduplicated units (as in GSLM) 75 | encoded = encoder(waveform.cuda()) 76 | # encoded is a dict with keys ('dense', 'units', 'durations'). 77 | # It can also contain 'f0' if SpeechEncoder was initialized 78 | # with need_f0=True flag. 79 | units = encoded["units"] # tensor([71, 12, 57, ...], ...) 80 | ``` 81 | Now it can be casted back into the audio domain: 82 | 83 | ```python 84 | # as with encoder, we can setup vocoder by passing checkpoints 85 | # directly or by specifying the expected format by the names 86 | # of dense and quantizer models (these models themselves 87 | # won't be loaded) 88 | vocoder = TacotronVocoder.by_name( 89 | dense_model_name, 90 | quantizer_name, 91 | vocab_size, 92 | ).cuda() 93 | 94 | # now we turn those units back into the audio. 95 | audio = vocoder(units) 96 | 97 | # save the audio 98 | torchaudio.save(output_file, audio.cpu().float().unsqueeze(0), vocoder.output_sample_rate) 99 | ``` 100 | ### Dataset helpers 101 | Below is an example on using `textless` view on the LibriSpeech dataset: 102 | ```python 103 | encoder = SpeechEncoder.by_name( 104 | dense_model_name=dense_model_name, 105 | quantizer_model_name=quantizer_name, 106 | vocab_size=vocab_size, 107 | deduplicate=True, 108 | ).cuda() 109 | 110 | quantized_dataset = QuantizedLibriSpeech( 111 | root=existing_root, speech_encoder=encoder, url=url) 112 | 113 | datum = quantized_dataset[0] 114 | sample_rate, utterance, speaker_id, chapter_id, utterance_id = datum['rest'] 115 | # datum['units'] = tensor([71, 12, 63, ...]) 116 | ``` 117 | In the [probing example](./examples/speaker_probing/) we illustrate how such a dataset 118 | can be used with a standard Pytorch dataloader in a scalable manner. 119 | 120 | ### Data preprocessing 121 | We also provide a [multi-GPU/multi-node preprocessing tool](tools/distributed_transcribe/) 122 | for the cases where on-the-fly processing of audio should be avoided. 123 | 124 | ## Provided models 125 | We provide implementations and pre-trained checkpoints for the following models: 126 | 127 | * Dense representations: HuBERT-base (trained on LibriSpeech 960h) and CPC (trained on 6Kh subset of LibriLight); 128 | * Quantizers: k-means quantizers with vocabulary sizes of 50, 100, 200 for both the dense models (trained on LibriSpeech 960h); 129 | * Decoders: Tacotron2 models for all (dense model x quantizer) combinations (trained on LJSpeech). 130 | 131 | Finally, the pitch extraction is done via YAAPT. 132 | 133 | ## Testing 134 | We use pytest (`pip install pytest pytest-xdist `). 
Our unit tests are located in the `tests` directory: 135 | ```bash 136 | cd tests && pytest -n 8 137 | ``` 138 | 139 | ## Citing textless-lib 140 | If you find textless-lib useful in your research, please consider citing our work: 141 | ``` 142 | @article{Kharitonov2022, 143 | title={textless-lib: a Library for Textless Spoken Language Processing}, 144 | author={Eugene Kharitonov and Jade Copet and Kushal Lakhotia and Tu Anh Nguyen and Paden Tomasello and Ann Lee and Ali Elkahky and Wei-Ning Hsu and Abdelrahman Mohamed and Emmanuel Dupoux and Yossi Adi}, 145 | year={2022}, 146 | eprint={2202.07359}, 147 | archivePrefix={arXiv}, 148 | primaryClass={cs.CL} 149 | } 150 | ``` 151 | 152 | ## Licence 153 | textlesslib is licensed under MIT, the text of the license can be found [here](LICENSE). 154 | Internally, it uses 155 | * [WaveGlow](https://github.com/NVIDIA/waveglow) - licensed under BSD-3-Clause license; 156 | * [tacotron implementation](https://github.com/keithito/tacotron) - licensed under MIT license; 157 | * [tacotron2 implementation](https://github.com/NVIDIA/tacotron2) - licensed under BSD-3-Clause license; 158 | * [STFT implementation](https://github.com/pseeth/torch-stft) - licensed under BSD-3-Clause license. 159 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## UnitSpeech: Speaker-adaptive Speech Synthesis with Untranscribed Data (INTERSPEECH 2023, Oral) 2 | #### Heeseung Kim, Sungwon Kim, Jiheum Yeom, Sungroh Yoon 3 | ![model-1](https://github.com/gmltmd789/UnitSpeech/assets/49265950/44cb4991-abb0-44b2-81fd-fce92cc1f3f1) 4 |

5 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1jAglTVrBNeEQbOAJ3T_YRotoKqCBPNn9?usp=sharing) 6 | ### [Paper](https://arxiv.org/abs/2306.16083) 7 | ### [Audio demo](https://unitspeech.github.io/) 8 | 9 | ## Updates 10 | ### 2023.07.04 : We changed the normalization method for better speaker similarity. 11 | - **We normalized the mel-spectrogram of the reference audio during fine-tuning using the min and max values of the reference audio's mel-spectrogram, rather than the min and max values obtained from the entire LibriTTS train set.** 12 | - **We observed that this modification helped improve speaker similarity.** 13 | 14 | ### 2023.06.29 : We update our code and checkpoints for better pronunciation. 15 | - **Extract reference speaker embeddings using the [WavLM](https://github.com/microsoft/UniSpeech/tree/main/downstreams/speaker_verification#pre-trained-models)-based speaker encoder.** 16 | - **Modeling normalized mel-spectrogram (-1 ~ 1).** 17 | 18 | ### 2023.06.28 : Updated components compared to the version of INTERSPEECH. 19 | - **Change in vocoder (from HiFi-GAN to BigVGAN).** 20 | - **Support for speaker classifier-free guidance (advantageous for adapting to more unique voices.)** 21 | - **Change "training-free text classifier-free guidance" to "text classifier-free guidance" (learning text unconditional embedding).** 22 | - **Ensure compatibility with various recent works on unit-based speech synthesis (number of clusters of unit (K): 200 → 1000)** 23 | - **Substantial improvement in pronunciation accuracy** 24 | - **To improve TTS (Text-to-Speech) pronunciation, an IPA-based phonemizer is used.** 25 | - **To improve VC (Voice Conversion) pronunciation, a contentvec encoder is introduced.** 26 | 27 | 28 | # Warning: Ethical & Legal Considerations 29 | 1. **UnitSpeech was created with the primary objective of facilitating research endeavors.** 30 | 2. **When utilizing samples generated using this model, it is crucial to clearly disclose that the samples were generated using AI technology. Additionally, it is necessary to provide the sources of the audio used in the generation process.** 31 | 3. **We notify that users take full responsibility for any possible negative outcomes and legal & ethical issues that may arise due to their misuse of the model.** 32 | 4. **As a precautionary measure against possible misapplication, we intend to introduce a classification model capable of discerning samples generated through the utilization of this model.** 33 | 34 | ## TO DO 35 | - [ ] Release a classification model to distinguish samples from UnitSpeech 36 | 37 | ## Installation 38 | **Tested on Ubuntu 20.04.5 LTS, Python 3.8, Anaconda (2023.03-1) environment** 39 | First, install the necessary package for the IPA phonemizer. 40 | ```shell 41 | sudo apt-get install espeak=1.48.04+dfsg-8build1 espeak-ng=1.50+dfsg-6 42 | ``` 43 | If you are unable to install the specific versions of espeak and espeak-ng on Ubuntu 18.04 or earlier, please install the available versions of each package.
44 | Note: If you have a different version of espeak-ng, the output of phonemizing text may vary, which can affect pronunciation accuracy. 45 | 46 | After that, create a conda environment and install the unitspeech package and the package required for extracting speaker embeddings. 47 | ```shell 48 | conda create -n unitspeech python=3.8 49 | conda activate unitspeech 50 | git clone https://github.com/gmltmd789/UnitSpeech.git 51 | cd UnitSpeech 52 | pip install -e . 53 | pip install --no-deps s3prl==0.4.10 54 | ``` 55 | 56 | ## Pretrained Models 57 | **We provide the [pretrained models](https://drive.google.com/drive/folders/1yFkb2TAYB_zMmoTuUOXu-zXb3UI9pVJ9?usp=sharing).** 58 | |File Name|Usage| 59 | |------|---| 60 | |contentvec_encoder.pt|Used for any-to-any voice conversion tasks.| 61 | |unit_encoder.pt|Used for fine-tuning and unit-based speech synthesis tasks.
(e.g., Adaptive Speech Synthesis for Speech-to-Unit Translation)| 62 | |text_encoder.pt|Used for adaptive text-to-speech tasks.| 63 | |duration_predictor.pt|Used for adaptive text-to-speech tasks.| 64 | |pretrained_decoder.pt|Used for all adaptive speech synthesis tasks.| 65 | |speaker_encoder.pt|Used for extracting [speaker embeddings](https://github.com/microsoft/UniSpeech/tree/main/downstreams/speaker_verification#pre-trained-models).| 66 | |bigvgan.pt|[Vocoder](https://github.com/NVIDIA/BigVGAN) checkpoint.| 67 | |bigvgan-config.json|Configuration for the vocoder.| 68 | 69 | **After downloading the files, please arrange them in the following structure.** 70 | ```buildoutcfg 71 | UnitSpeech/... 72 | unitspeech/... 73 | checkpoints/... 74 | contentvec_encoder.pt 75 | duration_predictor.pt 76 | pretrained_decoder.pt 77 | text_encoder.pt 78 | unit_encoder.pt 79 | ... 80 | speaker_encoder/... 81 | checkpts/... 82 | speaker_encoder.pt 83 | ... 84 | vocoder/... 85 | checkpts/... 86 | bigvgan.pt 87 | bigvgan-config.json 88 | ... 89 | ... 90 | ... 91 | ``` 92 | 93 | ## Fine-tuning 94 | The decoder is fine-tuned using the target speaker's voice, employing the unit encoder. **It is recommended to use a reference English speech with a duration of at least 5~10 seconds.** 95 | 96 | ```shell 97 | python scripts/finetune.py \ 98 | --reference_path REFERENCE_SPEECH_PATH \ 99 | --output_decoder_path FILEPATH1/FINETUNED_DECODER.pt 100 | ``` 101 | 102 | By executing the code, your personalized decoder will be saved as "FILEPATH1/FINETUNED_DECODER.pt".
103 | With the fine-tuned decoder, you can perform adaptive text-to-speech and any-to-any voice conversion, as described below.

104 | By default, fine-tuning is conducted in fp32 using the Adam optimizer with a learning rate of 2e-5 for 500 iterations.
105 | You can adjust these settings through the provided arguments (--fp16_run, --learning_rate, --n_iters), as in the example below.
106 | **For speakers with unique voices, increasing the number of fine-tuning iterations can help achieve better results.**
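For example, a run that doubles the default number of iterations for a more unique voice might look like the sketch below. The values are illustrative only; check `scripts/finetune.py` for the exact definition of each argument (including whether `--fp16_run` is a boolean switch or takes a value).

```shell
# Illustrative fine-tuning run: only --reference_path and --output_decoder_path are required,
# the remaining flags override the defaults described above.
python scripts/finetune.py \
    --reference_path REFERENCE_SPEECH_PATH \
    --output_decoder_path FILEPATH1/FINETUNED_DECODER.pt \
    --learning_rate 2e-5 \
    --n_iters 1000
```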
107 | 108 | ## Inference 109 | ```shell 110 | # script for adaptive text-to-speech 111 | python scripts/text_to_speech.py \ 112 | --text "TEXT_TO_GENERATE" \ 113 | --decoder_path FILEPATH1/FINETUNED_DECODER.pt \ 114 | --generated_sample_path FILEPATH2/PATH_TO_SAVE_SYNTHESIZED_SPEECH.wav 115 | 116 | 117 | # script for any-to-any voice conversion 118 | python scripts/voice_conversion.py \ 119 | --source_path SOURCE_SPEECH_PATH_TO_CONVERT.wav \ 120 | --decoder_path FILEPATH1/FINETUNED_DECODER.pt \ 121 | --generated_sample_path FILEPATH2/PATH_TO_SAVE_SYNTHESIZED_SPEECH.wav 122 | ``` 123 | You can adjust the number of diffusion steps, text gradient scale, and speaker gradient scale as arguments.
124 | - text_gradient_scale : responsible for pronunciation accuracy and audio quality. Increasing its value makes the pronunciation of the samples more accurate.
125 | - spk_gradient_scale : responsible for speaker similarity. Increasing its value generates voices that are closer to the reference speech.
126 | 127 | By default, both the text gradient scale and the speaker gradient scale are set to 1.0.
128 | **If you want better pronunciation and audio quality, please increase the value of "text_gradient_scale." This will slightly reduce speaker similarity.**
129 | **If you want better speaker similarity, please increase the value of "spk_gradient_scale." This will slightly degrade pronunciation accuracy and audio quality.**
130 | 131 | You can also adjust the speaking speed through the following argument (default: 1.0).
132 | - length_scale : Increasing its value (> 1.0) slows the speech down, while decreasing it (< 1.0) speeds the speech up (see the example below).
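For instance, a text-to-speech call that nudges the balance toward clearer pronunciation and slightly slower speech could look like the sketch below. The scale values are illustrative, not recommendations; all of the flags are defined in `scripts/text_to_speech.py`.

```shell
# Illustrative settings: a larger text_gradient_scale favors pronunciation and audio quality,
# spk_gradient_scale controls speaker similarity, and length_scale > 1.0 slows the speech down.
python scripts/text_to_speech.py \
    --text "TEXT_TO_GENERATE" \
    --decoder_path FILEPATH1/FINETUNED_DECODER.pt \
    --generated_sample_path FILEPATH2/PATH_TO_SAVE_SYNTHESIZED_SPEECH.wav \
    --text_gradient_scale 2.0 \
    --spk_gradient_scale 1.0 \
    --length_scale 1.1 \
    --diffusion_step 50
```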
133 | 134 | **Note: Using excessively large gradient scales can degrade the audio quality.** 135 | 136 | ## License 137 | 138 | The code and model weights of UnitSpeech are released under the CC BY-NC-SA 4.0 license. 139 | 140 | ## References 141 | * [BigVGAN](https://github.com/NVIDIA/BigVGAN) (for vocoder) 142 | * [textlesslib](https://github.com/facebookresearch/textlesslib) (for unit extraction) 143 | * [ContentVec](https://github.com/auspicious3000/contentvec) (for contentvec extraction) 144 | * [VITS](https://github.com/jaywalnut310/vits) (for text & IPA phoneme sequence processing) 145 | * [Grad-TTS](https://github.com/huawei-noah/Speech-Backbones/tree/main/Grad-TTS) (for overall architecture and code) 146 | * [denoising-diffusion-pytorch](https://github.com/rosinality/denoising-diffusion-pytorch) (for diffusion-based sampler) 147 | * [WavLM](https://github.com/microsoft/UniSpeech/tree/main/downstreams/speaker_verification) (for speaker embedding extraction) 148 | 149 | ## Citation 150 | ``` 151 | @misc{kim2023unitspeech, 152 | title={UnitSpeech: Speaker-adaptive Speech Synthesis with Untranscribed Data}, 153 | author={Heeseung Kim and Sungwon Kim and Jiheum Yeom and Sungroh Yoon}, 154 | year={2023}, 155 | eprint={2306.16083}, 156 | archivePrefix={arXiv}, 157 | primaryClass={cs.SD} 158 | } 159 | ``` 160 | -------------------------------------------------------------------------------- /unitspeech/textlesslib/examples/speaker_probing/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | 7 | import torch 8 | import argparse 9 | from unitspeech.textlesslib.textless.data.quantized_datasets import QuantizedLibriSpeech 10 | from torch.utils.data import DataLoader 11 | import torch.nn.functional as F 12 | from probes import ContinuousClassifier, DiscreteClassifier, ConstantBaseline 13 | from unitspeech.textlesslib.textless.data.speech_encoder import SpeechEncoder 14 | from unitspeech.textlesslib.textless import dispatch_dense_model, dispatch_quantizer 15 | 16 | 17 | def set_seed_(seed): 18 | torch.manual_seed(seed) 19 | torch.cuda.manual_seed(seed) 20 | 21 | 22 | def move_to(x, device: torch.device): 23 | if hasattr(x, "to"): 24 | return x.to(device) 25 | if isinstance(x, list) or isinstance(x, tuple): 26 | return [move_to(i, device) for i in x] 27 | if isinstance(x, dict): 28 | return {k: move_to(v, device) for k, v in x.items()} 29 | return x 30 | 31 | 32 | def get_args(): 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument( 35 | "--dense_model_name", 36 | type=str, 37 | help="Dense model to be used", 38 | default="hubert-base-ls960", 39 | choices=["hubert-base-ls960", "cpc-big-ll6k"], 40 | ) 41 | parser.add_argument("--vocab_size", type=int, help="Unit vocab size", default=50) 42 | parser.add_argument("--epochs", type=int, default=150) 43 | parser.add_argument( 44 | "--batch_size", type=int, help="Batch size for K-means training", default=32 45 | ) 46 | parser.add_argument("--seed", type=int, default=13) 47 | parser.add_argument( 48 | "--model_type", 49 | choices=["baseline", "discrete", "continuous"], 50 | default="baseline", 51 | ) 52 | 53 | args = parser.parse_args() 54 | 55 | return args 56 | 57 | 58 | def train(model, train_dataloader, valid_dataloader, args): 59 | model.train() 60 | optimizer = torch.optim.Adam(model.parameters()) 61 | 
62 | for epoch in range(args.epochs): 63 | train_epoch(model, train_dataloader, optimizer, epoch) 64 | evaluate_model(model, valid_dataloader) 65 | 66 | 67 | def train_epoch(model, dataloader, optimizer, e): 68 | model.train() 69 | n_examples = 0.0 70 | accumulated = torch.zeros(1, dtype=torch.float64).cuda() 71 | 72 | for batch in dataloader: 73 | batch = move_to(batch, torch.cuda.current_device()) 74 | speakers = torch.tensor(batch["rest"][2]).cuda() 75 | 76 | speaker_logprobs = model(batch) 77 | loss = F.nll_loss(speaker_logprobs, speakers) 78 | 79 | optimizer.zero_grad() 80 | loss.backward() 81 | optimizer.step() 82 | 83 | accumulated += loss.detach().sum() 84 | n_examples += speakers.size(0) 85 | 86 | train_loss = (accumulated / n_examples).item() 87 | print(f"Epoch {e} | sliding mean train loss {train_loss}") 88 | 89 | 90 | @torch.no_grad() 91 | def evaluate_model(model, dataloader): 92 | model.eval() 93 | n_examples = 0 94 | accumulated_loss = torch.zeros(1, dtype=torch.float64).cuda() 95 | accuracy = torch.zeros(1, dtype=torch.float64).cuda() 96 | 97 | for batch in dataloader: 98 | batch = move_to(batch, torch.cuda.current_device()) 99 | speakers = torch.tensor(batch["rest"][2]).cuda() 100 | 101 | speaker_logprobs = model(batch) 102 | loss = F.nll_loss(speaker_logprobs, speakers) 103 | accumulated_loss += loss 104 | 105 | accuracy += (speaker_logprobs.argmax(dim=-1) == speakers).sum() 106 | n_examples += speakers.size(0) 107 | 108 | accumulated_loss /= n_examples 109 | accuracy /= n_examples 110 | 111 | print(f"Valid loss: {accumulated_loss.item()}, accuracy: {accuracy.item()}") 112 | 113 | 114 | class SpeakerDatasetWrapper: 115 | def __init__(self, quantized_data, speaker_mapping=None): 116 | self.quantized_data = quantized_data 117 | self.speaker_mapping = ( 118 | speaker_mapping 119 | if speaker_mapping is not None 120 | else self.get_speaker_ids(quantized_data.dataset._walker) 121 | ) 122 | self.collater = self.quantized_data.collater 123 | self.max_length = ( 124 | 10 * 16_000 // self.quantized_data.speech_encoder.code_hop_size 125 | ) 126 | 127 | @staticmethod 128 | def get_speaker_ids(walker): 129 | speaker_mapping = {} 130 | for fileid in walker: 131 | speaker_id, *_ = fileid.split("-") 132 | speaker_id = int(speaker_id) 133 | if speaker_id not in speaker_mapping: 134 | speaker_mapping[speaker_id] = len(speaker_mapping) 135 | return speaker_mapping 136 | 137 | def __getitem__(self, k): 138 | item = self.quantized_data[k] 139 | speaker = item["rest"][2] 140 | item["rest"][2] = self.speaker_mapping[speaker] 141 | 142 | if self.max_length < item["dense"].size(0): 143 | item["dense"] = item["dense"][: self.max_length, :] 144 | item["units"] = item["units"][: self.max_length] 145 | item["durations"] = item["durations"][: self.max_length] 146 | 147 | return item 148 | 149 | def __len__(self): 150 | return len(self.quantized_data) 151 | 152 | 153 | def main(): 154 | args = get_args() 155 | set_seed_(args.seed) 156 | 157 | dense_model_name = args.dense_model_name 158 | quantizer_model_name = "kmeans" 159 | vocab_size = args.vocab_size 160 | 161 | # NB: Hubert is not serializable as-is, so to have a multi-worker dataloader 162 | # we have a worker-around: load the actual checkpoint on the first call - which 163 | # will happen in a worker process already. This behavior is enabled with 164 | # the `lazy_load` flag. 
165 | dense_model = dispatch_dense_model(dense_model_name, lazy_load=True) 166 | quantizer_model = dispatch_quantizer( 167 | dense_model_name, quantizer_model_name, vocab_size 168 | ) 169 | 170 | speech_encoder = SpeechEncoder( 171 | dense_model, 172 | quantizer_model, 173 | deduplicate=False, 174 | need_f0=False, 175 | add_bos_eos=True, 176 | ) 177 | 178 | dataset = QuantizedLibriSpeech( 179 | speech_encoder, 180 | root="datasets", 181 | url="dev-clean", 182 | download=True, 183 | device="auto" 184 | # when we set `device` to auto, the dataset instance will check if it is 185 | # running within a worker process of a dataloader. If it is the case, 186 | # it will move SpeechEncoder to one of the available GPUs, depending on the 187 | # worker id. This way we can pack quite a few (GPU-hungry) Hubert instances running across 188 | # all GPUs in parallel, within the same standard DataLoader. 189 | ) 190 | 191 | speaker_mapping = SpeakerDatasetWrapper.get_speaker_ids(dataset.dataset._walker) 192 | max_speaker_id = max(speaker_mapping.values()) 193 | dataset = SpeakerDatasetWrapper(dataset, speaker_mapping) 194 | 195 | valid_size = int(0.1 * len(dataset)) 196 | train_size = len(dataset) - valid_size 197 | train_data, valid_data = torch.utils.data.random_split( 198 | dataset, [train_size, valid_size], generator=torch.Generator().manual_seed(42) 199 | ) 200 | 201 | train_loader = DataLoader( 202 | train_data, 203 | batch_size=args.batch_size, 204 | shuffle=True, 205 | collate_fn=dataset.collater, 206 | num_workers=4, 207 | ) 208 | valid_loader = DataLoader( 209 | valid_data, 210 | batch_size=args.batch_size, 211 | shuffle=False, 212 | collate_fn=dataset.collater, 213 | num_workers=4, 214 | ) 215 | 216 | if args.model_type == "baseline": 217 | model = ConstantBaseline(total_speakers=max_speaker_id + 1) 218 | elif args.model_type == "discrete": 219 | model = DiscreteClassifier( 220 | vocab_size=args.vocab_size + 3, # accounting for bos, pad, eos 221 | embedding_size=32, 222 | n_heads=4, 223 | hidden_size=128, 224 | n_layers=2, 225 | dropout=0.1, 226 | pad_value=dataset.quantized_data.unit_pad, 227 | total_speakers=max_speaker_id + 1, 228 | ) 229 | elif args.model_type == "continuous": 230 | input_size = { 231 | "hubert-base-ls960": 768, 232 | "cpc-big-ll6k": 512, 233 | }[dense_model_name] 234 | 235 | model = ContinuousClassifier( 236 | embedding_size=32, 237 | input_size=input_size, 238 | n_heads=4, 239 | hidden_size=128, 240 | n_layers=2, 241 | dropout=0.1, 242 | pad_value=dataset.quantized_data.unit_pad, 243 | total_speakers=max_speaker_id + 1, 244 | ) 245 | else: 246 | assert False, "unknown model type" 247 | 248 | model.cuda() 249 | train(model, train_loader, valid_loader, args) 250 | 251 | 252 | if __name__ == "__main__": 253 | from torch.multiprocessing import set_start_method 254 | 255 | set_start_method("spawn", force=True) 256 | main() 257 | --------------------------------------------------------------------------------