├── analytics ├── tests │ ├── __init__.py │ ├── tests │ │ ├── __init__.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ └── test_common.py │ │ ├── trainer │ │ │ ├── __init__.py │ │ │ └── test_trainer.py │ │ ├── vocoders │ │ │ └── test_hifi_gan.py │ │ ├── text │ │ │ ├── test_symbols.py │ │ │ └── test_util.py │ │ ├── utils │ │ │ └── test_utils.py │ │ └── test_data_loader.py │ ├── pytest.ini │ ├── fixtures │ │ ├── stevejobs-1.pt │ │ ├── wavs │ │ │ └── stevejobs-1.wav │ │ ├── sample_spectrogram.pt │ │ ├── sample_spectrogram_tf.pt │ │ ├── ljtest │ │ │ ├── wavs │ │ │ │ ├── LJ001-0001.wav │ │ │ │ ├── LJ001-0002.wav │ │ │ │ ├── LJ001-0003.wav │ │ │ │ ├── LJ001-0004.wav │ │ │ │ ├── LJ001-0005.wav │ │ │ │ ├── LJ001-0006.wav │ │ │ │ ├── LJ001-0007.wav │ │ │ │ ├── LJ001-0008.wav │ │ │ │ ├── LJ001-0009.wav │ │ │ │ ├── LJ001-0010.wav │ │ │ │ ├── LJ001-0011.wav │ │ │ │ ├── LJ001-0012.wav │ │ │ │ ├── LJ001-0013.wav │ │ │ │ ├── LJ001-0014.wav │ │ │ │ ├── LJ001-0015.wav │ │ │ │ └── LJ001-0016.wav │ │ │ ├── list_small.txt │ │ │ ├── taco2_lj2lj.json │ │ │ └── list.txt │ │ └── val.txt │ └── conftest.py └── dependencies │ └── details.png ├── uberduck_ml_dev ├── data │ ├── __init__.py │ ├── normalization.py │ ├── batch.py │ ├── processor.py │ ├── statistics.py │ ├── get.py │ ├── spectrogram.py │ └── ray.py ├── exec │ ├── __init__.py │ ├── train_tacotron2.py │ ├── train_vits.py │ ├── train_radtts_with_ray.py │ ├── normalize_audio.py │ └── split_train_val.py ├── text │ ├── __init__.py │ ├── datestime.py │ ├── grapheme_dictionary.py │ ├── abbreviations.py │ ├── acronyms.py │ ├── cmudict.py │ ├── letters_and_numbers.py │ ├── cleaners.py │ ├── heteronyms │ └── numerical.py ├── utils │ ├── __init__.py │ ├── exec.py │ ├── config.py │ ├── hifiutils.py │ ├── denoiser.py │ └── plot.py ├── models │ ├── __init__.py │ ├── rvc │ │ ├── __init__.py │ │ └── commons.py │ ├── components │ │ ├── __init__.py │ │ ├── decoders │ │ │ └── __init__.py │ │ ├── encoders │ │ │ ├── __init__.py │ │ │ ├── speaker │ │ │ │ ├── __init__.py │ │ │ │ └── base_encoder.py │ │ │ ├── tacotron2.py │ │ │ ├── resnet_speaker_encoder.py │ │ │ └── duration.py │ │ ├── prenet.py │ │ ├── postnet.py │ │ ├── alignment.py │ │ ├── attention.py │ │ └── partialconv1d.py │ ├── utils.py │ └── base.py ├── monitoring │ ├── __init__.py │ ├── generate.py │ ├── wandb.py │ ├── statistics.py │ └── streamlit.py ├── trainer │ ├── __init__.py │ ├── hifigan │ │ ├── __init__.py │ │ ├── train_epoch.py │ │ ├── train.py │ │ └── train_step.py │ ├── radtts │ │ ├── __init__.py │ │ ├── save.py │ │ ├── train_epoch.py │ │ ├── load.py │ │ └── train_step.py │ ├── rvc │ │ ├── __init__.py │ │ ├── train_epoch.py │ │ ├── save.py │ │ └── train.py │ ├── log.py │ └── load.py ├── vendor │ ├── __init__.py │ └── tfcompat │ │ └── __init__.py ├── __init__.py ├── assets │ └── duck.png ├── losses_rvc.py ├── monotonic_align.py ├── e2e.py └── optimizers │ └── radam.py ├── MANIFEST.in ├── tutorials ├── radtts │ ├── train.sh │ ├── download.sh │ └── radtts_data_processing.ipynb └── hifigan │ ├── download.sh │ └── data_processing.py ├── .pre-commit-config.yaml ├── .github └── workflows │ └── main.yml ├── licenses ├── LICENSE3 ├── LICENSE4 └── LICENSE2 ├── README.md ├── .gitignore ├── settings.ini └── setup.py /analytics/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /analytics/tests/tests/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /uberduck_ml_dev/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /uberduck_ml_dev/exec/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /uberduck_ml_dev/text/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /uberduck_ml_dev/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /uberduck_ml_dev/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /uberduck_ml_dev/models/rvc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /uberduck_ml_dev/monitoring/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /analytics/tests/tests/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /analytics/tests/tests/trainer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/hifigan/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/radtts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/rvc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /uberduck_ml_dev/vendor/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /uberduck_ml_dev/vendor/tfcompat/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /uberduck_ml_dev/models/components/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /uberduck_ml_dev/__init__.py: 
-------------------------------------------------------------------------------- 1 | __version__ = "0.0.1" 2 | -------------------------------------------------------------------------------- /uberduck_ml_dev/models/components/decoders/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /uberduck_ml_dev/models/components/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /analytics/tests/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | python_files = test_*.py -------------------------------------------------------------------------------- /uberduck_ml_dev/models/components/encoders/speaker/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /uberduck_ml_dev/assets/duck.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/HEAD/uberduck_ml_dev/assets/duck.png -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include settings.ini 2 | include README.md 3 | recursive-exclude * __pycache__ 4 | include uberduck_ml_dev/text/* 5 | -------------------------------------------------------------------------------- /analytics/dependencies/details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/HEAD/analytics/dependencies/details.png -------------------------------------------------------------------------------- /analytics/tests/fixtures/stevejobs-1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/HEAD/analytics/tests/fixtures/stevejobs-1.pt -------------------------------------------------------------------------------- /analytics/tests/fixtures/wavs/stevejobs-1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/HEAD/analytics/tests/fixtures/wavs/stevejobs-1.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/sample_spectrogram.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/HEAD/analytics/tests/fixtures/sample_spectrogram.pt -------------------------------------------------------------------------------- /analytics/tests/fixtures/sample_spectrogram_tf.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/HEAD/analytics/tests/fixtures/sample_spectrogram_tf.pt -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/HEAD/analytics/tests/fixtures/ljtest/wavs/LJ001-0001.wav 
-------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/HEAD/analytics/tests/fixtures/ljtest/wavs/LJ001-0002.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/HEAD/analytics/tests/fixtures/ljtest/wavs/LJ001-0003.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/HEAD/analytics/tests/fixtures/ljtest/wavs/LJ001-0004.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/HEAD/analytics/tests/fixtures/ljtest/wavs/LJ001-0005.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/HEAD/analytics/tests/fixtures/ljtest/wavs/LJ001-0006.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/HEAD/analytics/tests/fixtures/ljtest/wavs/LJ001-0007.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0008.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/HEAD/analytics/tests/fixtures/ljtest/wavs/LJ001-0008.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0009.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/HEAD/analytics/tests/fixtures/ljtest/wavs/LJ001-0009.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0010.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/HEAD/analytics/tests/fixtures/ljtest/wavs/LJ001-0010.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0011.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/HEAD/analytics/tests/fixtures/ljtest/wavs/LJ001-0011.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0012.wav: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/HEAD/analytics/tests/fixtures/ljtest/wavs/LJ001-0012.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0013.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/HEAD/analytics/tests/fixtures/ljtest/wavs/LJ001-0013.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0014.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/HEAD/analytics/tests/fixtures/ljtest/wavs/LJ001-0014.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0015.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/HEAD/analytics/tests/fixtures/ljtest/wavs/LJ001-0015.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0016.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/HEAD/analytics/tests/fixtures/ljtest/wavs/LJ001-0016.wav -------------------------------------------------------------------------------- /uberduck_ml_dev/utils/exec.py: -------------------------------------------------------------------------------- 1 | __all__ = ["parse_args"] 2 | 3 | import argparse 4 | 5 | 6 | def parse_args(args): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--config", help="Path to JSON config") 9 | args = parser.parse_args(args) 10 | return args 11 | -------------------------------------------------------------------------------- /tutorials/radtts/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd ../.. 
4 | # remember to set training and eval filelists, heteronyms_path and phoneme_dict_path vocoder_config_path and vocoder_checkpoint_path in demo_config.json 5 | python uberduck_ml_dev/exec/train_radtts_with_ray.py --config tutorials/radtts/demo_config.json 6 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.3.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - repo: https://github.com/psf/black 9 | rev: 22.3.0 10 | hooks: 11 | - id: black 12 | -------------------------------------------------------------------------------- /tutorials/radtts/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget https://huggingface.co/datasets/Uberduck/ljspeech/resolve/main/lj_speech.zip 4 | unzip lj_speech.zip 5 | wget https://huggingface.co/datasets/Uberduck/ljspeech/resolve/main/hifigan_libritts100360_generator0p5.pt 6 | wget https://huggingface.co/datasets/Uberduck/ljspeech/resolve/main/hifigan_22khz_config.json 7 | -------------------------------------------------------------------------------- /tutorials/hifigan/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget https://huggingface.co/datasets/Uberduck/ljspeech/resolve/main/lj_speech.zip 4 | unzip lj_speech.zip 5 | # wget https://huggingface.co/datasets/Uberduck/ljspeech/resolve/main/hifigan_libritts100360_generator0p5.pt 6 | # wget https://huggingface.co/datasets/Uberduck/ljspeech/resolve/main/hifigan_22khz_config.json 7 | -------------------------------------------------------------------------------- /analytics/tests/tests/models/test_common.py: -------------------------------------------------------------------------------- 1 | from uberduck_ml_dev.models.common import MelSTFT 2 | import torch 3 | 4 | 5 | class TestCommon: 6 | def test_mel_stft(self): 7 | mel_stft = MelSTFT() 8 | mel = mel_stft.mel_spectrogram(torch.clip(torch.randn(1, 1000), -1, 1)) 9 | assert mel.shape[0] == 1 10 | assert mel.shape[1] == 80 11 | -------------------------------------------------------------------------------- /analytics/tests/fixtures/val.txt: -------------------------------------------------------------------------------- 1 | analytics/tests/fixtures/wavs/stevejobs-1.wav|{ W EH1 L } { Y UW1 } { N OW1 } , { AE1 Z } { Y UW1 } { N OW1 } , { DH AH0 } { W EH1 B Z } { AH0 } { P R IH1 T IY0 } { M ER0 AE1 K Y AH0 L AH0 S } { TH IH1 NG } . { AH0 N D } { IH1 T } { W AA1 Z } { AH0 } { V EH1 R IY0 } { S IH1 M P AH0 L } { P EH1 R AH0 D AY2 M } { DH AE1 T } { W AA1 Z } { IH0 N V EH1 N T AH0 D } { W IH1 CH } { W AA1 Z } .|0 2 | -------------------------------------------------------------------------------- /uberduck_ml_dev/utils/config.py: -------------------------------------------------------------------------------- 1 | from ..models.tacotron2 import DEFAULTS as TACOTRON2_DEFAULTS 2 | 3 | 4 | def tacotron2_training_to_model_config(training_config): 5 | shared_keys = set(TACOTRON2_DEFAULTS.values().keys()).intersection( 6 | training_config.keys() 7 | ) 8 | # NOTE (Sam): only need to save non-default parameters in config unless defaults change. 
9 | minimal_model_config = {k: training_config[k] for k in shared_keys} 10 | return minimal_model_config 11 | -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/radtts/save.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def save_checkpoint(model, optimizer, iteration, filepath): 5 | print( 6 | "Saving model and optimizer state at iteration {} to {}".format( 7 | iteration, filepath 8 | ) 9 | ) 10 | 11 | # NOTE (Sam): learning rate not accessible here 12 | torch.save( 13 | { 14 | "state_dict": model.state_dict(), 15 | "iteration": iteration, 16 | "optimizer": optimizer.state_dict(), 17 | }, 18 | filepath, 19 | ) 20 | -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/hifigan/train_epoch.py: -------------------------------------------------------------------------------- 1 | def train_epoch( 2 | _train_step, 3 | dataloader, 4 | config, 5 | models, 6 | optimization_parameters, 7 | logging_parameters, 8 | iteration, 9 | ): 10 | for batch in dataloader: 11 | print(iteration, "iteration") 12 | _train_step( 13 | batch, 14 | config, 15 | models, 16 | optimization_parameters, 17 | logging_parameters, 18 | iteration, 19 | ) 20 | iteration += 1 21 | 22 | return iteration 23 | -------------------------------------------------------------------------------- /uberduck_ml_dev/monitoring/generate.py: -------------------------------------------------------------------------------- 1 | __all__ = [] 2 | 3 | 4 | from ..text.utils import prepare_input_sequence 5 | 6 | 7 | def _get_inference(model, vocoder, texts, speaker_ids, symbol_set, arpabet, cpu_run): 8 | text_padded, input_lengths = prepare_input_sequence( 9 | texts, cpu_run=cpu_run, arpabet=arpabet, symbol_set=symbol_set 10 | ) 11 | # Note (SAM): None is for GST... temporary solution 12 | input_ = text_padded, input_lengths, speaker_ids, None 13 | output = model.inference(input_) 14 | audio = vocoder.infer(output[1][:1]) 15 | return audio 16 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: [push, pull_request] 3 | jobs: 4 | build: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v1 8 | - uses: actions/setup-python@v1 9 | with: 10 | python-version: "3.10" 11 | architecture: "x64" 12 | - name: Install OS dependencies 13 | run: | 14 | sudo apt-get update 15 | sudo apt-get install espeak libsndfile-dev 16 | - name: Install the library 17 | run: | 18 | pip install -e . 
19 | - name: Run tests 20 | run: | 21 | python -m pytest 22 | -------------------------------------------------------------------------------- /uberduck_ml_dev/data/normalization.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | from scipy.io.wavfile import write 4 | from ..models.tacotron2 import MAX_WAV_VALUE 5 | 6 | load_resampled_normalized_audio = lambda source_path: librosa.load( 7 | source_path, sr=22050 8 | )[0] 9 | float_normalize = lambda x: np.asarray( 10 | (x / np.abs(x).max()) * (MAX_WAV_VALUE - 1) / MAX_WAV_VALUE 11 | ) 12 | int_normalize = lambda x: np.asarray( 13 | (x / np.abs(x).max()) * (MAX_WAV_VALUE - 1), dtype=np.int16 14 | ) 15 | save_22k_audio = lambda data, target_path: write( 16 | target_path, 22050, data 17 | ) # must be in this order 18 | -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/list_small.txt: -------------------------------------------------------------------------------- 1 | analytics/tests/fixtures/ljtest/wavs/LJ001-0001.wav|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|0 2 | analytics/tests/fixtures/ljtest/wavs/LJ001-0002.wav|in being comparatively modern.|0 3 | analytics/tests/fixtures/ljtest/wavs/LJ001-0003.wav|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|0 4 | analytics/tests/fixtures/ljtest/wavs/LJ001-0004.wav|produced the block books, which were the immediate predecessors of the true printed book,|0 5 | -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/rvc/train_epoch.py: -------------------------------------------------------------------------------- 1 | # TODO (Sam): add config arguments to model / optimization / logging and remove. 2 | from .train_step import _train_step 3 | 4 | 5 | def train_epoch( 6 | dataloader, 7 | config, 8 | models, 9 | optimization_parameters, 10 | logging_parameters, 11 | iteration, 12 | ): 13 | for batch in dataloader: 14 | print(iteration, "iteration") 15 | _train_step( 16 | batch, 17 | config, 18 | models, 19 | optimization_parameters, 20 | logging_parameters, 21 | iteration, 22 | ) 23 | iteration += 1 24 | 25 | return iteration 26 | -------------------------------------------------------------------------------- /analytics/tests/tests/vocoders/test_hifi_gan.py: -------------------------------------------------------------------------------- 1 | from scipy.io.wavfile import read 2 | from uberduck_ml_dev.models.common import MelSTFT 3 | import torch 4 | 5 | 6 | class TestHifiGan: 7 | def test_hifi_gan(self): 8 | # TODO (Sam): move to settings file. 
9 | path = "analytics/tests/fixtures/wavs/stevejobs-1.wav" 10 | sr, data = read(path) 11 | 12 | assert sr == 22050 13 | assert len(data) == 144649 14 | 15 | data = torch.FloatTensor(data / 32768.0).unsqueeze(0) 16 | 17 | melstft = MelSTFT() 18 | mel = melstft.mel_spectrogram(data) 19 | 20 | assert mel.shape[0] == 1 21 | assert mel.shape[1] == 80 22 | assert mel.shape[2] == 566 23 | -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/log.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import wandb 3 | from ray.air import session 4 | 5 | 6 | @torch.no_grad() 7 | def log(metrics=None, audios=None, images=None, sample_rate=22050): 8 | if session.get_world_rank() != 0: 9 | return 10 | audios = audios or {} 11 | images = images or {} 12 | wandb_metrics = {} 13 | if metrics is not None: 14 | wandb_metrics.update(metrics) 15 | 16 | for k, v in audios.items(): 17 | wandb_metrics[k] = wandb.Audio( 18 | v["audio"].cpu(), sample_rate=sample_rate, caption=v.get("caption") 19 | ) 20 | 21 | for k, v in images.items(): 22 | wandb_metrics[k] = wandb.Image(v) 23 | 24 | wandb.log(wandb_metrics) 25 | -------------------------------------------------------------------------------- /uberduck_ml_dev/models/components/prenet.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from torch.nn import functional as F 3 | from ..common import LinearNorm 4 | 5 | 6 | class Prenet(nn.Module): 7 | def __init__(self, in_dim, sizes): 8 | super().__init__() 9 | in_sizes = [in_dim] + sizes[:-1] 10 | self.layers = nn.ModuleList( 11 | [ 12 | LinearNorm(in_size, out_size, bias=False) 13 | for (in_size, out_size) in zip(in_sizes, sizes) 14 | ] 15 | ) 16 | self.dropout_rate = 0.5 17 | 18 | def forward(self, x): 19 | for linear in self.layers: 20 | x = F.dropout(F.relu(linear(x)), p=self.dropout_rate, training=True) 21 | return x 22 | -------------------------------------------------------------------------------- /uberduck_ml_dev/text/datestime.py: -------------------------------------------------------------------------------- 1 | """ adapted from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | 5 | _ampm_re = re.compile(r"([0-9]|0[0-9]|1[0-9]|2[0-3]):?([0-5][0-9])?\s*([AaPp][Mm]\b)") 6 | 7 | 8 | def _expand_ampm(m): 9 | matches = list(m.groups(0)) 10 | txt = matches[0] 11 | txt = txt if int(matches[1]) == 0 else txt + " " + matches[1] 12 | 13 | if matches[2][0].lower() == "a": 14 | txt += " a.m." 15 | elif matches[2][0].lower() == "p": 16 | txt += " p.m." 17 | 18 | return txt 19 | 20 | 21 | def normalize_datestime(text): 22 | text = re.sub(_ampm_re, _expand_ampm, text) 23 | # text = re.sub(r"([0-9]|0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])?", r"\1 \2", text) 24 | return text 25 | -------------------------------------------------------------------------------- /analytics/tests/tests/text/test_symbols.py: -------------------------------------------------------------------------------- 1 | from uberduck_ml_dev.text.symbols import arpabet_to_sequence, symbols_to_sequence 2 | 3 | 4 | class TestSymbols: 5 | def test_arpabet_to_sequence(self): 6 | # NOTE: arpabet_to_sequence does not properly handle whitespace, it should take single words only. 
7 | assert ( 8 | len( 9 | arpabet_to_sequence( 10 | "{ S IY } { EH M } { Y UW } { D IH K SH AH N EH R IY }" 11 | ) 12 | ) 13 | == 15 14 | ) 15 | assert arpabet_to_sequence("{ S IY }") == [168, 148] 16 | # But symbols_to_sequence handles whitespace 17 | 18 | def test_symbols_to_sequence(self): 19 | assert len(symbols_to_sequence("C M U Dictionary")) == 16 20 | -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/rvc/save.py: -------------------------------------------------------------------------------- 1 | # TODO (Sam): combine with radtts save_checkpoint 2 | import torch 3 | 4 | 5 | def save_checkpoint( 6 | generator, 7 | generator_optimizer, 8 | discriminator, 9 | discriminator_optimizer, 10 | iteration, 11 | filepath, 12 | ): 13 | print( 14 | "Saving model and optimizer state at iteration {} to {}".format( 15 | iteration, filepath 16 | ) 17 | ) 18 | 19 | # TODO (Sam): figure out where to put learning rate. 20 | torch.save( 21 | { 22 | "generator_state_dict": generator.state_dict(), 23 | "iteration": iteration, 24 | "generator_optimizer": generator_optimizer.state_dict(), 25 | "discriminator_state_dict": discriminator.state_dict(), 26 | "discriminator_optimizer": discriminator_optimizer.state_dict(), 27 | }, 28 | filepath, 29 | ) 30 | -------------------------------------------------------------------------------- /uberduck_ml_dev/models/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pickle 3 | import os 4 | import inspect 5 | 6 | 7 | def load_checkpoint(filepath, device, pickle_module=pickle): 8 | assert os.path.isfile(filepath) 9 | print("Loading '{}'".format(filepath)) 10 | checkpoint_dict = torch.load( 11 | filepath, 12 | map_location=torch.device(device), 13 | pickle_module=pickle_module, 14 | ) 15 | print("Complete.") 16 | return checkpoint_dict 17 | 18 | 19 | def load_pretrained(model, checkpoint_path, key_="generator"): 20 | # NOTE (Sam): uncomment for download on anyscale 21 | # response = requests.get(HIFI_GAN_GENERATOR_URL, stream=True) 22 | # bio = BytesIO(response.content) 23 | loaded = torch.load(checkpoint_path) 24 | model.load_state_dict(loaded[key_]) 25 | 26 | 27 | def filter_valid_args(func, **kwargs): 28 | valid_keys = inspect.signature(func).parameters.keys() 29 | return {key: value for key, value in kwargs.items() if key in valid_keys} 30 | -------------------------------------------------------------------------------- /licenses/LICENSE3: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 Huawei Technologies Co., Ltd. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /uberduck_ml_dev/monitoring/wandb.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import wandb 4 | from tqdm import tqdm 5 | import torch 6 | 7 | from ..text.utils import UTTERANCES 8 | 9 | 10 | def log_sample_utterances( 11 | project="my-project", 12 | name="my-model", 13 | dataset="my-dataset", 14 | architecture="my-architecture", 15 | speaker_ids: List = [], 16 | inference_function=lambda text, speaker_id: False, 17 | ): 18 | wandb.init( 19 | project=project, 20 | name=name, 21 | job_type="eval", 22 | config={"architecture": architecture, "dataset": dataset}, 23 | ) 24 | 25 | with torch.no_grad(): 26 | for speaker_id in tqdm(speaker_ids): 27 | to_log = [] 28 | for utterance in tqdm(UTTERANCES): 29 | inference = inference_function(utterance, speaker_id) 30 | to_log.append( 31 | wandb.Audio(inference, caption=utterance, sample_rate=22050) 32 | ) 33 | torch.cuda.empty_cache() # might not be necessary 34 | wandb.log({f"Speaker {speaker_id}": to_log}) 35 | 36 | wandb.finish() 37 | -------------------------------------------------------------------------------- /licenses/LICENSE4: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Bjarke Felbo, Han Thi Nguyen, Thomas Wolf 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /uberduck_ml_dev/text/grapheme_dictionary.py: -------------------------------------------------------------------------------- 1 | # NOTE (Sam): synthesize with other methods 2 | 3 | """ adapted from https://github.com/keithito/tacotron """ 4 | 5 | import re 6 | 7 | _alt_re = re.compile(r"\([0-9]+\)") 8 | 9 | 10 | class Grapheme2PhonemeDictionary: 11 | """Thin wrapper around g2p data.""" 12 | 13 | def __init__(self, file_or_path, keep_ambiguous=True, encoding="latin-1"): 14 | with open(file_or_path, encoding=encoding) as f: 15 | entries = _parse_g2p(f) 16 | if not keep_ambiguous: 17 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 18 | self._entries = entries 19 | 20 | def __len__(self): 21 | return len(self._entries) 22 | 23 | def lookup(self, word): 24 | """Returns list of pronunciations of the given word.""" 25 | return self._entries.get(word.upper()) 26 | 27 | 28 | def _parse_g2p(file): 29 | g2p = {} 30 | for line in file: 31 | if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"): 32 | parts = line.split(" ") 33 | word = re.sub(_alt_re, "", parts[0]) 34 | pronunciation = parts[1].strip() 35 | if word in g2p: 36 | g2p[word].append(pronunciation) 37 | else: 38 | g2p[word] = [pronunciation] 39 | return g2p 40 | -------------------------------------------------------------------------------- /analytics/tests/tests/utils/test_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from uberduck_ml_dev.utils.utils import get_mask_from_lengths, sequence_mask 3 | 4 | 5 | class TestUtils: 6 | def test_mask_from_lengths(self): 7 | assert ( 8 | get_mask_from_lengths(torch.LongTensor([1, 3, 2, 1])) 9 | == torch.Tensor( 10 | [ 11 | [True, False, False], 12 | [True, True, True], 13 | [True, True, False], 14 | [True, False, False], 15 | ] 16 | ) 17 | ).all() 18 | 19 | def test_sequence_mask(self): 20 | assert ( 21 | sequence_mask(torch.tensor([1, 3, 2, 1])) 22 | == torch.Tensor( 23 | [ 24 | [True, False, False], 25 | [True, True, True], 26 | [True, True, False], 27 | [True, False, False], 28 | ] 29 | ) 30 | ).all() 31 | assert ( 32 | sequence_mask(torch.tensor([1, 3, 2, 1]), 4) 33 | == torch.Tensor( 34 | [ 35 | [True, False, False, False], 36 | [True, True, True, False], 37 | [True, True, False, False], 38 | [True, False, False, False], 39 | ] 40 | ) 41 | ).all() 42 | -------------------------------------------------------------------------------- /uberduck_ml_dev/exec/train_tacotron2.py: -------------------------------------------------------------------------------- 1 | __all__ = ["parse_args", "run"] 2 | 3 | from ..trainer.tacotron2 import Tacotron2Trainer 4 | from ..vendor.tfcompat.hparam import HParams 5 | from ..trainer.tacotron2 import DEFAULTS as TACOTRON2_TRAINER_DEFAULTS 6 | import argparse 7 | import sys 8 | import json 9 | import torch 10 | from torch import multiprocessing as mp 11 | 12 | 13 | def parse_args(args): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--config", help="Path to JSON config") 16 | args = parser.parse_args(args) 17 | return args 18 | 19 | 20 | def run(rank, device_count, hparams): 21 | trainer = Tacotron2Trainer(hparams, rank=rank, world_size=device_count) 22 | try: 23 | trainer.train() 24 | except Exception as e: 25 | print(f"Exception raised while training: {e}") 26 | # TODO: save state. 
27 | raise e 28 | 29 | 30 | try: 31 | from nbdev.imports import IN_NOTEBOOK 32 | except: 33 | IN_NOTEBOOK = False 34 | if __name__ == "__main__" and not IN_NOTEBOOK: 35 | args = parse_args(sys.argv[1:]) 36 | config = TACOTRON2_TRAINER_DEFAULTS.values() 37 | if args.config: 38 | with open(args.config) as f: 39 | config.update(json.load(f)) 40 | config.update(vars(args)) 41 | hparams = HParams(**config) 42 | if hparams.distributed_run: 43 | device_count = torch.cuda.device_count() 44 | mp.spawn(run, (device_count, hparams), device_count) 45 | else: 46 | run(None, None, hparams) 47 | -------------------------------------------------------------------------------- /uberduck_ml_dev/exec/train_vits.py: -------------------------------------------------------------------------------- 1 | __all__ = ["parse_args", "run"] 2 | 3 | 4 | import argparse 5 | import json 6 | import librosa # NOTE(zach): importing torch before librosa causes LLVM issues for some unknown reason. 7 | import sys 8 | 9 | import torch 10 | from torch import multiprocessing as mp 11 | 12 | from ..trainer.vits import VITSTrainer 13 | from ..vendor.tfcompat.hparam import HParams 14 | from ..models.vits import DEFAULTS as VITS_DEFAULTS 15 | 16 | 17 | def parse_args(args): 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--config", help="Path to JSON config") 20 | args = parser.parse_args(args) 21 | return args 22 | 23 | 24 | def run(rank, device_count, hparams): 25 | trainer = VITSTrainer(hparams, rank=rank, world_size=device_count) 26 | try: 27 | trainer.train() 28 | except Exception as e: 29 | print(f"Exception raised while training: {e}") 30 | # TODO: save state. 31 | raise e 32 | 33 | 34 | try: 35 | from nbdev.imports import IN_NOTEBOOK 36 | except: 37 | IN_NOTEBOOK = False 38 | if __name__ == "__main__" and not IN_NOTEBOOK: 39 | args = parse_args(sys.argv[1:]) 40 | config = VITS_DEFAULTS.values() 41 | if args.config: 42 | with open(args.config) as f: 43 | config.update(json.load(f)) 44 | hparams = HParams(**config) 45 | if hparams.distributed_run: 46 | device_count = torch.cuda.device_count() 47 | mp.spawn(run, (device_count, hparams), device_count) 48 | else: 49 | run(0, 1, hparams) 50 | -------------------------------------------------------------------------------- /uberduck_ml_dev/text/abbreviations.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | _no_period_re = re.compile(r"(No[.])(?=[ ]?[0-9])") 4 | _percent_re = re.compile(r"([ ]?[%])") 5 | _half_re = re.compile("([0-9]½)|(½)") 6 | 7 | 8 | # List of (regular expression, replacement) pairs for abbreviations: 9 | _abbreviations = [ 10 | (re.compile("\\b%s\\." 
% x[0], re.IGNORECASE), x[1]) 11 | for x in [ 12 | ("mrs", "misess"), 13 | ("ms", "miss"), 14 | ("mr", "mister"), 15 | ("dr", "doctor"), 16 | ("st", "saint"), 17 | ("co", "company"), 18 | ("jr", "junior"), 19 | ("maj", "major"), 20 | ("gen", "general"), 21 | ("drs", "doctors"), 22 | ("rev", "reverend"), 23 | ("lt", "lieutenant"), 24 | ("hon", "honorable"), 25 | ("sgt", "sergeant"), 26 | ("capt", "captain"), 27 | ("esq", "esquire"), 28 | ("ltd", "limited"), 29 | ("col", "colonel"), 30 | ("ft", "fort"), 31 | ] 32 | ] 33 | 34 | 35 | def _expand_no_period(m): 36 | word = m.group(0) 37 | if word[0] == "N": 38 | return "Number" 39 | return "number" 40 | 41 | 42 | def _expand_percent(m): 43 | return " percent" 44 | 45 | 46 | def _expand_half(m): 47 | word = m.group(1) 48 | if word is None: 49 | return "half" 50 | return word[0] + " and a half" 51 | 52 | 53 | def normalize_abbreviations(text): 54 | text = re.sub(_no_period_re, _expand_no_period, text) 55 | text = re.sub(_percent_re, _expand_percent, text) 56 | text = re.sub(_half_re, _expand_half, text) 57 | return text 58 | -------------------------------------------------------------------------------- /licenses/LICENSE2: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, NVIDIA Corporation 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /uberduck_ml_dev/utils/hifiutils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import matplotlib 4 | import torch 5 | from torch.nn.utils import weight_norm 6 | 7 | matplotlib.use("Agg") 8 | import matplotlib.pylab as plt 9 | 10 | 11 | def plot_spectrogram(spectrogram): 12 | fig, ax = plt.subplots(figsize=(10, 2)) 13 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") 14 | plt.colorbar(im, ax=ax) 15 | 16 | fig.canvas.draw() 17 | plt.close() 18 | 19 | return fig 20 | 21 | 22 | def init_weights(m, mean=0.0, std=0.01): 23 | classname = m.__class__.__name__ 24 | if classname.find("Conv") != -1: 25 | m.weight.data.normal_(mean, std) 26 | 27 | 28 | def apply_weight_norm(m): 29 | classname = m.__class__.__name__ 30 | if classname.find("Conv") != -1: 31 | weight_norm(m) 32 | 33 | 34 | def get_padding(kernel_size, dilation=1): 35 | return int((kernel_size * dilation - dilation) / 2) 36 | 37 | 38 | def load_checkpoint(filepath, device): 39 | assert os.path.isfile(filepath) 40 | print("Loading '{}'".format(filepath)) 41 | checkpoint_dict = torch.load(filepath, map_location=device) 42 | print("Complete.") 43 | return checkpoint_dict 44 | 45 | 46 | def save_checkpoint(filepath, obj): 47 | print("Saving checkpoint to {}".format(filepath)) 48 | torch.save(obj, filepath) 49 | print("Complete.") 50 | 51 | 52 | def scan_checkpoint(cp_dir, prefix): 53 | pattern = os.path.join(cp_dir, prefix + "????????") 54 | cp_list = glob.glob(pattern) 55 | if len(cp_list) == 0: 56 | return None 57 | return sorted(cp_list)[-1] 58 | -------------------------------------------------------------------------------- /uberduck_ml_dev/data/batch.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from ..utils.utils import to_gpu 4 | 5 | 6 | class Batch(Dict): 7 | # NOTE (Sam): isn't gate target redundant to output length. 8 | # NOTE (Sam): here types are unused, but TypedDict inheritance doesn't allow methods 9 | # NOTE (Sam): these were also problems with object (I forget), NamedTuple (mutability), dataclass (I forget) 10 | 11 | # text_int_padded: Optional[torch.LongTensor] = None 12 | # input_lengths: Optional[torch.LongTensor] = None 13 | # mel_padded: Optional[torch.FloatTensor] = None # for teacher forcing. 14 | # gate_target: Optional[ 15 | # torch.LongTensor 16 | # ] = None # NOTE (Sam): could be bool - for teacher forcing. 17 | # output_lengths: Optional[torch.LongTensor] = None 18 | # speaker_ids: Optional[torch.LongTensor] = None 19 | # gst: Optional[torch.Tensor] = None 20 | # mel_outputs: Optional[torch.Tensor] = None # predicted. 21 | # mel_outputs_postnet: Optional[torch.Tensor] = None 22 | # gate_predicted: Optional[torch.LongTensor] = None # could be bool. 
23 | # alignments: Optional[torch.Tensor] = None 24 | # audio_encodings: Optional[torch.Tensor] = None 25 | 26 | def subset(self, keywords, fragile=False) -> "Batch": 27 | d = {} 28 | for k in keywords: 29 | try: 30 | d[k] = self[k] 31 | except KeyError: 32 | if fragile: 33 | raise 34 | return Batch(**d) 35 | 36 | def to_gpu(self) -> "Batch": 37 | batch_gpu = Batch(**{k: to_gpu(v) for k, v in self.items()}) 38 | return batch_gpu 39 | -------------------------------------------------------------------------------- /uberduck_ml_dev/losses_rvc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | 5 | def feature_loss(fmap_r, fmap_g): 6 | loss = 0 7 | for dr, dg in zip(fmap_r, fmap_g): 8 | for rl, gl in zip(dr, dg): 9 | rl = rl.float().detach() 10 | gl = gl.float() 11 | loss += torch.mean(torch.abs(rl - gl)) 12 | 13 | return loss * 2 14 | 15 | 16 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 17 | loss = 0 18 | r_losses = [] 19 | g_losses = [] 20 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 21 | dr = dr.float() 22 | dg = dg.float() 23 | r_loss = torch.mean((1 - dr) ** 2) 24 | g_loss = torch.mean(dg**2) 25 | loss += r_loss + g_loss 26 | r_losses.append(r_loss.item()) 27 | g_losses.append(g_loss.item()) 28 | 29 | return loss, r_losses, g_losses 30 | 31 | 32 | def generator_loss(disc_outputs): 33 | loss = 0 34 | gen_losses = [] 35 | for dg in disc_outputs: 36 | dg = dg.float() 37 | l = torch.mean((1 - dg) ** 2) 38 | gen_losses.append(l) 39 | loss += l 40 | 41 | return loss, gen_losses 42 | 43 | 44 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 45 | """ 46 | z_p, logs_q: [b, h, t_t] 47 | m_p, logs_p: [b, h, t_t] 48 | """ 49 | z_p = z_p.float() 50 | logs_q = logs_q.float() 51 | m_p = m_p.float() 52 | logs_p = logs_p.float() 53 | z_mask = z_mask.float() 54 | 55 | kl = logs_p - logs_q - 0.5 56 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 57 | kl = torch.sum(kl * z_mask) 58 | l = kl / torch.sum(z_mask) 59 | return l 60 | -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/radtts/train_epoch.py: -------------------------------------------------------------------------------- 1 | from .train_step import _train_step 2 | 3 | 4 | # NOTE (Sam): uncomment to run with torch DataLoader rather than ray dataset 5 | def train_epoch( 6 | train_dataloader, 7 | log_decoder_samples, 8 | log_attribute_samples, 9 | model, 10 | optim, 11 | steps_per_sample, 12 | scaler, 13 | iters_per_checkpoint, 14 | output_directory, 15 | criterion, 16 | attention_kl_loss, 17 | kl_loss_start_iter, 18 | binarization_start_iter, 19 | iteration, 20 | vocoder, 21 | ): 22 | # def train_epoch(dataset_shard, batch_size, model, optim, steps_per_sample, scaler, scheduler, criterion, attention_kl_loss, kl_loss_start_iter, binarization_start_iter, epoch, iteration): 23 | # for batch_idx, ray_batch_df in enumerate( 24 | # dataset_shard.iter_batches(batch_size=batch_size, prefetch_blocks=6) 25 | # ): 26 | # NOTE (Sam): uncomment to run with torch DataLoader rather than ray dataset 27 | for batch in train_dataloader: 28 | _train_step( 29 | # ray_batch_df, 30 | # NOTE (Sam): uncomment to run with torch DataLoader rather than ray dataset 31 | batch, 32 | model, 33 | optim, 34 | iteration, 35 | log_decoder_samples, 36 | log_attribute_samples, 37 | steps_per_sample, 38 | scaler, 39 | iters_per_checkpoint, 40 | output_directory, 41 | criterion, 42 
| attention_kl_loss, 43 | kl_loss_start_iter, 44 | binarization_start_iter, 45 | vocoder, 46 | ) 47 | iteration += 1 48 | 49 | return iteration 50 | -------------------------------------------------------------------------------- /uberduck_ml_dev/data/processor.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, List, Dict 2 | import os 3 | 4 | 5 | # NOTE (Sam): this torch processor appears to be 10% faster than standard multiprocessing - perhaps this is overkill 6 | class Processor: 7 | def __init__( 8 | self, 9 | function_: Callable, 10 | source_paths: List[str], 11 | target_paths: List[ 12 | str 13 | ], # NOTE (Sam): this is target_folders in certain versions of the code since for example we want to save pitch at f0.pt and pitch mask as f0f.pt. Have to think of a solution. 14 | recompute: bool = True, 15 | ): 16 | self.source_paths = source_paths 17 | self.function_ = function_ 18 | self.target_paths = target_paths 19 | self.recompute = recompute 20 | 21 | def _get_data(self, source_path, target_path): 22 | # NOTE (Sam): we need caching to debug training issues in dev and for speed! 23 | # NOTE (Sam): won't catch issues with recomputation using different parameters but the same name 24 | # TODO (Sam): add hashing 25 | if self.recompute or not os.path.exists(target_path): 26 | self.function_(source_path, target_path) 27 | else: 28 | pass 29 | 30 | def __getitem__(self, idx): 31 | try: 32 | self._get_data( 33 | source_path=self.source_paths[idx], 34 | target_path=self.target_paths[idx], 35 | ) 36 | 37 | except Exception as e: 38 | print(f"Error while getting data: index = {idx}") 39 | print(e) 40 | raise 41 | return None 42 | 43 | def __len__(self): 44 | nfiles = len(self.source_paths) 45 | 46 | return nfiles 47 | -------------------------------------------------------------------------------- /uberduck_ml_dev/exec/train_radtts_with_ray.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import os 4 | 5 | from ray.air.config import ScalingConfig, RunConfig 6 | from ray.train.torch import TorchTrainer 7 | from ray.tune import SyncConfig 8 | from ray.train.torch import TorchTrainer, TorchTrainer 9 | from ray.air.config import ScalingConfig, RunConfig 10 | 11 | from uberduck_ml_dev.trainer.radtts.train import train_func 12 | from uberduck_ml_dev.utils.exec import parse_args 13 | from uberduck_ml_dev.trainer.radtts.train import DEFAULTS as TRAIN_CONFIG 14 | from uberduck_ml_dev.data.data import RADTTS_DEFAULTS as DATA_CONFIG 15 | from uberduck_ml_dev.models.radtts import DEFAULTS as MODEL_CONFIG 16 | 17 | if __name__ == "__main__": 18 | args = parse_args(sys.argv[1:]) 19 | if args.config: 20 | with open(args.config) as f: 21 | config_inputs = json.load(f) 22 | 23 | config = dict( 24 | train_config=TRAIN_CONFIG, data_config=DATA_CONFIG, model_config=MODEL_CONFIG 25 | ) 26 | config["train_config"].update(config_inputs["train_config"]) 27 | config["data_config"].update(config_inputs["data_config"]) 28 | config["model_config"].update(config_inputs["model_config"]) 29 | 30 | os.makedirs(config["train_config"]["output_directory"], exist_ok=True) 31 | trainer = TorchTrainer( 32 | train_loop_per_worker=train_func, 33 | train_loop_config=config, 34 | scaling_config=ScalingConfig( 35 | num_workers=config["train_config"]["n_gpus"], 36 | use_gpu=True, 37 | resources_per_worker=dict( 38 | CPU=config["data_config"]["num_workers"], 39 | GPU=1, 40 | ), 41 | ), 42
run_config=RunConfig(sync_config=SyncConfig()), 43 | ) 44 | 45 | result = trainer.fit() 46 | -------------------------------------------------------------------------------- /uberduck_ml_dev/monitoring/statistics.py: -------------------------------------------------------------------------------- 1 | __all__ = ["get_alignment_metrics"] 2 | 3 | import torch 4 | from ..utils.utils import get_mask_from_lengths 5 | 6 | 7 | def get_alignment_metrics( 8 | alignments, average_across_batch=True, input_lengths=None, output_lengths=None 9 | ): 10 | alignments = alignments.transpose(1, 2) # [B, dec, enc] -> [B, enc, dec] 11 | if input_lengths == None: 12 | input_lengths = torch.ones(alignments.size(0), device=alignments.device) * ( 13 | alignments.shape[1] - 1 14 | ) # [B] # 147 15 | if output_lengths == None: 16 | output_lengths = torch.ones(alignments.size(0), device=alignments.device) * ( 17 | alignments.shape[2] - 1 18 | ) # [B] # 767 19 | 20 | batch_size = alignments.size(0) 21 | optimums = torch.sqrt( 22 | input_lengths.double().pow(2) + output_lengths.double().pow(2) 23 | ).view(batch_size) 24 | 25 | # [B, enc, dec] -> [B, dec], [B, dec] 26 | values, cur_idxs = torch.max(alignments, 1) 27 | 28 | cur_idxs = cur_idxs.float() 29 | prev_indx = torch.cat((cur_idxs[:, 0][:, None], cur_idxs[:, :-1]), dim=1) 30 | dist = ((prev_indx - cur_idxs).pow(2) + 1).pow(0.5) # [B, dec] 31 | dist.masked_fill_( 32 | ~get_mask_from_lengths(output_lengths, max_len=dist.size(1)), 0.0 33 | ) # set dist of padded to zero 34 | dist = dist.sum(dim=(1)) # get total dist for each B 35 | diagonalness = (dist + 1.4142135) / optimums # dist / optimal dist 36 | 37 | maxes = alignments.max(axis=1)[0].mean(axis=1) 38 | if average_across_batch: 39 | diagonalness = diagonalness.mean() 40 | maxes = maxes.mean() 41 | 42 | output = {} 43 | output["diagonalness"] = diagonalness 44 | output["max"] = maxes 45 | 46 | return output 47 | -------------------------------------------------------------------------------- /uberduck_ml_dev/exec/normalize_audio.py: -------------------------------------------------------------------------------- 1 | __all__ = ["run", "parse_args"] 2 | 3 | 4 | import argparse 5 | import os 6 | import sys 7 | 8 | from ..utils.audio import normalize_audio, trim_audio 9 | 10 | 11 | def run(dirname, backup, top_db): 12 | """Normalize all the audio files in a directory.""" 13 | old_dirname = dirname 14 | if backup: 15 | old_dirname = f"{os.path.normpath(old_dirname)}_backup" 16 | os.rename(dirname, old_dirname) 17 | for dirpath, _, filenames in os.walk(old_dirname): 18 | rel_path = os.path.relpath(dirpath, old_dirname) 19 | for filename in filenames: 20 | if not filename.endswith(".wav"): 21 | continue 22 | old_path = os.path.join(dirpath, filename) 23 | new_path = os.path.join(dirname, rel_path, filename) 24 | if not os.path.exists(os.path.join(dirname, rel_path)): 25 | os.makedirs(os.path.join(dirname, rel_path)) 26 | trim_audio(old_path, new_path, top_db) 27 | 28 | 29 | def parse_args(args): 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument( 32 | "-d", 33 | "--dirname", 34 | help="Path to the directory which contains audio files to normalize.", 35 | ) 36 | parser.add_argument("--backup", dest="backup", action="store_true") 37 | parser.add_argument("--no-backup", dest="backup", action="store_false") 38 | parser.add_argument("--top-db", type=int) 39 | parser.set_defaults(backup=True, top_db=20) 40 | return parser.parse_args(args) 41 | 42 | 43 | try: 44 | from nbdev.imports import IN_NOTEBOOK 45 | except: 
46 | IN_NOTEBOOK = False 47 | 48 | if __name__ == "__main__" and not IN_NOTEBOOK: 49 | args = parse_args(sys.argv[1:]) 50 | run(args.dirname, args.backup, args.top_db) 51 | -------------------------------------------------------------------------------- /uberduck_ml_dev/exec/split_train_val.py: -------------------------------------------------------------------------------- 1 | __all__ = ["write_filenames", "run", "parse_args"] 2 | 3 | 4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | from sklearn.model_selection import train_test_split 9 | 10 | 11 | def write_filenames(filenames, output_dir, output_filename): 12 | """ 13 | Writes a list of filenames of as each line of a .txt file specified by output_filename. 14 | """ 15 | with open(os.path.join(output_dir, output_filename), "w") as f: 16 | for item in filenames: 17 | f.write(f"{item}\n") 18 | 19 | 20 | def run( 21 | path, 22 | val_percent=0.2, 23 | val_num=None, 24 | train_file="train.txt", 25 | val_file="val.txt", 26 | ): 27 | """Split file in t 28 | Default behavior only creates a training and validation set (not test set). 29 | """ 30 | with open(path) as f: 31 | lines = [l.strip("\n") for l in f.readlines()] 32 | 33 | train, val = train_test_split(lines, test_size=val_num if val_num else val_percent) 34 | write_filenames(train, Path(os.path.dirname(path)), train_file) 35 | write_filenames(val, Path(os.path.dirname(path)), val_file) 36 | 37 | 38 | import argparse 39 | import sys 40 | 41 | 42 | def parse_args(args): 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument( 45 | "-i", "--in", dest="input_path", help="Path to input file list", required=True 46 | ) 47 | parser.add_argument("-n", "--num_val", dest="num_val", type=float, default=0.1) 48 | args = parser.parse_args(args) 49 | return args 50 | 51 | 52 | try: 53 | from nbdev.imports import IN_NOTEBOOK 54 | except: 55 | IN_NOTEBOOK = False 56 | 57 | if __name__ == "__main__" and not IN_NOTEBOOK: 58 | args = parse_args(sys.argv[1:]) 59 | if args.num_val > 1: 60 | run(args.input_path, val_num=int(args.num_val)) 61 | else: 62 | run(args.input_path, val_percent=args.num_val) 63 | -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/taco2_lj2lj.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 16, 3 | "checkpoint_name": null, 4 | "checkpoint_path": "analytics/tests/fixtures/results/checkpoints", 5 | "cudnn_enabled": false, 6 | "dataset_path": "./dataset", 7 | "debug": false, 8 | "distributed_run": false, 9 | "epochs": 5, 10 | "epochs_per_checkpoint": 4, 11 | "filter_length": 1024, 12 | "fp16_run": false, 13 | "grad_clip_thresh": 1.0, 14 | "hop_length": 256, 15 | "ignore_layers": ["speaker_embedding.weight"], 16 | "include_f0": false, 17 | "learning_rate": 1e-3, 18 | "log_dir": "analytics/tests/fixtures/results/logs", 19 | "mask_padding": true, 20 | "max_wav_value": 32768.0, 21 | "mel_fmax": 8000, 22 | "mel_fmin": 0, 23 | "n_frames_per_step_initial": 1, 24 | "n_mel_channels": 80, 25 | "symbol_set": "nvidia_taco2", 26 | "n_symbols": 148, 27 | "n_speakers": 1, 28 | "p_arpabet": 1.0, 29 | "reduction_window_schedule": [ 30 | { 31 | "until_step": 10000, 32 | "batch_size": 16, 33 | "n_frames_per_step": 1 34 | }, 35 | { 36 | "until_step": 50000, 37 | "batch_size": 16, 38 | "n_frames_per_step": 1 39 | }, 40 | { 41 | "until_step": 60000, 42 | "batch_size": 16, 43 | "n_frames_per_step": 1 44 | }, 45 | { 46 | "until_step": 70000, 47 | 
"batch_size": 16, 48 | "n_frames_per_step": 1 49 | }, 50 | { 51 | "until_step": null, 52 | "batch_size": 16, 53 | "n_frames_per_step": 1 54 | } 55 | ], 56 | "sample_inference_speaker_ids": [0], 57 | "seed": 1234, 58 | "sampling_rate": 22050, 59 | "steps_per_sample": 100, 60 | "text_cleaners": ["english_cleaners"], 61 | "training_audiopaths_and_text": "analytics/tests/fixtures/ljtest/list.txt", 62 | "val_audiopaths_and_text": "analytics/tests/fixtures/ljtest/list.txt", 63 | "warm_start_name": "analytics/tests/fixtures/models/taco2ljdefault", 64 | "weight_decay": 1e-6, 65 | "win_length": 1024 66 | } 67 | -------------------------------------------------------------------------------- /uberduck_ml_dev/monitoring/streamlit.py: -------------------------------------------------------------------------------- 1 | __all__ = ["run"] 2 | 3 | 4 | import streamlit as st 5 | from collections import OrderedDict 6 | from .generate import _get_inference, MODEL_LIST, MODEL_TYPES 7 | 8 | 9 | def run(): 10 | st.title("Inference inspector") 11 | 12 | symbol_set = st.selectbox( 13 | "What symbol set would you like to use?", ("NVIDIA_TACO2_DEFAULTS") 14 | ) 15 | st.write("You selected:", symbol_set) 16 | 17 | use_arpabet = st.selectbox("Would you like to use arpabet?", ("Yes", "No")) 18 | st.write("You selected:", use_arpabet) 19 | 20 | # st.text_input("Model file name", "test/fixtures/models/taco2ljdefault") 21 | # st.text_input("Model format", OrderedDict) 22 | vocoder_path = st.text_input( 23 | "Vocoder path", "test/fixtures/models/gen_02640000_studio" 24 | ) 25 | vocoder_config = st.text_input("Vocoder config", None) 26 | n_speakers = st.text_input("Number of speakers", 1) 27 | gate_threshold = st.text_input("Gate threshold", 0.1) 28 | 29 | chosen_model = st.sidebar.selectbox("Select model", MODEL_LIST) 30 | chosen_type = st.sidebar.selectbox("Select model save type", MODEL_TYPES) 31 | text = [st.text_input("Text", "Thats silly")] 32 | speakers = [st.text_input("Speaker_id", 0)] 33 | 34 | hparams = TACOTRON2_DEFAULTS 35 | hparams.n_speakers = n_speakers 36 | hparams.gate_threshold = gate_threshold 37 | if n_speakers > 1: 38 | hparams.has_speaker_embedding = True 39 | model = Tacotron2(hparams) 40 | device = "cuda" 41 | model = Tacotron2(hparams) 42 | if chosen_type == "OD": 43 | model.from_pretrained(model_dict=chosen_model, device=device) 44 | if chosen_type == "OD": 45 | model.from_pretrained(warm_start_path=chosen_model, device=device) 46 | 47 | hifigan = HiFiGanGenerator( 48 | config=vocoder_config, 49 | checkpoint=vocoder_file, 50 | cudnn_enabled=True, 51 | ) 52 | 53 | inference = _get_inference(model, vocoder, texts, speakers, symbol_set, arpabet) 54 | 55 | st.audio(inference) 56 | 57 | 58 | if __name__ == "__main__": 59 | run() 60 | -------------------------------------------------------------------------------- /uberduck_ml_dev/text/acronyms.py: -------------------------------------------------------------------------------- 1 | import re 2 | from .cmudict import CMUDict 3 | 4 | _letter_to_arpabet = { 5 | "A": "EY1", 6 | "B": "B IY1", 7 | "C": "S IY1", 8 | "D": "D IY1", 9 | "E": "IY1", 10 | "F": "EH1 F", 11 | "G": "JH IY1", 12 | "H": "EY1 CH", 13 | "I": "AY1", 14 | "J": "JH EY1", 15 | "K": "K EY1", 16 | "L": "EH1 L", 17 | "M": "EH1 M", 18 | "N": "EH1 N", 19 | "O": "OW1", 20 | "P": "P IY1", 21 | "Q": "K Y UW1", 22 | "R": "AA1 R", 23 | "S": "EH1 S", 24 | "T": "T IY1", 25 | "U": "Y UW1", 26 | "V": "V IY1", 27 | "X": "EH1 K S", 28 | "Y": "W AY1", 29 | "W": "D AH1 B AH0 L Y UW0", 30 | "Z": "Z IY1", 31 | 
"s": "Z", 32 | } 33 | 34 | # must ignore roman numerals 35 | # _acronym_re = re.compile(r'([A-Z][A-Z]+)s?|([A-Z]\.([A-Z]\.)+s?)') 36 | _acronym_re = re.compile(r"([A-Z][A-Z]+)s?") 37 | 38 | 39 | class AcronymNormalizer(object): 40 | def __init__(self, phoneme_dict): 41 | self.phoneme_dict = phoneme_dict 42 | 43 | def normalize_acronyms(self, text): 44 | def _expand_acronyms(m, add_spaces=True): 45 | acronym = m.group(0) 46 | # remove dots if they exist 47 | acronym = re.sub("\.", "", acronym) 48 | 49 | acronym = "".join(acronym.split()) 50 | arpabet = self.phoneme_dict.lookup(acronym) 51 | 52 | if arpabet is None: 53 | acronym = list(acronym) 54 | arpabet = ["{" + _letter_to_arpabet[letter] + "}" for letter in acronym] 55 | # temporary fix 56 | if arpabet[-1] == "{Z}" and len(arpabet) > 1: 57 | arpabet[-2] = arpabet[-2][:-1] + " " + arpabet[-1][1:] 58 | del arpabet[-1] 59 | arpabet = " ".join(arpabet) 60 | elif len(arpabet) == 1: 61 | arpabet = "{" + arpabet[0] + "}" 62 | else: 63 | arpabet = acronym 64 | return arpabet 65 | 66 | text = re.sub(_acronym_re, _expand_acronyms, text) 67 | return text 68 | 69 | def __call__(self, text): 70 | return self.normalize_acronyms(text) 71 | -------------------------------------------------------------------------------- /analytics/tests/tests/trainer/test_trainer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | 4 | from uberduck_ml_dev.vendor.tfcompat.hparam import HParams 5 | from uberduck_ml_dev.trainer.base import DEFAULTS as TRAINER_DEFAULTS 6 | from uberduck_ml_dev.trainer.base import TTSTrainer 7 | from uberduck_ml_dev.models.common import MelSTFT 8 | 9 | 10 | class TestTrainer: 11 | def test_trainer_base(self): 12 | config = TRAINER_DEFAULTS.values() 13 | 14 | params = dict( 15 | checkpoint_name="test", 16 | checkpoint_path="test_checkpoint", 17 | cudnn_enabled=True, 18 | log_dir="this/is/a/test", 19 | ) 20 | config.update(params) 21 | hparams = HParams(**config) 22 | trainer = TTSTrainer(hparams) 23 | assert trainer.hparams == hparams 24 | 25 | assert trainer.cudnn_enabled == True 26 | mel = torch.load("analytics/tests/fixtures/stevejobs-1.pt") 27 | mel_stft = MelSTFT() 28 | audio = mel_stft.griffin_lim(mel) 29 | assert audio.size(0) == 1 30 | 31 | 32 | class TestTacotron2Trainer: 33 | # NOTE (Sam): this test could be made twice as fast by only running a single epoch,. 34 | # since as it is, the second gradient step is only useful for evaluating the loss 35 | def test_gradient_step(self, lj_trainer): 36 | torch.manual_seed(1234) 37 | lj_trainer.train() 38 | 39 | # NOTE (Sam): this number was taken from master on 8/24/22. 40 | # train_loss_start = 0.320 41 | # train_loss_4_datapoints_1_iteration = 0.319 42 | # NOTE (Sam): new numbers taken after normalization change 12/11/22 43 | # Have to run two iterations for loss to go down now. 
44 | # train_loss_start = 0.339 45 | # train_loss_4_datapoints_2_iteration = 0.327 46 | # NOTE (Sam): new numbers taken after enforce_sorted = False 2/7/23 47 | train_loss_start = 0.334 48 | train_loss_4_datapoints_2_iteration = 0.326 49 | assert math.isclose(lj_trainer.loss[0], train_loss_start, abs_tol=5e-4) 50 | 51 | assert math.isclose( 52 | lj_trainer.loss[2], train_loss_4_datapoints_2_iteration, abs_tol=5e-4 53 | ) 54 | -------------------------------------------------------------------------------- /uberduck_ml_dev/models/base.py: -------------------------------------------------------------------------------- 1 | __all__ = ["TTSModel", "DEFAULTS"] 2 | 3 | import torch 4 | from torch import nn 5 | 6 | from ..text.symbols import SYMBOL_SETS 7 | from ..vendor.tfcompat.hparam import HParams 8 | 9 | 10 | class TTSModel(nn.Module): 11 | def __init__(self, hparams): 12 | super().__init__() 13 | self.symbol_set = hparams.symbol_set 14 | self.n_symbols = len(SYMBOL_SETS[self.symbol_set]) 15 | self.n_speakers = hparams.n_speakers 16 | # symbols = __import__('uberduck_ml_dev.text.' + hparams.symbols) 17 | 18 | def infer(self): 19 | raise NotImplemented 20 | 21 | def forward(self): 22 | raise NotImplemented 23 | 24 | def from_pretrained( 25 | self, warm_start_path=None, device="cpu", ignore_layers=None, model_dict=None 26 | ): 27 | model_dict = model_dict or dict() 28 | if warm_start_path is None and model_dict is None: 29 | raise Exception( 30 | "TTSModel.from_pretrained requires a warm_start_path or state_dict" 31 | ) 32 | if warm_start_path is not None: 33 | checkpoint = torch.load(warm_start_path, map_location=device) 34 | if ( 35 | "state_dict" in checkpoint.keys() 36 | ): # TODO: remove state_dict once off nvidia 37 | model_dict = checkpoint["state_dict"] 38 | if "model" in checkpoint.keys(): 39 | model_dict = checkpoint["model"] 40 | if ignore_layers: 41 | model_dict = {k: v for k, v in model_dict.items() if k not in ignore_layers} 42 | dummy_dict = self.state_dict() 43 | 44 | for k in self.state_dict().keys(): 45 | if k not in model_dict.keys(): 46 | print( 47 | f"WARNING! Attempting to load a model with out the {k} layer. This could lead to unexpected results during evaluation." 48 | ) 49 | 50 | dummy_dict.update(model_dict) 51 | model_dict = dummy_dict 52 | self.load_state_dict(model_dict) 53 | if device == "cuda": 54 | self.cuda() 55 | 56 | def to_checkpoint(self): 57 | return dict(model=self.state_dict()) 58 | 59 | @classmethod 60 | def create(cls, name, opts, folders, all_speakers=True): 61 | pass 62 | 63 | 64 | DEFAULTS = HParams( 65 | p_arpabet=1.0, 66 | seed=1234, 67 | # NOTE (Sam): make sure users change their configurations for cudnn_enabled = True. 68 | cudnn_enabled=False, 69 | ) 70 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deprecation note 2 | We are moving away from maintaining this repository. 3 | 4 | # 🦆 ~~Uberduck Synthetic Speech~~ 5 |  6 |  7 |  8 |  9 | [](https://discord.com/invite/ATYWnMu) 10 | 11 | This repository includes 12 |
tags around the doc strings, preserving newlines/indentation.
68 | #monospace_docstrings = False
69 | #Test flags: introduce here the test flags you want to use separated by |
70 | tst_flags=slow
71 | #Custom sidebar: customize sidebar.json yourself for advanced sidebars (False/True)
72 | #custom_sidebar =
73 | #Custom jekyll styles: if you want more jekyll styles than tip/important/warning, set them here
74 | #jekyll_styles = note,warning,tip,important
75 |
--------------------------------------------------------------------------------
/uberduck_ml_dev/models/components/attention.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | import torch
3 | from numpy import finfo
4 | from torch.nn import functional as F
5 | from typing import Optional
6 |
7 | from ..common import LinearNorm, LocationLayer
8 |
9 |
10 | class Attention(nn.Module):
11 | def __init__(
12 | self,
13 | attention_rnn_dim,
14 | embedding_dim,
15 | attention_dim,
16 | attention_location_n_filters,
17 | attention_location_kernel_size,
18 | fp16_run,
19 | ):
20 | super(Attention, self).__init__()
21 | self.query_layer = LinearNorm(
22 | attention_rnn_dim, attention_dim, bias=False, w_init_gain="tanh"
23 | )
24 | self.memory_layer = LinearNorm(
25 | embedding_dim, attention_dim, bias=False, w_init_gain="tanh"
26 | )
27 | self.v = LinearNorm(attention_dim, 1, bias=False)
28 | self.location_layer = LocationLayer(
29 | attention_location_n_filters, attention_location_kernel_size, attention_dim
30 | )
31 | if fp16_run:
32 | self.score_mask_value = finfo("float16").min
33 | else:
34 | self.score_mask_value = -float("inf")
35 |
36 | def get_alignment_energies(self, query, processed_memory, attention_weights_cat):
37 | """
38 | PARAMS
39 | ------
40 | query: decoder output (batch, n_mel_channels * n_frames_per_step)
41 | processed_memory: processed encoder outputs (B, T_in, attention_dim)
42 | attention_weights_cat: cumulative and prev. att weights (B, 2, max_time)
43 |
44 | RETURNS
45 | -------
46 | alignment (batch, max_time)
47 | """
48 |
49 | processed_query = self.query_layer(query.unsqueeze(1))
50 | processed_attention_weights = self.location_layer(attention_weights_cat)
51 | energies = self.v(
52 | torch.tanh(processed_query + processed_attention_weights + processed_memory)
53 | )
54 |
55 | energies = energies.squeeze(-1)
56 | return energies
57 |
58 | def forward(
59 | self,
60 | attention_hidden_state,
61 | memory,
62 | processed_memory,
63 | attention_weights_cat,
64 | mask,
65 | attention_weights: Optional[torch.Tensor],
66 | ):
67 | """
68 | PARAMS
69 | ------
70 | attention_hidden_state: attention rnn last output
71 | memory: encoder outputs
72 | processed_memory: processed encoder outputs
73 |         attention_weights_cat: previous and cumulative attention weights
74 | mask: binary mask for padded data
75 | """
76 | if attention_weights is None:
77 | alignment = self.get_alignment_energies(
78 | attention_hidden_state, processed_memory, attention_weights_cat
79 | )
80 |
81 | if mask is not None:
82 | alignment.data.masked_fill_(mask, self.score_mask_value)
83 |
84 | attention_weights = F.softmax(alignment, dim=1)
85 | attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
86 | attention_context = attention_context.squeeze(1)
87 |
88 | return attention_context, attention_weights
89 |
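
A minimal shape-check sketch for this module (all dimensions below are illustrative, and it assumes the package is installed so that the LinearNorm/LocationLayer helpers in uberduck_ml_dev.models.common resolve):

    import torch
    from uberduck_ml_dev.models.components.attention import Attention

    attn = Attention(
        attention_rnn_dim=1024,
        embedding_dim=512,
        attention_dim=128,
        attention_location_n_filters=32,
        attention_location_kernel_size=31,
        fp16_run=False,
    )
    batch, enc_steps = 2, 37
    query = torch.randn(batch, 1024)              # decoder RNN state
    memory = torch.randn(batch, enc_steps, 512)   # encoder outputs
    processed_memory = attn.memory_layer(memory)  # (batch, enc_steps, attention_dim)
    weights_cat = torch.zeros(batch, 2, enc_steps)  # previous + cumulative weights
    context, weights = attn(query, memory, processed_memory, weights_cat, None, None)
    print(context.shape, weights.shape)           # (batch, 512) and (batch, enc_steps)
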
--------------------------------------------------------------------------------
/uberduck_ml_dev/text/cmudict.py:
--------------------------------------------------------------------------------
1 | __all__ = ["CMUDict", "valid_symbols"]
2 |
3 |
4 | """ from https://github.com/keithito/tacotron """
5 |
6 | import re
7 |
8 |
9 | valid_symbols = [
10 | "AA",
11 | "AA0",
12 | "AA1",
13 | "AA2",
14 | "AE",
15 | "AE0",
16 | "AE1",
17 | "AE2",
18 | "AH",
19 | "AH0",
20 | "AH1",
21 | "AH2",
22 | "AO",
23 | "AO0",
24 | "AO1",
25 | "AO2",
26 | "AW",
27 | "AW0",
28 | "AW1",
29 | "AW2",
30 | "AY",
31 | "AY0",
32 | "AY1",
33 | "AY2",
34 | "B",
35 | "CH",
36 | "D",
37 | "DH",
38 | "EH",
39 | "EH0",
40 | "EH1",
41 | "EH2",
42 | "ER",
43 | "ER0",
44 | "ER1",
45 | "ER2",
46 | "EY",
47 | "EY0",
48 | "EY1",
49 | "EY2",
50 | "F",
51 | "G",
52 | "HH",
53 | "IH",
54 | "IH0",
55 | "IH1",
56 | "IH2",
57 | "IY",
58 | "IY0",
59 | "IY1",
60 | "IY2",
61 | "JH",
62 | "K",
63 | "L",
64 | "M",
65 | "N",
66 | "NG",
67 | "OW",
68 | "OW0",
69 | "OW1",
70 | "OW2",
71 | "OY",
72 | "OY0",
73 | "OY1",
74 | "OY2",
75 | "P",
76 | "R",
77 | "S",
78 | "SH",
79 | "T",
80 | "TH",
81 | "UH",
82 | "UH0",
83 | "UH1",
84 | "UH2",
85 | "UW",
86 | "UW0",
87 | "UW1",
88 | "UW2",
89 | "V",
90 | "W",
91 | "Y",
92 | "Z",
93 | "ZH",
94 | ]
95 |
96 | _valid_symbol_set = set(valid_symbols)
97 |
98 |
99 | class CMUDict:
100 | """Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict"""
101 |
102 | def __init__(self, file_or_path, keep_ambiguous=True):
103 | if isinstance(file_or_path, str):
104 | with open(file_or_path, encoding="latin-1") as f:
105 | entries = _parse_cmudict(f)
106 | else:
107 | entries = _parse_cmudict(file_or_path)
108 | if not keep_ambiguous:
109 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
110 | self._entries = entries
111 |
112 | def __len__(self):
113 | return len(self._entries)
114 |
115 | def lookup(self, word):
116 | """Returns list of ARPAbet pronunciations of the given word."""
117 | return self._entries.get(word.upper())
118 |
119 |
120 | _alt_re = re.compile(r"\([0-9]+\)")
121 |
122 |
123 | def _parse_cmudict(file):
124 | cmudict = {}
125 | for line in file:
126 | if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
127 |             parts = line.split("  ")
128 | word = re.sub(_alt_re, "", parts[0])
129 | pronunciation = _get_pronunciation(parts[1])
130 | if pronunciation:
131 | if word in cmudict:
132 | cmudict[word].append(pronunciation)
133 | else:
134 | cmudict[word] = [pronunciation]
135 | return cmudict
136 |
137 |
138 | def _get_pronunciation(s):
139 | parts = s.strip().split(" ")
140 | for part in parts:
141 | if part not in _valid_symbol_set:
142 | return None
143 | return " ".join(parts)
144 |
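
A quick self-contained check of the wrapper, passing a file-like object instead of the real cmudict-0.7b file (the two entries below are invented but follow the standard cmudict format of word and pronunciation separated by two spaces):

    from io import StringIO
    from uberduck_ml_dev.text.cmudict import CMUDict

    fake_dict = StringIO("DUCK  D AH1 K\nDUCK(1)  D UW1 K\n")
    cmudict = CMUDict(fake_dict)
    print(len(cmudict))            # 1: both lines collapse onto the entry "DUCK"
    print(cmudict.lookup("duck"))  # ['D AH1 K', 'D UW1 K']
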
--------------------------------------------------------------------------------
/uberduck_ml_dev/e2e.py:
--------------------------------------------------------------------------------
1 | __all__ = ["tts", "rhythm_transfer"]
2 |
3 |
4 | import torch
5 |
6 | from .text.symbols import NVIDIA_TACO2_SYMBOLS
7 | from .text.utils import prepare_input_sequence
8 |
9 |
10 | from typing import List
11 |
12 | from .models.tacotron2 import Tacotron2
13 | from .vocoders.hifigan import HiFiGanGenerator
14 |
15 |
16 | def tts(
17 | lines: List[str],
18 | model,
19 | device: str,
20 | vocoder,
21 | arpabet=False,
22 | symbol_set=NVIDIA_TACO2_SYMBOLS,
23 | max_wav_value=32768.0,
24 | speaker_ids=None,
25 | ):
26 | assert isinstance(
27 | model, Tacotron2
28 | ), "Only Tacotron2 text-to-mel models are supported"
29 | assert isinstance(vocoder, HiFiGanGenerator), "Only Hifi GAN vocoders are supported"
30 | cpu_run = device == "cpu"
31 | sequences, input_lengths = prepare_input_sequence(
32 | lines, cpu_run=cpu_run, arpabet=arpabet, symbol_set=symbol_set
33 | )
34 | if speaker_ids is None:
35 | speaker_ids = torch.zeros(len(lines), dtype=torch.long, device=device)
36 | input_ = sequences, input_lengths, speaker_ids
37 | _, mel_outputs_postnet, gate_outputs, alignment, lengths = model.inference(input_)
38 | mels = mel_outputs_postnet
39 | mel = mels[0, :, : lengths[0].item()]
40 | for idx in range(1, mels.size(0)):
41 | length = lengths[idx].item()
42 | mel = torch.cat((mel, mels[idx, :, :length]), dim=-1)
43 | tensor_cls = torch.FloatTensor if device == "cpu" else torch.cuda.FloatTensor
44 | mel = mel[None, :]
45 | y_g_hat = vocoder(tensor_cls(mel).to(device=device))
46 | audio = y_g_hat.reshape(1, -1)
47 | audio = audio * max_wav_value
48 | return audio
49 |
50 |
51 | from typing import Optional
52 |
53 | from .models.common import MelSTFT
54 |
55 |
56 | @torch.no_grad()
57 | def rhythm_transfer(
58 | original_audio: torch.tensor,
59 | original_text: str,
60 | model,
61 | vocoder,
62 | device: str,
63 | symbol_set=NVIDIA_TACO2_SYMBOLS,
64 | arpabet=False,
65 | max_wav_value=32768.0,
66 | speaker_id=0,
67 | ):
68 | assert len(original_audio.shape) == 1
69 | cpu_run = device == "cpu"
70 | # TODO(zach): Support non-default STFT parameters.
71 | stft = MelSTFT()
72 | p_arpabet = float(arpabet)
73 | sequence, input_lengths, _ = prepare_input_sequence(
74 | [original_text], arpabet=arpabet, cpu_run=cpu_run, symbol_set=symbol_set
75 | )
76 | original_target_mel = stft.mel_spectrogram(original_audio[None])
77 | if not cpu_run:
78 | original_target_mel = original_target_mel.cuda()
79 | max_len = original_target_mel.size(2)
80 | speaker_ids = torch.tensor([speaker_id], dtype=torch.long, device=device)
81 | inputs = (
82 | sequence,
83 | input_lengths,
84 | original_target_mel,
85 | max_len,
86 | torch.tensor([max_len], dtype=torch.long, device=device),
87 | speaker_ids,
88 | )
89 | attn = model.get_alignment(inputs)
90 | _, mel_postnet, _, _ = model.inference_noattention(
91 | (sequence, input_lengths, speaker_ids, attn.transpose(0, 1))
92 | )
93 | y_g_hat = vocoder(torch.tensor(mel_postnet, dtype=torch.float, device=device))
94 | audio = y_g_hat.reshape(1, -1)
95 | audio = audio * max_wav_value
96 | return audio
97 |
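
A hedged end-to-end sketch of tts(); the checkpoint paths below are placeholders, and the import locations for Tacotron2 and its DEFAULTS mirror the ones used in analytics/tests/conftest.py:

    import torch

    from uberduck_ml_dev.e2e import tts
    from uberduck_ml_dev.models.tacotron2 import Tacotron2, DEFAULTS as TACOTRON2_DEFAULTS
    from uberduck_ml_dev.vocoders.hifigan import HiFiGanGenerator

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = Tacotron2(TACOTRON2_DEFAULTS)
    model.from_pretrained(warm_start_path="path/to/tacotron2.pt", device=device)  # placeholder path
    vocoder = HiFiGanGenerator(
        config="path/to/hifigan_config.json",       # placeholder path
        checkpoint="path/to/hifigan_generator.pt",  # placeholder path
        cudnn_enabled=(device == "cuda"),
    )
    audio = tts(["Hello from the duck pond."], model, device, vocoder)
    print(audio.shape)  # (1, num_samples), already scaled by max_wav_value
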
--------------------------------------------------------------------------------
/tutorials/hifigan/data_processing.py:
--------------------------------------------------------------------------------
1 | import os
2 | from scipy.io.wavfile import read, write
3 | import librosa
4 | import torch
5 | import numpy as np
6 |
7 | from uberduck_ml_dev.data.get import get
8 | from uberduck_ml_dev.data.utils import mel_spectrogram_torch, find_rel_paths
9 | from uberduck_ml_dev.data.data import HIFIGAN_DEFAULTS as DEFAULTS
10 | from uberduck_ml_dev.data.data import MAX_WAV_VALUE
11 |
12 |
13 | data_directory = "" # path to the directory containing the data
14 | ground_truth_rel_paths = find_rel_paths(directory=data_directory, filename="gt.wav")
15 | ground_truth_abs_paths = [
16 | os.path.join(data_directory, ground_truth_rel_path)
17 | for ground_truth_rel_path in ground_truth_rel_paths
18 | ]
19 |
20 |
21 | print("resampling and integer normalizing")
22 |
23 | resampled_normalized_abs_paths = [
24 |     ground_truth_abs_path.replace(
25 |         "gt.wav", "audio_resampledT_normalized32768T.wav"
26 |     )
27 |     for ground_truth_abs_path in ground_truth_abs_paths
28 | ]
29 |
30 | loading_function = lambda filename: librosa.load(filename, sr=22050)[0]
31 | processing_function = lambda x: np.asarray(
32 | (x / np.abs(x).max()) * (MAX_WAV_VALUE - 1), dtype=np.int16
33 | )
34 | saving_function = lambda data, filename: write(
35 | filename, 22050, data
36 | ) # must be in this order
37 |
38 |
39 | get(
40 | processing_function,
41 | saving_function,
42 | loading_function,
43 | ground_truth_abs_paths,
44 | resampled_normalized_abs_paths,
45 | True,
46 | )
47 |
48 | print("resampling and float normalizing")
49 |
50 | resampled_normalized_abs_paths = [
51 |     ground_truth_abs_path.replace("gt.wav", "audio_resampledT_normalized1T.wav")
52 |     for ground_truth_abs_path in ground_truth_abs_paths
53 | ]
54 |
55 | loading_function = lambda filename: librosa.load(filename, sr=22050)[0]
56 | processing_function = lambda x: np.asarray(
57 | (x / np.abs(x).max()) * (1 - 1 / MAX_WAV_VALUE), dtype=np.float32
58 | )
59 | saving_function = lambda data, filename: write(
60 | filename, 22050, data
61 | ) # must be in this order
62 |
63 |
64 | get(
65 | processing_function,
66 | saving_function,
67 | loading_function,
68 | ground_truth_abs_paths,
69 | resampled_normalized_abs_paths,
70 | True,
71 | )
72 |
73 |
74 | print("computing spectrograms from 1 normalized audio")
75 |
76 | spectrogram_abs_paths = [
77 | ground_truth_abs_path.replace("gt.wav", "spectrogram.pt")
78 | for ground_truth_abs_path in ground_truth_abs_paths
79 | ]
80 |
81 |
82 | processing_function = lambda x: mel_spectrogram_torch(
83 | x,
84 | DEFAULTS["n_fft"],
85 | DEFAULTS["num_mels"],
86 | DEFAULTS["sampling_rate"],
87 | DEFAULTS["hop_size"],
88 | DEFAULTS["win_size"],
89 | DEFAULTS["fmin"],
90 | DEFAULTS["fmax"],
91 | True,
92 | )
93 | loading_function = lambda source_path: torch.Tensor(
94 | read(source_path)[1] / MAX_WAV_VALUE
95 | ).unsqueeze(0)
96 | saving_function = lambda data, target_path: torch.save(data, target_path)
97 |
98 | get(
99 | processing_function,
100 | saving_function,
101 | loading_function,
102 | resampled_normalized_abs_paths,
103 | spectrogram_abs_paths,
104 | True,
105 | )
106 |
--------------------------------------------------------------------------------
/uberduck_ml_dev/text/letters_and_numbers.py:
--------------------------------------------------------------------------------
1 | """ adapted from https://github.com/keithito/tacotron """
2 |
3 | import re
4 |
5 | _letters_and_numbers_re = re.compile(
6 | r"((?:[a-zA-Z]+[0-9]|[0-9]+[a-zA-Z])[a-zA-Z0-9']*)", re.IGNORECASE
7 | )
8 |
9 | _hardware_re = re.compile(
10 |     r"([0-9]+(?:[.,][0-9]+)?)(?:\s?)(tb|gb|mb|kb|ghz|mhz|khz|hz|mm)", re.IGNORECASE
11 | )
12 | _hardware_key = {
13 | "tb": "terabyte",
14 | "gb": "gigabyte",
15 | "mb": "megabyte",
16 | "kb": "kilobyte",
17 | "ghz": "gigahertz",
18 | "mhz": "megahertz",
19 | "khz": "kilohertz",
20 | "hz": "hertz",
21 | "mm": "millimeter",
22 | "cm": "centimeter",
23 | "km": "kilometer",
24 | }
25 |
26 | _dimension_re = re.compile(
27 | r"\b(\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?(?:in|inch|m)?)\b|\b(\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?(?:in|inch|m)?)\b"
28 | )
29 | _dimension_key = {"m": "meter", "in": "inch", "inch": "inch"}
30 |
31 |
32 | def _expand_letters_and_numbers(m):
33 | text = re.split(r"(\d+)", m.group(0))
34 |
35 | # remove trailing space
36 | if text[-1] == "":
37 | text = text[:-1]
38 | elif text[0] == "":
39 | text = text[1:]
40 |
41 | # if not like 1920s, or AK47's , 20th, 1st, 2nd, 3rd, etc...
42 | if text[-1] in ("'s", "s", "th", "nd", "st", "rd") and text[-2].isdigit():
43 | text[-2] = text[-2] + text[-1]
44 | text = text[:-1]
45 |
46 | # for combining digits 2 by 2
47 | new_text = []
48 | for i in range(len(text)):
49 | string = text[i]
50 | if string.isdigit() and len(string) < 5:
51 | # heuristics
52 | if len(string) > 2 and string[-2] == "0":
53 | if string[-1] == "0":
54 | string = [string]
55 | else:
56 | string = [string[:-3], string[-2], string[-1]]
57 | elif len(string) % 2 == 0:
58 | string = [string[i : i + 2] for i in range(0, len(string), 2)]
59 | elif len(string) > 2:
60 | string = [string[0]] + [
61 | string[i : i + 2] for i in range(1, len(string), 2)
62 | ]
63 | new_text.extend(string)
64 | else:
65 | new_text.append(string)
66 |
67 | text = new_text
68 | text = " ".join(text)
69 | return text
70 |
71 |
72 | def _expand_hardware(m):
73 | quantity, measure = m.groups(0)
74 | measure = _hardware_key[measure.lower()]
75 | if measure[-1] != "z" and float(quantity.replace(",", "")) > 1:
76 | return "{} {}s".format(quantity, measure)
77 | return "{} {}".format(quantity, measure)
78 |
79 |
80 | def _expand_dimension(m):
81 | text = "".join([x for x in m.groups(0) if x != 0])
82 | text = text.replace(" x ", " by ")
83 | text = text.replace("x", " by ")
84 | if text.endswith(tuple(_dimension_key.keys())):
85 | if text[-2].isdigit():
86 | text = "{} {}".format(text[:-1], _dimension_key[text[-1:]])
87 | elif text[-3].isdigit():
88 | text = "{} {}".format(text[:-2], _dimension_key[text[-2:]])
89 | return text
90 |
91 |
92 | def normalize_letters_and_numbers(text):
93 | text = re.sub(_hardware_re, _expand_hardware, text)
94 | text = re.sub(_dimension_re, _expand_dimension, text)
95 | text = re.sub(_letters_and_numbers_re, _expand_letters_and_numbers, text)
96 | return text
97 |
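
For a sense of what the normalizer does end to end (the expected outputs in the comments were traced by hand from the rules above, so treat them as approximate):

    from uberduck_ml_dev.text.letters_and_numbers import normalize_letters_and_numbers

    print(normalize_letters_and_numbers("The drive holds 2tb."))
    # -> "The drive holds 2 terabytes."
    print(normalize_letters_and_numbers("Room B52 is upstairs."))
    # -> "Room B 52 is upstairs."
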
--------------------------------------------------------------------------------
/uberduck_ml_dev/models/components/partialconv1d.py:
--------------------------------------------------------------------------------
1 | # Modified partialconv source code based on implementation from
2 | # https://github.com/NVIDIA/partialconv/blob/master/models/partialconv2d.py
3 | ###############################################################################
4 | # BSD 3-Clause License
5 | #
6 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
7 | #
8 | # Author & Contact: Guilin Liu (guilinl@nvidia.com)
9 | ###############################################################################
10 |
11 | # Original Author & Contact: Guilin Liu (guilinl@nvidia.com)
12 | # Modified by Kevin Shih (kshih@nvidia.com)
13 |
14 | import torch
15 | import torch.nn.functional as F
16 | from torch import nn
17 | from typing import Tuple
18 |
19 |
20 | class PartialConv1d(nn.Conv1d):
21 | def __init__(self, *args, **kwargs):
22 | self.multi_channel = False
23 | self.return_mask = False
24 | super(PartialConv1d, self).__init__(*args, **kwargs)
25 |
26 | self.weight_maskUpdater = torch.ones(1, 1, self.kernel_size[0])
27 | self.slide_winsize = (
28 | self.weight_maskUpdater.shape[1] * self.weight_maskUpdater.shape[2]
29 | )
30 |
31 | self.last_size = (None, None, None)
32 | self.update_mask = None
33 | self.mask_ratio = None
34 |
35 | @torch.jit.ignore
36 | def forward(self, input: torch.Tensor, mask_in: torch.Tensor = None):
37 | """
38 | input: standard input to a 1D conv
39 | mask_in: binary mask for valid values, same shape as input
40 | """
41 | assert len(input.shape) == 3
42 | # if a mask is input, or tensor shape changed, update mask ratio
43 | if mask_in is not None or self.last_size != tuple(input.shape):
44 | self.last_size = tuple(input.shape)
45 | with torch.no_grad():
46 | if self.weight_maskUpdater.type() != input.type():
47 | self.weight_maskUpdater = self.weight_maskUpdater.to(input)
48 | if mask_in is None:
49 | mask = torch.ones(1, 1, input.data.shape[2]).to(input)
50 | else:
51 | mask = mask_in
52 | self.update_mask = F.conv1d(
53 | mask,
54 | self.weight_maskUpdater,
55 | bias=None,
56 | stride=self.stride,
57 | padding=self.padding,
58 | dilation=self.dilation,
59 | groups=1,
60 | )
61 | # for mixed precision training, change 1e-8 to 1e-6
62 | self.mask_ratio = self.slide_winsize / (self.update_mask + 1e-6)
63 | self.update_mask = torch.clamp(self.update_mask, 0, 1)
64 | self.mask_ratio = torch.mul(self.mask_ratio, self.update_mask)
65 | raw_out = super(PartialConv1d, self).forward(
66 | torch.mul(input, mask) if mask_in is not None else input
67 | )
68 | if self.bias is not None:
69 | bias_view = self.bias.view(1, self.out_channels, 1)
70 | output = torch.mul(raw_out - bias_view, self.mask_ratio) + bias_view
71 | output = torch.mul(output, self.update_mask)
72 | else:
73 | output = torch.mul(raw_out, self.mask_ratio)
74 |
75 | if self.return_mask:
76 | return output, self.update_mask
77 | else:
78 | return output
79 |
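
A small sketch of the masked convolution on a padded sequence; one input channel is used so that the (batch, 1, time) validity mask has the same shape as the input:

    import torch
    from uberduck_ml_dev.models.components.partialconv1d import PartialConv1d

    conv = PartialConv1d(1, 8, kernel_size=5, padding=2)
    x = torch.randn(2, 1, 100)
    mask = torch.ones(2, 1, 100)
    mask[:, :, 60:] = 0.0  # pretend the last 40 frames are padding
    y = conv(x, mask_in=mask)
    print(y.shape)  # (2, 8, 100); positions whose receptive field is all padding come out as zeros
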
--------------------------------------------------------------------------------
/uberduck_ml_dev/models/components/encoders/tacotron2.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | import torch
3 | from torch.nn import functional as F
4 |
5 | from ...common import Conv1d
6 |
7 |
8 | class Encoder(nn.Module):
9 | """Encoder module:
10 | - Three 1-d convolution banks
11 | - Bidirectional LSTM
12 | """
13 |
14 | def __init__(self, hparams):
15 | super().__init__()
16 |
17 | convolutions = []
18 | for _ in range(hparams.encoder_n_convolutions):
19 | conv_layer = nn.Sequential(
20 | Conv1d(
21 | hparams.encoder_embedding_dim,
22 | hparams.encoder_embedding_dim,
23 | kernel_size=hparams.encoder_kernel_size,
24 | stride=1,
25 | padding=int((hparams.encoder_kernel_size - 1) / 2),
26 | dilation=1,
27 | w_init_gain="relu",
28 | ),
29 | nn.BatchNorm1d(hparams.encoder_embedding_dim),
30 | )
31 | convolutions.append(conv_layer)
32 | self.convolutions = nn.ModuleList(convolutions)
33 | self.dropout_rate = 0.5
34 |
35 | self.lstm = nn.LSTM(
36 | hparams.encoder_embedding_dim,
37 | int(hparams.encoder_embedding_dim / 2),
38 | 1,
39 | batch_first=True,
40 | bidirectional=True,
41 | )
42 |
43 | def forward(self, x, input_lengths):
44 | if x.size()[0] > 1:
45 | x_embedded = []
46 | for b_ind in range(x.size()[0]): # TODO: Speed up
47 | curr_x = x[b_ind : b_ind + 1, :, : input_lengths[b_ind]].clone()
48 | for conv in self.convolutions:
49 | curr_x = F.dropout(
50 | F.relu(conv(curr_x)), self.dropout_rate, self.training
51 | )
52 | x_embedded.append(curr_x[0].transpose(0, 1))
53 | x = torch.nn.utils.rnn.pad_sequence(x_embedded, batch_first=True)
54 | else:
55 | for conv in self.convolutions:
56 | x = F.dropout(F.relu(conv(x)), self.dropout_rate, self.training)
57 | x = x.transpose(1, 2)
58 |
59 | # pytorch tensor are not reversible, hence the conversion
60 | input_lengths = input_lengths.cpu().numpy()
61 | x = nn.utils.rnn.pack_padded_sequence(
62 | x, input_lengths, batch_first=True, enforce_sorted=False
63 | )
64 |
65 | self.lstm.flatten_parameters()
66 | outputs, _ = self.lstm(x)
67 |
68 | outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
69 | return outputs
70 |
71 | def inference(self, x, input_lengths):
72 | device = x.device
73 | for conv in self.convolutions:
74 | x = F.dropout(F.relu(conv(x)), self.dropout_rate, self.training)
75 |
76 | x = x.transpose(1, 2)
77 |
78 | input_lengths = input_lengths.cpu()
79 | x = nn.utils.rnn.pack_padded_sequence(
80 | x, input_lengths, batch_first=True, enforce_sorted=False
81 | )
82 |
83 | outputs, _ = self.lstm(x)
84 |
85 | outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
86 |
87 | return outputs
88 |
89 |
90 | # NOTE (Sam): for torchscript compilation
91 | class EncoderForwardIsInfer(Encoder):
92 | def forward(self, x, input_lengths):
93 | return self.inference(x, input_lengths)
94 |
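
A rough smoke test for the encoder (the hyperparameter values here are illustrative; the real ones come from the Tacotron2 defaults):

    import torch
    from types import SimpleNamespace
    from uberduck_ml_dev.models.components.encoders.tacotron2 import Encoder

    hparams = SimpleNamespace(
        encoder_n_convolutions=3,
        encoder_embedding_dim=512,
        encoder_kernel_size=5,
    )
    encoder = Encoder(hparams)
    x = torch.randn(2, 512, 40)       # (batch, embedding_dim, max_text_length)
    lengths = torch.tensor([40, 30])
    out = encoder(x, lengths)
    print(out.shape)                  # (2, 40, 512): bidirectional LSTM outputs
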
--------------------------------------------------------------------------------
/uberduck_ml_dev/models/components/encoders/speaker/base_encoder.py:
--------------------------------------------------------------------------------
1 | # Adapted from https://github.com/coqui-ai/TTS/blob/dev/TTS/encoder/models/base_encoder.py
2 |
3 | import numpy as np
4 | import torch
5 | import torchaudio
6 |
7 | from torch import nn
8 |
9 |
10 | class PreEmphasis(nn.Module):
11 | def __init__(self, coefficient=0.97):
12 | super().__init__()
13 | self.coefficient = coefficient
14 | self.register_buffer(
15 | "filter",
16 | torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0),
17 | )
18 |
19 | def forward(self, x):
20 | assert len(x.size()) == 2
21 |
22 | x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect")
23 | return torch.nn.functional.conv1d(x, self.filter).squeeze(1)
24 |
25 |
26 | class BaseEncoder(nn.Module):
27 | """Base `encoder` class. Every new `encoder` model must inherit this.
28 |
29 | It defines common `encoder` specific functions.
30 | """
31 |
32 | # pylint: disable=W0102
33 | def __init__(self):
34 | super(BaseEncoder, self).__init__()
35 |
36 | def get_torch_mel_spectrogram_class(self, audio_config):
37 | return torch.nn.Sequential(
38 | PreEmphasis(audio_config["preemphasis"]),
39 | # TorchSTFT(
40 | # n_fft=audio_config["fft_size"],
41 | # hop_length=audio_config["hop_length"],
42 | # win_length=audio_config["win_length"],
43 | # sample_rate=audio_config["sample_rate"],
44 | # window="hamming_window",
45 | # mel_fmin=0.0,
46 | # mel_fmax=None,
47 | # use_htk=True,
48 | # do_amp_to_db=False,
49 | # n_mels=audio_config["num_mels"],
50 | # power=2.0,
51 | # use_mel=True,
52 | # mel_norm=None,
53 | # )
54 | torchaudio.transforms.MelSpectrogram(
55 | sample_rate=audio_config["sample_rate"],
56 | n_fft=audio_config["fft_size"],
57 | win_length=audio_config["win_length"],
58 | hop_length=audio_config["hop_length"],
59 | window_fn=torch.hamming_window,
60 | n_mels=audio_config["num_mels"],
61 | ),
62 | )
63 |
64 | @torch.no_grad()
65 | def inference(self, x, l2_norm=True):
66 | return self.forward(x, l2_norm)
67 |
68 | @torch.no_grad()
69 | def compute_embedding(
70 | self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True
71 | ):
72 | """
73 | Generate embeddings for a batch of utterances
74 | x: 1xTxD
75 | """
76 | # map to the waveform size
77 | if self.use_torch_spec:
78 | num_frames = num_frames * self.audio_config["hop_length"]
79 |
80 | max_len = x.shape[1]
81 |
82 | if max_len < num_frames:
83 | num_frames = max_len
84 |
85 | offsets = np.linspace(0, max_len - num_frames, num=num_eval)
86 |
87 | frames_batch = []
88 | for offset in offsets:
89 | offset = int(offset)
90 | end_offset = int(offset + num_frames)
91 | frames = x[:, offset:end_offset]
92 | frames_batch.append(frames)
93 |
94 | frames_batch = torch.cat(frames_batch, dim=0)
95 | embeddings = self.inference(frames_batch, l2_norm=l2_norm)
96 |
97 | if return_mean:
98 | embeddings = torch.mean(embeddings, dim=0, keepdim=True)
99 | return embeddings
100 |
--------------------------------------------------------------------------------
/uberduck_ml_dev/data/statistics.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 | "word_frequencies",
3 | "create_wordcloud",
4 | "count_frequency",
5 | "pace_character",
6 | "pace_phoneme",
7 | "get_sample_format",
8 | "AbsoluteMetrics",
9 | ]
10 |
11 | from typing import List, Any, Dict, Union, Optional
12 | from collections import Counter
13 | import os
14 |
15 | import librosa
16 | import numpy as np
17 | from pydub.utils import mediainfo_json
18 | from wordfreq import word_frequency
19 |
20 | from ..text.utils import text_to_sequence
21 |
22 | # NOTE (Sam): this file could be refactored so that it doesn't contain both speechmetrics and word frequencies - very different types of statistics.
23 |
24 |
25 | def word_frequencies(text: str, language: str = "en") -> List[float]:
26 | """
27 |     Calculate the frequency [0-1] with which each word appears in the English language.
28 | """
29 | freqs = []
30 | for word in text.split():
31 | freqs.append(word_frequency(word, language))
32 | return freqs
33 |
34 |
35 | def count_frequency(arr: List[Any]) -> Dict[Any, int]:
36 | """
37 | Calculates the frequency that a value appears in a list
38 | """
39 | return dict(Counter(arr).most_common())
40 |
41 |
42 | def pace_character(
43 | text: str, audio: Union[str, np.ndarray], sr: Optional[int] = None
44 | ) -> float:
45 | """
46 | Calculates the number of characters in the text per second of the audio file. Audio can be a file path or an np array.
47 | """
48 | if isinstance(audio, str):
49 | audio, sr = librosa.load(audio, sr=None)
50 | else:
51 | assert sr, "Sampling rate must be provided if audio is np array"
52 |
53 |     return len(text) / librosa.get_duration(y=audio, sr=sr)
54 |
55 |
56 | def pace_phoneme(
57 | text: str, audio: Union[str, np.ndarray], sr: Optional[int] = None
58 | ) -> float:
59 | """
60 | Calculates the number of phonemes in the text per second of the audio. Audio can be a file path or an np array.
61 | """
62 | if isinstance(audio, str):
63 | audio, sr = librosa.load(audio, sr=None)
64 | else:
65 | assert sr, "Sampling rate must be provided if audio is np array"
66 |
67 | arpabet_seq = text_to_sequence(text, ["english_cleaners"], p_arpabet=1.0)
68 |     return len(arpabet_seq) / librosa.get_duration(y=audio, sr=sr)
69 |
70 |
71 | def get_sample_format(wav_file: str):
72 | """
73 | Get sample format of the .wav file: https://trac.ffmpeg.org/wiki/audio%20types
74 | """
75 | filename, file_extension = os.path.splitext(wav_file)
76 | assert file_extension == ".wav", ".wav file must be supplied"
77 |
78 | info = mediainfo_json(wav_file)
79 | audio_streams = [x for x in info["streams"] if x["codec_type"] == "audio"]
80 | return audio_streams[0].get("sample_fmt")
81 |
82 |
83 | class AbsoluteMetrics:
84 | """This class loads and calculates the absolute metrics, MOSNet and SRMR"""
85 |
86 | def __init__(self, window_length: Optional[int] = None):
87 | # NOTE(zach): There are some problems installing speechmetrics via pip and it's not critical, so import inline to avoid issues in CI.
88 | import speechmetrics
89 |
90 | self.metrics = speechmetrics.load("absolute", window_length)
91 |
92 | def __call__(self, wav_file: str) -> Dict[str, float]:
93 | """
94 | Returns a Dict[str,float] with keys "mosnet" and "srmr"
95 | """
96 | filename, file_extension = os.path.splitext(wav_file)
97 | assert file_extension == ".wav", ".wav file must be supplied"
98 |
99 | return self.metrics(wav_file)
100 |
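
Two of the lighter-weight helpers in use (the audio here is one second of silence, so the character pace is simply the character count):

    import numpy as np
    from uberduck_ml_dev.data.statistics import count_frequency, pace_character

    print(count_frequency(["duck", "goose", "duck"]))       # {'duck': 2, 'goose': 1}

    audio = np.zeros(22050, dtype=np.float32)               # 1 s at 22.05 kHz
    print(pace_character("hello there", audio, sr=22050))   # 11.0 characters per second
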
--------------------------------------------------------------------------------
/analytics/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import pytest
3 | import os
4 | import tempfile
5 |
6 | import torch
7 |
8 | import requests
9 |
10 | from uberduck_ml_dev.models.tacotron2 import DEFAULTS as TACOTRON2_DEFAULTS
11 | from uberduck_ml_dev.models.tacotron2 import Tacotron2
12 | from uberduck_ml_dev.trainer.tacotron2 import (
13 | Tacotron2Trainer,
14 | DEFAULTS as TACOTRON2_TRAINER_DEFAULTS,
15 | )
16 | from uberduck_ml_dev.vendor.tfcompat.hparam import HParams
17 |
18 |
19 | # NOTE (Sam): move to Tacotron2 model and remove from Uberduck repo.
20 | def _load_tacotron_uninitialized(overrides=None):
21 | overrides = overrides or {}
22 | defaults = dict(**TACOTRON2_DEFAULTS.values())
23 | defaults.update(overrides)
24 | hparams = HParams(**defaults)
25 | return Tacotron2(hparams)
26 |
27 |
28 | @pytest.fixture(scope="session")
29 | def lj_speech_tacotron2_file():
30 | tf = tempfile.NamedTemporaryFile(suffix=".pt")
31 | # tf.close()
32 | # NOTE (Sam): A canonical LJ statedict used in our warm starting notebook.
33 | url_ = "https://uberduck-demo.s3.us-west-2.amazonaws.com/tacotron2_statedict_lj_test.pt"
34 | res = requests.get(url_)
35 | if res.status_code == 200: # http 200 means success
36 | with open(tf.name, "wb") as file_handle: # wb means Write Binary
37 | file_handle.write(res.content)
38 |
39 | return tf
40 |
41 |
42 | @pytest.fixture
43 | def lj_speech_tacotron2(lj_speech_tacotron2_file):
44 | # NOTE (Sam): this override should no longer be necessary.
45 | device = "cpu"
46 | config_overrides = {}
47 | config_overrides["cudnn_enabled"] = device != "cpu"
48 | _model = _load_tacotron_uninitialized(config_overrides)
49 | checkpoint = torch.load(lj_speech_tacotron2_file.name, map_location=device)
50 | _model.from_pretrained(model_dict=checkpoint["state_dict"], device=device)
51 |
52 | return _model
53 |
54 |
55 | @pytest.fixture
56 | def sample_inference_spectrogram():
57 | # NOTE (Sam): made in Uberduck container using current test code in test_stft_seed.
58 | inference_spectrogram = torch.load(
59 | os.path.join(os.path.dirname(__file__), "fixtures/sample_spectrogram.pt")
60 | )
61 | return inference_spectrogram
62 |
63 |
64 | @pytest.fixture
65 | def sample_inference_tf_spectrogram():
66 | # NOTE (Sam): made with above at timestep 111 and text = "I, Sam, am a very bad boy."
67 | inference_spectrogram = torch.load(
68 | os.path.join(os.path.dirname(__file__), "fixtures/sample_spectrogram_tf.pt")
69 | )
70 |
71 | return inference_spectrogram
72 |
73 |
74 | @pytest.fixture()
75 | def lj_trainer(lj_speech_tacotron2_file):
76 | # NOTE (Sam): It may be nicer to specify trainer here and test-specific parameters (e.g. data) in test itself.
77 | config = TACOTRON2_TRAINER_DEFAULTS.values()
78 | params = dict(
79 | warm_start_name=lj_speech_tacotron2_file.name,
80 | training_audiopaths_and_text=os.path.join(
81 | os.path.dirname(__file__), "fixtures/ljtest/list_small.txt"
82 | ),
83 | val_audiopaths_and_text=os.path.join(
84 | os.path.dirname(__file__), "fixtures/ljtest/list_small.txt"
85 | ),
86 | checkpoint_name="test",
87 | checkpoint_path="test_checkpoint",
88 | epochs=3,
89 | log_dir="",
90 | debug=True,
91 | batch_size=4,
92 | learning_rate=1e-4,
93 |         # NOTE (Sam): this affects the reduction in loss in the gradient descent,
94 | # so we need a separate test of validation and logging code.
95 | is_validate=False,
96 | )
97 | config.update(params)
98 | hparams = HParams(**config)
99 |
100 | trainer = Tacotron2Trainer(hparams, rank=0, world_size=1)
101 |
102 | return trainer
103 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from pkg_resources import parse_version
2 | from configparser import ConfigParser
3 | import setuptools, re, sys
4 |
5 | assert parse_version(setuptools.__version__) >= parse_version("36.2")
6 |
7 | # note: all settings are in settings.ini; edit there, not here
8 | config = ConfigParser(delimiters=["="])
9 | config.read("settings.ini")
10 | cfg = config["DEFAULT"]
11 |
12 | cfg_keys = "version description keywords author author_email".split()
13 | expected = (
14 | cfg_keys
15 | + "lib_name user branch license status min_python audience language".split()
16 | )
17 | for o in expected:
18 | assert o in cfg, "missing expected setting: {}".format(o)
19 | setup_cfg = {o: cfg[o] for o in cfg_keys}
20 |
21 | if len(sys.argv) > 1 and sys.argv[1] == "version":
22 | print(setup_cfg["version"])
23 | exit()
24 |
25 | licenses = {
26 | "apache2": (
27 | "Apache Software License 2.0",
28 | "OSI Approved :: Apache Software License",
29 | ),
30 | "mit": ("MIT License", "OSI Approved :: MIT License"),
31 | "gpl2": (
32 | "GNU General Public License v2",
33 | "OSI Approved :: GNU General Public License v2 (GPLv2)",
34 | ),
35 | "gpl3": (
36 | "GNU General Public License v3",
37 | "OSI Approved :: GNU General Public License v3 (GPLv3)",
38 | ),
39 | "bsd3": ("BSD License", "OSI Approved :: BSD License"),
40 | }
41 | statuses = [
42 | "1 - Planning",
43 | "2 - Pre-Alpha",
44 | "3 - Alpha",
45 | "4 - Beta",
46 | "5 - Production/Stable",
47 | "6 - Mature",
48 | "7 - Inactive",
49 | ]
50 | py_versions = (
51 | "2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 3.0 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8".split()
52 | )
53 |
54 | lic = licenses.get(cfg["license"].lower(), (cfg["license"], None))
55 | min_python = cfg["min_python"]
56 |
57 | requirements = ["pip", "packaging"]
58 | if cfg.get("requirements"):
59 | requirements += cfg.get("requirements", "").split()
60 | if cfg.get("pip_requirements"):
61 | requirements += cfg.get("pip_requirements", "").split()
62 | dev_requirements = (cfg.get("dev_requirements") or "").split()
63 |
64 | long_description = open("README.md", encoding="utf-8").read()
65 | # 
66 | for ext in ["png", "svg"]:
67 | long_description = re.sub(
68 | r"!\[" + ext + "\]\((.*)\)",
69 | "
73 | + "/"
74 | + cfg["branch"]
75 | + "/\\1)",
76 | long_description,
77 | )
78 | long_description = re.sub(
79 | r"src=\"(.*)\." + ext + '"',
80 | 'src="https://raw.githubusercontent.com/{}/{}'.format(
81 | cfg["user"], cfg["lib_name"]
82 | )
83 | + "/"
84 | + cfg["branch"]
85 | + "/\\1."
86 | + ext
87 | + '"',
88 | long_description,
89 | )
90 |
91 | setuptools.setup(
92 | name=cfg["lib_name"],
93 | license=lic[0],
94 | classifiers=[
95 | "Development Status :: " + statuses[int(cfg["status"])],
96 | "Intended Audience :: " + cfg["audience"].title(),
97 | "Natural Language :: " + cfg["language"].title(),
98 | ]
99 | + [
100 | "Programming Language :: Python :: " + o
101 | for o in py_versions[py_versions.index(min_python) :]
102 | ]
103 | + (["License :: " + lic[1]] if lic[1] else []),
104 | url=cfg["git_url"],
105 | packages=setuptools.find_packages(),
106 | include_package_data=True,
107 | package_data={
108 | "": ["uberduck_ml_dev/text/heteronyms", "uberduck_ml_dev/text/cmudict-0.7b"]
109 | },
110 | install_requires=requirements,
111 | extras_require={"dev": dev_requirements},
112 | python_requires=">=" + cfg["min_python"],
113 | long_description=long_description,
114 | long_description_content_type="text/markdown",
115 | zip_safe=False,
116 | entry_points={"console_scripts": cfg.get("console_scripts", "").split()},
117 | **setup_cfg
118 | )
119 |
--------------------------------------------------------------------------------
/uberduck_ml_dev/data/get.py:
--------------------------------------------------------------------------------
1 | from torch.utils.data import DataLoader
2 | import librosa
3 | from pathlib import Path
4 | from tqdm import tqdm
5 | import torch
6 | import os
7 |
8 | from ..data.data import DataMel, DataPitch
9 | from ..data.collate import CollateBlank
10 | from ..data.processor import Processor
11 |
12 |
13 | def get_parallel_torch(data):
14 | data_loader = DataLoader(
15 | data, batch_size=32, collate_fn=CollateBlank(), num_workers=8
16 | )
17 | for batch in data_loader:
18 | pass
19 |
20 |
21 | # TODO (Sam): use get_parallel_torch to reduce boilerplate.
22 | # NOTE (Sam): assumes data is in a directory structure like:
23 | # /tmp/{uuid}/resampled_normalized.wav
24 | # These functions add spectrogram.pt, f0.pt, and coqui_resnet_512_emb.pt to each file-specific directory.
25 | def get_mels(paths, data_config, target_paths):
26 | data = DataMel(audiopaths=paths, data_config=data_config, target_paths=target_paths)
27 |
28 | collate_fn = CollateBlank()
29 |
30 | data_loader = DataLoader(
31 | data,
32 | batch_size=32,
33 | collate_fn=collate_fn,
34 | )
35 | for batch in data_loader:
36 | pass # computes in loader.
37 |
38 |
39 | # NOTE (Sam): pitch, pitchf == f0 coarse, f0bak in rvc parlance.
40 | # NOTE (Sam): sample_rate is also passed as part of data_config
41 | # TODO (Sam): decide on sample_rate v sampling_rate
42 | # NOTE (Sam): pyin (radtts) and parselmouth (rvc) methods seem to generate pitches of different lengths.
43 | def get_pitches(
44 | paths,
45 | data_config=None,
46 | target_folders=None,
47 | method="parselmouth",
48 | sample_rate=None,
49 | recompute=False,
50 | ):
51 | data = DataPitch(
52 | audiopaths=paths,
53 | data_config=data_config,
54 | target_folders=target_folders,
55 | method=method,
56 | sample_rate=data_config["sampling_rate"],
57 | recompute=recompute,
58 | )
59 | get_parallel_torch(data)
60 |
61 |
62 | HUBERT_PATH = "hubert_embedding.pt"
63 | F0_PATH = "f0.pt"
64 | F0F_PATH = "f0f.pt"
65 |
66 |
67 | # NOTE (Sam): this is different from the other get functions because it doesn't use torch dataset.
68 | def get_hubert_embeddings(
69 | audiopaths, hubert_model, output_layer=9, hubert_path=HUBERT_PATH
70 | ):
71 | """Returns the abs path w.r.t penultimate directory name in audiopaths, e.g. suitable for /tmp/{uuid}/resampled_normalized.wav."""
72 | hubert_abs_paths = []
73 | for audiopath in tqdm(audiopaths):
74 | folder_path = str(Path(*Path(audiopath).parts[:-1]))
75 | hubert_abs_path = os.path.join(folder_path, hubert_path)
76 | # TODO (Sam): add hashing to avoid mistakenly not recomputing.
77 | if not os.path.exists(hubert_abs_path):
78 | # NOTE (Sam): Hubert expects 16k sample rate.
79 | audio0, sr = librosa.load(audiopath, sr=16000)
80 | feats = torch.from_numpy(audio0)
81 | feats = feats.float()
82 | feats = feats.view(1, -1)
83 | padding_mask = torch.BoolTensor(feats.shape).to("cpu").fill_(False)
84 | inputs = {
85 | "source": feats.to("cpu"),
86 | "padding_mask": padding_mask,
87 | "output_layer": output_layer,
88 | }
89 |
90 | with torch.no_grad():
91 | logits = hubert_model.extract_features(**inputs)
92 | feats = hubert_model.final_proj(logits[0])
93 | torch.save(feats[0], hubert_abs_path)
94 |
95 | hubert_abs_paths.append(hubert_abs_path)
96 |
97 | return hubert_abs_paths
98 |
99 |
100 | def get(
101 | processing_function,
102 | saving_function,
103 | loading_function,
104 | source_paths,
105 | target_paths,
106 | recompute,
107 | ):
108 | function_ = lambda source_path, target_path: saving_function(
109 | processing_function(loading_function(source_path)), target_path
110 | )
111 | processor = Processor(
112 | function_=function_,
113 | source_paths=source_paths,
114 | target_paths=target_paths,
115 | recompute=recompute,
116 | )
117 |
118 | get_parallel_torch(processor)
119 |
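
A sketch of the generic get() helper using the same lambda style as tutorials/hifigan/data_processing.py; the paths are placeholders and must point at tensors that already exist on disk:

    import torch
    from uberduck_ml_dev.data.get import get

    source_paths = ["/tmp/example/a.pt", "/tmp/example/b.pt"]            # placeholder paths
    target_paths = ["/tmp/example/a_doubled.pt", "/tmp/example/b_doubled.pt"]

    get(
        lambda x: x * 2,                            # processing_function
        lambda data, path: torch.save(data, path),  # saving_function
        lambda path: torch.load(path),              # loading_function
        source_paths,
        target_paths,
        True,                                       # recompute
    )
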
--------------------------------------------------------------------------------
/uberduck_ml_dev/models/components/encoders/resnet_speaker_encoder.py:
--------------------------------------------------------------------------------
1 | # NOTE (Sam): this is the only component in this repository under copyleft license (Coqui / Mozilla).
2 |
3 | from io import BytesIO
4 | import os
5 | import requests
6 | import json
7 |
8 | from scipy.io.wavfile import read
9 | import torch
10 |
11 | # TODO (Sam): eliminate redundancy.
12 | from .speaker.resnet import ResNetSpeakerEncoder
13 |
14 | DEFAULT_AUDIO_CONFIG = {
15 | "fft_size": 512,
16 | "win_length": 400,
17 | "hop_length": 160,
18 | "frame_shift_ms": None,
19 | "frame_length_ms": None,
20 | "stft_pad_mode": "reflect",
21 | "sample_rate": 22050,
22 | "resample": False,
23 | "preemphasis": 0.97,
24 | "ref_level_db": 20,
25 | "do_sound_norm": False,
26 | "do_trim_silence": False,
27 | "trim_db": 60,
28 | "power": 1.5,
29 | "griffin_lim_iters": 60,
30 | "num_mels": 64,
31 | "mel_fmin": 0.0,
32 | "mel_fmax": 8000.0,
33 | "spec_gain": 20,
34 | "signal_norm": False,
35 | "min_level_db": -100,
36 | "symmetric_norm": False,
37 | "max_norm": 4.0,
38 | "clip_norm": False,
39 | "stats_path": None,
40 | "do_rms_norm": True,
41 | "db_level": -27.0,
42 | }
43 |
44 |
45 | def get_pretrained_model(
46 | config_url=None, model_url=None, config_path=None, model_path=None
47 | ):
48 | assert not ((config_url is not None) and (config_path is not None))
49 | assert not ((model_url is not None) and (model_path is not None))
50 |
51 | if config_path is None:
52 | print("Getting model config...")
53 | if config_url is None:
54 | config_url = os.environ["RESNET_SE_CONFIG_URL"]
55 | response = requests.get(config_url)
56 | resnet_config = response.json()
57 | else:
58 | with open(config_path) as f:
59 | resnet_config = json.load(f)
60 | model_params = resnet_config["model_params"]
61 | if "model_name" in model_params:
62 | del model_params["model_name"]
63 |
64 | audio_config = dict(resnet_config["audio"])
65 | audio_config["sample_rate"] = 22050
66 | model = ResNetSpeakerEncoder(**model_params, audio_config=audio_config)
67 | print("Loading pretrained model...")
68 | load_pretrained(model, model_url=model_url, model_path=model_path)
69 | print("Got pretrained model...")
70 | model.eval()
71 | return model
72 |
73 |
74 | def load_pretrained(model, model_url=None, model_path=None):
75 | assert not ((model_url is not None) and (model_path is not None))
76 | if model_path is not None:
77 | loaded = torch.load(model_path)
78 | else:
79 | if model_url is None:
80 | model_url = os.environ["RESNET_SE_MODEL_URL"]
81 | response = requests.get(model_url, stream=True)
82 | bio = BytesIO(response.content)
83 | loaded = torch.load(bio)
84 | model.load_state_dict(loaded["model"])
85 |
86 |
87 | class ResNetSpeakerEncoderCallable:
88 | def __init__(self, model_path: str, config_path: str):
89 | print("initializing resnet speaker encoder")
90 | with open(config_path) as f:
91 | resnet_config = json.load(f)
92 |
93 | state_dict = torch.load(model_path)["model"]
94 | audio_config = dict(resnet_config["audio"])
95 | model_params = resnet_config["model_params"]
96 | if "model_name" in model_params:
97 | del model_params["model_name"]
98 |
99 | self.device = "cuda"
100 | self.model = ResNetSpeakerEncoder(**model_params, audio_config=audio_config)
101 | self.model.load_state_dict(state_dict)
102 | self.model.eval()
103 | self.model.cuda()
104 |
105 | # NOTE (Sam): might have to accept bytes input for anyscale distributed data loading?
106 | def __call__(self, audiopaths):
107 | print("calling resnet speaker encoder")
108 | for audiopath in audiopaths:
109 | audio_data = read(audiopath)[1]
110 | datum = torch.FloatTensor(audio_data).unsqueeze(-1).t().cuda()
111 | # datum = torch.FloatTensor(audio_data).unsqueeze(-1).t()
112 | emb = self.model(datum)
113 | emb = emb.cpu().detach().numpy()
114 | yield {"audio_embedding": emb}
115 |
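116 | 
117 | # Usage sketch (illustrative, not part of the original module): the paths below are placeholders.
118 | # get_pretrained_model also accepts config_url/model_url, and falls back to the
119 | # RESNET_SE_CONFIG_URL / RESNET_SE_MODEL_URL environment variables when neither is given.
120 | # model = get_pretrained_model(
121 | #     config_path="/path/to/resnet_se_config.json",
122 | #     model_path="/path/to/resnet_se_model.pt",
123 | # )
124 | # encoder = ResNetSpeakerEncoderCallable(
125 | #     model_path="/path/to/resnet_se_model.pt",
126 | #     config_path="/path/to/resnet_se_config.json",
127 | # )
128 | # embeddings = list(encoder(["/path/to/audio.wav"]))  # [{"audio_embedding": ...}]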
--------------------------------------------------------------------------------
/uberduck_ml_dev/text/cleaners.py:
--------------------------------------------------------------------------------
1 | """ adapted from https://github.com/keithito/tacotron """
2 |
3 | """
4 | Cleaners are transformations that run over the input text at both training and eval time.
5 |
6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use:
8 | 1. "english_cleaners" for English text
9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode)
11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
12 | the symbols in symbols.py to match your data).
13 | """
14 |
15 | import re
16 | from string import punctuation
17 | from functools import reduce
18 | from unidecode import unidecode
19 | from .numerical import normalize_numbers, normalize_currency
20 | from .acronyms import AcronymNormalizer
21 | from .datestime import normalize_datestime
22 | from .letters_and_numbers import normalize_letters_and_numbers
23 | from .abbreviations import normalize_abbreviations
24 |
25 |
26 | # Regular expression matching whitespace:
27 | _whitespace_re = re.compile(r"\s+")
28 |
29 | # Regular expression separating words enclosed in curly braces for cleaning
30 | _arpa_re = re.compile(r"{[^}]+}|\S+")
31 |
32 |
33 | def expand_abbreviations(text):
34 | return normalize_abbreviations(text)
35 |
36 |
37 | def expand_numbers(text):
38 | return normalize_numbers(text)
39 |
40 |
41 | def expand_currency(text):
42 | return normalize_currency(text)
43 |
44 |
45 | def expand_datestime(text):
46 | return normalize_datestime(text)
47 |
48 |
49 | def expand_letters_and_numbers(text):
50 | return normalize_letters_and_numbers(text)
51 |
52 |
53 | def lowercase(text):
54 | return text.lower()
55 |
56 |
57 | def collapse_whitespace(text):
58 | return re.sub(_whitespace_re, " ", text)
59 |
60 |
61 | def separate_acronyms(text):
62 | text = re.sub(r"([0-9]+)([a-zA-Z]+)", r"\1 \2", text)
63 | text = re.sub(r"([a-zA-Z]+)([0-9]+)", r"\1 \2", text)
64 | return text
65 |
66 |
67 | def convert_to_ascii(text):
68 | return unidecode(text)
69 |
70 |
71 | def dehyphenize_compound_words(text):
72 | text = re.sub(r"(?<=[a-zA-Z0-9])-(?=[a-zA-Z])", " ", text)
73 | return text
74 |
75 |
76 | def remove_space_before_punctuation(text):
77 | return re.sub(r"\s([{}](?:\s|$))".format(punctuation), r"\1", text)
78 |
79 |
80 | class Cleaner(object):
81 | def __init__(self, cleaner_names, phonemedict):
82 | self.cleaner_names = cleaner_names
83 | self.phonemedict = phonemedict
84 | self.acronym_normalizer = AcronymNormalizer(self.phonemedict)
85 |
86 | def __call__(self, text):
87 | for cleaner_name in self.cleaner_names:
88 | sequence_fns, word_fns = self.get_cleaner_fns(cleaner_name)
89 | for fn in sequence_fns:
90 | text = fn(text)
91 |
92 | text = [
93 | reduce(lambda x, y: y(x), word_fns, split) if split[0] != "{" else split
94 | for split in _arpa_re.findall(text)
95 | ]
96 | text = " ".join(text)
97 | text = remove_space_before_punctuation(text)
98 | return text
99 |
100 | def get_cleaner_fns(self, cleaner_name):
101 | if cleaner_name == "basic_cleaners":
102 | sequence_fns = [lowercase, collapse_whitespace]
103 | word_fns = []
104 | elif cleaner_name == "english_cleaners":
105 | sequence_fns = [collapse_whitespace, convert_to_ascii, lowercase]
106 | word_fns = [expand_numbers, expand_abbreviations]
107 | elif cleaner_name == "radtts_cleaners":
108 | sequence_fns = [
109 | collapse_whitespace,
110 | expand_currency,
111 | expand_datestime,
112 | expand_letters_and_numbers,
113 | ]
114 | word_fns = [expand_numbers, expand_abbreviations]
115 | elif cleaner_name == "transliteration_cleaners":
116 | sequence_fns = [convert_to_ascii, lowercase, collapse_whitespace]
117 | word_fns = []
118 | else:
119 | raise Exception("{} cleaner not supported".format(cleaner_name))
120 | 
121 | return sequence_fns, word_fns
122 | 
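123 | 
124 | # Usage sketch (illustrative, not part of the original module): build a Cleaner and run it on a
125 | # line of text. The phoneme dictionary below is a placeholder; in practice it comes from cmudict.
126 | # cleaner = Cleaner(["english_cleaners"], phonemedict={})
127 | # print(cleaner("Dr. Smith paid $12 in 2021 ."))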
--------------------------------------------------------------------------------
/tutorials/radtts/radtts_data_processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "0e3c74a5",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "# For computing normalized audio, spectrograms, and pitches\n",
11 | "import os\n",
12 | "from uberduck_ml_dev.data.get import get_mels, get_pitches\n",
13 | "from uberduck_ml_dev.data.data import RADTTS_DEFAULTS as data_config\n",
14 | "\n",
15 | "from uberduck_ml_dev.data.get import get\n",
16 | "import librosa\n",
17 | "import numpy as np\n",
18 | "from scipy.io.wavfile import write\n",
19 | "from datetime import datetime"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 11,
25 | "id": "2710441c",
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "# data_dir = \"/path/to/data\"\n",
30 | "data_dir = \"/usr/src/app/uberduck_ml_dev/tutorials/radtts/lj/LJSpeech/\""
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 15,
36 | "id": "5cdc25fe",
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "def find_rel_paths(directory, filename):\n",
41 | " for root, dirs, files in os.walk(directory):\n",
42 | " if filename in files:\n",
43 | " yield os.path.relpath(os.path.join(root, filename), directory)\n",
44 | "\n",
45 | "filename = 'gt.wav' # replace with your filename\n",
46 | "rel_path_list = list(find_rel_paths(data_dir, filename))"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 24,
52 | "id": "d9f989f6",
53 | "metadata": {},
54 | "outputs": [],
55 | "source": [
56 | "local_path_list = [os.path.join(data_dir, rel_path) for rel_path in rel_path_list]\n",
57 | "resampled_normalized_path_list = [os.path.join(data_dir, \n",
58 | " local_path.split('gt.wav')[0],\n",
59 | " 'audio_resampledT_normalized32768T.wav') \n",
60 | " for local_path in local_path_list]\n",
61 | "spectrogram_path_list = [os.path.join(data_dir, local_path.split('gt.wav')[0],\n",
62 | " 'spectrogram.pt') \n",
63 | " for local_path in local_path_list]\n",
64 | "folder_path_list = [os.path.join(data_dir, local_path.split('gt.wav')[0]) \n",
65 | " for local_path in local_path_list]"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "id": "f5ce0f25",
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "print(\"resample and normalize\")\n",
76 | "MAX_WAV_VALUE = 32768\n",
77 | "sr = 22050\n",
78 | "loading_function = lambda filename : librosa.load(filename, sr = 22050)[0]\n",
79 | "function_ = lambda x : np.asarray((x / np.abs(x).max()) * (MAX_WAV_VALUE - 1), dtype = np.int16)\n",
80 | "saving_function = lambda data, filename : write(filename, 22050, data) # must be in this order\n",
81 | "\n",
82 | "print(datetime.now())\n",
83 | "get(function_, loading_function, saving_function, local_path_list, resampled_normalized_path_list, False)\n",
84 | "print(datetime.now())"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "id": "ab2d5894",
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "print(\"compute pitches\")\n",
95 | "get_pitches(resampled_normalized_path_list, data_config, folder_path_list, method = 'radtts')"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "id": "08e86d85",
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "print(\"compute mels\")\n",
106 | "get_mels(resampled_normalized_path_list, data_config, spectrogram_path_list)"
107 | ]
108 | }
109 | ],
110 | "metadata": {
111 | "kernelspec": {
112 | "display_name": "Python 3",
113 | "language": "python",
114 | "name": "python3"
115 | },
116 | "language_info": {
117 | "codemirror_mode": {
118 | "name": "ipython",
119 | "version": 3
120 | },
121 | "file_extension": ".py",
122 | "mimetype": "text/x-python",
123 | "name": "python",
124 | "nbconvert_exporter": "python",
125 | "pygments_lexer": "ipython3",
126 | "version": "3.8.10"
127 | }
128 | },
129 | "nbformat": 4,
130 | "nbformat_minor": 5
131 | }
132 |
--------------------------------------------------------------------------------
/uberduck_ml_dev/optimizers/radam.py:
--------------------------------------------------------------------------------
1 | # Original source taken from https://github.com/LiyuanLucasLiu/RAdam
2 | #
3 | # Copyright 2019 Liyuan Liu
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | import math
17 |
18 | import torch
19 |
20 | # pylint: disable=no-name-in-module
21 | from torch.optim.optimizer import Optimizer
22 |
23 |
24 | class RAdam(Optimizer):
25 | """RAdam optimizer"""
26 |
27 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
28 | """
29 | Init
30 |
31 | :param params: parameters to optimize
32 | :param lr: learning rate
33 | :param betas: beta
34 | :param eps: numerical precision
35 | :param weight_decay: weight decay weight
36 | """
37 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
38 | self.buffer = [[None, None, None] for _ in range(10)]
39 | super().__init__(params, defaults)
40 |
41 | def step(self, closure=None):
42 | loss = None
43 | if closure is not None:
44 | loss = closure()
45 |
46 | for group in self.param_groups:
47 | for p in group["params"]:
48 | if p.grad is None:
49 | continue
50 | grad = p.grad.data.float()
51 | if grad.is_sparse:
52 | raise RuntimeError("RAdam does not support sparse gradients")
53 |
54 | p_data_fp32 = p.data.float()
55 |
56 | state = self.state[p]
57 |
58 | if len(state) == 0:
59 | state["step"] = 0
60 | state["exp_avg"] = torch.zeros_like(p_data_fp32)
61 | state["exp_avg_sq"] = torch.zeros_like(p_data_fp32)
62 | else:
63 | state["exp_avg"] = state["exp_avg"].type_as(p_data_fp32)
64 | state["exp_avg_sq"] = state["exp_avg_sq"].type_as(p_data_fp32)
65 |
66 | exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
67 | beta1, beta2 = group["betas"]
68 |
69 | exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
70 | exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
71 |
72 | state["step"] += 1
73 | buffered = self.buffer[int(state["step"] % 10)]
74 | if state["step"] == buffered[0]:
75 | N_sma, step_size = buffered[1], buffered[2]
76 | else:
77 | buffered[0] = state["step"]
78 | beta2_t = beta2 ** state["step"]
79 | N_sma_max = 2 / (1 - beta2) - 1
80 | N_sma = N_sma_max - 2 * state["step"] * beta2_t / (1 - beta2_t)
81 | buffered[1] = N_sma
82 |
83 | # more conservative since it's an approximated value
84 | if N_sma >= 5:
85 | step_size = (
86 | group["lr"]
87 | * math.sqrt(
88 | (1 - beta2_t)
89 | * (N_sma - 4)
90 | / (N_sma_max - 4)
91 | * (N_sma - 2)
92 | / N_sma
93 | * N_sma_max
94 | / (N_sma_max - 2)
95 | )
96 | / (1 - beta1 ** state["step"])
97 | )
98 | else:
99 | step_size = group["lr"] / (1 - beta1 ** state["step"])
100 | buffered[2] = step_size
101 |
102 | if group["weight_decay"] != 0:
103 | p_data_fp32.add_(p_data_fp32, alpha=-group["weight_decay"] * group["lr"])
104 |
105 | # more conservative since it's an approximated value
106 | if N_sma >= 5:
107 | denom = exp_avg_sq.sqrt().add_(group["eps"])
108 | p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size)
109 | else:
110 | p_data_fp32.add_(exp_avg, alpha=-step_size)
111 |
112 | p.data.copy_(p_data_fp32)
113 |
114 | return loss
115 |
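116 | 
117 | # Usage sketch (illustrative, appended to the vendored source): RAdam is a drop-in replacement
118 | # for torch.optim.Adam.
119 | # import torch.nn as nn
120 | # model = nn.Linear(10, 1)
121 | # optimizer = RAdam(model.parameters(), lr=1e-3, betas=(0.9, 0.999), weight_decay=0.0)
122 | # loss = model(torch.randn(4, 10)).pow(2).mean()
123 | # loss.backward()
124 | # optimizer.step()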
--------------------------------------------------------------------------------
/uberduck_ml_dev/trainer/hifigan/train.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.cuda.amp import GradScaler
3 | from ray.air.integrations.wandb import setup_wandb
4 | from torch.utils.data import DataLoader
5 | from torch.nn import functional as F
6 |
7 | from ...data.data import Dataset
8 | from ...models.rvc.rvc import MultiPeriodDiscriminator
9 | from ...models.hifigan import MultiDiscriminator
10 |
11 | from ...data.collate import Collate
12 | from ...losses_rvc import (
13 | generator_loss,
14 | discriminator_loss,
15 | feature_loss,
16 | )
17 | from .train_epoch import train_epoch
18 | from .train_step import train_step
19 | from ..rvc.train import DEFAULTS as DEFAULTS
20 | from ...models.hifigan import _load_uninitialized
21 |
22 |
23 | def train_func(config: dict, project: str = "rvc"):
24 | print("Entering training function")
25 | setup_wandb(config, project=project, entity="uberduck-ai", rank_zero_only=False)
26 | train_config = config["train"]
27 | model_config = config["model"]
28 | data_config = config["data"]
29 |
30 | generator = _load_uninitialized(config_overrides=model_config)
31 |
32 | # NOTE (Sam): RVC uses a MultiPeriodDiscriminator that has a single scale discriminator.
33 | # The HiFi++ paper indicates that the precise discriminator structure is not important and that reweighting the loss is sufficient.
34 | # Vocos uses additional structure.
35 | discriminator = MultiDiscriminator(True)
36 | discriminator = discriminator.to("cuda")
37 |
38 | generator_optimizer = torch.optim.AdamW(
39 | generator.parameters(),
40 | train_config["learning_rate"],
41 | betas=train_config["betas"],
42 | eps=train_config["eps"],
43 | )
44 |
45 | discriminator_optimizer = torch.optim.AdamW(
46 | discriminator.parameters(),
47 | train_config["learning_rate"],
48 | betas=train_config["betas"],
49 | eps=train_config["eps"],
50 | )
51 |
52 | print("Loading checkpoints")
53 | # TODO (Sam): move to "warmstart" or "load_checkpoint" functions
54 | if train_config["warmstart_G_checkpoint_path"] is not None:
55 | generator_checkpoint = torch.load(train_config["warmstart_G_checkpoint_path"])[
56 | "generator"
57 | ]
58 | generator.load_state_dict(
59 | generator_checkpoint
60 | ) # NOTE (Sam): a handful of "enc_q" decoder states not present - doesn't seem to cause an issue
61 | if train_config["warmstart_D_checkpoint_path"] is not None:
62 | discriminator_checkpoint = torch.load(
63 | train_config["warmstart_D_checkpoint_path"]
64 | )["model"]
65 | discriminator.load_state_dict(discriminator_checkpoint)
66 |
67 | generator = generator.cuda()
68 | discriminator = discriminator.cuda()
69 |
70 | models = {"generator": generator, "discriminator": discriminator}
71 | print("Loading dataset")
72 |
73 | train_dataset = Dataset(
74 | filelist_path=data_config["filelist_path"],
75 | mel_suffix=data_config["mel_suffix"],
76 | audio_suffix=data_config["audio_suffix"],
77 | )
78 |
79 | # train_sampler = DistributedBucketSampler(
80 | # train_dataset,
81 | # train_config["batch_size"] * 1,
82 | # [100, 200, 300, 400, 500, 600, 700, 800, 900], # 16s
83 | # num_replicas=1,
84 | # rank=0,
85 | # shuffle=True,
86 | # )
87 | train_loader = DataLoader(
88 | train_dataset,
89 | num_workers=1,
90 | shuffle=False,
91 | pin_memory=True,
92 | collate_fn=Collate(),
93 | batch_sampler=None,
94 | # batch_sampler=train_sampler,
95 | batch_size=train_config["batch_size"],
96 | persistent_workers=True,
97 | prefetch_factor=8,
98 | )
99 | optimization_parameters = {
100 | "optimizers": {
101 | "generator": generator_optimizer,
102 | "discriminator": discriminator_optimizer,
103 | },
104 | "scaler": GradScaler(),
105 | # NOTE (Sam): need to pass names rather than vector of losses since arguments differ
106 | "losses": {
107 | "l1": {"loss": F.l1_loss, "weight": 1.0},
108 | "feature": {"loss": feature_loss, "weight": 1.0},
109 | "generator": {"loss": generator_loss, "weight": 1.0},
110 | "discriminator": {"loss": discriminator_loss, "weight": 1},
111 | },
112 | }
113 |
114 | iteration = 0
115 | start_epoch = 0
116 | print("Beginning training for ", train_config["epochs"], " epochs")
117 | for epoch in range(start_epoch, train_config["epochs"]):
118 | print(f"Epoch: {epoch}")
119 | iteration = train_epoch(
120 | train_step,
121 | train_loader,
122 | config,
123 | models,
124 | optimization_parameters,
125 | logging_parameters={},
126 | iteration=iteration,
127 | )
128 |
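129 | 
130 | # Config sketch (illustrative; values and paths are placeholders, not recommendations). Only the
131 | # keys read in this file are shown - train_step and the rvc train DEFAULTS imported above define
132 | # additional keys (segment_size, steps_per_sample, iters_per_checkpoint, output_directory, ...).
133 | # config = {
134 | #     "train": {
135 | #         "learning_rate": 1e-4,
136 | #         "betas": [0.8, 0.99],
137 | #         "eps": 1e-9,
138 | #         "batch_size": 4,
139 | #         "epochs": 100,
140 | #         "warmstart_G_checkpoint_path": None,
141 | #         "warmstart_D_checkpoint_path": None,
142 | #     },
143 | #     "model": {},
144 | #     "data": {
145 | #         "filelist_path": "/path/to/filelist.txt",
146 | #         "mel_suffix": ".mel.pt",
147 | #         "audio_suffix": ".wav",
148 | #     },
149 | # }
150 | # train_func(config)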
--------------------------------------------------------------------------------
/uberduck_ml_dev/data/spectrogram.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from librosa.filters import mel as librosa_mel_fn
3 |
4 | from .utils import spectral_normalize_torch
5 |
6 | # NOTE (Sam): needed for importable lambdas.
7 | # TODO (Sam): remove redundancy from elsewhere in repo.
8 | hann_window = {}
9 | mel_basis = {}
10 |
11 |
12 | # TODO (Sam): combine with identically-named function is models.common
13 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
14 | """Convert waveform into Linear-frequency Linear-amplitude spectrogram.
15 |
16 | Args:
17 | y :: (B, T) - Audio waveforms
18 | n_fft
19 | sampling_rate
20 | hop_size
21 | win_size
22 | center
23 | Returns:
24 | :: (B, Freq, Frame) - Linear-frequency Linear-amplitude spectrogram
25 | """
26 | # Validation
27 | if torch.min(y) < -1.0:
28 | print("min value is ", torch.min(y))
29 | if torch.max(y) > 1.0:
30 | print("max value is ", torch.max(y))
31 |
32 | # Window - Cache if needed
33 | global hann_window
34 | dtype_device = str(y.dtype) + "_" + str(y.device)
35 | wnsize_dtype_device = str(win_size) + "_" + dtype_device
36 | if wnsize_dtype_device not in hann_window:
37 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
38 | dtype=y.dtype, device=y.device
39 | )
40 |
41 | # Padding
42 | y = torch.nn.functional.pad(
43 | y.unsqueeze(1),
44 | # NOTE (Sam): combining n_fft (filter_length) with hop_size reeks of either a bug or sophisticated asymptotic analysis.
45 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
46 | mode="reflect",
47 | )
48 | y = y.squeeze(1)
49 |
50 | # Complex Spectrogram :: (B, T) -> (B, Freq, Frame, RealComplex=2)
51 | spec = torch.stft(
52 | y,
53 | n_fft,
54 | hop_length=hop_size,
55 | win_length=win_size,
56 | window=hann_window[wnsize_dtype_device],
57 | center=center,
58 | pad_mode="reflect",
59 | normalized=False,
60 | onesided=True,
61 | return_complex=False,
62 | )
63 | # Linear-frequency Linear-amplitude spectrogram :: (B, Freq, Frame, RealComplex=2) -> (B, Freq, Frame)
64 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
65 | return spec
66 |
67 |
68 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
69 | # MelBasis - Cache if needed
70 | global mel_basis
71 | dtype_device = str(spec.dtype) + "_" + str(spec.device)
72 | fmax_dtype_device = str(fmax) + "_" + dtype_device
73 | if fmax_dtype_device not in mel_basis:
74 | mel = librosa_mel_fn(
75 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
76 | )
77 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
78 | dtype=spec.dtype, device=spec.device
79 | )
80 |
81 | # Mel-frequency Log-amplitude spectrogram :: (B, Freq=num_mels, Frame)
82 | melspec = torch.matmul(mel_basis[fmax_dtype_device], spec)
83 | melspec = spectral_normalize_torch(melspec)
84 | return melspec
85 |
86 |
87 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
88 | """
89 | PARAMS
90 | ------
91 | C: compression factor
92 | """
93 | return torch.log(torch.clamp(x, min=clip_val) * C)
94 |
95 |
96 | def spectral_normalize_torch(magnitudes):
97 | return dynamic_range_compression_torch(magnitudes)
98 |
99 |
100 | def mel_spectrogram_torch(
101 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
102 | ):
103 | """Convert waveform into Mel-frequency Log-amplitude spectrogram.
104 |
105 | Args:
106 | y :: (B, T) - Waveforms
107 | Returns:
108 | melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram
109 | """
110 | # Linear-frequency Linear-amplitude spectrogram :: (B, T) -> (B, Freq, Frame)
111 | spec = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center)
112 |
113 | # Mel-frequency Log-amplitude spectrogram :: (B, Freq, Frame) -> (B, Freq=num_mels, Frame)
114 | melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax)
115 |
116 | return melspec
117 |
118 |
119 | from ..data.data import HIFIGAN_DEFAULTS as DEFAULTS
120 | from scipy.io.wavfile import read
121 | import librosa
122 |
123 | mel_spec = lambda x: mel_spectrogram_torch(
124 | x,
125 | DEFAULTS["n_fft"],
126 | DEFAULTS["num_mels"],
127 | DEFAULTS["sampling_rate"],
128 | # 100,
129 | # 24000,#DEFAULTS["sampling_rate"],
130 | DEFAULTS["hop_size"],
131 | DEFAULTS["win_size"],
132 | DEFAULTS["fmin"],
133 | None,
134 | False, # center
135 | )
136 |
137 | load_audio = lambda source_path: torch.Tensor(read(source_path)[1]).unsqueeze(0)
138 | save_torch = lambda data, target_path: torch.save(data[0], target_path)
139 |
--------------------------------------------------------------------------------
/uberduck_ml_dev/trainer/rvc/train.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.cuda.amp import GradScaler
3 | from ray.air.integrations.wandb import setup_wandb
4 | from torch.utils.data import DataLoader
5 | from torch.nn import functional as F
6 |
7 | from .train_epoch import train_epoch
8 | from ...models.rvc.rvc import (
9 | SynthesizerTrnMs256NSFsid,
10 | MultiPeriodDiscriminator,
11 | )
12 | from ...vendor.tfcompat.hparam import HParams
13 | from ...data.data import (
14 | TextAudioLoaderMultiNSFsid,
15 | DistributedBucketSampler,
16 | )
17 | from ...data.collate import TextAudioCollateMultiNSFsid
18 | from ...losses_rvc import (
19 | generator_loss,
20 | discriminator_loss,
21 | feature_loss,
22 | kl_loss,
23 | )
24 | 
25 |
26 |
27 | def train_func(config: dict, project: str = "rvc"):
28 | print("Entering training function")
29 | setup_wandb(config, project=project, entity="uberduck-ai", rank_zero_only=False)
30 | train_config = config["train"]
31 | model_config = config["model"]
32 | data_config = config["data"]
33 |
34 | generator = SynthesizerTrnMs256NSFsid(
35 | data_config["filter_length"] // 2 + 1,
36 | train_config["segment_size"] // data_config["hop_length"],
37 | **model_config,
38 | is_half=train_config["fp16_run"],
39 | sr=data_config["sampling_rate"],
40 | )
41 |
42 | discriminator = MultiPeriodDiscriminator(model_config["use_spectral_norm"])
43 | generator_optimizer = torch.optim.AdamW(
44 | generator.parameters(),
45 | train_config["learning_rate"],
46 | betas=train_config["betas"],
47 | eps=train_config["eps"],
48 | )
49 |
50 | discriminator_optimizer = torch.optim.AdamW(
51 | discriminator.parameters(),
52 | train_config["learning_rate"],
53 | betas=train_config["betas"],
54 | eps=train_config["eps"],
55 | )
56 |
57 | print("Loading checkpoints")
58 | # TODO (Sam): move to "warmstart" or "load_checkpoint" functions
59 | generator_checkpoint = torch.load(train_config["warmstart_G_checkpoint_path"])[
60 | "model"
61 | ]
62 | discriminator_checkpoint = torch.load(train_config["warmstart_D_checkpoint_path"])[
63 | "model"
64 | ]
65 | discriminator.load_state_dict(discriminator_checkpoint)
66 | generator.load_state_dict(
67 | generator_checkpoint, strict=False
68 | ) # NOTE (Sam): a handful of "enc_q" decoder states not present
69 | generator = generator.cuda()
70 | discriminator = discriminator.cuda()
71 |
72 | models = {"generator": generator, "discriminator": discriminator}
73 |
74 | print("Loading dataset")
75 | train_dataset = TextAudioLoaderMultiNSFsid(
76 | train_config["filelist_path"], HParams(**data_config)
77 | ) # dv is sid
78 | collate_fn = TextAudioCollateMultiNSFsid()
79 | n_gpus = 1
80 | train_sampler = DistributedBucketSampler(
81 | train_dataset,
82 | train_config["batch_size"] * n_gpus,
83 | [100, 200, 300, 400, 500, 600, 700, 800, 900], # 16s
84 | num_replicas=n_gpus,
85 | rank=0,
86 | shuffle=True,
87 | )
88 | train_loader = DataLoader(
89 | train_dataset,
90 | num_workers=1,
91 | shuffle=False,
92 | pin_memory=True,
93 | collate_fn=collate_fn,
94 | batch_sampler=train_sampler,
95 | persistent_workers=True,
96 | prefetch_factor=8,
97 | )
98 | optimization_parameters = {
99 | "optimizers": {
100 | "generator": generator_optimizer,
101 | "discriminator": discriminator_optimizer,
102 | },
103 | "scaler": GradScaler(),
104 | # NOTE (Sam): need to pass names rather than vector of losses since arguments differ
105 | "losses": {
106 | "l1": {"loss": F.l1_loss, "weight": 1.0},
107 | "kl": {"loss": kl_loss, "weight": 1.0},
108 | "feature": {"loss": feature_loss, "weight": 1.0},
109 | "generator": {"loss": generator_loss, "weight": 1.0},
110 | "discriminator": {"loss": discriminator_loss, "weight": 1},
111 | },
112 | }
113 |
114 | iteration = 0
115 | start_epoch = 0
116 | print("Beginning training for ", train_config["epochs"], " epochs")
117 | for epoch in range(start_epoch, train_config["epochs"]):
118 | print(f"Epoch: {epoch}")
119 | iteration = train_epoch(
120 | train_loader,
121 | config,
122 | models,
123 | optimization_parameters,
124 | logging_parameters={},
125 | iteration=iteration,
126 | )
127 |
128 |
129 | # 40k config
130 | DEFAULTS = {
131 | "log_interval": 200,
132 | "seed": 1234,
133 | "epochs": 20000,
134 | "learning_rate": 1e-4,
135 | "betas": [0.8, 0.99],
136 | "eps": 1e-9,
137 | "batch_size": 4,
138 | "fp16_run": False,
139 | "lr_decay": 0.999875,
140 | "segment_size": 12800,
141 | "init_lr_ratio": 1,
142 | "warmup_epochs": 0,
143 | "c_mel": 45,
144 | "c_kl": 1.0,
145 | "steps_per_sample": 100,
146 | "iters_per_checkpoint": 100,
147 | "output_directory": "/tmp",
148 | }
149 |
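150 | 
151 | # Config sketch (illustrative; paths and data values are placeholders): DEFAULTS above only covers
152 | # the "train" section. train_func also reads a "model" section (SynthesizerTrnMs256NSFsid keyword
153 | # arguments) and a "data" section (filter_length, hop_length, sampling_rate, ... consumed via HParams).
154 | # config = {
155 | #     "train": {
156 | #         **DEFAULTS,
157 | #         "filelist_path": "/path/to/filelist.txt",
158 | #         "warmstart_G_checkpoint_path": "/path/to/G.pth",
159 | #         "warmstart_D_checkpoint_path": "/path/to/D.pth",
160 | #     },
161 | #     "model": {"use_spectral_norm": False},  # plus the remaining synthesizer kwargs
162 | #     "data": {"filter_length": 2048, "hop_length": 400, "sampling_rate": 40000},  # example values only
163 | # }
164 | # train_func(config)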
--------------------------------------------------------------------------------
/uberduck_ml_dev/text/heteronyms:
--------------------------------------------------------------------------------
1 | abject
2 | abrogate
3 | absent
4 | abstract
5 | abuse
6 | ache
7 | acre
8 | acuminate
9 | addict
10 | address
11 | adduct
12 | adele
13 | advocate
14 | affect
15 | affiliate
16 | agape
17 | aged
18 | agglomerate
19 | aggregate
20 | agonic
21 | agora
22 | allied
23 | ally
24 | alternate
25 | alum
26 | am
27 | analyses
28 | andrea
29 | animate
30 | apply
31 | appropriate
32 | approximate
33 | ares
34 | arithmetic
35 | arsenic
36 | articulate
37 | associate
38 | attribute
39 | august
40 | axes
41 | ay
42 | aye
43 | bases
44 | bass
45 | bathed
46 | bested
47 | bifurcate
48 | blessed
49 | blotto
50 | bow
51 | bowed
52 | bowman
53 | brassy
54 | buffet
55 | bustier
56 | carbonate
57 | celtic
58 | choral
59 | chumash
60 | close
61 | closer
62 | coax
63 | coincidence
64 | color coordinate
65 | colour coordinate
66 | comber
67 | combine
68 | combs
69 | committee
70 | commune
71 | compact
72 | complex
73 | compound
74 | compress
75 | concert
76 | conduct
77 | confine
78 | confines
79 | conflict
80 | conglomerate
81 | conscript
82 | conserve
83 | consist
84 | console
85 | consort
86 | construct
87 | consult
88 | consummate
89 | content
90 | contest
91 | contract
92 | contracts
93 | contrast
94 | converse
95 | convert
96 | convict
97 | coop
98 | coordinate
99 | covey
100 | crooked
101 | curate
102 | cussed
103 | decollate
104 | decrease
105 | defect
106 | defense
107 | delegate
108 | deliberate
109 | denier
110 | desert
111 | detail
112 | deviate
113 | diagnoses
114 | diffuse
115 | digest
116 | discard
117 | discharge
118 | discount
119 | do
120 | document
121 | does
122 | dogged
123 | domesticate
124 | dominican
125 | dove
126 | dr
127 | drawer
128 | duplicate
129 | egress
130 | ejaculate
131 | eject
132 | elaborate
133 | ellipses
134 | email
135 | emu
136 | entrace
137 | entrance
138 | escort
139 | estimate
140 | eta
141 | etna
142 | evening
143 | excise
144 | excuse
145 | exploit
146 | export
147 | extract
148 | fine
149 | flower
150 | forbear
151 | four-legged
152 | frequent
153 | furrier
154 | gallant
155 | gel
156 | geminate
157 | gillie
158 | glower
159 | gotham
160 | graduate
161 | haggis
162 | heavy
163 | hinder
164 | house
165 | housewife
166 | impact
167 | imped
168 | implant
169 | implement
170 | import
171 | impress
172 | incense
173 | incline
174 | increase
175 | infix
176 | insert
177 | instar
178 | insult
179 | integral
180 | intercept
181 | interchange
182 | interflow
183 | interleaf
184 | intermediate
185 | intern
186 | interspace
187 | intimate
188 | intrigue
189 | invalid
190 | invert
191 | invite
192 | irony
193 | jagged
194 | jesses
195 | julies
196 | kite
197 | laminate
198 | laos
199 | lather
200 | lead
201 | learned
202 | leasing
203 | lech
204 | legitimate
205 | lied
206 | lima
207 | lipread
208 | live
209 | lower
210 | lunged
211 | maas
212 | magdalen
213 | manes
214 | mare
215 | marked
216 | merchandise
217 | merlion
218 | minute
219 | misconduct
220 | misled
221 | misprint
222 | mobile
223 | moderate
224 | mong
225 | moped
226 | moth
227 | mouth
228 | mow
229 | mpg
230 | multiply
231 | mush
232 | nana
233 | nice
234 | nice
235 | number
236 | numerate
237 | nun
238 | object
239 | opiate
240 | ornament
241 | outbox
242 | outcry
243 | outpour
244 | outreach
245 | outride
246 | outright
247 | outside
248 | outwork
249 | overall
250 | overbid
251 | overcall
252 | overcast
253 | overfall
254 | overflow
255 | overhaul
256 | overhead
257 | overlap
258 | overlay
259 | overuse
260 | overweight
261 | overwork
262 | pace
263 | palled
264 | palling
265 | para
266 | pasty
267 | pate
268 | pauline
269 | pedal
270 | peer
271 | perfect
272 | periodic
273 | permit
274 | pervert
275 | pinta
276 | placer
277 | platy
278 | polish
279 | polish
280 | poll
281 | pontificate
282 | postulate
283 | pram
284 | prayer
285 | precipitate
286 | predate
287 | predicate
288 | prefix
289 | preposition
290 | present
291 | pretest
292 | primer
293 | proceeds
294 | produce
295 | progress
296 | project
297 | proportionate
298 | prospect
299 | protest
300 | pussy
301 | putter
302 | putting
303 | quite
304 | ragged
305 | raven
306 | re
307 | read
308 | reading
309 | reading
310 | real
311 | rebel
312 | recall
313 | recap
314 | recitative
315 | recollect
316 | record
317 | recreate
318 | recreation
319 | redress
320 | refill
321 | refund
322 | refuse
323 | reject
324 | relay
325 | remake
326 | repaint
327 | reprint
328 | reread
329 | rerun
330 | resent
331 | reside
332 | resign
333 | respray
334 | resume
335 | retard
336 | retest
337 | retread
338 | rewrite
339 | root
340 | routed
341 | routing
342 | row
343 | rugged
344 | rummy
345 | sais
346 | sake
347 | sambuca
348 | saucier
349 | second
350 | secrete
351 | secreted
352 | secreting
353 | segment
354 | separate
355 | sewer
356 | shirk
357 | shower
358 | sin
359 | skied
360 | slaver
361 | slough
362 | sow
363 | spoof
364 | squid
365 | stingy
366 | subject
367 | subordinate
368 | subvert
369 | supply
370 | supposed
371 | survey
372 | suspect
373 | syringes
374 | tabulate
375 | tales
376 | tarrier
377 | tarry
378 | taxes
379 | taxis
380 | tear
381 | theron
382 | thou
383 | three-legged
384 | tier
385 | tinged
386 | torment
387 | transfer
388 | transform
389 | transplant
390 | transport
391 | transpose
392 | tush
393 | two-legged
394 | unionised
395 | unionized
396 | update
397 | uplift
398 | upset
399 | use
400 | used
401 | vale
402 | violist
403 | viva
404 | ware
405 | whinged
406 | whoop
407 | wicked
408 | wind
409 | windy
410 | wino
411 | won
412 | worsted
413 | wound
--------------------------------------------------------------------------------
/uberduck_ml_dev/data/ray.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO
2 | import os
3 |
4 | from scipy.io import wavfile
5 | import torch
6 | import numpy as np
7 | import ray
8 | import pandas as pd
9 |
10 |
11 | from .utils import get_energy_average, f0_normalize
12 | from ..models.components.encoders import ResNetSpeakerEncoderCallable
13 |
14 |
15 | # NOTE (Sam): the ray dataset code runs (modulo cleanup) but is seemingly slower than the torch dataloader (not 100% sure if this is still true).
16 | def ray_df_preprocessing(df, data_config, tp, stft):
17 | transcripts = df.transcript.tolist()
18 | audio_bytes_list = df.audio_bytes.tolist()
19 | speaker_ids = df.speaker_id.tolist()
20 | f0_paths = df.f0_path.tolist()
21 | audio_embeddings = df.audio_embedding.tolist()
22 | # shuffle_indices = get_shuffle_indices(speaker_ids)
23 | # audio_embeddings = [audio_embeddings[i] for i in shuffle_indices]
24 | collate_input = []
25 | for transcript, audio_bytes, speaker_id, f0_path, audio_embedding in zip(
26 | transcripts, audio_bytes_list, speaker_ids, f0_paths, audio_embeddings
27 | ):
28 | bio = BytesIO(audio_bytes)
29 | sr, wav_data = wavfile.read(bio)
30 | audio = torch.FloatTensor(wav_data)
31 | # NOTE (Sam): why normalize here?
32 | audio_norm = audio / (np.abs(audio).max() * 2)
33 | text_sequence = tp.get_text(transcript)
34 | mel = stft.get_mel(audio_norm, data_config["max_wav_value"])
35 | mel = torch.squeeze(mel, 0)
36 | dikt = torch.load(f0_path)
37 | f0 = dikt["f0"]
38 | p_voiced = dikt["p_voiced"]
39 | voiced_mask = dikt["voiced_mask"]
40 | f0 = f0_normalize(f0, f0_min=data_config["f0_min"])
41 | energy_avg = get_energy_average(mel)
42 | prior_path = "{}_{}".format(text_sequence.shape[0], mel.shape[1])
43 | prior_path = os.path.join("/usr/src/app/radtts/data_cache", prior_path)
44 | prior_path += "_prior.pth"
45 | attn_prior = torch.load(prior_path)
46 | speaker_id = torch.LongTensor([speaker_id])
47 | audio_embedding = torch.FloatTensor(audio_embedding)
48 | # NOTE (Sam): might be faster to return dictionary arrays of batched inputs instead of list
49 | collate_input.append(
50 | {
51 | "text_encoded": text_sequence,
52 | "mel": mel,
53 | "speaker_id": speaker_id,
54 | "f0": f0,
55 | "p_voiced": p_voiced,
56 | "voiced_mask": voiced_mask,
57 | "energy_avg": energy_avg,
58 | "attn_prior": attn_prior,
59 | "audiopath": None,
60 | "audio_embedding": audio_embedding,
61 | }
62 | )
63 |
64 | return collate_input
65 |
66 |
67 | def get_ray_dataset(filelist_path, config_path, model_path):
68 | df = pd.read_csv(
69 | filelist_path,
70 | sep="|",
71 | header=None,
72 | quoting=3,
73 | names=["path", "transcript", "speaker_id", "f0_path", "emb_path"],
74 | )
75 |
76 | paths = df.path.tolist()
77 | transcripts = df.transcript.tolist()
78 | speaker_ids = df.speaker_id.tolist()
79 |
80 | pitches = df.f0_path.tolist()
81 |
82 | parallelism_length = 400
83 | audio_ds = ray.data.read_binary_files(
84 | paths,
85 | parallelism=parallelism_length,
86 | ray_remote_args={"num_cpus": 1.0},
87 | )
88 | audio_ds = audio_ds.map_batches(
89 | lambda x: x, batch_format="pyarrow", batch_size=None
90 | )
91 |
92 | paths_ds = ray.data.from_items(paths, parallelism=parallelism_length)
93 | paths_ds = paths_ds.map_batches(
94 | lambda x: x, batch_format="pyarrow", batch_size=None
95 | )
96 |
97 | transcripts = ray.data.from_items(transcripts, parallelism=parallelism_length)
98 | transcripts_ds = transcripts.map_batches(
99 | lambda x: x, batch_format="pyarrow", batch_size=None
100 | )
101 |
102 | speaker_ids_ds = ray.data.from_items(speaker_ids, parallelism=parallelism_length)
103 | speaker_ids_ds = speaker_ids_ds.map_batches(
104 | lambda x: x, batch_format="pyarrow", batch_size=None
105 | )
106 | pitches_ds = ray.data.from_items(pitches, parallelism=parallelism_length)
107 | pitches_ds = pitches_ds.map_batches(
108 | lambda x: x, batch_format="pyarrow", batch_size=None
109 | )
110 |
111 | embs_ds = ray.data.from_items(paths, parallelism=parallelism_length)
112 | embs_ds = embs_ds.map_batches(
113 | ResNetSpeakerEncoderCallable,
114 | fn_kwargs={"config_path": config_path, "model_path": model_path},
115 | num_gpus=1.0,
116 | compute="actors",
117 | )
118 |
119 | output_dataset = (
120 | transcripts_ds.zip(audio_ds)
121 | .zip(paths_ds)
122 | .zip(speaker_ids_ds)
123 | .zip(pitches_ds)
124 | .zip(embs_ds)
125 | )
126 | output_dataset = output_dataset.map_batches(
127 | lambda table: table.rename(
128 | columns={
129 | "value": "transcript",
130 | "value_1": "audio_bytes",
131 | "value_2": "path",
132 | "value_3": "speaker_id",
133 | "value_4": "f0_path",
134 | "value_5": "emb_path",
135 | }
136 | )
137 | )
138 |
139 | processed_dataset = output_dataset.map_batches(ray_df_preprocessing)  # NOTE: ray_df_preprocessing also expects data_config, tp, and stft; these need to be bound (e.g. with functools.partial) before this call.
140 | return processed_dataset.fully_executed()
141 |
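142 | 
143 | # Filelist sketch (illustrative): get_ray_dataset expects a pipe-separated file with no header and
144 | # the columns path|transcript|speaker_id|f0_path|emb_path, e.g.
145 | # /data/wavs/0001.wav|Hello world.|0|/data/f0/0001.pt|/data/emb/0001.pt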
--------------------------------------------------------------------------------
/uberduck_ml_dev/utils/plot.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 | "save_figure_to_numpy",
3 | "plot_tensor",
4 | "plot_spectrogram",
5 | "plot_attention",
6 | "plot_attention_phonemes",
7 | "plot_gate_outputs",
8 | ]
9 |
10 |
11 | import numpy as np
12 | import matplotlib
13 |
14 | matplotlib.use("Agg")
15 | import matplotlib.pyplot as plt
16 |
17 | from ..text.symbols import id_to_symbol, DEFAULT_SYMBOLS
18 |
19 |
20 | def save_figure_to_numpy(fig):
21 | """Save figure to a numpy array."""
22 | data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
23 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
24 | plt.close(fig)
25 | return data
26 |
27 |
28 | def plot_tensor(tensor):
29 | plt.style.use("default")
30 | fig, ax = plt.subplots(figsize=(12, 3))
31 | im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation="none")
32 | plt.colorbar(im, ax=ax)
33 | plt.tight_layout()
34 | fig.canvas.draw()
35 | data = save_figure_to_numpy(fig)
36 | plt.close()
37 | return data
38 |
39 |
40 | def plot_spectrogram(mel):
41 | figure = plt.figure()
42 | plt.xlabel("Spectrogram frame")
43 | plt.ylabel("Channel")
44 | plt.imshow(mel, aspect="auto", origin="lower", interpolation="none", cmap="inferno")
45 | figure.canvas.draw()
46 | return figure
47 |
48 |
49 | def plot_attention(attention, encoder_length=None, decoder_length=None):
50 | figure = plt.figure()
51 | plt.xlabel("Decoder timestep")
52 | plt.ylabel("Encoder timestep")
53 | plt.imshow(
54 | attention.data.cpu().numpy(),
55 | aspect="auto",
56 | origin="lower",
57 | interpolation="none",
58 | cmap="inferno",
59 | )
60 | title_info = []
61 | if encoder_length is not None:
62 | title_info.append(f"Encoder_length: {encoder_length}")
63 | if decoder_length is not None:
64 | title_info.append(f"Decoder length: {decoder_length}")
65 | title = " ".join(title_info)
66 | plt.title(title)
67 | figure.canvas.draw()
68 | return figure
69 |
70 |
71 | def plot_attention_phonemes(seq, attention, symbol_set=DEFAULT_SYMBOLS):
72 | figure = plt.figure(figsize=(15, 8))
73 | phonemes = []
74 |
75 | for token in seq.numpy():
76 | if token == len(id_to_symbol[symbol_set]):
77 | phonemes.append("~")
78 | else:
79 | phonemes.append(id_to_symbol[symbol_set][token][1:])
80 |
81 | xtick_locs = np.pad(
82 | np.cumsum(np.sum(attention.data.cpu().numpy(), axis=1)), (1, 0)
83 | ).astype(np.int16)[:-1]
84 | ytick_locs = np.arange(seq.shape[-1])
85 | plt.yticks(ytick_locs, phonemes)
86 | plt.xticks(xtick_locs, xtick_locs)
87 |
88 | plt.imshow(
89 | attention.data.cpu().numpy(),
90 | aspect="auto",
91 | origin="lower",
92 | interpolation="none",
93 | cmap="Greys",
94 | )
95 |
96 | i = 0
97 | for phon, y in zip(phonemes, ytick_locs):
98 | if phon == "~":
99 | continue
100 | if i == 4:
101 | plt.axhline(y=y, color="k")
102 | if i == 3:
103 | plt.axhline(y=y, color="r")
104 | if i == 2:
105 | plt.axhline(y=y, color="g")
106 | if i == 1:
107 | plt.axhline(y=y, color="b")
108 | if i == 0:
109 | plt.axhline(y=y, color="m")
110 | i += 1
111 | i = i % 5
112 |
113 | plt.grid(axis="x")
114 | plt.title("Phoneme Alignment")
115 | plt.xlabel("Time (mel frames)")
116 | plt.ylabel("Phonemes")
117 |
118 | return figure
119 |
120 |
121 | def plot_gate_outputs(gate_targets=None, gate_outputs=None):
122 | figure = plt.figure()
123 | plt.xlabel("Frames")
124 | plt.ylabel("Gate state")
125 | ax = figure.add_axes([0, 0, 1, 1])
126 | if gate_targets is not None:
127 | ax.scatter(
128 | range(gate_targets.size(0)),
129 | gate_targets,
130 | alpha=0.5,
131 | color="green",
132 | marker="+",
133 | s=1,
134 | label="target",
135 | )
136 | if gate_outputs is not None:
137 | ax.scatter(
138 | range(gate_outputs.size(0)),
139 | gate_outputs,
140 | alpha=0.5,
141 | color="red",
142 | marker=".",
143 | s=1,
144 | label="predicted",
145 | )
146 | figure.canvas.draw()
147 | return figure
148 |
149 |
150 | def plot_alignment_to_numpy(
151 | alignment, title="", info=None, phoneme_seq=None, vmin=None, vmax=None
152 | ):
153 | if phoneme_seq:
154 | fig, ax = plt.subplots(figsize=(15, 10))
155 | else:
156 | fig, ax = plt.subplots(figsize=(6, 4))
157 | im = ax.imshow(
158 | alignment,
159 | aspect="auto",
160 | origin="lower",
161 | interpolation="none",
162 | vmin=vmin,
163 | vmax=vmax,
164 | )
165 | ax.set_title(title)
166 | fig.colorbar(im, ax=ax)
167 | xlabel = "Decoder timestep"
168 | if info is not None:
169 | xlabel += "\n\n" + info
170 | plt.xlabel(xlabel)
171 | plt.ylabel("Encoder timestep")
172 | plt.tight_layout()
173 |
174 | if phoneme_seq is not None:
175 | # for debugging of phonemes and durs in maps. Not used by def in training code
176 | ax.set_yticks(np.arange(len(phoneme_seq)))
177 | ax.set_yticklabels(phoneme_seq)
178 | ax.hlines(np.arange(len(phoneme_seq)), xmin=0.0, xmax=max(ax.get_xticks()))
179 |
180 | fig.canvas.draw()
181 | data = save_figure_to_numpy(fig)
182 | plt.close()
183 | return data
184 |
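185 | 
186 | # Usage sketch (illustrative): these helpers are normally called with real spectrograms,
187 | # attention maps, and gate outputs during training; random tensors are used here only for shape.
188 | # import torch
189 | # image_array = plot_tensor(torch.randn(80, 200))   # HxWx3 uint8 numpy array
190 | # figure = plot_spectrogram(torch.randn(80, 200))   # matplotlib Figure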
--------------------------------------------------------------------------------
/uberduck_ml_dev/trainer/radtts/train_step.py:
--------------------------------------------------------------------------------
1 | # NOTE (Sam): for use with ray trainer.
2 | from datetime import datetime
3 |
4 | import torch
5 | from torch.cuda.amp import autocast
6 | from ray.air import session
7 |
8 | from .log import get_log_audio
9 | from ..log import log
10 | from .save import save_checkpoint
11 | from ...utils.utils import (
12 | to_gpu,
13 | )
14 |
15 |
16 | # TODO (Sam): it seems like much of this can be made generic for multiple models.
17 | def _train_step(
18 | batch,
19 | model,
20 | optim,
21 | iteration,
22 | log_decoder_samples,
23 | log_attribute_samples,
24 | steps_per_sample,
25 | scaler,
26 | iters_per_checkpoint,
27 | output_directory,
28 | criterion,
29 | attention_kl_loss,
30 | kl_loss_start_iter,
31 | binarization_start_iter,
32 | vocoder,
33 | ):
34 | print(datetime.now(), "entering train step:", iteration)
35 | if iteration >= binarization_start_iter:
36 | binarize = True
37 | else:
38 | binarize = False
39 |
40 | optim.zero_grad()
41 |
42 | with autocast(enabled=False):
43 | batch_dict = batch # torch DataLoader?
44 | # TODO (Sam): move to batch.go_gpu().
45 | mel = to_gpu(batch_dict["mel"])
46 | speaker_ids = to_gpu(batch_dict["speaker_ids"])
47 | attn_prior = to_gpu(batch_dict["attn_prior"])
48 | f0 = to_gpu(batch_dict["f0"])
49 | voiced_mask = to_gpu(batch_dict["voiced_mask"])
50 | text = to_gpu(batch_dict["text"])
51 | in_lens = to_gpu(batch_dict["input_lengths"])
52 | out_lens = to_gpu(batch_dict["output_lengths"])
53 | energy_avg = to_gpu(batch_dict["energy_avg"])
54 | audio_embedding = to_gpu(batch_dict["audio_embedding"])
55 |
56 | outputs = model(
57 | mel,
58 | speaker_ids,
59 | text,
60 | in_lens,
61 | out_lens,
62 | binarize_attention=binarize,
63 | attn_prior=attn_prior,
64 | f0=f0,
65 | energy_avg=energy_avg,
66 | voiced_mask=voiced_mask,
67 | audio_embedding=audio_embedding,
68 | )
69 |
70 | loss_outputs = criterion(outputs, in_lens, out_lens)
71 |
72 | print_list = []
73 | loss = None
74 | for k, (v, w) in loss_outputs.items():
75 | if w > 0:
76 | loss = v * w if loss is None else loss + v * w
77 | print_list.append(" | {}: {:.3f}".format(k, v))
78 |
79 | w_bin = criterion.loss_weights.get("binarization_loss_weight", 1.0)
80 | if binarize and iteration >= kl_loss_start_iter:
81 | binarization_loss = attention_kl_loss(outputs["attn"], outputs["attn_soft"])
82 | loss += binarization_loss * w_bin
83 | else:
84 | binarization_loss = torch.zeros_like(loss)
85 | loss_outputs["binarization_loss"] = (binarization_loss, w_bin)
86 | grad_clip_val = 1.0 # TODO (Sam): make this a config option
87 | print(print_list)
88 | scaler.scale(loss).backward()
89 | if grad_clip_val > 0:
90 | scaler.unscale_(optim)
91 | torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_val)
92 |
93 | scaler.step(optim)
94 | scaler.update()
95 |
96 | metrics = {"loss": loss.item()}
97 | for k, (v, w) in loss_outputs.items():
98 | metrics[k] = v.item()
99 |
100 | print("iteration: ", iteration, datetime.now())
101 | log_sample = iteration % steps_per_sample == 0
102 | log_checkpoint = iteration % iters_per_checkpoint == 0
103 |
104 | if log_sample and session.get_world_rank() == 0:
105 | model.eval()
106 | # TODO (Sam): adding tf output logging and out of distribution inference
107 | # TODO (Sam): add logging of ground truth
108 | images, audios = get_log_audio(
109 | batch_dict,
110 | log_decoder_samples,
111 | log_attribute_samples,
112 | model,
113 | speaker_ids,
114 | text,
115 | f0,
116 | energy_avg,
117 | voiced_mask,
118 | vocoder,
119 | )
120 | # TODO (Sam): make out of sample logging cleaner.
121 | # NOTE (Sam): right now this requires precomputation of embeddings and isn't out of sample zero shot.
122 | # gt_path = "/usr/src/app/radtts/ground_truth"
123 | # oos_embs = os.listdir(gt_path)
124 | # # this doesn't help for reasons described above
125 | # for oos_name in oos_embs:
126 | # audio_embedding_oos = torch.load(f"{gt_path}/{oos_name}").cuda()
127 | # _, audios_oos = get_log_audio(
128 | # outputs,
129 | # batch_dict,
130 | # log_decoder_samples,
131 | # log_attribute_samples,
132 | # model,
133 | # speaker_ids,
134 | # text,
135 | # f0,
136 | # energy_avg,
137 | # voiced_mask,
138 | # vocoder,
139 | # oos_name=oos_name,
140 | # audio_embedding_oos=audio_embedding_oos,
141 | # )
142 | # audios.update(audios_oos)
143 | log(
144 | metrics,
145 | audios,
146 | sample_rate=getattr(vocoder, "sr", 22050),
147 | images=images,
148 | )
149 | model.train()
150 | else:
151 | log(metrics)
152 |
153 | if log_checkpoint and session.get_world_rank() == 0:
154 | checkpoint_path = f"{output_directory}/model_{iteration}.pt"
155 | save_checkpoint(model, optim, iteration, checkpoint_path)
156 |
157 | print(f"Loss: {loss.item()}")
158 |
--------------------------------------------------------------------------------
/uberduck_ml_dev/models/rvc/commons.py:
--------------------------------------------------------------------------------
1 | import math
2 | import numpy as np
3 | import torch
4 | from torch import nn
5 | from torch.nn import functional as F
6 |
7 |
8 | def init_weights(m, mean=0.0, std=0.01):
9 | classname = m.__class__.__name__
10 | if classname.find("Conv") != -1:
11 | m.weight.data.normal_(mean, std)
12 |
13 |
14 | def get_padding(kernel_size, dilation=1):
15 | return int((kernel_size * dilation - dilation) / 2)
16 |
17 |
18 | def convert_pad_shape(pad_shape):
19 | l = pad_shape[::-1]
20 | pad_shape = [item for sublist in l for item in sublist]
21 | return pad_shape
22 |
23 |
24 | def kl_divergence(m_p, logs_p, m_q, logs_q):
25 | """KL(P||Q)"""
26 | kl = (logs_q - logs_p) - 0.5
27 | kl += (
28 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
29 | )
30 | return kl
31 |
32 |
33 | def rand_gumbel(shape):
34 | """Sample from the Gumbel distribution, protect from overflows."""
35 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
36 | return -torch.log(-torch.log(uniform_samples))
37 |
38 |
39 | def rand_gumbel_like(x):
40 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
41 | return g
42 |
43 |
44 | def slice_segments(x, ids_str, segment_size=4):
45 | ret = torch.zeros_like(x[:, :, :segment_size])
46 | for i in range(x.size(0)):
47 | idx_str = ids_str[i]
48 | idx_end = idx_str + segment_size
49 | ret[i] = x[i, :, idx_str:idx_end]
50 |
51 | return ret
52 |
53 |
54 | def slice_segments2(x, ids_str, segment_size=4):
55 | ret = torch.zeros_like(x[:, :segment_size])
56 | for i in range(x.size(0)):
57 | idx_str = ids_str[i]
58 | idx_end = idx_str + segment_size
59 | ret[i] = x[i, idx_str:idx_end]
60 | return ret
61 |
62 |
63 | def rand_slice_segments(x, x_lengths=None, segment_size=4):
64 | b, d, t = x.size()
65 | if x_lengths is None:
66 | x_lengths = t
67 | ids_str_max = (
68 | x_lengths - segment_size
69 | ) # + 1 # NOTE (Sam): remove +1 to avoid rounding error when starting with mels.
70 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
71 | ret = slice_segments(x, ids_str, segment_size)
72 | return ret, ids_str
73 |
74 |
75 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
76 | position = torch.arange(length, dtype=torch.float)
77 | num_timescales = channels // 2
78 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
79 | num_timescales - 1
80 | )
81 | inv_timescales = min_timescale * torch.exp(
82 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
83 | )
84 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
85 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
86 | signal = F.pad(signal, [0, 0, 0, channels % 2])
87 | signal = signal.view(1, channels, length)
88 | return signal
89 |
90 |
91 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
92 | b, channels, length = x.size()
93 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
94 | return x + signal.to(dtype=x.dtype, device=x.device)
95 |
96 |
97 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
98 | b, channels, length = x.size()
99 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
100 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
101 |
102 |
103 | def subsequent_mask(length):
104 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
105 | return mask
106 |
107 |
108 | @torch.jit.script
109 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
110 | n_channels_int = n_channels[0]
111 | in_act = input_a + input_b
112 | t_act = torch.tanh(in_act[:, :n_channels_int, :])
113 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
114 | acts = t_act * s_act
115 | return acts
116 |
117 |
118 | def convert_pad_shape(pad_shape):
119 | l = pad_shape[::-1]
120 | pad_shape = [item for sublist in l for item in sublist]
121 | return pad_shape
122 |
123 |
124 | def shift_1d(x):
125 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
126 | return x
127 |
128 |
129 | def sequence_mask(length, max_length=None):
130 | if max_length is None:
131 | max_length = length.max()
132 | x = torch.arange(max_length, dtype=length.dtype, device=length.device)
133 | return x.unsqueeze(0) < length.unsqueeze(1)
134 |
135 |
136 | def generate_path(duration, mask):
137 | """
138 | duration: [b, 1, t_x]
139 | mask: [b, 1, t_y, t_x]
140 | """
141 | device = duration.device
142 |
143 | b, _, t_y, t_x = mask.shape
144 | cum_duration = torch.cumsum(duration, -1)
145 |
146 | cum_duration_flat = cum_duration.view(b * t_x)
147 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
148 | path = path.view(b, t_x, t_y)
149 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
150 | path = path.unsqueeze(1).transpose(2, 3) * mask
151 | return path
152 |
153 |
154 | def clip_grad_value_(parameters, clip_value, norm_type=2):
155 | if isinstance(parameters, torch.Tensor):
156 | parameters = [parameters]
157 | parameters = list(filter(lambda p: p.grad is not None, parameters))
158 | norm_type = float(norm_type)
159 | if clip_value is not None:
160 | clip_value = float(clip_value)
161 |
162 | total_norm = 0
163 | for p in parameters:
164 | param_norm = p.grad.data.norm(norm_type)
165 | total_norm += param_norm.item() ** norm_type
166 | if clip_value is not None:
167 | p.grad.data.clamp_(min=-clip_value, max=clip_value)
168 | total_norm = total_norm ** (1.0 / norm_type)
169 | return total_norm
170 |
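171 | 
172 | # Usage sketch (illustrative) for the masking and slicing helpers above:
173 | # lengths = torch.tensor([3, 5])
174 | # mask = sequence_mask(lengths)  # (2, 5) boolean mask, True inside each sequence
175 | # x = torch.randn(2, 80, 100)    # (batch, channels, frames)
176 | # segments, ids_str = rand_slice_segments(x, torch.tensor([100, 100]), segment_size=32)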
--------------------------------------------------------------------------------
/uberduck_ml_dev/trainer/hifigan/train_step.py:
--------------------------------------------------------------------------------
1 | from torch.cuda.amp import autocast
2 | from ray.air import session
3 | from datetime import datetime
4 | from einops import rearrange
5 |
6 | from ...models.rvc.commons import clip_grad_value_, slice_segments
7 | from ...data.utils import (
8 | mel_spectrogram_torch,
9 | spec_to_mel_torch,
10 | )
11 | from ..log import log
12 | from ..rvc.save import save_checkpoint
13 | from ...models.rvc.commons import rand_slice_segments
14 |
15 | from ...data.data import MAX_WAV_VALUE
16 |
17 |
18 | # NOTE (Sam): passing dict arguments to functions is a bit of a code smell.
19 | # TODO (Sam): the data parameters have slightly different names here
20 | # (e.g. hop_length v hop_size, filter_length v n_fft, num_mels v n_mel_channels, win_length v win_size, mel_fmin v fmin) - unify.
21 | def train_step(
22 | batch, config, models, optimization_parameters, logging_parameters, iteration
23 | ):
24 | data_config = config["data"]
25 | train_config = config["train"]
26 | generator = models["generator"]
27 | discriminator = models["discriminator"]
28 | discriminator_optimizer = optimization_parameters["optimizers"]["discriminator"]
29 | generator_optimizer = optimization_parameters["optimizers"]["generator"]
30 | scaler = optimization_parameters["scaler"]
31 | discriminator_loss = optimization_parameters["losses"]["discriminator"]["loss"]
32 | # NOTE (Sam): The reason to pass the loss as a parameter rather than import it is to reuse the _train_step function for different losses.
33 | l1_loss = optimization_parameters["losses"]["l1"]["loss"]
34 | l1_loss_weight = optimization_parameters["losses"]["l1"]["weight"]
35 | generator_loss = optimization_parameters["losses"]["generator"]["loss"]
36 | generator_loss_weight = optimization_parameters["losses"]["generator"]["weight"]
37 | feature_loss = optimization_parameters["losses"]["feature"]["loss"]
38 | feature_loss_weight = optimization_parameters["losses"]["feature"]["weight"]
39 |
40 | batch = batch.to_gpu()
41 | mel_slices, ids_slice = rand_slice_segments(
42 | batch["mel_padded"],
43 | batch["mel_lengths"],
44 | train_config["segment_size"] // data_config["hop_size"],
45 | )
46 | # NOTE (Sam): it looks like audio_hat is a 3-dimensional tensor so that the slice method can be reused between mel and audio.
47 | audio_hat = generator(mel_slices)
48 |
49 | # with autocast(enabled=False):
50 | audio_sliced = slice_segments(
51 | batch["audio_padded"].unsqueeze(0) / MAX_WAV_VALUE,
52 | ids_slice * data_config["hop_size"],
53 | train_config["segment_size"],
54 | )
55 |
56 | audio_sliced = rearrange(audio_sliced, "c b t -> b c t")
57 |
58 | y_d_hat_r, y_d_hat_g, _, _ = discriminator(audio_sliced, audio_hat.detach())
59 |
60 | loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g)
61 | discriminator_optimizer.zero_grad()
62 | scaler.scale(loss_disc).backward()
63 | scaler.unscale_(discriminator_optimizer)
64 | grad_norm_d = clip_grad_value_(discriminator.parameters(), None)
65 | scaler.step(discriminator_optimizer)
66 |
67 | # with autocast(enabled=False):
68 | y_hat_mel = mel_spectrogram_torch(
69 | audio_hat.float().squeeze(1),
70 | data_config["n_fft"],
71 | data_config["num_mels"],
72 | data_config["sampling_rate"],
73 | data_config["hop_size"],
74 | data_config["win_size"],
75 | data_config["fmin"],
76 | data_config["fmax"],
77 | )
78 |
79 | # if train_config["fp16_run"] == True:
80 | # y_hat_mel = y_hat_mel.half()
81 | # with autocast(enabled=train_config["fp16_run"]):
82 | # NOTE (Sam): y_d_hat are list of coordinates of real and generated data at the output of each block
83 | # fmap_r and fmap_g are the same except earlier in the network.
84 | y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = discriminator(
85 | audio_sliced,
86 | audio_hat,
87 | )
88 |
89 | loss_mel = l1_loss(mel_slices, y_hat_mel) * train_config["c_mel"]
90 | loss_fm = feature_loss(fmap_r, fmap_g)
91 | loss_gen, losses_gen = generator_loss(y_d_hat_g)
92 | # TODO (Sam): put these in a loss_outputs dict like radtts
93 | loss_gen_all = (
94 | loss_gen * generator_loss_weight
95 | + loss_fm * feature_loss_weight
96 | + loss_mel * l1_loss_weight
97 | )
98 |
99 | generator_optimizer.zero_grad()
100 | scaler.scale(loss_gen_all).backward()
101 | scaler.unscale_(generator_optimizer)
102 | grad_norm_g = clip_grad_value_(generator.parameters(), None)
103 | scaler.step(generator_optimizer)
104 | scaler.update()
105 |
106 | print("iteration: ", iteration, datetime.now())
107 | log_sample = iteration % train_config["steps_per_sample"] == 0
108 | log_checkpoint = iteration % train_config["iters_per_checkpoint"] == 0
109 |
110 | metrics = {
111 | "generator_total_loss": loss_gen_all,
112 | "generator_loss": loss_gen,
113 | "generator_feature_loss": loss_fm,
114 | "generator_loss_mel": loss_mel,
115 | # "discriminator_total_loss": loss_disc,
116 | }
117 |
118 | log(metrics)
119 |
120 | if log_sample and session.get_world_rank() == 0:
121 | import numpy as np
122 |
123 | audios = {
124 | "ground_truth": {
125 | "audio": audio_sliced[0][0] / np.abs(audio_sliced[0][0].cpu()).max()
126 | },
127 | "generated": {"audio": audio_hat[0][0]},
128 | }
129 | images = None
130 |
131 | log(audios=audios, images=images)
132 | if log_checkpoint and session.get_world_rank() == 0:
133 | checkpoint_path = f"{train_config['output_directory']}/model_{iteration}.pt"
134 | save_checkpoint(
135 | generator,
136 | generator_optimizer,
137 | discriminator,
138 | discriminator_optimizer,
139 | iteration,
140 | checkpoint_path,
141 | )
142 |
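For reference, here is a minimal sketch (not part of the repository file above) of the nested dictionaries that train_step expects, reconstructed from the key accesses in its body. All concrete values, learning rates, and loss callables below are placeholder assumptions; the real loss functions are defined elsewhere in this repository.

import torch

# Hypothetical stand-ins: the real generator/discriminator modules, optimizers,
# and loss callables are constructed in the training entry point, not here.
generator = torch.nn.Linear(1, 1)
discriminator = torch.nn.Linear(1, 1)
models = {"generator": generator, "discriminator": discriminator}

config = {
    # placeholder values; the real ones come from the training config
    "data": {"hop_size": 256, "n_fft": 1024, "num_mels": 80, "sampling_rate": 22050,
             "win_size": 1024, "fmin": 0, "fmax": None},
    "train": {"segment_size": 8192, "c_mel": 45, "steps_per_sample": 100,
              "iters_per_checkpoint": 1000, "output_directory": "/tmp/checkpoints"},
}

optimization_parameters = {
    "optimizers": {
        "generator": torch.optim.AdamW(generator.parameters(), lr=2e-4),
        "discriminator": torch.optim.AdamW(discriminator.parameters(), lr=2e-4),
    },
    "scaler": torch.cuda.amp.GradScaler(enabled=False),
    "losses": {
        "discriminator": {"loss": lambda real_outs, fake_outs: ...},     # placeholder callable
        "generator": {"loss": lambda fake_outs: ..., "weight": 1.0},     # placeholder weight
        "feature": {"loss": lambda fmap_r, fmap_g: ..., "weight": 1.0},  # placeholder weight
        "l1": {"loss": torch.nn.functional.l1_loss, "weight": 1.0},      # placeholder weight
    },
}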
--------------------------------------------------------------------------------
/uberduck_ml_dev/models/components/encoders/duration.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 | from torch import nn
5 | from torch.nn import functional as F
6 |
7 | from uberduck_ml_dev.models import common
8 |
9 |
10 | class StochasticDurationPredictor(nn.Module):
11 | def __init__(
12 | self,
13 | in_channels,
14 | filter_channels,
15 | kernel_size,
16 | p_dropout,
17 | n_flows=4,
18 | gin_channels=0,
19 | ):
20 | super().__init__()
21 | filter_channels = in_channels # this override needs to be removed in a future version.
22 | self.in_channels = in_channels
23 | self.filter_channels = filter_channels
24 | self.kernel_size = kernel_size
25 | self.p_dropout = p_dropout
26 | self.n_flows = n_flows
27 | self.gin_channels = gin_channels
28 |
29 | self.log_flow = common.Log()
30 | self.flows = nn.ModuleList()
31 | self.flows.append(common.ElementwiseAffine(2))
32 | for i in range(n_flows):
33 | self.flows.append(
34 | common.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
35 | )
36 | self.flows.append(common.Flip())
37 |
38 | self.post_pre = nn.Conv1d(1, filter_channels, 1)
39 | self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
40 | self.post_convs = common.DDSConv(
41 | filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
42 | )
43 | self.post_flows = nn.ModuleList()
44 | self.post_flows.append(common.ElementwiseAffine(2))
45 | for i in range(4):
46 | self.post_flows.append(
47 | common.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
48 | )
49 | self.post_flows.append(common.Flip())
50 |
51 | self.pre = nn.Conv1d(in_channels, filter_channels, 1)
52 | self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
53 | self.convs = common.DDSConv(
54 | filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
55 | )
56 | if gin_channels != 0:
57 | self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
58 |
59 | def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
60 | x = torch.detach(x)
61 | x = self.pre(x)
62 | if g is not None:
63 | g = torch.detach(g)
64 | x = x + self.cond(g)
65 | x = self.convs(x, x_mask)
66 | x = self.proj(x) * x_mask
67 |
68 | if not reverse:
69 | flows = self.flows
70 | assert w is not None
71 |
72 | logdet_tot_q = 0
73 | h_w = self.post_pre(w)
74 | h_w = self.post_convs(h_w, x_mask)
75 | h_w = self.post_proj(h_w) * x_mask
76 | e_q = (
77 | torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype)
78 | * x_mask
79 | )
80 | z_q = e_q
81 | for flow in self.post_flows:
82 | z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
83 | logdet_tot_q += logdet_q
84 | z_u, z1 = torch.split(z_q, [1, 1], 1)
85 | u = torch.sigmoid(z_u) * x_mask
86 | z0 = (w - u) * x_mask
87 | logdet_tot_q += torch.sum(
88 | (F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2]
89 | )
90 | logq = (
91 | torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2])
92 | - logdet_tot_q
93 | )
94 |
95 | logdet_tot = 0
96 | z0, logdet = self.log_flow(z0, x_mask)
97 | logdet_tot += logdet
98 | z = torch.cat([z0, z1], 1)
99 | for flow in flows:
100 | z, logdet = flow(z, x_mask, g=x, reverse=reverse)
101 | logdet_tot = logdet_tot + logdet
102 | nll = (
103 | torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2])
104 | - logdet_tot
105 | )
106 | return nll + logq # [b]
107 | else:
108 | flows = list(reversed(self.flows))
109 | flows = flows[:-2] + [flows[-1]] # remove a useless vflow
110 | z = (
111 | torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype)
112 | * noise_scale
113 | )
114 | for flow in flows:
115 | z = flow(z, x_mask, g=x, reverse=reverse)
116 | z0, z1 = torch.split(z, [1, 1], 1)
117 | logw = z0
118 | return logw
119 |
120 |
121 | class DurationPredictor(nn.Module):
122 | def __init__(
123 | self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
124 | ):
125 | super().__init__()
126 |
127 | self.in_channels = in_channels
128 | self.filter_channels = filter_channels
129 | self.kernel_size = kernel_size
130 | self.p_dropout = p_dropout
131 | self.gin_channels = gin_channels
132 |
133 | self.drop = nn.Dropout(p_dropout)
134 | self.conv_1 = nn.Conv1d(
135 | in_channels, filter_channels, kernel_size, padding=kernel_size // 2
136 | )
137 | self.norm_1 = common.LayerNorm(filter_channels)
138 | self.conv_2 = nn.Conv1d(
139 | filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
140 | )
141 | self.norm_2 = common.LayerNorm(filter_channels)
142 | self.proj = nn.Conv1d(filter_channels, 1, 1)
143 |
144 | if gin_channels != 0:
145 | self.cond = nn.Conv1d(gin_channels, in_channels, 1)
146 |
147 | def forward(self, x, x_mask, g=None):
148 | x = torch.detach(x)
149 | if g is not None:
150 | g = torch.detach(g)
151 | x = x + self.cond(g)
152 | x = self.conv_1(x * x_mask)
153 | x = torch.relu(x)
154 | x = self.norm_1(x)
155 | x = self.drop(x)
156 | x = self.conv_2(x * x_mask)
157 | x = torch.relu(x)
158 | x = self.norm_2(x)
159 | x = self.drop(x)
160 | x = self.proj(x * x_mask)
161 | return x * x_mask
162 |
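A minimal usage sketch for DurationPredictor (illustrative only, not part of the file above; the channel sizes are hypothetical, and interpreting the output as log-durations follows the usual VITS-style convention):

import torch
from uberduck_ml_dev.models.components.encoders.duration import DurationPredictor

predictor = DurationPredictor(in_channels=192, filter_channels=256, kernel_size=3, p_dropout=0.5)
x = torch.randn(2, 192, 50)       # encoder output: (batch, channels, frames)
x_mask = torch.ones(2, 1, 50)     # 1.0 for valid frames, 0.0 for padding
log_w = predictor(x, x_mask)      # (batch, 1, frames), conventionally log-durations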
--------------------------------------------------------------------------------
/uberduck_ml_dev/text/numerical.py:
--------------------------------------------------------------------------------
1 | """ adapted from https://github.com/keithito/tacotron """
2 |
3 | import inflect
4 | import re
5 |
6 | _magnitudes = ["trillion", "billion", "million", "thousand", "hundred", "m", "b", "t"]
7 | _magnitudes_key = {"m": "million", "b": "billion", "t": "trillion"}
8 | _measurements = "(f|c|k|d|m)"
9 | _measurements_key = {"f": "fahrenheit", "c": "celsius", "k": "thousand", "m": "meters"}
10 | _currency_key = {"$": "dollar", "£": "pound", "€": "euro", "₩": "won"}
11 | _inflect = inflect.engine()
12 | _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
13 | _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
14 | _currency_re = re.compile(
15 | r"([\$€£₩])([0-9\.\,]*[0-9]+)(?:[ ]?({})(?=[^a-zA-Z]))?".format(
16 | "|".join(_magnitudes)
17 | ),
18 | re.IGNORECASE,
19 | )
20 | _measurement_re = re.compile(
21 | r"([0-9\.\,]*[0-9]+(\s)?{}\b)".format(_measurements), re.IGNORECASE
22 | )
23 | _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
24 | # _range_re = re.compile(r'(?<=[0-9])+(-)(?=[0-9])+.*?')
25 | _roman_re = re.compile(
26 | r"\b(?=[MDCLXVI]+\b)M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{2,3})\b"
27 | ) # avoid I
28 | _multiply_re = re.compile(r"(\b[0-9]+)(x)([0-9]+)")
29 | _number_re = re.compile(r"[0-9]+'s|[0-9]+s|[0-9]+")
30 |
31 |
32 | def _remove_commas(m):
33 | return m.group(1).replace(",", "")
34 |
35 |
36 | def _expand_decimal_point(m):
37 | return m.group(1).replace(".", " point ")
38 |
39 |
40 | def _expand_currency(m):
41 | currency = _currency_key[m.group(1)]
42 | quantity = m.group(2)
43 | magnitude = m.group(3)
44 |
45 | # remove commas from quantity to be able to convert to numerical
46 | quantity = quantity.replace(",", "")
47 |
48 | # check for million, billion, etc...
49 | if magnitude is not None and magnitude.lower() in _magnitudes:
50 | if len(magnitude) == 1:
51 | magnitude = _magnitudes_key[magnitude.lower()]
52 | return "{} {} {}".format(_expand_hundreds(quantity), magnitude, currency + "s")
53 |
54 | parts = quantity.split(".")
55 | if len(parts) > 2:
56 | return quantity + " " + currency + "s" # Unexpected format
57 |
58 | dollars = int(parts[0]) if parts[0] else 0
59 |
60 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
61 | if dollars and cents:
62 | dollar_unit = currency if dollars == 1 else currency + "s"
63 | cent_unit = "cent" if cents == 1 else "cents"
64 | return "{} {}, {} {}".format(
65 | _expand_hundreds(dollars),
66 | dollar_unit,
67 | _inflect.number_to_words(cents),
68 | cent_unit,
69 | )
70 | elif dollars:
71 | dollar_unit = currency if dollars == 1 else currency + "s"
72 | return "{} {}".format(_expand_hundreds(dollars), dollar_unit)
73 | elif cents:
74 | cent_unit = "cent" if cents == 1 else "cents"
75 | return "{} {}".format(_inflect.number_to_words(cents), cent_unit)
76 | else:
77 | return "zero" + " " + currency + "s"
78 |
79 |
80 | def _expand_hundreds(text):
81 | number = float(text)
82 | if 1000 < number < 10000 and (number % 100 == 0) and (number % 1000 != 0):
83 | return _inflect.number_to_words(int(number / 100)) + " hundred"
84 | else:
85 | return _inflect.number_to_words(text)
86 |
87 |
88 | def _expand_ordinal(m):
89 | return _inflect.number_to_words(m.group(0))
90 |
91 |
92 | def _expand_measurement(m):
93 | _, number, measurement = re.split(r"(\d+(?:\.\d+)?)", m.group(0))
94 | number = _inflect.number_to_words(number)
95 | measurement = "".join(measurement.split())
96 | measurement = _measurements_key[measurement.lower()]
97 | return "{} {}".format(number, measurement)
98 |
99 |
100 | def _expand_range(m):
101 | return " to "
102 |
103 |
104 | def _expand_multiply(m):
105 | left = m.group(1)
106 | right = m.group(3)
107 | return "{} by {}".format(left, right)
108 |
109 |
110 | def _expand_roman(m):
111 | # from https://stackoverflow.com/questions/19308177/converting-roman-numerals-to-integers-in-python
112 | roman_numerals = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
113 | result = 0
114 | num = m.group(0)
115 | for i, c in enumerate(num):
116 | if (i + 1) == len(num) or roman_numerals[c] >= roman_numerals[num[i + 1]]:
117 | result += roman_numerals[c]
118 | else:
119 | result -= roman_numerals[c]
120 | return str(result)
121 |
122 |
123 | def _expand_number(m):
124 | _, number, suffix = re.split(r"(\d+(?:'?\d+)?)", m.group(0))
125 | number = int(number)
126 | if (
127 | number > 1000
128 | and number < 10000
129 | and (number % 100 == 0)
130 | and (number % 1000 != 0)
131 | ):
132 | text = _inflect.number_to_words(number // 100) + " hundred"
133 | elif number > 1000 and number < 3000:
134 | if number == 2000:
135 | text = "two thousand"
136 | elif number > 2000 and number < 2010:
137 | text = "two thousand " + _inflect.number_to_words(number % 100)
138 | elif number % 100 == 0:
139 | text = _inflect.number_to_words(number // 100) + " hundred"
140 | else:
141 | number = _inflect.number_to_words(
142 | number, andword="", zero="oh", group=2
143 | ).replace(", ", " ")
144 | number = re.sub(r"-", " ", number)
145 | text = number
146 | else:
147 | number = _inflect.number_to_words(number, andword="and")
148 | number = re.sub(r"-", " ", number)
149 | number = re.sub(r",", "", number)
150 | text = number
151 |
152 | if suffix in ("'s", "s"):
153 | if text[-1] == "y":
154 | text = text[:-1] + "ies"
155 | else:
156 | text = text + suffix
157 |
158 | return text
159 |
160 |
161 | def normalize_currency(text):
162 | return re.sub(_currency_re, _expand_currency, text)
163 |
164 |
165 | def normalize_numbers(text):
166 | text = re.sub(_comma_number_re, _remove_commas, text)
167 | text = re.sub(_currency_re, _expand_currency, text)
168 | text = re.sub(_decimal_number_re, _expand_decimal_point, text)
169 | text = re.sub(_ordinal_re, _expand_ordinal, text)
170 | # text = re.sub(_range_re, _expand_range, text)
171 | # text = re.sub(_measurement_re, _expand_measurement, text)
172 | text = re.sub(_roman_re, _expand_roman, text)
173 | text = re.sub(_multiply_re, _expand_multiply, text)
174 | text = re.sub(_number_re, _expand_number, text)
175 | return text
176 |
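A small usage sketch for normalize_numbers (illustrative only, not part of the file above; the expected output shown in the comment is approximate):

from uberduck_ml_dev.text.numerical import normalize_numbers

print(normalize_numbers("It cost $3,000 in 1984"))
# -> roughly "It cost three thousand dollars in nineteen eighty four"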
--------------------------------------------------------------------------------