├── .github └── workflows │ └── main.yml ├── .gitignore ├── .pre-commit-config.yaml ├── MANIFEST.in ├── README.md ├── analytics ├── dependencies │ ├── details.json │ └── details.png └── tests │ ├── __init__.py │ ├── conftest.py │ ├── fixtures │ ├── ljtest │ │ ├── list.txt │ │ ├── list_small.txt │ │ ├── taco2_lj2lj.json │ │ └── wavs │ │ │ ├── LJ001-0001.wav │ │ │ ├── LJ001-0002.wav │ │ │ ├── LJ001-0003.wav │ │ │ ├── LJ001-0004.wav │ │ │ ├── LJ001-0005.wav │ │ │ ├── LJ001-0006.wav │ │ │ ├── LJ001-0007.wav │ │ │ ├── LJ001-0008.wav │ │ │ ├── LJ001-0009.wav │ │ │ ├── LJ001-0010.wav │ │ │ ├── LJ001-0011.wav │ │ │ ├── LJ001-0012.wav │ │ │ ├── LJ001-0013.wav │ │ │ ├── LJ001-0014.wav │ │ │ ├── LJ001-0015.wav │ │ │ └── LJ001-0016.wav │ ├── sample_spectrogram.pt │ ├── sample_spectrogram_tf.pt │ ├── stevejobs-1.pt │ ├── val.txt │ └── wavs │ │ └── stevejobs-1.wav │ ├── pytest.ini │ └── tests │ ├── __init__.py │ ├── models │ ├── __init__.py │ ├── test_common.py │ └── test_tacotron2.py │ ├── test_data_loader.py │ ├── text │ ├── test_symbols.py │ └── test_util.py │ ├── trainer │ ├── __init__.py │ └── test_trainer.py │ ├── utils │ └── test_utils.py │ └── vocoders │ └── test_hifi_gan.py ├── licenses ├── LICENSE ├── LICENSE2 ├── LICENSE3 ├── LICENSE4 └── LICENSE5 ├── settings.ini ├── setup.py ├── tutorials ├── hifigan │ ├── data_processing.py │ └── download.sh └── radtts │ ├── demo_config.json │ ├── download.sh │ ├── radtts_data_processing.ipynb │ └── train.sh └── uberduck_ml_dev ├── __init__.py ├── assets └── duck.png ├── data ├── __init__.py ├── batch.py ├── collate.py ├── data.py ├── get.py ├── hifigan.py ├── normalization.py ├── processor.py ├── ray.py ├── spectrogram.py ├── statistics.py └── utils.py ├── e2e.py ├── exec ├── __init__.py ├── dataset_statistics.py ├── normalize_audio.py ├── split_train_val.py ├── train_radtts_with_ray.py ├── train_tacotron2.py └── train_vits.py ├── losses.py ├── losses_rvc.py ├── models ├── __init__.py ├── avocodo.py ├── base.py ├── common.py ├── components │ ├── __init__.py │ ├── alignment.py │ ├── attention.py │ ├── attentions.py │ ├── attribute_prediction_model.py │ ├── autoregressive_flow.py │ ├── decoders │ │ ├── __init__.py │ │ └── tacotron2.py │ ├── encoders │ │ ├── __init__.py │ │ ├── duration.py │ │ ├── resnet_speaker_encoder.py │ │ ├── speaker │ │ │ ├── __init__.py │ │ │ ├── base_encoder.py │ │ │ └── resnet.py │ │ └── tacotron2.py │ ├── partialconv1d.py │ ├── postnet.py │ ├── prenet.py │ ├── splines.py │ └── transformer.py ├── hifigan.py ├── hubert.py ├── radtts.py ├── rvc │ ├── __init__.py │ ├── attentions.py │ ├── commons.py │ ├── modules.py │ ├── rmvpe.py │ ├── rvc.py │ ├── transforms.py │ ├── vc.py │ └── vc_infer_pipeline.py ├── tacotron2.py ├── torchmoji.py ├── transforms.py ├── utils.py └── vits.py ├── monitoring ├── __init__.py ├── generate.py ├── statistics.py ├── streamlit.py └── wandb.py ├── monotonic_align.py ├── optimizers └── radam.py ├── text ├── __init__.py ├── abbreviations.py ├── acronyms.py ├── cleaners.py ├── cmudict-0.7b ├── cmudict.py ├── datestime.py ├── grapheme_dictionary.py ├── heteronyms ├── letters_and_numbers.py ├── numerical.py ├── symbols.py ├── text_processing.py └── utils.py ├── trainer ├── __init__.py ├── base.py ├── hifigan │ ├── __init__.py │ ├── train.py │ ├── train_epoch.py │ └── train_step.py ├── load.py ├── log.py ├── radtts │ ├── __init__.py │ ├── load.py │ ├── log.py │ ├── save.py │ ├── train.py │ ├── train_epoch.py │ └── train_step.py ├── rvc │ ├── __init__.py │ ├── save.py │ ├── train.py │ ├── train_epoch.py │ ├── 
train_step.py │ └── utils.py └── tacotron2.py ├── utils ├── __init__.py ├── audio.py ├── config.py ├── denoiser.py ├── exec.py ├── hifiutils.py ├── plot.py └── utils.py └── vendor ├── __init__.py └── tfcompat ├── __init__.py └── hparam.py /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: [push, pull_request] 3 | jobs: 4 | build: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v1 8 | - uses: actions/setup-python@v1 9 | with: 10 | python-version: "3.10" 11 | architecture: "x64" 12 | - name: Install OS dependencies 13 | run: | 14 | sudo apt-get update 15 | sudo apt-get install espeak libsndfile-dev 16 | - name: Install the library 17 | run: | 18 | pip install -e . 19 | - name: Run tests 20 | run: | 21 | python -m pytest 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | experiments/outputs/* 2 | experiments/outputs_processed/* 3 | experiments/logs/* 4 | src/vendor_tools/* 5 | docker/secrets/gcloud_key.json 6 | docker/secrets/* 7 | nbs/test/fixtures/models/* 8 | nbs/test/fixtures/results/* 9 | test_checkpoint/* 10 | 11 | *.bak 12 | .gitattributes 13 | .last_checked 14 | .gitconfig 15 | *.bak 16 | *.log 17 | *~ 18 | ~* 19 | _tmp* 20 | tmp* 21 | tags 22 | 23 | # Byte-compiled / optimized / DLL files 24 | __pycache__/ 25 | *.py[cod] 26 | *$py.class 27 | 28 | # C extensions 29 | *.so 30 | 31 | # Distribution / packaging 32 | .Python 33 | env/ 34 | build/ 35 | develop-eggs/ 36 | dist/ 37 | downloads/ 38 | eggs/ 39 | .eggs/ 40 | lib/ 41 | lib64/ 42 | parts/ 43 | sdist/ 44 | var/ 45 | wheels/ 46 | *.egg-info/ 47 | .installed.cfg 48 | *.egg 49 | 50 | # PyInstaller 51 | # Usually these files are written by a python script from a template 52 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 53 | *.manifest 54 | *.spec 55 | 56 | # Installer logs 57 | pip-log.txt 58 | pip-delete-this-directory.txt 59 | 60 | # Unit test / coverage reports 61 | htmlcov/ 62 | .tox/ 63 | .coverage 64 | .coverage.* 65 | .cache 66 | nosetests.xml 67 | coverage.xml 68 | *.cover 69 | .hypothesis/ 70 | 71 | # Translations 72 | *.mo 73 | *.pot 74 | 75 | # Django stuff: 76 | *.log 77 | local_settings.py 78 | 79 | # Flask stuff: 80 | instance/ 81 | .webassets-cache 82 | 83 | # Scrapy stuff: 84 | .scrapy 85 | 86 | # Sphinx documentation 87 | docs/_build/ 88 | docs/.jekyll-cache 89 | 90 | # PyBuilder 91 | target/ 92 | 93 | # Jupyter Notebook 94 | .ipynb_checkpoints 95 | 96 | # pyenv 97 | .python-version 98 | 99 | # celery beat schedule file 100 | celerybeat-schedule 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # dotenv 106 | .env 107 | 108 | # virtualenv 109 | .venv 110 | venv/ 111 | ENV/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | 126 | .vscode 127 | *.swp 128 | 129 | # osx generated files 130 | .DS_Store 131 | .DS_Store? 
132 | .Trashes 133 | ehthumbs.db 134 | Thumbs.db 135 | .idea 136 | 137 | # pytest 138 | .pytest_cache 139 | 140 | # tools/trust-doc-nbs 141 | docs_src/.last_checked 142 | 143 | # symlinks to fastai 144 | docs_src/fastai 145 | tools/fastai 146 | 147 | # link checker 148 | checklink/cookies.txt 149 | 150 | # .gitconfig is now autogenerated 151 | .gitconfig 152 | 153 | nbs/runs 154 | events.out.tfevents* 155 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.3.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - repo: https://github.com/psf/black 9 | rev: 22.3.0 10 | hooks: 11 | - id: black 12 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include settings.ini 2 | include README.md 3 | recursive-exclude * __pycache__ 4 | include uberduck_ml_dev/text/* 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deprecation note 2 | We are moving away from maintaining this repository. 3 | 4 | # 🦆 ~~Uberduck Synthetic Speech~~ 5 |  6 |  7 |  8 |  9 | [](https://discord.com/invite/ATYWnMu) 10 | 11 | This repository includes 12 |
tags around the doc strings, preserving newlines/indentation. 68 | #monospace_docstrings = False 69 | #Test flags: introduce here the test flags you want to use separated by | 70 | tst_flags=slow 71 | #Custom sidebar: customize sidebar.json yourself for advanced sidebars (False/True) 72 | #custom_sidebar = 73 | #Custom jekyll styles: if you want more jekyll styles than tip/important/warning, set them here 74 | #jekyll_styles = note,warning,tip,important 75 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from pkg_resources import parse_version 2 | from configparser import ConfigParser 3 | import setuptools, re, sys 4 | 5 | assert parse_version(setuptools.__version__) >= parse_version("36.2") 6 | 7 | # note: all settings are in settings.ini; edit there, not here 8 | config = ConfigParser(delimiters=["="]) 9 | config.read("settings.ini") 10 | cfg = config["DEFAULT"] 11 | 12 | cfg_keys = "version description keywords author author_email".split() 13 | expected = ( 14 | cfg_keys 15 | + "lib_name user branch license status min_python audience language".split() 16 | ) 17 | for o in expected: 18 | assert o in cfg, "missing expected setting: {}".format(o) 19 | setup_cfg = {o: cfg[o] for o in cfg_keys} 20 | 21 | if len(sys.argv) > 1 and sys.argv[1] == "version": 22 | print(setup_cfg["version"]) 23 | exit() 24 | 25 | licenses = { 26 | "apache2": ( 27 | "Apache Software License 2.0", 28 | "OSI Approved :: Apache Software License", 29 | ), 30 | "mit": ("MIT License", "OSI Approved :: MIT License"), 31 | "gpl2": ( 32 | "GNU General Public License v2", 33 | "OSI Approved :: GNU General Public License v2 (GPLv2)", 34 | ), 35 | "gpl3": ( 36 | "GNU General Public License v3", 37 | "OSI Approved :: GNU General Public License v3 (GPLv3)", 38 | ), 39 | "bsd3": ("BSD License", "OSI Approved :: BSD License"), 40 | } 41 | statuses = [ 42 | "1 - Planning", 43 | "2 - Pre-Alpha", 44 | "3 - Alpha", 45 | "4 - Beta", 46 | "5 - Production/Stable", 47 | "6 - Mature", 48 | "7 - Inactive", 49 | ] 50 | py_versions = ( 51 | "2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 3.0 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8".split() 52 | ) 53 | 54 | lic = licenses.get(cfg["license"].lower(), (cfg["license"], None)) 55 | min_python = cfg["min_python"] 56 | 57 | requirements = ["pip", "packaging"] 58 | if cfg.get("requirements"): 59 | requirements += cfg.get("requirements", "").split() 60 | if cfg.get("pip_requirements"): 61 | requirements += cfg.get("pip_requirements", "").split() 62 | dev_requirements = (cfg.get("dev_requirements") or "").split() 63 | 64 | long_description = open("README.md", encoding="utf-8").read() 65 | #  66 | for ext in ["png", "svg"]: 67 | long_description = re.sub( 68 | r"!\[" + ext + "\]\((.*)\)", 69 | " 73 | + "/" 74 | + cfg["branch"] 75 | + "/\\1)", 76 | long_description, 77 | ) 78 | long_description = re.sub( 79 | r"src=\"(.*)\." + ext + '"', 80 | 'src="https://raw.githubusercontent.com/{}/{}'.format( 81 | cfg["user"], cfg["lib_name"] 82 | ) 83 | + "/" 84 | + cfg["branch"] 85 | + "/\\1." 
86 | + ext 87 | + '"', 88 | long_description, 89 | ) 90 | 91 | setuptools.setup( 92 | name=cfg["lib_name"], 93 | license=lic[0], 94 | classifiers=[ 95 | "Development Status :: " + statuses[int(cfg["status"])], 96 | "Intended Audience :: " + cfg["audience"].title(), 97 | "Natural Language :: " + cfg["language"].title(), 98 | ] 99 | + [ 100 | "Programming Language :: Python :: " + o 101 | for o in py_versions[py_versions.index(min_python) :] 102 | ] 103 | + (["License :: " + lic[1]] if lic[1] else []), 104 | url=cfg["git_url"], 105 | packages=setuptools.find_packages(), 106 | include_package_data=True, 107 | package_data={ 108 | "": ["uberduck_ml_dev/text/heteronyms", "uberduck_ml_dev/text/cmudict-0.7b"] 109 | }, 110 | install_requires=requirements, 111 | extras_require={"dev": dev_requirements}, 112 | python_requires=">=" + cfg["min_python"], 113 | long_description=long_description, 114 | long_description_content_type="text/markdown", 115 | zip_safe=False, 116 | entry_points={"console_scripts": cfg.get("console_scripts", "").split()}, 117 | **setup_cfg 118 | ) 119 | -------------------------------------------------------------------------------- /tutorials/hifigan/data_processing.py: -------------------------------------------------------------------------------- 1 | import os 2 | from scipy.io.wavfile import read, write 3 | import librosa 4 | import torch 5 | import numpy as np 6 | 7 | from uberduck_ml_dev.data.get import get 8 | from uberduck_ml_dev.data.utils import mel_spectrogram_torch, find_rel_paths 9 | from uberduck_ml_dev.data.data import HIFIGAN_DEFAULTS as DEFAULTS 10 | from uberduck_ml_dev.data.data import MAX_WAV_VALUE 11 | 12 | 13 | data_directory = "" # path to the directory containing the data 14 | ground_truth_rel_paths = find_rel_paths(directory=data_directory, filename="gt.wav") 15 | ground_truth_abs_paths = [ 16 | os.path.join(data_directory, ground_truth_rel_path) 17 | for ground_truth_rel_path in ground_truth_rel_paths 18 | ] 19 | 20 | 21 | print("resampling and integer normalizing") 22 | 23 | resampled_normalized_abs_paths = [ 24 | resampled_normalized_abs_path.replace( 25 | "gt.wav", "audio_resampledT_normalized32768T.wav" 26 | ) 27 | for resampled_normalized_abs_path in ground_truth_abs_paths 28 | ] 29 | 30 | loading_function = lambda filename: librosa.load(filename, sr=22050)[0] 31 | processing_function = lambda x: np.asarray( 32 | (x / np.abs(x).max()) * (MAX_WAV_VALUE - 1), dtype=np.int16 33 | ) 34 | saving_function = lambda data, filename: write( 35 | filename, 22050, data 36 | ) # must be in this order 37 | 38 | 39 | get( 40 | processing_function, 41 | saving_function, 42 | loading_function, 43 | ground_truth_abs_paths, 44 | resampled_normalized_abs_paths, 45 | True, 46 | ) 47 | 48 | print("resampling and float normalizing") 49 | 50 | resampled_normalized_abs_paths = [ 51 | resampled_normalized_abs_path.replace("gt.wav", "audio_resampledT_normalized1T.wav") 52 | for resampled_normalized_abs_path in ground_truth_abs_paths 53 | ] 54 | 55 | loading_function = lambda filename: librosa.load(filename, sr=22050)[0] 56 | processing_function = lambda x: np.asarray( 57 | (x / np.abs(x).max()) * (1 - 1 / MAX_WAV_VALUE), dtype=np.float32 58 | ) 59 | saving_function = lambda data, filename: write( 60 | filename, 22050, data 61 | ) # must be in this order 62 | 63 | 64 | get( 65 | processing_function, 66 | saving_function, 67 | loading_function, 68 | ground_truth_abs_paths, 69 | resampled_normalized_abs_paths, 70 | True, 71 | ) 72 | 73 | 74 | print("computing spectrograms 
from 1 normalized audio") 75 | 76 | spectrogram_abs_paths = [ 77 | ground_truth_abs_path.replace("gt.wav", "spectrogram.pt") 78 | for ground_truth_abs_path in ground_truth_abs_paths 79 | ] 80 | 81 | 82 | processing_function = lambda x: mel_spectrogram_torch( 83 | x, 84 | DEFAULTS["n_fft"], 85 | DEFAULTS["num_mels"], 86 | DEFAULTS["sampling_rate"], 87 | DEFAULTS["hop_size"], 88 | DEFAULTS["win_size"], 89 | DEFAULTS["fmin"], 90 | DEFAULTS["fmax"], 91 | True, 92 | ) 93 | loading_function = lambda source_path: torch.Tensor( 94 | read(source_path)[1] / MAX_WAV_VALUE 95 | ).unsqueeze(0) 96 | saving_function = lambda data, target_path: torch.save(data, target_path) 97 | 98 | get( 99 | processing_function, 100 | saving_function, 101 | loading_function, 102 | resampled_normalized_abs_paths, 103 | spectrogram_abs_paths, 104 | True, 105 | ) 106 | -------------------------------------------------------------------------------- /tutorials/hifigan/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget https://huggingface.co/datasets/Uberduck/ljspeech/resolve/main/lj_speech.zip 4 | unzip lj_speech.zip 5 | # wget https://huggingface.co/datasets/Uberduck/ljspeech/resolve/main/hifigan_libritts100360_generator0p5.pt 6 | # wget https://huggingface.co/datasets/Uberduck/ljspeech/resolve/main/hifigan_22khz_config.json 7 | -------------------------------------------------------------------------------- /tutorials/radtts/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget https://huggingface.co/datasets/Uberduck/ljspeech/resolve/main/lj_speech.zip 4 | unzip lj_speech.zip 5 | wget https://huggingface.co/datasets/Uberduck/ljspeech/resolve/main/hifigan_libritts100360_generator0p5.pt 6 | wget https://huggingface.co/datasets/Uberduck/ljspeech/resolve/main/hifigan_22khz_config.json 7 | -------------------------------------------------------------------------------- /tutorials/radtts/radtts_data_processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "0e3c74a5", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# For computing normalized audio, spectrograms, and pitches\n", 11 | "import os\n", 12 | "from uberduck_ml_dev.data.get import get_mels, get_pitches\n", 13 | "from uberduck_ml_dev.data.data import RADTTS_DEFAULTS as data_config\n", 14 | "\n", 15 | "from uberduck_ml_dev.data.get import get\n", 16 | "import librosa\n", 17 | "import numpy as np\n", 18 | "from scipy.io.wavfile import write\n", 19 | "from datetime import datetime" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 11, 25 | "id": "2710441c", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# data_dir = \"/path/to/data\"\n", 30 | "data_dir = \"/usr/src/app/uberduck_ml_dev/tutorials/radtts/lj/LJSpeech/\"" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 15, 36 | "id": "5cdc25fe", 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "def find_rel_paths(directory, filename):\n", 41 | " for root, dirs, files in os.walk(directory):\n", 42 | " if filename in files:\n", 43 | " yield os.path.relpath(os.path.join(root, filename), directory)\n", 44 | "\n", 45 | "filename = 'gt.wav' # replace with your filename\n", 46 | "rel_path_list = list(find_rel_paths(data_dir, filename))" 47 | ] 48 | }, 49 | { 50 | "cell_type": 
"code", 51 | "execution_count": 24, 52 | "id": "d9f989f6", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "local_path_list = [os.path.join(data_dir, rel_path) for rel_path in rel_path_list]\n", 57 | "resampled_normalized_path_list = [os.path.join(data_dir, \n", 58 | " local_path.split('gt.wav')[0],\n", 59 | " 'audio_resampledT_normalized32768T.wav') \n", 60 | " for local_path in local_path_list]\n", 61 | "spectrogram_path_list = [os.path.join(data_dir, local_path.split('gt.wav')[0],\n", 62 | " 'spectrogram.pt') \n", 63 | " for local_path in local_path_list]\n", 64 | "folder_path_list = [os.path.join(data_dir, local_path.split('gt.wav')[0]) \n", 65 | " for local_path in local_path_list]" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "f5ce0f25", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "print(\"resample and normalize\")\n", 76 | "MAX_WAV_VALUE = 32768\n", 77 | "sr = 22050\n", 78 | "loading_function = lambda filename : librosa.load(filename, sr = 22050)[0]\n", 79 | "function_ = lambda x : np.asarray((x / np.abs(x).max()) * (MAX_WAV_VALUE - 1), dtype = np.int16)\n", 80 | "saving_function = lambda data, filename : write(filename, 22050, data) # must be in this order\n", 81 | "\n", 82 | "print(datetime.now())\n", 83 | "get(function_, loading_function, saving_function, local_path_list, resampled_normalized_path_list, False)\n", 84 | "print(datetime.now())" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "id": "ab2d5894", 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "print(\"compute pitches\")\n", 95 | "get_pitches(resampled_normalized_path_list, data_config, folder_path_list, method = 'radtts')" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "08e86d85", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "print(\"compute mels\")\n", 106 | "get_mels(resampled_normalized_path_list, data_config, spectrogram_path_list)" 107 | ] 108 | } 109 | ], 110 | "metadata": { 111 | "kernelspec": { 112 | "display_name": "Python 3", 113 | "language": "python", 114 | "name": "python3" 115 | }, 116 | "language_info": { 117 | "codemirror_mode": { 118 | "name": "ipython", 119 | "version": 3 120 | }, 121 | "file_extension": ".py", 122 | "mimetype": "text/x-python", 123 | "name": "python", 124 | "nbconvert_exporter": "python", 125 | "pygments_lexer": "ipython3", 126 | "version": "3.8.10" 127 | } 128 | }, 129 | "nbformat": 4, 130 | "nbformat_minor": 5 131 | } 132 | -------------------------------------------------------------------------------- /tutorials/radtts/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd ../.. 
4 | # remember to set training and eval filelists, heteronyms_path and phoneme_dict_path vocoder_config_path and vocoder_checkpoint_path in demo_config.json 5 | python uberduck_ml_dev/exec/train_radtts_with_ray.py --config tutorials/radtts/demo_config.json 6 | -------------------------------------------------------------------------------- /uberduck_ml_dev/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.1" 2 | -------------------------------------------------------------------------------- /uberduck_ml_dev/assets/duck.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/assets/duck.png -------------------------------------------------------------------------------- /uberduck_ml_dev/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/data/__init__.py -------------------------------------------------------------------------------- /uberduck_ml_dev/data/batch.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from ..utils.utils import to_gpu 4 | 5 | 6 | class Batch(Dict): 7 | # NOTE (Sam): isn't gate target redundant to output length. 8 | # NOTE (Sam): here types are unused, but TypedDict inheritance doesn't allow methods 9 | # NOTE (Sam): these were also problems with object (I forget), NamedTuple (mutability), dataclass (I forget) 10 | 11 | # text_int_padded: Optional[torch.LongTensor] = None 12 | # input_lengths: Optional[torch.LongTensor] = None 13 | # mel_padded: Optional[torch.FloatTensor] = None # for teacher forcing. 14 | # gate_target: Optional[ 15 | # torch.LongTensor 16 | # ] = None # NOTE (Sam): could be bool - for teacher forcing. 17 | # output_lengths: Optional[torch.LongTensor] = None 18 | # speaker_ids: Optional[torch.LongTensor] = None 19 | # gst: Optional[torch.Tensor] = None 20 | # mel_outputs: Optional[torch.Tensor] = None # predicted. 21 | # mel_outputs_postnet: Optional[torch.Tensor] = None 22 | # gate_predicted: Optional[torch.LongTensor] = None # could be bool. 
23 | # alignments: Optional[torch.Tensor] = None 24 | # audio_encodings: Optional[torch.Tensor] = None 25 | 26 | def subset(self, keywords, fragile=False) -> "Batch": 27 | d = {} 28 | for k in keywords: 29 | try: 30 | d[k] = self[k] 31 | except KeyError: 32 | if fragile: 33 | raise 34 | return Batch(**d) 35 | 36 | def to_gpu(self) -> "Batch": 37 | batch_gpu = Batch(**{k: to_gpu(v) for k, v in self.items()}) 38 | return batch_gpu 39 | -------------------------------------------------------------------------------- /uberduck_ml_dev/data/get.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import DataLoader 2 | import librosa 3 | from pathlib import Path 4 | from tqdm import tqdm 5 | import torch 6 | import os 7 | 8 | from ..data.data import DataMel, DataPitch 9 | from ..data.collate import CollateBlank 10 | from ..data.processor import Processor 11 | 12 | 13 | def get_parallel_torch(data): 14 | data_loader = DataLoader( 15 | data, batch_size=32, collate_fn=CollateBlank(), num_workers=8 16 | ) 17 | for batch in data_loader: 18 | pass 19 | 20 | 21 | # TODO (Sam): use get_parallel_torch to reduce boilerplate. 22 | # NOTE (Sam): assumes data is in a directory structure like: 23 | # /tmp/{uuid}/resampled_normalized.wav 24 | # These functions add spectrogram.pt, f0.pt, and coqui_resnet_512_emb.pt to each file-specific directory. 25 | def get_mels(paths, data_config, target_paths): 26 | data = DataMel(audiopaths=paths, data_config=data_config, target_paths=target_paths) 27 | 28 | collate_fn = CollateBlank() 29 | 30 | data_loader = DataLoader( 31 | data, 32 | batch_size=32, 33 | collate_fn=collate_fn, 34 | ) 35 | for batch in data_loader: 36 | pass # computes in loader. 37 | 38 | 39 | # NOTE (Sam): pitch, pitchf == f0 coarse, f0bak in rvc parlance. 40 | # NOTE (Sam): sample_rate is also passed as part of data_config 41 | # TODO (Sam): decide on sample_rate v sampling_rate 42 | # NOTE (Sam): pyin (radtts) and parselmouth (rvc) methods seem to generate pitches of different lengths. 43 | def get_pitches( 44 | paths, 45 | data_config=None, 46 | target_folders=None, 47 | method="parselmouth", 48 | sample_rate=None, 49 | recompute=False, 50 | ): 51 | data = DataPitch( 52 | audiopaths=paths, 53 | data_config=data_config, 54 | target_folders=target_folders, 55 | method=method, 56 | sample_rate=data_config["sampling_rate"], 57 | recompute=recompute, 58 | ) 59 | get_parallel_torch(data) 60 | 61 | 62 | HUBERT_PATH = "hubert_embedding.pt" 63 | F0_PATH = "f0.pt" 64 | F0F_PATH = "f0f.pt" 65 | 66 | 67 | # NOTE (Sam): this is different from the other get functions because it doesn't use torch dataset. 68 | def get_hubert_embeddings( 69 | audiopaths, hubert_model, output_layer=9, hubert_path=HUBERT_PATH 70 | ): 71 | """Returns the abs path w.r.t penultimate directory name in audiopaths, e.g. suitable for /tmp/{uuid}/resampled_normalized.wav.""" 72 | hubert_abs_paths = [] 73 | for audiopath in tqdm(audiopaths): 74 | folder_path = str(Path(*Path(audiopath).parts[:-1])) 75 | hubert_abs_path = os.path.join(folder_path, hubert_path) 76 | # TODO (Sam): add hashing to avoid mistakenly not recomputing. 77 | if not os.path.exists(hubert_abs_path): 78 | # NOTE (Sam): Hubert expects 16k sample rate. 
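            # Load at 16 kHz (the rate HuBERT expects), run the encoder up to `output_layer`,
            # project the hidden states with final_proj, and cache the result to disk so the
            # existence check above can skip recomputation on later runs.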
79 | audio0, sr = librosa.load(audiopath, sr=16000) 80 | feats = torch.from_numpy(audio0) 81 | feats = feats.float() 82 | feats = feats.view(1, -1) 83 | padding_mask = torch.BoolTensor(feats.shape).to("cpu").fill_(False) 84 | inputs = { 85 | "source": feats.to("cpu"), 86 | "padding_mask": padding_mask, 87 | "output_layer": output_layer, 88 | } 89 | 90 | with torch.no_grad(): 91 | logits = hubert_model.extract_features(**inputs) 92 | feats = hubert_model.final_proj(logits[0]) 93 | torch.save(feats[0], hubert_abs_path) 94 | 95 | hubert_abs_paths.append(hubert_abs_path) 96 | 97 | return hubert_abs_paths 98 | 99 | 100 | def get( 101 | processing_function, 102 | saving_function, 103 | loading_function, 104 | source_paths, 105 | target_paths, 106 | recompute, 107 | ): 108 | function_ = lambda source_path, target_path: saving_function( 109 | processing_function(loading_function(source_path)), target_path 110 | ) 111 | processor = Processor( 112 | function_=function_, 113 | source_paths=source_paths, 114 | target_paths=target_paths, 115 | recompute=recompute, 116 | ) 117 | 118 | get_parallel_torch(processor) 119 | -------------------------------------------------------------------------------- /uberduck_ml_dev/data/normalization.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | from scipy.io.wavfile import write 4 | from ..models.tacotron2 import MAX_WAV_VALUE 5 | 6 | load_resampled_normalized_audio = lambda source_path: librosa.load( 7 | source_path, sr=22050 8 | )[0] 9 | float_normalize = lambda x: np.asarray( 10 | (x / np.abs(x).max()) * (MAX_WAV_VALUE - 1) / MAX_WAV_VALUE 11 | ) 12 | int_normalize = lambda x: np.asarray( 13 | (x / np.abs(x).max()) * (MAX_WAV_VALUE - 1), dtype=np.int16 14 | ) 15 | save_22k_audio = lambda data, target_path: write( 16 | target_path, 22050, data 17 | ) # must be in this order 18 | -------------------------------------------------------------------------------- /uberduck_ml_dev/data/processor.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, List, Dict 2 | import os 3 | 4 | 5 | # NOTE (Sam): this torch processor appears to be 10% faster than standard multiprocessing - perhaps this is overkill 6 | class Processor: 7 | def __init__( 8 | self, 9 | function_: Callable, 10 | source_paths: List[str], 11 | target_paths: List[ 12 | str 13 | ], # NOTE (Sam): this is target_folders in certain versions of the code since for example we want to save pitch at f0.pt and pitch mask as f0f.pt. Have to think of a solution. 14 | recompute: bool = True, 15 | ): 16 | self.source_paths = source_paths 17 | self.function_ = function_ 18 | self.target_paths = target_paths 19 | self.recompute = recompute 20 | 21 | def _get_data(self, source_path, target_path): 22 | # NOTE (Sam): we need caching to debug training issues in dev and for speed! 
23 | # NOTE (Sam): won't catch issues with recomputation using different parameters but name name 24 | # TODO (Sam): add hashing 25 | if self.recompute or not os.path.exists(target_path): 26 | self.function_(source_path, target_path) 27 | else: 28 | pass 29 | 30 | def __getitem__(self, idx): 31 | try: 32 | self._get_data( 33 | source_path=self.source_paths[idx], 34 | target_path=self.target_paths[idx], 35 | ) 36 | 37 | except Exception as e: 38 | print(f"Error while getting data: index = {idx}") 39 | print(e) 40 | raise 41 | return None 42 | 43 | def __len__(self): 44 | nfiles = len(self.source_paths) 45 | 46 | return nfiles 47 | -------------------------------------------------------------------------------- /uberduck_ml_dev/data/ray.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | import os 3 | 4 | from scipy.io import wavfile 5 | import torch 6 | import numpy as np 7 | import ray 8 | import pandas as pd 9 | 10 | 11 | from .utils import get_energy_average, f0_normalize 12 | from ..models.components.encoders import ResNetSpeakerEncoderCallable 13 | 14 | 15 | # NOTE (Sam): the ray dataset code runs mod cleanup but is seemingly slower than torch dataloader (not 100p sure if this is still true). 16 | def ray_df_preprocessing(df, data_config, tp, stft): 17 | transcripts = df.transcript.tolist() 18 | audio_bytes_list = df.audio_bytes.tolist() 19 | speaker_ids = df.speaker_id.tolist() 20 | f0_paths = df.f0_path.tolist() 21 | audio_embeddings = df.audio_embedding.tolist() 22 | # shuffle_indices = get_shuffle_indices(speaker_ids) 23 | # audio_embeddings = [audio_embeddings[i] for i in shuffle_indices] 24 | collate_input = [] 25 | for transcript, audio_bytes, speaker_id, f0_path, audio_embedding in zip( 26 | transcripts, audio_bytes_list, speaker_ids, f0_paths, audio_embeddings 27 | ): 28 | bio = BytesIO(audio_bytes) 29 | sr, wav_data = wavfile.read(bio) 30 | audio = torch.FloatTensor(wav_data) 31 | # NOTE (Sam): why normalize here? 
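        # Peak-normalizes the waveform to a maximum absolute amplitude of 0.5; the result is
        # passed to stft.get_mel together with data_config["max_wav_value"] below.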
32 | audio_norm = audio / (np.abs(audio).max() * 2) 33 | text_sequence = tp.get_text(transcript) 34 | mel = stft.get_mel(audio_norm, data_config["max_wav_value"]) 35 | mel = torch.squeeze(mel, 0) 36 | dikt = torch.load(f0_path) 37 | f0 = dikt["f0"] 38 | p_voiced = dikt["p_voiced"] 39 | voiced_mask = dikt["voiced_mask"] 40 | f0 = f0_normalize(f0, f0_min=data_config["f0_min"]) 41 | energy_avg = get_energy_average(mel) 42 | prior_path = "{}_{}".format(text_sequence.shape[0], mel.shape[1]) 43 | prior_path = os.path.join("/usr/src/app/radtts/data_cache", prior_path) 44 | prior_path += "_prior.pth" 45 | attn_prior = torch.load(prior_path) 46 | speaker_id = torch.LongTensor([speaker_id]) 47 | audio_embedding = torch.FloatTensor(audio_embedding) 48 | # NOTE (Sam): might be faster to return dictionary arrays of batched inputs instead of list 49 | collate_input.append( 50 | { 51 | "text_encoded": text_sequence, 52 | "mel": mel, 53 | "speaker_id": speaker_id, 54 | "f0": f0, 55 | "p_voiced": p_voiced, 56 | "voiced_mask": voiced_mask, 57 | "energy_avg": energy_avg, 58 | "attn_prior": attn_prior, 59 | "audiopath": None, 60 | "audio_embedding": audio_embedding, 61 | } 62 | ) 63 | 64 | return collate_input 65 | 66 | 67 | def get_ray_dataset(filelist_path, config_path, model_path): 68 | df = pd.read_csv( 69 | filelist_path, 70 | sep="|", 71 | header=None, 72 | quoting=3, 73 | names=["path", "transcript", "speaker_id", "f0_path", "emb_path"], 74 | ) 75 | 76 | paths = df.path.tolist() 77 | transcripts = df.transcript.tolist() 78 | speaker_ids = df.speaker_id.tolist() 79 | 80 | pitches = df.f0_path.tolist() 81 | 82 | parallelism_length = 400 83 | audio_ds = ray.data.read_binary_files( 84 | paths, 85 | parallelism=parallelism_length, 86 | ray_remote_args={"num_cpus": 1.0}, 87 | ) 88 | audio_ds = audio_ds.map_batches( 89 | lambda x: x, batch_format="pyarrow", batch_size=None 90 | ) 91 | 92 | paths_ds = ray.data.from_items(paths, parallelism=parallelism_length) 93 | paths_ds = paths_ds.map_batches( 94 | lambda x: x, batch_format="pyarrow", batch_size=None 95 | ) 96 | 97 | transcripts = ray.data.from_items(transcripts, parallelism=parallelism_length) 98 | transcripts_ds = transcripts.map_batches( 99 | lambda x: x, batch_format="pyarrow", batch_size=None 100 | ) 101 | 102 | speaker_ids_ds = ray.data.from_items(speaker_ids, parallelism=parallelism_length) 103 | speaker_ids_ds = speaker_ids_ds.map_batches( 104 | lambda x: x, batch_format="pyarrow", batch_size=None 105 | ) 106 | pitches_ds = ray.data.from_items(pitches, parallelism=parallelism_length) 107 | pitches_ds = pitches_ds.map_batches( 108 | lambda x: x, batch_format="pyarrow", batch_size=None 109 | ) 110 | 111 | embs_ds = ray.data.from_items(paths, parallelism=parallelism_length) 112 | embs_ds = embs_ds.map_batches( 113 | ResNetSpeakerEncoderCallable, 114 | fn_kwargs={"config_path": config_path, "model_path": model_path}, 115 | num_gpus=1.0, 116 | compute="actors", 117 | ) 118 | 119 | output_dataset = ( 120 | transcripts_ds.zip(audio_ds) 121 | .zip(paths_ds) 122 | .zip(speaker_ids_ds) 123 | .zip(pitches_ds) 124 | .zip(embs_ds) 125 | ) 126 | output_dataset = output_dataset.map_batches( 127 | lambda table: table.rename( 128 | columns={ 129 | "value": "transcript", 130 | "value_1": "audio_bytes", 131 | "value_2": "path", 132 | "value_3": "speaker_id", 133 | "value_4": "f0_path", 134 | "value_5": "emb_path", 135 | } 136 | ) 137 | ) 138 | 139 | processed_dataset = output_dataset.map_batches(ray_df_preprocessing) 140 | return processed_dataset.fully_executed() 141 
| -------------------------------------------------------------------------------- /uberduck_ml_dev/data/spectrogram.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from librosa.filters import mel as librosa_mel_fn 3 | 4 | from .utils import spectral_normalize_torch 5 | 6 | # NOTE (Sam): needed for importable lambdas. 7 | # TODO (Sam): remove redundancy from elsewhere in repo. 8 | hann_window = {} 9 | mel_basis = {} 10 | 11 | 12 | # TODO (Sam): combine with identically-named function is models.common 13 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 14 | """Convert waveform into Linear-frequency Linear-amplitude spectrogram. 15 | 16 | Args: 17 | y :: (B, T) - Audio waveforms 18 | n_fft 19 | sampling_rate 20 | hop_size 21 | win_size 22 | center 23 | Returns: 24 | :: (B, Freq, Frame) - Linear-frequency Linear-amplitude spectrogram 25 | """ 26 | # Validation 27 | if torch.min(y) < -1.0: 28 | print("min value is ", torch.min(y)) 29 | if torch.max(y) > 1.0: 30 | print("max value is ", torch.max(y)) 31 | 32 | # Window - Cache if needed 33 | global hann_window 34 | dtype_device = str(y.dtype) + "_" + str(y.device) 35 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 36 | if wnsize_dtype_device not in hann_window: 37 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 38 | dtype=y.dtype, device=y.device 39 | ) 40 | 41 | # Padding 42 | y = torch.nn.functional.pad( 43 | y.unsqueeze(1), 44 | # NOTE (Sam): combinining n_fft (filter_length) with hop_size reeks of either a bug or sophisticated asympotitc analysis. 45 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 46 | mode="reflect", 47 | ) 48 | y = y.squeeze(1) 49 | 50 | # Complex Spectrogram :: (B, T) -> (B, Freq, Frame, RealComplex=2) 51 | spec = torch.stft( 52 | y, 53 | n_fft, 54 | hop_length=hop_size, 55 | win_length=win_size, 56 | window=hann_window[wnsize_dtype_device], 57 | center=center, 58 | pad_mode="reflect", 59 | normalized=False, 60 | onesided=True, 61 | return_complex=False, 62 | ) 63 | # Linear-frequency Linear-amplitude spectrogram :: (B, Freq, Frame, RealComplex=2) -> (B, Freq, Frame) 64 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 65 | return spec 66 | 67 | 68 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 69 | # MelBasis - Cache if needed 70 | global mel_basis 71 | dtype_device = str(spec.dtype) + "_" + str(spec.device) 72 | fmax_dtype_device = str(fmax) + "_" + dtype_device 73 | if fmax_dtype_device not in mel_basis: 74 | mel = librosa_mel_fn( 75 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax 76 | ) 77 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 78 | dtype=spec.dtype, device=spec.device 79 | ) 80 | 81 | # Mel-frequency Log-amplitude spectrogram :: (B, Freq=num_mels, Frame) 82 | melspec = torch.matmul(mel_basis[fmax_dtype_device], spec) 83 | melspec = spectral_normalize_torch(melspec) 84 | return melspec 85 | 86 | 87 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 88 | """ 89 | PARAMS 90 | ------ 91 | C: compression factor 92 | """ 93 | return torch.log(torch.clamp(x, min=clip_val) * C) 94 | 95 | 96 | def spectral_normalize_torch(magnitudes): 97 | return dynamic_range_compression_torch(magnitudes) 98 | 99 | 100 | def mel_spectrogram_torch( 101 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False 102 | ): 103 | """Convert waveform into Mel-frequency Log-amplitude spectrogram. 
104 | 105 | Args: 106 | y :: (B, T) - Waveforms 107 | Returns: 108 | melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram 109 | """ 110 | # Linear-frequency Linear-amplitude spectrogram :: (B, T) -> (B, Freq, Frame) 111 | spec = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center) 112 | 113 | # Mel-frequency Log-amplitude spectrogram :: (B, Freq, Frame) -> (B, Freq=num_mels, Frame) 114 | melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax) 115 | 116 | return melspec 117 | 118 | 119 | from ..data.data import HIFIGAN_DEFAULTS as DEFAULTS 120 | from scipy.io.wavfile import read 121 | import librosa 122 | 123 | mel_spec = lambda x: mel_spectrogram_torch( 124 | x, 125 | DEFAULTS["n_fft"], 126 | DEFAULTS["num_mels"], 127 | DEFAULTS["sampling_rate"], 128 | # 100, 129 | # 24000,#DEFAULTS["sampling_rate"], 130 | DEFAULTS["hop_size"], 131 | DEFAULTS["win_size"], 132 | DEFAULTS["fmin"], 133 | None, 134 | False, # center 135 | ) 136 | 137 | load_audio = lambda source_path: torch.Tensor(read(source_path)[1]).unsqueeze(0) 138 | save_torch = lambda data, target_path: torch.save(data[0], target_path) 139 | -------------------------------------------------------------------------------- /uberduck_ml_dev/data/statistics.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "word_frequencies", 3 | "create_wordcloud", 4 | "count_frequency", 5 | "pace_character", 6 | "pace_phoneme", 7 | "get_sample_format", 8 | "AbsoluteMetrics", 9 | ] 10 | 11 | from typing import List, Any, Dict, Union, Optional 12 | from collections import Counter 13 | import os 14 | 15 | import librosa 16 | import numpy as np 17 | from pydub.utils import mediainfo_json 18 | from wordfreq import word_frequency 19 | 20 | from ..text.utils import text_to_sequence 21 | 22 | # NOTE (Sam): this file could be refactored so that it doesn't contain both speechmetrics and wordfreqencies - very different types of statistics. 23 | 24 | 25 | def word_frequencies(text: str, language: str = "en") -> List[float]: 26 | """ 27 | Calculate the frequency [0-1] which the words appear in the english language 28 | """ 29 | freqs = [] 30 | for word in text.split(): 31 | freqs.append(word_frequency(word, language)) 32 | return freqs 33 | 34 | 35 | def count_frequency(arr: List[Any]) -> Dict[Any, int]: 36 | """ 37 | Calculates the frequency that a value appears in a list 38 | """ 39 | return dict(Counter(arr).most_common()) 40 | 41 | 42 | def pace_character( 43 | text: str, audio: Union[str, np.ndarray], sr: Optional[int] = None 44 | ) -> float: 45 | """ 46 | Calculates the number of characters in the text per second of the audio file. Audio can be a file path or an np array. 47 | """ 48 | if isinstance(audio, str): 49 | audio, sr = librosa.load(audio, sr=None) 50 | else: 51 | assert sr, "Sampling rate must be provided if audio is np array" 52 | 53 | return len(text) / librosa.get_duration(audio, sr=sr) 54 | 55 | 56 | def pace_phoneme( 57 | text: str, audio: Union[str, np.ndarray], sr: Optional[int] = None 58 | ) -> float: 59 | """ 60 | Calculates the number of phonemes in the text per second of the audio. Audio can be a file path or an np array. 
61 | """ 62 | if isinstance(audio, str): 63 | audio, sr = librosa.load(audio, sr=None) 64 | else: 65 | assert sr, "Sampling rate must be provided if audio is np array" 66 | 67 | arpabet_seq = text_to_sequence(text, ["english_cleaners"], p_arpabet=1.0) 68 | return len(arpabet_seq) / librosa.get_duration(audio, sr=sr) 69 | 70 | 71 | def get_sample_format(wav_file: str): 72 | """ 73 | Get sample format of the .wav file: https://trac.ffmpeg.org/wiki/audio%20types 74 | """ 75 | filename, file_extension = os.path.splitext(wav_file) 76 | assert file_extension == ".wav", ".wav file must be supplied" 77 | 78 | info = mediainfo_json(wav_file) 79 | audio_streams = [x for x in info["streams"] if x["codec_type"] == "audio"] 80 | return audio_streams[0].get("sample_fmt") 81 | 82 | 83 | class AbsoluteMetrics: 84 | """This class loads and calculates the absolute metrics, MOSNet and SRMR""" 85 | 86 | def __init__(self, window_length: Optional[int] = None): 87 | # NOTE(zach): There are some problems installing speechmetrics via pip and it's not critical, so import inline to avoid issues in CI. 88 | import speechmetrics 89 | 90 | self.metrics = speechmetrics.load("absolute", window_length) 91 | 92 | def __call__(self, wav_file: str) -> Dict[str, float]: 93 | """ 94 | Returns a Dict[str,float] with keys "mosnet" and "srmr" 95 | """ 96 | filename, file_extension = os.path.splitext(wav_file) 97 | assert file_extension == ".wav", ".wav file must be supplied" 98 | 99 | return self.metrics(wav_file) 100 | -------------------------------------------------------------------------------- /uberduck_ml_dev/e2e.py: -------------------------------------------------------------------------------- 1 | __all__ = ["tts", "rhythm_transfer"] 2 | 3 | 4 | import torch 5 | 6 | from .text.symbols import NVIDIA_TACO2_SYMBOLS 7 | from .text.utils import prepare_input_sequence 8 | 9 | 10 | from typing import List 11 | 12 | from .models.tacotron2 import Tacotron2 13 | from .vocoders.hifigan import HiFiGanGenerator 14 | 15 | 16 | def tts( 17 | lines: List[str], 18 | model, 19 | device: str, 20 | vocoder, 21 | arpabet=False, 22 | symbol_set=NVIDIA_TACO2_SYMBOLS, 23 | max_wav_value=32768.0, 24 | speaker_ids=None, 25 | ): 26 | assert isinstance( 27 | model, Tacotron2 28 | ), "Only Tacotron2 text-to-mel models are supported" 29 | assert isinstance(vocoder, HiFiGanGenerator), "Only Hifi GAN vocoders are supported" 30 | cpu_run = device == "cpu" 31 | sequences, input_lengths = prepare_input_sequence( 32 | lines, cpu_run=cpu_run, arpabet=arpabet, symbol_set=symbol_set 33 | ) 34 | if speaker_ids is None: 35 | speaker_ids = torch.zeros(len(lines), dtype=torch.long, device=device) 36 | input_ = sequences, input_lengths, speaker_ids 37 | _, mel_outputs_postnet, gate_outputs, alignment, lengths = model.inference(input_) 38 | mels = mel_outputs_postnet 39 | mel = mels[0, :, : lengths[0].item()] 40 | for idx in range(1, mels.size(0)): 41 | length = lengths[idx].item() 42 | mel = torch.cat((mel, mels[idx, :, :length]), dim=-1) 43 | tensor_cls = torch.FloatTensor if device == "cpu" else torch.cuda.FloatTensor 44 | mel = mel[None, :] 45 | y_g_hat = vocoder(tensor_cls(mel).to(device=device)) 46 | audio = y_g_hat.reshape(1, -1) 47 | audio = audio * max_wav_value 48 | return audio 49 | 50 | 51 | from typing import Optional 52 | 53 | from .models.common import MelSTFT 54 | 55 | 56 | @torch.no_grad() 57 | def rhythm_transfer( 58 | original_audio: torch.tensor, 59 | original_text: str, 60 | model, 61 | vocoder, 62 | device: str, 63 | 
symbol_set=NVIDIA_TACO2_SYMBOLS, 64 | arpabet=False, 65 | max_wav_value=32768.0, 66 | speaker_id=0, 67 | ): 68 | assert len(original_audio.shape) == 1 69 | cpu_run = device == "cpu" 70 | # TODO(zach): Support non-default STFT parameters. 71 | stft = MelSTFT() 72 | p_arpabet = float(arpabet) 73 | sequence, input_lengths, _ = prepare_input_sequence( 74 | [original_text], arpabet=arpabet, cpu_run=cpu_run, symbol_set=symbol_set 75 | ) 76 | original_target_mel = stft.mel_spectrogram(original_audio[None]) 77 | if not cpu_run: 78 | original_target_mel = original_target_mel.cuda() 79 | max_len = original_target_mel.size(2) 80 | speaker_ids = torch.tensor([speaker_id], dtype=torch.long, device=device) 81 | inputs = ( 82 | sequence, 83 | input_lengths, 84 | original_target_mel, 85 | max_len, 86 | torch.tensor([max_len], dtype=torch.long, device=device), 87 | speaker_ids, 88 | ) 89 | attn = model.get_alignment(inputs) 90 | _, mel_postnet, _, _ = model.inference_noattention( 91 | (sequence, input_lengths, speaker_ids, attn.transpose(0, 1)) 92 | ) 93 | y_g_hat = vocoder(torch.tensor(mel_postnet, dtype=torch.float, device=device)) 94 | audio = y_g_hat.reshape(1, -1) 95 | audio = audio * max_wav_value 96 | return audio 97 | -------------------------------------------------------------------------------- /uberduck_ml_dev/exec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/exec/__init__.py -------------------------------------------------------------------------------- /uberduck_ml_dev/exec/normalize_audio.py: -------------------------------------------------------------------------------- 1 | __all__ = ["run", "parse_args"] 2 | 3 | 4 | import argparse 5 | import os 6 | import sys 7 | 8 | from ..utils.audio import normalize_audio, trim_audio 9 | 10 | 11 | def run(dirname, backup, top_db): 12 | """Normalize all the audio files in a directory.""" 13 | old_dirname = dirname 14 | if backup: 15 | old_dirname = f"{os.path.normpath(old_dirname)}_backup" 16 | os.rename(dirname, old_dirname) 17 | for dirpath, _, filenames in os.walk(old_dirname): 18 | rel_path = os.path.relpath(dirpath, old_dirname) 19 | for filename in filenames: 20 | if not filename.endswith(".wav"): 21 | continue 22 | old_path = os.path.join(dirpath, filename) 23 | new_path = os.path.join(dirname, rel_path, filename) 24 | if not os.path.exists(os.path.join(dirname, rel_path)): 25 | os.makedirs(os.path.join(dirname, rel_path)) 26 | trim_audio(old_path, new_path, top_db) 27 | 28 | 29 | def parse_args(args): 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument( 32 | "-d", 33 | "--dirname", 34 | help="Path to the directory which contains audio files to normalize.", 35 | ) 36 | parser.add_argument("--backup", dest="backup", action="store_true") 37 | parser.add_argument("--no-backup", dest="backup", action="store_false") 38 | parser.add_argument("--top-db", type=int) 39 | parser.set_defaults(backup=True, top_db=20) 40 | return parser.parse_args(args) 41 | 42 | 43 | try: 44 | from nbdev.imports import IN_NOTEBOOK 45 | except: 46 | IN_NOTEBOOK = False 47 | 48 | if __name__ == "__main__" and not IN_NOTEBOOK: 49 | args = parse_args(sys.argv[1:]) 50 | run(args.dirname, args.backup, args.top_db) 51 | -------------------------------------------------------------------------------- /uberduck_ml_dev/exec/split_train_val.py: 
-------------------------------------------------------------------------------- 1 | __all__ = ["write_filenames", "run", "parse_args"] 2 | 3 | 4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | from sklearn.model_selection import train_test_split 9 | 10 | 11 | def write_filenames(filenames, output_dir, output_filename): 12 | """ 13 | Writes a list of filenames of as each line of a .txt file specified by output_filename. 14 | """ 15 | with open(os.path.join(output_dir, output_filename), "w") as f: 16 | for item in filenames: 17 | f.write(f"{item}\n") 18 | 19 | 20 | def run( 21 | path, 22 | val_percent=0.2, 23 | val_num=None, 24 | train_file="train.txt", 25 | val_file="val.txt", 26 | ): 27 | """Split file in t 28 | Default behavior only creates a training and validation set (not test set). 29 | """ 30 | with open(path) as f: 31 | lines = [l.strip("\n") for l in f.readlines()] 32 | 33 | train, val = train_test_split(lines, test_size=val_num if val_num else val_percent) 34 | write_filenames(train, Path(os.path.dirname(path)), train_file) 35 | write_filenames(val, Path(os.path.dirname(path)), val_file) 36 | 37 | 38 | import argparse 39 | import sys 40 | 41 | 42 | def parse_args(args): 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument( 45 | "-i", "--in", dest="input_path", help="Path to input file list", required=True 46 | ) 47 | parser.add_argument("-n", "--num_val", dest="num_val", type=float, default=0.1) 48 | args = parser.parse_args(args) 49 | return args 50 | 51 | 52 | try: 53 | from nbdev.imports import IN_NOTEBOOK 54 | except: 55 | IN_NOTEBOOK = False 56 | 57 | if __name__ == "__main__" and not IN_NOTEBOOK: 58 | args = parse_args(sys.argv[1:]) 59 | if args.num_val > 1: 60 | run(args.input_path, val_num=int(args.num_val)) 61 | else: 62 | run(args.input_path, val_percent=args.num_val) 63 | -------------------------------------------------------------------------------- /uberduck_ml_dev/exec/train_radtts_with_ray.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import os 4 | 5 | from ray.air.config import ScalingConfig, RunConfig 6 | from ray.train.torch import TorchTrainer 7 | from ray.tune import SyncConfig 8 | from ray.train.torch import TorchTrainer, TorchTrainer 9 | from ray.air.config import ScalingConfig, RunConfig 10 | 11 | from uberduck_ml_dev.trainer.radtts.train import train_func 12 | from uberduck_ml_dev.utils.exec import parse_args 13 | from uberduck_ml_dev.trainer.radtts.train import DEFAULTS as TRAIN_CONFIG 14 | from uberduck_ml_dev.data.data import RADTTS_DEFAULTS as DATA_CONFIG 15 | from uberduck_ml_dev.models.radtts import DEFAULTS as MODEL_CONFIG 16 | 17 | if __name__ == "__main__": 18 | args = parse_args(sys.argv[1:]) 19 | if args.config: 20 | with open(args.config) as f: 21 | config_inputs = json.load(f) 22 | 23 | config = dict( 24 | train_config=TRAIN_CONFIG, data_config=DATA_CONFIG, model_config=MODEL_CONFIG 25 | ) 26 | config["train_config"].update(config_inputs["train_config"]) 27 | config["data_config"].update(config_inputs["data_config"]) 28 | config["model_config"].update(config_inputs["model_config"]) 29 | 30 | os.makedirs(config["train_config"]["output_directory"], exist_ok=True) 31 | trainer = TorchTrainer( 32 | train_loop_per_worker=train_func, 33 | train_loop_config=config, 34 | scaling_config=ScalingConfig( 35 | num_workers=config["train_config"]["n_gpus"], 36 | use_gpu=True, 37 | resources_per_worker=dict( 38 | 
CPU=config["data_config"]["num_workers"], 39 | GPU=1, 40 | ), 41 | ), 42 | run_config=RunConfig(sync_config=SyncConfig()), 43 | ) 44 | 45 | result = trainer.fit() 46 | -------------------------------------------------------------------------------- /uberduck_ml_dev/exec/train_tacotron2.py: -------------------------------------------------------------------------------- 1 | __all__ = ["parse_args", "run"] 2 | 3 | from ..trainer.tacotron2 import Tacotron2Trainer 4 | from ..vendor.tfcompat.hparam import HParams 5 | from ..trainer.tacotron2 import DEFAULTS as TACOTRON2_TRAINER_DEFAULTS 6 | import argparse 7 | import sys 8 | import json 9 | import torch 10 | from torch import multiprocessing as mp 11 | 12 | 13 | def parse_args(args): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--config", help="Path to JSON config") 16 | args = parser.parse_args(args) 17 | return args 18 | 19 | 20 | def run(rank, device_count, hparams): 21 | trainer = Tacotron2Trainer(hparams, rank=rank, world_size=device_count) 22 | try: 23 | trainer.train() 24 | except Exception as e: 25 | print(f"Exception raised while training: {e}") 26 | # TODO: save state. 27 | raise e 28 | 29 | 30 | try: 31 | from nbdev.imports import IN_NOTEBOOK 32 | except: 33 | IN_NOTEBOOK = False 34 | if __name__ == "__main__" and not IN_NOTEBOOK: 35 | args = parse_args(sys.argv[1:]) 36 | config = TACOTRON2_TRAINER_DEFAULTS.values() 37 | if args.config: 38 | with open(args.config) as f: 39 | config.update(json.load(f)) 40 | config.update(vars(args)) 41 | hparams = HParams(**config) 42 | if hparams.distributed_run: 43 | device_count = torch.cuda.device_count() 44 | mp.spawn(run, (device_count, hparams), device_count) 45 | else: 46 | run(None, None, hparams) 47 | -------------------------------------------------------------------------------- /uberduck_ml_dev/exec/train_vits.py: -------------------------------------------------------------------------------- 1 | __all__ = ["parse_args", "run"] 2 | 3 | 4 | import argparse 5 | import json 6 | import librosa # NOTE(zach): importing torch before librosa causes LLVM issues for some unknown reason. 7 | import sys 8 | 9 | import torch 10 | from torch import multiprocessing as mp 11 | 12 | from ..trainer.vits import VITSTrainer 13 | from ..vendor.tfcompat.hparam import HParams 14 | from ..models.vits import DEFAULTS as VITS_DEFAULTS 15 | 16 | 17 | def parse_args(args): 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--config", help="Path to JSON config") 20 | args = parser.parse_args(args) 21 | return args 22 | 23 | 24 | def run(rank, device_count, hparams): 25 | trainer = VITSTrainer(hparams, rank=rank, world_size=device_count) 26 | try: 27 | trainer.train() 28 | except Exception as e: 29 | print(f"Exception raised while training: {e}") 30 | # TODO: save state. 
31 | raise e 32 | 33 | 34 | try: 35 | from nbdev.imports import IN_NOTEBOOK 36 | except: 37 | IN_NOTEBOOK = False 38 | if __name__ == "__main__" and not IN_NOTEBOOK: 39 | args = parse_args(sys.argv[1:]) 40 | config = VITS_DEFAULTS.values() 41 | if args.config: 42 | with open(args.config) as f: 43 | config.update(json.load(f)) 44 | hparams = HParams(**config) 45 | if hparams.distributed_run: 46 | device_count = torch.cuda.device_count() 47 | mp.spawn(run, (device_count, hparams), device_count) 48 | else: 49 | run(0, 1, hparams) 50 | -------------------------------------------------------------------------------- /uberduck_ml_dev/losses_rvc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | 5 | def feature_loss(fmap_r, fmap_g): 6 | loss = 0 7 | for dr, dg in zip(fmap_r, fmap_g): 8 | for rl, gl in zip(dr, dg): 9 | rl = rl.float().detach() 10 | gl = gl.float() 11 | loss += torch.mean(torch.abs(rl - gl)) 12 | 13 | return loss * 2 14 | 15 | 16 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 17 | loss = 0 18 | r_losses = [] 19 | g_losses = [] 20 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 21 | dr = dr.float() 22 | dg = dg.float() 23 | r_loss = torch.mean((1 - dr) ** 2) 24 | g_loss = torch.mean(dg**2) 25 | loss += r_loss + g_loss 26 | r_losses.append(r_loss.item()) 27 | g_losses.append(g_loss.item()) 28 | 29 | return loss, r_losses, g_losses 30 | 31 | 32 | def generator_loss(disc_outputs): 33 | loss = 0 34 | gen_losses = [] 35 | for dg in disc_outputs: 36 | dg = dg.float() 37 | l = torch.mean((1 - dg) ** 2) 38 | gen_losses.append(l) 39 | loss += l 40 | 41 | return loss, gen_losses 42 | 43 | 44 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 45 | """ 46 | z_p, logs_q: [b, h, t_t] 47 | m_p, logs_p: [b, h, t_t] 48 | """ 49 | z_p = z_p.float() 50 | logs_q = logs_q.float() 51 | m_p = m_p.float() 52 | logs_p = logs_p.float() 53 | z_mask = z_mask.float() 54 | 55 | kl = logs_p - logs_q - 0.5 56 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 57 | kl = torch.sum(kl * z_mask) 58 | l = kl / torch.sum(z_mask) 59 | return l 60 | -------------------------------------------------------------------------------- /uberduck_ml_dev/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/models/__init__.py -------------------------------------------------------------------------------- /uberduck_ml_dev/models/base.py: -------------------------------------------------------------------------------- 1 | __all__ = ["TTSModel", "DEFAULTS"] 2 | 3 | import torch 4 | from torch import nn 5 | 6 | from ..text.symbols import SYMBOL_SETS 7 | from ..vendor.tfcompat.hparam import HParams 8 | 9 | 10 | class TTSModel(nn.Module): 11 | def __init__(self, hparams): 12 | super().__init__() 13 | self.symbol_set = hparams.symbol_set 14 | self.n_symbols = len(SYMBOL_SETS[self.symbol_set]) 15 | self.n_speakers = hparams.n_speakers 16 | # symbols = __import__('uberduck_ml_dev.text.' 
+ hparams.symbols) 17 | 18 | def infer(self): 19 | raise NotImplementedError 20 | 21 | def forward(self): 22 | raise NotImplementedError 23 | 24 | def from_pretrained( 25 | self, warm_start_path=None, device="cpu", ignore_layers=None, model_dict=None 26 | ): 27 | model_dict = model_dict or dict() 28 | if warm_start_path is None and not model_dict: 29 | raise Exception( 30 | "TTSModel.from_pretrained requires a warm_start_path or state_dict" 31 | ) 32 | if warm_start_path is not None: 33 | checkpoint = torch.load(warm_start_path, map_location=device) 34 | if ( 35 | "state_dict" in checkpoint.keys() 36 | ): # TODO: remove state_dict once off nvidia 37 | model_dict = checkpoint["state_dict"] 38 | if "model" in checkpoint.keys(): 39 | model_dict = checkpoint["model"] 40 | if ignore_layers: 41 | model_dict = {k: v for k, v in model_dict.items() if k not in ignore_layers} 42 | dummy_dict = self.state_dict() 43 | 44 | for k in self.state_dict().keys(): 45 | if k not in model_dict.keys(): 46 | print( 47 | f"WARNING! Attempting to load a model without the {k} layer. This could lead to unexpected results during evaluation." 48 | ) 49 | 50 | dummy_dict.update(model_dict) 51 | model_dict = dummy_dict 52 | self.load_state_dict(model_dict) 53 | if device == "cuda": 54 | self.cuda() 55 | 56 | def to_checkpoint(self): 57 | return dict(model=self.state_dict()) 58 | 59 | @classmethod 60 | def create(cls, name, opts, folders, all_speakers=True): 61 | pass 62 | 63 | 64 | DEFAULTS = HParams( 65 | p_arpabet=1.0, 66 | seed=1234, 67 | # NOTE (Sam): make sure users change their configurations for cudnn_enabled = True. 68 | cudnn_enabled=False, 69 | ) 70 | -------------------------------------------------------------------------------- /uberduck_ml_dev/models/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/models/components/__init__.py -------------------------------------------------------------------------------- /uberduck_ml_dev/models/components/alignment.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: MIT 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a 5 | # copy of this software and associated documentation files (the "Software"), 6 | # to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | # and/or sell copies of the Software, and to permit persons to whom the 9 | # Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | # DEALINGS IN THE SOFTWARE.
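# This module implements monotonic alignment search (MAS) with a hardcoded width of 1: `mas_width1`
# below takes a soft attention map of shape (mel_frames, text_tokens), runs a dynamic program over its
# log-probabilities, and backtracks to return a binary, monotonically non-decreasing alignment of the
# same shape. A minimal usage sketch (the shapes and the Dirichlet soft attention are illustrative only):
#     soft_attn = np.random.dirichlet(np.ones(20), size=80)  # (80 mel frames, 20 text tokens), rows sum to 1
#     hard_attn = mas_width1(soft_attn)                       # 0/1 path with the same shape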
21 | import sys 22 | import numpy as np 23 | from matplotlib import pylab as plt 24 | from numba import jit 25 | 26 | 27 | def save_plot(fname, attn_map): 28 | plt.imshow(attn_map) 29 | plt.savefig(fname) 30 | 31 | 32 | @jit(nopython=True) 33 | def mas_width1(attn_map): 34 | """mas with hardcoded width=1""" 35 | # assumes mel x text 36 | opt = np.zeros_like(attn_map) 37 | attn_map = np.log(attn_map) 38 | attn_map[0, 1:] = -np.inf 39 | log_p = np.zeros_like(attn_map) 40 | log_p[0, :] = attn_map[0, :] 41 | prev_ind = np.zeros_like(attn_map, dtype=np.int64) 42 | for i in range(1, attn_map.shape[0]): 43 | for j in range(attn_map.shape[1]): # for each text dim 44 | prev_log = log_p[i - 1, j] 45 | prev_j = j 46 | 47 | if j - 1 >= 0 and log_p[i - 1, j - 1] >= log_p[i - 1, j]: 48 | prev_log = log_p[i - 1, j - 1] 49 | prev_j = j - 1 50 | 51 | log_p[i, j] = attn_map[i, j] + prev_log 52 | prev_ind[i, j] = prev_j 53 | 54 | # now backtrack 55 | curr_text_idx = attn_map.shape[1] - 1 56 | for i in range(attn_map.shape[0] - 1, -1, -1): 57 | opt[i, curr_text_idx] = 1 58 | curr_text_idx = prev_ind[i, curr_text_idx] 59 | opt[0, curr_text_idx] = 1 60 | return opt 61 | 62 | 63 | if __name__ == "__main__": 64 | attn_ = np.load(sys.argv[1]) 65 | attn = attn_.squeeze() 66 | save_plot("orig.png", attn) 67 | binarized = mas_width1(attn) 68 | save_plot("binarized.png", binarized) 69 | -------------------------------------------------------------------------------- /uberduck_ml_dev/models/components/attention.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch 3 | from numpy import finfo 4 | from torch.nn import functional as F 5 | from typing import Optional 6 | 7 | from ..common import LinearNorm, LocationLayer 8 | 9 | 10 | class Attention(nn.Module): 11 | def __init__( 12 | self, 13 | attention_rnn_dim, 14 | embedding_dim, 15 | attention_dim, 16 | attention_location_n_filters, 17 | attention_location_kernel_size, 18 | fp16_run, 19 | ): 20 | super(Attention, self).__init__() 21 | self.query_layer = LinearNorm( 22 | attention_rnn_dim, attention_dim, bias=False, w_init_gain="tanh" 23 | ) 24 | self.memory_layer = LinearNorm( 25 | embedding_dim, attention_dim, bias=False, w_init_gain="tanh" 26 | ) 27 | self.v = LinearNorm(attention_dim, 1, bias=False) 28 | self.location_layer = LocationLayer( 29 | attention_location_n_filters, attention_location_kernel_size, attention_dim 30 | ) 31 | if fp16_run: 32 | self.score_mask_value = finfo("float16").min 33 | else: 34 | self.score_mask_value = -float("inf") 35 | 36 | def get_alignment_energies(self, query, processed_memory, attention_weights_cat): 37 | """ 38 | PARAMS 39 | ------ 40 | query: decoder output (batch, n_mel_channels * n_frames_per_step) 41 | processed_memory: processed encoder outputs (B, T_in, attention_dim) 42 | attention_weights_cat: cumulative and prev.
att weights (B, 2, max_time) 43 | 44 | RETURNS 45 | ------- 46 | alignment (batch, max_time) 47 | """ 48 | 49 | processed_query = self.query_layer(query.unsqueeze(1)) 50 | processed_attention_weights = self.location_layer(attention_weights_cat) 51 | energies = self.v( 52 | torch.tanh(processed_query + processed_attention_weights + processed_memory) 53 | ) 54 | 55 | energies = energies.squeeze(-1) 56 | return energies 57 | 58 | def forward( 59 | self, 60 | attention_hidden_state, 61 | memory, 62 | processed_memory, 63 | attention_weights_cat, 64 | mask, 65 | attention_weights: Optional[torch.Tensor], 66 | ): 67 | """ 68 | PARAMS 69 | ------ 70 | attention_hidden_state: attention rnn last output 71 | memory: encoder outputs 72 | processed_memory: processed encoder outputs 73 | attention_weights_cat: previous and cummulative attention weights 74 | mask: binary mask for padded data 75 | """ 76 | if attention_weights is None: 77 | alignment = self.get_alignment_energies( 78 | attention_hidden_state, processed_memory, attention_weights_cat 79 | ) 80 | 81 | if mask is not None: 82 | alignment.data.masked_fill_(mask, self.score_mask_value) 83 | 84 | attention_weights = F.softmax(alignment, dim=1) 85 | attention_context = torch.bmm(attention_weights.unsqueeze(1), memory) 86 | attention_context = attention_context.squeeze(1) 87 | 88 | return attention_context, attention_weights 89 | -------------------------------------------------------------------------------- /uberduck_ml_dev/models/components/decoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/models/components/decoders/__init__.py -------------------------------------------------------------------------------- /uberduck_ml_dev/models/components/encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/models/components/encoders/__init__.py -------------------------------------------------------------------------------- /uberduck_ml_dev/models/components/encoders/duration.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | from uberduck_ml_dev.models import common 8 | 9 | 10 | class StochasticDurationPredictor(nn.Module): 11 | def __init__( 12 | self, 13 | in_channels, 14 | filter_channels, 15 | kernel_size, 16 | p_dropout, 17 | n_flows=4, 18 | gin_channels=0, 19 | ): 20 | super().__init__() 21 | filter_channels = in_channels # it needs to be removed from future version. 
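# NOTE: descriptive summary added here as an assumption about intent (this matches the VITS-style
# stochastic duration predictor): in training mode (`reverse=False`) the posterior flows below encode
# the observed durations `w` and `forward` returns a negative log-likelihood term plus the posterior
# log-density; at inference (`reverse=True`) the flows are run backwards from noise scaled by
# `noise_scale` and the returned `logw` is the predicted log-duration per input token.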
22 | self.in_channels = in_channels 23 | self.filter_channels = filter_channels 24 | self.kernel_size = kernel_size 25 | self.p_dropout = p_dropout 26 | self.n_flows = n_flows 27 | self.gin_channels = gin_channels 28 | 29 | self.log_flow = common.Log() 30 | self.flows = nn.ModuleList() 31 | self.flows.append(common.ElementwiseAffine(2)) 32 | for i in range(n_flows): 33 | self.flows.append( 34 | common.ConvFlow(2, filter_channels, kernel_size, n_layers=3) 35 | ) 36 | self.flows.append(common.Flip()) 37 | 38 | self.post_pre = nn.Conv1d(1, filter_channels, 1) 39 | self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) 40 | self.post_convs = common.DDSConv( 41 | filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout 42 | ) 43 | self.post_flows = nn.ModuleList() 44 | self.post_flows.append(common.ElementwiseAffine(2)) 45 | for i in range(4): 46 | self.post_flows.append( 47 | common.ConvFlow(2, filter_channels, kernel_size, n_layers=3) 48 | ) 49 | self.post_flows.append(common.Flip()) 50 | 51 | self.pre = nn.Conv1d(in_channels, filter_channels, 1) 52 | self.proj = nn.Conv1d(filter_channels, filter_channels, 1) 53 | self.convs = common.DDSConv( 54 | filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout 55 | ) 56 | if gin_channels != 0: 57 | self.cond = nn.Conv1d(gin_channels, filter_channels, 1) 58 | 59 | def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0): 60 | x = torch.detach(x) 61 | x = self.pre(x) 62 | if g is not None: 63 | g = torch.detach(g) 64 | x = x + self.cond(g) 65 | x = self.convs(x, x_mask) 66 | x = self.proj(x) * x_mask 67 | 68 | if not reverse: 69 | flows = self.flows 70 | assert w is not None 71 | 72 | logdet_tot_q = 0 73 | h_w = self.post_pre(w) 74 | h_w = self.post_convs(h_w, x_mask) 75 | h_w = self.post_proj(h_w) * x_mask 76 | e_q = ( 77 | torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) 78 | * x_mask 79 | ) 80 | z_q = e_q 81 | for flow in self.post_flows: 82 | z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w)) 83 | logdet_tot_q += logdet_q 84 | z_u, z1 = torch.split(z_q, [1, 1], 1) 85 | u = torch.sigmoid(z_u) * x_mask 86 | z0 = (w - u) * x_mask 87 | logdet_tot_q += torch.sum( 88 | (F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2] 89 | ) 90 | logq = ( 91 | torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2]) 92 | - logdet_tot_q 93 | ) 94 | 95 | logdet_tot = 0 96 | z0, logdet = self.log_flow(z0, x_mask) 97 | logdet_tot += logdet 98 | z = torch.cat([z0, z1], 1) 99 | for flow in flows: 100 | z, logdet = flow(z, x_mask, g=x, reverse=reverse) 101 | logdet_tot = logdet_tot + logdet 102 | nll = ( 103 | torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2]) 104 | - logdet_tot 105 | ) 106 | return nll + logq # [b] 107 | else: 108 | flows = list(reversed(self.flows)) 109 | flows = flows[:-2] + [flows[-1]] # remove a useless vflow 110 | z = ( 111 | torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) 112 | * noise_scale 113 | ) 114 | for flow in flows: 115 | z = flow(z, x_mask, g=x, reverse=reverse) 116 | z0, z1 = torch.split(z, [1, 1], 1) 117 | logw = z0 118 | return logw 119 | 120 | 121 | class DurationPredictor(nn.Module): 122 | def __init__( 123 | self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0 124 | ): 125 | super().__init__() 126 | 127 | self.in_channels = in_channels 128 | self.filter_channels = filter_channels 129 | self.kernel_size = kernel_size 130 | self.p_dropout = p_dropout 131 | self.gin_channels = gin_channels 132 | 133 | 
self.drop = nn.Dropout(p_dropout) 134 | self.conv_1 = nn.Conv1d( 135 | in_channels, filter_channels, kernel_size, padding=kernel_size // 2 136 | ) 137 | self.norm_1 = common.LayerNorm(filter_channels) 138 | self.conv_2 = nn.Conv1d( 139 | filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 140 | ) 141 | self.norm_2 = common.LayerNorm(filter_channels) 142 | self.proj = nn.Conv1d(filter_channels, 1, 1) 143 | 144 | if gin_channels != 0: 145 | self.cond = nn.Conv1d(gin_channels, in_channels, 1) 146 | 147 | def forward(self, x, x_mask, g=None): 148 | x = torch.detach(x) 149 | if g is not None: 150 | g = torch.detach(g) 151 | x = x + self.cond(g) 152 | x = self.conv_1(x * x_mask) 153 | x = torch.relu(x) 154 | x = self.norm_1(x) 155 | x = self.drop(x) 156 | x = self.conv_2(x * x_mask) 157 | x = torch.relu(x) 158 | x = self.norm_2(x) 159 | x = self.drop(x) 160 | x = self.proj(x * x_mask) 161 | return x * x_mask 162 | -------------------------------------------------------------------------------- /uberduck_ml_dev/models/components/encoders/resnet_speaker_encoder.py: -------------------------------------------------------------------------------- 1 | # NOTE (Sam): this is the only component in this repository under copyleft license (Coqui / Mozilla). 2 | 3 | from io import BytesIO 4 | import os 5 | import requests 6 | import json 7 | 8 | from scipy.io.wavfile import read 9 | import torch 10 | 11 | # TODO (Sam): eliminate redundancy. 12 | from .speaker.resnet import ResNetSpeakerEncoder 13 | 14 | DEFAULT_AUDIO_CONFIG = { 15 | "fft_size": 512, 16 | "win_length": 400, 17 | "hop_length": 160, 18 | "frame_shift_ms": None, 19 | "frame_length_ms": None, 20 | "stft_pad_mode": "reflect", 21 | "sample_rate": 22050, 22 | "resample": False, 23 | "preemphasis": 0.97, 24 | "ref_level_db": 20, 25 | "do_sound_norm": False, 26 | "do_trim_silence": False, 27 | "trim_db": 60, 28 | "power": 1.5, 29 | "griffin_lim_iters": 60, 30 | "num_mels": 64, 31 | "mel_fmin": 0.0, 32 | "mel_fmax": 8000.0, 33 | "spec_gain": 20, 34 | "signal_norm": False, 35 | "min_level_db": -100, 36 | "symmetric_norm": False, 37 | "max_norm": 4.0, 38 | "clip_norm": False, 39 | "stats_path": None, 40 | "do_rms_norm": True, 41 | "db_level": -27.0, 42 | } 43 | 44 | 45 | def get_pretrained_model( 46 | config_url=None, model_url=None, config_path=None, model_path=None 47 | ): 48 | assert not ((config_url is not None) and (config_path is not None)) 49 | assert not ((model_url is not None) and (model_path is not None)) 50 | 51 | if config_path is None: 52 | print("Getting model config...") 53 | if config_url is None: 54 | config_url = os.environ["RESNET_SE_CONFIG_URL"] 55 | response = requests.get(config_url) 56 | resnet_config = response.json() 57 | else: 58 | with open(config_path) as f: 59 | resnet_config = json.load(f) 60 | model_params = resnet_config["model_params"] 61 | if "model_name" in model_params: 62 | del model_params["model_name"] 63 | 64 | audio_config = dict(resnet_config["audio"]) 65 | audio_config["sample_rate"] = 22050 66 | model = ResNetSpeakerEncoder(**model_params, audio_config=audio_config) 67 | print("Loading pretrained model...") 68 | load_pretrained(model, model_url=model_url, model_path=model_path) 69 | print("Got pretrained model...") 70 | model.eval() 71 | return model 72 | 73 | 74 | def load_pretrained(model, model_url=None, model_path=None): 75 | assert not ((model_url is not None) and (model_path is not None)) 76 | if model_path is not None: 77 | loaded = torch.load(model_path) 78 | else: 79 | if 
model_url is None: 80 | model_url = os.environ["RESNET_SE_MODEL_URL"] 81 | response = requests.get(model_url, stream=True) 82 | bio = BytesIO(response.content) 83 | loaded = torch.load(bio) 84 | model.load_state_dict(loaded["model"]) 85 | 86 | 87 | class ResNetSpeakerEncoderCallable: 88 | def __init__(self, model_path: str, config_path: str): 89 | print("initializing resnet speaker encoder") 90 | with open(config_path) as f: 91 | resnet_config = json.load(f) 92 | 93 | state_dict = torch.load(model_path)["model"] 94 | audio_config = dict(resnet_config["audio"]) 95 | model_params = resnet_config["model_params"] 96 | if "model_name" in model_params: 97 | del model_params["model_name"] 98 | 99 | self.device = "cuda" 100 | self.model = ResNetSpeakerEncoder(**model_params, audio_config=audio_config) 101 | self.model.load_state_dict(state_dict) 102 | self.model.eval() 103 | self.model.cuda() 104 | 105 | # NOTE (Sam): might have to accept bytes input for anyscale distributed data loading? 106 | def __call__(self, audiopaths): 107 | print("calling resnet speaker encoder") 108 | for audiopath in audiopaths: 109 | audio_data = read(audiopath)[1] 110 | datum = torch.FloatTensor(audio_data).unsqueeze(-1).t().cuda() 111 | # datum = torch.FloatTensor(audio_data).unsqueeze(-1).t() 112 | emb = self.model(datum) 113 | emb = emb.cpu().detach().numpy() 114 | yield {"audio_embedding": emb} 115 | -------------------------------------------------------------------------------- /uberduck_ml_dev/models/components/encoders/speaker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/models/components/encoders/speaker/__init__.py -------------------------------------------------------------------------------- /uberduck_ml_dev/models/components/encoders/speaker/base_encoder.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/coqui-ai/TTS/blob/dev/TTS/encoder/models/base_encoder.py 2 | 3 | import numpy as np 4 | import torch 5 | import torchaudio 6 | 7 | from torch import nn 8 | 9 | 10 | class PreEmphasis(nn.Module): 11 | def __init__(self, coefficient=0.97): 12 | super().__init__() 13 | self.coefficient = coefficient 14 | self.register_buffer( 15 | "filter", 16 | torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0), 17 | ) 18 | 19 | def forward(self, x): 20 | assert len(x.size()) == 2 21 | 22 | x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect") 23 | return torch.nn.functional.conv1d(x, self.filter).squeeze(1) 24 | 25 | 26 | class BaseEncoder(nn.Module): 27 | """Base `encoder` class. Every new `encoder` model must inherit this. 28 | 29 | It defines common `encoder` specific functions. 
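    Note: `compute_embedding` below assumes the concrete encoder defines `forward(x, l2_norm)` together
    with `self.use_torch_spec` and `self.audio_config` (including a "hop_length" entry); the ResNet
    speaker encoder in this package is expected to provide these.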
30 | """ 31 | 32 | # pylint: disable=W0102 33 | def __init__(self): 34 | super(BaseEncoder, self).__init__() 35 | 36 | def get_torch_mel_spectrogram_class(self, audio_config): 37 | return torch.nn.Sequential( 38 | PreEmphasis(audio_config["preemphasis"]), 39 | # TorchSTFT( 40 | # n_fft=audio_config["fft_size"], 41 | # hop_length=audio_config["hop_length"], 42 | # win_length=audio_config["win_length"], 43 | # sample_rate=audio_config["sample_rate"], 44 | # window="hamming_window", 45 | # mel_fmin=0.0, 46 | # mel_fmax=None, 47 | # use_htk=True, 48 | # do_amp_to_db=False, 49 | # n_mels=audio_config["num_mels"], 50 | # power=2.0, 51 | # use_mel=True, 52 | # mel_norm=None, 53 | # ) 54 | torchaudio.transforms.MelSpectrogram( 55 | sample_rate=audio_config["sample_rate"], 56 | n_fft=audio_config["fft_size"], 57 | win_length=audio_config["win_length"], 58 | hop_length=audio_config["hop_length"], 59 | window_fn=torch.hamming_window, 60 | n_mels=audio_config["num_mels"], 61 | ), 62 | ) 63 | 64 | @torch.no_grad() 65 | def inference(self, x, l2_norm=True): 66 | return self.forward(x, l2_norm) 67 | 68 | @torch.no_grad() 69 | def compute_embedding( 70 | self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True 71 | ): 72 | """ 73 | Generate embeddings for a batch of utterances 74 | x: 1xTxD 75 | """ 76 | # map to the waveform size 77 | if self.use_torch_spec: 78 | num_frames = num_frames * self.audio_config["hop_length"] 79 | 80 | max_len = x.shape[1] 81 | 82 | if max_len < num_frames: 83 | num_frames = max_len 84 | 85 | offsets = np.linspace(0, max_len - num_frames, num=num_eval) 86 | 87 | frames_batch = [] 88 | for offset in offsets: 89 | offset = int(offset) 90 | end_offset = int(offset + num_frames) 91 | frames = x[:, offset:end_offset] 92 | frames_batch.append(frames) 93 | 94 | frames_batch = torch.cat(frames_batch, dim=0) 95 | embeddings = self.inference(frames_batch, l2_norm=l2_norm) 96 | 97 | if return_mean: 98 | embeddings = torch.mean(embeddings, dim=0, keepdim=True) 99 | return embeddings 100 | -------------------------------------------------------------------------------- /uberduck_ml_dev/models/components/encoders/tacotron2.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch 3 | from torch.nn import functional as F 4 | 5 | from ...common import Conv1d 6 | 7 | 8 | class Encoder(nn.Module): 9 | """Encoder module: 10 | - Three 1-d convolution banks 11 | - Bidirectional LSTM 12 | """ 13 | 14 | def __init__(self, hparams): 15 | super().__init__() 16 | 17 | convolutions = [] 18 | for _ in range(hparams.encoder_n_convolutions): 19 | conv_layer = nn.Sequential( 20 | Conv1d( 21 | hparams.encoder_embedding_dim, 22 | hparams.encoder_embedding_dim, 23 | kernel_size=hparams.encoder_kernel_size, 24 | stride=1, 25 | padding=int((hparams.encoder_kernel_size - 1) / 2), 26 | dilation=1, 27 | w_init_gain="relu", 28 | ), 29 | nn.BatchNorm1d(hparams.encoder_embedding_dim), 30 | ) 31 | convolutions.append(conv_layer) 32 | self.convolutions = nn.ModuleList(convolutions) 33 | self.dropout_rate = 0.5 34 | 35 | self.lstm = nn.LSTM( 36 | hparams.encoder_embedding_dim, 37 | int(hparams.encoder_embedding_dim / 2), 38 | 1, 39 | batch_first=True, 40 | bidirectional=True, 41 | ) 42 | 43 | def forward(self, x, input_lengths): 44 | if x.size()[0] > 1: 45 | x_embedded = [] 46 | for b_ind in range(x.size()[0]): # TODO: Speed up 47 | curr_x = x[b_ind : b_ind + 1, :, : input_lengths[b_ind]].clone() 48 | for conv in self.convolutions: 49 | 
curr_x = F.dropout( 50 | F.relu(conv(curr_x)), self.dropout_rate, self.training 51 | ) 52 | x_embedded.append(curr_x[0].transpose(0, 1)) 53 | x = torch.nn.utils.rnn.pad_sequence(x_embedded, batch_first=True) 54 | else: 55 | for conv in self.convolutions: 56 | x = F.dropout(F.relu(conv(x)), self.dropout_rate, self.training) 57 | x = x.transpose(1, 2) 58 | 59 | # pytorch tensor are not reversible, hence the conversion 60 | input_lengths = input_lengths.cpu().numpy() 61 | x = nn.utils.rnn.pack_padded_sequence( 62 | x, input_lengths, batch_first=True, enforce_sorted=False 63 | ) 64 | 65 | self.lstm.flatten_parameters() 66 | outputs, _ = self.lstm(x) 67 | 68 | outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True) 69 | return outputs 70 | 71 | def inference(self, x, input_lengths): 72 | device = x.device 73 | for conv in self.convolutions: 74 | x = F.dropout(F.relu(conv(x)), self.dropout_rate, self.training) 75 | 76 | x = x.transpose(1, 2) 77 | 78 | input_lengths = input_lengths.cpu() 79 | x = nn.utils.rnn.pack_padded_sequence( 80 | x, input_lengths, batch_first=True, enforce_sorted=False 81 | ) 82 | 83 | outputs, _ = self.lstm(x) 84 | 85 | outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True) 86 | 87 | return outputs 88 | 89 | 90 | # NOTE (Sam): for torchscipt compilation 91 | class EncoderForwardIsInfer(Encoder): 92 | def forward(self, x, input_lengths): 93 | return self.inference(x, input_lengths) 94 | -------------------------------------------------------------------------------- /uberduck_ml_dev/models/components/partialconv1d.py: -------------------------------------------------------------------------------- 1 | # Modified partialconv source code based on implementation from 2 | # https://github.com/NVIDIA/partialconv/blob/master/models/partialconv2d.py 3 | ############################################################################### 4 | # BSD 3-Clause License 5 | # 6 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
7 | # 8 | # Author & Contact: Guilin Liu (guilinl@nvidia.com) 9 | ############################################################################### 10 | 11 | # Original Author & Contact: Guilin Liu (guilinl@nvidia.com) 12 | # Modified by Kevin Shih (kshih@nvidia.com) 13 | 14 | import torch 15 | import torch.nn.functional as F 16 | from torch import nn 17 | from typing import Tuple 18 | 19 | 20 | class PartialConv1d(nn.Conv1d): 21 | def __init__(self, *args, **kwargs): 22 | self.multi_channel = False 23 | self.return_mask = False 24 | super(PartialConv1d, self).__init__(*args, **kwargs) 25 | 26 | self.weight_maskUpdater = torch.ones(1, 1, self.kernel_size[0]) 27 | self.slide_winsize = ( 28 | self.weight_maskUpdater.shape[1] * self.weight_maskUpdater.shape[2] 29 | ) 30 | 31 | self.last_size = (None, None, None) 32 | self.update_mask = None 33 | self.mask_ratio = None 34 | 35 | @torch.jit.ignore 36 | def forward(self, input: torch.Tensor, mask_in: torch.Tensor = None): 37 | """ 38 | input: standard input to a 1D conv 39 | mask_in: binary mask for valid values, same shape as input 40 | """ 41 | assert len(input.shape) == 3 42 | # if a mask is input, or tensor shape changed, update mask ratio 43 | if mask_in is not None or self.last_size != tuple(input.shape): 44 | self.last_size = tuple(input.shape) 45 | with torch.no_grad(): 46 | if self.weight_maskUpdater.type() != input.type(): 47 | self.weight_maskUpdater = self.weight_maskUpdater.to(input) 48 | if mask_in is None: 49 | mask = torch.ones(1, 1, input.data.shape[2]).to(input) 50 | else: 51 | mask = mask_in 52 | self.update_mask = F.conv1d( 53 | mask, 54 | self.weight_maskUpdater, 55 | bias=None, 56 | stride=self.stride, 57 | padding=self.padding, 58 | dilation=self.dilation, 59 | groups=1, 60 | ) 61 | # for mixed precision training, change 1e-8 to 1e-6 62 | self.mask_ratio = self.slide_winsize / (self.update_mask + 1e-6) 63 | self.update_mask = torch.clamp(self.update_mask, 0, 1) 64 | self.mask_ratio = torch.mul(self.mask_ratio, self.update_mask) 65 | raw_out = super(PartialConv1d, self).forward( 66 | torch.mul(input, mask) if mask_in is not None else input 67 | ) 68 | if self.bias is not None: 69 | bias_view = self.bias.view(1, self.out_channels, 1) 70 | output = torch.mul(raw_out - bias_view, self.mask_ratio) + bias_view 71 | output = torch.mul(output, self.update_mask) 72 | else: 73 | output = torch.mul(raw_out, self.mask_ratio) 74 | 75 | if self.return_mask: 76 | return output, self.update_mask 77 | else: 78 | return output 79 | -------------------------------------------------------------------------------- /uberduck_ml_dev/models/components/postnet.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from torch.nn import functional as F 3 | import torch 4 | 5 | from ..common import Conv1d 6 | 7 | 8 | class Postnet(nn.Module): 9 | """Postnet 10 | - Five 1-d convolution with 512 channels and kernel size 5 11 | """ 12 | 13 | def __init__(self, hparams): 14 | super(Postnet, self).__init__() 15 | self.dropout_rate = 0.5 16 | self.convolutions = nn.ModuleList() 17 | 18 | self.convolutions.append( 19 | nn.Sequential( 20 | Conv1d( 21 | hparams.n_mel_channels, 22 | hparams.postnet_embedding_dim, 23 | kernel_size=hparams.postnet_kernel_size, 24 | stride=1, 25 | padding=int((hparams.postnet_kernel_size - 1) / 2), 26 | dilation=1, 27 | w_init_gain="tanh", 28 | ), 29 | nn.BatchNorm1d(hparams.postnet_embedding_dim), 30 | ) 31 | ) 32 | 33 | for i in range(1, hparams.postnet_n_convolutions - 
1): 34 | self.convolutions.append( 35 | nn.Sequential( 36 | Conv1d( 37 | hparams.postnet_embedding_dim, 38 | hparams.postnet_embedding_dim, 39 | kernel_size=hparams.postnet_kernel_size, 40 | stride=1, 41 | padding=int((hparams.postnet_kernel_size - 1) / 2), 42 | dilation=1, 43 | w_init_gain="tanh", 44 | ), 45 | nn.BatchNorm1d(hparams.postnet_embedding_dim), 46 | ) 47 | ) 48 | 49 | self.convolutions.append( 50 | nn.Sequential( 51 | Conv1d( 52 | hparams.postnet_embedding_dim, 53 | hparams.n_mel_channels, 54 | kernel_size=hparams.postnet_kernel_size, 55 | stride=1, 56 | padding=int((hparams.postnet_kernel_size - 1) / 2), 57 | dilation=1, 58 | w_init_gain="linear", 59 | ), 60 | nn.BatchNorm1d(hparams.n_mel_channels), 61 | ) 62 | ) 63 | 64 | def forward(self, x): 65 | for i, conv in enumerate(self.convolutions): 66 | if i == len(self.convolutions) - 1: 67 | x = F.dropout(conv(x), self.dropout_rate, self.training) 68 | else: 69 | x = F.dropout(torch.tanh(conv(x)), self.dropout_rate, self.training) 70 | 71 | return x 72 | -------------------------------------------------------------------------------- /uberduck_ml_dev/models/components/prenet.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from torch.nn import functional as F 3 | from ..common import LinearNorm 4 | 5 | 6 | class Prenet(nn.Module): 7 | def __init__(self, in_dim, sizes): 8 | super().__init__() 9 | in_sizes = [in_dim] + sizes[:-1] 10 | self.layers = nn.ModuleList( 11 | [ 12 | LinearNorm(in_size, out_size, bias=False) 13 | for (in_size, out_size) in zip(in_sizes, sizes) 14 | ] 15 | ) 16 | self.dropout_rate = 0.5 17 | 18 | def forward(self, x): 19 | for linear in self.layers: 20 | x = F.dropout(F.relu(linear(x)), p=self.dropout_rate, training=True) 21 | return x 22 | -------------------------------------------------------------------------------- /uberduck_ml_dev/models/rvc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/models/rvc/__init__.py -------------------------------------------------------------------------------- /uberduck_ml_dev/models/rvc/commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | 8 | def init_weights(m, mean=0.0, std=0.01): 9 | classname = m.__class__.__name__ 10 | if classname.find("Conv") != -1: 11 | m.weight.data.normal_(mean, std) 12 | 13 | 14 | def get_padding(kernel_size, dilation=1): 15 | return int((kernel_size * dilation - dilation) / 2) 16 | 17 | 18 | def convert_pad_shape(pad_shape): 19 | l = pad_shape[::-1] 20 | pad_shape = [item for sublist in l for item in sublist] 21 | return pad_shape 22 | 23 | 24 | def kl_divergence(m_p, logs_p, m_q, logs_q): 25 | """KL(P||Q)""" 26 | kl = (logs_q - logs_p) - 0.5 27 | kl += ( 28 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) 29 | ) 30 | return kl 31 | 32 | 33 | def rand_gumbel(shape): 34 | """Sample from the Gumbel distribution, protect from overflows.""" 35 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 36 | return -torch.log(-torch.log(uniform_samples)) 37 | 38 | 39 | def rand_gumbel_like(x): 40 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 41 | return g 42 | 43 | 44 | def slice_segments(x, ids_str, 
segment_size=4): 45 | ret = torch.zeros_like(x[:, :, :segment_size]) 46 | for i in range(x.size(0)): 47 | idx_str = ids_str[i] 48 | idx_end = idx_str + segment_size 49 | ret[i] = x[i, :, idx_str:idx_end] 50 | 51 | return ret 52 | 53 | 54 | def slice_segments2(x, ids_str, segment_size=4): 55 | ret = torch.zeros_like(x[:, :segment_size]) 56 | for i in range(x.size(0)): 57 | idx_str = ids_str[i] 58 | idx_end = idx_str + segment_size 59 | ret[i] = x[i, idx_str:idx_end] 60 | return ret 61 | 62 | 63 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 64 | b, d, t = x.size() 65 | if x_lengths is None: 66 | x_lengths = t 67 | ids_str_max = ( 68 | x_lengths - segment_size 69 | ) # + 1 # NOTE (Sam): remove +1 to avoid rounding error when starting with mels. 70 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 71 | ret = slice_segments(x, ids_str, segment_size) 72 | return ret, ids_str 73 | 74 | 75 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): 76 | position = torch.arange(length, dtype=torch.float) 77 | num_timescales = channels // 2 78 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( 79 | num_timescales - 1 80 | ) 81 | inv_timescales = min_timescale * torch.exp( 82 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment 83 | ) 84 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 85 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) 86 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 87 | signal = signal.view(1, channels, length) 88 | return signal 89 | 90 | 91 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 92 | b, channels, length = x.size() 93 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 94 | return x + signal.to(dtype=x.dtype, device=x.device) 95 | 96 | 97 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 98 | b, channels, length = x.size() 99 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 100 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 101 | 102 | 103 | def subsequent_mask(length): 104 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 105 | return mask 106 | 107 | 108 | @torch.jit.script 109 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 110 | n_channels_int = n_channels[0] 111 | in_act = input_a + input_b 112 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 113 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 114 | acts = t_act * s_act 115 | return acts 116 | 117 | 118 | def convert_pad_shape(pad_shape): 119 | l = pad_shape[::-1] 120 | pad_shape = [item for sublist in l for item in sublist] 121 | return pad_shape 122 | 123 | 124 | def shift_1d(x): 125 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 126 | return x 127 | 128 | 129 | def sequence_mask(length, max_length=None): 130 | if max_length is None: 131 | max_length = length.max() 132 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 133 | return x.unsqueeze(0) < length.unsqueeze(1) 134 | 135 | 136 | def generate_path(duration, mask): 137 | """ 138 | duration: [b, 1, t_x] 139 | mask: [b, 1, t_y, t_x] 140 | """ 141 | device = duration.device 142 | 143 | b, _, t_y, t_x = mask.shape 144 | cum_duration = torch.cumsum(duration, -1) 145 | 146 | cum_duration_flat = cum_duration.view(b * t_x) 147 | path = 
sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 148 | path = path.view(b, t_x, t_y) 149 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 150 | path = path.unsqueeze(1).transpose(2, 3) * mask 151 | return path 152 | 153 | 154 | def clip_grad_value_(parameters, clip_value, norm_type=2): 155 | if isinstance(parameters, torch.Tensor): 156 | parameters = [parameters] 157 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 158 | norm_type = float(norm_type) 159 | if clip_value is not None: 160 | clip_value = float(clip_value) 161 | 162 | total_norm = 0 163 | for p in parameters: 164 | param_norm = p.grad.data.norm(norm_type) 165 | total_norm += param_norm.item() ** norm_type 166 | if clip_value is not None: 167 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 168 | total_norm = total_norm ** (1.0 / norm_type) 169 | return total_norm 170 | -------------------------------------------------------------------------------- /uberduck_ml_dev/models/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pickle 3 | import os 4 | import inspect 5 | 6 | 7 | def load_checkpoint(filepath, device, pickle_module=pickle): 8 | assert os.path.isfile(filepath) 9 | print("Loading '{}'".format(filepath)) 10 | checkpoint_dict = torch.load( 11 | filepath, 12 | map_location=torch.device(device), 13 | pickle_module=pickle_module, 14 | ) 15 | print("Complete.") 16 | return checkpoint_dict 17 | 18 | 19 | def load_pretrained(model, checkpoint_path, key_="generator"): 20 | # NOTE (Sam): uncomment for download on anyscale 21 | # response = requests.get(HIFI_GAN_GENERATOR_URL, stream=True) 22 | # bio = BytesIO(response.content) 23 | loaded = torch.load(checkpoint_path) 24 | model.load_state_dict(loaded[key_]) 25 | 26 | 27 | def filter_valid_args(func, **kwargs): 28 | valid_keys = inspect.signature(func).parameters.keys() 29 | return {key: value for key, value in kwargs.items() if key in valid_keys} 30 | -------------------------------------------------------------------------------- /uberduck_ml_dev/monitoring/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/monitoring/__init__.py -------------------------------------------------------------------------------- /uberduck_ml_dev/monitoring/generate.py: -------------------------------------------------------------------------------- 1 | __all__ = [] 2 | 3 | 4 | from ..text.utils import prepare_input_sequence 5 | 6 | 7 | def _get_inference(model, vocoder, texts, speaker_ids, symbol_set, arpabet, cpu_run): 8 | text_padded, input_lengths = prepare_input_sequence( 9 | texts, cpu_run=cpu_run, arpabet=arpabet, symbol_set=symbol_set 10 | ) 11 | # Note (SAM): None is for GST... 
temporary solution 12 | input_ = text_padded, input_lengths, speaker_ids, None 13 | output = model.inference(input_) 14 | audio = vocoder.infer(output[1][:1]) 15 | return audio 16 | -------------------------------------------------------------------------------- /uberduck_ml_dev/monitoring/statistics.py: -------------------------------------------------------------------------------- 1 | __all__ = ["get_alignment_metrics"] 2 | 3 | import torch 4 | from ..utils.utils import get_mask_from_lengths 5 | 6 | 7 | def get_alignment_metrics( 8 | alignments, average_across_batch=True, input_lengths=None, output_lengths=None 9 | ): 10 | alignments = alignments.transpose(1, 2) # [B, dec, enc] -> [B, enc, dec] 11 | if input_lengths == None: 12 | input_lengths = torch.ones(alignments.size(0), device=alignments.device) * ( 13 | alignments.shape[1] - 1 14 | ) # [B] # 147 15 | if output_lengths == None: 16 | output_lengths = torch.ones(alignments.size(0), device=alignments.device) * ( 17 | alignments.shape[2] - 1 18 | ) # [B] # 767 19 | 20 | batch_size = alignments.size(0) 21 | optimums = torch.sqrt( 22 | input_lengths.double().pow(2) + output_lengths.double().pow(2) 23 | ).view(batch_size) 24 | 25 | # [B, enc, dec] -> [B, dec], [B, dec] 26 | values, cur_idxs = torch.max(alignments, 1) 27 | 28 | cur_idxs = cur_idxs.float() 29 | prev_indx = torch.cat((cur_idxs[:, 0][:, None], cur_idxs[:, :-1]), dim=1) 30 | dist = ((prev_indx - cur_idxs).pow(2) + 1).pow(0.5) # [B, dec] 31 | dist.masked_fill_( 32 | ~get_mask_from_lengths(output_lengths, max_len=dist.size(1)), 0.0 33 | ) # set dist of padded to zero 34 | dist = dist.sum(dim=(1)) # get total dist for each B 35 | diagonalness = (dist + 1.4142135) / optimums # dist / optimal dist 36 | 37 | maxes = alignments.max(axis=1)[0].mean(axis=1) 38 | if average_across_batch: 39 | diagonalness = diagonalness.mean() 40 | maxes = maxes.mean() 41 | 42 | output = {} 43 | output["diagonalness"] = diagonalness 44 | output["max"] = maxes 45 | 46 | return output 47 | -------------------------------------------------------------------------------- /uberduck_ml_dev/monitoring/streamlit.py: -------------------------------------------------------------------------------- 1 | __all__ = ["run"] 2 | 3 | 4 | import streamlit as st 5 | from collections import OrderedDict 6 | from .generate import _get_inference, MODEL_LIST, MODEL_TYPES 7 | 8 | 9 | def run(): 10 | st.title("Inference inspector") 11 | 12 | symbol_set = st.selectbox( 13 | "What symbol set would you like to use?", ("NVIDIA_TACO2_DEFAULTS") 14 | ) 15 | st.write("You selected:", symbol_set) 16 | 17 | use_arpabet = st.selectbox("Would you like to use arpabet?", ("Yes", "No")) 18 | st.write("You selected:", use_arpabet) 19 | 20 | # st.text_input("Model file name", "test/fixtures/models/taco2ljdefault") 21 | # st.text_input("Model format", OrderedDict) 22 | vocoder_path = st.text_input( 23 | "Vocoder path", "test/fixtures/models/gen_02640000_studio" 24 | ) 25 | vocoder_config = st.text_input("Vocoder config", None) 26 | n_speakers = st.text_input("Number of speakers", 1) 27 | gate_threshold = st.text_input("Gate threshold", 0.1) 28 | 29 | chosen_model = st.sidebar.selectbox("Select model", MODEL_LIST) 30 | chosen_type = st.sidebar.selectbox("Select model save type", MODEL_TYPES) 31 | text = [st.text_input("Text", "Thats silly")] 32 | speakers = [st.text_input("Speaker_id", 0)] 33 | 34 | hparams = TACOTRON2_DEFAULTS 35 | hparams.n_speakers = n_speakers 36 | hparams.gate_threshold = gate_threshold 37 | if n_speakers > 1: 38 | 
hparams.has_speaker_embedding = True 39 | model = Tacotron2(hparams) 40 | device = "cuda" 41 | model = Tacotron2(hparams) 42 | if chosen_type == "OD": 43 | model.from_pretrained(model_dict=chosen_model, device=device) 44 | if chosen_type == "OD": 45 | model.from_pretrained(warm_start_path=chosen_model, device=device) 46 | 47 | hifigan = HiFiGanGenerator( 48 | config=vocoder_config, 49 | checkpoint=vocoder_file, 50 | cudnn_enabled=True, 51 | ) 52 | 53 | inference = _get_inference(model, vocoder, texts, speakers, symbol_set, arpabet) 54 | 55 | st.audio(inference) 56 | 57 | 58 | if __name__ == "__main__": 59 | run() 60 | -------------------------------------------------------------------------------- /uberduck_ml_dev/monitoring/wandb.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import wandb 4 | from tqdm import tqdm 5 | import torch 6 | 7 | from ..text.utils import UTTERANCES 8 | 9 | 10 | def log_sample_utterances( 11 | project="my-project", 12 | name="my-model", 13 | dataset="my-dataset", 14 | architecture="my-architecture", 15 | speaker_ids: List = [], 16 | inference_function=lambda text, speaker_id: False, 17 | ): 18 | wandb.init( 19 | project=project, 20 | name=name, 21 | job_type="eval", 22 | config={"architecture": architecture, "dataset": dataset}, 23 | ) 24 | 25 | with torch.no_grad(): 26 | for speaker_id in tqdm(speaker_ids): 27 | to_log = [] 28 | for utterance in tqdm(UTTERANCES): 29 | inference = inference_function(utterance, speaker_id) 30 | to_log.append( 31 | wandb.Audio(inference, caption=utterance, sample_rate=22050) 32 | ) 33 | torch.cuda.empty_cache() # might not be necessary 34 | wandb.log({f"Speaker {speaker_id}": to_log}) 35 | 36 | wandb.finish() 37 | -------------------------------------------------------------------------------- /uberduck_ml_dev/monotonic_align.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | try: 5 | from .monotonic_align.core import maximum_path_c 6 | 7 | CYTHON = True 8 | except ModuleNotFoundError: 9 | CYTHON = False 10 | 11 | 12 | def maximum_path(neg_cent, mask): 13 | if CYTHON: 14 | return maximum_path_cython(neg_cent, mask) 15 | return maximum_path_numpy(neg_cent, mask) 16 | 17 | 18 | def maximum_path_cython(neg_cent, mask): 19 | """Cython optimized version. 20 | neg_cent: [b, t_t, t_s] 21 | mask: [b, t_t, t_s] 22 | """ 23 | device = neg_cent.device 24 | dtype = neg_cent.dtype 25 | neg_cent = neg_cent.data.cpu().numpy().astype(np.float32) 26 | path = np.zeros(neg_cent.shape, dtype=np.int32) 27 | 28 | t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32) 29 | t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32) 30 | maximum_path_c(path, neg_cent, t_t_max, t_s_max) 31 | return torch.from_numpy(path).to(device=device, dtype=dtype) 32 | 33 | 34 | def maximum_path_numpy(value, mask, max_neg_val=None): 35 | """ 36 | Monotonic alignment search algorithm 37 | Numpy-friendly version. It's about 4 times faster than torch version. 
38 | value: [b, t_x, t_y] 39 | mask: [b, t_x, t_y] 40 | """ 41 | if max_neg_val is None: 42 | max_neg_val = -np.inf # Patch for Sphinx complaint 43 | value = value * mask 44 | 45 | device = value.device 46 | dtype = value.dtype 47 | value = value.cpu().detach().numpy() 48 | mask = mask.cpu().detach().numpy().astype(bool) 49 | 50 | b, t_x, t_y = value.shape 51 | direction = np.zeros(value.shape, dtype=np.int64) 52 | v = np.zeros((b, t_x), dtype=np.float32) 53 | x_range = np.arange(t_x, dtype=np.float32).reshape(1, -1) 54 | for j in range(t_y): 55 | v0 = np.pad(v, [[0, 0], [1, 0]], mode="constant", constant_values=max_neg_val)[ 56 | :, :-1 57 | ] 58 | v1 = v 59 | max_mask = v1 >= v0 60 | v_max = np.where(max_mask, v1, v0) 61 | direction[:, :, j] = max_mask 62 | 63 | index_mask = x_range <= j 64 | v = np.where(index_mask, v_max + value[:, :, j], max_neg_val) 65 | direction = np.where(mask, direction, 1) 66 | 67 | path = np.zeros(value.shape, dtype=np.float32) 68 | index = mask[:, :, 0].sum(1).astype(np.int64) - 1 69 | index_range = np.arange(b) 70 | for j in reversed(range(t_y)): 71 | path[index_range, index, j] = 1 72 | index = index + direction[index_range, index, j] - 1 73 | path = path * mask.astype(np.float32) 74 | path = torch.from_numpy(path).to(device=device, dtype=dtype) 75 | return path 76 | -------------------------------------------------------------------------------- /uberduck_ml_dev/optimizers/radam.py: -------------------------------------------------------------------------------- 1 | # Original source taken from https://github.com/LiyuanLucasLiu/RAdam 2 | # 3 | # Copyright 2019 Liyuan Liu 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
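# RAdam ("On the Variance of the Adaptive Learning Rate and Beyond", Liu et al., 2019) rectifies Adam's
# adaptive step size while the second-moment estimate is still high-variance. The quantities computed in
# `step()` below are the maximum and current length of the approximated simple moving average:
#     rho_inf = 2 / (1 - beta2) - 1
#     rho_t   = rho_inf - 2 * t * beta2**t / (1 - beta2**t)
# The variance-rectified step size is applied only when rho_t >= 5; otherwise the update falls back to an
# un-adapted step of lr / (1 - beta1**t).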
16 | import math 17 | 18 | import torch 19 | 20 | # pylint: disable=no-name-in-module 21 | from torch.optim.optimizer import Optimizer 22 | 23 | 24 | class RAdam(Optimizer): 25 | """RAdam optimizer""" 26 | 27 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 28 | """ 29 | Init 30 | 31 | :param params: parameters to optimize 32 | :param lr: learning rate 33 | :param betas: beta 34 | :param eps: numerical precision 35 | :param weight_decay: weight decay weight 36 | """ 37 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 38 | self.buffer = [[None, None, None] for _ in range(10)] 39 | super().__init__(params, defaults) 40 | 41 | def step(self, closure=None): 42 | loss = None 43 | if closure is not None: 44 | loss = closure() 45 | 46 | for group in self.param_groups: 47 | for p in group["params"]: 48 | if p.grad is None: 49 | continue 50 | grad = p.grad.data.float() 51 | if grad.is_sparse: 52 | raise RuntimeError("RAdam does not support sparse gradients") 53 | 54 | p_data_fp32 = p.data.float() 55 | 56 | state = self.state[p] 57 | 58 | if len(state) == 0: 59 | state["step"] = 0 60 | state["exp_avg"] = torch.zeros_like(p_data_fp32) 61 | state["exp_avg_sq"] = torch.zeros_like(p_data_fp32) 62 | else: 63 | state["exp_avg"] = state["exp_avg"].type_as(p_data_fp32) 64 | state["exp_avg_sq"] = state["exp_avg_sq"].type_as(p_data_fp32) 65 | 66 | exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] 67 | beta1, beta2 = group["betas"] 68 | 69 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 70 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 71 | 72 | state["step"] += 1 73 | buffered = self.buffer[int(state["step"] % 10)] 74 | if state["step"] == buffered[0]: 75 | N_sma, step_size = buffered[1], buffered[2] 76 | else: 77 | buffered[0] = state["step"] 78 | beta2_t = beta2 ** state["step"] 79 | N_sma_max = 2 / (1 - beta2) - 1 80 | N_sma = N_sma_max - 2 * state["step"] * beta2_t / (1 - beta2_t) 81 | buffered[1] = N_sma 82 | 83 | # more conservative since it's an approximated value 84 | if N_sma >= 5: 85 | step_size = ( 86 | group["lr"] 87 | * math.sqrt( 88 | (1 - beta2_t) 89 | * (N_sma - 4) 90 | / (N_sma_max - 4) 91 | * (N_sma - 2) 92 | / N_sma 93 | * N_sma_max 94 | / (N_sma_max - 2) 95 | ) 96 | / (1 - beta1 ** state["step"]) 97 | ) 98 | else: 99 | step_size = group["lr"] / (1 - beta1 ** state["step"]) 100 | buffered[2] = step_size 101 | 102 | if group["weight_decay"] != 0: 103 | p_data_fp32.add_(-group["weight_decay"] * group["lr"], p_data_fp32) 104 | 105 | # more conservative since it's an approximated value 106 | if N_sma >= 5: 107 | denom = exp_avg_sq.sqrt().add_(group["eps"]) 108 | p_data_fp32.addcdiv_(-step_size, exp_avg, denom) 109 | else: 110 | p_data_fp32.add_(-step_size, exp_avg) 111 | 112 | p.data.copy_(p_data_fp32) 113 | 114 | return loss 115 | -------------------------------------------------------------------------------- /uberduck_ml_dev/text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/text/__init__.py -------------------------------------------------------------------------------- /uberduck_ml_dev/text/abbreviations.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | _no_period_re = re.compile(r"(No[.])(?=[ ]?[0-9])") 4 | _percent_re = re.compile(r"([ ]?[%])") 5 | _half_re = re.compile("([0-9]½)|(½)") 6 | 7 
| 8 | # List of (regular expression, replacement) pairs for abbreviations: 9 | _abbreviations = [ 10 | (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) 11 | for x in [ 12 | ("mrs", "misess"), 13 | ("ms", "miss"), 14 | ("mr", "mister"), 15 | ("dr", "doctor"), 16 | ("st", "saint"), 17 | ("co", "company"), 18 | ("jr", "junior"), 19 | ("maj", "major"), 20 | ("gen", "general"), 21 | ("drs", "doctors"), 22 | ("rev", "reverend"), 23 | ("lt", "lieutenant"), 24 | ("hon", "honorable"), 25 | ("sgt", "sergeant"), 26 | ("capt", "captain"), 27 | ("esq", "esquire"), 28 | ("ltd", "limited"), 29 | ("col", "colonel"), 30 | ("ft", "fort"), 31 | ] 32 | ] 33 | 34 | 35 | def _expand_no_period(m): 36 | word = m.group(0) 37 | if word[0] == "N": 38 | return "Number" 39 | return "number" 40 | 41 | 42 | def _expand_percent(m): 43 | return " percent" 44 | 45 | 46 | def _expand_half(m): 47 | word = m.group(1) 48 | if word is None: 49 | return "half" 50 | return word[0] + " and a half" 51 | 52 | 53 | def normalize_abbreviations(text): 54 | text = re.sub(_no_period_re, _expand_no_period, text) 55 | text = re.sub(_percent_re, _expand_percent, text) 56 | text = re.sub(_half_re, _expand_half, text) 57 | return text 58 | -------------------------------------------------------------------------------- /uberduck_ml_dev/text/acronyms.py: -------------------------------------------------------------------------------- 1 | import re 2 | from .cmudict import CMUDict 3 | 4 | _letter_to_arpabet = { 5 | "A": "EY1", 6 | "B": "B IY1", 7 | "C": "S IY1", 8 | "D": "D IY1", 9 | "E": "IY1", 10 | "F": "EH1 F", 11 | "G": "JH IY1", 12 | "H": "EY1 CH", 13 | "I": "AY1", 14 | "J": "JH EY1", 15 | "K": "K EY1", 16 | "L": "EH1 L", 17 | "M": "EH1 M", 18 | "N": "EH1 N", 19 | "O": "OW1", 20 | "P": "P IY1", 21 | "Q": "K Y UW1", 22 | "R": "AA1 R", 23 | "S": "EH1 S", 24 | "T": "T IY1", 25 | "U": "Y UW1", 26 | "V": "V IY1", 27 | "X": "EH1 K S", 28 | "Y": "W AY1", 29 | "W": "D AH1 B AH0 L Y UW0", 30 | "Z": "Z IY1", 31 | "s": "Z", 32 | } 33 | 34 | # must ignore roman numerals 35 | # _acronym_re = re.compile(r'([A-Z][A-Z]+)s?|([A-Z]\.([A-Z]\.)+s?)') 36 | _acronym_re = re.compile(r"([A-Z][A-Z]+)s?") 37 | 38 | 39 | class AcronymNormalizer(object): 40 | def __init__(self, phoneme_dict): 41 | self.phoneme_dict = phoneme_dict 42 | 43 | def normalize_acronyms(self, text): 44 | def _expand_acronyms(m, add_spaces=True): 45 | acronym = m.group(0) 46 | # remove dots if they exist 47 | acronym = re.sub("\.", "", acronym) 48 | 49 | acronym = "".join(acronym.split()) 50 | arpabet = self.phoneme_dict.lookup(acronym) 51 | 52 | if arpabet is None: 53 | acronym = list(acronym) 54 | arpabet = ["{" + _letter_to_arpabet[letter] + "}" for letter in acronym] 55 | # temporary fix 56 | if arpabet[-1] == "{Z}" and len(arpabet) > 1: 57 | arpabet[-2] = arpabet[-2][:-1] + " " + arpabet[-1][1:] 58 | del arpabet[-1] 59 | arpabet = " ".join(arpabet) 60 | elif len(arpabet) == 1: 61 | arpabet = "{" + arpabet[0] + "}" 62 | else: 63 | arpabet = acronym 64 | return arpabet 65 | 66 | text = re.sub(_acronym_re, _expand_acronyms, text) 67 | return text 68 | 69 | def __call__(self, text): 70 | return self.normalize_acronyms(text) 71 | -------------------------------------------------------------------------------- /uberduck_ml_dev/text/cleaners.py: -------------------------------------------------------------------------------- 1 | """ adapted from https://github.com/keithito/tacotron """ 2 | 3 | """ 4 | Cleaners are transformations that run over the input text at both training and eval time. 
5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 13 | """ 14 | 15 | import re 16 | from string import punctuation 17 | from functools import reduce 18 | from unidecode import unidecode 19 | from .numerical import normalize_numbers, normalize_currency 20 | from .acronyms import AcronymNormalizer 21 | from .datestime import normalize_datestime 22 | from .letters_and_numbers import normalize_letters_and_numbers 23 | from .abbreviations import normalize_abbreviations 24 | 25 | 26 | # Regular expression matching whitespace: 27 | _whitespace_re = re.compile(r"\s+") 28 | 29 | # Regular expression separating words enclosed in curly braces for cleaning 30 | _arpa_re = re.compile(r"{[^}]+}|\S+") 31 | 32 | 33 | def expand_abbreviations(text): 34 | return normalize_abbreviations(text) 35 | 36 | 37 | def expand_numbers(text): 38 | return normalize_numbers(text) 39 | 40 | 41 | def expand_currency(text): 42 | return normalize_currency(text) 43 | 44 | 45 | def expand_datestime(text): 46 | return normalize_datestime(text) 47 | 48 | 49 | def expand_letters_and_numbers(text): 50 | return normalize_letters_and_numbers(text) 51 | 52 | 53 | def lowercase(text): 54 | return text.lower() 55 | 56 | 57 | def collapse_whitespace(text): 58 | return re.sub(_whitespace_re, " ", text) 59 | 60 | 61 | def separate_acronyms(text): 62 | text = re.sub(r"([0-9]+)([a-zA-Z]+)", r"\1 \2", text) 63 | text = re.sub(r"([a-zA-Z]+)([0-9]+)", r"\1 \2", text) 64 | return text 65 | 66 | 67 | def convert_to_ascii(text): 68 | return unidecode(text) 69 | 70 | 71 | def dehyphenize_compound_words(text): 72 | text = re.sub(r"(?<=[a-zA-Z0-9])-(?=[a-zA-Z])", " ", text) 73 | return text 74 | 75 | 76 | def remove_space_before_punctuation(text): 77 | return re.sub(r"\s([{}](?:\s|$))".format(punctuation), r"\1", text) 78 | 79 | 80 | class Cleaner(object): 81 | def __init__(self, cleaner_names, phonemedict): 82 | self.cleaner_names = cleaner_names 83 | self.phonemedict = phonemedict 84 | self.acronym_normalizer = AcronymNormalizer(self.phonemedict) 85 | 86 | def __call__(self, text): 87 | for cleaner_name in self.cleaner_names: 88 | sequence_fns, word_fns = self.get_cleaner_fns(cleaner_name) 89 | for fn in sequence_fns: 90 | text = fn(text) 91 | 92 | text = [ 93 | reduce(lambda x, y: y(x), word_fns, split) if split[0] != "{" else split 94 | for split in _arpa_re.findall(text) 95 | ] 96 | text = " ".join(text) 97 | text = remove_space_before_punctuation(text) 98 | return text 99 | 100 | def get_cleaner_fns(self, cleaner_name): 101 | if cleaner_name == "basic_cleaners": 102 | sequence_fns = [lowercase, collapse_whitespace] 103 | word_fns = [] 104 | elif cleaner_name == "english_cleaners": 105 | sequence_fns = [collapse_whitespace, convert_to_ascii, lowercase] 106 | word_fns = [expand_numbers, expand_abbreviations] 107 | elif cleaner_name == "radtts_cleaners": 108 | sequence_fns = [ 109 | collapse_whitespace, 110 | expand_currency, 111 | expand_datestime, 112 | expand_letters_and_numbers, 113 | ] 114 | word_fns = [expand_numbers, expand_abbreviations] 115 | elif 
cleaner_name == "transliteration_cleaners": 116 | sequence_fns = [convert_to_ascii, lowercase, collapse_whitespace] 117 | else: 118 | raise Exception("{} cleaner not supported".format(cleaner_name)) 119 | 120 | return sequence_fns, word_fns 121 | -------------------------------------------------------------------------------- /uberduck_ml_dev/text/cmudict.py: -------------------------------------------------------------------------------- 1 | __all__ = ["CMUDict", "valid_symbols"] 2 | 3 | 4 | """ from https://github.com/keithito/tacotron """ 5 | 6 | import re 7 | 8 | 9 | valid_symbols = [ 10 | "AA", 11 | "AA0", 12 | "AA1", 13 | "AA2", 14 | "AE", 15 | "AE0", 16 | "AE1", 17 | "AE2", 18 | "AH", 19 | "AH0", 20 | "AH1", 21 | "AH2", 22 | "AO", 23 | "AO0", 24 | "AO1", 25 | "AO2", 26 | "AW", 27 | "AW0", 28 | "AW1", 29 | "AW2", 30 | "AY", 31 | "AY0", 32 | "AY1", 33 | "AY2", 34 | "B", 35 | "CH", 36 | "D", 37 | "DH", 38 | "EH", 39 | "EH0", 40 | "EH1", 41 | "EH2", 42 | "ER", 43 | "ER0", 44 | "ER1", 45 | "ER2", 46 | "EY", 47 | "EY0", 48 | "EY1", 49 | "EY2", 50 | "F", 51 | "G", 52 | "HH", 53 | "IH", 54 | "IH0", 55 | "IH1", 56 | "IH2", 57 | "IY", 58 | "IY0", 59 | "IY1", 60 | "IY2", 61 | "JH", 62 | "K", 63 | "L", 64 | "M", 65 | "N", 66 | "NG", 67 | "OW", 68 | "OW0", 69 | "OW1", 70 | "OW2", 71 | "OY", 72 | "OY0", 73 | "OY1", 74 | "OY2", 75 | "P", 76 | "R", 77 | "S", 78 | "SH", 79 | "T", 80 | "TH", 81 | "UH", 82 | "UH0", 83 | "UH1", 84 | "UH2", 85 | "UW", 86 | "UW0", 87 | "UW1", 88 | "UW2", 89 | "V", 90 | "W", 91 | "Y", 92 | "Z", 93 | "ZH", 94 | ] 95 | 96 | _valid_symbol_set = set(valid_symbols) 97 | 98 | 99 | class CMUDict: 100 | """Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict""" 101 | 102 | def __init__(self, file_or_path, keep_ambiguous=True): 103 | if isinstance(file_or_path, str): 104 | with open(file_or_path, encoding="latin-1") as f: 105 | entries = _parse_cmudict(f) 106 | else: 107 | entries = _parse_cmudict(file_or_path) 108 | if not keep_ambiguous: 109 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 110 | self._entries = entries 111 | 112 | def __len__(self): 113 | return len(self._entries) 114 | 115 | def lookup(self, word): 116 | """Returns list of ARPAbet pronunciations of the given word.""" 117 | return self._entries.get(word.upper()) 118 | 119 | 120 | _alt_re = re.compile(r"\([0-9]+\)") 121 | 122 | 123 | def _parse_cmudict(file): 124 | cmudict = {} 125 | for line in file: 126 | if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"): 127 | parts = line.split(" ") 128 | word = re.sub(_alt_re, "", parts[0]) 129 | pronunciation = _get_pronunciation(parts[1]) 130 | if pronunciation: 131 | if word in cmudict: 132 | cmudict[word].append(pronunciation) 133 | else: 134 | cmudict[word] = [pronunciation] 135 | return cmudict 136 | 137 | 138 | def _get_pronunciation(s): 139 | parts = s.strip().split(" ") 140 | for part in parts: 141 | if part not in _valid_symbol_set: 142 | return None 143 | return " ".join(parts) 144 | -------------------------------------------------------------------------------- /uberduck_ml_dev/text/datestime.py: -------------------------------------------------------------------------------- 1 | """ adapted from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | 5 | _ampm_re = re.compile(r"([0-9]|0[0-9]|1[0-9]|2[0-3]):?([0-5][0-9])?\s*([AaPp][Mm]\b)") 6 | 7 | 8 | def _expand_ampm(m): 9 | matches = list(m.groups(0)) 10 | txt = matches[0] 11 | txt = txt if int(matches[1]) == 0 else txt + " " + 
matches[1] 12 | 13 | if matches[2][0].lower() == "a": 14 | txt += " a.m." 15 | elif matches[2][0].lower() == "p": 16 | txt += " p.m." 17 | 18 | return txt 19 | 20 | 21 | def normalize_datestime(text): 22 | text = re.sub(_ampm_re, _expand_ampm, text) 23 | # text = re.sub(r"([0-9]|0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])?", r"\1 \2", text) 24 | return text 25 | -------------------------------------------------------------------------------- /uberduck_ml_dev/text/grapheme_dictionary.py: -------------------------------------------------------------------------------- 1 | # NOTE (Sam): synthesize with other methods 2 | 3 | """ adapted from https://github.com/keithito/tacotron """ 4 | 5 | import re 6 | 7 | _alt_re = re.compile(r"\([0-9]+\)") 8 | 9 | 10 | class Grapheme2PhonemeDictionary: 11 | """Thin wrapper around g2p data.""" 12 | 13 | def __init__(self, file_or_path, keep_ambiguous=True, encoding="latin-1"): 14 | with open(file_or_path, encoding=encoding) as f: 15 | entries = _parse_g2p(f) 16 | if not keep_ambiguous: 17 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 18 | self._entries = entries 19 | 20 | def __len__(self): 21 | return len(self._entries) 22 | 23 | def lookup(self, word): 24 | """Returns list of pronunciations of the given word.""" 25 | return self._entries.get(word.upper()) 26 | 27 | 28 | def _parse_g2p(file): 29 | g2p = {} 30 | for line in file: 31 | if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"): 32 | parts = line.split(" ") 33 | word = re.sub(_alt_re, "", parts[0]) 34 | pronunciation = parts[1].strip() 35 | if word in g2p: 36 | g2p[word].append(pronunciation) 37 | else: 38 | g2p[word] = [pronunciation] 39 | return g2p 40 | -------------------------------------------------------------------------------- /uberduck_ml_dev/text/heteronyms: -------------------------------------------------------------------------------- 1 | abject 2 | abrogate 3 | absent 4 | abstract 5 | abuse 6 | ache 7 | acre 8 | acuminate 9 | addict 10 | address 11 | adduct 12 | adele 13 | advocate 14 | affect 15 | affiliate 16 | agape 17 | aged 18 | agglomerate 19 | aggregate 20 | agonic 21 | agora 22 | allied 23 | ally 24 | alternate 25 | alum 26 | am 27 | analyses 28 | andrea 29 | animate 30 | apply 31 | appropriate 32 | approximate 33 | ares 34 | arithmetic 35 | arsenic 36 | articulate 37 | associate 38 | attribute 39 | august 40 | axes 41 | ay 42 | aye 43 | bases 44 | bass 45 | bathed 46 | bested 47 | bifurcate 48 | blessed 49 | blotto 50 | bow 51 | bowed 52 | bowman 53 | brassy 54 | buffet 55 | bustier 56 | carbonate 57 | celtic 58 | choral 59 | chumash 60 | close 61 | closer 62 | coax 63 | coincidence 64 | color coordinate 65 | colour coordinate 66 | comber 67 | combine 68 | combs 69 | committee 70 | commune 71 | compact 72 | complex 73 | compound 74 | compress 75 | concert 76 | conduct 77 | confine 78 | confines 79 | conflict 80 | conglomerate 81 | conscript 82 | conserve 83 | consist 84 | console 85 | consort 86 | construct 87 | consult 88 | consummate 89 | content 90 | contest 91 | contract 92 | contracts 93 | contrast 94 | converse 95 | convert 96 | convict 97 | coop 98 | coordinate 99 | covey 100 | crooked 101 | curate 102 | cussed 103 | decollate 104 | decrease 105 | defect 106 | defense 107 | delegate 108 | deliberate 109 | denier 110 | desert 111 | detail 112 | deviate 113 | diagnoses 114 | diffuse 115 | digest 116 | discard 117 | discharge 118 | discount 119 | do 120 | document 121 | does 122 | dogged 123 | domesticate 124 | dominican 125 | dove 
126 | dr 127 | drawer 128 | duplicate 129 | egress 130 | ejaculate 131 | eject 132 | elaborate 133 | ellipses 134 | email 135 | emu 136 | entrace 137 | entrance 138 | escort 139 | estimate 140 | eta 141 | etna 142 | evening 143 | excise 144 | excuse 145 | exploit 146 | export 147 | extract 148 | fine 149 | flower 150 | forbear 151 | four-legged 152 | frequent 153 | furrier 154 | gallant 155 | gel 156 | geminate 157 | gillie 158 | glower 159 | gotham 160 | graduate 161 | haggis 162 | heavy 163 | hinder 164 | house 165 | housewife 166 | impact 167 | imped 168 | implant 169 | implement 170 | import 171 | impress 172 | incense 173 | incline 174 | increase 175 | infix 176 | insert 177 | instar 178 | insult 179 | integral 180 | intercept 181 | interchange 182 | interflow 183 | interleaf 184 | intermediate 185 | intern 186 | interspace 187 | intimate 188 | intrigue 189 | invalid 190 | invert 191 | invite 192 | irony 193 | jagged 194 | jesses 195 | julies 196 | kite 197 | laminate 198 | laos 199 | lather 200 | lead 201 | learned 202 | leasing 203 | lech 204 | legitimate 205 | lied 206 | lima 207 | lipread 208 | live 209 | lower 210 | lunged 211 | maas 212 | magdalen 213 | manes 214 | mare 215 | marked 216 | merchandise 217 | merlion 218 | minute 219 | misconduct 220 | misled 221 | misprint 222 | mobile 223 | moderate 224 | mong 225 | moped 226 | moth 227 | mouth 228 | mow 229 | mpg 230 | multiply 231 | mush 232 | nana 233 | nice 234 | nice 235 | number 236 | numerate 237 | nun 238 | object 239 | opiate 240 | ornament 241 | outbox 242 | outcry 243 | outpour 244 | outreach 245 | outride 246 | outright 247 | outside 248 | outwork 249 | overall 250 | overbid 251 | overcall 252 | overcast 253 | overfall 254 | overflow 255 | overhaul 256 | overhead 257 | overlap 258 | overlay 259 | overuse 260 | overweight 261 | overwork 262 | pace 263 | palled 264 | palling 265 | para 266 | pasty 267 | pate 268 | pauline 269 | pedal 270 | peer 271 | perfect 272 | periodic 273 | permit 274 | pervert 275 | pinta 276 | placer 277 | platy 278 | polish 279 | polish 280 | poll 281 | pontificate 282 | postulate 283 | pram 284 | prayer 285 | precipitate 286 | predate 287 | predicate 288 | prefix 289 | preposition 290 | present 291 | pretest 292 | primer 293 | proceeds 294 | produce 295 | progress 296 | project 297 | proportionate 298 | prospect 299 | protest 300 | pussy 301 | putter 302 | putting 303 | quite 304 | ragged 305 | raven 306 | re 307 | read 308 | reading 309 | reading 310 | real 311 | rebel 312 | recall 313 | recap 314 | recitative 315 | recollect 316 | record 317 | recreate 318 | recreation 319 | redress 320 | refill 321 | refund 322 | refuse 323 | reject 324 | relay 325 | remake 326 | repaint 327 | reprint 328 | reread 329 | rerun 330 | resent 331 | reside 332 | resign 333 | respray 334 | resume 335 | retard 336 | retest 337 | retread 338 | rewrite 339 | root 340 | routed 341 | routing 342 | row 343 | rugged 344 | rummy 345 | sais 346 | sake 347 | sambuca 348 | saucier 349 | second 350 | secrete 351 | secreted 352 | secreting 353 | segment 354 | separate 355 | sewer 356 | shirk 357 | shower 358 | sin 359 | skied 360 | slaver 361 | slough 362 | sow 363 | spoof 364 | squid 365 | stingy 366 | subject 367 | subordinate 368 | subvert 369 | supply 370 | supposed 371 | survey 372 | suspect 373 | syringes 374 | tabulate 375 | tales 376 | tarrier 377 | tarry 378 | taxes 379 | taxis 380 | tear 381 | theron 382 | thou 383 | three-legged 384 | tier 385 | tinged 386 | torment 387 | transfer 388 | transform 389 | transplant 
390 | transport 391 | transpose 392 | tush 393 | two-legged 394 | unionised 395 | unionized 396 | update 397 | uplift 398 | upset 399 | use 400 | used 401 | vale 402 | violist 403 | viva 404 | ware 405 | whinged 406 | whoop 407 | wicked 408 | wind 409 | windy 410 | wino 411 | won 412 | worsted 413 | wound -------------------------------------------------------------------------------- /uberduck_ml_dev/text/letters_and_numbers.py: -------------------------------------------------------------------------------- 1 | """ adapted from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | 5 | _letters_and_numbers_re = re.compile( 6 | r"((?:[a-zA-Z]+[0-9]|[0-9]+[a-zA-Z])[a-zA-Z0-9']*)", re.IGNORECASE 7 | ) 8 | 9 | _hardware_re = re.compile( 10 | "([0-9]+(?:[.,][0-9]+)?)(?:\s?)(tb|gb|mb|kb|ghz|mhz|khz|hz|mm)", re.IGNORECASE 11 | ) 12 | _hardware_key = { 13 | "tb": "terabyte", 14 | "gb": "gigabyte", 15 | "mb": "megabyte", 16 | "kb": "kilobyte", 17 | "ghz": "gigahertz", 18 | "mhz": "megahertz", 19 | "khz": "kilohertz", 20 | "hz": "hertz", 21 | "mm": "millimeter", 22 | "cm": "centimeter", 23 | "km": "kilometer", 24 | } 25 | 26 | _dimension_re = re.compile( 27 | r"\b(\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?(?:in|inch|m)?)\b|\b(\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?(?:in|inch|m)?)\b" 28 | ) 29 | _dimension_key = {"m": "meter", "in": "inch", "inch": "inch"} 30 | 31 | 32 | def _expand_letters_and_numbers(m): 33 | text = re.split(r"(\d+)", m.group(0)) 34 | 35 | # remove trailing space 36 | if text[-1] == "": 37 | text = text[:-1] 38 | elif text[0] == "": 39 | text = text[1:] 40 | 41 | # if not like 1920s, or AK47's , 20th, 1st, 2nd, 3rd, etc... 42 | if text[-1] in ("'s", "s", "th", "nd", "st", "rd") and text[-2].isdigit(): 43 | text[-2] = text[-2] + text[-1] 44 | text = text[:-1] 45 | 46 | # for combining digits 2 by 2 47 | new_text = [] 48 | for i in range(len(text)): 49 | string = text[i] 50 | if string.isdigit() and len(string) < 5: 51 | # heuristics 52 | if len(string) > 2 and string[-2] == "0": 53 | if string[-1] == "0": 54 | string = [string] 55 | else: 56 | string = [string[:-3], string[-2], string[-1]] 57 | elif len(string) % 2 == 0: 58 | string = [string[i : i + 2] for i in range(0, len(string), 2)] 59 | elif len(string) > 2: 60 | string = [string[0]] + [ 61 | string[i : i + 2] for i in range(1, len(string), 2) 62 | ] 63 | new_text.extend(string) 64 | else: 65 | new_text.append(string) 66 | 67 | text = new_text 68 | text = " ".join(text) 69 | return text 70 | 71 | 72 | def _expand_hardware(m): 73 | quantity, measure = m.groups(0) 74 | measure = _hardware_key[measure.lower()] 75 | if measure[-1] != "z" and float(quantity.replace(",", "")) > 1: 76 | return "{} {}s".format(quantity, measure) 77 | return "{} {}".format(quantity, measure) 78 | 79 | 80 | def _expand_dimension(m): 81 | text = "".join([x for x in m.groups(0) if x != 0]) 82 | text = text.replace(" x ", " by ") 83 | text = text.replace("x", " by ") 84 | if text.endswith(tuple(_dimension_key.keys())): 85 | if text[-2].isdigit(): 86 | text = "{} {}".format(text[:-1], _dimension_key[text[-1:]]) 87 | elif text[-3].isdigit(): 88 | text = "{} {}".format(text[:-2], _dimension_key[text[-2:]]) 89 | return text 90 | 91 | 92 | def normalize_letters_and_numbers(text): 93 | text = re.sub(_hardware_re, _expand_hardware, text) 94 | text = re.sub(_dimension_re, _expand_dimension, text) 95 | text = re.sub(_letters_and_numbers_re, _expand_letters_and_numbers, text) 96 | return text 97 | 
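A quick usage sketch for normalize_letters_and_numbers (editorial example; the exact expansions depend on the regexes and heuristics above, so treat the outputs as illustrative rather than guaranteed):

    from uberduck_ml_dev.text.letters_and_numbers import normalize_letters_and_numbers

    normalize_letters_and_numbers("8gb")   # hardware units expand, e.g. "8 gigabytes"
    normalize_letters_and_numbers("WD40")  # mixed letter/digit tokens are split, e.g. "WD 40"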
-------------------------------------------------------------------------------- /uberduck_ml_dev/text/numerical.py: -------------------------------------------------------------------------------- 1 | """ adapted from https://github.com/keithito/tacotron """ 2 | 3 | import inflect 4 | import re 5 | 6 | _magnitudes = ["trillion", "billion", "million", "thousand", "hundred", "m", "b", "t"] 7 | _magnitudes_key = {"m": "million", "b": "billion", "t": "trillion"} 8 | _measurements = "(f|c|k|d|m)" 9 | _measurements_key = {"f": "fahrenheit", "c": "celsius", "k": "thousand", "m": "meters"} 10 | _currency_key = {"$": "dollar", "£": "pound", "€": "euro", "₩": "won"} 11 | _inflect = inflect.engine() 12 | _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])") 13 | _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)") 14 | _currency_re = re.compile( 15 | r"([\$€£₩])([0-9\.\,]*[0-9]+)(?:[ ]?({})(?=[^a-zA-Z]))?".format( 16 | "|".join(_magnitudes) 17 | ), 18 | re.IGNORECASE, 19 | ) 20 | _measurement_re = re.compile( 21 | r"([0-9\.\,]*[0-9]+(\s)?{}\b)".format(_measurements), re.IGNORECASE 22 | ) 23 | _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)") 24 | # _range_re = re.compile(r'(?<=[0-9])+(-)(?=[0-9])+.*?') 25 | _roman_re = re.compile( 26 | r"\b(?=[MDCLXVI]+\b)M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{2,3})\b" 27 | ) # avoid I 28 | _multiply_re = re.compile(r"(\b[0-9]+)(x)([0-9]+)") 29 | _number_re = re.compile(r"[0-9]+'s|[0-9]+s|[0-9]+") 30 | 31 | 32 | def _remove_commas(m): 33 | return m.group(1).replace(",", "") 34 | 35 | 36 | def _expand_decimal_point(m): 37 | return m.group(1).replace(".", " point ") 38 | 39 | 40 | def _expand_currency(m): 41 | currency = _currency_key[m.group(1)] 42 | quantity = m.group(2) 43 | magnitude = m.group(3) 44 | 45 | # remove commas from quantity to be able to convert to numerical 46 | quantity = quantity.replace(",", "") 47 | 48 | # check for million, billion, etc... 
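    # (Editorial comment, illustrative): within running text, an amount with a magnitude
    # word such as "$1.5 million" takes this branch and comes out roughly as
    # "one point five million dollars"; plain amounts like "$2.50" fall through to the
    # dollars/cents handling further down.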
49 | if magnitude is not None and magnitude.lower() in _magnitudes: 50 | if len(magnitude) == 1: 51 | magnitude = _magnitudes_key[magnitude.lower()] 52 | return "{} {} {}".format(_expand_hundreds(quantity), magnitude, currency + "s") 53 | 54 | parts = quantity.split(".") 55 | if len(parts) > 2: 56 | return quantity + " " + currency + "s" # Unexpected format 57 | 58 | dollars = int(parts[0]) if parts[0] else 0 59 | 60 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 61 | if dollars and cents: 62 | dollar_unit = currency if dollars == 1 else currency + "s" 63 | cent_unit = "cent" if cents == 1 else "cents" 64 | return "{} {}, {} {}".format( 65 | _expand_hundreds(dollars), 66 | dollar_unit, 67 | _inflect.number_to_words(cents), 68 | cent_unit, 69 | ) 70 | elif dollars: 71 | dollar_unit = currency if dollars == 1 else currency + "s" 72 | return "{} {}".format(_expand_hundreds(dollars), dollar_unit) 73 | elif cents: 74 | cent_unit = "cent" if cents == 1 else "cents" 75 | return "{} {}".format(_inflect.number_to_words(cents), cent_unit) 76 | else: 77 | return "zero" + " " + currency + "s" 78 | 79 | 80 | def _expand_hundreds(text): 81 | number = float(text) 82 | if number > 1000 < 10000 and (number % 100 == 0) and (number % 1000 != 0): 83 | return _inflect.number_to_words(int(number / 100)) + " hundred" 84 | else: 85 | return _inflect.number_to_words(text) 86 | 87 | 88 | def _expand_ordinal(m): 89 | return _inflect.number_to_words(m.group(0)) 90 | 91 | 92 | def _expand_measurement(m): 93 | _, number, measurement = re.split("(\d+(?:\.\d+)?)", m.group(0)) 94 | number = _inflect.number_to_words(number) 95 | measurement = "".join(measurement.split()) 96 | measurement = _measurements_key[measurement.lower()] 97 | return "{} {}".format(number, measurement) 98 | 99 | 100 | def _expand_range(m): 101 | return " to " 102 | 103 | 104 | def _expand_multiply(m): 105 | left = m.group(1) 106 | right = m.group(3) 107 | return "{} by {}".format(left, right) 108 | 109 | 110 | def _expand_roman(m): 111 | # from https://stackoverflow.com/questions/19308177/converting-roman-numerals-to-integers-in-python 112 | roman_numerals = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000} 113 | result = 0 114 | num = m.group(0) 115 | for i, c in enumerate(num): 116 | if (i + 1) == len(num) or roman_numerals[c] >= roman_numerals[num[i + 1]]: 117 | result += roman_numerals[c] 118 | else: 119 | result -= roman_numerals[c] 120 | return str(result) 121 | 122 | 123 | def _expand_number(m): 124 | _, number, suffix = re.split(r"(\d+(?:'?\d+)?)", m.group(0)) 125 | number = int(number) 126 | if ( 127 | number > 1000 128 | and number < 10000 129 | and (number % 100 == 0) 130 | and (number % 1000 != 0) 131 | ): 132 | text = _inflect.number_to_words(number // 100) + " hundred" 133 | elif number > 1000 and number < 3000: 134 | if number == 2000: 135 | text = "two thousand" 136 | elif number > 2000 and number < 2010: 137 | text = "two thousand " + _inflect.number_to_words(number % 100) 138 | elif number % 100 == 0: 139 | text = _inflect.number_to_words(number // 100) + " hundred" 140 | else: 141 | number = _inflect.number_to_words( 142 | number, andword="", zero="oh", group=2 143 | ).replace(", ", " ") 144 | number = re.sub(r"-", " ", number) 145 | text = number 146 | else: 147 | number = _inflect.number_to_words(number, andword="and") 148 | number = re.sub(r"-", " ", number) 149 | number = re.sub(r",", "", number) 150 | text = number 151 | 152 | if suffix in ("'s", "s"): 153 | if text[-1] == "y": 154 | text = 
text[:-1] + "ies" 155 | else: 156 | text = text + suffix 157 | 158 | return text 159 | 160 | 161 | def normalize_currency(text): 162 | return re.sub(_currency_re, _expand_currency, text) 163 | 164 | 165 | def normalize_numbers(text): 166 | text = re.sub(_comma_number_re, _remove_commas, text) 167 | text = re.sub(_currency_re, _expand_currency, text) 168 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 169 | text = re.sub(_ordinal_re, _expand_ordinal, text) 170 | # text = re.sub(_range_re, _expand_range, text) 171 | # text = re.sub(_measurement_re, _expand_measurement, text) 172 | text = re.sub(_roman_re, _expand_roman, text) 173 | text = re.sub(_multiply_re, _expand_multiply, text) 174 | text = re.sub(_number_re, _expand_number, text) 175 | return text 176 | -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/trainer/__init__.py -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/hifigan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/trainer/hifigan/__init__.py -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/hifigan/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.cuda.amp import GradScaler 3 | from ray.air.integrations.wandb import setup_wandb 4 | from torch.utils.data import DataLoader 5 | from torch.nn import functional as F 6 | 7 | from ...data.data import Dataset 8 | from ...models.rvc.rvc import MultiPeriodDiscriminator 9 | from ...models.hifigan import MultiDiscriminator 10 | 11 | from ...data.collate import Collate 12 | from ...losses_rvc import ( 13 | generator_loss, 14 | discriminator_loss, 15 | feature_loss, 16 | ) 17 | from .train_epoch import train_epoch 18 | from .train_step import train_step 19 | from ..rvc.train import DEFAULTS as DEFAULTS 20 | from ...models.hifigan import _load_uninitialized 21 | 22 | 23 | def train_func(config: dict, project: str = "rvc"): 24 | print("Entering training function") 25 | setup_wandb(config, project=project, entity="uberduck-ai", rank_zero_only=False) 26 | train_config = config["train"] 27 | model_config = config["model"] 28 | data_config = config["data"] 29 | 30 | generator = _load_uninitialized(config_overrides=model_config) 31 | 32 | # NOTE (Sam): RVC uses MultiPeriodDiscrimator that has a single scale discriminator 33 | # HiFi++ paper indicates that the precise discriminator structure is not important and that reweighting the loss is sufficient 34 | # Vocos uses additional strcuture. 
35 | discriminator = MultiDiscriminator(True) 36 | discriminator = discriminator.to("cuda") 37 | 38 | generator_optimizer = torch.optim.AdamW( 39 | generator.parameters(), 40 | train_config["learning_rate"], 41 | betas=train_config["betas"], 42 | eps=train_config["eps"], 43 | ) 44 | 45 | discriminator_optimizer = torch.optim.AdamW( 46 | discriminator.parameters(), 47 | train_config["learning_rate"], 48 | betas=train_config["betas"], 49 | eps=train_config["eps"], 50 | ) 51 | 52 | print("Loading checkpoints") 53 | # TODO (Sam): move to "warmstart" or "load_checkpoint" functions 54 | if train_config["warmstart_G_checkpoint_path"] is not None: 55 | generator_checkpoint = torch.load(train_config["warmstart_G_checkpoint_path"])[ 56 | "generator" 57 | ] 58 | generator.load_state_dict( 59 | generator_checkpoint 60 | ) # NOTE (Sam): a handful of "enc_q" decoder states not present - doesn't seem to cause an issue 61 | if train_config["warmstart_D_checkpoint_path"] is not None: 62 | discriminator_checkpoint = torch.load( 63 | train_config["warmstart_D_checkpoint_path"] 64 | )["model"] 65 | discriminator.load_state_dict(discriminator_checkpoint) 66 | 67 | generator = generator.cuda() 68 | discriminator = discriminator.cuda() 69 | 70 | models = {"generator": generator, "discriminator": discriminator} 71 | print("Loading dataset") 72 | 73 | train_dataset = Dataset( 74 | filelist_path=data_config["filelist_path"], 75 | mel_suffix=data_config["mel_suffix"], 76 | audio_suffix=data_config["audio_suffix"], 77 | ) 78 | 79 | # train_sampler = DistributedBucketSampler( 80 | # train_dataset, 81 | # train_config["batch_size"] * 1, 82 | # [100, 200, 300, 400, 500, 600, 700, 800, 900], # 16s 83 | # num_replicas=1, 84 | # rank=0, 85 | # shuffle=True, 86 | # ) 87 | train_loader = DataLoader( 88 | train_dataset, 89 | num_workers=1, 90 | shuffle=False, 91 | pin_memory=True, 92 | collate_fn=Collate(), 93 | batch_sampler=None, 94 | # batch_sampler=train_sampler, 95 | batch_size=train_config["batch_size"], 96 | persistent_workers=True, 97 | prefetch_factor=8, 98 | ) 99 | optimization_parameters = { 100 | "optimizers": { 101 | "generator": generator_optimizer, 102 | "discriminator": discriminator_optimizer, 103 | }, 104 | "scaler": GradScaler(), 105 | # NOTE (Sam): need to pass names rather than vector of losses since arguments differ 106 | "losses": { 107 | "l1": {"loss": F.l1_loss, "weight": 1.0}, 108 | "feature": {"loss": feature_loss, "weight": 1.0}, 109 | "generator": {"loss": generator_loss, "weight": 1.0}, 110 | "discriminator": {"loss": discriminator_loss, "weight": 1}, 111 | }, 112 | } 113 | 114 | iteration = 0 115 | start_epoch = 0 116 | print("Beginning training for ", train_config["epochs"], " epochs") 117 | for epoch in range(start_epoch, train_config["epochs"]): 118 | print(f"Epoch: {epoch}") 119 | iteration = train_epoch( 120 | train_step, 121 | train_loader, 122 | config, 123 | models, 124 | optimization_parameters, 125 | logging_parameters={}, 126 | iteration=iteration, 127 | ) 128 | -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/hifigan/train_epoch.py: -------------------------------------------------------------------------------- 1 | def train_epoch( 2 | _train_step, 3 | dataloader, 4 | config, 5 | models, 6 | optimization_parameters, 7 | logging_parameters, 8 | iteration, 9 | ): 10 | for batch in dataloader: 11 | print(iteration, "iteration") 12 | _train_step( 13 | batch, 14 | config, 15 | models, 16 | optimization_parameters, 17 | 
logging_parameters, 18 | iteration, 19 | ) 20 | iteration += 1 21 | 22 | return iteration 23 | -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/hifigan/train_step.py: -------------------------------------------------------------------------------- 1 | from torch.cuda.amp import autocast 2 | from ray.air import session 3 | from datetime import datetime 4 | from einops import rearrange 5 | 6 | from ...models.rvc.commons import clip_grad_value_, slice_segments 7 | from ...data.utils import ( 8 | mel_spectrogram_torch, 9 | spec_to_mel_torch, 10 | ) 11 | from ..log import log 12 | from ..rvc.save import save_checkpoint 13 | from ...models.rvc.commons import rand_slice_segments 14 | 15 | from ...data.data import MAX_WAV_VALUE 16 | 17 | 18 | # NOTE (Sam): passing dict arguments to functions is a bit of a code smell. 19 | # TODO (Sam): the data parameters have slightly different names here 20 | # (e.g. hop_length v hop_size, filter_length v n_fft, num_mels v n_mel_channels, win_length v win_size, mel_fmin v fmin) - unify. 21 | def train_step( 22 | batch, config, models, optimization_parameters, logging_parameters, iteration 23 | ): 24 | data_config = config["data"] 25 | train_config = config["train"] 26 | generator = models["generator"] 27 | discriminator = models["discriminator"] 28 | discriminator_optimizer = optimization_parameters["optimizers"]["discriminator"] 29 | generator_optimizer = optimization_parameters["optimizers"]["generator"] 30 | scaler = optimization_parameters["scaler"] 31 | discriminator_loss = optimization_parameters["losses"]["discriminator"]["loss"] 32 | # NOTE (Sam): The reason to pass the loss as a parameter rather than import it is to reuse the _train_step function for different losses. 33 | l1_loss = optimization_parameters["losses"]["l1"]["loss"] 34 | l1_loss_weight = optimization_parameters["losses"]["l1"]["weight"] 35 | generator_loss = optimization_parameters["losses"]["generator"]["loss"] 36 | generator_loss_weight = optimization_parameters["losses"]["generator"]["weight"] 37 | feature_loss = optimization_parameters["losses"]["feature"]["loss"] 38 | feature_loss_weight = optimization_parameters["losses"]["feature"]["weight"] 39 | 40 | batch = batch.to_gpu() 41 | mel_slices, ids_slice = rand_slice_segments( 42 | batch["mel_padded"], 43 | batch["mel_lengths"], 44 | train_config["segment_size"] // data_config["hop_size"], 45 | ) 46 | # NOTE (Sam): it looks like audio_hat is a 3 way tensor to reuse the slice method between mel and audio. 
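    # (Editorial comment, assumed shapes): mel_slices is taken to be (batch, n_mels, frames)
    # and the generator to return audio of shape (batch, 1, segment_size), which is why the
    # same slice/rearrange helpers can be applied to both mel and waveform tensors here.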
47 | audio_hat = generator(mel_slices) 48 | 49 | # with autocast(enabled=False): 50 | audio_sliced = slice_segments( 51 | batch["audio_padded"].unsqueeze(0) / MAX_WAV_VALUE, 52 | ids_slice * data_config["hop_size"], 53 | train_config["segment_size"], 54 | ) 55 | 56 | audio_sliced = rearrange(audio_sliced, "c b t -> b c t") 57 | 58 | y_d_hat_r, y_d_hat_g, _, _ = discriminator(audio_sliced, audio_hat.detach()) 59 | 60 | loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g) 61 | discriminator_optimizer.zero_grad() 62 | scaler.scale(loss_disc).backward() 63 | scaler.unscale_(discriminator_optimizer) 64 | grad_norm_d = clip_grad_value_(discriminator.parameters(), None) 65 | scaler.step(discriminator_optimizer) 66 | 67 | # with autocast(enabled=False): 68 | y_hat_mel = mel_spectrogram_torch( 69 | audio_hat.float().squeeze(1), 70 | data_config["n_fft"], 71 | data_config["num_mels"], 72 | data_config["sampling_rate"], 73 | data_config["hop_size"], 74 | data_config["win_size"], 75 | data_config["fmin"], 76 | data_config["fmax"], 77 | ) 78 | 79 | # if train_config["fp16_run"] == True: 80 | # y_hat_mel = y_hat_mel.half() 81 | # with autocast(enabled=train_config["fp16_run"]): 82 | # NOTE (Sam): y_d_hat are list of coordinates of real and generated data at the output of each block 83 | # fmap_r and fmap_g are the same except earlier in the network. 84 | y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = discriminator( 85 | audio_sliced, 86 | audio_hat, 87 | ) 88 | 89 | loss_mel = l1_loss(mel_slices, y_hat_mel) * train_config["c_mel"] 90 | loss_fm = feature_loss(fmap_r, fmap_g) 91 | loss_gen, losses_gen = generator_loss(y_d_hat_g) 92 | # TODO (Sam): put these in a loss_outputs dict like radtts 93 | loss_gen_all = ( 94 | loss_gen * generator_loss_weight 95 | + loss_fm * feature_loss_weight 96 | + loss_mel * l1_loss_weight 97 | ) 98 | 99 | generator_optimizer.zero_grad() 100 | scaler.scale(loss_gen_all).backward() 101 | scaler.unscale_(generator_optimizer) 102 | grad_norm_g = clip_grad_value_(generator.parameters(), None) 103 | scaler.step(generator_optimizer) 104 | scaler.update() 105 | 106 | print("iteration: ", iteration, datetime.now()) 107 | log_sample = iteration % train_config["steps_per_sample"] == 0 108 | log_checkpoint = iteration % train_config["iters_per_checkpoint"] == 0 109 | 110 | metrics = { 111 | "generator_total_loss": loss_gen_all, 112 | "generator_loss": loss_gen, 113 | "generator_feature_loss": loss_fm, 114 | "generator_loss_mel": loss_mel, 115 | # "discriminator_total_loss": loss_disc, 116 | } 117 | 118 | log(metrics) 119 | 120 | if log_sample and session.get_world_rank() == 0: 121 | import numpy as np 122 | 123 | audios = { 124 | "ground_truth": { 125 | "audio": audio_sliced[0][0] / np.abs(audio_sliced[0][0].cpu()).max() 126 | }, 127 | "generated": {"audio": audio_hat[0][0]}, 128 | } 129 | images = None 130 | 131 | log(audios=audios, images=images) 132 | if log_checkpoint and session.get_world_rank() == 0: 133 | checkpoint_path = f"{train_config['output_directory']}/model_{iteration}.pt" 134 | save_checkpoint( 135 | generator, 136 | generator_optimizer, 137 | discriminator, 138 | discriminator_optimizer, 139 | iteration, 140 | checkpoint_path, 141 | ) 142 | -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/load.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from collections import OrderedDict 3 | 4 | from torch.utils.data.distributed import 
DistributedSampler 5 | from torch.utils.data import DataLoader 6 | 7 | from ..data.data import DataRADTTS as Data 8 | from ..data.collate import DataCollateRADTTS as DataCollate 9 | 10 | 11 | # TODO (Sam): warmstart should load optimizer state as well. 12 | # load_pretrained should just be the state_dict 13 | def warmstart( 14 | checkpoint_path, model, include_layers=[], ignore_layers_warmstart=[], strict=False 15 | ): 16 | pretrained_dict = torch.load(checkpoint_path, map_location="cpu") 17 | pretrained_dict = pretrained_dict["state_dict"] 18 | 19 | is_module = False 20 | if list(pretrained_dict.keys())[0].startswith("module."): 21 | is_module = True 22 | if is_module: 23 | new_state_dict = OrderedDict() 24 | for k, v in pretrained_dict.items(): 25 | name = k[7:] # remove `module.` 26 | new_state_dict[name] = v 27 | pretrained_dict = new_state_dict 28 | 29 | model_dict = model.state_dict() 30 | model_dict.update(pretrained_dict) 31 | model.load_state_dict(model_dict, strict=strict) 32 | print("Warm started from {}".format(checkpoint_path)) 33 | model.train() 34 | return model 35 | 36 | 37 | def prepare_dataloaders(data_config, n_gpus, batch_size): 38 | # Get data, data loaders and collate function ready 39 | ignore_keys = ["training_files", "validation_files"] 40 | print("initializing training dataloader") 41 | trainset = Data( 42 | data_config["training_files"], 43 | **dict((k, v) for k, v in data_config.items() if k not in ignore_keys), 44 | ) 45 | 46 | print("initializing validation dataloader") 47 | data_config_val = data_config.copy() 48 | data_config_val["aug_probabilities"] = None # no aug in val set 49 | valset = Data( 50 | data_config["validation_files"], 51 | **dict((k, v) for k, v in data_config_val.items() if k not in ignore_keys), 52 | speaker_ids=trainset.speaker_ids, 53 | ) 54 | 55 | collate_fn = DataCollate() 56 | 57 | train_sampler, shuffle = None, True 58 | if n_gpus > 1: 59 | train_sampler, shuffle = DistributedSampler(trainset), False 60 | 61 | train_loader = DataLoader( 62 | trainset, 63 | num_workers=8, 64 | shuffle=shuffle, 65 | sampler=train_sampler, 66 | batch_size=batch_size, 67 | pin_memory=False, 68 | drop_last=True, 69 | collate_fn=collate_fn, 70 | ) 71 | 72 | return train_loader, valset, collate_fn 73 | -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/log.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import wandb 3 | from ray.air import session 4 | 5 | 6 | @torch.no_grad() 7 | def log(metrics=None, audios=None, images=None, sample_rate=22050): 8 | if session.get_world_rank() != 0: 9 | return 10 | audios = audios or {} 11 | images = images or {} 12 | wandb_metrics = {} 13 | if metrics is not None: 14 | wandb_metrics.update(metrics) 15 | 16 | for k, v in audios.items(): 17 | wandb_metrics[k] = wandb.Audio( 18 | v["audio"].cpu(), sample_rate=sample_rate, caption=v.get("caption") 19 | ) 20 | 21 | for k, v in images.items(): 22 | wandb_metrics[k] = wandb.Image(v) 23 | 24 | wandb.log(wandb_metrics) 25 | -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/radtts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/trainer/radtts/__init__.py -------------------------------------------------------------------------------- 
/uberduck_ml_dev/trainer/radtts/load.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from collections import OrderedDict 3 | 4 | from torch.utils.data.distributed import DistributedSampler 5 | from torch.utils.data import DataLoader 6 | 7 | from ...data.data import DataRADTTS as Data 8 | from ...data.collate import DataCollateRADTTS as DataCollate 9 | 10 | 11 | # TODO (Sam): warmstart should load optimizer state as well. 12 | # load_pretrained should just be the state_dict 13 | def warmstart( 14 | checkpoint_path, model, include_layers=[], ignore_layers_warmstart=[], strict=False 15 | ): 16 | pretrained_dict = torch.load(checkpoint_path, map_location="cpu") 17 | pretrained_dict = pretrained_dict["state_dict"] 18 | 19 | is_module = False 20 | if list(pretrained_dict.keys())[0].startswith("module."): 21 | is_module = True 22 | if is_module: 23 | new_state_dict = OrderedDict() 24 | for k, v in pretrained_dict.items(): 25 | name = k[7:] # remove `module.` 26 | new_state_dict[name] = v 27 | pretrained_dict = new_state_dict 28 | 29 | model_dict = model.state_dict() 30 | model_dict.update(pretrained_dict) 31 | model.load_state_dict(model_dict, strict=strict) 32 | print("Warm started from {}".format(checkpoint_path)) 33 | model.train() 34 | return model 35 | 36 | 37 | def prepare_dataloaders(data_config, n_gpus, batch_size): 38 | # Get data, data loaders and collate function ready 39 | ignore_keys = ["training_files", "validation_files"] 40 | print("initializing training dataloader") 41 | trainset = Data( 42 | data_config["training_files"], 43 | **dict((k, v) for k, v in data_config.items() if k not in ignore_keys), 44 | ) 45 | 46 | print("initializing validation dataloader") 47 | data_config_val = data_config.copy() 48 | data_config_val["aug_probabilities"] = None # no aug in val set 49 | valset = Data( 50 | data_config["validation_files"], 51 | **dict((k, v) for k, v in data_config_val.items() if k not in ignore_keys), 52 | speaker_ids=trainset.speaker_ids, 53 | ) 54 | 55 | collate_fn = DataCollate() 56 | 57 | train_sampler, shuffle = None, True 58 | if n_gpus > 1: 59 | train_sampler, shuffle = DistributedSampler(trainset), False 60 | 61 | train_loader = DataLoader( 62 | trainset, 63 | num_workers=data_config["num_workers"], 64 | shuffle=shuffle, 65 | sampler=train_sampler, 66 | batch_size=batch_size, 67 | pin_memory=False, 68 | drop_last=True, 69 | collate_fn=collate_fn, 70 | ) 71 | 72 | return train_loader, valset, collate_fn 73 | -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/radtts/save.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def save_checkpoint(model, optimizer, iteration, filepath): 5 | print( 6 | "Saving model and optimizer state at iteration {} to {}".format( 7 | iteration, filepath 8 | ) 9 | ) 10 | 11 | # NOTE (Sam): learning rate not accessible here 12 | torch.save( 13 | { 14 | "state_dict": model.state_dict(), 15 | "iteration": iteration, 16 | "optimizer": optimizer.state_dict(), 17 | }, 18 | filepath, 19 | ) 20 | -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/radtts/train_epoch.py: -------------------------------------------------------------------------------- 1 | from .train_step import _train_step 2 | 3 | 4 | # NOTE (Sam): uncomment to run with torch DataLoader rather than ray dataset 5 | def train_epoch( 6 | train_dataloader, 7 | 
log_decoder_samples, 8 | log_attribute_samples, 9 | model, 10 | optim, 11 | steps_per_sample, 12 | scaler, 13 | iters_per_checkpoint, 14 | output_directory, 15 | criterion, 16 | attention_kl_loss, 17 | kl_loss_start_iter, 18 | binarization_start_iter, 19 | iteration, 20 | vocoder, 21 | ): 22 | # def train_epoch(dataset_shard, batch_size, model, optim, steps_per_sample, scaler, scheduler, criterion, attention_kl_loss, kl_loss_start_iter, binarization_start_iter, epoch, iteration): 23 | # for batch_idx, ray_batch_df in enumerate( 24 | # dataset_shard.iter_batches(batch_size=batch_size, prefetch_blocks=6) 25 | # ): 26 | # NOTE (Sam): uncomment to run with torch DataLoader rather than ray dataset 27 | for batch in train_dataloader: 28 | _train_step( 29 | # ray_batch_df, 30 | # NOTE (Sam): uncomment to run with torch DataLoader rather than ray dataset 31 | batch, 32 | model, 33 | optim, 34 | iteration, 35 | log_decoder_samples, 36 | log_attribute_samples, 37 | steps_per_sample, 38 | scaler, 39 | iters_per_checkpoint, 40 | output_directory, 41 | criterion, 42 | attention_kl_loss, 43 | kl_loss_start_iter, 44 | binarization_start_iter, 45 | vocoder, 46 | ) 47 | iteration += 1 48 | 49 | return iteration 50 | -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/radtts/train_step.py: -------------------------------------------------------------------------------- 1 | # NOTE (Sam): for use with ray trainer. 2 | from datetime import datetime 3 | 4 | import torch 5 | from torch.cuda.amp import autocast 6 | from ray.air import session 7 | 8 | from .log import get_log_audio 9 | from ..log import log 10 | from .save import save_checkpoint 11 | from ...utils.utils import ( 12 | to_gpu, 13 | ) 14 | 15 | 16 | # TODO (Sam): it seems like much of this can be made generic for multiple models. 17 | def _train_step( 18 | batch, 19 | model, 20 | optim, 21 | iteration, 22 | log_decoder_samples, 23 | log_attribute_samples, 24 | steps_per_sample, 25 | scaler, 26 | iters_per_checkpoint, 27 | output_directory, 28 | criterion, 29 | attention_kl_loss, 30 | kl_loss_start_iter, 31 | binarization_start_iter, 32 | vocoder, 33 | ): 34 | print(datetime.now(), "entering train step:", iteration) 35 | if iteration >= binarization_start_iter: 36 | binarize = True 37 | else: 38 | binarize = False 39 | 40 | optim.zero_grad() 41 | 42 | with autocast(enabled=False): 43 | batch_dict = batch # torch DataLoader? 44 | # TODO (Sam): move to batch.go_gpu(). 
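        # (Editorial sketch, hypothetical helper): such a Batch.to_gpu() method could replace
        # the per-key to_gpu(...) calls below for the tensor-valued entries, e.g.
        #   batch_dict = {k: (to_gpu(v) if torch.is_tensor(v) else v) for k, v in batch_dict.items()}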
45 | mel = to_gpu(batch_dict["mel"]) 46 | speaker_ids = to_gpu(batch_dict["speaker_ids"]) 47 | attn_prior = to_gpu(batch_dict["attn_prior"]) 48 | f0 = to_gpu(batch_dict["f0"]) 49 | voiced_mask = to_gpu(batch_dict["voiced_mask"]) 50 | text = to_gpu(batch_dict["text"]) 51 | in_lens = to_gpu(batch_dict["input_lengths"]) 52 | out_lens = to_gpu(batch_dict["output_lengths"]) 53 | energy_avg = to_gpu(batch_dict["energy_avg"]) 54 | audio_embedding = to_gpu(batch_dict["audio_embedding"]) 55 | 56 | outputs = model( 57 | mel, 58 | speaker_ids, 59 | text, 60 | in_lens, 61 | out_lens, 62 | binarize_attention=binarize, 63 | attn_prior=attn_prior, 64 | f0=f0, 65 | energy_avg=energy_avg, 66 | voiced_mask=voiced_mask, 67 | audio_embedding=audio_embedding, 68 | ) 69 | 70 | loss_outputs = criterion(outputs, in_lens, out_lens) 71 | 72 | print_list = [] 73 | loss = None 74 | for k, (v, w) in loss_outputs.items(): 75 | if w > 0: 76 | loss = v * w if loss is None else loss + v * w 77 | print_list.append(" | {}: {:.3f}".format(k, v)) 78 | 79 | w_bin = criterion.loss_weights.get("binarization_loss_weight", 1.0) 80 | if binarize and iteration >= kl_loss_start_iter: 81 | binarization_loss = attention_kl_loss(outputs["attn"], outputs["attn_soft"]) 82 | loss += binarization_loss * w_bin 83 | else: 84 | binarization_loss = torch.zeros_like(loss) 85 | loss_outputs["binarization_loss"] = (binarization_loss, w_bin) 86 | grad_clip_val = 1.0 # TODO (Sam): make this a config option 87 | print(print_list) 88 | scaler.scale(loss).backward() 89 | if grad_clip_val > 0: 90 | scaler.unscale_(optim) 91 | torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_val) 92 | 93 | scaler.step(optim) 94 | scaler.update() 95 | 96 | metrics = {"loss": loss.item()} 97 | for k, (v, w) in loss_outputs.items(): 98 | metrics[k] = v.item() 99 | 100 | print("iteration: ", iteration, datetime.now()) 101 | log_sample = iteration % steps_per_sample == 0 102 | log_checkpoint = iteration % iters_per_checkpoint == 0 103 | 104 | if log_sample and session.get_world_rank() == 0: 105 | model.eval() 106 | # TODO (Sam): adding tf output logging and out of distribution inference 107 | # TODO (Sam): add logging of ground truth 108 | images, audios = get_log_audio( 109 | batch_dict, 110 | log_decoder_samples, 111 | log_attribute_samples, 112 | model, 113 | speaker_ids, 114 | text, 115 | f0, 116 | energy_avg, 117 | voiced_mask, 118 | vocoder, 119 | ) 120 | # TODO (Sam): make out of sample logging cleaner. 121 | # NOTE (Sam): right now this requires precomputation of embeddings and isn't out of sample zero shot. 
122 | # gt_path = "/usr/src/app/radtts/ground_truth" 123 | # oos_embs = os.listdir(gt_path) 124 | # # this doesn't help for reasons described above 125 | # for oos_name in oos_embs: 126 | # audio_embedding_oos = torch.load(f"{gt_path}/{oos_name}").cuda() 127 | # _, audios_oos = get_log_audio( 128 | # outputs, 129 | # batch_dict, 130 | # log_decoder_samples, 131 | # log_attribute_samples, 132 | # model, 133 | # speaker_ids, 134 | # text, 135 | # f0, 136 | # energy_avg, 137 | # voiced_mask, 138 | # vocoder, 139 | # oos_name=oos_name, 140 | # audio_embedding_oos=audio_embedding_oos, 141 | # ) 142 | # audios.update(audios_oos) 143 | log( 144 | metrics, 145 | audios, 146 | sample_rate=getattr(vocoder, "sr", 22050), 147 | images=images, 148 | ) 149 | model.train() 150 | else: 151 | log(metrics) 152 | 153 | if log_checkpoint and session.get_world_rank() == 0: 154 | checkpoint_path = f"{output_directory}/model_{iteration}.pt" 155 | save_checkpoint(model, optim, iteration, checkpoint_path) 156 | 157 | print(f"Loss: {loss.item()}") 158 | -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/rvc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/trainer/rvc/__init__.py -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/rvc/save.py: -------------------------------------------------------------------------------- 1 | # TODO (Sam): combine with radtts save_checkpoint 2 | import torch 3 | 4 | 5 | def save_checkpoint( 6 | generator, 7 | generator_optimizer, 8 | discriminator, 9 | discriminator_optimizer, 10 | iteration, 11 | filepath, 12 | ): 13 | print( 14 | "Saving model and optimizer state at iteration {} to {}".format( 15 | iteration, filepath 16 | ) 17 | ) 18 | 19 | # TODO (Sam): figure out where to put learning rate. 
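    # (Editorial sketch, illustrative): the matching restore for this checkpoint layout
    # would look roughly like:
    #   ckpt = torch.load(filepath, map_location="cpu")
    #   generator.load_state_dict(ckpt["generator_state_dict"])
    #   discriminator.load_state_dict(ckpt["discriminator_state_dict"])
    #   generator_optimizer.load_state_dict(ckpt["generator_optimizer"])
    #   discriminator_optimizer.load_state_dict(ckpt["discriminator_optimizer"])
    #   iteration = ckpt["iteration"]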
20 | torch.save( 21 | { 22 | "generator_state_dict": generator.state_dict(), 23 | "iteration": iteration, 24 | "generator_optimizer": generator_optimizer.state_dict(), 25 | "discriminator_state_dict": discriminator.state_dict(), 26 | "discriminator_optimizer": discriminator_optimizer.state_dict(), 27 | }, 28 | filepath, 29 | ) 30 | -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/rvc/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.cuda.amp import GradScaler 3 | from ray.air.integrations.wandb import setup_wandb 4 | from torch.utils.data import DataLoader 5 | from torch.nn import functional as F 6 | 7 | from .train_epoch import train_epoch 8 | from ...models.rvc.rvc import ( 9 | SynthesizerTrnMs256NSFsid, 10 | MultiPeriodDiscriminator, 11 | ) 12 | from ...vendor.tfcompat.hparam import HParams 13 | from ...data.data import ( 14 | TextAudioLoaderMultiNSFsid, 15 | DistributedBucketSampler, 16 | ) 17 | from ...data.collate import TextAudioCollateMultiNSFsid 18 | from ...losses_rvc import ( 19 | generator_loss, 20 | discriminator_loss, 21 | feature_loss, 22 | kl_loss, 23 | ) 24 | from uberduck_ml_dev.trainer.rvc.train_epoch import train_epoch 25 | 26 | 27 | def train_func(config: dict, project: str = "rvc"): 28 | print("Entering training function") 29 | setup_wandb(config, project=project, entity="uberduck-ai", rank_zero_only=False) 30 | train_config = config["train"] 31 | model_config = config["model"] 32 | data_config = config["data"] 33 | 34 | generator = SynthesizerTrnMs256NSFsid( 35 | data_config["filter_length"] // 2 + 1, 36 | train_config["segment_size"] // data_config["hop_length"], 37 | **model_config, 38 | is_half=train_config["fp16_run"], 39 | sr=data_config["sampling_rate"], 40 | ) 41 | 42 | discriminator = MultiPeriodDiscriminator(model_config["use_spectral_norm"]) 43 | generator_optimizer = torch.optim.AdamW( 44 | generator.parameters(), 45 | train_config["learning_rate"], 46 | betas=train_config["betas"], 47 | eps=train_config["eps"], 48 | ) 49 | 50 | discriminator_optimizer = torch.optim.AdamW( 51 | discriminator.parameters(), 52 | train_config["learning_rate"], 53 | betas=train_config["betas"], 54 | eps=train_config["eps"], 55 | ) 56 | 57 | print("Loading checkpoints") 58 | # TODO (Sam): move to "warmstart" or "load_checkpoint" functions 59 | generator_checkpoint = torch.load(train_config["warmstart_G_checkpoint_path"])[ 60 | "model" 61 | ] 62 | discriminator_checkpoint = torch.load(train_config["warmstart_D_checkpoint_path"])[ 63 | "model" 64 | ] 65 | discriminator.load_state_dict(discriminator_checkpoint) 66 | generator.load_state_dict( 67 | generator_checkpoint, strict=False 68 | ) # NOTE (Sam): a handful of "enc_q" decoder states not present 69 | generator = generator.cuda() 70 | discriminator = discriminator.cuda() 71 | 72 | models = {"generator": generator, "discriminator": discriminator} 73 | 74 | print("Loading dataset") 75 | train_dataset = TextAudioLoaderMultiNSFsid( 76 | train_config["filelist_path"], HParams(**data_config) 77 | ) # dv is sid 78 | collate_fn = TextAudioCollateMultiNSFsid() 79 | n_gpus = 1 80 | train_sampler = DistributedBucketSampler( 81 | train_dataset, 82 | train_config["batch_size"] * n_gpus, 83 | [100, 200, 300, 400, 500, 600, 700, 800, 900], # 16s 84 | num_replicas=n_gpus, 85 | rank=0, 86 | shuffle=True, 87 | ) 88 | train_loader = DataLoader( 89 | train_dataset, 90 | num_workers=1, 91 | shuffle=False, 92 | 
pin_memory=True, 93 | collate_fn=collate_fn, 94 | batch_sampler=train_sampler, 95 | persistent_workers=True, 96 | prefetch_factor=8, 97 | ) 98 | optimization_parameters = { 99 | "optimizers": { 100 | "generator": generator_optimizer, 101 | "discriminator": discriminator_optimizer, 102 | }, 103 | "scaler": GradScaler(), 104 | # NOTE (Sam): need to pass names rather than vector of losses since arguments differ 105 | "losses": { 106 | "l1": {"loss": F.l1_loss, "weight": 1.0}, 107 | "kl": {"loss": kl_loss, "weight": 1.0}, 108 | "feature": {"loss": feature_loss, "weight": 1.0}, 109 | "generator": {"loss": generator_loss, "weight": 1.0}, 110 | "discriminator": {"loss": discriminator_loss, "weight": 1}, 111 | }, 112 | } 113 | 114 | iteration = 0 115 | start_epoch = 0 116 | print("Beginning training for ", train_config["epochs"], " epochs") 117 | for epoch in range(start_epoch, train_config["epochs"]): 118 | print(f"Epoch: {epoch}") 119 | iteration = train_epoch( 120 | train_loader, 121 | config, 122 | models, 123 | optimization_parameters, 124 | logging_parameters={}, 125 | iteration=iteration, 126 | ) 127 | 128 | 129 | # 40k config 130 | DEFAULTS = { 131 | "log_interval": 200, 132 | "seed": 1234, 133 | "epochs": 20000, 134 | "learning_rate": 1e-4, 135 | "betas": [0.8, 0.99], 136 | "eps": 1e-9, 137 | "batch_size": 4, 138 | "fp16_run": False, 139 | "lr_decay": 0.999875, 140 | "segment_size": 12800, 141 | "init_lr_ratio": 1, 142 | "warmup_epochs": 0, 143 | "c_mel": 45, 144 | "c_kl": 1.0, 145 | "steps_per_sample": 100, 146 | "iters_per_checkpoint": 100, 147 | "output_directory": "/tmp", 148 | } 149 | -------------------------------------------------------------------------------- /uberduck_ml_dev/trainer/rvc/train_epoch.py: -------------------------------------------------------------------------------- 1 | # TODO (Sam): add config arguments to model / optimization / logging and remove. 2 | from .train_step import _train_step 3 | 4 | 5 | def train_epoch( 6 | dataloader, 7 | config, 8 | models, 9 | optimization_parameters, 10 | logging_parameters, 11 | iteration, 12 | ): 13 | for batch in dataloader: 14 | print(iteration, "iteration") 15 | _train_step( 16 | batch, 17 | config, 18 | models, 19 | optimization_parameters, 20 | logging_parameters, 21 | iteration, 22 | ) 23 | iteration += 1 24 | 25 | return iteration 26 | -------------------------------------------------------------------------------- /uberduck_ml_dev/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/utils/__init__.py -------------------------------------------------------------------------------- /uberduck_ml_dev/utils/config.py: -------------------------------------------------------------------------------- 1 | from ..models.tacotron2 import DEFAULTS as TACOTRON2_DEFAULTS 2 | 3 | 4 | def tacotron2_training_to_model_config(training_config): 5 | shared_keys = set(TACOTRON2_DEFAULTS.values().keys()).intersection( 6 | training_config.keys() 7 | ) 8 | # NOTE (Sam): only need to save non-default parameters in config unless defaults change. 
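    # (Editorial comment, illustrative): e.g. for training_config = {"n_mel_channels": 80,
    # "learning_rate": 1e-3}, only the keys that also appear among the Tacotron 2 model
    # defaults (here presumably "n_mel_channels") survive into the minimal model config.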
9 | minimal_model_config = {k: training_config[k] for k in shared_keys} 10 | return minimal_model_config 11 | -------------------------------------------------------------------------------- /uberduck_ml_dev/utils/denoiser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Removes bias from HiFi-Gan and Avocodo (typically heard as noise in the audio) 3 | 4 | Usage: 5 | from denoiser import Denoiser 6 | denoiser = Denoiser(HIFIGANGENERATOR, mode="normal") # Experiment with modes "normal" and "zeros" 7 | 8 | # Inference Vocoder 9 | audio = hifigan.vocoder.forward(output[1][:1]) 10 | 11 | audio = audio.squeeze() 12 | audio = audio * 32768.0 13 | 14 | # Denoise 15 | audio_denoised = denoiser(audio.view(1, -1), strength=15)[:, 0] # Change strength if needed 16 | 17 | audio_denoised = audio_denoised.cpu().detach().numpy().reshape(-1) 18 | normalize = (32768.0 / np.max(np.abs(audio_denoised))) ** 0.9 19 | audio_denoised = audio_denoised * normalize 20 | """ 21 | 22 | import sys 23 | import torch 24 | from ..models.common import STFT 25 | 26 | 27 | class Denoiser(torch.nn.Module): 28 | """WaveGlow denoiser, adapted for HiFi-GAN""" 29 | 30 | def __init__( 31 | self, hifigan, filter_length=1024, n_overlap=4, win_length=1024, mode="zeros" 32 | ): 33 | super(Denoiser, self).__init__() 34 | self.stft = STFT( 35 | filter_length=filter_length, 36 | hop_length=int(filter_length / n_overlap), 37 | win_length=win_length, 38 | device=torch.device("cpu"), 39 | ) 40 | 41 | if mode == "zeros": 42 | mel_input = torch.zeros((1, 80, 88)) 43 | elif mode == "normal": 44 | mel_input = torch.randn((1, 80, 88)) 45 | else: 46 | raise Exception("Mode {} if not supported".format(mode)) 47 | 48 | with torch.no_grad(): 49 | bias_audio = ( 50 | hifigan.vocoder.forward(mel_input.to(hifigan.device)) 51 | .view(1, -1) 52 | .float() 53 | ) 54 | bias_spec, _ = self.stft.transform(bias_audio.cpu()) 55 | 56 | self.register_buffer("bias_spec", bias_spec[:, :, 0][:, :, None]) 57 | 58 | def forward(self, audio, strength=10): 59 | """ 60 | Strength is the amount of bias you want to be removed from the final audio. 61 | Note: A higher strength may remove too much information in the original audio. 62 | 63 | :param audio: Audio data 64 | :param strength: Amount of bias removal. 
Recommended range 10 - 50 65 | :return: Denoised audio 66 | :rtype: tensor 67 | """ 68 | 69 | audio_spec, audio_angles = self.stft.transform(audio.cpu()) 70 | audio_spec_denoised = audio_spec - self.bias_spec * strength 71 | audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) 72 | audio_denoised = self.stft.inverse(audio_spec_denoised.cpu(), audio_angles) 73 | return audio_denoised 74 | -------------------------------------------------------------------------------- /uberduck_ml_dev/utils/exec.py: -------------------------------------------------------------------------------- 1 | __all__ = ["parse_args"] 2 | 3 | import argparse 4 | 5 | 6 | def parse_args(args): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--config", help="Path to JSON config") 9 | args = parser.parse_args(args) 10 | return args 11 | -------------------------------------------------------------------------------- /uberduck_ml_dev/utils/hifiutils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import matplotlib 4 | import torch 5 | from torch.nn.utils import weight_norm 6 | 7 | matplotlib.use("Agg") 8 | import matplotlib.pylab as plt 9 | 10 | 11 | def plot_spectrogram(spectrogram): 12 | fig, ax = plt.subplots(figsize=(10, 2)) 13 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") 14 | plt.colorbar(im, ax=ax) 15 | 16 | fig.canvas.draw() 17 | plt.close() 18 | 19 | return fig 20 | 21 | 22 | def init_weights(m, mean=0.0, std=0.01): 23 | classname = m.__class__.__name__ 24 | if classname.find("Conv") != -1: 25 | m.weight.data.normal_(mean, std) 26 | 27 | 28 | def apply_weight_norm(m): 29 | classname = m.__class__.__name__ 30 | if classname.find("Conv") != -1: 31 | weight_norm(m) 32 | 33 | 34 | def get_padding(kernel_size, dilation=1): 35 | return int((kernel_size * dilation - dilation) / 2) 36 | 37 | 38 | def load_checkpoint(filepath, device): 39 | assert os.path.isfile(filepath) 40 | print("Loading '{}'".format(filepath)) 41 | checkpoint_dict = torch.load(filepath, map_location=device) 42 | print("Complete.") 43 | return checkpoint_dict 44 | 45 | 46 | def save_checkpoint(filepath, obj): 47 | print("Saving checkpoint to {}".format(filepath)) 48 | torch.save(obj, filepath) 49 | print("Complete.") 50 | 51 | 52 | def scan_checkpoint(cp_dir, prefix): 53 | pattern = os.path.join(cp_dir, prefix + "????????") 54 | cp_list = glob.glob(pattern) 55 | if len(cp_list) == 0: 56 | return None 57 | return sorted(cp_list)[-1] 58 | -------------------------------------------------------------------------------- /uberduck_ml_dev/utils/plot.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "save_figure_to_numpy", 3 | "plot_tensor", 4 | "plot_spectrogram", 5 | "plot_attention", 6 | "plot_attention_phonemes", 7 | "plot_gate_outputs", 8 | ] 9 | 10 | 11 | import numpy as np 12 | import matplotlib 13 | 14 | matplotlib.use("Agg") 15 | import matplotlib.pyplot as plt 16 | 17 | from ..text.symbols import id_to_symbol, DEFAULT_SYMBOLS 18 | 19 | 20 | def save_figure_to_numpy(fig): 21 | """Save figure to a numpy array.""" 22 | data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") 23 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 24 | plt.close(fig) 25 | return data 26 | 27 | 28 | def plot_tensor(tensor): 29 | plt.style.use("default") 30 | fig, ax = plt.subplots(figsize=(12, 3)) 31 | im = ax.imshow(tensor, aspect="auto", origin="lower", 
interpolation="none") 32 | plt.colorbar(im, ax=ax) 33 | plt.tight_layout() 34 | fig.canvas.draw() 35 | data = save_figure_to_numpy(fig) 36 | plt.close() 37 | return data 38 | 39 | 40 | def plot_spectrogram(mel): 41 | figure = plt.figure() 42 | plt.xlabel("Spectrogram frame") 43 | plt.ylabel("Channel") 44 | plt.imshow(mel, aspect="auto", origin="lower", interpolation="none", cmap="inferno") 45 | figure.canvas.draw() 46 | return figure 47 | 48 | 49 | def plot_attention(attention, encoder_length=None, decoder_length=None): 50 | figure = plt.figure() 51 | plt.xlabel("Decoder timestep") 52 | plt.ylabel("Encoder timestep") 53 | plt.imshow( 54 | attention.data.cpu().numpy(), 55 | aspect="auto", 56 | origin="lower", 57 | interpolation="none", 58 | cmap="inferno", 59 | ) 60 | title_info = [] 61 | if encoder_length is not None: 62 | title_info.append(f"Encoder_length: {encoder_length}") 63 | if decoder_length is not None: 64 | title_info.append(f"Decoder length: {decoder_length}") 65 | title = " ".join(title_info) 66 | plt.title(title) 67 | figure.canvas.draw() 68 | return figure 69 | 70 | 71 | def plot_attention_phonemes(seq, attention, symbol_set=DEFAULT_SYMBOLS): 72 | figure = plt.figure(figsize=(15, 8)) 73 | phonemes = [] 74 | 75 | for token in seq.numpy(): 76 | if token == len(id_to_symbol[symbol_set]): 77 | phonemes.append("~") 78 | else: 79 | phonemes.append(id_to_symbol[symbol_set][token][1:]) 80 | 81 | xtick_locs = np.pad( 82 | np.cumsum(np.sum(attention.data.cpu().numpy(), axis=1)), (1, 0) 83 | ).astype(np.int16)[:-1] 84 | ytick_locs = np.arange(seq.shape[-1]) 85 | plt.yticks(ytick_locs, phonemes) 86 | plt.xticks(xtick_locs, xtick_locs) 87 | 88 | plt.imshow( 89 | attention.data.cpu().numpy(), 90 | aspect="auto", 91 | origin="lower", 92 | interpolation="none", 93 | cmap="Greys", 94 | ) 95 | 96 | i = 0 97 | for phon, y in zip(phonemes, ytick_locs): 98 | if phon == "~": 99 | continue 100 | if i == 4: 101 | plt.axhline(y=y, color="k") 102 | if i == 3: 103 | plt.axhline(y=y, color="r") 104 | if i == 2: 105 | plt.axhline(y=y, color="g") 106 | if i == 1: 107 | plt.axhline(y=y, color="b") 108 | if i == 0: 109 | plt.axhline(y=y, color="m") 110 | i += 1 111 | i = i % 5 112 | 113 | plt.grid(axis="x") 114 | plt.title("Phoneme Alignment") 115 | plt.xlabel("Time (mel frames)") 116 | plt.ylabel("Phonemes") 117 | 118 | return figure 119 | 120 | 121 | def plot_gate_outputs(gate_targets=None, gate_outputs=None): 122 | figure = plt.figure() 123 | plt.xlabel("Frames") 124 | plt.ylabel("Gate state") 125 | ax = figure.add_axes([0, 0, 1, 1]) 126 | if gate_targets is not None: 127 | ax.scatter( 128 | range(gate_targets.size(0)), 129 | gate_targets, 130 | alpha=0.5, 131 | color="green", 132 | marker="+", 133 | s=1, 134 | label="target", 135 | ) 136 | if gate_outputs is not None: 137 | ax.scatter( 138 | range(gate_outputs.size(0)), 139 | gate_outputs, 140 | alpha=0.5, 141 | color="red", 142 | marker=".", 143 | s=1, 144 | label="predicted", 145 | ) 146 | figure.canvas.draw() 147 | return figure 148 | 149 | 150 | def plot_alignment_to_numpy( 151 | alignment, title="", info=None, phoneme_seq=None, vmin=None, vmax=None 152 | ): 153 | if phoneme_seq: 154 | fig, ax = plt.subplots(figsize=(15, 10)) 155 | else: 156 | fig, ax = plt.subplots(figsize=(6, 4)) 157 | im = ax.imshow( 158 | alignment, 159 | aspect="auto", 160 | origin="lower", 161 | interpolation="none", 162 | vmin=vmin, 163 | vmax=vmax, 164 | ) 165 | ax.set_title(title) 166 | fig.colorbar(im, ax=ax) 167 | xlabel = "Decoder timestep" 168 | if info is not None: 169 
| xlabel += "\n\n" + info 170 | plt.xlabel(xlabel) 171 | plt.ylabel("Encoder timestep") 172 | plt.tight_layout() 173 | 174 | if phoneme_seq is not None: 175 | # For debugging phonemes and durations in alignment maps; not used by default in training code 176 | ax.set_yticks(np.arange(len(phoneme_seq))) 177 | ax.set_yticklabels(phoneme_seq) 178 | ax.hlines(np.arange(len(phoneme_seq)), xmin=0.0, xmax=max(ax.get_xticks())) 179 | 180 | fig.canvas.draw() 181 | data = save_figure_to_numpy(fig) 182 | plt.close() 183 | return data 184 | -------------------------------------------------------------------------------- /uberduck_ml_dev/vendor/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /uberduck_ml_dev/vendor/tfcompat/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/vendor/tfcompat/__init__.py --------------------------------------------------------------------------------
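Taken together, the utilities above cover the vocode → denoise → visualize path. The sketch below is not part of the repository; it simply wires the pieces together following the usage notes in the denoiser docstring. `hifigan` is assumed to be an already-loaded model wrapper exposing the `.vocoder` and `.device` attributes that `Denoiser` expects, `mel` an (1, 80, T) mel-spectrogram tensor, and `vocode_and_denoise` a hypothetical helper name.

import numpy as np
import torch

from uberduck_ml_dev.utils.denoiser import Denoiser
from uberduck_ml_dev.utils.plot import plot_spectrogram


def vocode_and_denoise(hifigan, mel, strength=15):
    # Hypothetical helper (not defined in the repo); assumes `hifigan` has .vocoder and .device.
    # Build the denoiser once; it caches the vocoder's bias spectrum internally.
    denoiser = Denoiser(hifigan, mode="zeros")
    with torch.no_grad():
        audio = hifigan.vocoder.forward(mel.to(hifigan.device)).squeeze() * 32768.0
    # Subtract the cached bias spectrum, mirroring the denoiser module docstring.
    denoised = denoiser(audio.view(1, -1), strength=strength)[:, 0]
    denoised = denoised.cpu().detach().numpy().reshape(-1)
    denoised = denoised * (32768.0 / np.max(np.abs(denoised))) ** 0.9  # peak normalize
    fig = plot_spectrogram(mel.squeeze(0).cpu().numpy())  # sanity-check the input mel
    return denoised, fig

A strength in the 10–50 range is what the docstring recommends; higher values strip more of the vocoder bias but, as noted above, can also remove information from the original audio.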