├── .github └── workflows │ └── main.yml ├── .gitignore ├── .pre-commit-config.yaml ├── MANIFEST.in ├── README.md ├── analytics ├── dependencies │ ├── details.json │ └── details.png └── tests │ ├── __init__.py │ ├── conftest.py │ ├── fixtures │ ├── ljtest │ │ ├── list.txt │ │ ├── list_small.txt │ │ ├── taco2_lj2lj.json │ │ └── wavs │ │ │ ├── LJ001-0001.wav │ │ │ ├── LJ001-0002.wav │ │ │ ├── LJ001-0003.wav │ │ │ ├── LJ001-0004.wav │ │ │ ├── LJ001-0005.wav │ │ │ ├── LJ001-0006.wav │ │ │ ├── LJ001-0007.wav │ │ │ ├── LJ001-0008.wav │ │ │ ├── LJ001-0009.wav │ │ │ ├── LJ001-0010.wav │ │ │ ├── LJ001-0011.wav │ │ │ ├── LJ001-0012.wav │ │ │ ├── LJ001-0013.wav │ │ │ ├── LJ001-0014.wav │ │ │ ├── LJ001-0015.wav │ │ │ └── LJ001-0016.wav │ ├── sample_spectrogram.pt │ ├── sample_spectrogram_tf.pt │ ├── stevejobs-1.pt │ ├── val.txt │ └── wavs │ │ └── stevejobs-1.wav │ ├── pytest.ini │ └── tests │ ├── __init__.py │ ├── models │ ├── __init__.py │ ├── test_common.py │ └── test_tacotron2.py │ ├── test_data_loader.py │ ├── text │ ├── test_symbols.py │ └── test_util.py │ ├── trainer │ ├── __init__.py │ └── test_trainer.py │ ├── utils │ └── test_utils.py │ └── vocoders │ └── test_hifi_gan.py ├── licenses ├── LICENSE ├── LICENSE2 ├── LICENSE3 ├── LICENSE4 └── LICENSE5 ├── settings.ini ├── setup.py ├── tutorials ├── hifigan │ ├── data_processing.py │ └── download.sh └── radtts │ ├── demo_config.json │ ├── download.sh │ ├── radtts_data_processing.ipynb │ └── train.sh └── uberduck_ml_dev ├── __init__.py ├── assets └── duck.png ├── data ├── __init__.py ├── batch.py ├── collate.py ├── data.py ├── get.py ├── hifigan.py ├── normalization.py ├── processor.py ├── ray.py ├── spectrogram.py ├── statistics.py └── utils.py ├── e2e.py ├── exec ├── __init__.py ├── dataset_statistics.py ├── normalize_audio.py ├── split_train_val.py ├── train_radtts_with_ray.py ├── train_tacotron2.py └── train_vits.py ├── losses.py ├── losses_rvc.py ├── models ├── __init__.py ├── avocodo.py ├── base.py ├── common.py ├── components │ ├── __init__.py │ ├── alignment.py │ ├── attention.py │ ├── attentions.py │ ├── attribute_prediction_model.py │ ├── autoregressive_flow.py │ ├── decoders │ │ ├── __init__.py │ │ └── tacotron2.py │ ├── encoders │ │ ├── __init__.py │ │ ├── duration.py │ │ ├── resnet_speaker_encoder.py │ │ ├── speaker │ │ │ ├── __init__.py │ │ │ ├── base_encoder.py │ │ │ └── resnet.py │ │ └── tacotron2.py │ ├── partialconv1d.py │ ├── postnet.py │ ├── prenet.py │ ├── splines.py │ └── transformer.py ├── hifigan.py ├── hubert.py ├── radtts.py ├── rvc │ ├── __init__.py │ ├── attentions.py │ ├── commons.py │ ├── modules.py │ ├── rmvpe.py │ ├── rvc.py │ ├── transforms.py │ ├── vc.py │ └── vc_infer_pipeline.py ├── tacotron2.py ├── torchmoji.py ├── transforms.py ├── utils.py └── vits.py ├── monitoring ├── __init__.py ├── generate.py ├── statistics.py ├── streamlit.py └── wandb.py ├── monotonic_align.py ├── optimizers └── radam.py ├── text ├── __init__.py ├── abbreviations.py ├── acronyms.py ├── cleaners.py ├── cmudict-0.7b ├── cmudict.py ├── datestime.py ├── grapheme_dictionary.py ├── heteronyms ├── letters_and_numbers.py ├── numerical.py ├── symbols.py ├── text_processing.py └── utils.py ├── trainer ├── __init__.py ├── base.py ├── hifigan │ ├── __init__.py │ ├── train.py │ ├── train_epoch.py │ └── train_step.py ├── load.py ├── log.py ├── radtts │ ├── __init__.py │ ├── load.py │ ├── log.py │ ├── save.py │ ├── train.py │ ├── train_epoch.py │ └── train_step.py ├── rvc │ ├── __init__.py │ ├── save.py │ ├── train.py │ ├── train_epoch.py │ ├── 
train_step.py │ └── utils.py └── tacotron2.py ├── utils ├── __init__.py ├── audio.py ├── config.py ├── denoiser.py ├── exec.py ├── hifiutils.py ├── plot.py └── utils.py └── vendor ├── __init__.py └── tfcompat ├── __init__.py └── hparam.py /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: [push, pull_request] 3 | jobs: 4 | build: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v1 8 | - uses: actions/setup-python@v1 9 | with: 10 | python-version: "3.10" 11 | architecture: "x64" 12 | - name: Install OS dependencies 13 | run: | 14 | sudo apt-get update 15 | sudo apt-get install espeak libsndfile-dev 16 | - name: Install the library 17 | run: | 18 | pip install -e . 19 | - name: Run tests 20 | run: | 21 | python -m pytest 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | experiments/outputs/* 2 | experiments/outputs_processed/* 3 | experiments/logs/* 4 | src/vendor_tools/* 5 | docker/secrets/gcloud_key.json 6 | docker/secrets/* 7 | nbs/test/fixtures/models/* 8 | nbs/test/fixtures/results/* 9 | test_checkpoint/* 10 | 11 | *.bak 12 | .gitattributes 13 | .last_checked 14 | .gitconfig 15 | *.bak 16 | *.log 17 | *~ 18 | ~* 19 | _tmp* 20 | tmp* 21 | tags 22 | 23 | # Byte-compiled / optimized / DLL files 24 | __pycache__/ 25 | *.py[cod] 26 | *$py.class 27 | 28 | # C extensions 29 | *.so 30 | 31 | # Distribution / packaging 32 | .Python 33 | env/ 34 | build/ 35 | develop-eggs/ 36 | dist/ 37 | downloads/ 38 | eggs/ 39 | .eggs/ 40 | lib/ 41 | lib64/ 42 | parts/ 43 | sdist/ 44 | var/ 45 | wheels/ 46 | *.egg-info/ 47 | .installed.cfg 48 | *.egg 49 | 50 | # PyInstaller 51 | # Usually these files are written by a python script from a template 52 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 53 | *.manifest 54 | *.spec 55 | 56 | # Installer logs 57 | pip-log.txt 58 | pip-delete-this-directory.txt 59 | 60 | # Unit test / coverage reports 61 | htmlcov/ 62 | .tox/ 63 | .coverage 64 | .coverage.* 65 | .cache 66 | nosetests.xml 67 | coverage.xml 68 | *.cover 69 | .hypothesis/ 70 | 71 | # Translations 72 | *.mo 73 | *.pot 74 | 75 | # Django stuff: 76 | *.log 77 | local_settings.py 78 | 79 | # Flask stuff: 80 | instance/ 81 | .webassets-cache 82 | 83 | # Scrapy stuff: 84 | .scrapy 85 | 86 | # Sphinx documentation 87 | docs/_build/ 88 | docs/.jekyll-cache 89 | 90 | # PyBuilder 91 | target/ 92 | 93 | # Jupyter Notebook 94 | .ipynb_checkpoints 95 | 96 | # pyenv 97 | .python-version 98 | 99 | # celery beat schedule file 100 | celerybeat-schedule 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # dotenv 106 | .env 107 | 108 | # virtualenv 109 | .venv 110 | venv/ 111 | ENV/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | 126 | .vscode 127 | *.swp 128 | 129 | # osx generated files 130 | .DS_Store 131 | .DS_Store? 
132 | .Trashes 133 | ehthumbs.db 134 | Thumbs.db 135 | .idea 136 | 137 | # pytest 138 | .pytest_cache 139 | 140 | # tools/trust-doc-nbs 141 | docs_src/.last_checked 142 | 143 | # symlinks to fastai 144 | docs_src/fastai 145 | tools/fastai 146 | 147 | # link checker 148 | checklink/cookies.txt 149 | 150 | # .gitconfig is now autogenerated 151 | .gitconfig 152 | 153 | nbs/runs 154 | events.out.tfevents* 155 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.3.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - repo: https://github.com/psf/black 9 | rev: 22.3.0 10 | hooks: 11 | - id: black 12 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include settings.ini 2 | include README.md 3 | recursive-exclude * __pycache__ 4 | include uberduck_ml_dev/text/* 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deprecation note 2 | We are moving away from maintaining this repository. 3 | 4 | # 🦆 ~~Uberduck Synthetic Speech~~ 5 | ![](https://img.shields.io/github/forks/uberduck-ai/uberduck-ml-dev) 6 | ![](https://img.shields.io/github/stars/uberduck-ai/uberduck-ml-dev) 7 | ![](https://img.shields.io/github/issues/uberduck-ai/uberduck-ml-dev) 8 | ![GithubActions](https://github.com/uberduck-ai/uberduck-ml-dev/actions/workflows/main.yml/badge.svg) 9 | [![Discord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.com/invite/ATYWnMu) 10 | 11 | This repository includes 12 | 17 | 18 | Notebooks are available [**here**](https://app.uberduck.ai/), and models to fine-tune from are available here [**here**](https://huggingface.co/Uberduck). 19 | 20 | ## Summary 21 | 22 | ![Summary](https://github.com/uberduck-ai/uberduck-ml-dev/blob/master/analytics/dependencies/details.png) 23 | 24 | ## Installation 25 | 26 | If you want to install on your own machine, create a virtual environment and install like 27 | 28 | ```bash 29 | conda create -n 'test-env' python=3.10 -y 30 | source activate test-env 31 | pip install git+https://github.com/uberduck-ai/uberduck-ml-dev 32 | ``` 33 | 34 | ## Training 35 | 36 | Train a radtts on LJ Speech as follows 37 | 38 | ``` 39 | cd tutorials/radtts 40 | bash download.sh 41 | bash train.sh 42 | ``` 43 | 44 | You will need to adjust the paths and potentially other training settings in `tutorials/radtts/demo_config.json`. 45 | This code has been tested on a single T4 as well as 2 A6000s. 46 | 47 | For processing of new datasets, see `tutorials/radtts/radtts_data_processing.ipynb`. 48 | 49 | # Development 50 | 51 | We love contributions! 52 | 53 | ## Installation 54 | 55 | To install in development mode, run 56 | 57 | ```bash 58 | pip install pre-commit black # format your code on commit by installing black! 
59 | git clone git@github.com:uberduck-ai/uberduck-ml-dev.git 60 | cd uberduck-ml-dev 61 | pre-commit install # Install required Git hooks 62 | python setup.py develop # Install the library 63 | ``` 64 | 65 | ## 🚩 Testing 66 | 67 | In an environment or image with uberduck-ml-dev installed, run 68 | 69 | ```bash 70 | cd uberduck-ml-dev 71 | python -m pytest 72 | ``` 73 | -------------------------------------------------------------------------------- /analytics/dependencies/details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/dependencies/details.png -------------------------------------------------------------------------------- /analytics/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/__init__.py -------------------------------------------------------------------------------- /analytics/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pytest 3 | import os 4 | import tempfile 5 | 6 | import torch 7 | 8 | import requests 9 | 10 | from uberduck_ml_dev.models.tacotron2 import DEFAULTS as TACOTRON2_DEFAULTS 11 | from uberduck_ml_dev.models.tacotron2 import Tacotron2 12 | from uberduck_ml_dev.trainer.tacotron2 import ( 13 | Tacotron2Trainer, 14 | DEFAULTS as TACOTRON2_TRAINER_DEFAULTS, 15 | ) 16 | from uberduck_ml_dev.vendor.tfcompat.hparam import HParams 17 | 18 | 19 | # NOTE (Sam): move to Tacotron2 model and remove from Uberduck repo. 20 | def _load_tacotron_uninitialized(overrides=None): 21 | overrides = overrides or {} 22 | defaults = dict(**TACOTRON2_DEFAULTS.values()) 23 | defaults.update(overrides) 24 | hparams = HParams(**defaults) 25 | return Tacotron2(hparams) 26 | 27 | 28 | @pytest.fixture(scope="session") 29 | def lj_speech_tacotron2_file(): 30 | tf = tempfile.NamedTemporaryFile(suffix=".pt") 31 | # tf.close() 32 | # NOTE (Sam): A canonical LJ statedict used in our warm starting notebook. 33 | url_ = "https://uberduck-demo.s3.us-west-2.amazonaws.com/tacotron2_statedict_lj_test.pt" 34 | res = requests.get(url_) 35 | if res.status_code == 200: # http 200 means success 36 | with open(tf.name, "wb") as file_handle: # wb means Write Binary 37 | file_handle.write(res.content) 38 | 39 | return tf 40 | 41 | 42 | @pytest.fixture 43 | def lj_speech_tacotron2(lj_speech_tacotron2_file): 44 | # NOTE (Sam): this override should no longer be necessary. 45 | device = "cpu" 46 | config_overrides = {} 47 | config_overrides["cudnn_enabled"] = device != "cpu" 48 | _model = _load_tacotron_uninitialized(config_overrides) 49 | checkpoint = torch.load(lj_speech_tacotron2_file.name, map_location=device) 50 | _model.from_pretrained(model_dict=checkpoint["state_dict"], device=device) 51 | 52 | return _model 53 | 54 | 55 | @pytest.fixture 56 | def sample_inference_spectrogram(): 57 | # NOTE (Sam): made in Uberduck container using current test code in test_stft_seed. 58 | inference_spectrogram = torch.load( 59 | os.path.join(os.path.dirname(__file__), "fixtures/sample_spectrogram.pt") 60 | ) 61 | return inference_spectrogram 62 | 63 | 64 | @pytest.fixture 65 | def sample_inference_tf_spectrogram(): 66 | # NOTE (Sam): made with above at timestep 111 and text = "I, Sam, am a very bad boy." 
67 | inference_spectrogram = torch.load( 68 | os.path.join(os.path.dirname(__file__), "fixtures/sample_spectrogram_tf.pt") 69 | ) 70 | 71 | return inference_spectrogram 72 | 73 | 74 | @pytest.fixture() 75 | def lj_trainer(lj_speech_tacotron2_file): 76 | # NOTE (Sam): It may be nicer to specify trainer here and test-specific parameters (e.g. data) in test itself. 77 | config = TACOTRON2_TRAINER_DEFAULTS.values() 78 | params = dict( 79 | warm_start_name=lj_speech_tacotron2_file.name, 80 | training_audiopaths_and_text=os.path.join( 81 | os.path.dirname(__file__), "fixtures/ljtest/list_small.txt" 82 | ), 83 | val_audiopaths_and_text=os.path.join( 84 | os.path.dirname(__file__), "fixtures/ljtest/list_small.txt" 85 | ), 86 | checkpoint_name="test", 87 | checkpoint_path="test_checkpoint", 88 | epochs=3, 89 | log_dir="", 90 | debug=True, 91 | batch_size=4, 92 | learning_rate=1e-4, 93 | # NOTE (Sam): this effects the reduction in loss in the gradient descent, 94 | # so we need a separate test of validation and logging code. 95 | is_validate=False, 96 | ) 97 | config.update(params) 98 | hparams = HParams(**config) 99 | 100 | trainer = Tacotron2Trainer(hparams, rank=0, world_size=1) 101 | 102 | return trainer 103 | -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/list.txt: -------------------------------------------------------------------------------- 1 | analytics/tests/fixtures/ljtest/wavs/LJ001-0001.wav|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|0 2 | analytics/tests/fixtures/ljtest/wavs/LJ001-0002.wav|in being comparatively modern.|0 3 | analytics/tests/fixtures/ljtest/wavs/LJ001-0003.wav|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|0 4 | analytics/tests/fixtures/ljtest/wavs/LJ001-0004.wav|produced the block books, which were the immediate predecessors of the true printed book,|0 5 | analytics/tests/fixtures/ljtest/wavs/LJ001-0005.wav|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|0 6 | analytics/tests/fixtures/ljtest/wavs/LJ001-0006.wav|And it is worth mention in passing that, as an example of fine typography,|0 7 | analytics/tests/fixtures/ljtest/wavs/LJ001-0007.wav|the earliest book printed with movable types, the Gutenberg, or forty-two line Bible of about 1455,|0 8 | analytics/tests/fixtures/ljtest/wavs/LJ001-0008.wav|has never been surpassed.|0 9 | analytics/tests/fixtures/ljtest/wavs/LJ001-0009.wav|Printing, then, for our purpose, may be considered as the art of making books by means of movable types.|0 10 | analytics/tests/fixtures/ljtest/wavs/LJ001-0010.wav|Now, as all books not primarily intended as picture-books consist principally of types composed to form letterpress,|0 11 | analytics/tests/fixtures/ljtest/wavs/LJ001-0011.wav|it is of the first importance that the letter used should be fine in form;|0 12 | analytics/tests/fixtures/ljtest/wavs/LJ001-0012.wav|especially as no more time is occupied, or cost incurred, in casting, setting, or printing beautiful letters|0 13 | analytics/tests/fixtures/ljtest/wavs/LJ001-0013.wav|than in the same operations with ugly ones.|0 14 | analytics/tests/fixtures/ljtest/wavs/LJ001-0014.wav|And it was a matter of course that in the Middle Ages, when the craftsmen took care 
that beautiful form should always be a part of their productions whatever they were,|0 15 | analytics/tests/fixtures/ljtest/wavs/LJ001-0015.wav|the forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves.|0 16 | analytics/tests/fixtures/ljtest/wavs/LJ001-0016.wav|The Middle Ages brought calligraphy to perfection, and it was natural therefore|0 17 | -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/list_small.txt: -------------------------------------------------------------------------------- 1 | analytics/tests/fixtures/ljtest/wavs/LJ001-0001.wav|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|0 2 | analytics/tests/fixtures/ljtest/wavs/LJ001-0002.wav|in being comparatively modern.|0 3 | analytics/tests/fixtures/ljtest/wavs/LJ001-0003.wav|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|0 4 | analytics/tests/fixtures/ljtest/wavs/LJ001-0004.wav|produced the block books, which were the immediate predecessors of the true printed book,|0 5 | -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/taco2_lj2lj.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 16, 3 | "checkpoint_name": null, 4 | "checkpoint_path": "analytics/tests/fixtures/results/checkpoints", 5 | "cudnn_enabled": false, 6 | "dataset_path": "./dataset", 7 | "debug": false, 8 | "distributed_run": false, 9 | "epochs": 5, 10 | "epochs_per_checkpoint": 4, 11 | "filter_length": 1024, 12 | "fp16_run": false, 13 | "grad_clip_thresh": 1.0, 14 | "hop_length": 256, 15 | "ignore_layers": ["speaker_embedding.weight"], 16 | "include_f0": false, 17 | "learning_rate": 1e-3, 18 | "log_dir": "analytics/tests/fixtures/results/logs", 19 | "mask_padding": true, 20 | "max_wav_value": 32768.0, 21 | "mel_fmax": 8000, 22 | "mel_fmin": 0, 23 | "n_frames_per_step_initial": 1, 24 | "n_mel_channels": 80, 25 | "symbol_set": "nvidia_taco2", 26 | "n_symbols": 148, 27 | "n_speakers": 1, 28 | "p_arpabet": 1.0, 29 | "reduction_window_schedule": [ 30 | { 31 | "until_step": 10000, 32 | "batch_size": 16, 33 | "n_frames_per_step": 1 34 | }, 35 | { 36 | "until_step": 50000, 37 | "batch_size": 16, 38 | "n_frames_per_step": 1 39 | }, 40 | { 41 | "until_step": 60000, 42 | "batch_size": 16, 43 | "n_frames_per_step": 1 44 | }, 45 | { 46 | "until_step": 70000, 47 | "batch_size": 16, 48 | "n_frames_per_step": 1 49 | }, 50 | { 51 | "until_step": null, 52 | "batch_size": 16, 53 | "n_frames_per_step": 1 54 | } 55 | ], 56 | "sample_inference_speaker_ids": [0], 57 | "seed": 1234, 58 | "sampling_rate": 22050, 59 | "steps_per_sample": 100, 60 | "text_cleaners": ["english_cleaners"], 61 | "training_audiopaths_and_text": "analytics/tests/fixtures/ljtest/list.txt", 62 | "val_audiopaths_and_text": "analytics/tests/fixtures/ljtest/list.txt", 63 | "warm_start_name": "analytics/tests/fixtures/models/taco2ljdefault", 64 | "weight_decay": 1e-6, 65 | "win_length": 1024 66 | } 67 | -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0001.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/fixtures/ljtest/wavs/LJ001-0001.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/fixtures/ljtest/wavs/LJ001-0002.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/fixtures/ljtest/wavs/LJ001-0003.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/fixtures/ljtest/wavs/LJ001-0004.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/fixtures/ljtest/wavs/LJ001-0005.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/fixtures/ljtest/wavs/LJ001-0006.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/fixtures/ljtest/wavs/LJ001-0007.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0008.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/fixtures/ljtest/wavs/LJ001-0008.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0009.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/fixtures/ljtest/wavs/LJ001-0009.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0010.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/fixtures/ljtest/wavs/LJ001-0010.wav 
-------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0011.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/fixtures/ljtest/wavs/LJ001-0011.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0012.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/fixtures/ljtest/wavs/LJ001-0012.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0013.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/fixtures/ljtest/wavs/LJ001-0013.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0014.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/fixtures/ljtest/wavs/LJ001-0014.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0015.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/fixtures/ljtest/wavs/LJ001-0015.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/ljtest/wavs/LJ001-0016.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/fixtures/ljtest/wavs/LJ001-0016.wav -------------------------------------------------------------------------------- /analytics/tests/fixtures/sample_spectrogram.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/fixtures/sample_spectrogram.pt -------------------------------------------------------------------------------- /analytics/tests/fixtures/sample_spectrogram_tf.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/fixtures/sample_spectrogram_tf.pt -------------------------------------------------------------------------------- /analytics/tests/fixtures/stevejobs-1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/fixtures/stevejobs-1.pt -------------------------------------------------------------------------------- /analytics/tests/fixtures/val.txt: -------------------------------------------------------------------------------- 1 | analytics/tests/fixtures/wavs/stevejobs-1.wav|{ W 
EH1 L } { Y UW1 } { N OW1 } , { AE1 Z } { Y UW1 } { N OW1 } , { DH AH0 } { W EH1 B Z } { AH0 } { P R IH1 T IY0 } { M ER0 AE1 K Y AH0 L AH0 S } { TH IH1 NG } . { AH0 N D } { IH1 T } { W AA1 Z } { AH0 } { V EH1 R IY0 } { S IH1 M P AH0 L } { P EH1 R AH0 D AY2 M } { DH AE1 T } { W AA1 Z } { IH0 N V EH1 N T AH0 D } { W IH1 CH } { W AA1 Z } .|0 2 | -------------------------------------------------------------------------------- /analytics/tests/fixtures/wavs/stevejobs-1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/fixtures/wavs/stevejobs-1.wav -------------------------------------------------------------------------------- /analytics/tests/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | python_files = test_*.py -------------------------------------------------------------------------------- /analytics/tests/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/tests/__init__.py -------------------------------------------------------------------------------- /analytics/tests/tests/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/tests/models/__init__.py -------------------------------------------------------------------------------- /analytics/tests/tests/models/test_common.py: -------------------------------------------------------------------------------- 1 | from uberduck_ml_dev.models.common import MelSTFT 2 | import torch 3 | 4 | 5 | class TestCommon: 6 | def test_mel_stft(self): 7 | mel_stft = MelSTFT() 8 | mel = mel_stft.mel_spectrogram(torch.clip(torch.randn(1, 1000), -1, 1)) 9 | assert mel.shape[0] == 1 10 | assert mel.shape[1] == 80 11 | -------------------------------------------------------------------------------- /analytics/tests/tests/test_data_loader.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import DataLoader 2 | 3 | from uberduck_ml_dev.data.utils import oversample 4 | from uberduck_ml_dev.data.data import Data 5 | from uberduck_ml_dev.data.collate import Collate 6 | 7 | 8 | class TestTextMelCollation: 9 | def test_oversample(self): 10 | mock_fts = [ 11 | ("speaker0/1.wav", "Test one two", "0"), 12 | ("speaker0/2.wav", "Test one two", "0"), 13 | ("speaker1/1.wav", "Test one two", "1"), 14 | ] 15 | assert oversample(mock_fts, {"1": 3}) == [ 16 | ("speaker0/1.wav", "Test one two", "0"), 17 | ("speaker0/2.wav", "Test one two", "0"), 18 | ("speaker1/1.wav", "Test one two", "1"), 19 | ("speaker1/1.wav", "Test one two", "1"), 20 | ("speaker1/1.wav", "Test one two", "1"), 21 | ] 22 | 23 | def test_batch_structure(self): 24 | ds = Data( 25 | "analytics/tests/fixtures/val.txt", 26 | debug=True, 27 | debug_dataset_size=12, 28 | symbol_set="default", 29 | ) 30 | assert len(ds) == 1 31 | collate_fn = Collate() 32 | dl = DataLoader(ds, 12, collate_fn=collate_fn) 33 | for i, batch in enumerate(dl): 34 | assert len(batch) == 6 35 | 36 | def test_batch_dimensions(self): 37 | ds = Data( 38 | audiopaths_and_text="analytics/tests/fixtures/val.txt", 39 | debug=True, 40 | 
debug_dataset_size=12, 41 | symbol_set="default", 42 | ) 43 | assert len(ds) == 1 44 | collate_fn = Collate() 45 | dl = DataLoader(ds, 12, collate_fn=collate_fn) 46 | for i, batch in enumerate(dl): 47 | output_lengths = batch["mel_lengths"] 48 | gate_target = batch["gate_padded"] 49 | mel_padded = batch["mel_padded"] 50 | assert output_lengths.item() == 566 51 | assert gate_target.size(1) == 566 52 | assert mel_padded.size(2) == 566 53 | assert len(batch) == 6 54 | 55 | def test_batch_dimensions_partial(self): 56 | ds = Data( 57 | "analytics/tests/fixtures/val.txt", 58 | debug=True, 59 | debug_dataset_size=12, 60 | symbol_set="default", 61 | ) 62 | assert len(ds) == 1 63 | collate_fn = Collate(n_frames_per_step=5) 64 | dl = DataLoader(ds, 12, collate_fn=collate_fn) 65 | for i, batch in enumerate(dl): 66 | assert batch["mel_lengths"].item() == 566 67 | assert ( 68 | batch["mel_padded"].size(2) == 566 69 | ) # I'm not sure why this was 570 - maybe 566 + 5 (i.e. the n_frames_per_step) 70 | assert batch["gate_padded"].size(1) == 566 71 | assert len(batch) == 6 72 | -------------------------------------------------------------------------------- /analytics/tests/tests/text/test_symbols.py: -------------------------------------------------------------------------------- 1 | from uberduck_ml_dev.text.symbols import arpabet_to_sequence, symbols_to_sequence 2 | 3 | 4 | class TestSymbols: 5 | def test_arpabet_to_sequence(self): 6 | # NOTE: arpabet_to_sequence does not properly handle whitespace, it should take single words only. 7 | assert ( 8 | len( 9 | arpabet_to_sequence( 10 | "{ S IY } { EH M } { Y UW } { D IH K SH AH N EH R IY }" 11 | ) 12 | ) 13 | == 15 14 | ) 15 | assert arpabet_to_sequence("{ S IY }") == [168, 148] 16 | # But symbols_to_sequence hanldes whitespace 17 | 18 | def test_symbols_to_sequence(self): 19 | assert len(symbols_to_sequence("C M U Dictionary")) == 16 20 | -------------------------------------------------------------------------------- /analytics/tests/tests/text/test_util.py: -------------------------------------------------------------------------------- 1 | from uberduck_ml_dev.text.utils import ( 2 | cleaned_text_to_sequence, 3 | text_to_sequence, 4 | DEFAULT_SYMBOLS, 5 | sequence_to_text, 6 | ) 7 | 8 | 9 | class TestTextUtils: 10 | def text_sequence_to_text(self): 11 | print(text_to_sequence("The pen is | blue.| ", ["english_cleaners"])) 12 | assert len(text_to_sequence("The pen is blue.", ["english_cleaners"])) == 16 13 | assert ( 14 | len(text_to_sequence("The pen is {B L OW0}.", ["english_cleaners"])) == 15 15 | ) 16 | assert ( 17 | sequence_to_text(text_to_sequence("The pen is blue.", ["english_cleaners"])) 18 | == "the pen is blue." 19 | ), sequence_to_text(text_to_sequence("The pen is blue.", ["english_cleaners"])) 20 | assert ( 21 | sequence_to_text( 22 | text_to_sequence("The pen is {B L OW0}.", ["english_cleaners"]) 23 | ) 24 | == "the pen is {B L OW0}." 
25 | ) 26 | assert ( 27 | len( 28 | text_to_sequence( 29 | "{N AA1 T} {B AE1 D} {B AA1 R T}, {N AA1 T} {B AE1 D} {AE1 T} {AO1 L}.", 30 | ["english_cleaners"], 31 | ) 32 | ) 33 | == 28 34 | ) 35 | 36 | assert ( 37 | len( 38 | text_to_sequence( 39 | "Not bad bart, not bad at all.", ["english_cleaners"], p_arpabet=1.0 40 | ) 41 | ) 42 | == 28 43 | ) 44 | 45 | def test_text_to_sequence(self): 46 | assert cleaned_text_to_sequence( 47 | "Not bad bart, not bad at all", DEFAULT_SYMBOLS 48 | ) == [ 49 | 62, 50 | 89, 51 | 94, 52 | 9, 53 | 76, 54 | 75, 55 | 78, 56 | 9, 57 | 76, 58 | 75, 59 | 92, 60 | 94, 61 | 4, 62 | 9, 63 | 88, 64 | 89, 65 | 94, 66 | 9, 67 | 76, 68 | 75, 69 | 78, 70 | 9, 71 | 75, 72 | 94, 73 | 9, 74 | 75, 75 | 86, 76 | 86, 77 | ] 78 | assert text_to_sequence( 79 | "Not bad bart, not bad at all", ["english_cleaners"], 0.0, DEFAULT_SYMBOLS 80 | ) == [ 81 | 88, 82 | 89, 83 | 94, 84 | 9, 85 | 76, 86 | 75, 87 | 78, 88 | 9, 89 | 76, 90 | 75, 91 | 92, 92 | 94, 93 | 4, 94 | 9, 95 | 88, 96 | 89, 97 | 94, 98 | 9, 99 | 76, 100 | 75, 101 | 78, 102 | 9, 103 | 75, 104 | 94, 105 | 9, 106 | 75, 107 | 86, 108 | 86, 109 | ] 110 | -------------------------------------------------------------------------------- /analytics/tests/tests/trainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/analytics/tests/tests/trainer/__init__.py -------------------------------------------------------------------------------- /analytics/tests/tests/trainer/test_trainer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | 4 | from uberduck_ml_dev.vendor.tfcompat.hparam import HParams 5 | from uberduck_ml_dev.trainer.base import DEFAULTS as TRAINER_DEFAULTS 6 | from uberduck_ml_dev.trainer.base import TTSTrainer 7 | from uberduck_ml_dev.models.common import MelSTFT 8 | 9 | 10 | class TestTrainer: 11 | def test_trainer_base(self): 12 | config = TRAINER_DEFAULTS.values() 13 | 14 | params = dict( 15 | checkpoint_name="test", 16 | checkpoint_path="test_checkpoint", 17 | cudnn_enabled=True, 18 | log_dir="this/is/a/test", 19 | ) 20 | config.update(params) 21 | hparams = HParams(**config) 22 | trainer = TTSTrainer(hparams) 23 | assert trainer.hparams == hparams 24 | 25 | assert trainer.cudnn_enabled == True 26 | mel = torch.load("analytics/tests/fixtures/stevejobs-1.pt") 27 | mel_stft = MelSTFT() 28 | audio = mel_stft.griffin_lim(mel) 29 | assert audio.size(0) == 1 30 | 31 | 32 | class TestTacotron2Trainer: 33 | # NOTE (Sam): this test could be made twice as fast by only running a single epoch,. 34 | # since as it is, the second gradient step is only useful for evaluating the loss 35 | def test_gradient_step(self, lj_trainer): 36 | torch.manual_seed(1234) 37 | lj_trainer.train() 38 | 39 | # NOTE (Sam): this number was taken from master on 8/24/22. 40 | # train_loss_start = 0.320 41 | # train_loss_4_datapoints_1_iteration = 0.319 42 | # NOTE (Sam): new numbers taken after normalization change 12/11/22 43 | # Have to run two iterations for loss to go down now. 
44 | # train_loss_start = 0.339 45 | # train_loss_4_datapoints_2_iteration = 0.327 46 | # NOTE (Sam): new numbers taken after enforce_sorted = False 2/7/23 47 | train_loss_start = 0.334 48 | train_loss_4_datapoints_2_iteration = 0.326 49 | assert math.isclose(lj_trainer.loss[0], train_loss_start, abs_tol=5e-4) 50 | 51 | assert math.isclose( 52 | lj_trainer.loss[2], train_loss_4_datapoints_2_iteration, abs_tol=5e-4 53 | ) 54 | -------------------------------------------------------------------------------- /analytics/tests/tests/utils/test_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from uberduck_ml_dev.utils.utils import get_mask_from_lengths, sequence_mask 3 | 4 | 5 | class TestUtils: 6 | def test_mask_from_lengths(self): 7 | assert ( 8 | get_mask_from_lengths(torch.LongTensor([1, 3, 2, 1])) 9 | == torch.Tensor( 10 | [ 11 | [True, False, False], 12 | [True, True, True], 13 | [True, True, False], 14 | [True, False, False], 15 | ] 16 | ) 17 | ).all() 18 | 19 | def test_sequence_mask(self): 20 | assert ( 21 | sequence_mask(torch.tensor([1, 3, 2, 1])) 22 | == torch.Tensor( 23 | [ 24 | [True, False, False], 25 | [True, True, True], 26 | [True, True, False], 27 | [True, False, False], 28 | ] 29 | ) 30 | ).all() 31 | assert ( 32 | sequence_mask(torch.tensor([1, 3, 2, 1]), 4) 33 | == torch.Tensor( 34 | [ 35 | [True, False, False, False], 36 | [True, True, True, False], 37 | [True, True, False, False], 38 | [True, False, False, False], 39 | ] 40 | ) 41 | ).all() 42 | -------------------------------------------------------------------------------- /analytics/tests/tests/vocoders/test_hifi_gan.py: -------------------------------------------------------------------------------- 1 | from scipy.io.wavfile import read 2 | from uberduck_ml_dev.models.common import MelSTFT 3 | import torch 4 | 5 | 6 | class TestHifiGan: 7 | def test_hifi_gan(self): 8 | # TODO (Sam): move to settings file. 9 | path = "analytics/tests/fixtures/wavs/stevejobs-1.wav" 10 | sr, data = read(path) 11 | 12 | assert sr == 22050 13 | assert len(data) == 144649 14 | 15 | data = torch.FloatTensor(data / 32768.0).unsqueeze(0) 16 | 17 | melstft = MelSTFT() 18 | mel = melstft.mel_spectrogram(data) 19 | 20 | assert mel.shape[0] == 1 21 | assert mel.shape[1] == 80 22 | assert mel.shape[2] == 566 23 | -------------------------------------------------------------------------------- /licenses/LICENSE2: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, NVIDIA Corporation 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /licenses/LICENSE3: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 Huawei Technologies Co., Ltd. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /licenses/LICENSE4: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Bjarke Felbo, Han Thi Nguyen, Thomas Wolf 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /settings.ini: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | # All sections below are required unless otherwise specified 3 | host = github 4 | lib_name = uberduck_ml_dev 5 | # For Enterprise Git add variable repo_name and company name 6 | # repo_name = analytics 7 | # company_name = nike 8 | 9 | user = uberduck-ai 10 | description = A description of your project 11 | keywords = some keywords 12 | author = Uberduck 13 | author_email = quack@uberduck.ai 14 | copyright = Uberduck 15 | branch = master 16 | version = 0.0.1 17 | min_python = 3.7 18 | audience = Developers 19 | language = English 20 | # Set to True if you want to create a more fancy sidebar.json than the default 21 | custom_sidebar = False 22 | # Add licenses and see current list in `setup.py` 23 | license = apache2 24 | # From 1-7: Planning Pre-Alpha Alpha Beta Production Mature Inactive 25 | status = 2 26 | 27 | # Optional. Same format as setuptools requirements. Torch version seems to effect random number generator (not 100% certain). 28 | # TODO (Sam): our goal is to rely on as few 3rd party packages as possible. We should try to remove as many of these as possible and integrate torch code directly. 29 | # NOTE (Sam): is it possible to specify no-deps here? 30 | requirements = Cython pytest phonemizer inflect librosa>=0.8.0 matplotlib nltk>=3.6.5 numpy>=1.23.5 pandas pydub scipy scikit-learn tensorboardX torch>=1.13.0 torchaudio>=0.9.0 unidecode seaborn wordfreq einops g2p_en@git+https://github.com/uberduck-ai/g2p emoji text-unidecode pre-commit lmdb ray[default] praat-parselmouth>=0.4.3 torchcrepe==0.0.22 pyworld==0.3.2 faiss-cpu==1.7.4 31 | 32 | # Optional. Same format as setuptools console_scripts 33 | # console_scripts = 34 | # Optional. Same format as setuptools dependency-links 35 | # dep_links = 36 | ### 37 | # You probably won't need to change anything under here, 38 | # unless you have some special requirements 39 | ### 40 | 41 | # Change to, e.g. "nbs", to put your notebooks in nbs dir instead of repo root 42 | doc_path = docs 43 | 44 | # Whether to look for library notebooks recursively in the `nbs_path` dir 45 | recursive = False 46 | 47 | # Anything shown as '%(...)s' is substituted with that setting automatically 48 | doc_host = https://%(user)s.github.io 49 | #For Enterprise Git pages use: 50 | #doc_host = https://pages.github.%(company_name)s.com. 51 | 52 | 53 | doc_baseurl = /%(lib_name)s/ 54 | # For Enterprise Github pages docs use: 55 | # doc_baseurl = /%(repo_name)s/%(lib_name)s/ 56 | 57 | git_url = https://github.com/%(user)s/%(lib_name)s/tree/%(branch)s/ 58 | # For Enterprise Github use: 59 | #git_url = https://github.%(company_name)s.com/%(repo_name)s/%(lib_name)s/tree/%(branch)s/ 60 | 61 | 62 | 63 | lib_path = %(lib_name)s 64 | title = %(lib_name)s 65 | 66 | #Optional advanced parameters 67 | #Monospace docstings: adds
<pre> tags around the doc strings, preserving newlines/indentation.
68 | #monospace_docstrings = False
69 | #Test flags: introduce here the test flags you want to use separated by |
70 | tst_flags=slow
71 | #Custom sidebar: customize sidebar.json yourself for advanced sidebars (False/True)
72 | #custom_sidebar =
73 | #Custom jekyll styles: if you want more jekyll styles than tip/important/warning, set them here
74 | #jekyll_styles = note,warning,tip,important
75 | 
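
As a quick illustration of the `%(...)s` substitution described in the comments above: reading this file with `ConfigParser` (using `delimiters=["="]`, the same way `setup.py` below does) expands `git_url` and `doc_baseurl` from the other `DEFAULT` values. A minimal sketch, assuming it is run from the repository root where `settings.ini` lives:

```python
# Minimal sketch: ConfigParser's default interpolation expands %(user)s, %(lib_name)s,
# and %(branch)s using the values defined earlier in the [DEFAULT] section.
from configparser import ConfigParser

config = ConfigParser(delimiters=["="])
config.read("settings.ini")
cfg = config["DEFAULT"]

print(cfg["git_url"])      # https://github.com/uberduck-ai/uberduck_ml_dev/tree/master/
print(cfg["doc_baseurl"])  # /uberduck_ml_dev/
```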


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
  1 | from pkg_resources import parse_version
  2 | from configparser import ConfigParser
  3 | import setuptools, re, sys
  4 | 
  5 | assert parse_version(setuptools.__version__) >= parse_version("36.2")
  6 | 
  7 | # note: all settings are in settings.ini; edit there, not here
  8 | config = ConfigParser(delimiters=["="])
  9 | config.read("settings.ini")
 10 | cfg = config["DEFAULT"]
 11 | 
 12 | cfg_keys = "version description keywords author author_email".split()
 13 | expected = (
 14 |     cfg_keys
 15 |     + "lib_name user branch license status min_python audience language".split()
 16 | )
 17 | for o in expected:
 18 |     assert o in cfg, "missing expected setting: {}".format(o)
 19 | setup_cfg = {o: cfg[o] for o in cfg_keys}
 20 | 
 21 | if len(sys.argv) > 1 and sys.argv[1] == "version":
 22 |     print(setup_cfg["version"])
 23 |     exit()
 24 | 
 25 | licenses = {
 26 |     "apache2": (
 27 |         "Apache Software License 2.0",
 28 |         "OSI Approved :: Apache Software License",
 29 |     ),
 30 |     "mit": ("MIT License", "OSI Approved :: MIT License"),
 31 |     "gpl2": (
 32 |         "GNU General Public License v2",
 33 |         "OSI Approved :: GNU General Public License v2 (GPLv2)",
 34 |     ),
 35 |     "gpl3": (
 36 |         "GNU General Public License v3",
 37 |         "OSI Approved :: GNU General Public License v3 (GPLv3)",
 38 |     ),
 39 |     "bsd3": ("BSD License", "OSI Approved :: BSD License"),
 40 | }
 41 | statuses = [
 42 |     "1 - Planning",
 43 |     "2 - Pre-Alpha",
 44 |     "3 - Alpha",
 45 |     "4 - Beta",
 46 |     "5 - Production/Stable",
 47 |     "6 - Mature",
 48 |     "7 - Inactive",
 49 | ]
 50 | py_versions = (
 51 |     "2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 3.0 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8".split()
 52 | )
 53 | 
 54 | lic = licenses.get(cfg["license"].lower(), (cfg["license"], None))
 55 | min_python = cfg["min_python"]
 56 | 
 57 | requirements = ["pip", "packaging"]
 58 | if cfg.get("requirements"):
 59 |     requirements += cfg.get("requirements", "").split()
 60 | if cfg.get("pip_requirements"):
 61 |     requirements += cfg.get("pip_requirements", "").split()
 62 | dev_requirements = (cfg.get("dev_requirements") or "").split()
 63 | 
 64 | long_description = open("README.md", encoding="utf-8").read()
 65 | # ![png](docs/images/output_13_0.png)
 66 | for ext in ["png", "svg"]:
 67 |     long_description = re.sub(
 68 |         r"!\[" + ext + r"\]\((.*)\)",
 69 |         "!["
 70 |         + ext
 71 |         + "]("
 72 |         + "https://raw.githubusercontent.com/{}/{}".format(cfg["user"], cfg["lib_name"])
 73 |         + "/"
 74 |         + cfg["branch"]
 75 |         + "/\\1)",
 76 |         long_description,
 77 |     )
 78 |     long_description = re.sub(
 79 |         r"src=\"(.*)\." + ext + '"',
 80 |         'src="https://raw.githubusercontent.com/{}/{}'.format(
 81 |             cfg["user"], cfg["lib_name"]
 82 |         )
 83 |         + "/"
 84 |         + cfg["branch"]
 85 |         + "/\\1."
 86 |         + ext
 87 |         + '"',
 88 |         long_description,
 89 |     )
 90 | 
 91 | setuptools.setup(
 92 |     name=cfg["lib_name"],
 93 |     license=lic[0],
 94 |     classifiers=[
 95 |         "Development Status :: " + statuses[int(cfg["status"])],
 96 |         "Intended Audience :: " + cfg["audience"].title(),
 97 |         "Natural Language :: " + cfg["language"].title(),
 98 |     ]
 99 |     + [
100 |         "Programming Language :: Python :: " + o
101 |         for o in py_versions[py_versions.index(min_python) :]
102 |     ]
103 |     + (["License :: " + lic[1]] if lic[1] else []),
104 |     url=cfg["git_url"],
105 |     packages=setuptools.find_packages(),
106 |     include_package_data=True,
107 |     package_data={
108 |         "": ["uberduck_ml_dev/text/heteronyms", "uberduck_ml_dev/text/cmudict-0.7b"]
109 |     },
110 |     install_requires=requirements,
111 |     extras_require={"dev": dev_requirements},
112 |     python_requires=">=" + cfg["min_python"],
113 |     long_description=long_description,
114 |     long_description_content_type="text/markdown",
115 |     zip_safe=False,
116 |     entry_points={"console_scripts": cfg.get("console_scripts", "").split()},
117 |     **setup_cfg
118 | )
119 | 
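
The two `re.sub` calls above rewrite relative image references in `README.md` into absolute `raw.githubusercontent.com` URLs so they still render in the PyPI long description. A minimal sketch of the first substitution on a single made-up line (the image path here is only an example, not a file the build actually rewrites):

```python
# Hypothetical one-line demo of the markdown-image rewrite performed in setup.py above.
import re

cfg = {"user": "uberduck-ai", "lib_name": "uberduck_ml_dev", "branch": "master"}
line = "![png](analytics/dependencies/details.png)"

rewritten = re.sub(
    r"!\[png\]\((.*)\)",
    "![png]("
    + "https://raw.githubusercontent.com/{}/{}".format(cfg["user"], cfg["lib_name"])
    + "/"
    + cfg["branch"]
    + "/\\1)",
    line,
)
print(rewritten)
# ![png](https://raw.githubusercontent.com/uberduck-ai/uberduck_ml_dev/master/analytics/dependencies/details.png)
```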


--------------------------------------------------------------------------------
/tutorials/hifigan/data_processing.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from scipy.io.wavfile import read, write
  3 | import librosa
  4 | import torch
  5 | import numpy as np
  6 | 
  7 | from uberduck_ml_dev.data.get import get
  8 | from uberduck_ml_dev.data.utils import mel_spectrogram_torch, find_rel_paths
  9 | from uberduck_ml_dev.data.data import HIFIGAN_DEFAULTS as DEFAULTS
 10 | from uberduck_ml_dev.data.data import MAX_WAV_VALUE
 11 | 
 12 | 
 13 | data_directory = ""  # set to the root directory containing the gt.wav files before running
 14 | ground_truth_rel_paths = find_rel_paths(directory=data_directory, filename="gt.wav")
 15 | ground_truth_abs_paths = [
 16 |     os.path.join(data_directory, ground_truth_rel_path)
 17 |     for ground_truth_rel_path in ground_truth_rel_paths
 18 | ]
 19 | 
 20 | 
 21 | print("resampling and integer normalizing")
 22 | 
 23 | resampled_normalized_abs_paths = [
 24 |     resampled_normalized_abs_path.replace(
 25 |         "gt.wav", "audio_resampledT_normalized32768T.wav"
 26 |     )
 27 |     for resampled_normalized_abs_path in ground_truth_abs_paths
 28 | ]
 29 | 
 30 | loading_function = lambda filename: librosa.load(filename, sr=22050)[0]
 31 | processing_function = lambda x: np.asarray(
 32 |     (x / np.abs(x).max()) * (MAX_WAV_VALUE - 1), dtype=np.int16
 33 | )  # peak-normalize and cast to int16 (range ±(MAX_WAV_VALUE - 1))
 34 | saving_function = lambda data, filename: write(
 35 |     filename, 22050, data
 36 | )  # must be in this order
 37 | 
 38 | 
 39 | get(
 40 |     processing_function,
 41 |     saving_function,
 42 |     loading_function,
 43 |     ground_truth_abs_paths,
 44 |     resampled_normalized_abs_paths,
 45 |     True,
 46 | )
 47 | 
 48 | print("resampling and float normalizing")
 49 | 
 50 | resampled_normalized_abs_paths = [
 51 |     resampled_normalized_abs_path.replace("gt.wav", "audio_resampledT_normalized1T.wav")
 52 |     for resampled_normalized_abs_path in ground_truth_abs_paths
 53 | ]
 54 | 
 55 | loading_function = lambda filename: librosa.load(filename, sr=22050)[0]
 56 | processing_function = lambda x: np.asarray(
 57 |     (x / np.abs(x).max()) * (1 - 1 / MAX_WAV_VALUE), dtype=np.float32
 58 | )  # peak-normalize to float32 just below ±1.0
 59 | saving_function = lambda data, filename: write(
 60 |     filename, 22050, data
 61 | )  # must be in this order
 62 | 
 63 | 
 64 | get(
 65 |     processing_function,
 66 |     saving_function,
 67 |     loading_function,
 68 |     ground_truth_abs_paths,
 69 |     resampled_normalized_abs_paths,
 70 |     True,
 71 | )
 72 | 
 73 | 
 74 | print("computing spectrograms from 1 normalized audio")
 75 | 
 76 | spectrogram_abs_paths = [
 77 |     ground_truth_abs_path.replace("gt.wav", "spectrogram.pt")
 78 |     for ground_truth_abs_path in ground_truth_abs_paths
 79 | ]
 80 | 
 81 | 
 82 | processing_function = lambda x: mel_spectrogram_torch(
 83 |     x,
 84 |     DEFAULTS["n_fft"],
 85 |     DEFAULTS["num_mels"],
 86 |     DEFAULTS["sampling_rate"],
 87 |     DEFAULTS["hop_size"],
 88 |     DEFAULTS["win_size"],
 89 |     DEFAULTS["fmin"],
 90 |     DEFAULTS["fmax"],
 91 |     True,
 92 | )
 93 | loading_function = lambda source_path: torch.Tensor(
 94 |     read(source_path)[1] / MAX_WAV_VALUE
 95 | ).unsqueeze(0)
 96 | saving_function = lambda data, target_path: torch.save(data, target_path)
 97 | 
 98 | get(
 99 |     processing_function,
100 |     saving_function,
101 |     loading_function,
102 |     resampled_normalized_abs_paths,
103 |     spectrogram_abs_paths,
104 |     True,
105 | )
106 | 
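
For orientation, the script above expects `data_directory` to contain nested folders that each hold a `gt.wav`, and it writes the processed audio and spectrograms next to every source file. A minimal smoke test of the discovery step, assuming the same `find_rel_paths` helper imported at the top of the script (the folder names below are made up):

```python
# Hypothetical smoke test: create one nested folder with a second of int16 silence named
# gt.wav, then confirm find_rel_paths discovers it relative to the dataset root.
import os

import numpy as np
from scipy.io.wavfile import write

from uberduck_ml_dev.data.utils import find_rel_paths

root = "example_dataset"
clip_dir = os.path.join(root, "speaker0", "clip0")
os.makedirs(clip_dir, exist_ok=True)
write(os.path.join(clip_dir, "gt.wav"), 22050, np.zeros(22050, dtype=np.int16))

print(list(find_rel_paths(directory=root, filename="gt.wav")))
# ['speaker0/clip0/gt.wav']
```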


--------------------------------------------------------------------------------
/tutorials/hifigan/download.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | wget https://huggingface.co/datasets/Uberduck/ljspeech/resolve/main/lj_speech.zip
4 | unzip lj_speech.zip
5 | # wget https://huggingface.co/datasets/Uberduck/ljspeech/resolve/main/hifigan_libritts100360_generator0p5.pt
6 | # wget https://huggingface.co/datasets/Uberduck/ljspeech/resolve/main/hifigan_22khz_config.json
7 | 


--------------------------------------------------------------------------------
/tutorials/radtts/download.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | wget https://huggingface.co/datasets/Uberduck/ljspeech/resolve/main/lj_speech.zip
4 | unzip lj_speech.zip
5 | wget https://huggingface.co/datasets/Uberduck/ljspeech/resolve/main/hifigan_libritts100360_generator0p5.pt
6 | wget https://huggingface.co/datasets/Uberduck/ljspeech/resolve/main/hifigan_22khz_config.json
7 | 


--------------------------------------------------------------------------------
/tutorials/radtts/radtts_data_processing.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "0e3c74a5",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# For computing normalized audio, spectrograms, and pitches\n",
 11 |     "import os\n",
 12 |     "from uberduck_ml_dev.data.get import get_mels, get_pitches\n",
 13 |     "from uberduck_ml_dev.data.data import RADTTS_DEFAULTS as data_config\n",
 14 |     "\n",
 15 |     "from uberduck_ml_dev.data.get import get\n",
 16 |     "import librosa\n",
 17 |     "import numpy as np\n",
 18 |     "from scipy.io.wavfile import write\n",
 19 |     "from datetime import datetime"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 11,
 25 |    "id": "2710441c",
 26 |    "metadata": {},
 27 |    "outputs": [],
 28 |    "source": [
 29 |     "# data_dir = \"/path/to/data\"\n",
 30 |     "data_dir = \"/usr/src/app/uberduck_ml_dev/tutorials/radtts/lj/LJSpeech/\""
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 15,
 36 |    "id": "5cdc25fe",
 37 |    "metadata": {},
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "def find_rel_paths(directory, filename):\n",
 41 |     "    for root, dirs, files in os.walk(directory):\n",
 42 |     "        if filename in files:\n",
 43 |     "            yield os.path.relpath(os.path.join(root, filename), directory)\n",
 44 |     "\n",
 45 |     "filename = 'gt.wav'  # replace with your filename\n",
 46 |     "rel_path_list = list(find_rel_paths(data_dir, filename))"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": 24,
 52 |    "id": "d9f989f6",
 53 |    "metadata": {},
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "local_path_list = [os.path.join(data_dir, rel_path) for rel_path in rel_path_list]\n",
 57 |     "resampled_normalized_path_list = [os.path.join(data_dir, \n",
 58 |     "                                               local_path.split('gt.wav')[0],\n",
 59 |     "                                               'audio_resampledT_normalized32768T.wav') \n",
 60 |     "                                  for local_path in local_path_list]\n",
 61 |     "spectrogram_path_list = [os.path.join(data_dir, local_path.split('gt.wav')[0],\n",
 62 |     "                                               'spectrogram.pt') \n",
 63 |     "                                  for local_path in local_path_list]\n",
 64 |     "folder_path_list = [os.path.join(data_dir, local_path.split('gt.wav')[0]) \n",
 65 |     "                                  for local_path in local_path_list]"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": null,
 71 |    "id": "f5ce0f25",
 72 |    "metadata": {},
 73 |    "outputs": [],
 74 |    "source": [
 75 |     "print(\"resample and normalize\")\n",
 76 |     "MAX_WAV_VALUE = 32768\n",
 77 |     "sr = 22050\n",
 78 |     "loading_function = lambda filename : librosa.load(filename, sr = 22050)[0]\n",
 79 |     "function_ = lambda x : np.asarray((x / np.abs(x).max()) * (MAX_WAV_VALUE - 1), dtype = np.int16)\n",
 80 |     "saving_function = lambda data, filename  : write(filename, 22050, data) # must be in this order\n",
 81 |     "\n",
 82 |     "print(datetime.now())\n",
 83 |     "get(function_, saving_function, loading_function, local_path_list, resampled_normalized_path_list, False)\n",
 84 |     "print(datetime.now())"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": null,
 90 |    "id": "ab2d5894",
 91 |    "metadata": {},
 92 |    "outputs": [],
 93 |    "source": [
 94 |     "print(\"compute pitches\")\n",
 95 |     "get_pitches(resampled_normalized_path_list, data_config, folder_path_list, method = 'radtts')"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": null,
101 |    "id": "08e86d85",
102 |    "metadata": {},
103 |    "outputs": [],
104 |    "source": [
105 |     "print(\"compute mels\")\n",
106 |     "get_mels(resampled_normalized_path_list, data_config, spectrogram_path_list)"
107 |    ]
108 |   }
109 |  ],
110 |  "metadata": {
111 |   "kernelspec": {
112 |    "display_name": "Python 3",
113 |    "language": "python",
114 |    "name": "python3"
115 |   },
116 |   "language_info": {
117 |    "codemirror_mode": {
118 |     "name": "ipython",
119 |     "version": 3
120 |    },
121 |    "file_extension": ".py",
122 |    "mimetype": "text/x-python",
123 |    "name": "python",
124 |    "nbconvert_exporter": "python",
125 |    "pygments_lexer": "ipython3",
126 |    "version": "3.8.10"
127 |   }
128 |  },
129 |  "nbformat": 4,
130 |  "nbformat_minor": 5
131 | }
132 | 


--------------------------------------------------------------------------------
/tutorials/radtts/train.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | cd ../..
4 | # remember to set the training and eval filelists, heteronyms_path, phoneme_dict_path, vocoder_config_path, and vocoder_checkpoint_path in demo_config.json
5 | python uberduck_ml_dev/exec/train_radtts_with_ray.py --config tutorials/radtts/demo_config.json
6 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.0.1"
2 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/assets/duck.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/assets/duck.png


--------------------------------------------------------------------------------
/uberduck_ml_dev/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/data/__init__.py


--------------------------------------------------------------------------------
/uberduck_ml_dev/data/batch.py:
--------------------------------------------------------------------------------
 1 | from typing import Dict
 2 | 
 3 | from ..utils.utils import to_gpu
 4 | 
 5 | 
 6 | class Batch(Dict):
 7 |     # NOTE (Sam): isn't gate target redundant to output length.
 8 |     # NOTE (Sam): here types are unused, but TypedDict inheritance doesn't allow methods
 9 |     # NOTE (Sam): these were also problems with object (I forget), NamedTuple (mutability), dataclass (I forget)
10 | 
11 |     # text_int_padded: Optional[torch.LongTensor] = None
12 |     # input_lengths: Optional[torch.LongTensor] = None
13 |     # mel_padded: Optional[torch.FloatTensor] = None  # for teacher forcing.
14 |     # gate_target: Optional[
15 |     #     torch.LongTensor
16 |     # ] = None  # NOTE (Sam): could be bool -  for teacher forcing.
17 |     # output_lengths: Optional[torch.LongTensor] = None
18 |     # speaker_ids: Optional[torch.LongTensor] = None
19 |     # gst: Optional[torch.Tensor] = None
20 |     # mel_outputs: Optional[torch.Tensor] = None  # predicted.
21 |     # mel_outputs_postnet: Optional[torch.Tensor] = None
22 |     # gate_predicted: Optional[torch.LongTensor] = None  # could be bool.
23 |     # alignments: Optional[torch.Tensor] = None
24 |     # audio_encodings: Optional[torch.Tensor] = None
25 | 
26 |     def subset(self, keywords, fragile=False) -> "Batch":
27 |         d = {}
28 |         for k in keywords:
29 |             try:
30 |                 d[k] = self[k]
31 |             except KeyError:
32 |                 if fragile:
33 |                     raise
34 |         return Batch(**d)
35 | 
36 |     def to_gpu(self) -> "Batch":
37 |         batch_gpu = Batch(**{k: to_gpu(v) for k, v in self.items()})
38 |         return batch_gpu
39 | 
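
A minimal usage sketch of Batch (the keys are taken from the commented field list above and the tensor values are toys): subset silently drops missing keys unless fragile=True.

import torch

batch = Batch(
    text_int_padded=torch.zeros(2, 5, dtype=torch.long),
    input_lengths=torch.tensor([5, 3]),
)
text_only = batch.subset(["text_int_padded"])   # Batch with just that key
maybe_mel = batch.subset(["mel_padded"])        # empty Batch, no error
# batch.subset(["mel_padded"], fragile=True)    # would raise KeyError
# batch.to_gpu()                                # moves every value via to_gpu()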


--------------------------------------------------------------------------------
/uberduck_ml_dev/data/get.py:
--------------------------------------------------------------------------------
  1 | from torch.utils.data import DataLoader
  2 | import librosa
  3 | from pathlib import Path
  4 | from tqdm import tqdm
  5 | import torch
  6 | import os
  7 | 
  8 | from ..data.data import DataMel, DataPitch
  9 | from ..data.collate import CollateBlank
 10 | from ..data.processor import Processor
 11 | 
 12 | 
 13 | def get_parallel_torch(data):
 14 |     data_loader = DataLoader(
 15 |         data, batch_size=32, collate_fn=CollateBlank(), num_workers=8
 16 |     )
 17 |     for batch in data_loader:
 18 |         pass
 19 | 
 20 | 
 21 | # TODO (Sam): use get_parallel_torch to reduce boilerplate.
 22 | # NOTE (Sam): assumes data is in a directory structure like:
 23 | # /tmp/{uuid}/resampled_normalized.wav
 24 | # These functions add spectrogram.pt, f0.pt, and coqui_resnet_512_emb.pt to each file-specific directory.
 25 | def get_mels(paths, data_config, target_paths):
 26 |     data = DataMel(audiopaths=paths, data_config=data_config, target_paths=target_paths)
 27 | 
 28 |     collate_fn = CollateBlank()
 29 | 
 30 |     data_loader = DataLoader(
 31 |         data,
 32 |         batch_size=32,
 33 |         collate_fn=collate_fn,
 34 |     )
 35 |     for batch in data_loader:
 36 |         pass  # computes in loader.
 37 | 
 38 | 
 39 | # NOTE (Sam): pitch, pitchf == f0 coarse, f0bak in rvc parlance.
 40 | # NOTE (Sam): sample_rate is also passed as part of data_config
 41 | # TODO (Sam): decide on sample_rate v sampling_rate
 42 | # NOTE (Sam): pyin (radtts) and parselmouth (rvc) methods seem to generate pitches of different lengths.
 43 | def get_pitches(
 44 |     paths,
 45 |     data_config=None,
 46 |     target_folders=None,
 47 |     method="parselmouth",
 48 |     sample_rate=None,
 49 |     recompute=False,
 50 | ):
 51 |     data = DataPitch(
 52 |         audiopaths=paths,
 53 |         data_config=data_config,
 54 |         target_folders=target_folders,
 55 |         method=method,
 56 |         sample_rate=data_config["sampling_rate"],
 57 |         recompute=recompute,
 58 |     )
 59 |     get_parallel_torch(data)
 60 | 
 61 | 
 62 | HUBERT_PATH = "hubert_embedding.pt"
 63 | F0_PATH = "f0.pt"
 64 | F0F_PATH = "f0f.pt"
 65 | 
 66 | 
 67 | # NOTE (Sam): this is different from the other get functions because it doesn't use torch dataset.
 68 | def get_hubert_embeddings(
 69 |     audiopaths, hubert_model, output_layer=9, hubert_path=HUBERT_PATH
 70 | ):
 71 |     """Returns the abs path w.r.t penultimate directory name in audiopaths, e.g. suitable for /tmp/{uuid}/resampled_normalized.wav."""
 72 |     hubert_abs_paths = []
 73 |     for audiopath in tqdm(audiopaths):
 74 |         folder_path = str(Path(*Path(audiopath).parts[:-1]))
 75 |         hubert_abs_path = os.path.join(folder_path, hubert_path)
 76 |         # TODO (Sam): add hashing to avoid mistakenly not recomputing.
 77 |         if not os.path.exists(hubert_abs_path):
 78 |             # NOTE (Sam): Hubert expects 16k sample rate.
 79 |             audio0, sr = librosa.load(audiopath, sr=16000)
 80 |             feats = torch.from_numpy(audio0)
 81 |             feats = feats.float()
 82 |             feats = feats.view(1, -1)
 83 |             padding_mask = torch.BoolTensor(feats.shape).to("cpu").fill_(False)
 84 |             inputs = {
 85 |                 "source": feats.to("cpu"),
 86 |                 "padding_mask": padding_mask,
 87 |                 "output_layer": output_layer,
 88 |             }
 89 | 
 90 |             with torch.no_grad():
 91 |                 logits = hubert_model.extract_features(**inputs)
 92 |                 feats = hubert_model.final_proj(logits[0])
 93 |                 torch.save(feats[0], hubert_abs_path)
 94 | 
 95 |         hubert_abs_paths.append(hubert_abs_path)
 96 | 
 97 |     return hubert_abs_paths
 98 | 
 99 | 
100 | def get(
101 |     processing_function,
102 |     saving_function,
103 |     loading_function,
104 |     source_paths,
105 |     target_paths,
106 |     recompute,
107 | ):
108 |     function_ = lambda source_path, target_path: saving_function(
109 |         processing_function(loading_function(source_path)), target_path
110 |     )
111 |     processor = Processor(
112 |         function_=function_,
113 |         source_paths=source_paths,
114 |         target_paths=target_paths,
115 |         recompute=recompute,
116 |     )
117 | 
118 |     get_parallel_torch(processor)
119 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/data/normalization.py:
--------------------------------------------------------------------------------
 1 | import librosa
 2 | import numpy as np
 3 | from scipy.io.wavfile import write
 4 | from ..models.tacotron2 import MAX_WAV_VALUE
 5 | 
 6 | load_resampled_normalized_audio = lambda source_path: librosa.load(
 7 |     source_path, sr=22050
 8 | )[0]
 9 | float_normalize = lambda x: np.asarray(
10 |     (x / np.abs(x).max()) * (MAX_WAV_VALUE - 1) / MAX_WAV_VALUE
11 | )
12 | int_normalize = lambda x: np.asarray(
13 |     (x / np.abs(x).max()) * (MAX_WAV_VALUE - 1), dtype=np.int16
14 | )
15 | save_22k_audio = lambda data, target_path: write(
16 |     target_path, 22050, data
17 | )  # must be in this order
18 | 
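
The two normalizers differ only in their output range; a toy comparison (values purely illustrative):

import numpy as np

x = np.array([0.0, 0.25, -0.5], dtype=np.float32)
print(int_normalize(x))    # int16 samples scaled to roughly +/- (MAX_WAV_VALUE - 1)
print(float_normalize(x))  # float samples scaled to roughly +/- 1.0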


--------------------------------------------------------------------------------
/uberduck_ml_dev/data/processor.py:
--------------------------------------------------------------------------------
 1 | from typing import Callable, List, Dict
 2 | import os
 3 | 
 4 | 
 5 | # NOTE (Sam): this torch processor appears to be 10% faster than standard multiprocessing - perhaps this is overkill
 6 | class Processor:
 7 |     def __init__(
 8 |         self,
 9 |         function_: Callable,
10 |         source_paths: List[str],
11 |         target_paths: List[
12 |             str
13 |         ],  # NOTE (Sam): this is target_folders in certain versions of the code since for example we want to save pitch at f0.pt and pitch mask as f0f.pt.  Have to think of a solution.
14 |         recompute: bool = True,
15 |     ):
16 |         self.source_paths = source_paths
17 |         self.function_ = function_
18 |         self.target_paths = target_paths
19 |         self.recompute = recompute
20 | 
21 |     def _get_data(self, source_path, target_path):
22 |         # NOTE (Sam): we need caching to debug training issues in dev and for speed!
23 |         # NOTE (Sam): won't catch issues with recomputation using different parameters but the same name
24 |         # TODO (Sam): add hashing
25 |         if self.recompute or not os.path.exists(target_path):
26 |             self.function_(source_path, target_path)
27 |         else:
28 |             pass
29 | 
30 |     def __getitem__(self, idx):
31 |         try:
32 |             self._get_data(
33 |                 source_path=self.source_paths[idx],
34 |                 target_path=self.target_paths[idx],
35 |             )
36 | 
37 |         except Exception as e:
38 |             print(f"Error while getting data: index = {idx}")
39 |             print(e)
40 |             raise
41 |         return None
42 | 
43 |     def __len__(self):
44 |         nfiles = len(self.source_paths)
45 | 
46 |         return nfiles
47 | 
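
A minimal caching sketch (paths hypothetical): wrapping a Processor in a plain DataLoader runs function_ once per (source, target) pair, and with recompute=False any pair whose target file already exists is skipped.

import torch
from torch.utils.data import DataLoader

copy_tensor = lambda src, dst: torch.save(torch.load(src), dst)
processor = Processor(
    function_=copy_tensor,
    source_paths=["/tmp/example/f0.pt"],       # hypothetical
    target_paths=["/tmp/example/f0_copy.pt"],  # hypothetical
    recompute=False,
)
for _ in DataLoader(processor, batch_size=1, collate_fn=lambda items: items):
    pass  # all work happens inside __getitem__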


--------------------------------------------------------------------------------
/uberduck_ml_dev/data/ray.py:
--------------------------------------------------------------------------------
  1 | from io import BytesIO
  2 | import os
  3 | 
  4 | from scipy.io import wavfile
  5 | import torch
  6 | import numpy as np
  7 | import ray
  8 | import pandas as pd
  9 | 
 10 | 
 11 | from .utils import get_energy_average, f0_normalize
 12 | from ..models.components.encoders import ResNetSpeakerEncoderCallable
 13 | 
 14 | 
 15 | # NOTE (Sam): the ray dataset code runs (modulo cleanup) but is seemingly slower than the torch dataloader (not 100% sure if this is still true).
 16 | def ray_df_preprocessing(df, data_config, tp, stft):
 17 |     transcripts = df.transcript.tolist()
 18 |     audio_bytes_list = df.audio_bytes.tolist()
 19 |     speaker_ids = df.speaker_id.tolist()
 20 |     f0_paths = df.f0_path.tolist()
 21 |     audio_embeddings = df.audio_embedding.tolist()
 22 |     # shuffle_indices = get_shuffle_indices(speaker_ids)
 23 |     # audio_embeddings = [audio_embeddings[i] for i in shuffle_indices]
 24 |     collate_input = []
 25 |     for transcript, audio_bytes, speaker_id, f0_path, audio_embedding in zip(
 26 |         transcripts, audio_bytes_list, speaker_ids, f0_paths, audio_embeddings
 27 |     ):
 28 |         bio = BytesIO(audio_bytes)
 29 |         sr, wav_data = wavfile.read(bio)
 30 |         audio = torch.FloatTensor(wav_data)
 31 |         # NOTE (Sam): why normalize here?
 32 |         audio_norm = audio / (np.abs(audio).max() * 2)
 33 |         text_sequence = tp.get_text(transcript)
 34 |         mel = stft.get_mel(audio_norm, data_config["max_wav_value"])
 35 |         mel = torch.squeeze(mel, 0)
 36 |         dikt = torch.load(f0_path)
 37 |         f0 = dikt["f0"]
 38 |         p_voiced = dikt["p_voiced"]
 39 |         voiced_mask = dikt["voiced_mask"]
 40 |         f0 = f0_normalize(f0, f0_min=data_config["f0_min"])
 41 |         energy_avg = get_energy_average(mel)
 42 |         prior_path = "{}_{}".format(text_sequence.shape[0], mel.shape[1])
 43 |         prior_path = os.path.join("/usr/src/app/radtts/data_cache", prior_path)
 44 |         prior_path += "_prior.pth"
 45 |         attn_prior = torch.load(prior_path)
 46 |         speaker_id = torch.LongTensor([speaker_id])
 47 |         audio_embedding = torch.FloatTensor(audio_embedding)
 48 |         # NOTE (Sam): might be faster to return dictionary arrays of batched inputs instead of list
 49 |         collate_input.append(
 50 |             {
 51 |                 "text_encoded": text_sequence,
 52 |                 "mel": mel,
 53 |                 "speaker_id": speaker_id,
 54 |                 "f0": f0,
 55 |                 "p_voiced": p_voiced,
 56 |                 "voiced_mask": voiced_mask,
 57 |                 "energy_avg": energy_avg,
 58 |                 "attn_prior": attn_prior,
 59 |                 "audiopath": None,
 60 |                 "audio_embedding": audio_embedding,
 61 |             }
 62 |         )
 63 | 
 64 |     return collate_input
 65 | 
 66 | 
 67 | def get_ray_dataset(filelist_path, config_path, model_path):
 68 |     df = pd.read_csv(
 69 |         filelist_path,
 70 |         sep="|",
 71 |         header=None,
 72 |         quoting=3,
 73 |         names=["path", "transcript", "speaker_id", "f0_path", "emb_path"],
 74 |     )
 75 | 
 76 |     paths = df.path.tolist()
 77 |     transcripts = df.transcript.tolist()
 78 |     speaker_ids = df.speaker_id.tolist()
 79 | 
 80 |     pitches = df.f0_path.tolist()
 81 | 
 82 |     parallelism_length = 400
 83 |     audio_ds = ray.data.read_binary_files(
 84 |         paths,
 85 |         parallelism=parallelism_length,
 86 |         ray_remote_args={"num_cpus": 1.0},
 87 |     )
 88 |     audio_ds = audio_ds.map_batches(
 89 |         lambda x: x, batch_format="pyarrow", batch_size=None
 90 |     )
 91 | 
 92 |     paths_ds = ray.data.from_items(paths, parallelism=parallelism_length)
 93 |     paths_ds = paths_ds.map_batches(
 94 |         lambda x: x, batch_format="pyarrow", batch_size=None
 95 |     )
 96 | 
 97 |     transcripts = ray.data.from_items(transcripts, parallelism=parallelism_length)
 98 |     transcripts_ds = transcripts.map_batches(
 99 |         lambda x: x, batch_format="pyarrow", batch_size=None
100 |     )
101 | 
102 |     speaker_ids_ds = ray.data.from_items(speaker_ids, parallelism=parallelism_length)
103 |     speaker_ids_ds = speaker_ids_ds.map_batches(
104 |         lambda x: x, batch_format="pyarrow", batch_size=None
105 |     )
106 |     pitches_ds = ray.data.from_items(pitches, parallelism=parallelism_length)
107 |     pitches_ds = pitches_ds.map_batches(
108 |         lambda x: x, batch_format="pyarrow", batch_size=None
109 |     )
110 | 
111 |     embs_ds = ray.data.from_items(paths, parallelism=parallelism_length)
112 |     embs_ds = embs_ds.map_batches(
113 |         ResNetSpeakerEncoderCallable,
114 |         fn_kwargs={"config_path": config_path, "model_path": model_path},
115 |         num_gpus=1.0,
116 |         compute="actors",
117 |     )
118 | 
119 |     output_dataset = (
120 |         transcripts_ds.zip(audio_ds)
121 |         .zip(paths_ds)
122 |         .zip(speaker_ids_ds)
123 |         .zip(pitches_ds)
124 |         .zip(embs_ds)
125 |     )
126 |     output_dataset = output_dataset.map_batches(
127 |         lambda table: table.rename(
128 |             columns={
129 |                 "value": "transcript",
130 |                 "value_1": "audio_bytes",
131 |                 "value_2": "path",
132 |                 "value_3": "speaker_id",
133 |                 "value_4": "f0_path",
134 |                 "value_5": "emb_path",
135 |             }
136 |         )
137 |     )
138 | 
139 |     processed_dataset = output_dataset.map_batches(ray_df_preprocessing)
140 |     return processed_dataset.fully_executed()
141 | 
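
For reference, get_ray_dataset expects a header-less pipe-separated filelist with the five columns named in read_csv. A hypothetical row, written from Python so the quoting stays literal (matching quoting=3):

row = "|".join([
    "/data/speaker0/clip0001.wav",    # path (hypothetical)
    "An example transcript.",         # transcript
    "0",                              # speaker_id
    "/data/speaker0/clip0001_f0.pt",  # f0_path (hypothetical)
    "/data/speaker0/clip0001_emb.pt", # emb_path (hypothetical)
])
with open("filelist.txt", "w") as f:
    f.write(row + "\n")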


--------------------------------------------------------------------------------
/uberduck_ml_dev/data/spectrogram.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from librosa.filters import mel as librosa_mel_fn
  3 | 
  4 | from .utils import spectral_normalize_torch
  5 | 
  6 | # NOTE (Sam): needed for importable lambdas.
  7 | # TODO (Sam): remove redundancy from elsewhere in repo.
  8 | hann_window = {}
  9 | mel_basis = {}
 10 | 
 11 | 
 12 | # TODO (Sam): combine with the identically-named function in models.common
 13 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
 14 |     """Convert waveform into Linear-frequency Linear-amplitude spectrogram.
 15 | 
 16 |     Args:
 17 |         y             :: (B, T) - Audio waveforms
 18 |         n_fft
 19 |         sampling_rate
 20 |         hop_size
 21 |         win_size
 22 |         center
 23 |     Returns:
 24 |         :: (B, Freq, Frame) - Linear-frequency Linear-amplitude spectrogram
 25 |     """
 26 |     # Validation
 27 |     if torch.min(y) < -1.0:
 28 |         print("min value is ", torch.min(y))
 29 |     if torch.max(y) > 1.0:
 30 |         print("max value is ", torch.max(y))
 31 | 
 32 |     # Window - Cache if needed
 33 |     global hann_window
 34 |     dtype_device = str(y.dtype) + "_" + str(y.device)
 35 |     wnsize_dtype_device = str(win_size) + "_" + dtype_device
 36 |     if wnsize_dtype_device not in hann_window:
 37 |         hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
 38 |             dtype=y.dtype, device=y.device
 39 |         )
 40 | 
 41 |     # Padding
 42 |     y = torch.nn.functional.pad(
 43 |         y.unsqueeze(1),
 44 |         # NOTE (Sam): combining n_fft (filter_length) with hop_size reeks of either a bug or sophisticated asymptotic analysis.
 45 |         (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
 46 |         mode="reflect",
 47 |     )
 48 |     y = y.squeeze(1)
 49 | 
 50 |     # Complex Spectrogram :: (B, T) -> (B, Freq, Frame, RealComplex=2)
 51 |     spec = torch.stft(
 52 |         y,
 53 |         n_fft,
 54 |         hop_length=hop_size,
 55 |         win_length=win_size,
 56 |         window=hann_window[wnsize_dtype_device],
 57 |         center=center,
 58 |         pad_mode="reflect",
 59 |         normalized=False,
 60 |         onesided=True,
 61 |         return_complex=False,
 62 |     )
 63 |     # Linear-frequency Linear-amplitude spectrogram :: (B, Freq, Frame, RealComplex=2) -> (B, Freq, Frame)
 64 |     spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
 65 |     return spec
 66 | 
 67 | 
 68 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
 69 |     # MelBasis - Cache if needed
 70 |     global mel_basis
 71 |     dtype_device = str(spec.dtype) + "_" + str(spec.device)
 72 |     fmax_dtype_device = str(fmax) + "_" + dtype_device
 73 |     if fmax_dtype_device not in mel_basis:
 74 |         mel = librosa_mel_fn(
 75 |             sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
 76 |         )
 77 |         mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
 78 |             dtype=spec.dtype, device=spec.device
 79 |         )
 80 | 
 81 |     # Mel-frequency Log-amplitude spectrogram :: (B, Freq=num_mels, Frame)
 82 |     melspec = torch.matmul(mel_basis[fmax_dtype_device], spec)
 83 |     melspec = spectral_normalize_torch(melspec)
 84 |     return melspec
 85 | 
 86 | 
 87 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
 88 |     """
 89 |     PARAMS
 90 |     ------
 91 |     C: compression factor
 92 |     """
 93 |     return torch.log(torch.clamp(x, min=clip_val) * C)
 94 | 
 95 | 
 96 | def spectral_normalize_torch(magnitudes):
 97 |     return dynamic_range_compression_torch(magnitudes)
 98 | 
 99 | 
100 | def mel_spectrogram_torch(
101 |     y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
102 | ):
103 |     """Convert waveform into Mel-frequency Log-amplitude spectrogram.
104 | 
105 |     Args:
106 |         y       :: (B, T)           - Waveforms
107 |     Returns:
108 |         melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram
109 |     """
110 |     # Linear-frequency Linear-amplitude spectrogram :: (B, T) -> (B, Freq, Frame)
111 |     spec = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center)
112 | 
113 |     # Mel-frequency Log-amplitude spectrogram :: (B, Freq, Frame) -> (B, Freq=num_mels, Frame)
114 |     melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax)
115 | 
116 |     return melspec
117 | 
118 | 
119 | from ..data.data import HIFIGAN_DEFAULTS as DEFAULTS
120 | from scipy.io.wavfile import read
121 | import librosa
122 | 
123 | mel_spec = lambda x: mel_spectrogram_torch(
124 |     x,
125 |     DEFAULTS["n_fft"],
126 |     DEFAULTS["num_mels"],
127 |     DEFAULTS["sampling_rate"],
128 |     #     100,
129 |     #     24000,#DEFAULTS["sampling_rate"],
130 |     DEFAULTS["hop_size"],
131 |     DEFAULTS["win_size"],
132 |     DEFAULTS["fmin"],
133 |     None,
134 |     False,  # center
135 | )
136 | 
137 | load_audio = lambda source_path: torch.Tensor(read(source_path)[1]).unsqueeze(0)
138 | save_torch = lambda data, target_path: torch.save(data[0], target_path)
139 | 
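
A small shape-check sketch for the mel_spec lambda above, using a synthetic one-second tone at the configured sampling rate (the exact frame count depends on the hop and FFT sizes in DEFAULTS):

import math
import torch

sr = DEFAULTS["sampling_rate"]
t = torch.arange(sr, dtype=torch.float32) / sr
y = torch.sin(2 * math.pi * 220.0 * t).unsqueeze(0)  # (batch=1, samples)
mel = mel_spec(y)
print(mel.shape)  # (1, num_mels, frames)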


--------------------------------------------------------------------------------
/uberduck_ml_dev/data/statistics.py:
--------------------------------------------------------------------------------
  1 | __all__ = [
  2 |     "word_frequencies",
  3 |     "create_wordcloud",
  4 |     "count_frequency",
  5 |     "pace_character",
  6 |     "pace_phoneme",
  7 |     "get_sample_format",
  8 |     "AbsoluteMetrics",
  9 | ]
 10 | 
 11 | from typing import List, Any, Dict, Union, Optional
 12 | from collections import Counter
 13 | import os
 14 | 
 15 | import librosa
 16 | import numpy as np
 17 | from pydub.utils import mediainfo_json
 18 | from wordfreq import word_frequency
 19 | 
 20 | from ..text.utils import text_to_sequence
 21 | 
 22 | # NOTE (Sam): this file could be refactored so that it doesn't contain both speechmetrics and word frequencies - very different types of statistics.
 23 | 
 24 | 
 25 | def word_frequencies(text: str, language: str = "en") -> List[float]:
 26 |     """
 27 |     Calculate the frequency [0-1] with which each word appears in the English language
 28 |     """
 29 |     freqs = []
 30 |     for word in text.split():
 31 |         freqs.append(word_frequency(word, language))
 32 |     return freqs
 33 | 
 34 | 
 35 | def count_frequency(arr: List[Any]) -> Dict[Any, int]:
 36 |     """
 37 |     Calculates the frequency that a value appears in a list
 38 |     """
 39 |     return dict(Counter(arr).most_common())
 40 | 
 41 | 
 42 | def pace_character(
 43 |     text: str, audio: Union[str, np.ndarray], sr: Optional[int] = None
 44 | ) -> float:
 45 |     """
 46 |     Calculates the number of characters in the text per second of the audio file. Audio can be a file path or an np array.
 47 |     """
 48 |     if isinstance(audio, str):
 49 |         audio, sr = librosa.load(audio, sr=None)
 50 |     else:
 51 |         assert sr, "Sampling rate must be provided if audio is np array"
 52 | 
 53 |     return len(text) / librosa.get_duration(y=audio, sr=sr)
 54 | 
 55 | 
 56 | def pace_phoneme(
 57 |     text: str, audio: Union[str, np.ndarray], sr: Optional[int] = None
 58 | ) -> float:
 59 |     """
 60 |     Calculates the number of phonemes in the text per second of the audio. Audio can be a file path or an np array.
 61 |     """
 62 |     if isinstance(audio, str):
 63 |         audio, sr = librosa.load(audio, sr=None)
 64 |     else:
 65 |         assert sr, "Sampling rate must be provided if audio is np array"
 66 | 
 67 |     arpabet_seq = text_to_sequence(text, ["english_cleaners"], p_arpabet=1.0)
 68 |     return len(arpabet_seq) / librosa.get_duration(y=audio, sr=sr)
 69 | 
 70 | 
 71 | def get_sample_format(wav_file: str):
 72 |     """
 73 |     Get sample format of the .wav file: https://trac.ffmpeg.org/wiki/audio%20types
 74 |     """
 75 |     filename, file_extension = os.path.splitext(wav_file)
 76 |     assert file_extension == ".wav", ".wav file must be supplied"
 77 | 
 78 |     info = mediainfo_json(wav_file)
 79 |     audio_streams = [x for x in info["streams"] if x["codec_type"] == "audio"]
 80 |     return audio_streams[0].get("sample_fmt")
 81 | 
 82 | 
 83 | class AbsoluteMetrics:
 84 |     """This class loads and calculates the absolute metrics, MOSNet and SRMR"""
 85 | 
 86 |     def __init__(self, window_length: Optional[int] = None):
 87 |         # NOTE(zach): There are some problems installing speechmetrics via pip and it's not critical, so import inline to avoid issues in CI.
 88 |         import speechmetrics
 89 | 
 90 |         self.metrics = speechmetrics.load("absolute", window_length)
 91 | 
 92 |     def __call__(self, wav_file: str) -> Dict[str, float]:
 93 |         """
 94 |         Returns a Dict[str,float] with keys "mosnet" and "srmr"
 95 |         """
 96 |         filename, file_extension = os.path.splitext(wav_file)
 97 |         assert file_extension == ".wav", ".wav file must be supplied"
 98 | 
 99 |         return self.metrics(wav_file)
100 | 
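
A quick sketch of pace_character with an in-memory array (the sampling rate must then be passed explicitly); the audio here is two seconds of silence, purely to illustrate the characters-per-second units.

import numpy as np

audio = np.zeros(2 * 22050, dtype=np.float32)  # 2 s of silence at 22.05 kHz
print(pace_character("Hello there", audio, sr=22050))  # 11 chars / 2 s = 5.5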


--------------------------------------------------------------------------------
/uberduck_ml_dev/e2e.py:
--------------------------------------------------------------------------------
 1 | __all__ = ["tts", "rhythm_transfer"]
 2 | 
 3 | 
 4 | import torch
 5 | 
 6 | from .text.symbols import NVIDIA_TACO2_SYMBOLS
 7 | from .text.utils import prepare_input_sequence
 8 | 
 9 | 
10 | from typing import List
11 | 
12 | from .models.tacotron2 import Tacotron2
13 | from .vocoders.hifigan import HiFiGanGenerator
14 | 
15 | 
16 | def tts(
17 |     lines: List[str],
18 |     model,
19 |     device: str,
20 |     vocoder,
21 |     arpabet=False,
22 |     symbol_set=NVIDIA_TACO2_SYMBOLS,
23 |     max_wav_value=32768.0,
24 |     speaker_ids=None,
25 | ):
26 |     assert isinstance(
27 |         model, Tacotron2
28 |     ), "Only Tacotron2 text-to-mel models are supported"
29 |     assert isinstance(vocoder, HiFiGanGenerator), "Only Hifi GAN vocoders are supported"
30 |     cpu_run = device == "cpu"
31 |     sequences, input_lengths = prepare_input_sequence(
32 |         lines, cpu_run=cpu_run, arpabet=arpabet, symbol_set=symbol_set
33 |     )
34 |     if speaker_ids is None:
35 |         speaker_ids = torch.zeros(len(lines), dtype=torch.long, device=device)
36 |     input_ = sequences, input_lengths, speaker_ids
37 |     _, mel_outputs_postnet, gate_outputs, alignment, lengths = model.inference(input_)
38 |     mels = mel_outputs_postnet
39 |     mel = mels[0, :, : lengths[0].item()]
40 |     for idx in range(1, mels.size(0)):
41 |         length = lengths[idx].item()
42 |         mel = torch.cat((mel, mels[idx, :, :length]), dim=-1)
43 |     tensor_cls = torch.FloatTensor if device == "cpu" else torch.cuda.FloatTensor
44 |     mel = mel[None, :]
45 |     y_g_hat = vocoder(tensor_cls(mel).to(device=device))
46 |     audio = y_g_hat.reshape(1, -1)
47 |     audio = audio * max_wav_value
48 |     return audio
49 | 
50 | 
51 | from typing import Optional
52 | 
53 | from .models.common import MelSTFT
54 | 
55 | 
56 | @torch.no_grad()
57 | def rhythm_transfer(
58 |     original_audio: torch.tensor,
59 |     original_text: str,
60 |     model,
61 |     vocoder,
62 |     device: str,
63 |     symbol_set=NVIDIA_TACO2_SYMBOLS,
64 |     arpabet=False,
65 |     max_wav_value=32768.0,
66 |     speaker_id=0,
67 | ):
68 |     assert len(original_audio.shape) == 1
69 |     cpu_run = device == "cpu"
70 |     # TODO(zach): Support non-default STFT parameters.
71 |     stft = MelSTFT()
72 |     p_arpabet = float(arpabet)
73 |     sequence, input_lengths, _ = prepare_input_sequence(
74 |         [original_text], arpabet=arpabet, cpu_run=cpu_run, symbol_set=symbol_set
75 |     )
76 |     original_target_mel = stft.mel_spectrogram(original_audio[None])
77 |     if not cpu_run:
78 |         original_target_mel = original_target_mel.cuda()
79 |     max_len = original_target_mel.size(2)
80 |     speaker_ids = torch.tensor([speaker_id], dtype=torch.long, device=device)
81 |     inputs = (
82 |         sequence,
83 |         input_lengths,
84 |         original_target_mel,
85 |         max_len,
86 |         torch.tensor([max_len], dtype=torch.long, device=device),
87 |         speaker_ids,
88 |     )
89 |     attn = model.get_alignment(inputs)
90 |     _, mel_postnet, _, _ = model.inference_noattention(
91 |         (sequence, input_lengths, speaker_ids, attn.transpose(0, 1))
92 |     )
93 |     y_g_hat = vocoder(torch.tensor(mel_postnet, dtype=torch.float, device=device))
94 |     audio = y_g_hat.reshape(1, -1)
95 |     audio = audio * max_wav_value
96 |     return audio
97 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/exec/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/exec/__init__.py


--------------------------------------------------------------------------------
/uberduck_ml_dev/exec/normalize_audio.py:
--------------------------------------------------------------------------------
 1 | __all__ = ["run", "parse_args"]
 2 | 
 3 | 
 4 | import argparse
 5 | import os
 6 | import sys
 7 | 
 8 | from ..utils.audio import normalize_audio, trim_audio
 9 | 
10 | 
11 | def run(dirname, backup, top_db):
12 |     """Normalize all the audio files in a directory."""
13 |     old_dirname = dirname
14 |     if backup:
15 |         old_dirname = f"{os.path.normpath(old_dirname)}_backup"
16 |         os.rename(dirname, old_dirname)
17 |     for dirpath, _, filenames in os.walk(old_dirname):
18 |         rel_path = os.path.relpath(dirpath, old_dirname)
19 |         for filename in filenames:
20 |             if not filename.endswith(".wav"):
21 |                 continue
22 |             old_path = os.path.join(dirpath, filename)
23 |             new_path = os.path.join(dirname, rel_path, filename)
24 |             if not os.path.exists(os.path.join(dirname, rel_path)):
25 |                 os.makedirs(os.path.join(dirname, rel_path))
26 |             trim_audio(old_path, new_path, top_db)
27 | 
28 | 
29 | def parse_args(args):
30 |     parser = argparse.ArgumentParser()
31 |     parser.add_argument(
32 |         "-d",
33 |         "--dirname",
34 |         help="Path to the directory which contains audio files to normalize.",
35 |     )
36 |     parser.add_argument("--backup", dest="backup", action="store_true")
37 |     parser.add_argument("--no-backup", dest="backup", action="store_false")
38 |     parser.add_argument("--top-db", type=int)
39 |     parser.set_defaults(backup=True, top_db=20)
40 |     return parser.parse_args(args)
41 | 
42 | 
43 | try:
44 |     from nbdev.imports import IN_NOTEBOOK
45 | except ImportError:
46 |     IN_NOTEBOOK = False
47 | 
48 | if __name__ == "__main__" and not IN_NOTEBOOK:
49 |     args = parse_args(sys.argv[1:])
50 |     run(args.dirname, args.backup, args.top_db)
51 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/exec/split_train_val.py:
--------------------------------------------------------------------------------
 1 | __all__ = ["write_filenames", "run", "parse_args"]
 2 | 
 3 | 
 4 | import os
 5 | from pathlib import Path
 6 | 
 7 | import numpy as np
 8 | from sklearn.model_selection import train_test_split
 9 | 
10 | 
11 | def write_filenames(filenames, output_dir, output_filename):
12 |     """
13 |     Writes a list of filenames, one per line, to a .txt file specified by output_filename.
14 |     """
15 |     with open(os.path.join(output_dir, output_filename), "w") as f:
16 |         for item in filenames:
17 |             f.write(f"{item}\n")
18 | 
19 | 
20 | def run(
21 |     path,
22 |     val_percent=0.2,
23 |     val_num=None,
24 |     train_file="train.txt",
25 |     val_file="val.txt",
26 | ):
27 |     """Split file in t
28 |     Default behavior only creates a training and validation set (not test set).
29 |     """
30 |     with open(path) as f:
31 |         lines = [l.strip("\n") for l in f.readlines()]
32 | 
33 |     train, val = train_test_split(lines, test_size=val_num if val_num else val_percent)
34 |     write_filenames(train, Path(os.path.dirname(path)), train_file)
35 |     write_filenames(val, Path(os.path.dirname(path)), val_file)
36 | 
37 | 
38 | import argparse
39 | import sys
40 | 
41 | 
42 | def parse_args(args):
43 |     parser = argparse.ArgumentParser()
44 |     parser.add_argument(
45 |         "-i", "--in", dest="input_path", help="Path to input file list", required=True
46 |     )
47 |     parser.add_argument("-n", "--num_val", dest="num_val", type=float, default=0.1)
48 |     args = parser.parse_args(args)
49 |     return args
50 | 
51 | 
52 | try:
53 |     from nbdev.imports import IN_NOTEBOOK
54 | except ImportError:
55 |     IN_NOTEBOOK = False
56 | 
57 | if __name__ == "__main__" and not IN_NOTEBOOK:
58 |     args = parse_args(sys.argv[1:])
59 |     if args.num_val > 1:
60 |         run(args.input_path, val_num=int(args.num_val))
61 |     else:
62 |         run(args.input_path, val_percent=args.num_val)
63 | 
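
Called from Python rather than the CLI, the split looks like this (the filelist path is hypothetical); train.txt and val.txt are written next to the input file.

run("data/list.txt", val_percent=0.1)  # hold out 10% of the lines
# or hold out an absolute number of lines:
run("data/list.txt", val_num=50)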


--------------------------------------------------------------------------------
/uberduck_ml_dev/exec/train_radtts_with_ray.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import json
 3 | import os
 4 | 
 5 | from ray.air.config import ScalingConfig, RunConfig
 6 | from ray.train.torch import TorchTrainer
 7 | from ray.tune import SyncConfig
10 | 
11 | from uberduck_ml_dev.trainer.radtts.train import train_func
12 | from uberduck_ml_dev.utils.exec import parse_args
13 | from uberduck_ml_dev.trainer.radtts.train import DEFAULTS as TRAIN_CONFIG
14 | from uberduck_ml_dev.data.data import RADTTS_DEFAULTS as DATA_CONFIG
15 | from uberduck_ml_dev.models.radtts import DEFAULTS as MODEL_CONFIG
16 | 
17 | if __name__ == "__main__":
18 |     args = parse_args(sys.argv[1:])
19 |     if args.config:
20 |         with open(args.config) as f:
21 |             config_inputs = json.load(f)
22 | 
23 |     config = dict(
24 |         train_config=TRAIN_CONFIG, data_config=DATA_CONFIG, model_config=MODEL_CONFIG
25 |     )
26 |     config["train_config"].update(config_inputs["train_config"])
27 |     config["data_config"].update(config_inputs["data_config"])
28 |     config["model_config"].update(config_inputs["model_config"])
29 | 
30 |     os.makedirs(config["train_config"]["output_directory"], exist_ok=True)
31 |     trainer = TorchTrainer(
32 |         train_loop_per_worker=train_func,
33 |         train_loop_config=config,
34 |         scaling_config=ScalingConfig(
35 |             num_workers=config["train_config"]["n_gpus"],
36 |             use_gpu=True,
37 |             resources_per_worker=dict(
38 |                 CPU=config["data_config"]["num_workers"],
39 |                 GPU=1,
40 |             ),
41 |         ),
42 |         run_config=RunConfig(sync_config=SyncConfig()),
43 |     )
44 | 
45 |     result = trainer.fit()
46 | 
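
The script reads three top-level keys from the JSON passed via --config and merges each over the corresponding defaults. A minimal sketch of that structure, keeping only keys actually referenced above (other fields, such as the training/eval filelists and vocoder paths mentioned in tutorials/radtts/train.sh, go in the same sections; the values here are placeholders):

import json

config_inputs = {
    "train_config": {"output_directory": "/tmp/radtts_run", "n_gpus": 1},
    "data_config": {"num_workers": 4},
    "model_config": {},
}
with open("demo_config.json", "w") as f:
    json.dump(config_inputs, f, indent=2)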


--------------------------------------------------------------------------------
/uberduck_ml_dev/exec/train_tacotron2.py:
--------------------------------------------------------------------------------
 1 | __all__ = ["parse_args", "run"]
 2 | 
 3 | from ..trainer.tacotron2 import Tacotron2Trainer
 4 | from ..vendor.tfcompat.hparam import HParams
 5 | from ..trainer.tacotron2 import DEFAULTS as TACOTRON2_TRAINER_DEFAULTS
 6 | import argparse
 7 | import sys
 8 | import json
 9 | import torch
10 | from torch import multiprocessing as mp
11 | 
12 | 
13 | def parse_args(args):
14 |     parser = argparse.ArgumentParser()
15 |     parser.add_argument("--config", help="Path to JSON config")
16 |     args = parser.parse_args(args)
17 |     return args
18 | 
19 | 
20 | def run(rank, device_count, hparams):
21 |     trainer = Tacotron2Trainer(hparams, rank=rank, world_size=device_count)
22 |     try:
23 |         trainer.train()
24 |     except Exception as e:
25 |         print(f"Exception raised while training: {e}")
26 |         # TODO: save state.
27 |         raise e
28 | 
29 | 
30 | try:
31 |     from nbdev.imports import IN_NOTEBOOK
32 | except ImportError:
33 |     IN_NOTEBOOK = False
34 | if __name__ == "__main__" and not IN_NOTEBOOK:
35 |     args = parse_args(sys.argv[1:])
36 |     config = TACOTRON2_TRAINER_DEFAULTS.values()
37 |     if args.config:
38 |         with open(args.config) as f:
39 |             config.update(json.load(f))
40 |     config.update(vars(args))
41 |     hparams = HParams(**config)
42 |     if hparams.distributed_run:
43 |         device_count = torch.cuda.device_count()
44 |         mp.spawn(run, (device_count, hparams), device_count)
45 |     else:
46 |         run(None, None, hparams)
47 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/exec/train_vits.py:
--------------------------------------------------------------------------------
 1 | __all__ = ["parse_args", "run"]
 2 | 
 3 | 
 4 | import argparse
 5 | import json
 6 | import librosa  # NOTE(zach): importing torch before librosa causes LLVM issues for some unknown reason.
 7 | import sys
 8 | 
 9 | import torch
10 | from torch import multiprocessing as mp
11 | 
12 | from ..trainer.vits import VITSTrainer
13 | from ..vendor.tfcompat.hparam import HParams
14 | from ..models.vits import DEFAULTS as VITS_DEFAULTS
15 | 
16 | 
17 | def parse_args(args):
18 |     parser = argparse.ArgumentParser()
19 |     parser.add_argument("--config", help="Path to JSON config")
20 |     args = parser.parse_args(args)
21 |     return args
22 | 
23 | 
24 | def run(rank, device_count, hparams):
25 |     trainer = VITSTrainer(hparams, rank=rank, world_size=device_count)
26 |     try:
27 |         trainer.train()
28 |     except Exception as e:
29 |         print(f"Exception raised while training: {e}")
30 |         # TODO: save state.
31 |         raise e
32 | 
33 | 
34 | try:
35 |     from nbdev.imports import IN_NOTEBOOK
36 | except ImportError:
37 |     IN_NOTEBOOK = False
38 | if __name__ == "__main__" and not IN_NOTEBOOK:
39 |     args = parse_args(sys.argv[1:])
40 |     config = VITS_DEFAULTS.values()
41 |     if args.config:
42 |         with open(args.config) as f:
43 |             config.update(json.load(f))
44 |     hparams = HParams(**config)
45 |     if hparams.distributed_run:
46 |         device_count = torch.cuda.device_count()
47 |         mp.spawn(run, (device_count, hparams), device_count)
48 |     else:
49 |         run(0, 1, hparams)
50 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/losses_rvc.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from torch.nn import functional as F
 3 | 
 4 | 
 5 | def feature_loss(fmap_r, fmap_g):
 6 |     loss = 0
 7 |     for dr, dg in zip(fmap_r, fmap_g):
 8 |         for rl, gl in zip(dr, dg):
 9 |             rl = rl.float().detach()
10 |             gl = gl.float()
11 |             loss += torch.mean(torch.abs(rl - gl))
12 | 
13 |     return loss * 2
14 | 
15 | 
16 | def discriminator_loss(disc_real_outputs, disc_generated_outputs):
17 |     loss = 0
18 |     r_losses = []
19 |     g_losses = []
20 |     for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
21 |         dr = dr.float()
22 |         dg = dg.float()
23 |         r_loss = torch.mean((1 - dr) ** 2)
24 |         g_loss = torch.mean(dg**2)
25 |         loss += r_loss + g_loss
26 |         r_losses.append(r_loss.item())
27 |         g_losses.append(g_loss.item())
28 | 
29 |     return loss, r_losses, g_losses
30 | 
31 | 
32 | def generator_loss(disc_outputs):
33 |     loss = 0
34 |     gen_losses = []
35 |     for dg in disc_outputs:
36 |         dg = dg.float()
37 |         l = torch.mean((1 - dg) ** 2)
38 |         gen_losses.append(l)
39 |         loss += l
40 | 
41 |     return loss, gen_losses
42 | 
43 | 
44 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
45 |     """
46 |     z_p, logs_q: [b, h, t_t]
47 |     m_p, logs_p: [b, h, t_t]
48 |     """
49 |     z_p = z_p.float()
50 |     logs_q = logs_q.float()
51 |     m_p = m_p.float()
52 |     logs_p = logs_p.float()
53 |     z_mask = z_mask.float()
54 | 
55 |     kl = logs_p - logs_q - 0.5
56 |     kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
57 |     kl = torch.sum(kl * z_mask)
58 |     l = kl / torch.sum(z_mask)
59 |     return l
60 | 
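
kl_loss computes, per masked position, logs_p - logs_q - 0.5 + 0.5 * (z_p - m_p)^2 * exp(-2 * logs_p) for diagonal Gaussians evaluated at the samples z_p (the usual VITS-style KL term), then averages over the mask. A shape-check sketch with toy tensors:

import torch

b, h, t = 2, 4, 7                                  # batch, channels, frames
z_p, m_p = torch.randn(b, h, t), torch.randn(b, h, t)
logs_q, logs_p = torch.zeros(b, h, t), torch.zeros(b, h, t)
z_mask = torch.ones(b, 1, t)                       # broadcasts over channels
print(kl_loss(z_p, logs_q, m_p, logs_p, z_mask))   # scalar tensor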


--------------------------------------------------------------------------------
/uberduck_ml_dev/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/models/__init__.py


--------------------------------------------------------------------------------
/uberduck_ml_dev/models/base.py:
--------------------------------------------------------------------------------
 1 | __all__ = ["TTSModel", "DEFAULTS"]
 2 | 
 3 | import torch
 4 | from torch import nn
 5 | 
 6 | from ..text.symbols import SYMBOL_SETS
 7 | from ..vendor.tfcompat.hparam import HParams
 8 | 
 9 | 
10 | class TTSModel(nn.Module):
11 |     def __init__(self, hparams):
12 |         super().__init__()
13 |         self.symbol_set = hparams.symbol_set
14 |         self.n_symbols = len(SYMBOL_SETS[self.symbol_set])
15 |         self.n_speakers = hparams.n_speakers
16 |         # symbols = __import__('uberduck_ml_dev.text.' + hparams.symbols)
17 | 
18 |     def infer(self):
19 |         raise NotImplementedError
20 | 
21 |     def forward(self):
22 |         raise NotImplementedError
23 | 
24 |     def from_pretrained(
25 |         self, warm_start_path=None, device="cpu", ignore_layers=None, model_dict=None
26 |     ):
27 |         model_dict = model_dict or dict()
28 |         if warm_start_path is None and not model_dict:
29 |             raise Exception(
30 |                 "TTSModel.from_pretrained requires a warm_start_path or state_dict"
31 |             )
32 |         if warm_start_path is not None:
33 |             checkpoint = torch.load(warm_start_path, map_location=device)
34 |             if (
35 |                 "state_dict" in checkpoint.keys()
36 |             ):  # TODO: remove state_dict once off nvidia
37 |                 model_dict = checkpoint["state_dict"]
38 |             if "model" in checkpoint.keys():
39 |                 model_dict = checkpoint["model"]
40 |         if ignore_layers:
41 |             model_dict = {k: v for k, v in model_dict.items() if k not in ignore_layers}
42 |         dummy_dict = self.state_dict()
43 | 
44 |         for k in self.state_dict().keys():
45 |             if k not in model_dict.keys():
46 |                 print(
47 |                     f"WARNING! Attempting to load a model with out the {k} layer. This could lead to unexpected results during evaluation."
48 |                 )
49 | 
50 |         dummy_dict.update(model_dict)
51 |         model_dict = dummy_dict
52 |         self.load_state_dict(model_dict)
53 |         if device == "cuda":
54 |             self.cuda()
55 | 
56 |     def to_checkpoint(self):
57 |         return dict(model=self.state_dict())
58 | 
59 |     @classmethod
60 |     def create(cls, name, opts, folders, all_speakers=True):
61 |         pass
62 | 
63 | 
64 | DEFAULTS = HParams(
65 |     p_arpabet=1.0,
66 |     seed=1234,
67 |     # NOTE (Sam): make sure users change their configurations for cudnn_enabled = True.
68 |     cudnn_enabled=False,
69 | )
70 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/models/components/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/models/components/__init__.py


--------------------------------------------------------------------------------
/uberduck_ml_dev/models/components/alignment.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: MIT
 3 | #
 4 | # Permission is hereby granted, free of charge, to any person obtaining a
 5 | # copy of this software and associated documentation files (the "Software"),
 6 | # to deal in the Software without restriction, including without limitation
 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
 8 | # and/or sell copies of the Software, and to permit persons to whom the
 9 | # Software is furnished to do so, subject to the following conditions:
10 | #
11 | # The above copyright notice and this permission notice shall be included in
12 | # all copies or substantial portions of the Software.
13 | #
14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 | # DEALINGS IN THE SOFTWARE.
21 | import sys
22 | import numpy as np
23 | from matplotlib import pylab as plt
24 | from numba import jit
25 | 
26 | 
27 | def save_plot(fname, attn_map):
28 |     plt.imshow(attn_map)
29 |     plt.savefig(fname)
30 | 
31 | 
32 | @jit(nopython=True)
33 | def mas_width1(attn_map):
34 |     """mas with hardcoded width=1"""
35 |     # assumes mel x text
36 |     opt = np.zeros_like(attn_map)
37 |     attn_map = np.log(attn_map)
38 |     attn_map[0, 1:] = -np.inf
39 |     log_p = np.zeros_like(attn_map)
40 |     log_p[0, :] = attn_map[0, :]
41 |     prev_ind = np.zeros_like(attn_map, dtype=np.int64)
42 |     for i in range(1, attn_map.shape[0]):
43 |         for j in range(attn_map.shape[1]):  # for each text dim
44 |             prev_log = log_p[i - 1, j]
45 |             prev_j = j
46 | 
47 |             if j - 1 >= 0 and log_p[i - 1, j - 1] >= log_p[i - 1, j]:
48 |                 prev_log = log_p[i - 1, j - 1]
49 |                 prev_j = j - 1
50 | 
51 |             log_p[i, j] = attn_map[i, j] + prev_log
52 |             prev_ind[i, j] = prev_j
53 | 
54 |     # now backtrack
55 |     curr_text_idx = attn_map.shape[1] - 1
56 |     for i in range(attn_map.shape[0] - 1, -1, -1):
57 |         opt[i, curr_text_idx] = 1
58 |         curr_text_idx = prev_ind[i, curr_text_idx]
59 |     opt[0, curr_text_idx] = 1
60 |     return opt
61 | 
62 | 
63 | if __name__ == "__main__":
64 |     attn_ = np.load(sys.argv[1])
65 |     attn = attn_.squeeze()
66 |     save_plot("orig.png", attn)
67 |     binarized = mas_width1(attn)
68 |     save_plot("binarized.png", binarized)
69 | 
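
A tiny end-to-end sketch of mas_width1: given a strictly positive soft attention map of shape (mel_frames, text_tokens), it returns a hard 0/1 path that selects exactly one text token per mel frame and advances by at most one token per frame.

import numpy as np

rng = np.random.default_rng(0)
soft_attn = rng.random((8, 4)) + 2.0 * np.eye(8, 4)  # rows loosely favor the diagonal
soft_attn /= soft_attn.sum(axis=1, keepdims=True)
hard_attn = mas_width1(soft_attn)
print(hard_attn.sum(axis=1))  # one selected token per mel frame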


--------------------------------------------------------------------------------
/uberduck_ml_dev/models/components/attention.py:
--------------------------------------------------------------------------------
 1 | from torch import nn
 2 | import torch
 3 | from numpy import finfo
 4 | from torch.nn import functional as F
 5 | from typing import Optional
 6 | 
 7 | from ..common import LinearNorm, LocationLayer
 8 | 
 9 | 
10 | class Attention(nn.Module):
11 |     def __init__(
12 |         self,
13 |         attention_rnn_dim,
14 |         embedding_dim,
15 |         attention_dim,
16 |         attention_location_n_filters,
17 |         attention_location_kernel_size,
18 |         fp16_run,
19 |     ):
20 |         super(Attention, self).__init__()
21 |         self.query_layer = LinearNorm(
22 |             attention_rnn_dim, attention_dim, bias=False, w_init_gain="tanh"
23 |         )
24 |         self.memory_layer = LinearNorm(
25 |             embedding_dim, attention_dim, bias=False, w_init_gain="tanh"
26 |         )
27 |         self.v = LinearNorm(attention_dim, 1, bias=False)
28 |         self.location_layer = LocationLayer(
29 |             attention_location_n_filters, attention_location_kernel_size, attention_dim
30 |         )
31 |         if fp16_run:
32 |             self.score_mask_value = finfo("float16").min
33 |         else:
34 |             self.score_mask_value = -float("inf")
35 | 
36 |     def get_alignment_energies(self, query, processed_memory, attention_weights_cat):
37 |         """
38 |         PARAMS
39 |         ------
40 |         query: decoder output (batch, n_mel_channels * n_frames_per_step)
41 |         processed_memory: processed encoder outputs (B, T_in, attention_dim)
42 |         attention_weights_cat: cumulative and prev. att weights (B, 2, max_time)
43 | 
44 |         RETURNS
45 |         -------
46 |         alignment (batch, max_time)
47 |         """
48 | 
49 |         processed_query = self.query_layer(query.unsqueeze(1))
50 |         processed_attention_weights = self.location_layer(attention_weights_cat)
51 |         energies = self.v(
52 |             torch.tanh(processed_query + processed_attention_weights + processed_memory)
53 |         )
54 | 
55 |         energies = energies.squeeze(-1)
56 |         return energies
57 | 
58 |     def forward(
59 |         self,
60 |         attention_hidden_state,
61 |         memory,
62 |         processed_memory,
63 |         attention_weights_cat,
64 |         mask,
65 |         attention_weights: Optional[torch.Tensor],
66 |     ):
67 |         """
68 |         PARAMS
69 |         ------
70 |         attention_hidden_state: attention rnn last output
71 |         memory: encoder outputs
72 |         processed_memory: processed encoder outputs
73 |         attention_weights_cat: previous and cummulative attention weights
74 |         mask: binary mask for padded data
75 |         """
76 |         if attention_weights is None:
77 |             alignment = self.get_alignment_energies(
78 |                 attention_hidden_state, processed_memory, attention_weights_cat
79 |             )
80 | 
81 |             if mask is not None:
82 |                 alignment.data.masked_fill_(mask, self.score_mask_value)
83 | 
84 |             attention_weights = F.softmax(alignment, dim=1)
85 |         attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
86 |         attention_context = attention_context.squeeze(1)
87 | 
88 |         return attention_context, attention_weights
89 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/models/components/decoders/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/models/components/decoders/__init__.py


--------------------------------------------------------------------------------
/uberduck_ml_dev/models/components/encoders/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/models/components/encoders/__init__.py


--------------------------------------------------------------------------------
/uberduck_ml_dev/models/components/encoders/duration.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | 
  3 | import torch
  4 | from torch import nn
  5 | from torch.nn import functional as F
  6 | 
  7 | from uberduck_ml_dev.models import common
  8 | 
  9 | 
 10 | class StochasticDurationPredictor(nn.Module):
 11 |     def __init__(
 12 |         self,
 13 |         in_channels,
 14 |         filter_channels,
 15 |         kernel_size,
 16 |         p_dropout,
 17 |         n_flows=4,
 18 |         gin_channels=0,
 19 |     ):
 20 |         super().__init__()
 21 |         filter_channels = in_channels  # NOTE: this override should be removed in a future version.
 22 |         self.in_channels = in_channels
 23 |         self.filter_channels = filter_channels
 24 |         self.kernel_size = kernel_size
 25 |         self.p_dropout = p_dropout
 26 |         self.n_flows = n_flows
 27 |         self.gin_channels = gin_channels
 28 | 
 29 |         self.log_flow = common.Log()
 30 |         self.flows = nn.ModuleList()
 31 |         self.flows.append(common.ElementwiseAffine(2))
 32 |         for i in range(n_flows):
 33 |             self.flows.append(
 34 |                 common.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
 35 |             )
 36 |             self.flows.append(common.Flip())
 37 | 
 38 |         self.post_pre = nn.Conv1d(1, filter_channels, 1)
 39 |         self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
 40 |         self.post_convs = common.DDSConv(
 41 |             filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
 42 |         )
 43 |         self.post_flows = nn.ModuleList()
 44 |         self.post_flows.append(common.ElementwiseAffine(2))
 45 |         for i in range(4):
 46 |             self.post_flows.append(
 47 |                 common.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
 48 |             )
 49 |             self.post_flows.append(common.Flip())
 50 | 
 51 |         self.pre = nn.Conv1d(in_channels, filter_channels, 1)
 52 |         self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
 53 |         self.convs = common.DDSConv(
 54 |             filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
 55 |         )
 56 |         if gin_channels != 0:
 57 |             self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
 58 | 
 59 |     def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
 60 |         x = torch.detach(x)
 61 |         x = self.pre(x)
 62 |         if g is not None:
 63 |             g = torch.detach(g)
 64 |             x = x + self.cond(g)
 65 |         x = self.convs(x, x_mask)
 66 |         x = self.proj(x) * x_mask
 67 | 
 68 |         if not reverse:
 69 |             flows = self.flows
 70 |             assert w is not None
 71 | 
 72 |             logdet_tot_q = 0
 73 |             h_w = self.post_pre(w)
 74 |             h_w = self.post_convs(h_w, x_mask)
 75 |             h_w = self.post_proj(h_w) * x_mask
 76 |             e_q = (
 77 |                 torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype)
 78 |                 * x_mask
 79 |             )
 80 |             z_q = e_q
 81 |             for flow in self.post_flows:
 82 |                 z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
 83 |                 logdet_tot_q += logdet_q
 84 |             z_u, z1 = torch.split(z_q, [1, 1], 1)
 85 |             u = torch.sigmoid(z_u) * x_mask
 86 |             z0 = (w - u) * x_mask
 87 |             logdet_tot_q += torch.sum(
 88 |                 (F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2]
 89 |             )
 90 |             logq = (
 91 |                 torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2])
 92 |                 - logdet_tot_q
 93 |             )
 94 | 
 95 |             logdet_tot = 0
 96 |             z0, logdet = self.log_flow(z0, x_mask)
 97 |             logdet_tot += logdet
 98 |             z = torch.cat([z0, z1], 1)
 99 |             for flow in flows:
100 |                 z, logdet = flow(z, x_mask, g=x, reverse=reverse)
101 |                 logdet_tot = logdet_tot + logdet
102 |             nll = (
103 |                 torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2])
104 |                 - logdet_tot
105 |             )
106 |             return nll + logq  # [b]
107 |         else:
108 |             flows = list(reversed(self.flows))
109 |             flows = flows[:-2] + [flows[-1]]  # remove a useless vflow
110 |             z = (
111 |                 torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype)
112 |                 * noise_scale
113 |             )
114 |             for flow in flows:
115 |                 z = flow(z, x_mask, g=x, reverse=reverse)
116 |             z0, z1 = torch.split(z, [1, 1], 1)
117 |             logw = z0
118 |             return logw
119 | 
120 | 
121 | class DurationPredictor(nn.Module):
122 |     def __init__(
123 |         self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
124 |     ):
125 |         super().__init__()
126 | 
127 |         self.in_channels = in_channels
128 |         self.filter_channels = filter_channels
129 |         self.kernel_size = kernel_size
130 |         self.p_dropout = p_dropout
131 |         self.gin_channels = gin_channels
132 | 
133 |         self.drop = nn.Dropout(p_dropout)
134 |         self.conv_1 = nn.Conv1d(
135 |             in_channels, filter_channels, kernel_size, padding=kernel_size // 2
136 |         )
137 |         self.norm_1 = common.LayerNorm(filter_channels)
138 |         self.conv_2 = nn.Conv1d(
139 |             filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
140 |         )
141 |         self.norm_2 = common.LayerNorm(filter_channels)
142 |         self.proj = nn.Conv1d(filter_channels, 1, 1)
143 | 
144 |         if gin_channels != 0:
145 |             self.cond = nn.Conv1d(gin_channels, in_channels, 1)
146 | 
147 |     def forward(self, x, x_mask, g=None):
148 |         x = torch.detach(x)
149 |         if g is not None:
150 |             g = torch.detach(g)
151 |             x = x + self.cond(g)
152 |         x = self.conv_1(x * x_mask)
153 |         x = torch.relu(x)
154 |         x = self.norm_1(x)
155 |         x = self.drop(x)
156 |         x = self.conv_2(x * x_mask)
157 |         x = torch.relu(x)
158 |         x = self.norm_2(x)
159 |         x = self.drop(x)
160 |         x = self.proj(x * x_mask)
161 |         return x * x_mask
162 | 
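
A minimal sketch of the DurationPredictor's shape contract (assuming the package is importable and uberduck_ml_dev.models.common provides the LayerNorm used above; the channel sizes are illustrative):

import torch
from uberduck_ml_dev.models.components.encoders.duration import DurationPredictor

dp = DurationPredictor(in_channels=192, filter_channels=256, kernel_size=3, p_dropout=0.5)
x = torch.randn(2, 192, 50)     # text encoder output: [batch, channels, text frames]
x_mask = torch.ones(2, 1, 50)   # 1.0 over valid frames, 0.0 over padding
log_w = dp(x, x_mask)           # [batch, 1, text frames] predicted log-durations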


--------------------------------------------------------------------------------
/uberduck_ml_dev/models/components/encoders/resnet_speaker_encoder.py:
--------------------------------------------------------------------------------
  1 | # NOTE (Sam): this is the only component in this repository under copyleft license (Coqui / Mozilla).
  2 | 
  3 | from io import BytesIO
  4 | import os
  5 | import requests
  6 | import json
  7 | 
  8 | from scipy.io.wavfile import read
  9 | import torch
 10 | 
 11 | # TODO (Sam): eliminate redundancy.
 12 | from .speaker.resnet import ResNetSpeakerEncoder
 13 | 
 14 | DEFAULT_AUDIO_CONFIG = {
 15 |     "fft_size": 512,
 16 |     "win_length": 400,
 17 |     "hop_length": 160,
 18 |     "frame_shift_ms": None,
 19 |     "frame_length_ms": None,
 20 |     "stft_pad_mode": "reflect",
 21 |     "sample_rate": 22050,
 22 |     "resample": False,
 23 |     "preemphasis": 0.97,
 24 |     "ref_level_db": 20,
 25 |     "do_sound_norm": False,
 26 |     "do_trim_silence": False,
 27 |     "trim_db": 60,
 28 |     "power": 1.5,
 29 |     "griffin_lim_iters": 60,
 30 |     "num_mels": 64,
 31 |     "mel_fmin": 0.0,
 32 |     "mel_fmax": 8000.0,
 33 |     "spec_gain": 20,
 34 |     "signal_norm": False,
 35 |     "min_level_db": -100,
 36 |     "symmetric_norm": False,
 37 |     "max_norm": 4.0,
 38 |     "clip_norm": False,
 39 |     "stats_path": None,
 40 |     "do_rms_norm": True,
 41 |     "db_level": -27.0,
 42 | }
 43 | 
 44 | 
 45 | def get_pretrained_model(
 46 |     config_url=None, model_url=None, config_path=None, model_path=None
 47 | ):
 48 |     assert not ((config_url is not None) and (config_path is not None))
 49 |     assert not ((model_url is not None) and (model_path is not None))
 50 | 
 51 |     if config_path is None:
 52 |         print("Getting model config...")
 53 |         if config_url is None:
 54 |             config_url = os.environ["RESNET_SE_CONFIG_URL"]
 55 |         response = requests.get(config_url)
 56 |         resnet_config = response.json()
 57 |     else:
 58 |         with open(config_path) as f:
 59 |             resnet_config = json.load(f)
 60 |     model_params = resnet_config["model_params"]
 61 |     if "model_name" in model_params:
 62 |         del model_params["model_name"]
 63 | 
 64 |     audio_config = dict(resnet_config["audio"])
 65 |     audio_config["sample_rate"] = 22050
 66 |     model = ResNetSpeakerEncoder(**model_params, audio_config=audio_config)
 67 |     print("Loading pretrained model...")
 68 |     load_pretrained(model, model_url=model_url, model_path=model_path)
 69 |     print("Got pretrained model...")
 70 |     model.eval()
 71 |     return model
 72 | 
 73 | 
 74 | def load_pretrained(model, model_url=None, model_path=None):
 75 |     assert not ((model_url is not None) and (model_path is not None))
 76 |     if model_path is not None:
 77 |         loaded = torch.load(model_path)
 78 |     else:
 79 |         if model_url is None:
 80 |             model_url = os.environ["RESNET_SE_MODEL_URL"]
 81 |         response = requests.get(model_url, stream=True)
 82 |         bio = BytesIO(response.content)
 83 |         loaded = torch.load(bio)
 84 |     model.load_state_dict(loaded["model"])
 85 | 
 86 | 
 87 | class ResNetSpeakerEncoderCallable:
 88 |     def __init__(self, model_path: str, config_path: str):
 89 |         print("initializing resnet speaker encoder")
 90 |         with open(config_path) as f:
 91 |             resnet_config = json.load(f)
 92 | 
 93 |         state_dict = torch.load(model_path)["model"]
 94 |         audio_config = dict(resnet_config["audio"])
 95 |         model_params = resnet_config["model_params"]
 96 |         if "model_name" in model_params:
 97 |             del model_params["model_name"]
 98 | 
 99 |         self.device = "cuda"
100 |         self.model = ResNetSpeakerEncoder(**model_params, audio_config=audio_config)
101 |         self.model.load_state_dict(state_dict)
102 |         self.model.eval()
103 |         self.model.cuda()
104 | 
105 |     # NOTE (Sam): might have to accept bytes input for anyscale distributed data loading?
106 |     def __call__(self, audiopaths):
107 |         print("calling resnet speaker encoder")
108 |         for audiopath in audiopaths:
109 |             audio_data = read(audiopath)[1]
110 |             datum = torch.FloatTensor(audio_data).unsqueeze(-1).t().cuda()
111 |             # datum = torch.FloatTensor(audio_data).unsqueeze(-1).t()
112 |             emb = self.model(datum)
113 |             emb = emb.cpu().detach().numpy()
114 |             yield {"audio_embedding": emb}
115 | 
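
A minimal sketch of loading the speaker encoder from local files (the paths are hypothetical placeholders; per the code above, config_url/model_url arguments or the RESNET_SE_CONFIG_URL / RESNET_SE_MODEL_URL environment variables can supply the artifacts instead):

from uberduck_ml_dev.models.components.encoders.resnet_speaker_encoder import (
    get_pretrained_model,
)

# hypothetical local artifact paths
model = get_pretrained_model(
    config_path="resnet_se_config.json",
    model_path="resnet_se_checkpoint.pt",
)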


--------------------------------------------------------------------------------
/uberduck_ml_dev/models/components/encoders/speaker/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/models/components/encoders/speaker/__init__.py


--------------------------------------------------------------------------------
/uberduck_ml_dev/models/components/encoders/speaker/base_encoder.py:
--------------------------------------------------------------------------------
  1 | # Adapted from https://github.com/coqui-ai/TTS/blob/dev/TTS/encoder/models/base_encoder.py
  2 | 
  3 | import numpy as np
  4 | import torch
  5 | import torchaudio
  6 | 
  7 | from torch import nn
  8 | 
  9 | 
 10 | class PreEmphasis(nn.Module):
 11 |     def __init__(self, coefficient=0.97):
 12 |         super().__init__()
 13 |         self.coefficient = coefficient
 14 |         self.register_buffer(
 15 |             "filter",
 16 |             torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0),
 17 |         )
 18 | 
 19 |     def forward(self, x):
 20 |         assert len(x.size()) == 2
 21 | 
 22 |         x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect")
 23 |         return torch.nn.functional.conv1d(x, self.filter).squeeze(1)
 24 | 
 25 | 
 26 | class BaseEncoder(nn.Module):
 27 |     """Base `encoder` class. Every new `encoder` model must inherit this.
 28 | 
 29 |     It defines common `encoder` specific functions.
 30 |     """
 31 | 
 32 |     # pylint: disable=W0102
 33 |     def __init__(self):
 34 |         super(BaseEncoder, self).__init__()
 35 | 
 36 |     def get_torch_mel_spectrogram_class(self, audio_config):
 37 |         return torch.nn.Sequential(
 38 |             PreEmphasis(audio_config["preemphasis"]),
 39 |             # TorchSTFT(
 40 |             #     n_fft=audio_config["fft_size"],
 41 |             #     hop_length=audio_config["hop_length"],
 42 |             #     win_length=audio_config["win_length"],
 43 |             #     sample_rate=audio_config["sample_rate"],
 44 |             #     window="hamming_window",
 45 |             #     mel_fmin=0.0,
 46 |             #     mel_fmax=None,
 47 |             #     use_htk=True,
 48 |             #     do_amp_to_db=False,
 49 |             #     n_mels=audio_config["num_mels"],
 50 |             #     power=2.0,
 51 |             #     use_mel=True,
 52 |             #     mel_norm=None,
 53 |             # )
 54 |             torchaudio.transforms.MelSpectrogram(
 55 |                 sample_rate=audio_config["sample_rate"],
 56 |                 n_fft=audio_config["fft_size"],
 57 |                 win_length=audio_config["win_length"],
 58 |                 hop_length=audio_config["hop_length"],
 59 |                 window_fn=torch.hamming_window,
 60 |                 n_mels=audio_config["num_mels"],
 61 |             ),
 62 |         )
 63 | 
 64 |     @torch.no_grad()
 65 |     def inference(self, x, l2_norm=True):
 66 |         return self.forward(x, l2_norm)
 67 | 
 68 |     @torch.no_grad()
 69 |     def compute_embedding(
 70 |         self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True
 71 |     ):
 72 |         """
 73 |         Generate embeddings for a batch of utterances
 74 |         x: 1xTxD
 75 |         """
 76 |         # map to the waveform size
 77 |         if self.use_torch_spec:
 78 |             num_frames = num_frames * self.audio_config["hop_length"]
 79 | 
 80 |         max_len = x.shape[1]
 81 | 
 82 |         if max_len < num_frames:
 83 |             num_frames = max_len
 84 | 
 85 |         offsets = np.linspace(0, max_len - num_frames, num=num_eval)
 86 | 
 87 |         frames_batch = []
 88 |         for offset in offsets:
 89 |             offset = int(offset)
 90 |             end_offset = int(offset + num_frames)
 91 |             frames = x[:, offset:end_offset]
 92 |             frames_batch.append(frames)
 93 | 
 94 |         frames_batch = torch.cat(frames_batch, dim=0)
 95 |         embeddings = self.inference(frames_batch, l2_norm=l2_norm)
 96 | 
 97 |         if return_mean:
 98 |             embeddings = torch.mean(embeddings, dim=0, keepdim=True)
 99 |         return embeddings
100 | 
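
A minimal sketch of the PreEmphasis filter defined above (assuming the package is importable; the waveform is random noise purely for shape illustration):

import torch
from uberduck_ml_dev.models.components.encoders.speaker.base_encoder import PreEmphasis

wav = torch.randn(4, 16000)          # [batch, samples]
emphasized = PreEmphasis(0.97)(wav)  # same shape; y[t] = x[t] - 0.97 * x[t-1]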


--------------------------------------------------------------------------------
/uberduck_ml_dev/models/components/encoders/tacotron2.py:
--------------------------------------------------------------------------------
 1 | from torch import nn
 2 | import torch
 3 | from torch.nn import functional as F
 4 | 
 5 | from ...common import Conv1d
 6 | 
 7 | 
 8 | class Encoder(nn.Module):
 9 |     """Encoder module:
10 |     - Three 1-d convolution banks
11 |     - Bidirectional LSTM
12 |     """
13 | 
14 |     def __init__(self, hparams):
15 |         super().__init__()
16 | 
17 |         convolutions = []
18 |         for _ in range(hparams.encoder_n_convolutions):
19 |             conv_layer = nn.Sequential(
20 |                 Conv1d(
21 |                     hparams.encoder_embedding_dim,
22 |                     hparams.encoder_embedding_dim,
23 |                     kernel_size=hparams.encoder_kernel_size,
24 |                     stride=1,
25 |                     padding=int((hparams.encoder_kernel_size - 1) / 2),
26 |                     dilation=1,
27 |                     w_init_gain="relu",
28 |                 ),
29 |                 nn.BatchNorm1d(hparams.encoder_embedding_dim),
30 |             )
31 |             convolutions.append(conv_layer)
32 |         self.convolutions = nn.ModuleList(convolutions)
33 |         self.dropout_rate = 0.5
34 | 
35 |         self.lstm = nn.LSTM(
36 |             hparams.encoder_embedding_dim,
37 |             int(hparams.encoder_embedding_dim / 2),
38 |             1,
39 |             batch_first=True,
40 |             bidirectional=True,
41 |         )
42 | 
43 |     def forward(self, x, input_lengths):
44 |         if x.size()[0] > 1:
45 |             x_embedded = []
46 |             for b_ind in range(x.size()[0]):  # TODO: Speed up
47 |                 curr_x = x[b_ind : b_ind + 1, :, : input_lengths[b_ind]].clone()
48 |                 for conv in self.convolutions:
49 |                     curr_x = F.dropout(
50 |                         F.relu(conv(curr_x)), self.dropout_rate, self.training
51 |                     )
52 |                 x_embedded.append(curr_x[0].transpose(0, 1))
53 |             x = torch.nn.utils.rnn.pad_sequence(x_embedded, batch_first=True)
54 |         else:
55 |             for conv in self.convolutions:
56 |                 x = F.dropout(F.relu(conv(x)), self.dropout_rate, self.training)
57 |             x = x.transpose(1, 2)
58 | 
59 |         # pytorch tensors are not reversible, hence the conversion
60 |         input_lengths = input_lengths.cpu().numpy()
61 |         x = nn.utils.rnn.pack_padded_sequence(
62 |             x, input_lengths, batch_first=True, enforce_sorted=False
63 |         )
64 | 
65 |         self.lstm.flatten_parameters()
66 |         outputs, _ = self.lstm(x)
67 | 
68 |         outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
69 |         return outputs
70 | 
71 |     def inference(self, x, input_lengths):
72 |         device = x.device
73 |         for conv in self.convolutions:
74 |             x = F.dropout(F.relu(conv(x)), self.dropout_rate, self.training)
75 | 
76 |         x = x.transpose(1, 2)
77 | 
78 |         input_lengths = input_lengths.cpu()
79 |         x = nn.utils.rnn.pack_padded_sequence(
80 |             x, input_lengths, batch_first=True, enforce_sorted=False
81 |         )
82 | 
83 |         outputs, _ = self.lstm(x)
84 | 
85 |         outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
86 | 
87 |         return outputs
88 | 
89 | 
90 | # NOTE (Sam): for torchscript compilation
91 | class EncoderForwardIsInfer(Encoder):
92 |     def forward(self, x, input_lengths):
93 |         return self.inference(x, input_lengths)
94 | 
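
A minimal sketch of running the Tacotron 2 text encoder above on a padded batch (assuming the package is importable; SimpleNamespace stands in for the repo's hyperparameter object, and the values shown are the usual Tacotron 2 sizes, not read from any config):

import torch
from types import SimpleNamespace
from uberduck_ml_dev.models.components.encoders.tacotron2 import Encoder

hparams = SimpleNamespace(
    encoder_n_convolutions=3,
    encoder_embedding_dim=512,
    encoder_kernel_size=5,
)
encoder = Encoder(hparams)
x = torch.randn(2, 512, 60)          # embedded symbols: [batch, channels, time]
lengths = torch.tensor([60, 45])     # valid length per batch item
outputs = encoder(x, lengths)        # [batch, time, 512]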


--------------------------------------------------------------------------------
/uberduck_ml_dev/models/components/partialconv1d.py:
--------------------------------------------------------------------------------
 1 | # Modified partialconv source code based on implementation from
 2 | # https://github.com/NVIDIA/partialconv/blob/master/models/partialconv2d.py
 3 | ###############################################################################
 4 | # BSD 3-Clause License
 5 | #
 6 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 7 | #
 8 | # Author & Contact: Guilin Liu (guilinl@nvidia.com)
 9 | ###############################################################################
10 | 
11 | # Original Author & Contact: Guilin Liu (guilinl@nvidia.com)
12 | # Modified by Kevin Shih (kshih@nvidia.com)
13 | 
14 | import torch
15 | import torch.nn.functional as F
16 | from torch import nn
17 | from typing import Tuple
18 | 
19 | 
20 | class PartialConv1d(nn.Conv1d):
21 |     def __init__(self, *args, **kwargs):
22 |         self.multi_channel = False
23 |         self.return_mask = False
24 |         super(PartialConv1d, self).__init__(*args, **kwargs)
25 | 
26 |         self.weight_maskUpdater = torch.ones(1, 1, self.kernel_size[0])
27 |         self.slide_winsize = (
28 |             self.weight_maskUpdater.shape[1] * self.weight_maskUpdater.shape[2]
29 |         )
30 | 
31 |         self.last_size = (None, None, None)
32 |         self.update_mask = None
33 |         self.mask_ratio = None
34 | 
35 |     @torch.jit.ignore
36 |     def forward(self, input: torch.Tensor, mask_in: torch.Tensor = None):
37 |         """
38 |         input: standard input to a 1D conv
39 |         mask_in: binary mask for valid values, same shape as input
40 |         """
41 |         assert len(input.shape) == 3
42 |         # if a mask is input, or tensor shape changed, update mask ratio
43 |         if mask_in is not None or self.last_size != tuple(input.shape):
44 |             self.last_size = tuple(input.shape)
45 |             with torch.no_grad():
46 |                 if self.weight_maskUpdater.type() != input.type():
47 |                     self.weight_maskUpdater = self.weight_maskUpdater.to(input)
48 |                 if mask_in is None:
49 |                     mask = torch.ones(1, 1, input.data.shape[2]).to(input)
50 |                 else:
51 |                     mask = mask_in
52 |                 self.update_mask = F.conv1d(
53 |                     mask,
54 |                     self.weight_maskUpdater,
55 |                     bias=None,
56 |                     stride=self.stride,
57 |                     padding=self.padding,
58 |                     dilation=self.dilation,
59 |                     groups=1,
60 |                 )
61 |                 # for mixed precision training, change 1e-8 to 1e-6
62 |                 self.mask_ratio = self.slide_winsize / (self.update_mask + 1e-6)
63 |                 self.update_mask = torch.clamp(self.update_mask, 0, 1)
64 |                 self.mask_ratio = torch.mul(self.mask_ratio, self.update_mask)
65 |         raw_out = super(PartialConv1d, self).forward(
66 |             torch.mul(input, mask) if mask_in is not None else input
67 |         )
68 |         if self.bias is not None:
69 |             bias_view = self.bias.view(1, self.out_channels, 1)
70 |             output = torch.mul(raw_out - bias_view, self.mask_ratio) + bias_view
71 |             output = torch.mul(output, self.update_mask)
72 |         else:
73 |             output = torch.mul(raw_out, self.mask_ratio)
74 | 
75 |         if self.return_mask:
76 |             return output, self.update_mask
77 |         else:
78 |             return output
79 | 
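
A minimal sketch of PartialConv1d with an explicit validity mask (assuming the package is importable; the [1, 1, T] mask layout mirrors how the forward pass broadcasts it):

import torch
from uberduck_ml_dev.models.components.partialconv1d import PartialConv1d

conv = PartialConv1d(80, 80, kernel_size=5, padding=2)
x = torch.randn(2, 80, 100)
mask = torch.ones(1, 1, 100)
mask[:, :, 60:] = 0.0                # treat the tail as padding
y = conv(x, mask_in=mask)            # [2, 80, 100]; outputs are renormalized by mask coverage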


--------------------------------------------------------------------------------
/uberduck_ml_dev/models/components/postnet.py:
--------------------------------------------------------------------------------
 1 | from torch import nn
 2 | from torch.nn import functional as F
 3 | import torch
 4 | 
 5 | from ..common import Conv1d
 6 | 
 7 | 
 8 | class Postnet(nn.Module):
 9 |     """Postnet
10 |     - Five 1-d convolution with 512 channels and kernel size 5
11 |     """
12 | 
13 |     def __init__(self, hparams):
14 |         super(Postnet, self).__init__()
15 |         self.dropout_rate = 0.5
16 |         self.convolutions = nn.ModuleList()
17 | 
18 |         self.convolutions.append(
19 |             nn.Sequential(
20 |                 Conv1d(
21 |                     hparams.n_mel_channels,
22 |                     hparams.postnet_embedding_dim,
23 |                     kernel_size=hparams.postnet_kernel_size,
24 |                     stride=1,
25 |                     padding=int((hparams.postnet_kernel_size - 1) / 2),
26 |                     dilation=1,
27 |                     w_init_gain="tanh",
28 |                 ),
29 |                 nn.BatchNorm1d(hparams.postnet_embedding_dim),
30 |             )
31 |         )
32 | 
33 |         for i in range(1, hparams.postnet_n_convolutions - 1):
34 |             self.convolutions.append(
35 |                 nn.Sequential(
36 |                     Conv1d(
37 |                         hparams.postnet_embedding_dim,
38 |                         hparams.postnet_embedding_dim,
39 |                         kernel_size=hparams.postnet_kernel_size,
40 |                         stride=1,
41 |                         padding=int((hparams.postnet_kernel_size - 1) / 2),
42 |                         dilation=1,
43 |                         w_init_gain="tanh",
44 |                     ),
45 |                     nn.BatchNorm1d(hparams.postnet_embedding_dim),
46 |                 )
47 |             )
48 | 
49 |         self.convolutions.append(
50 |             nn.Sequential(
51 |                 Conv1d(
52 |                     hparams.postnet_embedding_dim,
53 |                     hparams.n_mel_channels,
54 |                     kernel_size=hparams.postnet_kernel_size,
55 |                     stride=1,
56 |                     padding=int((hparams.postnet_kernel_size - 1) / 2),
57 |                     dilation=1,
58 |                     w_init_gain="linear",
59 |                 ),
60 |                 nn.BatchNorm1d(hparams.n_mel_channels),
61 |             )
62 |         )
63 | 
64 |     def forward(self, x):
65 |         for i, conv in enumerate(self.convolutions):
66 |             if i == len(self.convolutions) - 1:
67 |                 x = F.dropout(conv(x), self.dropout_rate, self.training)
68 |             else:
69 |                 x = F.dropout(torch.tanh(conv(x)), self.dropout_rate, self.training)
70 | 
71 |         return x
72 | 
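
A minimal sketch of the Postnet above (assuming the package is importable; SimpleNamespace stands in for the hyperparameter object, with the standard Tacotron 2 sizes as illustrative values):

import torch
from types import SimpleNamespace
from uberduck_ml_dev.models.components.postnet import Postnet

hparams = SimpleNamespace(
    n_mel_channels=80,
    postnet_embedding_dim=512,
    postnet_kernel_size=5,
    postnet_n_convolutions=5,
)
postnet = Postnet(hparams)
mel = torch.randn(2, 80, 200)        # decoder output: [batch, mels, frames]
residual = postnet(mel)              # [2, 80, 200]; added to the decoder output upstream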


--------------------------------------------------------------------------------
/uberduck_ml_dev/models/components/prenet.py:
--------------------------------------------------------------------------------
 1 | from torch import nn
 2 | from torch.nn import functional as F
 3 | from ..common import LinearNorm
 4 | 
 5 | 
 6 | class Prenet(nn.Module):
 7 |     def __init__(self, in_dim, sizes):
 8 |         super().__init__()
 9 |         in_sizes = [in_dim] + sizes[:-1]
10 |         self.layers = nn.ModuleList(
11 |             [
12 |                 LinearNorm(in_size, out_size, bias=False)
13 |                 for (in_size, out_size) in zip(in_sizes, sizes)
14 |             ]
15 |         )
16 |         self.dropout_rate = 0.5
17 | 
18 |     def forward(self, x):
19 |         for linear in self.layers:
20 |             x = F.dropout(F.relu(linear(x)), p=self.dropout_rate, training=True)
21 |         return x
22 | 
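
A minimal sketch of the Prenet above (assuming the package is importable; note that dropout is applied with training=True even at inference, which is standard Tacotron 2 behavior):

import torch
from uberduck_ml_dev.models.components.prenet import Prenet

prenet = Prenet(in_dim=80, sizes=[256, 256])
prev_frame = torch.randn(2, 80)      # previous mel frame per batch item
out = prenet(prev_frame)             # [2, 256]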


--------------------------------------------------------------------------------
/uberduck_ml_dev/models/rvc/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/models/rvc/__init__.py


--------------------------------------------------------------------------------
/uberduck_ml_dev/models/rvc/commons.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import numpy as np
  3 | import torch
  4 | from torch import nn
  5 | from torch.nn import functional as F
  6 | 
  7 | 
  8 | def init_weights(m, mean=0.0, std=0.01):
  9 |     classname = m.__class__.__name__
 10 |     if classname.find("Conv") != -1:
 11 |         m.weight.data.normal_(mean, std)
 12 | 
 13 | 
 14 | def get_padding(kernel_size, dilation=1):
 15 |     return int((kernel_size * dilation - dilation) / 2)
 16 | 
 17 | 
 18 | def convert_pad_shape(pad_shape):
 19 |     l = pad_shape[::-1]
 20 |     pad_shape = [item for sublist in l for item in sublist]
 21 |     return pad_shape
 22 | 
 23 | 
 24 | def kl_divergence(m_p, logs_p, m_q, logs_q):
 25 |     """KL(P||Q)"""
 26 |     kl = (logs_q - logs_p) - 0.5
 27 |     kl += (
 28 |         0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
 29 |     )
 30 |     return kl
 31 | 
 32 | 
 33 | def rand_gumbel(shape):
 34 |     """Sample from the Gumbel distribution, protect from overflows."""
 35 |     uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
 36 |     return -torch.log(-torch.log(uniform_samples))
 37 | 
 38 | 
 39 | def rand_gumbel_like(x):
 40 |     g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
 41 |     return g
 42 | 
 43 | 
 44 | def slice_segments(x, ids_str, segment_size=4):
 45 |     ret = torch.zeros_like(x[:, :, :segment_size])
 46 |     for i in range(x.size(0)):
 47 |         idx_str = ids_str[i]
 48 |         idx_end = idx_str + segment_size
 49 |         ret[i] = x[i, :, idx_str:idx_end]
 50 | 
 51 |     return ret
 52 | 
 53 | 
 54 | def slice_segments2(x, ids_str, segment_size=4):
 55 |     ret = torch.zeros_like(x[:, :segment_size])
 56 |     for i in range(x.size(0)):
 57 |         idx_str = ids_str[i]
 58 |         idx_end = idx_str + segment_size
 59 |         ret[i] = x[i, idx_str:idx_end]
 60 |     return ret
 61 | 
 62 | 
 63 | def rand_slice_segments(x, x_lengths=None, segment_size=4):
 64 |     b, d, t = x.size()
 65 |     if x_lengths is None:
 66 |         x_lengths = t
 67 |     ids_str_max = (
 68 |         x_lengths - segment_size
 69 |     )  # + 1 # NOTE (Sam): remove +1 to avoid rounding error when starting with mels.
 70 |     ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
 71 |     ret = slice_segments(x, ids_str, segment_size)
 72 |     return ret, ids_str
 73 | 
 74 | 
 75 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
 76 |     position = torch.arange(length, dtype=torch.float)
 77 |     num_timescales = channels // 2
 78 |     log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
 79 |         num_timescales - 1
 80 |     )
 81 |     inv_timescales = min_timescale * torch.exp(
 82 |         torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
 83 |     )
 84 |     scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
 85 |     signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
 86 |     signal = F.pad(signal, [0, 0, 0, channels % 2])
 87 |     signal = signal.view(1, channels, length)
 88 |     return signal
 89 | 
 90 | 
 91 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
 92 |     b, channels, length = x.size()
 93 |     signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
 94 |     return x + signal.to(dtype=x.dtype, device=x.device)
 95 | 
 96 | 
 97 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
 98 |     b, channels, length = x.size()
 99 |     signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
100 |     return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
101 | 
102 | 
103 | def subsequent_mask(length):
104 |     mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
105 |     return mask
106 | 
107 | 
108 | @torch.jit.script
109 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
110 |     n_channels_int = n_channels[0]
111 |     in_act = input_a + input_b
112 |     t_act = torch.tanh(in_act[:, :n_channels_int, :])
113 |     s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
114 |     acts = t_act * s_act
115 |     return acts
116 | 
117 | 
118 | def convert_pad_shape(pad_shape):
119 |     l = pad_shape[::-1]
120 |     pad_shape = [item for sublist in l for item in sublist]
121 |     return pad_shape
122 | 
123 | 
124 | def shift_1d(x):
125 |     x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
126 |     return x
127 | 
128 | 
129 | def sequence_mask(length, max_length=None):
130 |     if max_length is None:
131 |         max_length = length.max()
132 |     x = torch.arange(max_length, dtype=length.dtype, device=length.device)
133 |     return x.unsqueeze(0) < length.unsqueeze(1)
134 | 
135 | 
136 | def generate_path(duration, mask):
137 |     """
138 |     duration: [b, 1, t_x]
139 |     mask: [b, 1, t_y, t_x]
140 |     """
141 |     device = duration.device
142 | 
143 |     b, _, t_y, t_x = mask.shape
144 |     cum_duration = torch.cumsum(duration, -1)
145 | 
146 |     cum_duration_flat = cum_duration.view(b * t_x)
147 |     path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
148 |     path = path.view(b, t_x, t_y)
149 |     path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
150 |     path = path.unsqueeze(1).transpose(2, 3) * mask
151 |     return path
152 | 
153 | 
154 | def clip_grad_value_(parameters, clip_value, norm_type=2):
155 |     if isinstance(parameters, torch.Tensor):
156 |         parameters = [parameters]
157 |     parameters = list(filter(lambda p: p.grad is not None, parameters))
158 |     norm_type = float(norm_type)
159 |     if clip_value is not None:
160 |         clip_value = float(clip_value)
161 | 
162 |     total_norm = 0
163 |     for p in parameters:
164 |         param_norm = p.grad.data.norm(norm_type)
165 |         total_norm += param_norm.item() ** norm_type
166 |         if clip_value is not None:
167 |             p.grad.data.clamp_(min=-clip_value, max=clip_value)
168 |     total_norm = total_norm ** (1.0 / norm_type)
169 |     return total_norm
170 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/models/utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import pickle
 3 | import os
 4 | import inspect
 5 | 
 6 | 
 7 | def load_checkpoint(filepath, device, pickle_module=pickle):
 8 |     assert os.path.isfile(filepath)
 9 |     print("Loading '{}'".format(filepath))
10 |     checkpoint_dict = torch.load(
11 |         filepath,
12 |         map_location=torch.device(device),
13 |         pickle_module=pickle_module,
14 |     )
15 |     print("Complete.")
16 |     return checkpoint_dict
17 | 
18 | 
19 | def load_pretrained(model, checkpoint_path, key_="generator"):
20 |     # NOTE (Sam): uncomment for download on anyscale
21 |     # response = requests.get(HIFI_GAN_GENERATOR_URL, stream=True)
22 |     # bio = BytesIO(response.content)
23 |     loaded = torch.load(checkpoint_path)
24 |     model.load_state_dict(loaded[key_])
25 | 
26 | 
27 | def filter_valid_args(func, **kwargs):
28 |     valid_keys = inspect.signature(func).parameters.keys()
29 |     return {key: value for key, value in kwargs.items() if key in valid_keys}
30 | 
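
A minimal sketch of filter_valid_args, which drops keyword arguments a callable does not accept (make_model and its parameters are hypothetical):

from uberduck_ml_dev.models.utils import filter_valid_args

def make_model(hidden_dim, n_layers):
    return {"hidden_dim": hidden_dim, "n_layers": n_layers}

config = {"hidden_dim": 256, "n_layers": 4, "unused_flag": True}
model = make_model(**filter_valid_args(make_model, **config))  # unused_flag is dropped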


--------------------------------------------------------------------------------
/uberduck_ml_dev/monitoring/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/monitoring/__init__.py


--------------------------------------------------------------------------------
/uberduck_ml_dev/monitoring/generate.py:
--------------------------------------------------------------------------------
 1 | __all__ = []
 2 | 
 3 | 
 4 | from ..text.utils import prepare_input_sequence
 5 | 
 6 | 
 7 | def _get_inference(model, vocoder, texts, speaker_ids, symbol_set, arpabet, cpu_run):
 8 |     text_padded, input_lengths = prepare_input_sequence(
 9 |         texts, cpu_run=cpu_run, arpabet=arpabet, symbol_set=symbol_set
10 |     )
11 |     # Note (SAM): None is for GST... temporary solution
12 |     input_ = text_padded, input_lengths, speaker_ids, None
13 |     output = model.inference(input_)
14 |     audio = vocoder.infer(output[1][:1])
15 |     return audio
16 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/monitoring/statistics.py:
--------------------------------------------------------------------------------
 1 | __all__ = ["get_alignment_metrics"]
 2 | 
 3 | import torch
 4 | from ..utils.utils import get_mask_from_lengths
 5 | 
 6 | 
 7 | def get_alignment_metrics(
 8 |     alignments, average_across_batch=True, input_lengths=None, output_lengths=None
 9 | ):
10 |     alignments = alignments.transpose(1, 2)  # [B, dec, enc] -> [B, enc, dec]
11 |     if input_lengths is None:
12 |         input_lengths = torch.ones(alignments.size(0), device=alignments.device) * (
13 |             alignments.shape[1] - 1
14 |         )  # [B] # 147
15 |     if output_lengths is None:
16 |         output_lengths = torch.ones(alignments.size(0), device=alignments.device) * (
17 |             alignments.shape[2] - 1
18 |         )  # [B] # 767
19 | 
20 |     batch_size = alignments.size(0)
21 |     optimums = torch.sqrt(
22 |         input_lengths.double().pow(2) + output_lengths.double().pow(2)
23 |     ).view(batch_size)
24 | 
25 |     # [B, enc, dec] -> [B, dec], [B, dec]
26 |     values, cur_idxs = torch.max(alignments, 1)
27 | 
28 |     cur_idxs = cur_idxs.float()
29 |     prev_indx = torch.cat((cur_idxs[:, 0][:, None], cur_idxs[:, :-1]), dim=1)
30 |     dist = ((prev_indx - cur_idxs).pow(2) + 1).pow(0.5)  # [B, dec]
31 |     dist.masked_fill_(
32 |         ~get_mask_from_lengths(output_lengths, max_len=dist.size(1)), 0.0
33 |     )  # set dist of padded to zero
34 |     dist = dist.sum(dim=(1))  # get total dist for each B
35 |     diagonalness = (dist + 1.4142135) / optimums  # dist / optimal dist
36 | 
37 |     maxes = alignments.max(axis=1)[0].mean(axis=1)
38 |     if average_across_batch:
39 |         diagonalness = diagonalness.mean()
40 |         maxes = maxes.mean()
41 | 
42 |     output = {}
43 |     output["diagonalness"] = diagonalness
44 |     output["max"] = maxes
45 | 
46 |     return output
47 | 
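
A minimal sketch of get_alignment_metrics on a dummy attention matrix (assuming the package and its utils.get_mask_from_lengths are importable; the alignment here is random, so the values only illustrate the call, not a trained model):

import torch
from uberduck_ml_dev.monitoring.statistics import get_alignment_metrics

alignments = torch.softmax(torch.randn(2, 120, 40), dim=2)  # [batch, decoder steps, encoder steps]
metrics = get_alignment_metrics(alignments)
print(metrics["diagonalness"], metrics["max"])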


--------------------------------------------------------------------------------
/uberduck_ml_dev/monitoring/streamlit.py:
--------------------------------------------------------------------------------
 1 | __all__ = ["run"]
 2 | 
 3 | 
 4 | import streamlit as st
 5 | from collections import OrderedDict
 6 | from .generate import _get_inference, MODEL_LIST, MODEL_TYPES
 7 | 
 8 | 
 9 | def run():
10 |     st.title("Inference inspector")
11 | 
12 |     symbol_set = st.selectbox(
13 |         "What symbol set would you like to use?", ("NVIDIA_TACO2_DEFAULTS",)
14 |     )
15 |     st.write("You selected:", symbol_set)
16 | 
17 |     use_arpabet = st.selectbox("Would you like to use arpabet?", ("Yes", "No"))
18 |     st.write("You selected:", use_arpabet)
19 | 
20 |     # st.text_input("Model file name", "test/fixtures/models/taco2ljdefault")
21 |     # st.text_input("Model format", OrderedDict)
22 |     vocoder_path = st.text_input(
23 |         "Vocoder path", "test/fixtures/models/gen_02640000_studio"
24 |     )
25 |     vocoder_config = st.text_input("Vocoder config", None)
26 |     n_speakers = st.text_input("Number of speakers", 1)
27 |     gate_threshold = st.text_input("Gate threshold", 0.1)
28 | 
29 |     chosen_model = st.sidebar.selectbox("Select model", MODEL_LIST)
30 |     chosen_type = st.sidebar.selectbox("Select model save type", MODEL_TYPES)
31 |     text = [st.text_input("Text", "That's silly")]
32 |     speakers = [st.text_input("Speaker_id", 0)]
33 | 
34 |     hparams = TACOTRON2_DEFAULTS
35 |     hparams.n_speakers = int(n_speakers)
36 |     hparams.gate_threshold = float(gate_threshold)
37 |     if hparams.n_speakers > 1:
38 |         hparams.has_speaker_embedding = True
39 |     device = "cuda"
40 |     model = Tacotron2(hparams)
41 |     # NOTE: the two checkpoint formats load differently; only one branch should run.
42 |     if chosen_type == "OD":
43 |         model.from_pretrained(model_dict=chosen_model, device=device)
44 |     else:
45 |         model.from_pretrained(warm_start_path=chosen_model, device=device)
46 | 
47 |     hifigan = HiFiGanGenerator(
48 |         config=vocoder_config,
49 |         checkpoint=vocoder_path,
50 |         cudnn_enabled=True,
51 |     )
52 | 
53 |     inference = _get_inference(model, hifigan, text, speakers, symbol_set, use_arpabet, cpu_run=False)
54 | 
55 |     st.audio(inference)
56 | 
57 | 
58 | if __name__ == "__main__":
59 |     run()
60 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/monitoring/wandb.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | 
 3 | import wandb
 4 | from tqdm import tqdm
 5 | import torch
 6 | 
 7 | from ..text.utils import UTTERANCES
 8 | 
 9 | 
10 | def log_sample_utterances(
11 |     project="my-project",
12 |     name="my-model",
13 |     dataset="my-dataset",
14 |     architecture="my-architecture",
15 |     speaker_ids: List = [],
16 |     inference_function=lambda text, speaker_id: False,
17 | ):
18 |     wandb.init(
19 |         project=project,
20 |         name=name,
21 |         job_type="eval",
22 |         config={"architecture": architecture, "dataset": dataset},
23 |     )
24 | 
25 |     with torch.no_grad():
26 |         for speaker_id in tqdm(speaker_ids):
27 |             to_log = []
28 |             for utterance in tqdm(UTTERANCES):
29 |                 inference = inference_function(utterance, speaker_id)
30 |                 to_log.append(
31 |                     wandb.Audio(inference, caption=utterance, sample_rate=22050)
32 |                 )
33 |                 torch.cuda.empty_cache()  # might not be necessary
34 |             wandb.log({f"Speaker {speaker_id}": to_log})
35 | 
36 |     wandb.finish()
37 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/monotonic_align.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import torch
 3 | 
 4 | try:
 5 |     from .monotonic_align.core import maximum_path_c
 6 | 
 7 |     CYTHON = True
 8 | except ModuleNotFoundError:
 9 |     CYTHON = False
10 | 
11 | 
12 | def maximum_path(neg_cent, mask):
13 |     if CYTHON:
14 |         return maximum_path_cython(neg_cent, mask)
15 |     return maximum_path_numpy(neg_cent, mask)
16 | 
17 | 
18 | def maximum_path_cython(neg_cent, mask):
19 |     """Cython optimized version.
20 |     neg_cent: [b, t_t, t_s]
21 |     mask: [b, t_t, t_s]
22 |     """
23 |     device = neg_cent.device
24 |     dtype = neg_cent.dtype
25 |     neg_cent = neg_cent.data.cpu().numpy().astype(np.float32)
26 |     path = np.zeros(neg_cent.shape, dtype=np.int32)
27 | 
28 |     t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
29 |     t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)
30 |     maximum_path_c(path, neg_cent, t_t_max, t_s_max)
31 |     return torch.from_numpy(path).to(device=device, dtype=dtype)
32 | 
33 | 
34 | def maximum_path_numpy(value, mask, max_neg_val=None):
35 |     """
36 |     Monotonic alignment search algorithm
37 |     Numpy-friendly version. It's about 4 times faster than the torch version.
38 |     value: [b, t_x, t_y]
39 |     mask: [b, t_x, t_y]
40 |     """
41 |     if max_neg_val is None:
42 |         max_neg_val = -np.inf  # Patch for Sphinx complaint
43 |     value = value * mask
44 | 
45 |     device = value.device
46 |     dtype = value.dtype
47 |     value = value.cpu().detach().numpy()
48 |     mask = mask.cpu().detach().numpy().astype(bool)  # np.bool was removed in NumPy 1.24
49 | 
50 |     b, t_x, t_y = value.shape
51 |     direction = np.zeros(value.shape, dtype=np.int64)
52 |     v = np.zeros((b, t_x), dtype=np.float32)
53 |     x_range = np.arange(t_x, dtype=np.float32).reshape(1, -1)
54 |     for j in range(t_y):
55 |         v0 = np.pad(v, [[0, 0], [1, 0]], mode="constant", constant_values=max_neg_val)[
56 |             :, :-1
57 |         ]
58 |         v1 = v
59 |         max_mask = v1 >= v0
60 |         v_max = np.where(max_mask, v1, v0)
61 |         direction[:, :, j] = max_mask
62 | 
63 |         index_mask = x_range <= j
64 |         v = np.where(index_mask, v_max + value[:, :, j], max_neg_val)
65 |     direction = np.where(mask, direction, 1)
66 | 
67 |     path = np.zeros(value.shape, dtype=np.float32)
68 |     index = mask[:, :, 0].sum(1).astype(np.int64) - 1
69 |     index_range = np.arange(b)
70 |     for j in reversed(range(t_y)):
71 |         path[index_range, index, j] = 1
72 |         index = index + direction[index_range, index, j] - 1
73 |     path = path * mask.astype(np.float32)
74 |     path = torch.from_numpy(path).to(device=device, dtype=dtype)
75 |     return path
76 | 
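
A minimal sketch of maximum_path, which returns a hard monotonic alignment from the pairwise scores above (assuming the package is importable; without the compiled Cython extension it falls back to the numpy implementation):

import torch
from uberduck_ml_dev.monotonic_align import maximum_path

neg_cent = torch.randn(1, 6, 6)      # pairwise log-likelihood scores for one utterance
mask = torch.ones(1, 6, 6)           # valid positions
path = maximum_path(neg_cent, mask)  # 0/1 matrix tracing the best monotonic alignment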


--------------------------------------------------------------------------------
/uberduck_ml_dev/optimizers/radam.py:
--------------------------------------------------------------------------------
  1 | # Original source taken from https://github.com/LiyuanLucasLiu/RAdam
  2 | #
  3 | # Copyright 2019 Liyuan Liu
  4 | #
  5 | #   Licensed under the Apache License, Version 2.0 (the "License");
  6 | #   you may not use this file except in compliance with the License.
  7 | #   You may obtain a copy of the License at
  8 | #
  9 | #       http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | #   Unless required by applicable law or agreed to in writing, software
 12 | #   distributed under the License is distributed on an "AS IS" BASIS,
 13 | #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | #   See the License for the specific language governing permissions and
 15 | #   limitations under the License.
 16 | import math
 17 | 
 18 | import torch
 19 | 
 20 | # pylint: disable=no-name-in-module
 21 | from torch.optim.optimizer import Optimizer
 22 | 
 23 | 
 24 | class RAdam(Optimizer):
 25 |     """RAdam optimizer"""
 26 | 
 27 |     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
 28 |         """
 29 |         Init
 30 | 
 31 |         :param params: parameters to optimize
 32 |         :param lr: learning rate
 33 |         :param betas: beta
 34 |         :param eps: numerical precision
 35 |         :param weight_decay: weight decay weight
 36 |         """
 37 |         defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
 38 |         self.buffer = [[None, None, None] for _ in range(10)]
 39 |         super().__init__(params, defaults)
 40 | 
 41 |     def step(self, closure=None):
 42 |         loss = None
 43 |         if closure is not None:
 44 |             loss = closure()
 45 | 
 46 |         for group in self.param_groups:
 47 |             for p in group["params"]:
 48 |                 if p.grad is None:
 49 |                     continue
 50 |                 grad = p.grad.data.float()
 51 |                 if grad.is_sparse:
 52 |                     raise RuntimeError("RAdam does not support sparse gradients")
 53 | 
 54 |                 p_data_fp32 = p.data.float()
 55 | 
 56 |                 state = self.state[p]
 57 | 
 58 |                 if len(state) == 0:
 59 |                     state["step"] = 0
 60 |                     state["exp_avg"] = torch.zeros_like(p_data_fp32)
 61 |                     state["exp_avg_sq"] = torch.zeros_like(p_data_fp32)
 62 |                 else:
 63 |                     state["exp_avg"] = state["exp_avg"].type_as(p_data_fp32)
 64 |                     state["exp_avg_sq"] = state["exp_avg_sq"].type_as(p_data_fp32)
 65 | 
 66 |                 exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
 67 |                 beta1, beta2 = group["betas"]
 68 | 
 69 |                 exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
 70 |                 exp_avg.mul_(beta1).add_(1 - beta1, grad)
 71 | 
 72 |                 state["step"] += 1
 73 |                 buffered = self.buffer[int(state["step"] % 10)]
 74 |                 if state["step"] == buffered[0]:
 75 |                     N_sma, step_size = buffered[1], buffered[2]
 76 |                 else:
 77 |                     buffered[0] = state["step"]
 78 |                     beta2_t = beta2 ** state["step"]
 79 |                     N_sma_max = 2 / (1 - beta2) - 1
 80 |                     N_sma = N_sma_max - 2 * state["step"] * beta2_t / (1 - beta2_t)
 81 |                     buffered[1] = N_sma
 82 | 
 83 |                     # more conservative since it's an approximated value
 84 |                     if N_sma >= 5:
 85 |                         step_size = (
 86 |                             group["lr"]
 87 |                             * math.sqrt(
 88 |                                 (1 - beta2_t)
 89 |                                 * (N_sma - 4)
 90 |                                 / (N_sma_max - 4)
 91 |                                 * (N_sma - 2)
 92 |                                 / N_sma
 93 |                                 * N_sma_max
 94 |                                 / (N_sma_max - 2)
 95 |                             )
 96 |                             / (1 - beta1 ** state["step"])
 97 |                         )
 98 |                     else:
 99 |                         step_size = group["lr"] / (1 - beta1 ** state["step"])
100 |                     buffered[2] = step_size
101 | 
102 |                 if group["weight_decay"] != 0:
103 |                     p_data_fp32.add_(-group["weight_decay"] * group["lr"], p_data_fp32)
104 | 
105 |                 # more conservative since it's an approximated value
106 |                 if N_sma >= 5:
107 |                     denom = exp_avg_sq.sqrt().add_(group["eps"])
108 |                     p_data_fp32.addcdiv_(-step_size, exp_avg, denom)
109 |                 else:
110 |                     p_data_fp32.add_(-step_size, exp_avg)
111 | 
112 |                 p.data.copy_(p_data_fp32)
113 | 
114 |         return loss
115 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/text/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/text/__init__.py


--------------------------------------------------------------------------------
/uberduck_ml_dev/text/abbreviations.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | _no_period_re = re.compile(r"(No[.])(?=[ ]?[0-9])")
 4 | _percent_re = re.compile(r"([ ]?[%])")
 5 | _half_re = re.compile("([0-9]½)|(½)")
 6 | 
 7 | 
 8 | # List of (regular expression, replacement) pairs for abbreviations:
 9 | _abbreviations = [
10 |     (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
11 |     for x in [
12 |         ("mrs", "misess"),
13 |         ("ms", "miss"),
14 |         ("mr", "mister"),
15 |         ("dr", "doctor"),
16 |         ("st", "saint"),
17 |         ("co", "company"),
18 |         ("jr", "junior"),
19 |         ("maj", "major"),
20 |         ("gen", "general"),
21 |         ("drs", "doctors"),
22 |         ("rev", "reverend"),
23 |         ("lt", "lieutenant"),
24 |         ("hon", "honorable"),
25 |         ("sgt", "sergeant"),
26 |         ("capt", "captain"),
27 |         ("esq", "esquire"),
28 |         ("ltd", "limited"),
29 |         ("col", "colonel"),
30 |         ("ft", "fort"),
31 |     ]
32 | ]
33 | 
34 | 
35 | def _expand_no_period(m):
36 |     word = m.group(0)
37 |     if word[0] == "N":
38 |         return "Number"
39 |     return "number"
40 | 
41 | 
42 | def _expand_percent(m):
43 |     return " percent"
44 | 
45 | 
46 | def _expand_half(m):
47 |     word = m.group(1)
48 |     if word is None:
49 |         return "half"
50 |     return word[0] + " and a half"
51 | 
52 | 
53 | def normalize_abbreviations(text):
54 |     text = re.sub(_no_period_re, _expand_no_period, text)
55 |     text = re.sub(_percent_re, _expand_percent, text)
56 |     text = re.sub(_half_re, _expand_half, text)
57 |     return text
58 | 
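
A short example of what normalize_abbreviations handles, i.e. the numbered-item, percent, and one-half patterns defined above (assuming the package is importable):

from uberduck_ml_dev.text.abbreviations import normalize_abbreviations

normalize_abbreviations("No. 7 is 50% done, about 3½ hours left")
# -> "Number 7 is 50 percent done, about 3 and a half hours left"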


--------------------------------------------------------------------------------
/uberduck_ml_dev/text/acronyms.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from .cmudict import CMUDict
 3 | 
 4 | _letter_to_arpabet = {
 5 |     "A": "EY1",
 6 |     "B": "B IY1",
 7 |     "C": "S IY1",
 8 |     "D": "D IY1",
 9 |     "E": "IY1",
10 |     "F": "EH1 F",
11 |     "G": "JH IY1",
12 |     "H": "EY1 CH",
13 |     "I": "AY1",
14 |     "J": "JH EY1",
15 |     "K": "K EY1",
16 |     "L": "EH1 L",
17 |     "M": "EH1 M",
18 |     "N": "EH1 N",
19 |     "O": "OW1",
20 |     "P": "P IY1",
21 |     "Q": "K Y UW1",
22 |     "R": "AA1 R",
23 |     "S": "EH1 S",
24 |     "T": "T IY1",
25 |     "U": "Y UW1",
26 |     "V": "V IY1",
27 |     "X": "EH1 K S",
28 |     "Y": "W AY1",
29 |     "W": "D AH1 B AH0 L Y UW0",
30 |     "Z": "Z IY1",
31 |     "s": "Z",
32 | }
33 | 
34 | # must ignore roman numerals
35 | # _acronym_re = re.compile(r'([A-Z][A-Z]+)s?|([A-Z]\.([A-Z]\.)+s?)')
36 | _acronym_re = re.compile(r"([A-Z][A-Z]+)s?")
37 | 
38 | 
39 | class AcronymNormalizer(object):
40 |     def __init__(self, phoneme_dict):
41 |         self.phoneme_dict = phoneme_dict
42 | 
43 |     def normalize_acronyms(self, text):
44 |         def _expand_acronyms(m, add_spaces=True):
45 |             acronym = m.group(0)
46 |             # remove dots if they exist
47 |             acronym = re.sub(r"\.", "", acronym)
48 | 
49 |             acronym = "".join(acronym.split())
50 |             arpabet = self.phoneme_dict.lookup(acronym)
51 | 
52 |             if arpabet is None:
53 |                 acronym = list(acronym)
54 |                 arpabet = ["{" + _letter_to_arpabet[letter] + "}" for letter in acronym]
55 |                 # temporary fix
56 |                 if arpabet[-1] == "{Z}" and len(arpabet) > 1:
57 |                     arpabet[-2] = arpabet[-2][:-1] + " " + arpabet[-1][1:]
58 |                     del arpabet[-1]
59 |                 arpabet = " ".join(arpabet)
60 |             elif len(arpabet) == 1:
61 |                 arpabet = "{" + arpabet[0] + "}"
62 |             else:
63 |                 arpabet = acronym
64 |             return arpabet
65 | 
66 |         text = re.sub(_acronym_re, _expand_acronyms, text)
67 |         return text
68 | 
69 |     def __call__(self, text):
70 |         return self.normalize_acronyms(text)
71 | 
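
A short sketch of AcronymNormalizer with a stand-in phoneme dictionary (the _EmptyLookup class below is a hypothetical stub; in the repo a CMUDict instance would normally be passed so that known acronyms resolve from the dictionary first):

from uberduck_ml_dev.text.acronyms import AcronymNormalizer

class _EmptyLookup:
    def lookup(self, word):
        return None    # force letter-by-letter ARPAbet spelling

normalizer = AcronymNormalizer(_EmptyLookup())
normalizer("The NASA launch")
# -> "The {EH1 N} {EY1} {EH1 S} {EY1} launch"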


--------------------------------------------------------------------------------
/uberduck_ml_dev/text/cleaners.py:
--------------------------------------------------------------------------------
  1 | """ adapted from https://github.com/keithito/tacotron """
  2 | 
  3 | """
  4 | Cleaners are transformations that run over the input text at both training and eval time.
  5 | 
  6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
  7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use:
  8 |     1. "english_cleaners" for English text
  9 |     2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
 10 |          the Unidecode library (https://pypi.python.org/pypi/Unidecode)
 11 |     3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
 12 |          the symbols in symbols.py to match your data).
 13 | """
 14 | 
 15 | import re
 16 | from string import punctuation
 17 | from functools import reduce
 18 | from unidecode import unidecode
 19 | from .numerical import normalize_numbers, normalize_currency
 20 | from .acronyms import AcronymNormalizer
 21 | from .datestime import normalize_datestime
 22 | from .letters_and_numbers import normalize_letters_and_numbers
 23 | from .abbreviations import normalize_abbreviations
 24 | 
 25 | 
 26 | # Regular expression matching whitespace:
 27 | _whitespace_re = re.compile(r"\s+")
 28 | 
 29 | # Regular expression separating words enclosed in curly braces for cleaning
 30 | _arpa_re = re.compile(r"{[^}]+}|\S+")
 31 | 
 32 | 
 33 | def expand_abbreviations(text):
 34 |     return normalize_abbreviations(text)
 35 | 
 36 | 
 37 | def expand_numbers(text):
 38 |     return normalize_numbers(text)
 39 | 
 40 | 
 41 | def expand_currency(text):
 42 |     return normalize_currency(text)
 43 | 
 44 | 
 45 | def expand_datestime(text):
 46 |     return normalize_datestime(text)
 47 | 
 48 | 
 49 | def expand_letters_and_numbers(text):
 50 |     return normalize_letters_and_numbers(text)
 51 | 
 52 | 
 53 | def lowercase(text):
 54 |     return text.lower()
 55 | 
 56 | 
 57 | def collapse_whitespace(text):
 58 |     return re.sub(_whitespace_re, " ", text)
 59 | 
 60 | 
 61 | def separate_acronyms(text):
 62 |     text = re.sub(r"([0-9]+)([a-zA-Z]+)", r"\1 \2", text)
 63 |     text = re.sub(r"([a-zA-Z]+)([0-9]+)", r"\1 \2", text)
 64 |     return text
 65 | 
 66 | 
 67 | def convert_to_ascii(text):
 68 |     return unidecode(text)
 69 | 
 70 | 
 71 | def dehyphenize_compound_words(text):
 72 |     text = re.sub(r"(?<=[a-zA-Z0-9])-(?=[a-zA-Z])", " ", text)
 73 |     return text
 74 | 
 75 | 
 76 | def remove_space_before_punctuation(text):
 77 |     return re.sub(r"\s([{}](?:\s|$))".format(punctuation), r"\1", text)
 78 | 
 79 | 
 80 | class Cleaner(object):
 81 |     def __init__(self, cleaner_names, phonemedict):
 82 |         self.cleaner_names = cleaner_names
 83 |         self.phonemedict = phonemedict
 84 |         self.acronym_normalizer = AcronymNormalizer(self.phonemedict)
 85 | 
 86 |     def __call__(self, text):
 87 |         for cleaner_name in self.cleaner_names:
 88 |             sequence_fns, word_fns = self.get_cleaner_fns(cleaner_name)
 89 |             for fn in sequence_fns:
 90 |                 text = fn(text)
 91 | 
 92 |             text = [
 93 |                 reduce(lambda x, y: y(x), word_fns, split) if split[0] != "{" else split
 94 |                 for split in _arpa_re.findall(text)
 95 |             ]
 96 |             text = " ".join(text)
 97 |         text = remove_space_before_punctuation(text)
 98 |         return text
 99 | 
100 |     def get_cleaner_fns(self, cleaner_name):
101 |         if cleaner_name == "basic_cleaners":
102 |             sequence_fns = [lowercase, collapse_whitespace]
103 |             word_fns = []
104 |         elif cleaner_name == "english_cleaners":
105 |             sequence_fns = [collapse_whitespace, convert_to_ascii, lowercase]
106 |             word_fns = [expand_numbers, expand_abbreviations]
107 |         elif cleaner_name == "radtts_cleaners":
108 |             sequence_fns = [
109 |                 collapse_whitespace,
110 |                 expand_currency,
111 |                 expand_datestime,
112 |                 expand_letters_and_numbers,
113 |             ]
114 |             word_fns = [expand_numbers, expand_abbreviations]
115 |         elif cleaner_name == "transliteration_cleaners":
116 |             sequence_fns, word_fns = [convert_to_ascii, lowercase, collapse_whitespace], []
117 |         else:
118 |             raise Exception("{} cleaner not supported".format(cleaner_name))
119 | 
120 |         return sequence_fns, word_fns
121 | 
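122 | # Illustrative usage sketch: "basic_cleaners" never touches the phoneme dictionary,
123 | # so phonemedict=None is enough for a quick smoke test.
124 | if __name__ == "__main__":
125 |     cleaner = Cleaner(["basic_cleaners"], phonemedict=None)
126 |     print(cleaner("  Hello,   World !  "))  # -> "hello, world!"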


--------------------------------------------------------------------------------
/uberduck_ml_dev/text/cmudict.py:
--------------------------------------------------------------------------------
  1 | __all__ = ["CMUDict", "valid_symbols"]
  2 | 
  3 | 
  4 | """ from https://github.com/keithito/tacotron """
  5 | 
  6 | import re
  7 | 
  8 | 
  9 | valid_symbols = [
 10 |     "AA",
 11 |     "AA0",
 12 |     "AA1",
 13 |     "AA2",
 14 |     "AE",
 15 |     "AE0",
 16 |     "AE1",
 17 |     "AE2",
 18 |     "AH",
 19 |     "AH0",
 20 |     "AH1",
 21 |     "AH2",
 22 |     "AO",
 23 |     "AO0",
 24 |     "AO1",
 25 |     "AO2",
 26 |     "AW",
 27 |     "AW0",
 28 |     "AW1",
 29 |     "AW2",
 30 |     "AY",
 31 |     "AY0",
 32 |     "AY1",
 33 |     "AY2",
 34 |     "B",
 35 |     "CH",
 36 |     "D",
 37 |     "DH",
 38 |     "EH",
 39 |     "EH0",
 40 |     "EH1",
 41 |     "EH2",
 42 |     "ER",
 43 |     "ER0",
 44 |     "ER1",
 45 |     "ER2",
 46 |     "EY",
 47 |     "EY0",
 48 |     "EY1",
 49 |     "EY2",
 50 |     "F",
 51 |     "G",
 52 |     "HH",
 53 |     "IH",
 54 |     "IH0",
 55 |     "IH1",
 56 |     "IH2",
 57 |     "IY",
 58 |     "IY0",
 59 |     "IY1",
 60 |     "IY2",
 61 |     "JH",
 62 |     "K",
 63 |     "L",
 64 |     "M",
 65 |     "N",
 66 |     "NG",
 67 |     "OW",
 68 |     "OW0",
 69 |     "OW1",
 70 |     "OW2",
 71 |     "OY",
 72 |     "OY0",
 73 |     "OY1",
 74 |     "OY2",
 75 |     "P",
 76 |     "R",
 77 |     "S",
 78 |     "SH",
 79 |     "T",
 80 |     "TH",
 81 |     "UH",
 82 |     "UH0",
 83 |     "UH1",
 84 |     "UH2",
 85 |     "UW",
 86 |     "UW0",
 87 |     "UW1",
 88 |     "UW2",
 89 |     "V",
 90 |     "W",
 91 |     "Y",
 92 |     "Z",
 93 |     "ZH",
 94 | ]
 95 | 
 96 | _valid_symbol_set = set(valid_symbols)
 97 | 
 98 | 
 99 | class CMUDict:
100 |     """Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict"""
101 | 
102 |     def __init__(self, file_or_path, keep_ambiguous=True):
103 |         if isinstance(file_or_path, str):
104 |             with open(file_or_path, encoding="latin-1") as f:
105 |                 entries = _parse_cmudict(f)
106 |         else:
107 |             entries = _parse_cmudict(file_or_path)
108 |         if not keep_ambiguous:
109 |             entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
110 |         self._entries = entries
111 | 
112 |     def __len__(self):
113 |         return len(self._entries)
114 | 
115 |     def lookup(self, word):
116 |         """Returns list of ARPAbet pronunciations of the given word."""
117 |         return self._entries.get(word.upper())
118 | 
119 | 
120 | _alt_re = re.compile(r"\([0-9]+\)")
121 | 
122 | 
123 | def _parse_cmudict(file):
124 |     cmudict = {}
125 |     for line in file:
126 |         if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
127 |             parts = line.split("  ")
128 |             word = re.sub(_alt_re, "", parts[0])
129 |             pronunciation = _get_pronunciation(parts[1])
130 |             if pronunciation:
131 |                 if word in cmudict:
132 |                     cmudict[word].append(pronunciation)
133 |                 else:
134 |                     cmudict[word] = [pronunciation]
135 |     return cmudict
136 | 
137 | 
138 | def _get_pronunciation(s):
139 |     parts = s.strip().split(" ")
140 |     for part in parts:
141 |         if part not in _valid_symbol_set:
142 |             return None
143 |     return " ".join(parts)
144 | 
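145 | # Illustrative usage sketch: CMUDict also accepts an open file-like object, so a
146 | # small in-memory dictionary is enough for a quick check.
147 | if __name__ == "__main__":
148 |     from io import StringIO
149 | 
150 |     cmu = CMUDict(StringIO("HELLO  HH AH0 L OW1\n"))
151 |     print(cmu.lookup("hello"))  # -> ['HH AH0 L OW1']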


--------------------------------------------------------------------------------
/uberduck_ml_dev/text/datestime.py:
--------------------------------------------------------------------------------
 1 | """ adapted from https://github.com/keithito/tacotron """
 2 | 
 3 | import re
 4 | 
 5 | _ampm_re = re.compile(r"([0-9]|0[0-9]|1[0-9]|2[0-3]):?([0-5][0-9])?\s*([AaPp][Mm]\b)")
 6 | 
 7 | 
 8 | def _expand_ampm(m):
 9 |     matches = list(m.groups(0))
10 |     txt = matches[0]
11 |     txt = txt if int(matches[1]) == 0 else txt + " " + matches[1]
12 | 
13 |     if matches[2][0].lower() == "a":
14 |         txt += " a.m."
15 |     elif matches[2][0].lower() == "p":
16 |         txt += " p.m."
17 | 
18 |     return txt
19 | 
20 | 
21 | def normalize_datestime(text):
22 |     text = re.sub(_ampm_re, _expand_ampm, text)
23 |     # text = re.sub(r"([0-9]|0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])?", r"\1 \2", text)
24 |     return text
25 | 
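26 | # Illustrative usage sketch; the sample sentence is arbitrary:
27 | if __name__ == "__main__":
28 |     print(normalize_datestime("See you at 10:30pm tomorrow"))  # -> "See you at 10 30 p.m. tomorrow"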


--------------------------------------------------------------------------------
/uberduck_ml_dev/text/grapheme_dictionary.py:
--------------------------------------------------------------------------------
 1 | # NOTE (Sam): synthesize with other methods
 2 | 
 3 | """ adapted from https://github.com/keithito/tacotron """
 4 | 
 5 | import re
 6 | 
 7 | _alt_re = re.compile(r"\([0-9]+\)")
 8 | 
 9 | 
10 | class Grapheme2PhonemeDictionary:
11 |     """Thin wrapper around g2p data."""
12 | 
13 |     def __init__(self, file_or_path, keep_ambiguous=True, encoding="latin-1"):
14 |         with open(file_or_path, encoding=encoding) as f:
15 |             entries = _parse_g2p(f)
16 |         if not keep_ambiguous:
17 |             entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
18 |         self._entries = entries
19 | 
20 |     def __len__(self):
21 |         return len(self._entries)
22 | 
23 |     def lookup(self, word):
24 |         """Returns list of pronunciations of the given word."""
25 |         return self._entries.get(word.upper())
26 | 
27 | 
28 | def _parse_g2p(file):
29 |     g2p = {}
30 |     for line in file:
31 |         if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
32 |             parts = line.split("  ")
33 |             word = re.sub(_alt_re, "", parts[0])
34 |             pronunciation = parts[1].strip()
35 |             if word in g2p:
36 |                 g2p[word].append(pronunciation)
37 |             else:
38 |                 g2p[word] = [pronunciation]
39 |     return g2p
40 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/text/heteronyms:
--------------------------------------------------------------------------------
  1 | abject
  2 | abrogate
  3 | absent
  4 | abstract
  5 | abuse
  6 | ache
  7 | acre
  8 | acuminate
  9 | addict
 10 | address
 11 | adduct
 12 | adele
 13 | advocate
 14 | affect
 15 | affiliate
 16 | agape
 17 | aged
 18 | agglomerate
 19 | aggregate
 20 | agonic
 21 | agora
 22 | allied
 23 | ally
 24 | alternate
 25 | alum
 26 | am
 27 | analyses
 28 | andrea
 29 | animate
 30 | apply
 31 | appropriate
 32 | approximate
 33 | ares
 34 | arithmetic
 35 | arsenic
 36 | articulate
 37 | associate
 38 | attribute
 39 | august
 40 | axes
 41 | ay
 42 | aye
 43 | bases
 44 | bass
 45 | bathed
 46 | bested
 47 | bifurcate
 48 | blessed
 49 | blotto
 50 | bow
 51 | bowed
 52 | bowman
 53 | brassy
 54 | buffet
 55 | bustier
 56 | carbonate
 57 | celtic
 58 | choral
 59 | chumash
 60 | close
 61 | closer
 62 | coax
 63 | coincidence
 64 | color coordinate
 65 | colour coordinate
 66 | comber
 67 | combine
 68 | combs
 69 | committee
 70 | commune
 71 | compact
 72 | complex
 73 | compound
 74 | compress
 75 | concert
 76 | conduct
 77 | confine
 78 | confines
 79 | conflict
 80 | conglomerate
 81 | conscript
 82 | conserve
 83 | consist
 84 | console
 85 | consort
 86 | construct
 87 | consult
 88 | consummate
 89 | content
 90 | contest
 91 | contract
 92 | contracts
 93 | contrast
 94 | converse
 95 | convert
 96 | convict
 97 | coop
 98 | coordinate
 99 | covey
100 | crooked
101 | curate
102 | cussed
103 | decollate
104 | decrease
105 | defect
106 | defense
107 | delegate
108 | deliberate
109 | denier
110 | desert
111 | detail
112 | deviate
113 | diagnoses
114 | diffuse
115 | digest
116 | discard
117 | discharge
118 | discount
119 | do
120 | document
121 | does
122 | dogged
123 | domesticate
124 | dominican
125 | dove
126 | dr
127 | drawer
128 | duplicate
129 | egress
130 | ejaculate
131 | eject
132 | elaborate
133 | ellipses
134 | email
135 | emu
136 | entrace
137 | entrance
138 | escort
139 | estimate
140 | eta
141 | etna
142 | evening
143 | excise
144 | excuse
145 | exploit
146 | export
147 | extract
148 | fine
149 | flower
150 | forbear
151 | four-legged
152 | frequent
153 | furrier
154 | gallant
155 | gel
156 | geminate
157 | gillie
158 | glower
159 | gotham
160 | graduate
161 | haggis
162 | heavy
163 | hinder
164 | house
165 | housewife
166 | impact
167 | imped
168 | implant
169 | implement
170 | import
171 | impress
172 | incense
173 | incline
174 | increase
175 | infix
176 | insert
177 | instar
178 | insult
179 | integral
180 | intercept
181 | interchange
182 | interflow
183 | interleaf
184 | intermediate
185 | intern
186 | interspace
187 | intimate
188 | intrigue
189 | invalid
190 | invert
191 | invite
192 | irony
193 | jagged
194 | jesses
195 | julies
196 | kite
197 | laminate
198 | laos
199 | lather
200 | lead
201 | learned
202 | leasing
203 | lech
204 | legitimate
205 | lied
206 | lima
207 | lipread
208 | live
209 | lower
210 | lunged
211 | maas
212 | magdalen
213 | manes
214 | mare
215 | marked
216 | merchandise
217 | merlion
218 | minute
219 | misconduct
220 | misled
221 | misprint
222 | mobile
223 | moderate
224 | mong
225 | moped
226 | moth
227 | mouth
228 | mow
229 | mpg
230 | multiply
231 | mush
232 | nana
233 | nice
234 | nice
235 | number
236 | numerate
237 | nun
238 | object
239 | opiate
240 | ornament
241 | outbox
242 | outcry
243 | outpour
244 | outreach
245 | outride
246 | outright
247 | outside
248 | outwork
249 | overall
250 | overbid
251 | overcall
252 | overcast
253 | overfall
254 | overflow
255 | overhaul
256 | overhead
257 | overlap
258 | overlay
259 | overuse
260 | overweight
261 | overwork
262 | pace
263 | palled
264 | palling
265 | para
266 | pasty
267 | pate
268 | pauline
269 | pedal
270 | peer
271 | perfect
272 | periodic
273 | permit
274 | pervert
275 | pinta
276 | placer
277 | platy
278 | polish
279 | polish
280 | poll
281 | pontificate
282 | postulate
283 | pram
284 | prayer
285 | precipitate
286 | predate
287 | predicate
288 | prefix
289 | preposition
290 | present
291 | pretest
292 | primer
293 | proceeds
294 | produce
295 | progress
296 | project
297 | proportionate
298 | prospect
299 | protest
300 | pussy
301 | putter
302 | putting
303 | quite
304 | ragged
305 | raven
306 | re
307 | read
308 | reading
309 | reading
310 | real
311 | rebel
312 | recall
313 | recap
314 | recitative
315 | recollect
316 | record
317 | recreate
318 | recreation
319 | redress
320 | refill
321 | refund
322 | refuse
323 | reject
324 | relay
325 | remake
326 | repaint
327 | reprint
328 | reread
329 | rerun
330 | resent
331 | reside
332 | resign
333 | respray
334 | resume
335 | retard
336 | retest
337 | retread
338 | rewrite
339 | root
340 | routed
341 | routing
342 | row
343 | rugged
344 | rummy
345 | sais
346 | sake
347 | sambuca
348 | saucier
349 | second
350 | secrete
351 | secreted
352 | secreting
353 | segment
354 | separate
355 | sewer
356 | shirk
357 | shower
358 | sin
359 | skied
360 | slaver
361 | slough
362 | sow
363 | spoof
364 | squid
365 | stingy
366 | subject
367 | subordinate
368 | subvert
369 | supply
370 | supposed
371 | survey
372 | suspect
373 | syringes
374 | tabulate
375 | tales
376 | tarrier
377 | tarry
378 | taxes
379 | taxis
380 | tear
381 | theron
382 | thou
383 | three-legged
384 | tier
385 | tinged
386 | torment
387 | transfer
388 | transform
389 | transplant
390 | transport
391 | transpose
392 | tush
393 | two-legged
394 | unionised
395 | unionized
396 | update
397 | uplift
398 | upset
399 | use
400 | used
401 | vale
402 | violist
403 | viva
404 | ware
405 | whinged
406 | whoop
407 | wicked
408 | wind
409 | windy
410 | wino
411 | won
412 | worsted
413 | wound


--------------------------------------------------------------------------------
/uberduck_ml_dev/text/letters_and_numbers.py:
--------------------------------------------------------------------------------
 1 | """ adapted from https://github.com/keithito/tacotron """
 2 | 
 3 | import re
 4 | 
 5 | _letters_and_numbers_re = re.compile(
 6 |     r"((?:[a-zA-Z]+[0-9]|[0-9]+[a-zA-Z])[a-zA-Z0-9']*)", re.IGNORECASE
 7 | )
 8 | 
 9 | _hardware_re = re.compile(
10 |     r"([0-9]+(?:[.,][0-9]+)?)(?:\s?)(tb|gb|mb|kb|ghz|mhz|khz|hz|mm)", re.IGNORECASE
11 | )
12 | _hardware_key = {
13 |     "tb": "terabyte",
14 |     "gb": "gigabyte",
15 |     "mb": "megabyte",
16 |     "kb": "kilobyte",
17 |     "ghz": "gigahertz",
18 |     "mhz": "megahertz",
19 |     "khz": "kilohertz",
20 |     "hz": "hertz",
21 |     "mm": "millimeter",
22 |     "cm": "centimeter",
23 |     "km": "kilometer",
24 | }
25 | 
26 | _dimension_re = re.compile(
27 |     r"\b(\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?(?:in|inch|m)?)\b|\b(\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?(?:in|inch|m)?)\b"
28 | )
29 | _dimension_key = {"m": "meter", "in": "inch", "inch": "inch"}
30 | 
31 | 
32 | def _expand_letters_and_numbers(m):
33 |     text = re.split(r"(\d+)", m.group(0))
34 | 
35 |     # drop the empty string that re.split leaves at either end
36 |     if text[-1] == "":
37 |         text = text[:-1]
38 |     elif text[0] == "":
39 |         text = text[1:]
40 | 
41 |     # re-join a number and its suffix for cases like 1920s, AK47's, 20th, 1st, 2nd, 3rd, etc.
42 |     if text[-1] in ("'s", "s", "th", "nd", "st", "rd") and text[-2].isdigit():
43 |         text[-2] = text[-2] + text[-1]
44 |         text = text[:-1]
45 | 
46 |     # for combining digits 2 by 2
47 |     new_text = []
48 |     for i in range(len(text)):
49 |         string = text[i]
50 |         if string.isdigit() and len(string) < 5:
51 |             # heuristics
52 |             if len(string) > 2 and string[-2] == "0":
53 |                 if string[-1] == "0":
54 |                     string = [string]
55 |                 else:
56 |                     string = [string[:-3], string[-2], string[-1]]
57 |             elif len(string) % 2 == 0:
58 |                 string = [string[i : i + 2] for i in range(0, len(string), 2)]
59 |             elif len(string) > 2:
60 |                 string = [string[0]] + [
61 |                     string[i : i + 2] for i in range(1, len(string), 2)
62 |                 ]
63 |             new_text.extend(string)
64 |         else:
65 |             new_text.append(string)
66 | 
67 |     text = new_text
68 |     text = " ".join(text)
69 |     return text
70 | 
71 | 
72 | def _expand_hardware(m):
73 |     quantity, measure = m.groups(0)
74 |     measure = _hardware_key[measure.lower()]
75 |     if measure[-1] != "z" and float(quantity.replace(",", "")) > 1:
76 |         return "{} {}s".format(quantity, measure)
77 |     return "{} {}".format(quantity, measure)
78 | 
79 | 
80 | def _expand_dimension(m):
81 |     text = "".join([x for x in m.groups(0) if x != 0])
82 |     text = text.replace(" x ", " by ")
83 |     text = text.replace("x", " by ")
84 |     if text.endswith(tuple(_dimension_key.keys())):
85 |         if text[-2].isdigit():
86 |             text = "{} {}".format(text[:-1], _dimension_key[text[-1:]])
87 |         elif text[-3].isdigit():
88 |             text = "{} {}".format(text[:-2], _dimension_key[text[-2:]])
89 |     return text
90 | 
91 | 
92 | def normalize_letters_and_numbers(text):
93 |     text = re.sub(_hardware_re, _expand_hardware, text)
94 |     text = re.sub(_dimension_re, _expand_dimension, text)
95 |     text = re.sub(_letters_and_numbers_re, _expand_letters_and_numbers, text)
96 |     return text
97 | 
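98 | # Illustrative usage sketch; hardware units like "4GB" are spelled out:
99 | if __name__ == "__main__":
100 |     print(normalize_letters_and_numbers("a 4GB card"))  # -> "a 4 gigabytes card"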


--------------------------------------------------------------------------------
/uberduck_ml_dev/text/numerical.py:
--------------------------------------------------------------------------------
  1 | """ adapted from https://github.com/keithito/tacotron """
  2 | 
  3 | import inflect
  4 | import re
  5 | 
  6 | _magnitudes = ["trillion", "billion", "million", "thousand", "hundred", "m", "b", "t"]
  7 | _magnitudes_key = {"m": "million", "b": "billion", "t": "trillion"}
  8 | _measurements = "(f|c|k|d|m)"
  9 | _measurements_key = {"f": "fahrenheit", "c": "celsius", "k": "thousand", "m": "meters"}
 10 | _currency_key = {"$": "dollar", "£": "pound", "€": "euro", "₩": "won"}
 11 | _inflect = inflect.engine()
 12 | _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
 13 | _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
 14 | _currency_re = re.compile(
 15 |     r"([\$€£₩])([0-9\.\,]*[0-9]+)(?:[ ]?({})(?=[^a-zA-Z]))?".format(
 16 |         "|".join(_magnitudes)
 17 |     ),
 18 |     re.IGNORECASE,
 19 | )
 20 | _measurement_re = re.compile(
 21 |     r"([0-9\.\,]*[0-9]+(\s)?{}\b)".format(_measurements), re.IGNORECASE
 22 | )
 23 | _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
 24 | # _range_re = re.compile(r'(?<=[0-9])+(-)(?=[0-9])+.*?')
 25 | _roman_re = re.compile(
 26 |     r"\b(?=[MDCLXVI]+\b)M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{2,3})\b"
 27 | )  # avoid I
 28 | _multiply_re = re.compile(r"(\b[0-9]+)(x)([0-9]+)")
 29 | _number_re = re.compile(r"[0-9]+'s|[0-9]+s|[0-9]+")
 30 | 
 31 | 
 32 | def _remove_commas(m):
 33 |     return m.group(1).replace(",", "")
 34 | 
 35 | 
 36 | def _expand_decimal_point(m):
 37 |     return m.group(1).replace(".", " point ")
 38 | 
 39 | 
 40 | def _expand_currency(m):
 41 |     currency = _currency_key[m.group(1)]
 42 |     quantity = m.group(2)
 43 |     magnitude = m.group(3)
 44 | 
 45 |     # remove commas from quantity to be able to convert to numerical
 46 |     quantity = quantity.replace(",", "")
 47 | 
 48 |     # check for million, billion, etc...
 49 |     if magnitude is not None and magnitude.lower() in _magnitudes:
 50 |         if len(magnitude) == 1:
 51 |             magnitude = _magnitudes_key[magnitude.lower()]
 52 |         return "{} {} {}".format(_expand_hundreds(quantity), magnitude, currency + "s")
 53 | 
 54 |     parts = quantity.split(".")
 55 |     if len(parts) > 2:
 56 |         return quantity + " " + currency + "s"  # Unexpected format
 57 | 
 58 |     dollars = int(parts[0]) if parts[0] else 0
 59 | 
 60 |     cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
 61 |     if dollars and cents:
 62 |         dollar_unit = currency if dollars == 1 else currency + "s"
 63 |         cent_unit = "cent" if cents == 1 else "cents"
 64 |         return "{} {}, {} {}".format(
 65 |             _expand_hundreds(dollars),
 66 |             dollar_unit,
 67 |             _inflect.number_to_words(cents),
 68 |             cent_unit,
 69 |         )
 70 |     elif dollars:
 71 |         dollar_unit = currency if dollars == 1 else currency + "s"
 72 |         return "{} {}".format(_expand_hundreds(dollars), dollar_unit)
 73 |     elif cents:
 74 |         cent_unit = "cent" if cents == 1 else "cents"
 75 |         return "{} {}".format(_inflect.number_to_words(cents), cent_unit)
 76 |     else:
 77 |         return "zero" + " " + currency + "s"
 78 | 
 79 | 
 80 | def _expand_hundreds(text):
 81 |     number = float(text)
 82 |     if 1000 < number < 10000 and (number % 100 == 0) and (number % 1000 != 0):
 83 |         return _inflect.number_to_words(int(number / 100)) + " hundred"
 84 |     else:
 85 |         return _inflect.number_to_words(text)
 86 | 
 87 | 
 88 | def _expand_ordinal(m):
 89 |     return _inflect.number_to_words(m.group(0))
 90 | 
 91 | 
 92 | def _expand_measurement(m):
 93 |     _, number, measurement = re.split(r"(\d+(?:\.\d+)?)", m.group(0))
 94 |     number = _inflect.number_to_words(number)
 95 |     measurement = "".join(measurement.split())
 96 |     measurement = _measurements_key[measurement.lower()]
 97 |     return "{} {}".format(number, measurement)
 98 | 
 99 | 
100 | def _expand_range(m):
101 |     return " to "
102 | 
103 | 
104 | def _expand_multiply(m):
105 |     left = m.group(1)
106 |     right = m.group(3)
107 |     return "{} by {}".format(left, right)
108 | 
109 | 
110 | def _expand_roman(m):
111 |     # from https://stackoverflow.com/questions/19308177/converting-roman-numerals-to-integers-in-python
112 |     roman_numerals = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
113 |     result = 0
114 |     num = m.group(0)
115 |     for i, c in enumerate(num):
116 |         if (i + 1) == len(num) or roman_numerals[c] >= roman_numerals[num[i + 1]]:
117 |             result += roman_numerals[c]
118 |         else:
119 |             result -= roman_numerals[c]
120 |     return str(result)
121 | 
122 | 
123 | def _expand_number(m):
124 |     _, number, suffix = re.split(r"(\d+(?:'?\d+)?)", m.group(0))
125 |     number = int(number)
126 |     if (
127 |         number > 1000
128 |         and number < 10000
129 |         and (number % 100 == 0)
130 |         and (number % 1000 != 0)
131 |     ):
132 |         text = _inflect.number_to_words(number // 100) + " hundred"
133 |     elif number > 1000 and number < 3000:
134 |         if number == 2000:
135 |             text = "two thousand"
136 |         elif number > 2000 and number < 2010:
137 |             text = "two thousand " + _inflect.number_to_words(number % 100)
138 |         elif number % 100 == 0:
139 |             text = _inflect.number_to_words(number // 100) + " hundred"
140 |         else:
141 |             number = _inflect.number_to_words(
142 |                 number, andword="", zero="oh", group=2
143 |             ).replace(", ", " ")
144 |             number = re.sub(r"-", " ", number)
145 |             text = number
146 |     else:
147 |         number = _inflect.number_to_words(number, andword="and")
148 |         number = re.sub(r"-", " ", number)
149 |         number = re.sub(r",", "", number)
150 |         text = number
151 | 
152 |     if suffix in ("'s", "s"):
153 |         if text[-1] == "y":
154 |             text = text[:-1] + "ies"
155 |         else:
156 |             text = text + suffix
157 | 
158 |     return text
159 | 
160 | 
161 | def normalize_currency(text):
162 |     return re.sub(_currency_re, _expand_currency, text)
163 | 
164 | 
165 | def normalize_numbers(text):
166 |     text = re.sub(_comma_number_re, _remove_commas, text)
167 |     text = re.sub(_currency_re, _expand_currency, text)
168 |     text = re.sub(_decimal_number_re, _expand_decimal_point, text)
169 |     text = re.sub(_ordinal_re, _expand_ordinal, text)
170 |     # text = re.sub(_range_re, _expand_range, text)
171 |     # text = re.sub(_measurement_re, _expand_measurement, text)
172 |     text = re.sub(_roman_re, _expand_roman, text)
173 |     text = re.sub(_multiply_re, _expand_multiply, text)
174 |     text = re.sub(_number_re, _expand_number, text)
175 |     return text
176 | 
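177 | # Illustrative usage sketch (word forms come from the inflect library):
178 | if __name__ == "__main__":
179 |     print(normalize_numbers("He bought 3 cats in 1984."))  # digits become words, years are grouped
180 |     print(normalize_currency("$5.50"))  # -> "five dollars, fifty cents"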


--------------------------------------------------------------------------------
/uberduck_ml_dev/trainer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/trainer/__init__.py


--------------------------------------------------------------------------------
/uberduck_ml_dev/trainer/hifigan/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/trainer/hifigan/__init__.py


--------------------------------------------------------------------------------
/uberduck_ml_dev/trainer/hifigan/train.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from torch.cuda.amp import GradScaler
  3 | from ray.air.integrations.wandb import setup_wandb
  4 | from torch.utils.data import DataLoader
  5 | from torch.nn import functional as F
  6 | 
  7 | from ...data.data import Dataset
  8 | from ...models.rvc.rvc import MultiPeriodDiscriminator
  9 | from ...models.hifigan import MultiDiscriminator
 10 | 
 11 | from ...data.collate import Collate
 12 | from ...losses_rvc import (
 13 |     generator_loss,
 14 |     discriminator_loss,
 15 |     feature_loss,
 16 | )
 17 | from .train_epoch import train_epoch
 18 | from .train_step import train_step
 19 | from ..rvc.train import DEFAULTS as DEFAULTS
 20 | from ...models.hifigan import _load_uninitialized
 21 | 
 22 | 
 23 | def train_func(config: dict, project: str = "rvc"):
 24 |     print("Entering training function")
 25 |     setup_wandb(config, project=project, entity="uberduck-ai", rank_zero_only=False)
 26 |     train_config = config["train"]
 27 |     model_config = config["model"]
 28 |     data_config = config["data"]
 29 | 
 30 |     generator = _load_uninitialized(config_overrides=model_config)
 31 | 
 32 |     # NOTE (Sam): RVC uses a MultiPeriodDiscriminator that includes a single scale discriminator.
 33 |     # The HiFi++ paper indicates that the precise discriminator structure is not important and that reweighting the loss is sufficient.
 34 |     # Vocos uses additional structure.
 35 |     discriminator = MultiDiscriminator(True)
 36 |     discriminator = discriminator.to("cuda")
 37 | 
 38 |     generator_optimizer = torch.optim.AdamW(
 39 |         generator.parameters(),
 40 |         train_config["learning_rate"],
 41 |         betas=train_config["betas"],
 42 |         eps=train_config["eps"],
 43 |     )
 44 | 
 45 |     discriminator_optimizer = torch.optim.AdamW(
 46 |         discriminator.parameters(),
 47 |         train_config["learning_rate"],
 48 |         betas=train_config["betas"],
 49 |         eps=train_config["eps"],
 50 |     )
 51 | 
 52 |     print("Loading checkpoints")
 53 |     # TODO (Sam): move to "warmstart" or "load_checkpoint" functions
 54 |     if train_config["warmstart_G_checkpoint_path"] is not None:
 55 |         generator_checkpoint = torch.load(train_config["warmstart_G_checkpoint_path"])[
 56 |             "generator"
 57 |         ]
 58 |         generator.load_state_dict(
 59 |             generator_checkpoint
 60 |         )  # NOTE (Sam): a handful of "enc_q" decoder states not present - doesn't seem to cause an issue
 61 |     if train_config["warmstart_D_checkpoint_path"] is not None:
 62 |         discriminator_checkpoint = torch.load(
 63 |             train_config["warmstart_D_checkpoint_path"]
 64 |         )["model"]
 65 |         discriminator.load_state_dict(discriminator_checkpoint)
 66 | 
 67 |     generator = generator.cuda()
 68 |     discriminator = discriminator.cuda()
 69 | 
 70 |     models = {"generator": generator, "discriminator": discriminator}
 71 |     print("Loading dataset")
 72 | 
 73 |     train_dataset = Dataset(
 74 |         filelist_path=data_config["filelist_path"],
 75 |         mel_suffix=data_config["mel_suffix"],
 76 |         audio_suffix=data_config["audio_suffix"],
 77 |     )
 78 | 
 79 |     # train_sampler = DistributedBucketSampler(
 80 |     #     train_dataset,
 81 |     #     train_config["batch_size"] * 1,
 82 |     #     [100, 200, 300, 400, 500, 600, 700, 800, 900],  # 16s
 83 |     #     num_replicas=1,
 84 |     #     rank=0,
 85 |     #     shuffle=True,
 86 |     # )
 87 |     train_loader = DataLoader(
 88 |         train_dataset,
 89 |         num_workers=1,
 90 |         shuffle=False,
 91 |         pin_memory=True,
 92 |         collate_fn=Collate(),
 93 |         batch_sampler=None,
 94 |         # batch_sampler=train_sampler,
 95 |         batch_size=train_config["batch_size"],
 96 |         persistent_workers=True,
 97 |         prefetch_factor=8,
 98 |     )
 99 |     optimization_parameters = {
100 |         "optimizers": {
101 |             "generator": generator_optimizer,
102 |             "discriminator": discriminator_optimizer,
103 |         },
104 |         "scaler": GradScaler(),
105 |         # NOTE (Sam): need to pass names rather than vector of losses since arguments differ
106 |         "losses": {
107 |             "l1": {"loss": F.l1_loss, "weight": 1.0},
108 |             "feature": {"loss": feature_loss, "weight": 1.0},
109 |             "generator": {"loss": generator_loss, "weight": 1.0},
110 |             "discriminator": {"loss": discriminator_loss, "weight": 1},
111 |         },
112 |     }
113 | 
114 |     iteration = 0
115 |     start_epoch = 0
116 |     print("Beginning training for ", train_config["epochs"], " epochs")
117 |     for epoch in range(start_epoch, train_config["epochs"]):
118 |         print(f"Epoch: {epoch}")
119 |         iteration = train_epoch(
120 |             train_step,
121 |             train_loader,
122 |             config,
123 |             models,
124 |             optimization_parameters,
125 |             logging_parameters={},
126 |             iteration=iteration,
127 |         )
128 | 
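129 | # Illustrative config sketch: these are the keys read by train_func above and by
130 | # train_step in this trainer; the values shown are hypothetical placeholders.
131 | #   config = {
132 | #       "train": {"learning_rate": 2e-4, "betas": (0.8, 0.99), "eps": 1e-9,
133 | #                 "warmstart_G_checkpoint_path": None, "warmstart_D_checkpoint_path": None,
134 | #                 "epochs": 100, "batch_size": 16, "segment_size": 8192, "c_mel": 45,
135 | #                 "steps_per_sample": 100, "iters_per_checkpoint": 1000,
136 | #                 "output_directory": "outputs"},
137 | #       "model": {...},  # overrides forwarded to _load_uninitialized
138 | #       "data": {"filelist_path": "...", "mel_suffix": "...", "audio_suffix": "...",
139 | #                "hop_size": 256, "n_fft": 1024, "num_mels": 80, "sampling_rate": 22050,
140 | #                "win_size": 1024, "fmin": 0, "fmax": 8000},
141 | #   }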


--------------------------------------------------------------------------------
/uberduck_ml_dev/trainer/hifigan/train_epoch.py:
--------------------------------------------------------------------------------
 1 | def train_epoch(
 2 |     _train_step,
 3 |     dataloader,
 4 |     config,
 5 |     models,
 6 |     optimization_parameters,
 7 |     logging_parameters,
 8 |     iteration,
 9 | ):
10 |     for batch in dataloader:
11 |         print(iteration, "iteration")
12 |         _train_step(
13 |             batch,
14 |             config,
15 |             models,
16 |             optimization_parameters,
17 |             logging_parameters,
18 |             iteration,
19 |         )
20 |         iteration += 1
21 | 
22 |     return iteration
23 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/trainer/hifigan/train_step.py:
--------------------------------------------------------------------------------
  1 | from torch.cuda.amp import autocast
  2 | from ray.air import session
  3 | from datetime import datetime
  4 | from einops import rearrange
  5 | 
  6 | from ...models.rvc.commons import clip_grad_value_, slice_segments
  7 | from ...data.utils import (
  8 |     mel_spectrogram_torch,
  9 |     spec_to_mel_torch,
 10 | )
 11 | from ..log import log
 12 | from ..rvc.save import save_checkpoint
 13 | from ...models.rvc.commons import rand_slice_segments
 14 | 
 15 | from ...data.data import MAX_WAV_VALUE
 16 | 
 17 | 
 18 | # NOTE (Sam): passing dict arguments to functions is a bit of a code smell.
 19 | # TODO (Sam): the data parameters have slightly different names here
 20 | # (e.g. hop_length v hop_size, filter_length v n_fft, num_mels v n_mel_channels, win_length v win_size, mel_fmin v fmin) - unify.
 21 | def train_step(
 22 |     batch, config, models, optimization_parameters, logging_parameters, iteration
 23 | ):
 24 |     data_config = config["data"]
 25 |     train_config = config["train"]
 26 |     generator = models["generator"]
 27 |     discriminator = models["discriminator"]
 28 |     discriminator_optimizer = optimization_parameters["optimizers"]["discriminator"]
 29 |     generator_optimizer = optimization_parameters["optimizers"]["generator"]
 30 |     scaler = optimization_parameters["scaler"]
 31 |     discriminator_loss = optimization_parameters["losses"]["discriminator"]["loss"]
 32 |     # NOTE (Sam): The reason to pass the loss as a parameter rather than import it is to reuse the _train_step function for different losses.
 33 |     l1_loss = optimization_parameters["losses"]["l1"]["loss"]
 34 |     l1_loss_weight = optimization_parameters["losses"]["l1"]["weight"]
 35 |     generator_loss = optimization_parameters["losses"]["generator"]["loss"]
 36 |     generator_loss_weight = optimization_parameters["losses"]["generator"]["weight"]
 37 |     feature_loss = optimization_parameters["losses"]["feature"]["loss"]
 38 |     feature_loss_weight = optimization_parameters["losses"]["feature"]["weight"]
 39 | 
 40 |     batch = batch.to_gpu()
 41 |     mel_slices, ids_slice = rand_slice_segments(
 42 |         batch["mel_padded"],
 43 |         batch["mel_lengths"],
 44 |         train_config["segment_size"] // data_config["hop_size"],
 45 |     )
 46 |     # NOTE (Sam): it looks like audio_hat is a 3-dimensional tensor so that the slice method can be reused between mel and audio.
 47 |     audio_hat = generator(mel_slices)
 48 | 
 49 |     # with autocast(enabled=False):
 50 |     audio_sliced = slice_segments(
 51 |         batch["audio_padded"].unsqueeze(0) / MAX_WAV_VALUE,
 52 |         ids_slice * data_config["hop_size"],
 53 |         train_config["segment_size"],
 54 |     )
 55 | 
 56 |     audio_sliced = rearrange(audio_sliced, "c b t -> b c t")
 57 | 
 58 |     y_d_hat_r, y_d_hat_g, _, _ = discriminator(audio_sliced, audio_hat.detach())
 59 | 
 60 |     loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g)
 61 |     discriminator_optimizer.zero_grad()
 62 |     scaler.scale(loss_disc).backward()
 63 |     scaler.unscale_(discriminator_optimizer)
 64 |     grad_norm_d = clip_grad_value_(discriminator.parameters(), None)
 65 |     scaler.step(discriminator_optimizer)
 66 | 
 67 |     # with autocast(enabled=False):
 68 |     y_hat_mel = mel_spectrogram_torch(
 69 |         audio_hat.float().squeeze(1),
 70 |         data_config["n_fft"],
 71 |         data_config["num_mels"],
 72 |         data_config["sampling_rate"],
 73 |         data_config["hop_size"],
 74 |         data_config["win_size"],
 75 |         data_config["fmin"],
 76 |         data_config["fmax"],
 77 |     )
 78 | 
 79 |     # if train_config["fp16_run"] == True:
 80 |     #     y_hat_mel = y_hat_mel.half()
 81 |     # with autocast(enabled=train_config["fp16_run"]):
 82 |     # NOTE (Sam): y_d_hat_r and y_d_hat_g are lists of discriminator outputs for the real and generated data at the output of each block;
 83 |     # fmap_r and fmap_g are the corresponding feature maps taken earlier in the network.
 84 |     y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = discriminator(
 85 |         audio_sliced,
 86 |         audio_hat,
 87 |     )
 88 | 
 89 |     loss_mel = l1_loss(mel_slices, y_hat_mel) * train_config["c_mel"]
 90 |     loss_fm = feature_loss(fmap_r, fmap_g)
 91 |     loss_gen, losses_gen = generator_loss(y_d_hat_g)
 92 |     # TODO (Sam): put these in a loss_outputs dict like radtts
 93 |     loss_gen_all = (
 94 |         loss_gen * generator_loss_weight
 95 |         + loss_fm * feature_loss_weight
 96 |         + loss_mel * l1_loss_weight
 97 |     )
 98 | 
 99 |     generator_optimizer.zero_grad()
100 |     scaler.scale(loss_gen_all).backward()
101 |     scaler.unscale_(generator_optimizer)
102 |     grad_norm_g = clip_grad_value_(generator.parameters(), None)
103 |     scaler.step(generator_optimizer)
104 |     scaler.update()
105 | 
106 |     print("iteration: ", iteration, datetime.now())
107 |     log_sample = iteration % train_config["steps_per_sample"] == 0
108 |     log_checkpoint = iteration % train_config["iters_per_checkpoint"] == 0
109 | 
110 |     metrics = {
111 |         "generator_total_loss": loss_gen_all,
112 |         "generator_loss": loss_gen,
113 |         "generator_feature_loss": loss_fm,
114 |         "generator_loss_mel": loss_mel,
115 |         # "discriminator_total_loss": loss_disc,
116 |     }
117 | 
118 |     log(metrics)
119 | 
120 |     if log_sample and session.get_world_rank() == 0:
121 |         import numpy as np
122 | 
123 |         audios = {
124 |             "ground_truth": {
125 |                 "audio": audio_sliced[0][0] / np.abs(audio_sliced[0][0].cpu()).max()
126 |             },
127 |             "generated": {"audio": audio_hat[0][0]},
128 |         }
129 |         images = None
130 | 
131 |         log(audios=audios, images=images)
132 |     if log_checkpoint and session.get_world_rank() == 0:
133 |         checkpoint_path = f"{train_config['output_directory']}/model_{iteration}.pt"
134 |         save_checkpoint(
135 |             generator,
136 |             generator_optimizer,
137 |             discriminator,
138 |             discriminator_optimizer,
139 |             iteration,
140 |             checkpoint_path,
141 |         )
142 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/trainer/load.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from collections import OrderedDict
 3 | 
 4 | from torch.utils.data.distributed import DistributedSampler
 5 | from torch.utils.data import DataLoader
 6 | 
 7 | from ..data.data import DataRADTTS as Data
 8 | from ..data.collate import DataCollateRADTTS as DataCollate
 9 | 
10 | 
11 | # TODO (Sam): warmstart should load optimizer state as well.
12 | # load_pretrained should just be the state_dict
13 | def warmstart(
14 |     checkpoint_path, model, include_layers=[], ignore_layers_warmstart=[], strict=False
15 | ):
16 |     pretrained_dict = torch.load(checkpoint_path, map_location="cpu")
17 |     pretrained_dict = pretrained_dict["state_dict"]
18 | 
19 |     is_module = False
20 |     if list(pretrained_dict.keys())[0].startswith("module."):
21 |         is_module = True
22 |     if is_module:
23 |         new_state_dict = OrderedDict()
24 |         for k, v in pretrained_dict.items():
25 |             name = k[7:]  # remove `module.`
26 |             new_state_dict[name] = v
27 |         pretrained_dict = new_state_dict
28 | 
29 |     model_dict = model.state_dict()
30 |     model_dict.update(pretrained_dict)
31 |     model.load_state_dict(model_dict, strict=strict)
32 |     print("Warm started from {}".format(checkpoint_path))
33 |     model.train()
34 |     return model
35 | 
36 | 
37 | def prepare_dataloaders(data_config, n_gpus, batch_size):
38 |     # Get data, data loaders and collate function ready
39 |     ignore_keys = ["training_files", "validation_files"]
40 |     print("initializing training dataloader")
41 |     trainset = Data(
42 |         data_config["training_files"],
43 |         **dict((k, v) for k, v in data_config.items() if k not in ignore_keys),
44 |     )
45 | 
46 |     print("initializing validation dataloader")
47 |     data_config_val = data_config.copy()
48 |     data_config_val["aug_probabilities"] = None  # no aug in val set
49 |     valset = Data(
50 |         data_config["validation_files"],
51 |         **dict((k, v) for k, v in data_config_val.items() if k not in ignore_keys),
52 |         speaker_ids=trainset.speaker_ids,
53 |     )
54 | 
55 |     collate_fn = DataCollate()
56 | 
57 |     train_sampler, shuffle = None, True
58 |     if n_gpus > 1:
59 |         train_sampler, shuffle = DistributedSampler(trainset), False
60 | 
61 |     train_loader = DataLoader(
62 |         trainset,
63 |         num_workers=8,
64 |         shuffle=shuffle,
65 |         sampler=train_sampler,
66 |         batch_size=batch_size,
67 |         pin_memory=False,
68 |         drop_last=True,
69 |         collate_fn=collate_fn,
70 |     )
71 | 
72 |     return train_loader, valset, collate_fn
73 | 
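74 | # Illustrative usage sketch (the checkpoint path and config values are hypothetical):
75 | #   model = warmstart("checkpoints/pretrained.pt", model)
76 | #   train_loader, valset, collate_fn = prepare_dataloaders(data_config, n_gpus=1, batch_size=16)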


--------------------------------------------------------------------------------
/uberduck_ml_dev/trainer/log.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import wandb
 3 | from ray.air import session
 4 | 
 5 | 
 6 | @torch.no_grad()
 7 | def log(metrics=None, audios=None, images=None, sample_rate=22050):
 8 |     if session.get_world_rank() != 0:
 9 |         return
10 |     audios = audios or {}
11 |     images = images or {}
12 |     wandb_metrics = {}
13 |     if metrics is not None:
14 |         wandb_metrics.update(metrics)
15 | 
16 |     for k, v in audios.items():
17 |         wandb_metrics[k] = wandb.Audio(
18 |             v["audio"].cpu(), sample_rate=sample_rate, caption=v.get("caption")
19 |         )
20 | 
21 |     for k, v in images.items():
22 |         wandb_metrics[k] = wandb.Image(v)
23 | 
24 |     wandb.log(wandb_metrics)
25 | 
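26 | # Illustrative usage sketch (assumes an active wandb run inside a Ray Train session;
27 | # the audio tensor is hypothetical):
28 | #   log(metrics={"loss": 0.42},
29 | #       audios={"sample": {"audio": audio_tensor, "caption": "validation"}})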


--------------------------------------------------------------------------------
/uberduck_ml_dev/trainer/radtts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/trainer/radtts/__init__.py


--------------------------------------------------------------------------------
/uberduck_ml_dev/trainer/radtts/load.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from collections import OrderedDict
 3 | 
 4 | from torch.utils.data.distributed import DistributedSampler
 5 | from torch.utils.data import DataLoader
 6 | 
 7 | from ...data.data import DataRADTTS as Data
 8 | from ...data.collate import DataCollateRADTTS as DataCollate
 9 | 
10 | 
11 | # TODO (Sam): warmstart should load optimizer state as well.
12 | # load_pretrained should just be the state_dict
13 | def warmstart(
14 |     checkpoint_path, model, include_layers=[], ignore_layers_warmstart=[], strict=False
15 | ):
16 |     pretrained_dict = torch.load(checkpoint_path, map_location="cpu")
17 |     pretrained_dict = pretrained_dict["state_dict"]
18 | 
19 |     is_module = False
20 |     if list(pretrained_dict.keys())[0].startswith("module."):
21 |         is_module = True
22 |     if is_module:
23 |         new_state_dict = OrderedDict()
24 |         for k, v in pretrained_dict.items():
25 |             name = k[7:]  # remove `module.`
26 |             new_state_dict[name] = v
27 |         pretrained_dict = new_state_dict
28 | 
29 |     model_dict = model.state_dict()
30 |     model_dict.update(pretrained_dict)
31 |     model.load_state_dict(model_dict, strict=strict)
32 |     print("Warm started from {}".format(checkpoint_path))
33 |     model.train()
34 |     return model
35 | 
36 | 
37 | def prepare_dataloaders(data_config, n_gpus, batch_size):
38 |     # Get data, data loaders and collate function ready
39 |     ignore_keys = ["training_files", "validation_files"]
40 |     print("initializing training dataloader")
41 |     trainset = Data(
42 |         data_config["training_files"],
43 |         **dict((k, v) for k, v in data_config.items() if k not in ignore_keys),
44 |     )
45 | 
46 |     print("initializing validation dataloader")
47 |     data_config_val = data_config.copy()
48 |     data_config_val["aug_probabilities"] = None  # no aug in val set
49 |     valset = Data(
50 |         data_config["validation_files"],
51 |         **dict((k, v) for k, v in data_config_val.items() if k not in ignore_keys),
52 |         speaker_ids=trainset.speaker_ids,
53 |     )
54 | 
55 |     collate_fn = DataCollate()
56 | 
57 |     train_sampler, shuffle = None, True
58 |     if n_gpus > 1:
59 |         train_sampler, shuffle = DistributedSampler(trainset), False
60 | 
61 |     train_loader = DataLoader(
62 |         trainset,
63 |         num_workers=data_config["num_workers"],
64 |         shuffle=shuffle,
65 |         sampler=train_sampler,
66 |         batch_size=batch_size,
67 |         pin_memory=False,
68 |         drop_last=True,
69 |         collate_fn=collate_fn,
70 |     )
71 | 
72 |     return train_loader, valset, collate_fn
73 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/trainer/radtts/save.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | 
 4 | def save_checkpoint(model, optimizer, iteration, filepath):
 5 |     print(
 6 |         "Saving model and optimizer state at iteration {} to {}".format(
 7 |             iteration, filepath
 8 |         )
 9 |     )
10 | 
11 |     # NOTE (Sam): learning rate not accessible here
12 |     torch.save(
13 |         {
14 |             "state_dict": model.state_dict(),
15 |             "iteration": iteration,
16 |             "optimizer": optimizer.state_dict(),
17 |         },
18 |         filepath,
19 |     )
20 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/trainer/radtts/train_epoch.py:
--------------------------------------------------------------------------------
 1 | from .train_step import _train_step
 2 | 
 3 | 
 4 | # NOTE (Sam): this version iterates a torch DataLoader; the commented-out lines below show the ray dataset variant.
 5 | def train_epoch(
 6 |     train_dataloader,
 7 |     log_decoder_samples,
 8 |     log_attribute_samples,
 9 |     model,
10 |     optim,
11 |     steps_per_sample,
12 |     scaler,
13 |     iters_per_checkpoint,
14 |     output_directory,
15 |     criterion,
16 |     attention_kl_loss,
17 |     kl_loss_start_iter,
18 |     binarization_start_iter,
19 |     iteration,
20 |     vocoder,
21 | ):
22 |     # def train_epoch(dataset_shard, batch_size, model, optim, steps_per_sample, scaler, scheduler, criterion, attention_kl_loss, kl_loss_start_iter, binarization_start_iter, epoch, iteration):
23 |     # for batch_idx, ray_batch_df in enumerate(
24 |     #     dataset_shard.iter_batches(batch_size=batch_size, prefetch_blocks=6)
25 |     # ):
26 |     # NOTE (Sam): active path: iterate the torch DataLoader (the commented lines above are the ray dataset variant)
27 |     for batch in train_dataloader:
28 |         _train_step(
29 |             # ray_batch_df,
30 |             # NOTE (Sam): batch comes from the torch DataLoader; ray_batch_df above is the ray dataset variant
31 |             batch,
32 |             model,
33 |             optim,
34 |             iteration,
35 |             log_decoder_samples,
36 |             log_attribute_samples,
37 |             steps_per_sample,
38 |             scaler,
39 |             iters_per_checkpoint,
40 |             output_directory,
41 |             criterion,
42 |             attention_kl_loss,
43 |             kl_loss_start_iter,
44 |             binarization_start_iter,
45 |             vocoder,
46 |         )
47 |         iteration += 1
48 | 
49 |     return iteration
50 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/trainer/radtts/train_step.py:
--------------------------------------------------------------------------------
  1 | # NOTE (Sam): for use with ray trainer.
  2 | from datetime import datetime
  3 | 
  4 | import torch
  5 | from torch.cuda.amp import autocast
  6 | from ray.air import session
  7 | 
  8 | from .log import get_log_audio
  9 | from ..log import log
 10 | from .save import save_checkpoint
 11 | from ...utils.utils import (
 12 |     to_gpu,
 13 | )
 14 | 
 15 | 
 16 | # TODO (Sam): it seems like much of this can be made generic for multiple models.
 17 | def _train_step(
 18 |     batch,
 19 |     model,
 20 |     optim,
 21 |     iteration,
 22 |     log_decoder_samples,
 23 |     log_attribute_samples,
 24 |     steps_per_sample,
 25 |     scaler,
 26 |     iters_per_checkpoint,
 27 |     output_directory,
 28 |     criterion,
 29 |     attention_kl_loss,
 30 |     kl_loss_start_iter,
 31 |     binarization_start_iter,
 32 |     vocoder,
 33 | ):
 34 |     print(datetime.now(), "entering train step:", iteration)
 35 |     if iteration >= binarization_start_iter:
 36 |         binarize = True
 37 |     else:
 38 |         binarize = False
 39 | 
 40 |     optim.zero_grad()
 41 | 
 42 |     with autocast(enabled=False):
 43 |         batch_dict = batch  # torch DataLoader?
 44 |         # TODO (Sam): move to batch.to_gpu().
 45 |         mel = to_gpu(batch_dict["mel"])
 46 |         speaker_ids = to_gpu(batch_dict["speaker_ids"])
 47 |         attn_prior = to_gpu(batch_dict["attn_prior"])
 48 |         f0 = to_gpu(batch_dict["f0"])
 49 |         voiced_mask = to_gpu(batch_dict["voiced_mask"])
 50 |         text = to_gpu(batch_dict["text"])
 51 |         in_lens = to_gpu(batch_dict["input_lengths"])
 52 |         out_lens = to_gpu(batch_dict["output_lengths"])
 53 |         energy_avg = to_gpu(batch_dict["energy_avg"])
 54 |         audio_embedding = to_gpu(batch_dict["audio_embedding"])
 55 | 
 56 |         outputs = model(
 57 |             mel,
 58 |             speaker_ids,
 59 |             text,
 60 |             in_lens,
 61 |             out_lens,
 62 |             binarize_attention=binarize,
 63 |             attn_prior=attn_prior,
 64 |             f0=f0,
 65 |             energy_avg=energy_avg,
 66 |             voiced_mask=voiced_mask,
 67 |             audio_embedding=audio_embedding,
 68 |         )
 69 | 
 70 |         loss_outputs = criterion(outputs, in_lens, out_lens)
 71 | 
 72 |         print_list = []
 73 |         loss = None
 74 |         for k, (v, w) in loss_outputs.items():
 75 |             if w > 0:
 76 |                 loss = v * w if loss is None else loss + v * w
 77 |             print_list.append("  |  {}: {:.3f}".format(k, v))
 78 | 
 79 |         w_bin = criterion.loss_weights.get("binarization_loss_weight", 1.0)
 80 |         if binarize and iteration >= kl_loss_start_iter:
 81 |             binarization_loss = attention_kl_loss(outputs["attn"], outputs["attn_soft"])
 82 |             loss += binarization_loss * w_bin
 83 |         else:
 84 |             binarization_loss = torch.zeros_like(loss)
 85 |         loss_outputs["binarization_loss"] = (binarization_loss, w_bin)
 86 |     grad_clip_val = 1.0  # TODO (Sam): make this a config option
 87 |     print(print_list)
 88 |     scaler.scale(loss).backward()
 89 |     if grad_clip_val > 0:
 90 |         scaler.unscale_(optim)
 91 |         torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_val)
 92 | 
 93 |     scaler.step(optim)
 94 |     scaler.update()
 95 | 
 96 |     metrics = {"loss": loss.item()}
 97 |     for k, (v, w) in loss_outputs.items():
 98 |         metrics[k] = v.item()
 99 | 
100 |     print("iteration: ", iteration, datetime.now())
101 |     log_sample = iteration % steps_per_sample == 0
102 |     log_checkpoint = iteration % iters_per_checkpoint == 0
103 | 
104 |     if log_sample and session.get_world_rank() == 0:
105 |         model.eval()
106 |         # TODO (Sam): adding tf output logging and out of distribution inference
107 |         # TODO (Sam): add logging of ground truth
108 |         images, audios = get_log_audio(
109 |             batch_dict,
110 |             log_decoder_samples,
111 |             log_attribute_samples,
112 |             model,
113 |             speaker_ids,
114 |             text,
115 |             f0,
116 |             energy_avg,
117 |             voiced_mask,
118 |             vocoder,
119 |         )
120 |         # TODO (Sam): make out of sample logging cleaner.
121 |         # NOTE (Sam): right now this requires precomputation of embeddings and isn't out of sample zero shot.
122 |         # gt_path = "/usr/src/app/radtts/ground_truth"
123 |         # oos_embs = os.listdir(gt_path)
124 |         # # this doesn't help for reasons described above
125 |         # for oos_name in oos_embs:
126 |         #     audio_embedding_oos = torch.load(f"{gt_path}/{oos_name}").cuda()
127 |         #     _, audios_oos = get_log_audio(
128 |         #         outputs,
129 |         #         batch_dict,
130 |         #         log_decoder_samples,
131 |         #         log_attribute_samples,
132 |         #         model,
133 |         #         speaker_ids,
134 |         #         text,
135 |         #         f0,
136 |         #         energy_avg,
137 |         #         voiced_mask,
138 |         #         vocoder,
139 |         #         oos_name=oos_name,
140 |         #         audio_embedding_oos=audio_embedding_oos,
141 |         #     )
142 |         #     audios.update(audios_oos)
143 |         log(
144 |             metrics,
145 |             audios,
146 |             sample_rate=getattr(vocoder, "sr", 22050),
147 |             images=images,
148 |         )
149 |         model.train()
150 |     else:
151 |         log(metrics)
152 | 
153 |     if log_checkpoint and session.get_world_rank() == 0:
154 |         checkpoint_path = f"{output_directory}/model_{iteration}.pt"
155 |         save_checkpoint(model, optim, iteration, checkpoint_path)
156 | 
157 |     print(f"Loss: {loss.item()}")
158 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/trainer/rvc/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/trainer/rvc/__init__.py


--------------------------------------------------------------------------------
/uberduck_ml_dev/trainer/rvc/save.py:
--------------------------------------------------------------------------------
 1 | # TODO (Sam): combine with radtts save_checkpoint
 2 | import torch
 3 | 
 4 | 
 5 | def save_checkpoint(
 6 |     generator,
 7 |     generator_optimizer,
 8 |     discriminator,
 9 |     discriminator_optimizer,
10 |     iteration,
11 |     filepath,
12 | ):
13 |     print(
14 |         "Saving model and optimizer state at iteration {} to {}".format(
15 |             iteration, filepath
16 |         )
17 |     )
18 | 
19 |     # TODO (Sam): figure out where to put learning rate.
20 |     torch.save(
21 |         {
22 |             "generator_state_dict": generator.state_dict(),
23 |             "iteration": iteration,
24 |             "generator_optimizer": generator_optimizer.state_dict(),
25 |             "discriminator_state_dict": discriminator.state_dict(),
26 |             "discriminator_optimizer": discriminator_optimizer.state_dict(),
27 |         },
28 |         filepath,
29 |     )
30 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/trainer/rvc/train.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from torch.cuda.amp import GradScaler
  3 | from ray.air.integrations.wandb import setup_wandb
  4 | from torch.utils.data import DataLoader
  5 | from torch.nn import functional as F
  6 | 
  7 | from .train_epoch import train_epoch
  8 | from ...models.rvc.rvc import (
  9 |     SynthesizerTrnMs256NSFsid,
 10 |     MultiPeriodDiscriminator,
 11 | )
 12 | from ...vendor.tfcompat.hparam import HParams
 13 | from ...data.data import (
 14 |     TextAudioLoaderMultiNSFsid,
 15 |     DistributedBucketSampler,
 16 | )
 17 | from ...data.collate import TextAudioCollateMultiNSFsid
 18 | from ...losses_rvc import (
 19 |     generator_loss,
 20 |     discriminator_loss,
 21 |     feature_loss,
 22 |     kl_loss,
 23 | )
 24 | from uberduck_ml_dev.trainer.rvc.train_epoch import train_epoch
 25 | 
 26 | 
 27 | def train_func(config: dict, project: str = "rvc"):
 28 |     print("Entering training function")
 29 |     setup_wandb(config, project=project, entity="uberduck-ai", rank_zero_only=False)
 30 |     train_config = config["train"]
 31 |     model_config = config["model"]
 32 |     data_config = config["data"]
 33 | 
 34 |     generator = SynthesizerTrnMs256NSFsid(
 35 |         data_config["filter_length"] // 2 + 1,
 36 |         train_config["segment_size"] // data_config["hop_length"],
 37 |         **model_config,
 38 |         is_half=train_config["fp16_run"],
 39 |         sr=data_config["sampling_rate"],
 40 |     )
 41 | 
 42 |     discriminator = MultiPeriodDiscriminator(model_config["use_spectral_norm"])
 43 |     generator_optimizer = torch.optim.AdamW(
 44 |         generator.parameters(),
 45 |         train_config["learning_rate"],
 46 |         betas=train_config["betas"],
 47 |         eps=train_config["eps"],
 48 |     )
 49 | 
 50 |     discriminator_optimizer = torch.optim.AdamW(
 51 |         discriminator.parameters(),
 52 |         train_config["learning_rate"],
 53 |         betas=train_config["betas"],
 54 |         eps=train_config["eps"],
 55 |     )
 56 | 
 57 |     print("Loading checkpoints")
 58 |     # TODO (Sam): move to "warmstart" or "load_checkpoint" functions
 59 |     generator_checkpoint = torch.load(train_config["warmstart_G_checkpoint_path"])[
 60 |         "model"
 61 |     ]
 62 |     discriminator_checkpoint = torch.load(train_config["warmstart_D_checkpoint_path"])[
 63 |         "model"
 64 |     ]
 65 |     discriminator.load_state_dict(discriminator_checkpoint)
 66 |     generator.load_state_dict(
 67 |         generator_checkpoint, strict=False
 68 |     )  # NOTE (Sam): a handful of "enc_q" decoder states not present
 69 |     generator = generator.cuda()
 70 |     discriminator = discriminator.cuda()
 71 | 
 72 |     models = {"generator": generator, "discriminator": discriminator}
 73 | 
 74 |     print("Loading dataset")
 75 |     train_dataset = TextAudioLoaderMultiNSFsid(
 76 |         train_config["filelist_path"], HParams(**data_config)
 77 |     )  # dv is sid
 78 |     collate_fn = TextAudioCollateMultiNSFsid()
 79 |     n_gpus = 1
 80 |     train_sampler = DistributedBucketSampler(
 81 |         train_dataset,
 82 |         train_config["batch_size"] * n_gpus,
 83 |         [100, 200, 300, 400, 500, 600, 700, 800, 900],  # 16s
 84 |         num_replicas=n_gpus,
 85 |         rank=0,
 86 |         shuffle=True,
 87 |     )
 88 |     train_loader = DataLoader(
 89 |         train_dataset,
 90 |         num_workers=1,
 91 |         shuffle=False,
 92 |         pin_memory=True,
 93 |         collate_fn=collate_fn,
 94 |         batch_sampler=train_sampler,
 95 |         persistent_workers=True,
 96 |         prefetch_factor=8,
 97 |     )
 98 |     optimization_parameters = {
 99 |         "optimizers": {
100 |             "generator": generator_optimizer,
101 |             "discriminator": discriminator_optimizer,
102 |         },
103 |         "scaler": GradScaler(),
104 |         # NOTE (Sam): need to pass names rather than vector of losses since arguments differ
105 |         "losses": {
106 |             "l1": {"loss": F.l1_loss, "weight": 1.0},
107 |             "kl": {"loss": kl_loss, "weight": 1.0},
108 |             "feature": {"loss": feature_loss, "weight": 1.0},
109 |             "generator": {"loss": generator_loss, "weight": 1.0},
110 |             "discriminator": {"loss": discriminator_loss, "weight": 1},
111 |         },
112 |     }
113 | 
114 |     iteration = 0
115 |     start_epoch = 0
116 |     print(f"Beginning training for {train_config['epochs']} epochs")
117 |     for epoch in range(start_epoch, train_config["epochs"]):
118 |         print(f"Epoch: {epoch}")
119 |         iteration = train_epoch(
120 |             train_loader,
121 |             config,
122 |             models,
123 |             optimization_parameters,
124 |             logging_parameters={},
125 |             iteration=iteration,
126 |         )
127 | 
128 | 
129 | # 40k config
130 | DEFAULTS = {
131 |     "log_interval": 200,
132 |     "seed": 1234,
133 |     "epochs": 20000,
134 |     "learning_rate": 1e-4,
135 |     "betas": [0.8, 0.99],
136 |     "eps": 1e-9,
137 |     "batch_size": 4,
138 |     "fp16_run": False,
139 |     "lr_decay": 0.999875,
140 |     "segment_size": 12800,
141 |     "init_lr_ratio": 1,
142 |     "warmup_epochs": 0,
143 |     "c_mel": 45,
144 |     "c_kl": 1.0,
145 |     "steps_per_sample": 100,
146 |     "iters_per_checkpoint": 100,
147 |     "output_directory": "/tmp",
148 | }
149 | 
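For orientation, a sketch of the nested config that train_func reads. The key names follow the lookups above, but the paths and the data/model values are illustrative placeholders, not a tested configuration.

    config = {
        "train": {
            **DEFAULTS,
            "warmstart_G_checkpoint_path": "/path/to/G_pretrained.pth",  # placeholder
            "warmstart_D_checkpoint_path": "/path/to/D_pretrained.pth",  # placeholder
            "filelist_path": "/path/to/filelist.txt",                    # placeholder
        },
        "data": {
            "filter_length": 2048,   # illustrative 40k-style values
            "hop_length": 400,
            "sampling_rate": 40000,
            # plus the remaining fields consumed by TextAudioLoaderMultiNSFsid
        },
        "model": {
            "use_spectral_norm": False,
            # plus the remaining SynthesizerTrnMs256NSFsid keyword arguments
        },
    }
    # train_func(config)  # requires a GPU and the warm-start checkpoints above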


--------------------------------------------------------------------------------
/uberduck_ml_dev/trainer/rvc/train_epoch.py:
--------------------------------------------------------------------------------
 1 | # TODO (Sam): add config arguments to model / optimization / logging and remove.
 2 | from .train_step import _train_step
 3 | 
 4 | 
 5 | def train_epoch(
 6 |     dataloader,
 7 |     config,
 8 |     models,
 9 |     optimization_parameters,
10 |     logging_parameters,
11 |     iteration,
12 | ):
13 |     for batch in dataloader:
14 |         print(iteration, "iteration")
15 |         _train_step(
16 |             batch,
17 |             config,
18 |             models,
19 |             optimization_parameters,
20 |             logging_parameters,
21 |             iteration,
22 |         )
23 |         iteration += 1
24 | 
25 |     return iteration
26 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/utils/__init__.py


--------------------------------------------------------------------------------
/uberduck_ml_dev/utils/config.py:
--------------------------------------------------------------------------------
 1 | from ..models.tacotron2 import DEFAULTS as TACOTRON2_DEFAULTS
 2 | 
 3 | 
 4 | def tacotron2_training_to_model_config(training_config):
 5 |     shared_keys = set(TACOTRON2_DEFAULTS.values().keys()).intersection(
 6 |         training_config.keys()
 7 |     )
 8 |     # NOTE (Sam): only need to save non-default parameters in config unless defaults change.
 9 |     minimal_model_config = {k: training_config[k] for k in shared_keys}
10 |     return minimal_model_config
11 | 
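A small illustrative call; the training-config keys below are examples, and the exact set of retained keys depends on what TACOTRON2_DEFAULTS defines.

    # Hypothetical training config mixing model and trainer-only keys.
    training_config = {"n_mel_channels": 80, "learning_rate": 1e-3, "batch_size": 32}
    model_config = tacotron2_training_to_model_config(training_config)
    # model_config keeps only the keys that also appear in TACOTRON2_DEFAULTS
    # (e.g. n_mel_channels), so only non-default model settings need saving.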


--------------------------------------------------------------------------------
/uberduck_ml_dev/utils/denoiser.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Removes bias from HiFi-Gan and Avocodo (typically heard as noise in the audio)
 3 | 
 4 | Usage:
 5 | from denoiser import Denoiser
 6 | denoiser = Denoiser(HIFIGANGENERATOR, mode="normal") # Experiment with modes "normal" and "zeros"
 7 | 
 8 | # Inference Vocoder
 9 | audio = hifigan.vocoder.forward(output[1][:1])
10 | 
11 | audio = audio.squeeze()
12 | audio = audio * 32768.0
13 | 
14 | # Denoise
15 | audio_denoised = denoiser(audio.view(1, -1), strength=15)[:, 0] # Change strength if needed
16 | 
17 | audio_denoised = audio_denoised.cpu().detach().numpy().reshape(-1)
18 | normalize = (32768.0 / np.max(np.abs(audio_denoised))) ** 0.9
19 | audio_denoised = audio_denoised * normalize
20 | """
21 | 
22 | import sys
23 | import torch
24 | from ..models.common import STFT
25 | 
26 | 
27 | class Denoiser(torch.nn.Module):
28 |     """WaveGlow denoiser, adapted for HiFi-GAN"""
29 | 
30 |     def __init__(
31 |         self, hifigan, filter_length=1024, n_overlap=4, win_length=1024, mode="zeros"
32 |     ):
33 |         super(Denoiser, self).__init__()
34 |         self.stft = STFT(
35 |             filter_length=filter_length,
36 |             hop_length=int(filter_length / n_overlap),
37 |             win_length=win_length,
38 |             device=torch.device("cpu"),
39 |         )
40 | 
41 |         if mode == "zeros":
42 |             mel_input = torch.zeros((1, 80, 88))
43 |         elif mode == "normal":
44 |             mel_input = torch.randn((1, 80, 88))
45 |         else:
46 |             raise Exception("Mode {} is not supported".format(mode))
47 | 
48 |         with torch.no_grad():
49 |             bias_audio = (
50 |                 hifigan.vocoder.forward(mel_input.to(hifigan.device))
51 |                 .view(1, -1)
52 |                 .float()
53 |             )
54 |             bias_spec, _ = self.stft.transform(bias_audio.cpu())
55 | 
56 |         self.register_buffer("bias_spec", bias_spec[:, :, 0][:, :, None])
57 | 
58 |     def forward(self, audio, strength=10):
59 |         """
60 |         Strength is the amount of bias you want to be removed from the final audio.
61 |         Note: A higher strength may remove too much information in the original audio.
62 | 
63 |         :param audio: Audio data
64 |         :param strength: Amount of bias removal. Recommended range 10 - 50
65 |         :return: Denoised audio
66 |         :rtype: tensor
67 |         """
68 | 
69 |         audio_spec, audio_angles = self.stft.transform(audio.cpu())
70 |         audio_spec_denoised = audio_spec - self.bias_spec * strength
71 |         audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0)
72 |         audio_denoised = self.stft.inverse(audio_spec_denoised.cpu(), audio_angles)
73 |         return audio_denoised
74 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/utils/exec.py:
--------------------------------------------------------------------------------
 1 | __all__ = ["parse_args"]
 2 | 
 3 | import argparse
 4 | 
 5 | 
 6 | def parse_args(args):
 7 |     parser = argparse.ArgumentParser()
 8 |     parser.add_argument("--config", help="Path to JSON config")
 9 |     args = parser.parse_args(args)
10 |     return args
11 | 
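Typical invocation from a script entry point (illustrative):

    import sys

    args = parse_args(sys.argv[1:])   # e.g. python train_script.py --config config.json
    config_path = args.config         # None if --config was not passed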


--------------------------------------------------------------------------------
/uberduck_ml_dev/utils/hifiutils.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import os
 3 | import matplotlib
 4 | import torch
 5 | from torch.nn.utils import weight_norm
 6 | 
 7 | matplotlib.use("Agg")
 8 | import matplotlib.pylab as plt
 9 | 
10 | 
11 | def plot_spectrogram(spectrogram):
12 |     fig, ax = plt.subplots(figsize=(10, 2))
13 |     im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
14 |     plt.colorbar(im, ax=ax)
15 | 
16 |     fig.canvas.draw()
17 |     plt.close()
18 | 
19 |     return fig
20 | 
21 | 
22 | def init_weights(m, mean=0.0, std=0.01):
23 |     classname = m.__class__.__name__
24 |     if classname.find("Conv") != -1:
25 |         m.weight.data.normal_(mean, std)
26 | 
27 | 
28 | def apply_weight_norm(m):
29 |     classname = m.__class__.__name__
30 |     if classname.find("Conv") != -1:
31 |         weight_norm(m)
32 | 
33 | 
34 | def get_padding(kernel_size, dilation=1):
35 |     return int((kernel_size * dilation - dilation) / 2)
36 | 
37 | 
38 | def load_checkpoint(filepath, device):
39 |     assert os.path.isfile(filepath)
40 |     print("Loading '{}'".format(filepath))
41 |     checkpoint_dict = torch.load(filepath, map_location=device)
42 |     print("Complete.")
43 |     return checkpoint_dict
44 | 
45 | 
46 | def save_checkpoint(filepath, obj):
47 |     print("Saving checkpoint to {}".format(filepath))
48 |     torch.save(obj, filepath)
49 |     print("Complete.")
50 | 
51 | 
52 | def scan_checkpoint(cp_dir, prefix):
53 |     pattern = os.path.join(cp_dir, prefix + "????????")
54 |     cp_list = glob.glob(pattern)
55 |     if len(cp_list) == 0:
56 |         return None
57 |     return sorted(cp_list)[-1]
58 | 
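A sketch of how these helpers compose when resuming training; the directory and prefix are placeholders. scan_checkpoint matches filenames of the form <prefix> followed by eight characters (e.g. g_00001000).

    import torch

    cp_dir = "/tmp/hifigan_checkpoints"            # placeholder directory
    latest = scan_checkpoint(cp_dir, prefix="g_")  # newest generator checkpoint, or None
    if latest is not None:
        state_dict = load_checkpoint(latest, device=torch.device("cpu"))
    else:
        state_dict = None                          # start from scratch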


--------------------------------------------------------------------------------
/uberduck_ml_dev/utils/plot.py:
--------------------------------------------------------------------------------
  1 | __all__ = [
  2 |     "save_figure_to_numpy",
  3 |     "plot_tensor",
  4 |     "plot_spectrogram",
  5 |     "plot_attention",
  6 |     "plot_attention_phonemes",
  7 |     "plot_gate_outputs",
  8 | ]
  9 | 
 10 | 
 11 | import numpy as np
 12 | import matplotlib
 13 | 
 14 | matplotlib.use("Agg")
 15 | import matplotlib.pyplot as plt
 16 | 
 17 | from ..text.symbols import id_to_symbol, DEFAULT_SYMBOLS
 18 | 
 19 | 
 20 | def save_figure_to_numpy(fig):
 21 |     """Save figure to a numpy array."""
 22 |     data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
 23 |     data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
 24 |     plt.close(fig)
 25 |     return data
 26 | 
 27 | 
 28 | def plot_tensor(tensor):
 29 |     plt.style.use("default")
 30 |     fig, ax = plt.subplots(figsize=(12, 3))
 31 |     im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation="none")
 32 |     plt.colorbar(im, ax=ax)
 33 |     plt.tight_layout()
 34 |     fig.canvas.draw()
 35 |     data = save_figure_to_numpy(fig)
 36 |     plt.close()
 37 |     return data
 38 | 
 39 | 
 40 | def plot_spectrogram(mel):
 41 |     figure = plt.figure()
 42 |     plt.xlabel("Spectrogram frame")
 43 |     plt.ylabel("Channel")
 44 |     plt.imshow(mel, aspect="auto", origin="lower", interpolation="none", cmap="inferno")
 45 |     figure.canvas.draw()
 46 |     return figure
 47 | 
 48 | 
 49 | def plot_attention(attention, encoder_length=None, decoder_length=None):
 50 |     figure = plt.figure()
 51 |     plt.xlabel("Decoder timestep")
 52 |     plt.ylabel("Encoder timestep")
 53 |     plt.imshow(
 54 |         attention.data.cpu().numpy(),
 55 |         aspect="auto",
 56 |         origin="lower",
 57 |         interpolation="none",
 58 |         cmap="inferno",
 59 |     )
 60 |     title_info = []
 61 |     if encoder_length is not None:
 62 |         title_info.append(f"Encoder_length: {encoder_length}")
 63 |     if decoder_length is not None:
 64 |         title_info.append(f"Decoder length: {decoder_length}")
 65 |     title = " ".join(title_info)
 66 |     plt.title(title)
 67 |     figure.canvas.draw()
 68 |     return figure
 69 | 
 70 | 
 71 | def plot_attention_phonemes(seq, attention, symbol_set=DEFAULT_SYMBOLS):
 72 |     figure = plt.figure(figsize=(15, 8))
 73 |     phonemes = []
 74 | 
 75 |     for token in seq.numpy():
 76 |         if token == len(id_to_symbol[symbol_set]):
 77 |             phonemes.append("~")
 78 |         else:
 79 |             phonemes.append(id_to_symbol[symbol_set][token][1:])
 80 | 
 81 |     xtick_locs = np.pad(
 82 |         np.cumsum(np.sum(attention.data.cpu().numpy(), axis=1)), (1, 0)
 83 |     ).astype(np.int16)[:-1]
 84 |     ytick_locs = np.arange(seq.shape[-1])
 85 |     plt.yticks(ytick_locs, phonemes)
 86 |     plt.xticks(xtick_locs, xtick_locs)
 87 | 
 88 |     plt.imshow(
 89 |         attention.data.cpu().numpy(),
 90 |         aspect="auto",
 91 |         origin="lower",
 92 |         interpolation="none",
 93 |         cmap="Greys",
 94 |     )
 95 | 
 96 |     i = 0
 97 |     for phon, y in zip(phonemes, ytick_locs):
 98 |         if phon == "~":
 99 |             continue
100 |         if i == 4:
101 |             plt.axhline(y=y, color="k")
102 |         if i == 3:
103 |             plt.axhline(y=y, color="r")
104 |         if i == 2:
105 |             plt.axhline(y=y, color="g")
106 |         if i == 1:
107 |             plt.axhline(y=y, color="b")
108 |         if i == 0:
109 |             plt.axhline(y=y, color="m")
110 |         i += 1
111 |         i = i % 5
112 | 
113 |     plt.grid(axis="x")
114 |     plt.title("Phoneme Alignment")
115 |     plt.xlabel("Time (mel frames)")
116 |     plt.ylabel("Phonemes")
117 | 
118 |     return figure
119 | 
120 | 
121 | def plot_gate_outputs(gate_targets=None, gate_outputs=None):
122 |     figure = plt.figure()
123 |     plt.xlabel("Frames")
124 |     plt.ylabel("Gate state")
125 |     ax = figure.add_axes([0, 0, 1, 1])
126 |     if gate_targets is not None:
127 |         ax.scatter(
128 |             range(gate_targets.size(0)),
129 |             gate_targets,
130 |             alpha=0.5,
131 |             color="green",
132 |             marker="+",
133 |             s=1,
134 |             label="target",
135 |         )
136 |     if gate_outputs is not None:
137 |         ax.scatter(
138 |             range(gate_outputs.size(0)),
139 |             gate_outputs,
140 |             alpha=0.5,
141 |             color="red",
142 |             marker=".",
143 |             s=1,
144 |             label="predicted",
145 |         )
146 |     figure.canvas.draw()
147 |     return figure
148 | 
149 | 
150 | def plot_alignment_to_numpy(
151 |     alignment, title="", info=None, phoneme_seq=None, vmin=None, vmax=None
152 | ):
153 |     if phoneme_seq:
154 |         fig, ax = plt.subplots(figsize=(15, 10))
155 |     else:
156 |         fig, ax = plt.subplots(figsize=(6, 4))
157 |     im = ax.imshow(
158 |         alignment,
159 |         aspect="auto",
160 |         origin="lower",
161 |         interpolation="none",
162 |         vmin=vmin,
163 |         vmax=vmax,
164 |     )
165 |     ax.set_title(title)
166 |     fig.colorbar(im, ax=ax)
167 |     xlabel = "Decoder timestep"
168 |     if info is not None:
169 |         xlabel += "\n\n" + info
170 |     plt.xlabel(xlabel)
171 |     plt.ylabel("Encoder timestep")
172 |     plt.tight_layout()
173 | 
174 |     if phoneme_seq is not None:
175 |         # For debugging phonemes and durations in maps. Not used by default in training code.
176 |         ax.set_yticks(np.arange(len(phoneme_seq)))
177 |         ax.set_yticklabels(phoneme_seq)
178 |         ax.hlines(np.arange(len(phoneme_seq)), xmin=0.0, xmax=max(ax.get_xticks()))
179 | 
180 |     fig.canvas.draw()
181 |     data = save_figure_to_numpy(fig)
182 |     plt.close()
183 |     return data
184 | 
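For quick inspection, a minimal sketch rendering a random mel; the shape is arbitrary. save_figure_to_numpy converts a drawn figure into an HxWx3 uint8 array suitable for image logging.

    import numpy as np

    mel = np.random.rand(80, 400)      # (channels, frames), arbitrary example
    fig = plot_spectrogram(mel)        # matplotlib Figure with the colormapped mel
    image = save_figure_to_numpy(fig)  # uint8 array; the figure is closed afterwards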


--------------------------------------------------------------------------------
/uberduck_ml_dev/vendor/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/uberduck_ml_dev/vendor/tfcompat/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uberduck-ai/uberduck-ml-dev/377937580f78ea9964c20dec22904a913514c30d/uberduck_ml_dev/vendor/tfcompat/__init__.py


--------------------------------------------------------------------------------