├── .gitignore ├── .vscode └── settings.json ├── LICENSE ├── README-ja.md ├── README.md ├── bin └── .gitignore ├── configs ├── 32k-768.json ├── 32k.json ├── 40k-768.json ├── 40k.json ├── 48k-768.json └── 48k.json ├── dev.py ├── launch.py ├── lib └── rvc │ ├── attentions.py │ ├── checkpoints.py │ ├── commons.py │ ├── config.py │ ├── data_utils.py │ ├── losses.py │ ├── mel_processing.py │ ├── models.py │ ├── modules.py │ ├── pipeline.py │ ├── preprocessing │ ├── extract_f0.py │ ├── extract_feature.py │ ├── slicer.py │ └── split.py │ ├── train.py │ ├── transforms.py │ └── utils.py ├── models ├── checkpoints │ └── .gitignore ├── embeddings │ └── .gitignore ├── pretrained │ └── .gitignore └── training │ ├── .gitignore │ ├── models │ └── .gitignore │ └── mute │ ├── 0_gt_wavs │ ├── mute32k.wav │ ├── mute40k.wav │ └── mute48k.wav │ ├── 1_16k_wavs │ └── mute.wav │ ├── 2a_f0 │ └── mute.wav.npy │ ├── 2b_f0nsf │ └── mute.wav.npy │ └── 3_feature256 │ └── mute.npy ├── modules ├── cmd_opts.py ├── core.py ├── merge.py ├── models.py ├── separate.py ├── server │ └── model.py ├── shared.py ├── tabs │ ├── inference.py │ ├── merge.py │ ├── server.py │ ├── split.py │ └── training.py ├── ui.py └── utils.py ├── outputs └── .gitignore ├── requirements.txt ├── requirements ├── dev.txt └── main.txt ├── script.js ├── server.py ├── styles.css ├── update.bat ├── update.sh ├── webui-macos-env.sh ├── webui-user.bat ├── webui-user.sh ├── webui.bat ├── webui.py └── webui.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | tmp/ 4 | 5 | 6 | ### Generated by gibo (https://github.com/simonwhitaker/gibo) 7 | ### https://raw.github.com/github/gitignore/4488915eec0b3a45b5c63ead28f286819c0917de/Global/VisualStudioCode.gitignore 8 | 9 | .vscode/* 10 | !.vscode/settings.json 11 | !.vscode/tasks.json 12 | !.vscode/launch.json 13 | !.vscode/extensions.json 14 | !.vscode/*.code-snippets 15 | 16 | # Local History for Visual Studio Code 17 | .history/ 18 | 19 | # Built Visual Studio Code Extensions 20 | *.vsix 21 | 22 | 23 | ### https://raw.github.com/github/gitignore/4488915eec0b3a45b5c63ead28f286819c0917de/Python.gitignore 24 | 25 | # Byte-compiled / optimized / DLL files 26 | __pycache__/ 27 | *.py[cod] 28 | *$py.class 29 | 30 | # C extensions 31 | *.so 32 | 33 | # Distribution / packaging 34 | .Python 35 | build/ 36 | develop-eggs/ 37 | dist/ 38 | downloads/ 39 | eggs/ 40 | .eggs/ 41 | # lib/ 42 | lib64/ 43 | parts/ 44 | sdist/ 45 | var/ 46 | wheels/ 47 | share/python-wheels/ 48 | *.egg-info/ 49 | .installed.cfg 50 | *.egg 51 | MANIFEST 52 | 53 | # PyInstaller 54 | # Usually these files are written by a python script from a template 55 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
56 | *.manifest 57 | *.spec 58 | 59 | # Installer logs 60 | pip-log.txt 61 | pip-delete-this-directory.txt 62 | 63 | # Unit test / coverage reports 64 | htmlcov/ 65 | .tox/ 66 | .nox/ 67 | .coverage 68 | .coverage.* 69 | .cache 70 | nosetests.xml 71 | coverage.xml 72 | *.cover 73 | *.py,cover 74 | .hypothesis/ 75 | .pytest_cache/ 76 | cover/ 77 | 78 | # Translations 79 | *.mo 80 | *.pot 81 | 82 | # Django stuff: 83 | *.log 84 | local_settings.py 85 | db.sqlite3 86 | db.sqlite3-journal 87 | 88 | # Flask stuff: 89 | instance/ 90 | .webassets-cache 91 | 92 | # Scrapy stuff: 93 | .scrapy 94 | 95 | # Sphinx documentation 96 | docs/_build/ 97 | 98 | # PyBuilder 99 | .pybuilder/ 100 | target/ 101 | 102 | # Jupyter Notebook 103 | .ipynb_checkpoints 104 | 105 | # IPython 106 | profile_default/ 107 | ipython_config.py 108 | 109 | # pyenv 110 | # For a library or package, you might want to ignore these files since the code is 111 | # intended to run in multiple environments; otherwise, check them in: 112 | # .python-version 113 | 114 | # pipenv 115 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 116 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 117 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 118 | # install all needed dependencies. 119 | #Pipfile.lock 120 | 121 | # poetry 122 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 123 | # This is especially recommended for binary packages to ensure reproducibility, and is more 124 | # commonly ignored for libraries. 125 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 126 | #poetry.lock 127 | 128 | # pdm 129 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 130 | #pdm.lock 131 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 132 | # in version control. 133 | # https://pdm.fming.dev/#use-with-ide 134 | .pdm.toml 135 | 136 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 137 | __pypackages__/ 138 | 139 | # Celery stuff 140 | celerybeat-schedule 141 | celerybeat.pid 142 | 143 | # SageMath parsed files 144 | *.sage.py 145 | 146 | # Environments 147 | .env 148 | .venv 149 | env/ 150 | venv/ 151 | ENV/ 152 | env.bak/ 153 | venv.bak/ 154 | 155 | # Spyder project settings 156 | .spyderproject 157 | .spyproject 158 | 159 | # Rope project settings 160 | .ropeproject 161 | 162 | # mkdocs documentation 163 | /site 164 | 165 | # mypy 166 | .mypy_cache/ 167 | .dmypy.json 168 | dmypy.json 169 | 170 | # Pyre type checker 171 | .pyre/ 172 | 173 | # pytype static type analyzer 174 | .pytype/ 175 | 176 | # Cython debug symbols 177 | cython_debug/ 178 | 179 | # PyCharm 180 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 181 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 182 | # and can be added to the global gitignore or merged into this file. For a more nuclear 183 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
184 | #.idea/ 185 | 186 | 187 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.formatting.provider": "black", 3 | "editor.codeActionsOnSave": { 4 | "source.organizeImports": true 5 | }, 6 | "editor.formatOnSave": true, 7 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 ddPn08 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README-ja.md: -------------------------------------------------------------------------------- 1 |

RVC-WebUI

2 |
3 |

4 | 5 | [`liujing04/Retrieval-based-Voice-Conversion-WebUI`](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI) reconstruction project 6 | 7 |

8 |
9 | 10 | --- 11 | 12 |
13 |

14 | 15 | [日本語](README-ja.md) | [English](README.md) 16 | 17 |

18 |
19 | 20 |
21 | 22 | # Launch 23 | 24 | ## Windows 25 | Double click `webui-user.bat` to start the webui. 26 | 27 | ## Linux or Mac 28 | Run `webui.sh` to start the webui. 29 | 30 |
31 | 32 | ``` 33 | Tested environment: Windows 10, Python 3.10.9, torch 2.0.0+cu118 34 | ``` 35 | 36 |
37 | 38 | # Troubleshooting 39 | 40 | ## `error: Microsoft Visual C++ 14.0 or greater is required.` 41 | 42 | Microsoft C++ Build Tools must be installed. 43 | 44 | ### Step 1: Download the installer 45 | [Download](https://visualstudio.microsoft.com/ja/thank-you-downloading-visual-studio/?sku=BuildTools&rel=16) 46 | 47 | ### Step 2: Install `C++ Build Tools` 48 | Run the installer and select `C++ Build Tools` in the `Workloads` tab. 49 | 50 |
51 | 52 | # Credits 53 | - [`liujing04/Retrieval-based-Voice-Conversion-WebUI`](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI) 54 | - [`teftef6220/Voice_Separation_and_Selection`](https://github.com/teftef6220/Voice_Separation_and_Selection) 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

RVC-WebUI

2 |
3 |

4 | 5 | [`liujing04/Retrieval-based-Voice-Conversion-WebUI`](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI) reconstruction project 6 | 7 |

8 |
9 | 10 | --- 11 | 12 |
13 |

14 | 15 | [日本語](README-ja.md) | [English](README.md) 16 | 17 |

18 |
19 | 20 |
21 | 22 | # Launch 23 | 24 | ## Windows 25 | Double click `webui-user.bat` to start the webui. 26 | 27 | ## Linux or Mac 28 | Run `webui.sh` to start the webui. 29 | 30 |
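Under the hood, startup flags reach the app through the `COMMANDLINE_ARGS` environment variable, which `launch.py` (included later in this repository) splits shell-style and appends to `sys.argv` before installing requirements and starting `webui.py`. A minimal sketch of that hand-off, assuming the usual flow where the startup scripts invoke `launch.py`; the flag values are illustrative, though `--skip-install` and `--ngrok` are flags `launch.py` actually checks:

```python
# Sketch of launch.py's argument hand-off (see launch.py in this repository).
# COMMANDLINE_ARGS is split shell-style and appended to sys.argv before
# prepare_environment() and start() run. The flag values are illustrative.
import os
import shlex
import sys

os.environ["COMMANDLINE_ARGS"] = "--skip-install --ngrok"  # hypothetical flags
sys.argv += shlex.split(os.environ.get("COMMANDLINE_ARGS", ""))
print(sys.argv[1:])  # e.g. ['--skip-install', '--ngrok']
```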
31 | 32 | ``` 33 | Tested environment: Windows 10, Python 3.10.9, torch 2.0.0+cu118 34 | ``` 35 | 36 |
37 | 38 | # Troubleshooting 39 | 40 | ## `error: Microsoft Visual C++ 14.0 or greater is required.` 41 | 42 | Microsoft C++ Build Tools must be installed. 43 | 44 | ### Step 1: Download the installer 45 | [Download](https://visualstudio.microsoft.com/ja/thank-you-downloading-visual-studio/?sku=BuildTools&rel=16) 46 | 47 | ### Step 2: Install `C++ Build Tools` 48 | Run the installer and select `C++ Build Tools` in the `Workloads` tab. 49 | 50 |
51 | 52 | # Credits 53 | - [`liujing04/Retrieval-based-Voice-Conversion-WebUI`](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI) 54 | - [`teftef6220/Voice_Separation_and_Selection`](https://github.com/teftef6220/Voice_Separation_and_Selection) 55 | -------------------------------------------------------------------------------- /bin/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /configs/32k-768.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": true, 11 | "lr_decay": 0.999875, 12 | "segment_size": 12800, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 32000, 21 | "filter_length": 1024, 22 | "hop_length": 320, 23 | "win_length": 1024, 24 | "n_mel_channels": 80, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,4,2,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16,16,4,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "emb_channels": 768, 45 | "spk_embed_dim": 109 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /configs/32k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": true, 11 | "lr_decay": 0.999875, 12 | "segment_size": 12800, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 32000, 21 | "filter_length": 1024, 22 | "hop_length": 320, 23 | "win_length": 1024, 24 | "n_mel_channels": 80, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,4,2,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16,16,4,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "emb_channels": 256, 45 | "spk_embed_dim": 109 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /configs/40k-768.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": true, 11 | "lr_decay": 0.999875, 12 | "segment_size": 12800, 13 
| "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 40000, 21 | "filter_length": 2048, 22 | "hop_length": 400, 23 | "win_length": 2048, 24 | "n_mel_channels": 125, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,10,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16,16,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "emb_channels": 768, 45 | "spk_embed_dim": 109 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /configs/40k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": true, 11 | "lr_decay": 0.999875, 12 | "segment_size": 12800, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 40000, 21 | "filter_length": 2048, 22 | "hop_length": 400, 23 | "win_length": 2048, 24 | "n_mel_channels": 125, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,10,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16,16,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "emb_channels": 256, 45 | "spk_embed_dim": 109 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /configs/48k-768.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": true, 11 | "lr_decay": 0.999875, 12 | "segment_size": 11520, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 48000, 21 | "filter_length": 2048, 22 | "hop_length": 480, 23 | "win_length": 2048, 24 | "n_mel_channels": 128, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,6,2,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16,16,4,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "emb_channels": 768, 45 | "spk_embed_dim": 109 46 | } 47 | } 48 | 
-------------------------------------------------------------------------------- /configs/48k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": true, 11 | "lr_decay": 0.999875, 12 | "segment_size": 11520, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 48000, 21 | "filter_length": 2048, 22 | "hop_length": 480, 23 | "win_length": 2048, 24 | "n_mel_channels": 128, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,6,2,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16,16,4,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "emb_channels": 256, 45 | "spk_embed_dim": 109 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /dev.py: -------------------------------------------------------------------------------- 1 | import modules.ui as ui 2 | 3 | demo = ui.create_ui() 4 | -------------------------------------------------------------------------------- /launch.py: -------------------------------------------------------------------------------- 1 | import importlib.util 2 | import os 3 | import shlex 4 | import subprocess 5 | import sys 6 | 7 | commandline_args = os.environ.get("COMMANDLINE_ARGS", "") 8 | sys.argv += shlex.split(commandline_args) 9 | 10 | python = sys.executable 11 | git = os.environ.get("GIT", "git") 12 | index_url = os.environ.get("INDEX_URL", "") 13 | stored_commit_hash = None 14 | skip_install = False 15 | 16 | 17 | def run(command, desc=None, errdesc=None, custom_env=None): 18 | if desc is not None: 19 | print(desc) 20 | 21 | result = subprocess.run( 22 | command, 23 | stdout=subprocess.PIPE, 24 | stderr=subprocess.PIPE, 25 | shell=True, 26 | env=os.environ if custom_env is None else custom_env, 27 | ) 28 | 29 | if result.returncode != 0: 30 | message = f"""{errdesc or 'Error running command'}. 
31 | Command: {command} 32 | Error code: {result.returncode} 33 | stdout: {result.stdout.decode(encoding="utf8", errors="ignore") if len(result.stdout)>0 else ''} 34 | stderr: {result.stderr.decode(encoding="utf8", errors="ignore") if len(result.stderr)>0 else ''} 35 | """ 36 | raise RuntimeError(message) 37 | 38 | return result.stdout.decode(encoding="utf8", errors="ignore") 39 | 40 | 41 | def check_run(command): 42 | result = subprocess.run( 43 | command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True 44 | ) 45 | return result.returncode == 0 46 | 47 | 48 | def is_installed(package): 49 | try: 50 | spec = importlib.util.find_spec(package) 51 | except ModuleNotFoundError: 52 | return False 53 | 54 | return spec is not None 55 | 56 | 57 | def commit_hash(): 58 | global stored_commit_hash 59 | 60 | if stored_commit_hash is not None: 61 | return stored_commit_hash 62 | 63 | try: 64 | stored_commit_hash = run(f"{git} rev-parse HEAD").strip() 65 | except Exception: 66 | stored_commit_hash = "" 67 | 68 | return stored_commit_hash 69 | 70 | 71 | def run_pip(args, desc=None): 72 | if skip_install: 73 | return 74 | 75 | index_url_line = f" --index-url {index_url}" if index_url != "" else "" 76 | return run( 77 | f'"{python}" -m pip {args} --prefer-binary{index_url_line}', 78 | desc=f"Installing {desc}", 79 | errdesc=f"Couldn't install {desc}", 80 | ) 81 | 82 | 83 | def run_python(code, desc=None, errdesc=None): 84 | return run(f'"{python}" -c "{code}"', desc, errdesc) 85 | 86 | 87 | def extract_arg(args, name): 88 | return [x for x in args if x != name], name in args 89 | 90 | 91 | def prepare_environment(): 92 | commit = commit_hash() 93 | 94 | print(f"Python {sys.version}") 95 | print(f"Commit hash: {commit}") 96 | 97 | torch_command = os.environ.get( 98 | "TORCH_COMMAND", 99 | "pip install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118", 100 | ) 101 | 102 | sys.argv, skip_install = extract_arg(sys.argv, "--skip-install") 103 | if skip_install: 104 | return 105 | 106 | sys.argv, reinstall_torch = extract_arg(sys.argv, "--reinstall-torch") 107 | ngrok = "--ngrok" in sys.argv 108 | 109 | if reinstall_torch or not is_installed("torch") or not is_installed("torchaudio"): 110 | run( 111 | f'"{python}" -m {torch_command}', 112 | "Installing torch and torchaudio", 113 | "Couldn't install torch", 114 | ) 115 | 116 | if not is_installed("pyngrok") and ngrok: 117 | run_pip("install pyngrok", "ngrok") 118 | 119 | run( 120 | f'"{python}" -m pip install -r requirements.txt', 121 | desc=f"Installing requirements", 122 | errdesc=f"Couldn't install requirements", 123 | ) 124 | 125 | 126 | def start(): 127 | os.environ["PATH"] = ( 128 | os.path.join(os.path.dirname(__file__), "bin") 129 | + os.pathsep 130 | + os.environ.get("PATH", "") 131 | ) 132 | subprocess.run( 133 | [python, "webui.py", *sys.argv[1:]], 134 | ) 135 | 136 | 137 | if __name__ == "__main__": 138 | prepare_environment() 139 | start() 140 | -------------------------------------------------------------------------------- /lib/rvc/attentions.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | from . 
import commons 8 | from .modules import LayerNorm 9 | 10 | 11 | class Encoder(nn.Module): 12 | def __init__( 13 | self, 14 | hidden_channels, 15 | filter_channels, 16 | n_heads, 17 | n_layers, 18 | kernel_size=1, 19 | p_dropout=0.0, 20 | window_size=10, 21 | **kwargs 22 | ): 23 | super().__init__() 24 | self.hidden_channels = hidden_channels 25 | self.filter_channels = filter_channels 26 | self.n_heads = n_heads 27 | self.n_layers = n_layers 28 | self.kernel_size = kernel_size 29 | self.p_dropout = p_dropout 30 | self.window_size = window_size 31 | 32 | self.drop = nn.Dropout(p_dropout) 33 | self.attn_layers = nn.ModuleList() 34 | self.norm_layers_1 = nn.ModuleList() 35 | self.ffn_layers = nn.ModuleList() 36 | self.norm_layers_2 = nn.ModuleList() 37 | for i in range(self.n_layers): 38 | self.attn_layers.append( 39 | MultiHeadAttention( 40 | hidden_channels, 41 | hidden_channels, 42 | n_heads, 43 | p_dropout=p_dropout, 44 | window_size=window_size, 45 | ) 46 | ) 47 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 48 | self.ffn_layers.append( 49 | FFN( 50 | hidden_channels, 51 | hidden_channels, 52 | filter_channels, 53 | kernel_size, 54 | p_dropout=p_dropout, 55 | ) 56 | ) 57 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 58 | 59 | def forward(self, x, x_mask): 60 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 61 | x = x * x_mask 62 | for i in range(self.n_layers): 63 | y = self.attn_layers[i](x, x, attn_mask) 64 | y = self.drop(y) 65 | x = self.norm_layers_1[i](x + y) 66 | 67 | y = self.ffn_layers[i](x, x_mask) 68 | y = self.drop(y) 69 | x = self.norm_layers_2[i](x + y) 70 | x = x * x_mask 71 | return x 72 | 73 | 74 | class Decoder(nn.Module): 75 | def __init__( 76 | self, 77 | hidden_channels, 78 | filter_channels, 79 | n_heads, 80 | n_layers, 81 | kernel_size=1, 82 | p_dropout=0.0, 83 | proximal_bias=False, 84 | proximal_init=True, 85 | **kwargs 86 | ): 87 | super().__init__() 88 | self.hidden_channels = hidden_channels 89 | self.filter_channels = filter_channels 90 | self.n_heads = n_heads 91 | self.n_layers = n_layers 92 | self.kernel_size = kernel_size 93 | self.p_dropout = p_dropout 94 | self.proximal_bias = proximal_bias 95 | self.proximal_init = proximal_init 96 | 97 | self.drop = nn.Dropout(p_dropout) 98 | self.self_attn_layers = nn.ModuleList() 99 | self.norm_layers_0 = nn.ModuleList() 100 | self.encdec_attn_layers = nn.ModuleList() 101 | self.norm_layers_1 = nn.ModuleList() 102 | self.ffn_layers = nn.ModuleList() 103 | self.norm_layers_2 = nn.ModuleList() 104 | for i in range(self.n_layers): 105 | self.self_attn_layers.append( 106 | MultiHeadAttention( 107 | hidden_channels, 108 | hidden_channels, 109 | n_heads, 110 | p_dropout=p_dropout, 111 | proximal_bias=proximal_bias, 112 | proximal_init=proximal_init, 113 | ) 114 | ) 115 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 116 | self.encdec_attn_layers.append( 117 | MultiHeadAttention( 118 | hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout 119 | ) 120 | ) 121 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 122 | self.ffn_layers.append( 123 | FFN( 124 | hidden_channels, 125 | hidden_channels, 126 | filter_channels, 127 | kernel_size, 128 | p_dropout=p_dropout, 129 | causal=True, 130 | ) 131 | ) 132 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 133 | 134 | def forward(self, x, x_mask, h, h_mask): 135 | """ 136 | x: decoder input 137 | h: encoder output 138 | """ 139 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( 140 | device=x.device, dtype=x.dtype 
141 | ) 142 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 143 | x = x * x_mask 144 | for i in range(self.n_layers): 145 | y = self.self_attn_layers[i](x, x, self_attn_mask) 146 | y = self.drop(y) 147 | x = self.norm_layers_0[i](x + y) 148 | 149 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) 150 | y = self.drop(y) 151 | x = self.norm_layers_1[i](x + y) 152 | 153 | y = self.ffn_layers[i](x, x_mask) 154 | y = self.drop(y) 155 | x = self.norm_layers_2[i](x + y) 156 | x = x * x_mask 157 | return x 158 | 159 | 160 | class MultiHeadAttention(nn.Module): 161 | def __init__( 162 | self, 163 | channels, 164 | out_channels, 165 | n_heads, 166 | p_dropout=0.0, 167 | window_size=None, 168 | heads_share=True, 169 | block_length=None, 170 | proximal_bias=False, 171 | proximal_init=False, 172 | ): 173 | super().__init__() 174 | assert channels % n_heads == 0 175 | 176 | self.channels = channels 177 | self.out_channels = out_channels 178 | self.n_heads = n_heads 179 | self.p_dropout = p_dropout 180 | self.window_size = window_size 181 | self.heads_share = heads_share 182 | self.block_length = block_length 183 | self.proximal_bias = proximal_bias 184 | self.proximal_init = proximal_init 185 | self.attn = None 186 | 187 | self.k_channels = channels // n_heads 188 | self.conv_q = nn.Conv1d(channels, channels, 1) 189 | self.conv_k = nn.Conv1d(channels, channels, 1) 190 | self.conv_v = nn.Conv1d(channels, channels, 1) 191 | self.conv_o = nn.Conv1d(channels, out_channels, 1) 192 | self.drop = nn.Dropout(p_dropout) 193 | 194 | if window_size is not None: 195 | n_heads_rel = 1 if heads_share else n_heads 196 | rel_stddev = self.k_channels**-0.5 197 | self.emb_rel_k = nn.Parameter( 198 | torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) 199 | * rel_stddev 200 | ) 201 | self.emb_rel_v = nn.Parameter( 202 | torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) 203 | * rel_stddev 204 | ) 205 | 206 | nn.init.xavier_uniform_(self.conv_q.weight) 207 | nn.init.xavier_uniform_(self.conv_k.weight) 208 | nn.init.xavier_uniform_(self.conv_v.weight) 209 | if proximal_init: 210 | with torch.no_grad(): 211 | self.conv_k.weight.copy_(self.conv_q.weight) 212 | self.conv_k.bias.copy_(self.conv_q.bias) 213 | 214 | def forward(self, x, c, attn_mask=None): 215 | q = self.conv_q(x) 216 | k = self.conv_k(c) 217 | v = self.conv_v(c) 218 | 219 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 220 | 221 | x = self.conv_o(x) 222 | return x 223 | 224 | def attention(self, query, key, value, mask=None): 225 | # reshape [b, d, t] -> [b, n_h, t, d_k] 226 | b, d, t_s, t_t = (*key.size(), query.size(2)) 227 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 228 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 229 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 230 | 231 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) 232 | if self.window_size is not None: 233 | assert ( 234 | t_s == t_t 235 | ), "Relative attention is only available for self-attention." 236 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 237 | rel_logits = self._matmul_with_relative_keys( 238 | query / math.sqrt(self.k_channels), key_relative_embeddings 239 | ) 240 | scores_local = self._relative_position_to_absolute_position(rel_logits) 241 | scores = scores + scores_local 242 | if self.proximal_bias: 243 | assert t_s == t_t, "Proximal bias is only available for self-attention." 
244 | scores = scores + self._attention_bias_proximal(t_s).to( 245 | device=scores.device, dtype=scores.dtype 246 | ) 247 | if mask is not None: 248 | scores = scores.masked_fill(mask == 0, -1e4) 249 | if self.block_length is not None: 250 | assert ( 251 | t_s == t_t 252 | ), "Local attention is only available for self-attention." 253 | block_mask = ( 254 | torch.ones_like(scores) 255 | .triu(-self.block_length) 256 | .tril(self.block_length) 257 | ) 258 | scores = scores.masked_fill(block_mask == 0, -1e4) 259 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 260 | p_attn = self.drop(p_attn) 261 | output = torch.matmul(p_attn, value) 262 | if self.window_size is not None: 263 | relative_weights = self._absolute_position_to_relative_position(p_attn) 264 | value_relative_embeddings = self._get_relative_embeddings( 265 | self.emb_rel_v, t_s 266 | ) 267 | output = output + self._matmul_with_relative_values( 268 | relative_weights, value_relative_embeddings 269 | ) 270 | output = ( 271 | output.transpose(2, 3).contiguous().view(b, d, t_t) 272 | ) # [b, n_h, t_t, d_k] -> [b, d, t_t] 273 | return output, p_attn 274 | 275 | def _matmul_with_relative_values(self, x, y): 276 | """ 277 | x: [b, h, l, m] 278 | y: [h or 1, m, d] 279 | ret: [b, h, l, d] 280 | """ 281 | ret = torch.matmul(x, y.unsqueeze(0)) 282 | return ret 283 | 284 | def _matmul_with_relative_keys(self, x, y): 285 | """ 286 | x: [b, h, l, d] 287 | y: [h or 1, m, d] 288 | ret: [b, h, l, m] 289 | """ 290 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 291 | return ret 292 | 293 | def _get_relative_embeddings(self, relative_embeddings, length): 294 | max_relative_position = 2 * self.window_size + 1 295 | # Pad first before slice to avoid using cond ops. 296 | pad_length = max(length - (self.window_size + 1), 0) 297 | slice_start_position = max((self.window_size + 1) - length, 0) 298 | slice_end_position = slice_start_position + 2 * length - 1 299 | if pad_length > 0: 300 | padded_relative_embeddings = F.pad( 301 | relative_embeddings, 302 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), 303 | ) 304 | else: 305 | padded_relative_embeddings = relative_embeddings 306 | used_relative_embeddings = padded_relative_embeddings[ 307 | :, slice_start_position:slice_end_position 308 | ] 309 | return used_relative_embeddings 310 | 311 | def _relative_position_to_absolute_position(self, x): 312 | """ 313 | x: [b, h, l, 2*l-1] 314 | ret: [b, h, l, l] 315 | """ 316 | batch, heads, length, _ = x.size() 317 | # Concat columns of pad to shift from relative to absolute indexing. 318 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) 319 | 320 | # Concat extra elements so to add up to shape (len+1, 2*len-1). 321 | x_flat = x.view([batch, heads, length * 2 * length]) 322 | x_flat = F.pad( 323 | x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) 324 | ) 325 | 326 | # Reshape and slice out the padded elements. 
327 | x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ 328 | :, :, :length, length - 1 : 329 | ] 330 | return x_final 331 | 332 | def _absolute_position_to_relative_position(self, x): 333 | """ 334 | x: [b, h, l, l] 335 | ret: [b, h, l, 2*l-1] 336 | """ 337 | batch, heads, length, _ = x.size() 338 | # padd along column 339 | x = F.pad( 340 | x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) 341 | ) 342 | x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) 343 | # add 0's in the beginning that will skew the elements after reshape 344 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 345 | x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] 346 | return x_final 347 | 348 | def _attention_bias_proximal(self, length): 349 | """Bias for self-attention to encourage attention to close positions. 350 | Args: 351 | length: an integer scalar. 352 | Returns: 353 | a Tensor with shape [1, 1, length, length] 354 | """ 355 | r = torch.arange(length, dtype=torch.float32) 356 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 357 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 358 | 359 | 360 | class FFN(nn.Module): 361 | def __init__( 362 | self, 363 | in_channels, 364 | out_channels, 365 | filter_channels, 366 | kernel_size, 367 | p_dropout=0.0, 368 | activation=None, 369 | causal=False, 370 | ): 371 | super().__init__() 372 | self.in_channels = in_channels 373 | self.out_channels = out_channels 374 | self.filter_channels = filter_channels 375 | self.kernel_size = kernel_size 376 | self.p_dropout = p_dropout 377 | self.activation = activation 378 | self.causal = causal 379 | 380 | if causal: 381 | self.padding = self._causal_padding 382 | else: 383 | self.padding = self._same_padding 384 | 385 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) 386 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) 387 | self.drop = nn.Dropout(p_dropout) 388 | 389 | def forward(self, x, x_mask): 390 | x = self.conv_1(self.padding(x * x_mask)) 391 | if self.activation == "gelu": 392 | x = x * torch.sigmoid(1.702 * x) 393 | else: 394 | x = torch.relu(x) 395 | x = self.drop(x) 396 | x = self.conv_2(self.padding(x * x_mask)) 397 | return x * x_mask 398 | 399 | def _causal_padding(self, x): 400 | if self.kernel_size == 1: 401 | return x 402 | pad_l = self.kernel_size - 1 403 | pad_r = 0 404 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 405 | x = F.pad(x, commons.convert_pad_shape(padding)) 406 | return x 407 | 408 | def _same_padding(self, x): 409 | if self.kernel_size == 1: 410 | return x 411 | pad_l = (self.kernel_size - 1) // 2 412 | pad_r = self.kernel_size // 2 413 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 414 | x = F.pad(x, commons.convert_pad_shape(padding)) 415 | return x 416 | -------------------------------------------------------------------------------- /lib/rvc/checkpoints.py: -------------------------------------------------------------------------------- 1 | import os 2 | from collections import OrderedDict 3 | from typing import * 4 | 5 | import torch 6 | 7 | 8 | def write_config(state_dict: Dict[str, Any], cfg: Dict[str, Any]): 9 | state_dict["config"] = [] 10 | for key, x in cfg.items(): 11 | state_dict["config"].append(x) 12 | state_dict["params"] = cfg 13 | 14 | 15 | def create_trained_model( 16 | weights: Dict[str, Any], 17 | version: Literal["v1", "v2"], 18 | sr: str, 19 | f0: bool, 20 | emb_name: str, 21 | emb_ch: int, 22 | 
emb_output_layer: int, 23 | epoch: int, 24 | speaker_info: Optional[dict[str, int]] 25 | ): 26 | state_dict = OrderedDict() 27 | state_dict["weight"] = {} 28 | for key in weights.keys(): 29 | if "enc_q" in key: 30 | continue 31 | state_dict["weight"][key] = weights[key].half() 32 | if sr == "40k": 33 | write_config( 34 | state_dict, 35 | { 36 | "spec_channels": 1025, 37 | "segment_size": 32, 38 | "inter_channels": 192, 39 | "hidden_channels": 192, 40 | "filter_channels": 768, 41 | "n_heads": 2, 42 | "n_layers": 6, 43 | "kernel_size": 3, 44 | "p_dropout": 0, 45 | "resblock": "1", 46 | "resblock_kernel_sizes": [3, 7, 11], 47 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 48 | "upsample_rates": [10, 10, 2, 2], 49 | "upsample_initial_channel": 512, 50 | "upsample_kernel_sizes": [16, 16, 4, 4], 51 | "spk_embed_dim": 109 if speaker_info is None else len(speaker_info), 52 | "gin_channels": 256, 53 | "emb_channels": emb_ch, 54 | "sr": 40000, 55 | }, 56 | ) 57 | elif sr == "48k": 58 | write_config( 59 | state_dict, 60 | { 61 | "spec_channels": 1025, 62 | "segment_size": 32, 63 | "inter_channels": 192, 64 | "hidden_channels": 192, 65 | "filter_channels": 768, 66 | "n_heads": 2, 67 | "n_layers": 6, 68 | "kernel_size": 3, 69 | "p_dropout": 0, 70 | "resblock": "1", 71 | "resblock_kernel_sizes": [3, 7, 11], 72 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 73 | "upsample_rates": [10, 6, 2, 2, 2], 74 | "upsample_initial_channel": 512, 75 | "upsample_kernel_sizes": [16, 16, 4, 4, 4], 76 | "spk_embed_dim": 109 if speaker_info is None else len(speaker_info), 77 | "gin_channels": 256, 78 | "emb_channels": emb_ch, 79 | "sr": 48000, 80 | }, 81 | ) 82 | elif sr == "32k": 83 | write_config( 84 | state_dict, 85 | { 86 | "spec_channels": 513, 87 | "segment_size": 32, 88 | "inter_channels": 192, 89 | "hidden_channels": 192, 90 | "filter_channels": 768, 91 | "n_heads": 2, 92 | "n_layers": 6, 93 | "kernel_size": 3, 94 | "p_dropout": 0, 95 | "resblock": "1", 96 | "resblock_kernel_sizes": [3, 7, 11], 97 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 98 | "upsample_rates": [10, 4, 2, 2, 2], 99 | "upsample_initial_channel": 512, 100 | "upsample_kernel_sizes": [16, 16, 4, 4, 4], 101 | "spk_embed_dim": 109 if speaker_info is None else len(speaker_info), 102 | "gin_channels": 256, 103 | "emb_channels": emb_ch, 104 | "sr": 32000, 105 | }, 106 | ) 107 | state_dict["version"] = version 108 | state_dict["info"] = f"{epoch}epoch" 109 | state_dict["sr"] = sr 110 | state_dict["f0"] = 1 if f0 else 0 111 | state_dict["embedder_name"] = emb_name 112 | state_dict["embedder_output_layer"] = emb_output_layer 113 | if not speaker_info is None: 114 | state_dict["speaker_info"] = {str(v): str(k) for k, v in speaker_info.items()} 115 | return state_dict 116 | 117 | 118 | def save( 119 | model, 120 | version: Literal["v1", "v2"], 121 | sr: str, 122 | f0: bool, 123 | emb_name: str, 124 | emb_ch: int, 125 | emb_output_layer: int, 126 | filepath: str, 127 | epoch: int, 128 | speaker_info: Optional[dict[str, int]] 129 | ): 130 | if hasattr(model, "module"): 131 | state_dict = model.module.state_dict() 132 | else: 133 | state_dict = model.state_dict() 134 | 135 | print(f"save: emb_name: {emb_name} {emb_ch}") 136 | 137 | state_dict = create_trained_model( 138 | state_dict, 139 | version, 140 | sr, 141 | f0, 142 | emb_name, 143 | emb_ch, 144 | emb_output_layer, 145 | epoch, 146 | speaker_info 147 | ) 148 | os.makedirs(os.path.dirname(filepath), exist_ok=True) 149 | torch.save(state_dict, filepath) 
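`create_trained_model()` above strips `enc_q` weights, casts the rest to fp16, and attaches the architecture block for the chosen sample rate plus embedder metadata; `save()` then writes the result with `torch.save`. A hypothetical, minimal call, using a toy weights dict in place of a real generator state_dict purely to show the metadata that ends up in the checkpoint:

```python
# Hypothetical usage of create_trained_model() from lib/rvc/checkpoints.
# A toy state_dict stands in for a trained generator; all argument values
# here are illustrative.
from collections import OrderedDict

import torch

from lib.rvc.checkpoints import create_trained_model

toy_weights = OrderedDict({"dec.conv_pre.weight": torch.zeros(1, 1, 1)})
state_dict = create_trained_model(
    weights=toy_weights,
    version="v2",
    sr="40k",                  # selects the 40000 Hz config block
    f0=True,
    emb_name="hubert_base",    # assumed embedder name
    emb_ch=256,
    emb_output_layer=12,
    epoch=100,
    speaker_info=None,         # None keeps spk_embed_dim at 109
)
print(state_dict["params"]["sr"], state_dict["f0"], state_dict["info"])  # 40000 1 100epoch
```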
150 | -------------------------------------------------------------------------------- /lib/rvc/commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch.nn import functional as F 5 | 6 | 7 | def init_weights(m, mean=0.0, std=0.01): 8 | classname = m.__class__.__name__ 9 | if classname.find("Conv") != -1: 10 | m.weight.data.normal_(mean, std) 11 | 12 | 13 | def get_padding(kernel_size, dilation=1): 14 | return int((kernel_size * dilation - dilation) / 2) 15 | 16 | 17 | def convert_pad_shape(pad_shape): 18 | l = pad_shape[::-1] 19 | pad_shape = [item for sublist in l for item in sublist] 20 | return pad_shape 21 | 22 | 23 | def kl_divergence(m_p, logs_p, m_q, logs_q): 24 | """KL(P||Q)""" 25 | kl = (logs_q - logs_p) - 0.5 26 | kl += ( 27 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) 28 | ) 29 | return kl 30 | 31 | 32 | def rand_gumbel(shape): 33 | """Sample from the Gumbel distribution, protect from overflows.""" 34 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 35 | return -torch.log(-torch.log(uniform_samples)) 36 | 37 | 38 | def rand_gumbel_like(x): 39 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 40 | return g 41 | 42 | 43 | def slice_segments(x, ids_str, segment_size=4): 44 | ret = torch.zeros_like(x[:, :, :segment_size]) 45 | for i in range(x.size(0)): 46 | idx_str = ids_str[i] 47 | idx_end = idx_str + segment_size 48 | ret[i] = x[i, :, idx_str:idx_end] 49 | return ret 50 | 51 | 52 | def slice_segments2(x, ids_str, segment_size=4): 53 | ret = torch.zeros_like(x[:, :segment_size]) 54 | for i in range(x.size(0)): 55 | idx_str = ids_str[i] 56 | idx_end = idx_str + segment_size 57 | ret[i] = x[i, idx_str:idx_end] 58 | return ret 59 | 60 | 61 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 62 | b, d, t = x.size() 63 | if x_lengths is None: 64 | x_lengths = t 65 | ids_str_max = x_lengths - segment_size + 1 66 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 67 | ret = slice_segments(x, ids_str, segment_size) 68 | return ret, ids_str 69 | 70 | 71 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): 72 | position = torch.arange(length, dtype=torch.float) 73 | num_timescales = channels // 2 74 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( 75 | num_timescales - 1 76 | ) 77 | inv_timescales = min_timescale * torch.exp( 78 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment 79 | ) 80 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 81 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) 82 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 83 | signal = signal.view(1, channels, length) 84 | return signal 85 | 86 | 87 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 88 | b, channels, length = x.size() 89 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 90 | return x + signal.to(dtype=x.dtype, device=x.device) 91 | 92 | 93 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 94 | b, channels, length = x.size() 95 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 96 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 97 | 98 | 99 | def subsequent_mask(length): 100 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 101 
| return mask 102 | 103 | 104 | @torch.jit.script 105 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 106 | n_channels_int = n_channels[0] 107 | in_act = input_a + input_b 108 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 109 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 110 | acts = t_act * s_act 111 | return acts 112 | 113 | 114 | def convert_pad_shape(pad_shape): 115 | l = pad_shape[::-1] 116 | pad_shape = [item for sublist in l for item in sublist] 117 | return pad_shape 118 | 119 | 120 | def shift_1d(x): 121 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 122 | return x 123 | 124 | 125 | def sequence_mask(length, max_length=None): 126 | if max_length is None: 127 | max_length = length.max() 128 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 129 | return x.unsqueeze(0) < length.unsqueeze(1) 130 | 131 | 132 | def generate_path(duration, mask): 133 | """ 134 | duration: [b, 1, t_x] 135 | mask: [b, 1, t_y, t_x] 136 | """ 137 | b, _, t_y, t_x = mask.shape 138 | cum_duration = torch.cumsum(duration, -1) 139 | 140 | cum_duration_flat = cum_duration.view(b * t_x) 141 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 142 | path = path.view(b, t_x, t_y) 143 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 144 | path = path.unsqueeze(1).transpose(2, 3) * mask 145 | return path 146 | 147 | 148 | def clip_grad_value_(parameters, clip_value, norm_type=2): 149 | if isinstance(parameters, torch.Tensor): 150 | parameters = [parameters] 151 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 152 | norm_type = float(norm_type) 153 | if clip_value is not None: 154 | clip_value = float(clip_value) 155 | 156 | total_norm = 0 157 | for p in parameters: 158 | param_norm = p.grad.data.norm(norm_type) 159 | total_norm += param_norm.item() ** norm_type 160 | if clip_value is not None: 161 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 162 | total_norm = total_norm ** (1.0 / norm_type) 163 | return total_norm 164 | -------------------------------------------------------------------------------- /lib/rvc/config.py: -------------------------------------------------------------------------------- 1 | from typing import * 2 | 3 | from pydantic import BaseModel 4 | 5 | 6 | class TrainConfigTrain(BaseModel): 7 | log_interval: int 8 | seed: int 9 | epochs: int 10 | learning_rate: float 11 | betas: List[float] 12 | eps: float 13 | batch_size: int 14 | fp16_run: bool 15 | lr_decay: float 16 | segment_size: int 17 | init_lr_ratio: int 18 | warmup_epochs: int 19 | c_mel: int 20 | c_kl: float 21 | 22 | 23 | class TrainConfigData(BaseModel): 24 | max_wav_value: float 25 | sampling_rate: int 26 | filter_length: int 27 | hop_length: int 28 | win_length: int 29 | n_mel_channels: int 30 | mel_fmin: float 31 | mel_fmax: Any 32 | 33 | 34 | class TrainConfigModel(BaseModel): 35 | inter_channels: int 36 | hidden_channels: int 37 | filter_channels: int 38 | n_heads: int 39 | n_layers: int 40 | kernel_size: int 41 | p_dropout: int 42 | resblock: str 43 | resblock_kernel_sizes: List[int] 44 | resblock_dilation_sizes: List[List[int]] 45 | upsample_rates: List[int] 46 | upsample_initial_channel: int 47 | upsample_kernel_sizes: List[int] 48 | use_spectral_norm: bool 49 | gin_channels: int 50 | emb_channels: int 51 | spk_embed_dim: int 52 | 53 | 54 | class TrainConfig(BaseModel): 55 | version: Literal["v1", "v2"] = "v2" 56 | train: TrainConfigTrain 57 | data: TrainConfigData 58 | model: 
TrainConfigModel 59 | 60 | 61 | class DatasetMetaItem(BaseModel): 62 | gt_wav: str 63 | co256: str 64 | f0: Optional[str] 65 | f0nsf: Optional[str] 66 | speaker_id: int 67 | 68 | 69 | class DatasetMetadata(BaseModel): 70 | files: Dict[str, DatasetMetaItem] 71 | # mute: DatasetMetaItem 72 | -------------------------------------------------------------------------------- /lib/rvc/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def feature_loss(fmap_r, fmap_g): 5 | loss = 0 6 | for dr, dg in zip(fmap_r, fmap_g): 7 | for rl, gl in zip(dr, dg): 8 | rl = rl.float().detach() 9 | gl = gl.float() 10 | loss += torch.mean(torch.abs(rl - gl)) 11 | 12 | return loss * 2 13 | 14 | 15 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 16 | loss = 0 17 | r_losses = [] 18 | g_losses = [] 19 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 20 | dr = dr.float() 21 | dg = dg.float() 22 | r_loss = torch.mean((1 - dr) ** 2) 23 | g_loss = torch.mean(dg**2) 24 | loss += r_loss + g_loss 25 | r_losses.append(r_loss.item()) 26 | g_losses.append(g_loss.item()) 27 | 28 | return loss, r_losses, g_losses 29 | 30 | 31 | def generator_loss(disc_outputs): 32 | loss = 0 33 | gen_losses = [] 34 | for dg in disc_outputs: 35 | dg = dg.float() 36 | l = torch.mean((1 - dg) ** 2) 37 | gen_losses.append(l) 38 | loss += l 39 | 40 | return loss, gen_losses 41 | 42 | 43 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 44 | """ 45 | z_p, logs_q: [b, h, t_t] 46 | m_p, logs_p: [b, h, t_t] 47 | """ 48 | z_p = z_p.float() 49 | logs_q = logs_q.float() 50 | m_p = m_p.float() 51 | logs_p = logs_p.float() 52 | z_mask = z_mask.float() 53 | 54 | kl = logs_p - logs_q - 0.5 55 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 56 | kl = torch.sum(kl * z_mask) 57 | l = kl / torch.sum(z_mask) 58 | return l 59 | -------------------------------------------------------------------------------- /lib/rvc/mel_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data 3 | from librosa.filters import mel as librosa_mel_fn 4 | 5 | MAX_WAV_VALUE = 32768.0 6 | 7 | 8 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 9 | """ 10 | PARAMS 11 | ------ 12 | C: compression factor 13 | """ 14 | return torch.log(torch.clamp(x, min=clip_val) * C) 15 | 16 | 17 | def dynamic_range_decompression_torch(x, C=1): 18 | """ 19 | PARAMS 20 | ------ 21 | C: compression factor used to compress 22 | """ 23 | return torch.exp(x) / C 24 | 25 | 26 | def spectral_normalize_torch(magnitudes): 27 | return dynamic_range_compression_torch(magnitudes) 28 | 29 | 30 | def spectral_de_normalize_torch(magnitudes): 31 | return dynamic_range_decompression_torch(magnitudes) 32 | 33 | 34 | mel_basis = {} 35 | hann_window = {} 36 | 37 | 38 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 39 | if torch.min(y) < -1.07: 40 | print("min value is ", torch.min(y)) 41 | if torch.max(y) > 1.07: 42 | print("max value is ", torch.max(y)) 43 | 44 | global hann_window 45 | dtype_device = str(y.dtype) + "_" + str(y.device) 46 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 47 | if wnsize_dtype_device not in hann_window: 48 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 49 | dtype=y.dtype, device=y.device 50 | ) 51 | 52 | y = torch.nn.functional.pad( 53 | y.unsqueeze(1), 54 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 55 | 
mode="reflect", 56 | ) 57 | y = y.squeeze(1) 58 | 59 | # mps does not support torch.stft. 60 | if y.device.type == "mps": 61 | i = y.cpu() 62 | win = hann_window[wnsize_dtype_device].cpu() 63 | else: 64 | i = y 65 | win = hann_window[wnsize_dtype_device] 66 | spec = torch.stft( 67 | i, 68 | n_fft, 69 | hop_length=hop_size, 70 | win_length=win_size, 71 | window=win, 72 | center=center, 73 | pad_mode="reflect", 74 | normalized=False, 75 | onesided=True, 76 | return_complex=False, 77 | ).to(device=y.device) 78 | 79 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 80 | return spec 81 | 82 | 83 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 84 | global mel_basis 85 | dtype_device = str(spec.dtype) + "_" + str(spec.device) 86 | fmax_dtype_device = str(fmax) + "_" + dtype_device 87 | if fmax_dtype_device not in mel_basis: 88 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 89 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 90 | dtype=spec.dtype, device=spec.device 91 | ) 92 | melspec = torch.matmul(mel_basis[fmax_dtype_device], spec) 93 | melspec = spectral_normalize_torch(melspec) 94 | return melspec 95 | 96 | 97 | def mel_spectrogram_torch( 98 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False 99 | ): 100 | """Convert waveform into Mel-frequency Log-amplitude spectrogram. 101 | 102 | Args: 103 | y :: (B, T) - Waveforms 104 | Returns: 105 | melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram 106 | """ 107 | # Linear-frequency Linear-amplitude spectrogram :: (B, T) -> (B, Freq, Frame) 108 | spec = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center) 109 | 110 | # Mel-frequency Log-amplitude spectrogram :: (B, Freq, Frame) -> (B, Freq=num_mels, Frame) 111 | melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax) 112 | 113 | return melspec 114 | -------------------------------------------------------------------------------- /lib/rvc/pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import traceback 3 | from typing import * 4 | 5 | import faiss 6 | import numpy as np 7 | import pyworld 8 | import scipy.signal as signal 9 | import torch 10 | import torch.nn.functional as F 11 | import torchcrepe 12 | from torch import Tensor 13 | # from faiss.swigfaiss_avx2 import IndexIVFFlat # cause crash on windows' faiss-cpu installed from pip 14 | from fairseq.models.hubert import HubertModel 15 | 16 | from .models import SynthesizerTrnMs256NSFSid 17 | 18 | 19 | class VocalConvertPipeline(object): 20 | def __init__(self, tgt_sr: int, device: Union[str, torch.device], is_half: bool): 21 | if isinstance(device, str): 22 | device = torch.device(device) 23 | if device.type == "cuda": 24 | vram = torch.cuda.get_device_properties(device).total_memory / 1024**3 25 | else: 26 | vram = None 27 | 28 | if vram is not None and vram <= 4: 29 | self.x_pad = 1 30 | self.x_query = 5 31 | self.x_center = 30 32 | self.x_max = 32 33 | elif vram is not None and vram <= 5: 34 | self.x_pad = 1 35 | self.x_query = 6 36 | self.x_center = 38 37 | self.x_max = 41 38 | else: 39 | self.x_pad = 3 40 | self.x_query = 10 41 | self.x_center = 60 42 | self.x_max = 65 43 | 44 | self.sr = 16000 # hubert input sample rate 45 | self.window = 160 # hubert input window 46 | self.t_pad = self.sr * self.x_pad # padding time for each utterance 47 | self.t_pad_tgt = tgt_sr * self.x_pad 48 | self.t_pad2 = self.t_pad * 2 49 | self.t_query = self.sr * 
self.x_query # query time before and after query point 50 | self.t_center = self.sr * self.x_center # query cut point position 51 | self.t_max = self.sr * self.x_max # max time for no query 52 | self.device = device 53 | self.is_half = is_half 54 | 55 | def get_optimal_torch_device(self, index: int = 0) -> torch.device: 56 | # Get cuda device 57 | if torch.cuda.is_available(): 58 | return torch.device(f"cuda:{index % torch.cuda.device_count()}") # Very fast 59 | elif torch.backends.mps.is_available(): 60 | return torch.device("mps") 61 | # Insert an else here to grab "xla" devices if available. TO DO later. Requires the torch_xla.core.xla_model library 62 | # Else wise return the "cpu" as a torch device, 63 | return torch.device("cpu") 64 | 65 | def get_f0_crepe_computation( 66 | self, 67 | x, 68 | f0_min, 69 | f0_max, 70 | p_len, 71 | hop_length=64, # 512 before. Hop length changes the speed that the voice jumps to a different dramatic pitch. Lower hop lengths means more pitch accuracy but longer inference time. 72 | model="full", # Either use crepe-tiny "tiny" or crepe "full". Default is full 73 | ): 74 | x = x.astype(np.float32) # fixes the F.conv2D exception. We needed to convert double to float. 75 | x /= np.quantile(np.abs(x), 0.999) 76 | torch_device = self.get_optimal_torch_device() 77 | audio = torch.from_numpy(x).to(torch_device, copy=True) 78 | audio = torch.unsqueeze(audio, dim=0) 79 | if audio.ndim == 2 and audio.shape[0] > 1: 80 | audio = torch.mean(audio, dim=0, keepdim=True).detach() 81 | audio = audio.detach() 82 | print("Initiating prediction with a crepe_hop_length of: " + str(hop_length)) 83 | pitch: Tensor = torchcrepe.predict( 84 | audio, 85 | self.sr, 86 | hop_length, 87 | f0_min, 88 | f0_max, 89 | model, 90 | batch_size=hop_length * 2, 91 | device=torch_device, 92 | pad=True 93 | ) 94 | p_len = p_len or x.shape[0] // hop_length 95 | # Resize the pitch for final f0 96 | source = np.array(pitch.squeeze(0).cpu().float().numpy()) 97 | source[source < 0.001] = np.nan 98 | target = np.interp( 99 | np.arange(0, len(source) * p_len, len(source)) / p_len, 100 | np.arange(0, len(source)), 101 | source 102 | ) 103 | f0 = np.nan_to_num(target) 104 | return f0 # Resized f0 105 | 106 | def get_f0_official_crepe_computation( 107 | self, 108 | x, 109 | f0_min, 110 | f0_max, 111 | model="full", 112 | ): 113 | # Pick a batch size that doesn't cause memory errors on your gpu 114 | batch_size = 512 115 | # Compute pitch using first gpu 116 | audio = torch.tensor(np.copy(x))[None].float() 117 | f0, pd = torchcrepe.predict( 118 | audio, 119 | self.sr, 120 | self.window, 121 | f0_min, 122 | f0_max, 123 | model, 124 | batch_size=batch_size, 125 | device=self.device, 126 | return_periodicity=True, 127 | ) 128 | pd = torchcrepe.filter.median(pd, 3) 129 | f0 = torchcrepe.filter.mean(f0, 3) 130 | f0[pd < 0.1] = 0 131 | f0 = f0[0].cpu().numpy() 132 | return f0 133 | 134 | def get_f0( 135 | self, 136 | x: np.ndarray, 137 | p_len: int, 138 | f0_up_key: int, 139 | f0_method: str, 140 | inp_f0: np.ndarray = None, 141 | ): 142 | f0_min = 50 143 | f0_max = 1100 144 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 145 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 146 | 147 | if f0_method == "harvest": 148 | f0, t = pyworld.harvest( 149 | x.astype(np.double), 150 | fs=self.sr, 151 | f0_ceil=f0_max, 152 | f0_floor=f0_min, 153 | frame_period=10, 154 | ) 155 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr) 156 | f0 = signal.medfilt(f0, 3) 157 | elif f0_method == "dio": 158 | f0, t = 
pyworld.dio( 159 | x.astype(np.double), 160 | fs=self.sr, 161 | f0_ceil=f0_max, 162 | f0_floor=f0_min, 163 | frame_period=10, 164 | ) 165 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr) 166 | f0 = signal.medfilt(f0, 3) 167 | elif f0_method == "mangio-crepe": 168 | f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, 160, "full") 169 | elif f0_method == "crepe": 170 | f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "full") 171 | 172 | f0 *= pow(2, f0_up_key / 12) 173 | tf0 = self.sr // self.window # f0 points per second 174 | if inp_f0 is not None: 175 | delta_t = np.round( 176 | (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 177 | ).astype("int16") 178 | replace_f0 = np.interp( 179 | list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1] 180 | ) 181 | shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0] 182 | f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[ 183 | :shape 184 | ] 185 | 186 | f0bak = f0.copy() 187 | f0_mel = 1127 * np.log(1 + f0 / 700) 188 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( 189 | f0_mel_max - f0_mel_min 190 | ) + 1 191 | f0_mel[f0_mel <= 1] = 1 192 | f0_mel[f0_mel > 255] = 255 193 | f0_coarse = np.rint(f0_mel).astype(np.int) 194 | return f0_coarse, f0bak # 1-0 195 | 196 | def _convert( 197 | self, 198 | model: HubertModel, 199 | embedding_output_layer: int, 200 | net_g: SynthesizerTrnMs256NSFSid, 201 | sid: int, 202 | audio: np.ndarray, 203 | pitch: np.ndarray, 204 | pitchf: np.ndarray, 205 | index: faiss.IndexIVFFlat, 206 | big_npy: np.ndarray, 207 | index_rate: float, 208 | ): 209 | feats = torch.from_numpy(audio) 210 | if self.is_half: 211 | feats = feats.half() 212 | else: 213 | feats = feats.float() 214 | if feats.dim() == 2: # double channels 215 | feats = feats.mean(-1) 216 | assert feats.dim() == 1, feats.dim() 217 | feats = feats.view(1, -1) 218 | padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) 219 | 220 | half_support = ( 221 | self.device.type == "cuda" 222 | and torch.cuda.get_device_capability(self.device)[0] >= 5.3 223 | ) 224 | is_feats_dim_768 = net_g.emb_channels == 768 225 | 226 | if isinstance(model, tuple): 227 | feats = model[0]( 228 | feats.squeeze(0).squeeze(0).to(self.device), 229 | return_tensors="pt", 230 | sampling_rate=16000, 231 | ) 232 | if self.is_half: 233 | feats = feats.input_values.to(self.device).half() 234 | else: 235 | feats = feats.input_values.to(self.device) 236 | with torch.no_grad(): 237 | if is_feats_dim_768: 238 | feats = model[1](feats).last_hidden_state 239 | else: 240 | feats = model[1](feats).extract_features 241 | else: 242 | inputs = { 243 | "source": feats.half().to(self.device) 244 | if half_support 245 | else feats.to(self.device), 246 | "padding_mask": padding_mask.to(self.device), 247 | "output_layer": embedding_output_layer, 248 | } 249 | 250 | if not half_support: 251 | model = model.float() 252 | inputs["source"] = inputs["source"].float() 253 | 254 | with torch.no_grad(): 255 | logits = model.extract_features(**inputs) 256 | if is_feats_dim_768: 257 | feats = logits[0] 258 | else: 259 | feats = model.final_proj(logits[0]) 260 | 261 | if ( 262 | isinstance(index, type(None)) == False 263 | and isinstance(big_npy, type(None)) == False 264 | and index_rate != 0 265 | ): 266 | npy = feats[0].cpu().numpy() 267 | if self.is_half: 268 | npy = npy.astype("float32") 269 | 270 | score, ix = index.search(npy, k=8) 271 | weight = np.square(1 / score) 272 | weight /= weight.sum(axis=1, 
keepdims=True) 273 | npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) 274 | 275 | if self.is_half: 276 | npy = npy.astype("float16") 277 | feats = ( 278 | torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate 279 | + (1 - index_rate) * feats 280 | ) 281 | 282 | feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) 283 | 284 | p_len = audio.shape[0] // self.window 285 | if feats.shape[1] < p_len: 286 | p_len = feats.shape[1] 287 | if pitch != None and pitchf != None: 288 | pitch = pitch[:, :p_len] 289 | pitchf = pitchf[:, :p_len] 290 | p_len = torch.tensor([p_len], device=self.device).long() 291 | with torch.no_grad(): 292 | if pitch != None and pitchf != None: 293 | audio1 = ( 294 | (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768) 295 | .data.cpu() 296 | .float() 297 | .numpy() 298 | .astype(np.int16) 299 | ) 300 | else: 301 | audio1 = ( 302 | (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768) 303 | .data.cpu() 304 | .float() 305 | .numpy() 306 | .astype(np.int16) 307 | ) 308 | del feats, p_len, padding_mask 309 | if torch.cuda.is_available(): 310 | torch.cuda.empty_cache() 311 | return audio1 312 | 313 | def __call__( 314 | self, 315 | model: HubertModel, 316 | embedding_output_layer: int, 317 | net_g: SynthesizerTrnMs256NSFSid, 318 | sid: int, 319 | audio: np.ndarray, 320 | transpose: int, 321 | f0_method: str, 322 | file_index: str, 323 | index_rate: float, 324 | if_f0: bool, 325 | f0_file: str = None, 326 | ): 327 | if file_index != "" and os.path.exists(file_index) and index_rate != 0: 328 | try: 329 | index = faiss.read_index(file_index) 330 | # big_npy = np.load(file_big_npy) 331 | big_npy = index.reconstruct_n(0, index.ntotal) 332 | except: 333 | traceback.print_exc() 334 | index = big_npy = None 335 | else: 336 | index = big_npy = None 337 | 338 | bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) 339 | audio = signal.filtfilt(bh, ah, audio) 340 | 341 | audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") 342 | opt_ts = [] 343 | if audio_pad.shape[0] > self.t_max: 344 | audio_sum = np.zeros_like(audio) 345 | for i in range(self.window): 346 | audio_sum += audio_pad[i : i - self.window] 347 | for t in range(self.t_center, audio.shape[0], self.t_center): 348 | opt_ts.append( 349 | t 350 | - self.t_query 351 | + np.where( 352 | np.abs(audio_sum[t - self.t_query : t + self.t_query]) 353 | == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min() 354 | )[0][0] 355 | ) 356 | 357 | audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") 358 | p_len = audio_pad.shape[0] // self.window 359 | inp_f0 = None 360 | if hasattr(f0_file, "name"): 361 | try: 362 | with open(f0_file.name, "r") as f: 363 | lines = f.read().strip("\n").split("\n") 364 | inp_f0 = [] 365 | for line in lines: 366 | inp_f0.append([float(i) for i in line.split(",")]) 367 | inp_f0 = np.array(inp_f0, dtype="float32") 368 | except: 369 | traceback.print_exc() 370 | sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() 371 | pitch, pitchf = None, None 372 | if if_f0 == 1: 373 | pitch, pitchf = self.get_f0(audio_pad, p_len, transpose, f0_method, inp_f0) 374 | pitch = pitch[:p_len] 375 | pitchf = pitchf[:p_len] 376 | if self.device.type == "mps": 377 | pitchf = pitchf.astype(np.float32) 378 | pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() 379 | pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() 380 | 381 | audio_opt = [] 382 | 383 | s = 0 384 | t = None 385 | 
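# Segmented conversion: opt_ts holds the low-energy cut points (in samples) found
# above. Each chunk passed to _convert includes t_pad2 extra samples of reflected
# padding, and t_pad_tgt samples (at the target sample rate) are trimmed from both
# ends of every converted chunk before concatenation, so the seams fall in quiet
# regions of the audio.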
386 | for t in opt_ts: 387 | t = t // self.window * self.window 388 | if if_f0 == 1: 389 | audio_opt.append( 390 | self._convert( 391 | model, 392 | embedding_output_layer, 393 | net_g, 394 | sid, 395 | audio_pad[s : t + self.t_pad2 + self.window], 396 | pitch[:, s // self.window : (t + self.t_pad2) // self.window], 397 | pitchf[:, s // self.window : (t + self.t_pad2) // self.window], 398 | index, 399 | big_npy, 400 | index_rate, 401 | )[self.t_pad_tgt : -self.t_pad_tgt] 402 | ) 403 | else: 404 | audio_opt.append( 405 | self._convert( 406 | model, 407 | embedding_output_layer, 408 | net_g, 409 | sid, 410 | audio_pad[s : t + self.t_pad2 + self.window], 411 | None, 412 | None, 413 | index, 414 | big_npy, 415 | index_rate, 416 | )[self.t_pad_tgt : -self.t_pad_tgt] 417 | ) 418 | s = t 419 | if if_f0 == 1: 420 | audio_opt.append( 421 | self._convert( 422 | model, 423 | embedding_output_layer, 424 | net_g, 425 | sid, 426 | audio_pad[t:], 427 | pitch[:, t // self.window :] if t is not None else pitch, 428 | pitchf[:, t // self.window :] if t is not None else pitchf, 429 | index, 430 | big_npy, 431 | index_rate, 432 | )[self.t_pad_tgt : -self.t_pad_tgt] 433 | ) 434 | else: 435 | audio_opt.append( 436 | self._convert( 437 | model, 438 | embedding_output_layer, 439 | net_g, 440 | sid, 441 | audio_pad[t:], 442 | None, 443 | None, 444 | index, 445 | big_npy, 446 | index_rate, 447 | )[self.t_pad_tgt : -self.t_pad_tgt] 448 | ) 449 | audio_opt = np.concatenate(audio_opt) 450 | del pitch, pitchf, sid 451 | if torch.cuda.is_available(): 452 | torch.cuda.empty_cache() 453 | return audio_opt 454 | -------------------------------------------------------------------------------- /lib/rvc/preprocessing/extract_f0.py: -------------------------------------------------------------------------------- 1 | import os 2 | import traceback 3 | from concurrent.futures import ProcessPoolExecutor 4 | from typing import * 5 | import multiprocessing as mp 6 | 7 | import numpy as np 8 | import pyworld 9 | import torch 10 | import torchcrepe 11 | from torch import Tensor 12 | from tqdm import tqdm 13 | 14 | from lib.rvc.utils import load_audio 15 | 16 | def get_optimal_torch_device(index: int = 0) -> torch.device: 17 | # Get cuda device 18 | if torch.cuda.is_available(): 19 | return torch.device(f"cuda:{index % torch.cuda.device_count()}") # Very fast 20 | elif torch.backends.mps.is_available(): 21 | return torch.device("mps") 22 | # Insert an else here to grab "xla" devices if available. TO DO later. Requires the torch_xla.core.xla_model library 23 | # Else wise return the "cpu" as a torch device, 24 | return torch.device("cpu") 25 | 26 | def get_f0_official_crepe_computation( 27 | x, 28 | sr, 29 | f0_min, 30 | f0_max, 31 | model="full", 32 | ): 33 | batch_size = 512 34 | torch_device = get_optimal_torch_device() 35 | audio = torch.tensor(np.copy(x))[None].float() 36 | f0, pd = torchcrepe.predict( 37 | audio, 38 | sr, 39 | 160, 40 | f0_min, 41 | f0_max, 42 | model, 43 | batch_size=batch_size, 44 | device=torch_device, 45 | return_periodicity=True, 46 | ) 47 | pd = torchcrepe.filter.median(pd, 3) 48 | f0 = torchcrepe.filter.mean(f0, 3) 49 | f0[pd < 0.1] = 0 50 | f0 = f0[0].cpu().numpy() 51 | f0 = f0[1:] # Get rid of extra first frame 52 | return f0 53 | 54 | def get_f0_crepe_computation( 55 | x, 56 | sr, 57 | f0_min, 58 | f0_max, 59 | hop_length=160, # 512 before. Hop length changes the speed that the voice jumps to a different dramatic pitch. Lower hop lengths means more pitch accuracy but longer inference time. 
60 | model="full", # Either use crepe-tiny "tiny" or crepe "full". Default is full 61 | ): 62 | x = x.astype(np.float32) # fixes the F.conv2D exception. We needed to convert double to float. 63 | x /= np.quantile(np.abs(x), 0.999) 64 | torch_device = get_optimal_torch_device() 65 | audio = torch.from_numpy(x).to(torch_device, copy=True) 66 | audio = torch.unsqueeze(audio, dim=0) 67 | if audio.ndim == 2 and audio.shape[0] > 1: 68 | audio = torch.mean(audio, dim=0, keepdim=True).detach() 69 | audio = audio.detach() 70 | print("Initiating prediction with a crepe_hop_length of: " + str(hop_length)) 71 | pitch: Tensor = torchcrepe.predict( 72 | audio, 73 | sr, 74 | hop_length, 75 | f0_min, 76 | f0_max, 77 | model, 78 | batch_size=hop_length * 2, 79 | device=torch_device, 80 | pad=True 81 | ) 82 | p_len = x.shape[0] // hop_length 83 | # Resize the pitch for final f0 84 | source = np.array(pitch.squeeze(0).cpu().float().numpy()) 85 | source[source < 0.001] = np.nan 86 | target = np.interp( 87 | np.arange(0, len(source) * p_len, len(source)) / p_len, 88 | np.arange(0, len(source)), 89 | source 90 | ) 91 | f0 = np.nan_to_num(target) 92 | f0 = f0[1:] # Get rid of extra first frame 93 | return f0 # Resized f0 94 | 95 | 96 | def compute_f0( 97 | path: str, 98 | f0_method: str, 99 | fs: int, 100 | hop: int, 101 | f0_max: float, 102 | f0_min: float, 103 | ): 104 | x = load_audio(path, fs) 105 | if f0_method == "harvest": 106 | f0, t = pyworld.harvest( 107 | x.astype(np.double), 108 | fs=fs, 109 | f0_ceil=f0_max, 110 | f0_floor=f0_min, 111 | frame_period=1000 * hop / fs, 112 | ) 113 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, fs) 114 | elif f0_method == "dio": 115 | f0, t = pyworld.dio( 116 | x.astype(np.double), 117 | fs=fs, 118 | f0_ceil=f0_max, 119 | f0_floor=f0_min, 120 | frame_period=1000 * hop / fs, 121 | ) 122 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, fs) 123 | elif f0_method == "mangio-crepe": 124 | f0 = get_f0_crepe_computation(x, fs, f0_min, f0_max, 160, "full") 125 | elif f0_method == "crepe": 126 | f0 = get_f0_official_crepe_computation(x.astype(np.double), fs, f0_min, f0_max, "full") 127 | return f0 128 | 129 | 130 | def coarse_f0(f0, f0_bin, f0_mel_min, f0_mel_max): 131 | f0_mel = 1127 * np.log(1 + f0 / 700) 132 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / ( 133 | f0_mel_max - f0_mel_min 134 | ) + 1 135 | 136 | # use 0 or 1 137 | f0_mel[f0_mel <= 1] = 1 138 | f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 139 | f0_coarse = np.rint(f0_mel).astype(np.int) 140 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( 141 | f0_coarse.max(), 142 | f0_coarse.min(), 143 | ) 144 | return f0_coarse 145 | 146 | 147 | def processor(paths, f0_method, samplerate=16000, hop_size=160, process_id=0): 148 | fs = samplerate 149 | hop = hop_size 150 | 151 | f0_bin = 256 152 | f0_max = 1100.0 153 | f0_min = 50.0 154 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 155 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 156 | if len(paths) != 0: 157 | for idx, (inp_path, opt_path1, opt_path2) in enumerate( 158 | tqdm(paths, position=1 + process_id) 159 | ): 160 | try: 161 | if ( 162 | os.path.exists(opt_path1 + ".npy") == True 163 | and os.path.exists(opt_path2 + ".npy") == True 164 | ): 165 | continue 166 | featur_pit = compute_f0(inp_path, f0_method, fs, hop, f0_max, f0_min) 167 | np.save( 168 | opt_path2, 169 | featur_pit, 170 | allow_pickle=False, 171 | ) # nsf 172 | coarse_pit = coarse_f0(featur_pit, f0_bin, f0_mel_min, f0_mel_max) 173 | np.save( 174 | opt_path1, 175 | 
coarse_pit, 176 | allow_pickle=False, 177 | ) # ori 178 | except: 179 | print(f"f0 failed {idx}: {inp_path} {traceback.format_exc()}") 180 | 181 | 182 | def run(training_dir: str, num_processes: int, f0_method: str): 183 | paths = [] 184 | dataset_dir = os.path.join(training_dir, "1_16k_wavs") 185 | opt_dir_f0 = os.path.join(training_dir, "2a_f0") 186 | opt_dir_f0_nsf = os.path.join(training_dir, "2b_f0nsf") 187 | 188 | if os.path.exists(opt_dir_f0) and os.path.exists(opt_dir_f0_nsf): 189 | return 190 | 191 | os.makedirs(opt_dir_f0, exist_ok=True) 192 | os.makedirs(opt_dir_f0_nsf, exist_ok=True) 193 | 194 | names = [] 195 | 196 | for pathname in sorted(list(os.listdir(dataset_dir))): 197 | if os.path.isdir(os.path.join(dataset_dir, pathname)): 198 | for f in sorted(list(os.listdir(os.path.join(dataset_dir, pathname)))): 199 | if "spec" in f: 200 | continue 201 | names.append(os.path.join(pathname, f)) 202 | else: 203 | names.append(pathname) 204 | 205 | for name in names: # dataset_dir/{05d}/file.ext 206 | filepath = os.path.join(dataset_dir, name) 207 | if "spec" in filepath: 208 | continue 209 | opt_filepath_f0 = os.path.join(opt_dir_f0, name) 210 | opt_filepath_f0_nsf = os.path.join(opt_dir_f0_nsf, name) 211 | paths.append([filepath, opt_filepath_f0, opt_filepath_f0_nsf]) 212 | 213 | for dir in set([(os.path.dirname(p[1]), os.path.dirname(p[2])) for p in paths]): 214 | os.makedirs(dir[0], exist_ok=True) 215 | os.makedirs(dir[1], exist_ok=True) 216 | 217 | with ProcessPoolExecutor(mp_context=mp.get_context("spawn")) as executer: 218 | for i in range(num_processes): 219 | executer.submit(processor, paths[i::num_processes], f0_method, process_id=i) 220 | 221 | processor(paths, f0_method) 222 | -------------------------------------------------------------------------------- /lib/rvc/preprocessing/extract_feature.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | import os 3 | import traceback 4 | from concurrent.futures import ProcessPoolExecutor 5 | from typing import * 6 | 7 | import numpy as np 8 | import soundfile as sf 9 | import torch 10 | import torch.nn.functional as F 11 | from fairseq import checkpoint_utils 12 | from tqdm import tqdm 13 | 14 | ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 15 | MODELS_DIR = os.path.join(ROOT_DIR, "models") 16 | EMBEDDINGS_LIST = { 17 | "hubert-base-japanese": ( 18 | "rinna_hubert_base_jp.pt", 19 | "hubert-base-japanese", 20 | "local", 21 | ), 22 | "contentvec": ("checkpoint_best_legacy_500.pt", "contentvec", "local"), 23 | } 24 | 25 | def get_embedder(embedder_name): 26 | if embedder_name in EMBEDDINGS_LIST: 27 | return EMBEDDINGS_LIST[embedder_name] 28 | return None 29 | 30 | 31 | def load_embedder(embedder_path: str, device): 32 | try: 33 | models, cfg, _ = checkpoint_utils.load_model_ensemble_and_task( 34 | [embedder_path], 35 | suffix="", 36 | ) 37 | embedder_model = models[0] 38 | embedder_model = embedder_model.to(device) 39 | if device != "cpu": 40 | embedder_model = embedder_model.half() 41 | else: 42 | embedder_model = embedder_model.float() 43 | embedder_model.eval() 44 | except Exception as e: 45 | print(f"Error: {e} {embedder_path}") 46 | traceback.print_exc() 47 | 48 | return embedder_model, cfg 49 | 50 | 51 | # wave must be 16k, hop_size=320 52 | def readwave(wav_path, normalize=False): 53 | wav, sr = sf.read(wav_path) 54 | assert sr == 16000 55 | feats = torch.from_numpy(wav).float() 56 | if feats.dim() 
== 2: # double channels 57 | feats = feats.mean(-1) 58 | assert feats.dim() == 1, feats.dim() 59 | if normalize: 60 | with torch.no_grad(): 61 | feats = F.layer_norm(feats, feats.shape) 62 | feats = feats.view(1, -1) 63 | return feats 64 | 65 | 66 | def processor( 67 | todo: List[str], 68 | device: torch.device, 69 | embedder_path: str, 70 | embedder_load_from: str, 71 | embedding_channel: bool, 72 | embedding_output_layer: int, 73 | wav_dir: str, 74 | out_dir: str, 75 | process_id: int, 76 | ): 77 | half_support = ( 78 | device.type == "cuda" and torch.cuda.get_device_capability(device)[0] >= 5.3 79 | ) 80 | is_feats_dim_768 = embedding_channel == 768 81 | 82 | if embedder_load_from == "local" and not os.path.exists(embedder_path): 83 | return f"Embedder not found: {embedder_path}" 84 | 85 | model, cfg = load_embedder(embedder_path, device) 86 | 87 | for file in tqdm(todo, position=1 + process_id): 88 | try: 89 | if file.endswith(".wav"): 90 | wav_filepath = os.path.join(wav_dir, file) 91 | out_filepath = os.path.join(out_dir, file.replace("wav", "npy")) 92 | 93 | if os.path.exists(out_filepath): 94 | continue 95 | 96 | os.makedirs(os.path.dirname(out_filepath), exist_ok=True) 97 | 98 | is_normalize = False if cfg is None else cfg.task.normalize 99 | feats = readwave(wav_filepath, normalize=is_normalize) 100 | padding_mask = torch.BoolTensor(feats.shape).fill_(False) 101 | if isinstance(model, tuple): 102 | feats = model[0]( 103 | feats.squeeze(0).squeeze(0).to(device), 104 | return_tensors="pt", 105 | sampling_rate=16000, 106 | ) 107 | if half_support: 108 | feats = feats.input_values.to(device).half() 109 | else: 110 | feats = feats.input_values.to(device).float() 111 | 112 | with torch.no_grad(): 113 | if half_support: 114 | if is_feats_dim_768: 115 | feats = model[1](feats).last_hidden_state 116 | else: 117 | feats = model[1](feats).extract_features 118 | else: 119 | if is_feats_dim_768: 120 | feats = model[1].float()(feats).last_hidden_state 121 | else: 122 | feats = model[1].float()(feats).extract_features 123 | else: 124 | inputs = { 125 | "source": feats.half().to(device) 126 | if half_support 127 | else feats.to(device), 128 | "padding_mask": padding_mask.to(device), 129 | "output_layer": embedding_output_layer, 130 | } 131 | 132 | # なんかまだこの時点でfloat16なので改めて変換 133 | if not half_support: 134 | model = model.float() 135 | inputs["source"] = inputs["source"].float() 136 | 137 | with torch.no_grad(): 138 | logits = model.extract_features(**inputs) 139 | if is_feats_dim_768: 140 | feats = logits[0] 141 | else: 142 | feats = model.final_proj(logits[0]) 143 | 144 | feats = feats.squeeze(0).float().cpu().numpy() 145 | if np.isnan(feats).sum() == 0: 146 | np.save(out_filepath, feats, allow_pickle=False) 147 | else: 148 | print(f"{file} contains nan") 149 | except Exception as e: 150 | print(f"Error: {e} {file}") 151 | traceback.print_exc() 152 | 153 | 154 | def run( 155 | training_dir: str, 156 | embedder_path: str, 157 | embedder_load_from: str, 158 | embedding_channel: int, 159 | embedding_output_layer: int, 160 | gpu_ids: List[int], 161 | device: Optional[Union[torch.device, str]] = None, 162 | ): 163 | wav_dir = os.path.join(training_dir, "1_16k_wavs") 164 | out_dir = os.path.join(training_dir, "3_feature256") 165 | 166 | num_gpus = len(gpu_ids) 167 | 168 | for gpu_id in gpu_ids: 169 | if num_gpus < gpu_id + 1: 170 | print(f"GPU {gpu_id} is not available") 171 | return 172 | 173 | if os.path.exists(out_dir): 174 | return 175 | 176 | os.makedirs(out_dir, exist_ok=True) 177 | 178 | todo 
= [ 179 | os.path.join(dir, f) 180 | for dir in sorted(list(os.listdir(wav_dir))) 181 | if os.path.isdir(os.path.join(wav_dir, dir)) 182 | for f in sorted(list(os.listdir(os.path.join(wav_dir, dir)))) 183 | ] 184 | 185 | if device is not None: 186 | if type(device) == str: 187 | device = torch.device(device) 188 | if device.type == "mps": 189 | device = torch.device( 190 | "cpu" 191 | ) # Mac(MPS) crashes when multiprocess, so change to CPU. 192 | processor( 193 | todo, 194 | device, 195 | embedder_path, 196 | embedder_load_from, 197 | embedding_channel, 198 | embedding_output_layer, 199 | wav_dir, 200 | out_dir, 201 | process_id=0, 202 | ) 203 | else: 204 | with ProcessPoolExecutor(mp_context=mp.get_context("spawn")) as executor: 205 | for i, id in enumerate(gpu_ids): 206 | executor.submit( 207 | processor, 208 | todo[i::num_gpus], 209 | torch.device(f"cuda:{id}"), 210 | embedder_path, 211 | embedder_load_from, 212 | embedding_channel, 213 | embedding_output_layer, 214 | wav_dir, 215 | out_dir, 216 | process_id=i, 217 | ) 218 | -------------------------------------------------------------------------------- /lib/rvc/preprocessing/slicer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | # This function is obtained from librosa. 5 | def get_rms( 6 | y, 7 | frame_length=2048, 8 | hop_length=512, 9 | pad_mode="constant", 10 | ): 11 | padding = (int(frame_length // 2), int(frame_length // 2)) 12 | y = np.pad(y, padding, mode=pad_mode) 13 | 14 | axis = -1 15 | # put our new within-frame axis at the end for now 16 | out_strides = y.strides + tuple([y.strides[axis]]) 17 | # Reduce the shape on the framing axis 18 | x_shape_trimmed = list(y.shape) 19 | x_shape_trimmed[axis] -= frame_length - 1 20 | out_shape = tuple(x_shape_trimmed) + tuple([frame_length]) 21 | xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides) 22 | if axis < 0: 23 | target_axis = axis - 1 24 | else: 25 | target_axis = axis + 1 26 | xw = np.moveaxis(xw, -1, target_axis) 27 | # Downsample along the target axis 28 | slices = [slice(None)] * xw.ndim 29 | slices[axis] = slice(0, None, hop_length) 30 | x = xw[tuple(slices)] 31 | 32 | # Calculate power 33 | power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True) 34 | 35 | return np.sqrt(power) 36 | 37 | 38 | class Slicer: 39 | def __init__( 40 | self, 41 | sr: int, 42 | threshold: float = -40.0, 43 | min_length: int = 5000, 44 | min_interval: int = 300, 45 | hop_size: int = 20, 46 | max_sil_kept: int = 5000, 47 | ): 48 | if not min_length >= min_interval >= hop_size: 49 | raise ValueError( 50 | "The following condition must be satisfied: min_length >= min_interval >= hop_size" 51 | ) 52 | if not max_sil_kept >= hop_size: 53 | raise ValueError( 54 | "The following condition must be satisfied: max_sil_kept >= hop_size" 55 | ) 56 | min_interval = sr * min_interval / 1000 57 | self.threshold = 10 ** (threshold / 20.0) 58 | self.hop_size = round(sr * hop_size / 1000) 59 | self.win_size = min(round(min_interval), 4 * self.hop_size) 60 | self.min_length = round(sr * min_length / 1000 / self.hop_size) 61 | self.min_interval = round(min_interval / self.hop_size) 62 | self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size) 63 | 64 | def _apply_slice(self, waveform, begin, end): 65 | if len(waveform.shape) > 1: 66 | return waveform[ 67 | :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size) 68 | ] 69 | else: 70 | return waveform[ 71 | begin * self.hop_size : 
min(waveform.shape[0], end * self.hop_size) 72 | ] 73 | 74 | # @timeit 75 | def slice(self, waveform): 76 | if len(waveform.shape) > 1: 77 | samples = waveform.mean(axis=0) 78 | else: 79 | samples = waveform 80 | if samples.shape[0] <= self.min_length: 81 | return [waveform] 82 | rms_list = get_rms( 83 | y=samples, frame_length=self.win_size, hop_length=self.hop_size 84 | ).squeeze(0) 85 | sil_tags = [] 86 | silence_start = None 87 | clip_start = 0 88 | for i, rms in enumerate(rms_list): 89 | # Keep looping while frame is silent. 90 | if rms < self.threshold: 91 | # Record start of silent frames. 92 | if silence_start is None: 93 | silence_start = i 94 | continue 95 | # Keep looping while frame is not silent and silence start has not been recorded. 96 | if silence_start is None: 97 | continue 98 | # Clear recorded silence start if interval is not enough or clip is too short 99 | is_leading_silence = silence_start == 0 and i > self.max_sil_kept 100 | need_slice_middle = ( 101 | i - silence_start >= self.min_interval 102 | and i - clip_start >= self.min_length 103 | ) 104 | if not is_leading_silence and not need_slice_middle: 105 | silence_start = None 106 | continue 107 | # Need slicing. Record the range of silent frames to be removed. 108 | if i - silence_start <= self.max_sil_kept: 109 | pos = rms_list[silence_start : i + 1].argmin() + silence_start 110 | if silence_start == 0: 111 | sil_tags.append((0, pos)) 112 | else: 113 | sil_tags.append((pos, pos)) 114 | clip_start = pos 115 | elif i - silence_start <= self.max_sil_kept * 2: 116 | pos = rms_list[ 117 | i - self.max_sil_kept : silence_start + self.max_sil_kept + 1 118 | ].argmin() 119 | pos += i - self.max_sil_kept 120 | pos_l = ( 121 | rms_list[ 122 | silence_start : silence_start + self.max_sil_kept + 1 123 | ].argmin() 124 | + silence_start 125 | ) 126 | pos_r = ( 127 | rms_list[i - self.max_sil_kept : i + 1].argmin() 128 | + i 129 | - self.max_sil_kept 130 | ) 131 | if silence_start == 0: 132 | sil_tags.append((0, pos_r)) 133 | clip_start = pos_r 134 | else: 135 | sil_tags.append((min(pos_l, pos), max(pos_r, pos))) 136 | clip_start = max(pos_r, pos) 137 | else: 138 | pos_l = ( 139 | rms_list[ 140 | silence_start : silence_start + self.max_sil_kept + 1 141 | ].argmin() 142 | + silence_start 143 | ) 144 | pos_r = ( 145 | rms_list[i - self.max_sil_kept : i + 1].argmin() 146 | + i 147 | - self.max_sil_kept 148 | ) 149 | if silence_start == 0: 150 | sil_tags.append((0, pos_r)) 151 | else: 152 | sil_tags.append((pos_l, pos_r)) 153 | clip_start = pos_r 154 | silence_start = None 155 | # Deal with trailing silence. 156 | total_frames = rms_list.shape[0] 157 | if ( 158 | silence_start is not None 159 | and total_frames - silence_start >= self.min_interval 160 | ): 161 | silence_end = min(total_frames, silence_start + self.max_sil_kept) 162 | pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start 163 | sil_tags.append((pos, total_frames + 1)) 164 | # Apply and return slices. 
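# sil_tags is a list of (start, end) frame indices marking the silent spans found
# above. If none were found the waveform is returned whole; otherwise the audio
# between consecutive silent spans is emitted as separate chunks via _apply_slice,
# which converts frame indices back to samples using hop_size.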
165 | if len(sil_tags) == 0: 166 | return [waveform] 167 | else: 168 | chunks = [] 169 | if sil_tags[0][0] > 0: 170 | chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0])) 171 | for i in range(len(sil_tags) - 1): 172 | chunks.append( 173 | self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]) 174 | ) 175 | if sil_tags[-1][1] < total_frames: 176 | chunks.append( 177 | self._apply_slice(waveform, sil_tags[-1][1], total_frames) 178 | ) 179 | return chunks 180 | -------------------------------------------------------------------------------- /lib/rvc/preprocessing/split.py: -------------------------------------------------------------------------------- 1 | import operator 2 | import os 3 | from concurrent.futures import ProcessPoolExecutor 4 | from typing import * 5 | 6 | import librosa 7 | import numpy as np 8 | import scipy.signal as signal 9 | from scipy.io import wavfile 10 | from tqdm import tqdm 11 | 12 | from lib.rvc.utils import load_audio 13 | 14 | from .slicer import Slicer 15 | 16 | 17 | def norm_write( 18 | tmp_audio: np.ndarray, 19 | idx0: int, 20 | idx1: int, 21 | speaker_id: int, 22 | outdir: str, 23 | outdir_16k: str, 24 | sampling_rate: int, 25 | max: float, 26 | alpha: float, 27 | is_normalize: bool, 28 | ): 29 | if is_normalize: 30 | tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (max * alpha)) + ( 31 | 1 - alpha 32 | ) * tmp_audio 33 | else: 34 | # clip level to max (cause sometimes when floating point decoding) 35 | audio_min = np.min(tmp_audio) 36 | if audio_min < -max: 37 | tmp_audio = tmp_audio / -audio_min * max 38 | audio_max = np.max(tmp_audio) 39 | if audio_max > max: 40 | tmp_audio = tmp_audio / audio_max * max 41 | 42 | wavfile.write( 43 | os.path.join(outdir, f"{speaker_id:05}", f"{idx0}_{idx1}.wav"), 44 | sampling_rate, 45 | tmp_audio.astype(np.float32), 46 | ) 47 | 48 | tmp_audio = librosa.resample( 49 | tmp_audio, orig_sr=sampling_rate, target_sr=16000, res_type="soxr_vhq" 50 | ) 51 | wavfile.write( 52 | os.path.join(outdir_16k, f"{speaker_id:05}", f"{idx0}_{idx1}.wav"), 53 | 16000, 54 | tmp_audio.astype(np.float32), 55 | ) 56 | 57 | 58 | def write_mute( 59 | mute_wave_filename: str, 60 | speaker_id: int, 61 | outdir: str, 62 | outdir_16k: str, 63 | sampling_rate: int, 64 | ): 65 | tmp_audio = load_audio(mute_wave_filename, sampling_rate) 66 | wavfile.write( 67 | os.path.join(outdir, f"{speaker_id:05}", "mute.wav"), 68 | sampling_rate, 69 | tmp_audio.astype(np.float32), 70 | ) 71 | tmp_audio = librosa.resample( 72 | tmp_audio, orig_sr=sampling_rate, target_sr=16000, res_type="soxr_vhq" 73 | ) 74 | wavfile.write( 75 | os.path.join(outdir_16k, f"{speaker_id:05}", "mute.wav"), 76 | 16000, 77 | tmp_audio.astype(np.float32), 78 | ) 79 | 80 | 81 | def pipeline( 82 | slicer: Slicer, 83 | datasets: List[Tuple[str, int]], # List[(path, speaker_id)] 84 | outdir: str, 85 | outdir_16k: str, 86 | sampling_rate: int, 87 | is_normalize: bool, 88 | process_id: int = 0, 89 | ): 90 | per = 3.7 91 | overlap = 0.3 92 | tail = per + overlap 93 | max = 0.95 94 | alpha = 0.8 95 | 96 | bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=sampling_rate) 97 | 98 | for index, (wave_filename, speaker_id) in tqdm(datasets, position=1 + process_id): 99 | audio = load_audio(wave_filename, sampling_rate) 100 | audio = signal.lfilter(bh, ah, audio) 101 | 102 | idx1 = 0 103 | for audio in slicer.slice(audio): 104 | i = 0 105 | while 1: 106 | start = int(sampling_rate * (per - overlap) * i) 107 | i += 1 108 | if len(audio[start:]) > tail * sampling_rate: 109 | tmp_audio = 
audio[start : start + int(per * sampling_rate)] 110 | norm_write( 111 | tmp_audio, 112 | index, 113 | idx1, 114 | speaker_id, 115 | outdir, 116 | outdir_16k, 117 | sampling_rate, 118 | max, 119 | alpha, 120 | is_normalize, 121 | ) 122 | idx1 += 1 123 | else: 124 | tmp_audio = audio[start:] 125 | break 126 | norm_write( 127 | tmp_audio, 128 | index, 129 | idx1, 130 | speaker_id, 131 | outdir, 132 | outdir_16k, 133 | sampling_rate, 134 | max, 135 | alpha, 136 | is_normalize, 137 | ) 138 | idx1 += 1 139 | 140 | 141 | def preprocess_audio( 142 | datasets: List[Tuple[str, int]], # List[(path, speaker_id)] 143 | sampling_rate: int, 144 | num_processes: int, 145 | training_dir: str, 146 | is_normalize: bool, 147 | mute_wav_path: str, 148 | ): 149 | waves_dir = os.path.join(training_dir, "0_gt_wavs") 150 | waves16k_dir = os.path.join(training_dir, "1_16k_wavs") 151 | if os.path.exists(waves_dir) and os.path.exists(waves16k_dir): 152 | return 153 | 154 | for speaker_id in set([spk for _, spk in datasets]): 155 | os.makedirs(os.path.join(waves_dir, f"{speaker_id:05}"), exist_ok=True) 156 | os.makedirs(os.path.join(waves16k_dir, f"{speaker_id:05}"), exist_ok=True) 157 | 158 | all = [(i, x) for i, x in enumerate(sorted(datasets, key=operator.itemgetter(0)))] 159 | 160 | # n of datasets per process 161 | process_all_nums = [len(all) // num_processes] * num_processes 162 | # add residual datasets 163 | for i in range(len(all) % num_processes): 164 | process_all_nums[i] += 1 165 | 166 | assert len(all) == sum(process_all_nums), print( 167 | f"len(all): {len(all)}, sum(process_all_nums): {sum(process_all_nums)}" 168 | ) 169 | 170 | with ProcessPoolExecutor(max_workers=num_processes) as executor: 171 | all_index = 0 172 | for i in range(num_processes): 173 | data = all[all_index : all_index + process_all_nums[i]] 174 | slicer = Slicer( 175 | sr=sampling_rate, 176 | threshold=-42, 177 | min_length=1500, 178 | min_interval=400, 179 | hop_size=15, 180 | max_sil_kept=500, 181 | ) 182 | executor.submit( 183 | pipeline, 184 | slicer, 185 | data, 186 | waves_dir, 187 | waves16k_dir, 188 | sampling_rate, 189 | is_normalize, 190 | process_id=i, 191 | ) 192 | all_index += process_all_nums[i] 193 | 194 | for speaker_id in set([spk for _, spk in datasets]): 195 | write_mute(mute_wav_path, speaker_id, waves_dir, waves16k_dir, sampling_rate) 196 | -------------------------------------------------------------------------------- /lib/rvc/transforms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.nn import functional as F 4 | 5 | DEFAULT_MIN_BIN_WIDTH = 1e-3 6 | DEFAULT_MIN_BIN_HEIGHT = 1e-3 7 | DEFAULT_MIN_DERIVATIVE = 1e-3 8 | 9 | 10 | def piecewise_rational_quadratic_transform( 11 | inputs, 12 | unnormalized_widths, 13 | unnormalized_heights, 14 | unnormalized_derivatives, 15 | inverse=False, 16 | tails=None, 17 | tail_bound=1.0, 18 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 19 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 20 | min_derivative=DEFAULT_MIN_DERIVATIVE, 21 | ): 22 | if tails is None: 23 | spline_fn = rational_quadratic_spline 24 | spline_kwargs = {} 25 | else: 26 | spline_fn = unconstrained_rational_quadratic_spline 27 | spline_kwargs = {"tails": tails, "tail_bound": tail_bound} 28 | 29 | outputs, logabsdet = spline_fn( 30 | inputs=inputs, 31 | unnormalized_widths=unnormalized_widths, 32 | unnormalized_heights=unnormalized_heights, 33 | unnormalized_derivatives=unnormalized_derivatives, 34 | inverse=inverse, 35 | 
min_bin_width=min_bin_width, 36 | min_bin_height=min_bin_height, 37 | min_derivative=min_derivative, 38 | **spline_kwargs 39 | ) 40 | return outputs, logabsdet 41 | 42 | 43 | def searchsorted(bin_locations, inputs, eps=1e-6): 44 | bin_locations[..., -1] += eps 45 | return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1 46 | 47 | 48 | def unconstrained_rational_quadratic_spline( 49 | inputs, 50 | unnormalized_widths, 51 | unnormalized_heights, 52 | unnormalized_derivatives, 53 | inverse=False, 54 | tails="linear", 55 | tail_bound=1.0, 56 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 57 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 58 | min_derivative=DEFAULT_MIN_DERIVATIVE, 59 | ): 60 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) 61 | outside_interval_mask = ~inside_interval_mask 62 | 63 | outputs = torch.zeros_like(inputs) 64 | logabsdet = torch.zeros_like(inputs) 65 | 66 | if tails == "linear": 67 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) 68 | constant = np.log(np.exp(1 - min_derivative) - 1) 69 | unnormalized_derivatives[..., 0] = constant 70 | unnormalized_derivatives[..., -1] = constant 71 | 72 | outputs[outside_interval_mask] = inputs[outside_interval_mask] 73 | logabsdet[outside_interval_mask] = 0 74 | else: 75 | raise RuntimeError("{} tails are not implemented.".format(tails)) 76 | 77 | ( 78 | outputs[inside_interval_mask], 79 | logabsdet[inside_interval_mask], 80 | ) = rational_quadratic_spline( 81 | inputs=inputs[inside_interval_mask], 82 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :], 83 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :], 84 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], 85 | inverse=inverse, 86 | left=-tail_bound, 87 | right=tail_bound, 88 | bottom=-tail_bound, 89 | top=tail_bound, 90 | min_bin_width=min_bin_width, 91 | min_bin_height=min_bin_height, 92 | min_derivative=min_derivative, 93 | ) 94 | 95 | return outputs, logabsdet 96 | 97 | 98 | def rational_quadratic_spline( 99 | inputs, 100 | unnormalized_widths, 101 | unnormalized_heights, 102 | unnormalized_derivatives, 103 | inverse=False, 104 | left=0.0, 105 | right=1.0, 106 | bottom=0.0, 107 | top=1.0, 108 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 109 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 110 | min_derivative=DEFAULT_MIN_DERIVATIVE, 111 | ): 112 | if torch.min(inputs) < left or torch.max(inputs) > right: 113 | raise ValueError("Input to a transform is not within its domain") 114 | 115 | num_bins = unnormalized_widths.shape[-1] 116 | 117 | if min_bin_width * num_bins > 1.0: 118 | raise ValueError("Minimal bin width too large for the number of bins") 119 | if min_bin_height * num_bins > 1.0: 120 | raise ValueError("Minimal bin height too large for the number of bins") 121 | 122 | widths = F.softmax(unnormalized_widths, dim=-1) 123 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths 124 | cumwidths = torch.cumsum(widths, dim=-1) 125 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) 126 | cumwidths = (right - left) * cumwidths + left 127 | cumwidths[..., 0] = left 128 | cumwidths[..., -1] = right 129 | widths = cumwidths[..., 1:] - cumwidths[..., :-1] 130 | 131 | derivatives = min_derivative + F.softplus(unnormalized_derivatives) 132 | 133 | heights = F.softmax(unnormalized_heights, dim=-1) 134 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights 135 | cumheights = torch.cumsum(heights, dim=-1) 136 | cumheights = F.pad(cumheights, 
pad=(1, 0), mode="constant", value=0.0) 137 | cumheights = (top - bottom) * cumheights + bottom 138 | cumheights[..., 0] = bottom 139 | cumheights[..., -1] = top 140 | heights = cumheights[..., 1:] - cumheights[..., :-1] 141 | 142 | if inverse: 143 | bin_idx = searchsorted(cumheights, inputs)[..., None] 144 | else: 145 | bin_idx = searchsorted(cumwidths, inputs)[..., None] 146 | 147 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] 148 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0] 149 | 150 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] 151 | delta = heights / widths 152 | input_delta = delta.gather(-1, bin_idx)[..., 0] 153 | 154 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] 155 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] 156 | 157 | input_heights = heights.gather(-1, bin_idx)[..., 0] 158 | 159 | if inverse: 160 | a = (inputs - input_cumheights) * ( 161 | input_derivatives + input_derivatives_plus_one - 2 * input_delta 162 | ) + input_heights * (input_delta - input_derivatives) 163 | b = input_heights * input_derivatives - (inputs - input_cumheights) * ( 164 | input_derivatives + input_derivatives_plus_one - 2 * input_delta 165 | ) 166 | c = -input_delta * (inputs - input_cumheights) 167 | 168 | discriminant = b.pow(2) - 4 * a * c 169 | assert (discriminant >= 0).all() 170 | 171 | root = (2 * c) / (-b - torch.sqrt(discriminant)) 172 | outputs = root * input_bin_widths + input_cumwidths 173 | 174 | theta_one_minus_theta = root * (1 - root) 175 | denominator = input_delta + ( 176 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta) 177 | * theta_one_minus_theta 178 | ) 179 | derivative_numerator = input_delta.pow(2) * ( 180 | input_derivatives_plus_one * root.pow(2) 181 | + 2 * input_delta * theta_one_minus_theta 182 | + input_derivatives * (1 - root).pow(2) 183 | ) 184 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 185 | 186 | return outputs, -logabsdet 187 | else: 188 | theta = (inputs - input_cumwidths) / input_bin_widths 189 | theta_one_minus_theta = theta * (1 - theta) 190 | 191 | numerator = input_heights * ( 192 | input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta 193 | ) 194 | denominator = input_delta + ( 195 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta) 196 | * theta_one_minus_theta 197 | ) 198 | outputs = input_cumheights + numerator / denominator 199 | 200 | derivative_numerator = input_delta.pow(2) * ( 201 | input_derivatives_plus_one * theta.pow(2) 202 | + 2 * input_delta * theta_one_minus_theta 203 | + input_derivatives * (1 - theta).pow(2) 204 | ) 205 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 206 | 207 | return outputs, logabsdet 208 | -------------------------------------------------------------------------------- /lib/rvc/utils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import logging 3 | import os 4 | import shutil 5 | import socket 6 | import sys 7 | 8 | import ffmpeg 9 | import matplotlib 10 | import matplotlib.pylab as plt 11 | import numpy as np 12 | import torch 13 | from scipy.io.wavfile import read 14 | from torch.nn import functional as F 15 | 16 | from modules.shared import ROOT_DIR 17 | 18 | from .config import TrainConfig 19 | 20 | matplotlib.use("Agg") 21 | logging.getLogger("matplotlib").setLevel(logging.WARNING) 22 | 23 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 24 | logger = 
logging 25 | 26 | 27 | def load_audio(file: str, sr): 28 | try: 29 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 30 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary. 31 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 32 | file = ( 33 | file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 34 | ) # Prevent small white copy path head and tail with spaces and " and return 35 | out, _ = ( 36 | ffmpeg.input(file, threads=0) 37 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) 38 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) 39 | ) 40 | except Exception as e: 41 | raise RuntimeError(f"Failed to load audio: {e}") 42 | 43 | return np.frombuffer(out, np.float32).flatten() 44 | 45 | 46 | def find_empty_port(): 47 | s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 48 | s.bind(("", 0)) 49 | s.listen(1) 50 | port = s.getsockname()[1] 51 | s.close() 52 | return port 53 | 54 | 55 | def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1): 56 | assert os.path.isfile(checkpoint_path) 57 | checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") 58 | 59 | saved_state_dict = checkpoint_dict["model"] 60 | if hasattr(model, "module"): 61 | state_dict = model.module.state_dict() 62 | else: 63 | state_dict = model.state_dict() 64 | new_state_dict = {} 65 | for k, v in state_dict.items(): # 模型需要的shape 66 | try: 67 | new_state_dict[k] = saved_state_dict[k] 68 | if saved_state_dict[k].shape != state_dict[k].shape: 69 | print( 70 | f"shape-{k}-mismatch|need-{state_dict[k].shape}|get-{saved_state_dict[k].shape}" 71 | ) 72 | if saved_state_dict[k].dim() == 2: # NOTE: check is this ok? 73 | # for embedded input 256 <==> 768 74 | # this achieves we can continue training from original's pretrained checkpoints when using embedder that 768-th dim output etc. 
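# A mismatched 2-D weight (e.g. a 256-dim embedding projection vs. a 768-dim one)
# is resized with bilinear interpolation so the pretrained checkpoint can still
# initialize this model; fp16 tensors are upcast for F.interpolate and cast back
# to half afterwards.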
75 | if saved_state_dict[k].dtype == torch.half: 76 | new_state_dict[k] = ( 77 | F.interpolate( 78 | saved_state_dict[k].float().unsqueeze(0).unsqueeze(0), 79 | size=state_dict[k].shape, 80 | mode="bilinear", 81 | ) 82 | .half() 83 | .squeeze(0) 84 | .squeeze(0) 85 | ) 86 | else: 87 | new_state_dict[k] = ( 88 | F.interpolate( 89 | saved_state_dict[k].unsqueeze(0).unsqueeze(0), 90 | size=state_dict[k].shape, 91 | mode="bilinear", 92 | ) 93 | .squeeze(0) 94 | .squeeze(0) 95 | ) 96 | print( 97 | "interpolated new_state_dict", 98 | k, 99 | "from", 100 | saved_state_dict[k].shape, 101 | "to", 102 | new_state_dict[k].shape, 103 | ) 104 | else: 105 | raise KeyError 106 | except Exception as e: 107 | # print(traceback.format_exc()) 108 | print(f"{k} is not in the checkpoint") 109 | print("error: %s" % e) 110 | new_state_dict[k] = v # 模型自带的随机值 111 | if hasattr(model, "module"): 112 | model.module.load_state_dict(new_state_dict, strict=False) 113 | else: 114 | model.load_state_dict(new_state_dict, strict=False) 115 | print("Loaded model weights") 116 | 117 | epoch = checkpoint_dict["epoch"] 118 | learning_rate = checkpoint_dict["learning_rate"] 119 | if optimizer is not None and load_opt == 1: 120 | optimizer.load_state_dict(checkpoint_dict["optimizer"]) 121 | print("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, epoch)) 122 | return model, optimizer, learning_rate, epoch 123 | 124 | 125 | def save_state(model, optimizer, learning_rate, epoch, checkpoint_path): 126 | print( 127 | "Saving model and optimizer state at epoch {} to {}".format( 128 | epoch, checkpoint_path 129 | ) 130 | ) 131 | if hasattr(model, "module"): 132 | state_dict = model.module.state_dict() 133 | else: 134 | state_dict = model.state_dict() 135 | torch.save( 136 | { 137 | "model": state_dict, 138 | "epoch": epoch, 139 | "optimizer": optimizer.state_dict(), 140 | "learning_rate": learning_rate, 141 | }, 142 | checkpoint_path, 143 | ) 144 | 145 | 146 | def summarize( 147 | writer, 148 | global_step, 149 | scalars={}, 150 | histograms={}, 151 | images={}, 152 | audios={}, 153 | audio_sampling_rate=22050, 154 | ): 155 | for k, v in scalars.items(): 156 | writer.add_scalar(k, v, global_step) 157 | for k, v in histograms.items(): 158 | writer.add_histogram(k, v, global_step) 159 | for k, v in images.items(): 160 | writer.add_image(k, v, global_step, dataformats="HWC") 161 | for k, v in audios.items(): 162 | writer.add_audio(k, v, global_step, audio_sampling_rate) 163 | 164 | 165 | def latest_checkpoint_path(dir_path, regex="G_*.pth"): 166 | filelist = glob.glob(os.path.join(dir_path, regex)) 167 | if len(filelist) == 0: 168 | return None 169 | filelist.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) 170 | filepath = filelist[-1] 171 | return filepath 172 | 173 | 174 | def plot_spectrogram_to_numpy(spectrogram): 175 | fig, ax = plt.subplots(figsize=(10, 2)) 176 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") 177 | plt.colorbar(im, ax=ax) 178 | plt.xlabel("Frames") 179 | plt.ylabel("Channels") 180 | plt.tight_layout() 181 | 182 | fig.canvas.draw() 183 | data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") 184 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 185 | plt.close() 186 | return data 187 | 188 | 189 | def plot_alignment_to_numpy(alignment, info=None): 190 | fig, ax = plt.subplots(figsize=(6, 4)) 191 | im = ax.imshow( 192 | alignment.transpose(), aspect="auto", origin="lower", interpolation="none" 193 | ) 194 | fig.colorbar(im, ax=ax) 195 
| xlabel = "Decoder timestep" 196 | if info is not None: 197 | xlabel += "\n\n" + info 198 | plt.xlabel(xlabel) 199 | plt.ylabel("Encoder timestep") 200 | plt.tight_layout() 201 | 202 | fig.canvas.draw() 203 | data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") 204 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 205 | plt.close() 206 | return data 207 | 208 | 209 | def load_wav_to_torch(full_path): 210 | sampling_rate, data = read(full_path) 211 | return torch.FloatTensor(data.astype(np.float32)), sampling_rate 212 | 213 | 214 | def load_config(training_dir: str, sample_rate: int, emb_channels: int): 215 | if emb_channels == 256: 216 | config_path = os.path.join(ROOT_DIR, "configs", f"{sample_rate}.json") 217 | else: 218 | config_path = os.path.join( 219 | ROOT_DIR, "configs", f"{sample_rate}-{emb_channels}.json" 220 | ) 221 | config_save_path = os.path.join(training_dir, "config.json") 222 | 223 | shutil.copyfile(config_path, config_save_path) 224 | 225 | return TrainConfig.parse_file(config_save_path) 226 | -------------------------------------------------------------------------------- /models/checkpoints/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /models/embeddings/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /models/pretrained/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /models/training/.gitignore: -------------------------------------------------------------------------------- 1 | */** 2 | 3 | !mute/**/* 4 | !.gitignore 5 | 6 | mute/**/*.pt 7 | -------------------------------------------------------------------------------- /models/training/models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /models/training/mute/0_gt_wavs/mute32k.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/models/training/mute/0_gt_wavs/mute32k.wav -------------------------------------------------------------------------------- /models/training/mute/0_gt_wavs/mute40k.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/models/training/mute/0_gt_wavs/mute40k.wav -------------------------------------------------------------------------------- /models/training/mute/0_gt_wavs/mute48k.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/models/training/mute/0_gt_wavs/mute48k.wav -------------------------------------------------------------------------------- /models/training/mute/1_16k_wavs/mute.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/models/training/mute/1_16k_wavs/mute.wav -------------------------------------------------------------------------------- /models/training/mute/2a_f0/mute.wav.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/models/training/mute/2a_f0/mute.wav.npy -------------------------------------------------------------------------------- /models/training/mute/2b_f0nsf/mute.wav.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/models/training/mute/2b_f0nsf/mute.wav.npy -------------------------------------------------------------------------------- /models/training/mute/3_feature256/mute.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/models/training/mute/3_feature256/mute.npy -------------------------------------------------------------------------------- /modules/cmd_opts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | parser = argparse.ArgumentParser() 4 | 5 | parser.add_argument("--host", help="Host to connect to", type=str, default="127.0.0.1") 6 | parser.add_argument("--port", help="Port to connect to", type=int) 7 | parser.add_argument("--share", help="Enable gradio share", action="store_true") 8 | parser.add_argument( 9 | "--models-dir", help="Path to models directory", type=str, default=None 10 | ) 11 | parser.add_argument( 12 | "--output-dir", help="Path to output directory", type=str, default=None 13 | ) 14 | parser.add_argument( 15 | "--precision", 16 | help="Precision to use", 17 | type=str, 18 | default="fp16", 19 | choices=["fp32", "fp16"], 20 | ) 21 | 22 | opts, _ = parser.parse_known_args() 23 | -------------------------------------------------------------------------------- /modules/core.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import os 3 | import shutil 4 | import sys 5 | from concurrent.futures import ThreadPoolExecutor 6 | 7 | import requests 8 | 9 | from modules.models import MODELS_DIR 10 | from modules.shared import ROOT_DIR 11 | from modules.utils import download_file 12 | 13 | 14 | def get_hf_etag(url: str): 15 | r = requests.head(url) 16 | 17 | etag = r.headers["X-Linked-ETag"] if "X-Linked-ETag" in r.headers else "" 18 | 19 | if etag.startswith('"') and etag.endswith('"'): 20 | etag = etag[1:-1] 21 | 22 | return etag 23 | 24 | 25 | def calc_sha256(filepath: str): 26 | sha256 = hashlib.sha256() 27 | with open(filepath, "rb") as f: 28 | for chunk in iter(lambda: f.read(4096), b""): 29 | sha256.update(chunk) 30 | return sha256.hexdigest() 31 | 32 | 33 | def download_models(): 34 | def hash_check(url: str, out: str): 35 | if not os.path.exists(out): 36 | return False 37 | etag = get_hf_etag(url) 38 | hash = calc_sha256(out) 39 | return etag == hash 40 | 41 | os.makedirs(os.path.join(MODELS_DIR, "pretrained", "v2"), exist_ok=True) 42 | 43 | tasks = [] 44 | for template in [ 45 | "D{}k", 46 | "G{}k", 47 | "f0D{}k", 48 | "f0G{}k", 49 | ]: 50 | basename = template.format("40") 51 | url = f"https://huggingface.co/ddPn08/rvc-webui-models/resolve/main/pretrained/v2/{basename}.pth" 52 
| out = os.path.join(MODELS_DIR, "pretrained", "v2", f"{basename}.pth") 53 | 54 | if hash_check(url, out): 55 | continue 56 | 57 | tasks.append((url, out)) 58 | 59 | for filename in [ 60 | "checkpoint_best_legacy_500.pt", 61 | ]: 62 | out = os.path.join(MODELS_DIR, "embeddings", filename) 63 | url = f"https://huggingface.co/ddPn08/rvc-webui-models/resolve/main/embeddings/{filename}" 64 | 65 | if hash_check(url, out): 66 | continue 67 | 68 | tasks.append( 69 | ( 70 | f"https://huggingface.co/ddPn08/rvc-webui-models/resolve/main/embeddings/{filename}", 71 | out, 72 | ) 73 | ) 74 | 75 | # japanese-hubert-base (Fairseq) 76 | # from official repo 77 | # NOTE: change filename? 78 | hubert_jp_url = f"https://huggingface.co/rinna/japanese-hubert-base/resolve/main/fairseq/model.pt" 79 | out = os.path.join(MODELS_DIR, "embeddings", "rinna_hubert_base_jp.pt") 80 | if not hash_check(hubert_jp_url, out): 81 | tasks.append( 82 | ( 83 | hubert_jp_url, 84 | out, 85 | ) 86 | ) 87 | 88 | if len(tasks) < 1: 89 | return 90 | 91 | with ThreadPoolExecutor() as pool: 92 | pool.map( 93 | download_file, 94 | *zip( 95 | *[(filename, out, i, True) for i, (filename, out) in enumerate(tasks)] 96 | ), 97 | ) 98 | 99 | 100 | def install_ffmpeg(): 101 | if os.path.exists(os.path.join(ROOT_DIR, "bin", "ffmpeg.exe")): 102 | return 103 | tmpdir = os.path.join(ROOT_DIR, "tmp") 104 | url = ( 105 | "https://www.gyan.dev/ffmpeg/builds/packages/ffmpeg-5.1.2-essentials_build.zip" 106 | ) 107 | out = os.path.join(tmpdir, "ffmpeg.zip") 108 | os.makedirs(os.path.dirname(out), exist_ok=True) 109 | download_file(url, out) 110 | shutil.unpack_archive(out, os.path.join(tmpdir, "ffmpeg")) 111 | shutil.copyfile( 112 | os.path.join( 113 | tmpdir, "ffmpeg", "ffmpeg-5.1.2-essentials_build", "bin", "ffmpeg.exe" 114 | ), 115 | os.path.join(ROOT_DIR, "bin", "ffmpeg.exe"), 116 | ) 117 | os.remove(os.path.join(tmpdir, "ffmpeg.zip")) 118 | shutil.rmtree(os.path.join(tmpdir, "ffmpeg")) 119 | 120 | 121 | def update_modelnames(): 122 | for sr in ["32k", "40k", "48k"]: 123 | files = [ 124 | f"f0G{sr}", 125 | f"f0D{sr}", 126 | f"G{sr}", 127 | f"D{sr}", 128 | ] 129 | for file in files: 130 | filepath = os.path.join(MODELS_DIR, "pretrained", f"{file}.pth") 131 | if os.path.exists(filepath): 132 | os.rename( 133 | filepath, 134 | os.path.join(MODELS_DIR, "pretrained", f"{file}256.pth"), 135 | ) 136 | 137 | if not os.path.exists(os.path.join(MODELS_DIR, "embeddings")): 138 | os.makedirs(os.path.join(MODELS_DIR, "embeddings")) 139 | 140 | if os.path.exists(os.path.join(MODELS_DIR, "hubert_base.pt")): 141 | os.rename( 142 | os.path.join(MODELS_DIR, "hubert_base.pt"), 143 | os.path.join(MODELS_DIR, "embeddings", "hubert_base.pt"), 144 | ) 145 | if os.path.exists(os.path.join(MODELS_DIR, "checkpoint_best_legacy_500.pt")): 146 | os.rename( 147 | os.path.join(MODELS_DIR, "checkpoint_best_legacy_500.pt"), 148 | os.path.join(MODELS_DIR, "embeddings", "checkpoint_best_legacy_500.pt"), 149 | ) 150 | 151 | 152 | def preload(): 153 | update_modelnames() 154 | download_models() 155 | if sys.platform == "win32": 156 | install_ffmpeg() 157 | -------------------------------------------------------------------------------- /modules/merge.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from typing import * 3 | 4 | import torch 5 | import tqdm 6 | 7 | 8 | def merge( 9 | path_a: str, 10 | path_b: str, 11 | path_c: str, 12 | alpha: float, 13 | weights: Dict[str, float], 14 | method: str, 15 | ): 
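    # Two merge strategies are implemented below:
    #   "weight_sum": merged = (1 - alpha) * A + alpha * B
    #   "add_diff":   merged = A + (B - C) * alpha  (requires a third model C)
    # Per-layer overrides in `weights` take precedence over the global alpha and
    # are matched against each state-dict key by its longest matching prefix.
    # Usage sketch (hypothetical file names, shown for illustration only):
    #   merged = merge("a.pth", "b.pth", None, 0.5, {}, "weight_sum")
    #   torch.save(merged, "merged.pth")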
16 | def extract(ckpt: Dict[str, Any]): 17 | a = ckpt["model"] 18 | opt = OrderedDict() 19 | opt["weight"] = {} 20 | for key in a.keys(): 21 | if "enc_q" in key: 22 | continue 23 | opt["weight"][key] = a[key] 24 | return opt 25 | 26 | def load_weight(path: str): 27 | print(f"Loading {path}...") 28 | state_dict = torch.load(path, map_location="cpu") 29 | if "model" in state_dict: 30 | weight = extract(state_dict) 31 | else: 32 | weight = state_dict["weight"] 33 | return weight, state_dict 34 | 35 | def get_alpha(key: str): 36 | try: 37 | filtered = sorted( 38 | [x for x in weights.keys() if key.startswith(x)], key=len, reverse=True 39 | ) 40 | if len(filtered) < 1: 41 | return alpha 42 | return weights[filtered[0]] 43 | except: 44 | return alpha 45 | 46 | weight_a, state_dict = load_weight(path_a) 47 | weight_b, _ = load_weight(path_b) 48 | if path_c is not None: 49 | weight_c, _ = load_weight(path_c) 50 | 51 | if sorted(list(weight_a.keys())) != sorted(list(weight_b.keys())): 52 | raise RuntimeError("Failed to merge models.") 53 | 54 | merged = OrderedDict() 55 | merged["weight"] = {} 56 | 57 | def merge_weight(a, b, c, alpha): 58 | if method == "weight_sum": 59 | return (1 - alpha) * a + alpha * b 60 | elif method == "add_diff": 61 | return a + (b - c) * alpha 62 | 63 | for key in tqdm.tqdm(weight_a.keys()): 64 | a = get_alpha(key) 65 | if path_c is not None: 66 | merged["weight"][key] = merge_weight( 67 | weight_a[key], weight_b[key], weight_c[key], a 68 | ) 69 | else: 70 | merged["weight"][key] = merge_weight(weight_a[key], weight_b[key], None, a) 71 | merged["config"] = state_dict["config"] 72 | merged["params"] = state_dict["params"] if "params" in state_dict else None 73 | merged["version"] = state_dict.get("version", "v1") 74 | merged["sr"] = state_dict["sr"] 75 | merged["f0"] = state_dict["f0"] 76 | merged["info"] = state_dict["info"] 77 | merged["embedder_name"] = ( 78 | state_dict["embedder_name"] if "embedder_name" in state_dict else None 79 | ) 80 | merged["embedder_output_layer"] = state_dict.get("embedder_output_layer", "12") 81 | return merged 82 | -------------------------------------------------------------------------------- /modules/models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from typing import * 4 | 5 | import torch 6 | from fairseq import checkpoint_utils 7 | from fairseq.models.hubert.hubert import HubertModel 8 | from pydub import AudioSegment 9 | 10 | from lib.rvc.models import (SynthesizerTrnMs256NSFSid, 11 | SynthesizerTrnMs256NSFSidNono) 12 | from lib.rvc.pipeline import VocalConvertPipeline 13 | 14 | from .cmd_opts import opts 15 | from .shared import ROOT_DIR, device, is_half 16 | from .utils import load_audio 17 | 18 | AUDIO_OUT_DIR = opts.output_dir or os.path.join(ROOT_DIR, "outputs") 19 | 20 | 21 | EMBEDDINGS_LIST = { 22 | "hubert-base-japanese": ( 23 | "rinna_hubert_base_jp.pt", 24 | "hubert-base-japanese", 25 | "local", 26 | ), 27 | "contentvec": ("checkpoint_best_legacy_500.pt", "contentvec", "local"), 28 | } 29 | 30 | 31 | def update_state_dict(state_dict): 32 | if "params" in state_dict and state_dict["params"] is not None: 33 | return 34 | keys = [ 35 | "spec_channels", 36 | "segment_size", 37 | "inter_channels", 38 | "hidden_channels", 39 | "filter_channels", 40 | "n_heads", 41 | "n_layers", 42 | "kernel_size", 43 | "p_dropout", 44 | "resblock", 45 | "resblock_kernel_sizes", 46 | "resblock_dilation_sizes", 47 | "upsample_rates", 48 | "upsample_initial_channel", 49 | 
"upsample_kernel_sizes", 50 | "spk_embed_dim", 51 | "gin_channels", 52 | "emb_channels", 53 | "sr", 54 | ] 55 | state_dict["params"] = {} 56 | n = 0 57 | for i, key in enumerate(keys): 58 | i = i - n 59 | if len(state_dict["config"]) != 19 and key == "emb_channels": 60 | # backward compat. 61 | n += 1 62 | continue 63 | state_dict["params"][key] = state_dict["config"][i] 64 | 65 | if not "emb_channels" in state_dict["params"]: 66 | if state_dict.get("version", "v1") == "v1": 67 | state_dict["params"]["emb_channels"] = 256 # for backward compat. 68 | state_dict["embedder_output_layer"] = 9 69 | else: 70 | state_dict["params"]["emb_channels"] = 768 # for backward compat. 71 | state_dict["embedder_output_layer"] = 12 72 | 73 | 74 | class VoiceConvertModel: 75 | def __init__(self, model_name: str, state_dict: Dict[str, Any]) -> None: 76 | update_state_dict(state_dict) 77 | self.model_name = model_name 78 | self.state_dict = state_dict 79 | self.tgt_sr = state_dict["params"]["sr"] 80 | f0 = state_dict.get("f0", 1) 81 | state_dict["params"]["spk_embed_dim"] = state_dict["weight"][ 82 | "emb_g.weight" 83 | ].shape[0] 84 | if not "emb_channels" in state_dict["params"]: 85 | state_dict["params"]["emb_channels"] = 256 # for backward compat. 86 | 87 | if f0 == 1: 88 | self.net_g = SynthesizerTrnMs256NSFSid( 89 | **state_dict["params"], is_half=is_half 90 | ) 91 | else: 92 | self.net_g = SynthesizerTrnMs256NSFSidNono(**state_dict["params"]) 93 | 94 | del self.net_g.enc_q 95 | 96 | self.net_g.load_state_dict(state_dict["weight"], strict=False) 97 | self.net_g.eval().to(device) 98 | 99 | if is_half: 100 | self.net_g = self.net_g.half() 101 | else: 102 | self.net_g = self.net_g.float() 103 | 104 | self.vc = VocalConvertPipeline(self.tgt_sr, device, is_half) 105 | self.n_spk = state_dict["params"]["spk_embed_dim"] 106 | 107 | def single( 108 | self, 109 | sid: int, 110 | input_audio: str, 111 | embedder_model_name: str, 112 | embedding_output_layer: str, 113 | f0_up_key: int, 114 | f0_file: str, 115 | f0_method: str, 116 | auto_load_index: bool, 117 | faiss_index_file: str, 118 | index_rate: float, 119 | output_dir: str = AUDIO_OUT_DIR, 120 | ): 121 | if not input_audio: 122 | raise Exception("You need to set Source Audio") 123 | f0_up_key = int(f0_up_key) 124 | audio = load_audio(input_audio, 16000) 125 | 126 | if embedder_model_name == "auto": 127 | embedder_model_name = ( 128 | self.state_dict["embedder_name"] 129 | if "embedder_name" in self.state_dict 130 | else "hubert_base" 131 | ) 132 | if embedder_model_name.endswith("768"): 133 | embedder_model_name = embedder_model_name[:-3] 134 | 135 | if embedder_model_name == "hubert_base": 136 | embedder_model_name = "contentvec" 137 | 138 | if not embedder_model_name in EMBEDDINGS_LIST.keys(): 139 | raise Exception(f"Not supported embedder: {embedder_model_name}") 140 | 141 | if ( 142 | embedder_model == None 143 | or loaded_embedder_model != EMBEDDINGS_LIST[embedder_model_name][1] 144 | ): 145 | print(f"load {embedder_model_name} embedder") 146 | embedder_filename, embedder_name, load_from = get_embedder( 147 | embedder_model_name 148 | ) 149 | load_embedder(embedder_filename, embedder_name) 150 | 151 | if embedding_output_layer == "auto": 152 | embedding_output_layer = ( 153 | self.state_dict["embedding_output_layer"] 154 | if "embedding_output_layer" in self.state_dict 155 | else 12 156 | ) 157 | else: 158 | embedding_output_layer = int(embedding_output_layer) 159 | 160 | f0 = self.state_dict.get("f0", 1) 161 | 162 | if not faiss_index_file and 
auto_load_index: 163 | faiss_index_file = self.get_index_path(sid) 164 | 165 | audio_opt = self.vc( 166 | embedder_model, 167 | embedding_output_layer, 168 | self.net_g, 169 | sid, 170 | audio, 171 | f0_up_key, 172 | f0_method, 173 | faiss_index_file, 174 | index_rate, 175 | f0, 176 | f0_file=f0_file, 177 | ) 178 | 179 | audio = AudioSegment( 180 | audio_opt, 181 | frame_rate=self.tgt_sr, 182 | sample_width=2, 183 | channels=1, 184 | ) 185 | os.makedirs(output_dir, exist_ok=True) 186 | input_audio_splitext = os.path.splitext(os.path.basename(input_audio))[0] 187 | model_splitext = os.path.splitext(self.model_name)[0] 188 | index = 0 189 | existing_files = os.listdir(output_dir) 190 | for existing_file in existing_files: 191 | result = re.match(r"\d+", existing_file) 192 | if result: 193 | prefix_num = int(result.group(0)) 194 | if index < prefix_num: 195 | index = prefix_num 196 | audio.export( 197 | os.path.join( 198 | output_dir, f"{index+1}-{model_splitext}-{input_audio_splitext}.wav" 199 | ), 200 | format="wav", 201 | ) 202 | return audio_opt 203 | 204 | def get_index_path(self, speaker_id: int): 205 | basename = os.path.splitext(self.model_name)[0] 206 | speaker_index_path = os.path.join( 207 | MODELS_DIR, 208 | "checkpoints", 209 | f"{basename}_index", 210 | f"{basename}.{speaker_id}.index", 211 | ) 212 | if os.path.exists(speaker_index_path): 213 | return speaker_index_path 214 | return os.path.join(MODELS_DIR, "checkpoints", f"{basename}.index") 215 | 216 | 217 | MODELS_DIR = opts.models_dir or os.path.join(ROOT_DIR, "models") 218 | vc_model: Optional[VoiceConvertModel] = None 219 | embedder_model: Optional[HubertModel] = None 220 | loaded_embedder_model = "" 221 | 222 | 223 | def get_models(): 224 | dir = os.path.join(ROOT_DIR, "models", "checkpoints") 225 | os.makedirs(dir, exist_ok=True) 226 | return [ 227 | file 228 | for file in os.listdir(dir) 229 | if any([x for x in [".ckpt", ".pth"] if file.endswith(x)]) 230 | ] 231 | 232 | 233 | def get_embedder(embedder_name): 234 | if embedder_name in EMBEDDINGS_LIST: 235 | return EMBEDDINGS_LIST[embedder_name] 236 | return None 237 | 238 | 239 | def load_embedder(emb_file: str, emb_name: str): 240 | global embedder_model, loaded_embedder_model 241 | emb_file = os.path.join(MODELS_DIR, "embeddings", emb_file) 242 | models, _, _ = checkpoint_utils.load_model_ensemble_and_task( 243 | [emb_file], 244 | suffix="", 245 | ) 246 | embedder_model = models[0] 247 | embedder_model = embedder_model.to(device) 248 | 249 | if is_half: 250 | embedder_model = embedder_model.half() 251 | else: 252 | embedder_model = embedder_model.float() 253 | embedder_model.eval() 254 | 255 | loaded_embedder_model = emb_name 256 | 257 | 258 | def get_vc_model(model_name: str): 259 | model_path = os.path.join(MODELS_DIR, "checkpoints", model_name) 260 | weight = torch.load(model_path, map_location="cpu") 261 | return VoiceConvertModel(model_name, weight) 262 | 263 | 264 | def load_model(model_name: str): 265 | global vc_model 266 | vc_model = get_vc_model(model_name) 267 | -------------------------------------------------------------------------------- /modules/separate.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import * 3 | 4 | import tqdm 5 | from pydub import AudioSegment 6 | from pydub.silence import split_on_silence 7 | 8 | 9 | def separate_audio( 10 | input: str, 11 | output: str, 12 | silence_thresh: int, 13 | min_silence_len: int = 1000, 14 | keep_silence: int = 100, 15 | margin: int = 0, 
16 | padding: bool = False, 17 | min: Optional[int] = None, 18 | max: Optional[int] = None, 19 | ): 20 | if os.path.isfile(input): 21 | input = [input] 22 | elif os.path.isdir(input): 23 | input = [os.path.join(input, f) for f in os.listdir(input)] 24 | else: 25 | raise ValueError("input must be a file or directory") 26 | 27 | os.makedirs(output, exist_ok=True) 28 | 29 | for file in input: 30 | if os.path.splitext(file)[1] == ".mp3": 31 | audio = AudioSegment.from_mp3(file) 32 | elif os.path.splitext(file)[1] == ".wav": 33 | audio = AudioSegment.from_wav(file) 34 | elif os.path.splitext(file)[1] == ".flac": 35 | audio = AudioSegment.from_file(file, "flac") 36 | else: 37 | raise ValueError( 38 | "Invalid file format. Only MP3 and WAV files are supported." 39 | ) 40 | 41 | chunks = split_on_silence( 42 | audio, 43 | min_silence_len=min_silence_len, 44 | silence_thresh=silence_thresh, 45 | keep_silence=keep_silence, 46 | ) 47 | 48 | output_chunks: List[AudioSegment] = [] 49 | 50 | so_short = None 51 | 52 | for chunk in tqdm.tqdm(chunks): 53 | if so_short is not None: 54 | chunk = so_short + chunk 55 | so_short = None 56 | if min is None or len(chunk) > min: 57 | if max is not None and len(chunk) > max: 58 | sub_chunks = [ 59 | chunk[i : i + max + margin] 60 | for i in range(0, len(chunk) - margin, max) 61 | ] 62 | 63 | if len(sub_chunks[-1]) < min: 64 | if padding and len(sub_chunks) > 2: 65 | output_chunks.extend(sub_chunks[0:-2]) 66 | output_chunks.append(sub_chunks[-2] + sub_chunks[-1]) 67 | else: 68 | output_chunks.extend(sub_chunks[0:-1]) 69 | else: 70 | output_chunks.extend(sub_chunks) 71 | else: 72 | output_chunks.append(chunk) 73 | else: 74 | if so_short is None: 75 | so_short = chunk 76 | else: 77 | so_short += chunk 78 | basename = os.path.splitext(os.path.basename(file))[0] 79 | 80 | for i, chunk in enumerate(output_chunks): 81 | filepath = os.path.join(output, f"{basename}_{i}.wav") 82 | chunk.export(filepath, format="wav") 83 | -------------------------------------------------------------------------------- /modules/server/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from typing import * 4 | 5 | import faiss 6 | import numpy as np 7 | import pyworld 8 | import scipy.signal as signal 9 | import torch 10 | import torch.nn.functional as F 11 | import torchaudio 12 | import torchcrepe 13 | from fairseq import checkpoint_utils 14 | from fairseq.models.hubert.hubert import HubertModel 15 | from pydub import AudioSegment 16 | from torch import Tensor 17 | 18 | from lib.rvc.models import (SynthesizerTrnMs256NSFSid, 19 | SynthesizerTrnMs256NSFSidNono) 20 | from lib.rvc.pipeline import VocalConvertPipeline 21 | from modules.cmd_opts import opts 22 | from modules.models import (EMBEDDINGS_LIST, MODELS_DIR, get_embedder, 23 | get_vc_model, update_state_dict) 24 | from modules.shared import ROOT_DIR, device, is_half 25 | 26 | MODELS_DIR = opts.models_dir or os.path.join(ROOT_DIR, "models") 27 | vc_model: Optional["VoiceServerModel"] = None 28 | embedder_model: Optional[HubertModel] = None 29 | loaded_embedder_model = "" 30 | 31 | 32 | class VoiceServerModel: 33 | def __init__(self, rvc_model_file: str, faiss_index_file: str) -> None: 34 | # setting vram 35 | global device, is_half 36 | if isinstance(device, str): 37 | device = torch.device(device) 38 | if device.type == "cuda": 39 | vram = torch.cuda.get_device_properties(device).total_memory / 1024**3 40 | else: 41 | vram = None 42 | if vram is not None and vram <= 
4: 43 | self.x_pad = 1 44 | self.x_query = 5 45 | self.x_center = 30 46 | self.x_max = 32 47 | elif vram is not None and vram <= 5: 48 | self.x_pad = 1 49 | self.x_query = 6 50 | self.x_center = 38 51 | self.x_max = 41 52 | else: 53 | self.x_pad = 3 54 | self.x_query = 10 55 | self.x_center = 60 56 | self.x_max = 65 57 | 58 | # load_model 59 | state_dict = torch.load(rvc_model_file, map_location="cpu") 60 | update_state_dict(state_dict) 61 | self.state_dict = state_dict 62 | self.tgt_sr = state_dict["params"]["sr"] 63 | self.f0 = state_dict.get("f0", 1) 64 | state_dict["params"]["spk_embed_dim"] = state_dict["weight"][ 65 | "emb_g.weight" 66 | ].shape[0] 67 | if not "emb_channels" in state_dict["params"]: 68 | if state_dict.get("version", "v1") == "v1": 69 | state_dict["params"]["emb_channels"] = 256 # for backward compat. 70 | state_dict["embedder_output_layer"] = 9 71 | else: 72 | state_dict["params"]["emb_channels"] = 768 # for backward compat. 73 | state_dict["embedder_output_layer"] = 12 74 | if self.f0 == 1: 75 | self.net_g = SynthesizerTrnMs256NSFSid( 76 | **state_dict["params"], is_half=is_half 77 | ) 78 | else: 79 | self.net_g = SynthesizerTrnMs256NSFSidNono(**state_dict["params"]) 80 | del self.net_g.enc_q 81 | self.net_g.load_state_dict(state_dict["weight"], strict=False) 82 | self.net_g.eval().to(device) 83 | if is_half: 84 | self.net_g = self.net_g.half() 85 | else: 86 | self.net_g = self.net_g.float() 87 | 88 | emb_name = state_dict.get("embedder_name", "contentvec") 89 | if emb_name == "hubert_base": 90 | emb_name = "contentvec" 91 | emb_file = os.path.join(MODELS_DIR, "embeddings", EMBEDDINGS_LIST[emb_name][0]) 92 | models, _, _ = checkpoint_utils.load_model_ensemble_and_task( 93 | [emb_file], 94 | suffix="", 95 | ) 96 | embedder_model = models[0] 97 | embedder_model = embedder_model.to(device) 98 | 99 | if is_half: 100 | embedder_model = embedder_model.half() 101 | else: 102 | embedder_model = embedder_model.float() 103 | embedder_model.eval() 104 | self.embedder_model = embedder_model 105 | 106 | self.embedder_output_layer = state_dict["embedder_output_layer"] 107 | 108 | self.index = None 109 | if faiss_index_file != "" and os.path.exists(faiss_index_file): 110 | self.index = faiss.read_index(faiss_index_file) 111 | self.big_npy = self.index.reconstruct_n(0, self.index.ntotal) 112 | 113 | self.n_spk = state_dict["params"]["spk_embed_dim"] 114 | 115 | self.sr = 16000 # hubert input sample rate 116 | self.window = 160 # hubert input window 117 | self.t_pad = self.sr * self.x_pad # padding time for each utterance 118 | self.t_pad_tgt = self.tgt_sr * self.x_pad 119 | self.t_pad2 = self.t_pad * 2 120 | self.t_query = self.sr * self.x_query # query time before and after query point 121 | self.t_center = self.sr * self.x_center # query cut point position 122 | self.t_max = self.sr * self.x_max # max time for no query 123 | self.device = device 124 | self.is_half = is_half 125 | 126 | def __call__( 127 | self, 128 | audio: np.ndarray, 129 | sr: int, 130 | sid: int, 131 | transpose: int, 132 | f0_method: str, 133 | index_rate: float, 134 | ): 135 | # bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) 136 | # audio = signal.filtfilt(bh, ah, audio) 137 | if sr != self.sr: 138 | audio = torchaudio.functional.resample(torch.from_numpy(audio), sr, self.sr, rolloff=0.99).detach().cpu().numpy() 139 | audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect" if audio.shape[0] > self.window // 2 else "constant") 140 | 141 | opt_ts = [] 142 | if 
audio_pad.shape[0] > self.t_max: 143 | audio_sum = np.zeros_like(audio) 144 | for i in range(self.window): 145 | audio_sum += audio_pad[i : i - self.window] 146 | for t in range(self.t_center, audio.shape[0], self.t_center): 147 | opt_ts.append( 148 | t 149 | - self.t_query 150 | + np.where( 151 | np.abs(audio_sum[t - self.t_query : t + self.t_query]) 152 | == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min() 153 | )[0][0] 154 | ) 155 | audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect" if audio.shape[0] > self.t_pad else "constant") 156 | p_len = audio_pad.shape[0] // self.window 157 | 158 | sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() 159 | pitch, pitchf = None, None 160 | if self.f0 == 1: 161 | pitch, pitchf = get_f0(audio_pad, self.sr, p_len, transpose, f0_method) 162 | pitch = pitch[:p_len] 163 | pitchf = pitchf[:p_len] 164 | if self.device.type == "mps": 165 | pitchf = pitchf.astype(np.float32) 166 | pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() 167 | pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() 168 | 169 | audio_opt = [] 170 | 171 | s = 0 172 | t = None 173 | 174 | for t in opt_ts: 175 | t = t // self.window * self.window 176 | if self.f0 == 1: 177 | audio_opt.append( 178 | self._convert( 179 | sid, 180 | audio_pad[s : t + self.t_pad2 + self.window], 181 | pitch[:, s // self.window : (t + self.t_pad2) // self.window], 182 | pitchf[:, s // self.window : (t + self.t_pad2) // self.window], 183 | index_rate, 184 | )[self.t_pad_tgt : -self.t_pad_tgt] 185 | ) 186 | else: 187 | audio_opt.append( 188 | self._convert( 189 | sid, 190 | audio_pad[s : t + self.t_pad2 + self.window], 191 | None, 192 | None, 193 | index_rate, 194 | )[self.t_pad_tgt : -self.t_pad_tgt] 195 | ) 196 | s = t 197 | if self.f0 == 1: 198 | audio_opt.append( 199 | self._convert( 200 | sid, 201 | audio_pad[t:], 202 | pitch[:, t // self.window :] if t is not None else pitch, 203 | pitchf[:, t // self.window :] if t is not None else pitchf, 204 | index_rate, 205 | )[self.t_pad_tgt : -self.t_pad_tgt] 206 | ) 207 | else: 208 | audio_opt.append( 209 | self._convert( 210 | sid, 211 | audio_pad[t:], 212 | None, 213 | None, 214 | index_rate, 215 | )[self.t_pad_tgt : -self.t_pad_tgt] 216 | ) 217 | audio_opt = np.concatenate(audio_opt) 218 | del pitch, pitchf, sid 219 | if torch.cuda.is_available(): 220 | torch.cuda.empty_cache() 221 | return audio_opt 222 | 223 | 224 | def _convert( 225 | self, 226 | sid: int, 227 | audio: np.ndarray, 228 | pitch: Optional[np.ndarray], 229 | pitchf: Optional[np.ndarray], 230 | index_rate: float, 231 | ): 232 | feats = torch.from_numpy(audio) 233 | if self.is_half: 234 | feats = feats.half() 235 | else: 236 | feats = feats.float() 237 | if feats.dim() == 2: # double channels 238 | feats = feats.mean(-1) 239 | assert feats.dim() == 1, feats.dim() 240 | feats = feats.view(1, -1) 241 | padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) 242 | 243 | half_support = ( 244 | self.device.type == "cuda" 245 | and torch.cuda.get_device_capability(self.device)[0] >= 5.3 246 | ) 247 | is_feats_dim_768 = self.net_g.emb_channels == 768 248 | 249 | if isinstance(self.embedder_model, tuple): 250 | feats = self.embedder_model[0]( 251 | feats.squeeze(0).squeeze(0).to(self.device), 252 | return_tensors="pt", 253 | sampling_rate=16000, 254 | ) 255 | if self.is_half: 256 | feats = feats.input_values.to(self.device).half() 257 | else: 258 | feats = feats.input_values.to(self.device) 259 | with 
torch.no_grad(): 260 | if is_feats_dim_768: 261 | feats = self.embedder_model[1](feats).last_hidden_state 262 | else: 263 | feats = self.embedder_model[1](feats).extract_features 264 | else: 265 | inputs = { 266 | "source": feats.half().to(self.device) 267 | if half_support 268 | else feats.to(self.device), 269 | "padding_mask": padding_mask.to(self.device), 270 | "output_layer": self.embedder_output_layer, 271 | } 272 | 273 | if not half_support: 274 | self.embedder_model = self.embedder_model.float() 275 | inputs["source"] = inputs["source"].float() 276 | 277 | with torch.no_grad(): 278 | logits = self.embedder_model.extract_features(**inputs) 279 | if is_feats_dim_768: 280 | feats = logits[0] 281 | else: 282 | feats = self.embedder_model.final_proj(logits[0]) 283 | 284 | if ( 285 | isinstance(self.index, type(None)) == False 286 | and isinstance(self.big_npy, type(None)) == False 287 | and index_rate != 0 288 | ): 289 | npy = feats[0].cpu().numpy() 290 | if self.is_half: 291 | npy = npy.astype("float32") 292 | 293 | _, ix = self.index.search(npy, k=1) 294 | npy = self.big_npy[ix[:, 0]] 295 | 296 | if self.is_half: 297 | npy = npy.astype("float16") 298 | feats = ( 299 | torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate 300 | + (1 - index_rate) * feats 301 | ) 302 | 303 | feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) 304 | 305 | p_len = audio.shape[0] // self.window 306 | if feats.shape[1] < p_len: 307 | p_len = feats.shape[1] 308 | if pitch != None and pitchf != None: 309 | pitch = pitch[:, :p_len] 310 | pitchf = pitchf[:, :p_len] 311 | p_len = torch.tensor([p_len], device=self.device).long() 312 | with torch.no_grad(): 313 | if pitch != None and pitchf != None: 314 | audio1 = ( 315 | (self.net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768) 316 | .data.cpu() 317 | .float() 318 | .numpy() 319 | .astype(np.int16) 320 | ) 321 | else: 322 | audio1 = ( 323 | (self.net_g.infer(feats, p_len, sid)[0][0, 0] * 32768) 324 | .data.cpu() 325 | .float() 326 | .numpy() 327 | .astype(np.int16) 328 | ) 329 | del feats, p_len, padding_mask 330 | if torch.cuda.is_available(): 331 | torch.cuda.empty_cache() 332 | return audio1 333 | 334 | 335 | # F0 computation 336 | def get_f0_crepe_computation( 337 | x, 338 | sr, 339 | f0_min, 340 | f0_max, 341 | p_len, 342 | model="full", # Either use crepe-tiny "tiny" or crepe "full". Default is full 343 | ): 344 | hop_length = sr // 100 345 | x = x.astype(np.float32) # fixes the F.conv2D exception. We needed to convert double to float. 
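Note on the index-retrieval branch in _convert() above: it looks up each extracted feature frame's nearest neighbour in the Faiss index (k=1) and then linearly blends the retrieved vector with the original frame using index_rate. A self-contained sketch of that blending with random vectors standing in for real embedder features — the dimensions and values are illustrative assumptions, not taken from the original file:

    import faiss
    import numpy as np

    emb_channels = 256                                                # feature dimension (768 for v2-style models)
    big_npy = np.random.rand(1000, emb_channels).astype("float32")    # vectors stored in the index
    index = faiss.IndexFlatL2(emb_channels)
    index.add(big_npy)

    feats = np.random.rand(50, emb_channels).astype("float32")        # frames from the embedder
    index_rate = 0.75

    _, ix = index.search(feats, 1)                                    # nearest neighbour per frame
    retrieved = big_npy[ix[:, 0]]
    blended = retrieved * index_rate + (1 - index_rate) * feats       # same blend as in _convert()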
346 | x /= np.quantile(np.abs(x), 0.999) 347 | torch_device = self.get_optimal_torch_device() 348 | audio = torch.from_numpy(x).to(torch_device, copy=True) 349 | audio = torch.unsqueeze(audio, dim=0) 350 | if audio.ndim == 2 and audio.shape[0] > 1: 351 | audio = torch.mean(audio, dim=0, keepdim=True).detach() 352 | audio = audio.detach() 353 | print("Initiating prediction with a crepe_hop_length of: " + str(hop_length)) 354 | pitch: Tensor = torchcrepe.predict( 355 | audio, 356 | sr, 357 | sr // 100, 358 | f0_min, 359 | f0_max, 360 | model, 361 | batch_size=hop_length * 2, 362 | device=torch_device, 363 | pad=True 364 | ) 365 | p_len = p_len or x.shape[0] // hop_length 366 | # Resize the pitch for final f0 367 | source = np.array(pitch.squeeze(0).cpu().float().numpy()) 368 | source[source < 0.001] = np.nan 369 | target = np.interp( 370 | np.arange(0, len(source) * p_len, len(source)) / p_len, 371 | np.arange(0, len(source)), 372 | source 373 | ) 374 | f0 = np.nan_to_num(target) 375 | return f0 # Resized f0 376 | 377 | def get_f0_official_crepe_computation( 378 | x, 379 | sr, 380 | f0_min, 381 | f0_max, 382 | model="full", 383 | ): 384 | # Pick a batch size that doesn't cause memory errors on your gpu 385 | batch_size = 512 386 | # Compute pitch using first gpu 387 | audio = torch.tensor(np.copy(x))[None].float() 388 | f0, pd = torchcrepe.predict( 389 | audio, 390 | sr, 391 | sr // 100, 392 | f0_min, 393 | f0_max, 394 | model, 395 | batch_size=batch_size, 396 | device=device, 397 | return_periodicity=True, 398 | ) 399 | pd = torchcrepe.filter.median(pd, 3) 400 | f0 = torchcrepe.filter.mean(f0, 3) 401 | f0[pd < 0.1] = 0 402 | f0 = f0[0].cpu().numpy() 403 | return f0 404 | 405 | def get_f0( 406 | x: np.ndarray, 407 | sr: int, 408 | p_len: int, 409 | f0_up_key: int, 410 | f0_method: str, 411 | ): 412 | f0_min = 50 413 | f0_max = 1100 414 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 415 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 416 | 417 | if f0_method == "harvest": 418 | f0, t = pyworld.harvest( 419 | x.astype(np.double), 420 | fs=sr, 421 | f0_ceil=f0_max, 422 | f0_floor=f0_min, 423 | frame_period=10, 424 | ) 425 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, sr) 426 | f0 = signal.medfilt(f0, 3) 427 | elif f0_method == "dio": 428 | f0, t = pyworld.dio( 429 | x.astype(np.double), 430 | fs=sr, 431 | f0_ceil=f0_max, 432 | f0_floor=f0_min, 433 | frame_period=10, 434 | ) 435 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, sr) 436 | f0 = signal.medfilt(f0, 3) 437 | elif f0_method == "mangio-crepe": 438 | f0 = get_f0_crepe_computation(x, sr, f0_min, f0_max, p_len, "full") 439 | elif f0_method == "crepe": 440 | f0 = get_f0_official_crepe_computation(x, sr, f0_min, f0_max, "full") 441 | 442 | f0 *= pow(2, f0_up_key / 12) 443 | f0bak = f0.copy() 444 | f0_mel = 1127 * np.log(1 + f0 / 700) 445 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( 446 | f0_mel_max - f0_mel_min 447 | ) + 1 448 | f0_mel[f0_mel <= 1] = 1 449 | f0_mel[f0_mel > 255] = 255 450 | f0_coarse = np.rint(f0_mel).astype(np.int32) 451 | return f0_coarse, f0bak # 1-0 -------------------------------------------------------------------------------- /modules/shared.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import torch 5 | 6 | from modules.cmd_opts import opts 7 | 8 | ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 9 | MODELS_DIR = os.path.join(ROOT_DIR, "models") 10 | 11 | 12 | def has_mps(): 13 | if 
sys.platform != "darwin": 14 | return False 15 | else: 16 | if not getattr(torch, "has_mps", False): 17 | return False 18 | try: 19 | torch.zeros(1).to(torch.device("mps")) 20 | return True 21 | except Exception: 22 | return False 23 | 24 | 25 | is_half = opts.precision == "fp16" 26 | half_support = ( 27 | torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 5.3 28 | ) 29 | 30 | if not half_support: 31 | print("WARNING: FP16 is not supported on this GPU") 32 | is_half = False 33 | 34 | device = "cuda:0" 35 | 36 | if not torch.cuda.is_available(): 37 | if has_mps(): 38 | print("Using MPS") 39 | device = "mps" 40 | else: 41 | print("Using CPU") 42 | device = "cpu" 43 | 44 | device = torch.device(device) 45 | -------------------------------------------------------------------------------- /modules/tabs/inference.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import traceback 4 | 5 | import gradio as gr 6 | 7 | from modules import models, ui 8 | from modules.ui import Tab 9 | 10 | 11 | def inference_options_ui(show_out_dir=True): 12 | with gr.Row(equal_height=False): 13 | with gr.Column(): 14 | source_audio = gr.Textbox(label="Source Audio") 15 | out_dir = gr.Textbox( 16 | label="Out folder", 17 | visible=show_out_dir, 18 | placeholder=models.AUDIO_OUT_DIR, 19 | ) 20 | with gr.Column(): 21 | transpose = gr.Slider( 22 | minimum=-20, maximum=20, value=0, step=1, label="Transpose" 23 | ) 24 | pitch_extraction_algo = gr.Radio( 25 | choices=["dio", "harvest", "mangio-crepe", "crepe"], 26 | value="crepe", 27 | label="Pitch Extraction Algorithm", 28 | ) 29 | embedding_model = gr.Radio( 30 | choices=["auto", *models.EMBEDDINGS_LIST.keys()], 31 | value="auto", 32 | label="Embedder Model", 33 | ) 34 | embedding_output_layer = gr.Radio( 35 | choices=["auto", "9", "12"], 36 | value="auto", 37 | label="Embedder Output Layer", 38 | ) 39 | with gr.Column(): 40 | auto_load_index = gr.Checkbox(value=False, label="Auto Load Index") 41 | faiss_index_file = gr.Textbox(value="", label="Faiss Index File Path") 42 | retrieval_feature_ratio = gr.Slider( 43 | minimum=0, 44 | maximum=1, 45 | value=1, 46 | step=0.01, 47 | label="Retrieval Feature Ratio", 48 | ) 49 | with gr.Column(): 50 | fo_curve_file = gr.File(label="F0 Curve File") 51 | 52 | return ( 53 | source_audio, 54 | out_dir, 55 | transpose, 56 | embedding_model, 57 | embedding_output_layer, 58 | pitch_extraction_algo, 59 | auto_load_index, 60 | faiss_index_file, 61 | retrieval_feature_ratio, 62 | fo_curve_file, 63 | ) 64 | 65 | 66 | class Inference(Tab): 67 | def title(self): 68 | return "Inference" 69 | 70 | def sort(self): 71 | return 1 72 | 73 | def ui(self, outlet): 74 | def infer( 75 | sid, 76 | input_audio, 77 | out_dir, 78 | embedder_model, 79 | embedding_output_layer, 80 | f0_up_key, 81 | f0_file, 82 | f0_method, 83 | auto_load_index, 84 | faiss_index_file, 85 | index_rate, 86 | ): 87 | model = models.vc_model 88 | try: 89 | yield "Infering...", None 90 | if out_dir == "": 91 | out_dir = models.AUDIO_OUT_DIR 92 | 93 | if "*" in input_audio: 94 | assert ( 95 | out_dir is not None 96 | ), "Out folder is required for batch processing" 97 | files = glob.glob(input_audio, recursive=True) 98 | elif os.path.isdir(input_audio): 99 | assert ( 100 | out_dir is not None 101 | ), "Out folder is required for batch processing" 102 | files = glob.glob( 103 | os.path.join(input_audio, "**", "*.wav"), recursive=True 104 | ) 105 | else: 106 | files = [input_audio] 107 | for file 
in files: 108 | audio = model.single( 109 | sid, 110 | file, 111 | embedder_model, 112 | embedding_output_layer, 113 | f0_up_key, 114 | f0_file, 115 | f0_method, 116 | auto_load_index, 117 | faiss_index_file, 118 | index_rate, 119 | output_dir=out_dir, 120 | ) 121 | yield "Success", (model.tgt_sr, audio) if len(files) == 1 else None 122 | except: 123 | yield "Error: " + traceback.format_exc(), None 124 | 125 | with gr.Group(): 126 | with gr.Box(): 127 | with gr.Column(): 128 | _, speaker_id = ui.create_model_list_ui() 129 | 130 | ( 131 | source_audio, 132 | out_dir, 133 | transpose, 134 | embedder_model, 135 | embedding_output_layer, 136 | pitch_extraction_algo, 137 | auto_load_index, 138 | faiss_index_file, 139 | retrieval_feature_ratio, 140 | f0_curve_file, 141 | ) = inference_options_ui() 142 | 143 | with gr.Row(equal_height=False): 144 | with gr.Column(): 145 | status = gr.Textbox(value="", label="Status") 146 | output = gr.Audio(label="Output", interactive=False) 147 | 148 | with gr.Row(): 149 | infer_button = gr.Button("Infer", variant="primary") 150 | 151 | infer_button.click( 152 | infer, 153 | inputs=[ 154 | speaker_id, 155 | source_audio, 156 | out_dir, 157 | embedder_model, 158 | embedding_output_layer, 159 | transpose, 160 | f0_curve_file, 161 | pitch_extraction_algo, 162 | auto_load_index, 163 | faiss_index_file, 164 | retrieval_feature_ratio, 165 | ], 166 | outputs=[status, output], 167 | queue=True, 168 | ) 169 | -------------------------------------------------------------------------------- /modules/tabs/merge.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import * 4 | 5 | import gradio as gr 6 | import torch 7 | 8 | from modules import models 9 | from modules.merge import merge 10 | from modules.tabs.inference import inference_options_ui 11 | from modules.ui import Tab 12 | 13 | MERGE_METHODS = { 14 | "weight_sum": "Weight sum:A*(1-alpha)+B*alpha", 15 | "add_diff": "Add difference:A+(B-C)*alpha", 16 | } 17 | 18 | 19 | class Merge(Tab): 20 | def title(self): 21 | return "Merge" 22 | 23 | def sort(self): 24 | return 3 25 | 26 | def ui(self, outlet): 27 | def merge_ckpt(model_a, model_b, model_c, weight_text, alpha, each_key, method): 28 | model_a = model_a if type(model_a) != list and model_a != "" else None 29 | model_b = model_b if type(model_b) != list and model_b != "" else None 30 | model_c = model_c if type(model_c) != list and model_c != "" else None 31 | 32 | if each_key: 33 | weights = json.loads(weight_text) 34 | else: 35 | weights = {} 36 | 37 | method = [k for k, v in MERGE_METHODS.items() if v == method][0] 38 | return merge( 39 | os.path.join(models.MODELS_DIR, "checkpoints", model_a), 40 | os.path.join(models.MODELS_DIR, "checkpoints", model_b), 41 | os.path.join(models.MODELS_DIR, "checkpoints", model_c) 42 | if model_c 43 | else None, 44 | alpha, 45 | weights, 46 | method, 47 | ) 48 | 49 | def merge_and_save( 50 | model_a, model_b, model_c, alpha, each_key, weight_text, method, out_name 51 | ): 52 | print(each_key) 53 | out_path = os.path.join(models.MODELS_DIR, "checkpoints", out_name) 54 | if os.path.exists(out_path): 55 | return "Model name already exists." 
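Note: the merge-and-save flow performed by the surrounding merge_and_save() can also be reproduced outside the UI. A short sketch with placeholder checkpoint names — only the call shape of modules.merge.merge() comes from this repository, the file names are hypothetical:

    import os

    import torch

    from modules import models
    from modules.merge import merge

    merged = merge(
        os.path.join(models.MODELS_DIR, "checkpoints", "voice_a.pth"),  # model A (placeholder name)
        os.path.join(models.MODELS_DIR, "checkpoints", "voice_b.pth"),  # model B (placeholder name)
        None,            # model C is only needed for the "add_diff" method
        0.5,             # alpha
        {},              # no per-key weights -> alpha is applied to every key
        "weight_sum",
    )
    torch.save(merged, os.path.join(models.MODELS_DIR, "checkpoints", "voice_ab.pth"))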
56 | merged = merge_ckpt( 57 | model_a, model_b, model_c, weight_text, alpha, each_key, method 58 | ) 59 | if not out_name.endswith(".pth"): 60 | out_name += ".pth" 61 | torch.save(merged, os.path.join(models.MODELS_DIR, "checkpoints", out_name)) 62 | return "Success" 63 | 64 | def merge_and_gen( 65 | model_a, 66 | model_b, 67 | model_c, 68 | alpha, 69 | each_key, 70 | weight_text, 71 | method, 72 | speaker_id, 73 | source_audio, 74 | embedder_name, 75 | embedding_output_layer, 76 | transpose, 77 | fo_curve_file, 78 | pitch_extraction_algo, 79 | auto_load_index, 80 | faiss_index_file, 81 | retrieval_feature_ratio, 82 | ): 83 | merged = merge_ckpt( 84 | model_a, model_b, model_c, weight_text, alpha, each_key, method 85 | ) 86 | model = models.VoiceConvertModel("merge", merged) 87 | audio = model.single( 88 | speaker_id, 89 | source_audio, 90 | embedder_name, 91 | embedding_output_layer, 92 | transpose, 93 | fo_curve_file, 94 | pitch_extraction_algo, 95 | auto_load_index, 96 | faiss_index_file, 97 | retrieval_feature_ratio, 98 | ) 99 | tgt_sr = model.tgt_sr 100 | del merged 101 | del model 102 | torch.cuda.empty_cache() 103 | return "Success", (tgt_sr, audio) 104 | 105 | def reload_model(): 106 | model_list = models.get_models() 107 | return ( 108 | gr.Dropdown.update(choices=model_list), 109 | gr.Dropdown.update(choices=model_list), 110 | gr.Dropdown.update(choices=model_list), 111 | ) 112 | 113 | def update_speaker_ids(model): 114 | if model == "": 115 | return gr.Slider.update( 116 | maximum=0, 117 | visible=False, 118 | ) 119 | model = torch.load( 120 | os.path.join(models.MODELS_DIR, "checkpoints", model), 121 | map_location="cpu", 122 | ) 123 | vc_model = models.VoiceConvertModel("merge", model) 124 | max = vc_model.n_spk 125 | del model 126 | del vc_model 127 | return gr.Slider.update( 128 | maximum=max, 129 | visible=True, 130 | ) 131 | 132 | with gr.Group(): 133 | with gr.Column(): 134 | with gr.Row(equal_height=False): 135 | model_a = gr.Dropdown(choices=models.get_models(), label="Model A") 136 | model_b = gr.Dropdown(choices=models.get_models(), label="Model B") 137 | model_c = gr.Dropdown(choices=models.get_models(), label="Model C") 138 | reload_model_button = gr.Button("♻️") 139 | reload_model_button.click( 140 | reload_model, outputs=[model_a, model_b, model_c] 141 | ) 142 | with gr.Row(equal_height=False): 143 | method = gr.Radio( 144 | label="Merge method", 145 | choices=list(MERGE_METHODS.values()), 146 | value="Weight sum:A*(1-alpha)+B*alpha", 147 | ) 148 | output_name = gr.Textbox(label="Output name") 149 | each_key = gr.Checkbox(label="Each key merge") 150 | with gr.Row(equal_height=False): 151 | base_alpha = gr.Slider( 152 | label="Base alpha", minimum=0, maximum=1, value=0.5, step=0.01 153 | ) 154 | 155 | default_weights = {} 156 | weights = {} 157 | 158 | def create_weight_ui(name: str, *keys_list: List[List[str]]): 159 | with gr.Accordion(label=name, open=False): 160 | with gr.Row(equal_height=False): 161 | for keys in keys_list: 162 | with gr.Column(): 163 | for key in keys: 164 | default_weights[key] = 0.5 165 | weights[key] = gr.Slider( 166 | label=key, 167 | minimum=0, 168 | maximum=1, 169 | step=0.01, 170 | value=0.5, 171 | ) 172 | 173 | with gr.Box(visible=False) as each_key_ui: 174 | with gr.Column(): 175 | create_weight_ui( 176 | "enc_p", 177 | [ 178 | "enc_p.encoder.attn_layers.0", 179 | "enc_p.encoder.attn_layers.1", 180 | "enc_p.encoder.attn_layers.2", 181 | "enc_p.encoder.attn_layers.3", 182 | "enc_p.encoder.attn_layers.4", 183 | 
"enc_p.encoder.attn_layers.5", 184 | "enc_p.encoder.norm_layers_1.0", 185 | "enc_p.encoder.norm_layers_1.1", 186 | "enc_p.encoder.norm_layers_1.2", 187 | "enc_p.encoder.norm_layers_1.3", 188 | "enc_p.encoder.norm_layers_1.4", 189 | "enc_p.encoder.norm_layers_1.5", 190 | ], 191 | [ 192 | "enc_p.encoder.ffn_layers.0", 193 | "enc_p.encoder.ffn_layers.1", 194 | "enc_p.encoder.ffn_layers.2", 195 | "enc_p.encoder.ffn_layers.3", 196 | "enc_p.encoder.ffn_layers.4", 197 | "enc_p.encoder.ffn_layers.5", 198 | "enc_p.encoder.norm_layers_2.0", 199 | "enc_p.encoder.norm_layers_2.1", 200 | "enc_p.encoder.norm_layers_2.2", 201 | "enc_p.encoder.norm_layers_2.3", 202 | "enc_p.encoder.norm_layers_2.4", 203 | "enc_p.encoder.norm_layers_2.5", 204 | ], 205 | [ 206 | "enc_p.emb_phone", 207 | "enc_p.emb_pitch", 208 | ], 209 | ) 210 | 211 | create_weight_ui( 212 | "dec", 213 | [ 214 | "dec.noise_convs.0", 215 | "dec.noise_convs.1", 216 | "dec.noise_convs.2", 217 | "dec.noise_convs.3", 218 | "dec.noise_convs.4", 219 | "dec.noise_convs.5", 220 | "dec.ups.0", 221 | "dec.ups.1", 222 | "dec.ups.2", 223 | "dec.ups.3", 224 | ], 225 | [ 226 | "dec.resblocks.0", 227 | "dec.resblocks.1", 228 | "dec.resblocks.2", 229 | "dec.resblocks.3", 230 | "dec.resblocks.4", 231 | "dec.resblocks.5", 232 | "dec.resblocks.6", 233 | "dec.resblocks.7", 234 | "dec.resblocks.8", 235 | "dec.resblocks.9", 236 | "dec.resblocks.10", 237 | "dec.resblocks.11", 238 | ], 239 | [ 240 | "dec.m_source.l_linear", 241 | "dec.conv_pre", 242 | "dec.conv_post", 243 | "dec.cond", 244 | ], 245 | ) 246 | 247 | create_weight_ui( 248 | "flow", 249 | [ 250 | "flow.flows.0", 251 | "flow.flows.1", 252 | "flow.flows.2", 253 | "flow.flows.3", 254 | "flow.flows.4", 255 | "flow.flows.5", 256 | "flow.flows.6", 257 | "emb_g.weight", 258 | ], 259 | ) 260 | 261 | with gr.Accordion(label="JSON", open=False): 262 | weights_text = gr.TextArea( 263 | value=json.dumps(default_weights), 264 | ) 265 | 266 | with gr.Accordion(label="Inference options", open=False): 267 | with gr.Row(equal_height=False): 268 | speaker_id = gr.Slider( 269 | minimum=0, 270 | maximum=2333, 271 | step=1, 272 | label="Speaker ID", 273 | value=0, 274 | visible=True, 275 | interactive=True, 276 | ) 277 | ( 278 | source_audio, 279 | _, 280 | transpose, 281 | embedder_name, 282 | embedding_output_layer, 283 | pitch_extraction_algo, 284 | auto_load_index, 285 | faiss_index_file, 286 | retrieval_feature_ratio, 287 | fo_curve_file, 288 | ) = inference_options_ui(show_out_dir=False) 289 | 290 | with gr.Row(equal_height=False): 291 | with gr.Column(): 292 | status = gr.Textbox(value="", label="Status") 293 | audio_output = gr.Audio(label="Output", interactive=False) 294 | 295 | with gr.Row(equal_height=False): 296 | merge_and_save_button = gr.Button( 297 | "Merge and save", variant="primary" 298 | ) 299 | merge_and_gen_button = gr.Button("Merge and gen", variant="primary") 300 | 301 | def each_key_on_change(each_key): 302 | return gr.update(visible=each_key) 303 | 304 | each_key.change( 305 | fn=each_key_on_change, 306 | inputs=[each_key], 307 | outputs=[each_key_ui], 308 | ) 309 | 310 | def update_weights_text(data): 311 | d = {} 312 | for key in weights.keys(): 313 | d[key] = data[weights[key]] 314 | return json.dumps(d) 315 | 316 | for w in weights.values(): 317 | w.change( 318 | fn=update_weights_text, 319 | inputs={*weights.values()}, 320 | outputs=[weights_text], 321 | ) 322 | 323 | merge_data = [ 324 | model_a, 325 | model_b, 326 | model_c, 327 | base_alpha, 328 | each_key, 329 | weights_text, 330 | method, 
331 | ] 332 | 333 | inference_opts = [ 334 | speaker_id, 335 | source_audio, 336 | embedder_name, 337 | embedding_output_layer, 338 | transpose, 339 | fo_curve_file, 340 | pitch_extraction_algo, 341 | auto_load_index, 342 | faiss_index_file, 343 | retrieval_feature_ratio, 344 | ] 345 | 346 | merge_and_save_button.click( 347 | fn=merge_and_save, 348 | inputs=[ 349 | *merge_data, 350 | output_name, 351 | ], 352 | outputs=[status], 353 | ) 354 | merge_and_gen_button.click( 355 | fn=merge_and_gen, 356 | inputs=[ 357 | *merge_data, 358 | *inference_opts, 359 | ], 360 | outputs=[status, audio_output], 361 | ) 362 | 363 | model_a.change( 364 | update_speaker_ids, inputs=[model_a], outputs=[speaker_id] 365 | ) 366 | -------------------------------------------------------------------------------- /modules/tabs/server.py: -------------------------------------------------------------------------------- 1 | import io 2 | import json 3 | 4 | import gradio as gr 5 | import requests 6 | import soundfile as sf 7 | import torch.multiprocessing as multiprocessing 8 | from scipy.io.wavfile import write 9 | 10 | from modules.ui import Tab 11 | from server import app 12 | 13 | proc = None 14 | 15 | def server_options_ui(show_out_dir=True): 16 | with gr.Row().style(equal_height=False): 17 | with gr.Row(): 18 | host = gr.Textbox(value="127.0.0.1", label="host") 19 | port = gr.Textbox(value="5001", label="port") 20 | with gr.Row().style(equal_height=False): 21 | with gr.Row(): 22 | rvc_model_file = gr.Textbox(value="", label="RVC model file path") 23 | faiss_index_file = gr.Textbox(value="", label="Faiss index file path") 24 | with gr.Row().style(equal_height=False): 25 | with gr.Row(): 26 | input_voice_file = gr.Textbox(value="", label="input voice file path") 27 | speaker_id = gr.Number( 28 | value=0, 29 | label="speaker_id", 30 | ) 31 | transpose = gr.Slider( 32 | minimum=-20, maximum=20, value=0, step=1, label="transpose" 33 | ) 34 | pitch_extraction_algo = gr.Radio( 35 | choices=["dio", "harvest", "mangio-crepe", "crepe"], 36 | value="crepe", 37 | label="pitch_extraction_algo", 38 | ) 39 | retrieval_feature_ratio = gr.Slider( 40 | minimum=0, 41 | maximum=1, 42 | value=1, 43 | step=0.01, 44 | label="retrieval_feature_ratio", 45 | ) 46 | return ( 47 | host, 48 | port, 49 | rvc_model_file, 50 | faiss_index_file, 51 | input_voice_file, 52 | speaker_id, 53 | transpose, 54 | pitch_extraction_algo, 55 | retrieval_feature_ratio, 56 | ) 57 | 58 | def run(**kwargs): 59 | app.run(**kwargs) 60 | 61 | class Server(Tab): 62 | def title(self): 63 | return "Server(experimental)" 64 | 65 | def sort(self): 66 | return 6 67 | 68 | def ui(self, outlet): 69 | def start(host, port): 70 | if multiprocessing.get_start_method() == 'fork': 71 | multiprocessing.set_start_method('spawn', force=True) 72 | proc = multiprocessing.Process(target = run, kwargs = {'host': host, 'port': port}) 73 | proc.start() 74 | yield "start server" 75 | 76 | def upload(host, port, rvc_model_file, faiss_index_file): 77 | file_names = {"rvc_model_file": rvc_model_file, "faiss_index_file": faiss_index_file} 78 | res = requests.post(f"http://{host}:{port}/upload_model", json=file_names) 79 | yield res.text 80 | 81 | def convert(host, port, input_voice_file, speaker_id, transpose, pitch_extraction_algo, retrieval_feature_ratio): 82 | params = { 83 | "speaker_id": speaker_id, 84 | "transpose": transpose, 85 | "pitch_extraction_algo": pitch_extraction_algo, 86 | "retrieval_feature_ratio": retrieval_feature_ratio 87 | } 88 | 89 | audio, sr = 
sf.read(input_voice_file) 90 | audio_buffer = io.BytesIO() 91 | write(audio_buffer, rate=sr, data=audio) 92 | json_buffer = io.BytesIO(json.dumps(params).encode('utf-8')) 93 | files = { 94 | "input_wav": audio_buffer, 95 | "params": json_buffer 96 | } 97 | res = requests.post(f"http://{host}:{port}/convert_sound", files=files) 98 | audio, sr = sf.read(io.BytesIO(res.content)) 99 | yield "convert succeed", (sr, audio) 100 | 101 | with gr.Group(): 102 | with gr.Box(): 103 | with gr.Column(): 104 | ( 105 | host, 106 | port, 107 | rvc_model_file, 108 | faiss_index_file, 109 | input_voice_file, 110 | speaker_id, 111 | transpose, 112 | pitch_extraction_algo, 113 | retrieval_feature_ratio, 114 | ) = server_options_ui() 115 | 116 | with gr.Row().style(equal_height=False): 117 | with gr.Column(): 118 | status = gr.Textbox(value="", label="Status") 119 | output = gr.Audio(label="Output", interactive=False) 120 | 121 | with gr.Row(): 122 | start_button = gr.Button("Start server", variant="primary") 123 | upload_button = gr.Button("Upload Model") 124 | convert_button = gr.Button("Convert Voice") 125 | 126 | start_button.click( 127 | start, 128 | inputs=[ 129 | host, 130 | port 131 | ], 132 | outputs=[status], 133 | queue=True, 134 | ) 135 | upload_button.click( 136 | upload, 137 | inputs=[ 138 | host, 139 | port, 140 | rvc_model_file, 141 | faiss_index_file 142 | ], 143 | outputs=[status], 144 | queue=True, 145 | ) 146 | convert_button.click( 147 | convert, 148 | inputs=[ 149 | host, 150 | port, 151 | input_voice_file, 152 | speaker_id, 153 | transpose, 154 | pitch_extraction_algo, 155 | retrieval_feature_ratio 156 | ], 157 | outputs=[status, output], 158 | queue=True, 159 | ) 160 | -------------------------------------------------------------------------------- /modules/tabs/split.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | 3 | from modules.separate import separate_audio 4 | from modules.ui import Tab 5 | 6 | 7 | class Split(Tab): 8 | def title(self): 9 | return "Split Audio" 10 | 11 | def sort(self): 12 | return 5 13 | 14 | def ui(self, outlet): 15 | def separate( 16 | input_audio, 17 | output_dir, 18 | silence_thresh, 19 | min_silence_len, 20 | keep_silence, 21 | margin, 22 | padding, 23 | min, 24 | max, 25 | ): 26 | min = None if min == 0 else min 27 | max = None if max == 0 else max 28 | separate_audio( 29 | input_audio, 30 | output_dir, 31 | int(silence_thresh), 32 | int(min_silence_len), 33 | int(keep_silence), 34 | int(margin), 35 | padding, 36 | int(min), 37 | int(max), 38 | ) 39 | return "Success" 40 | 41 | with gr.Group(): 42 | with gr.Column(): 43 | with gr.Row(equal_height=False): 44 | input_audio = gr.Textbox(label="Input Audio (File or Directory)") 45 | output_dir = gr.Textbox(label="Output Directory") 46 | 47 | with gr.Row(equal_height=False): 48 | silence_thresh = gr.Number(value=-40, label="Silence Threshold") 49 | min_silence_len = gr.Number( 50 | value=750, label="Minimum Silence Length" 51 | ) 52 | keep_silence = gr.Number(value=750, label="Keep Silence") 53 | margin = gr.Number(value=0, label="Margin") 54 | padding = gr.Checkbox(value=True, label="Padding") 55 | 56 | with gr.Row(equal_height=False): 57 | min = gr.Number(value=1000, label="Minimum audio length") 58 | max = gr.Number(value=5000, label="Maximum audio length") 59 | 60 | with gr.Row(equal_height=False): 61 | status = gr.Textbox(value="", label="Status") 62 | with gr.Row(equal_height=False): 63 | separate_button = gr.Button("Separate", 
variant="primary") 64 | 65 | separate_button.click( 66 | separate, 67 | inputs=[ 68 | input_audio, 69 | output_dir, 70 | silence_thresh, 71 | min_silence_len, 72 | keep_silence, 73 | margin, 74 | padding, 75 | min, 76 | max, 77 | ], 78 | outputs=[status], 79 | ) 80 | -------------------------------------------------------------------------------- /modules/ui.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | from typing import * 4 | 5 | import gradio as gr 6 | import gradio.routes 7 | import torch 8 | 9 | from . import models, shared 10 | from .core import preload 11 | from .shared import ROOT_DIR 12 | 13 | 14 | class Tab: 15 | TABS_DIR = os.path.join(ROOT_DIR, "modules", "tabs") 16 | 17 | def __init__(self, filepath: str) -> None: 18 | self.filepath = filepath 19 | 20 | def sort(self): 21 | return 1 22 | 23 | def title(self): 24 | return "" 25 | 26 | def ui(self, outlet: Callable): 27 | pass 28 | 29 | def __call__(self): 30 | children_dir = self.filepath[:-3] 31 | children = [] 32 | 33 | if os.path.isdir(children_dir): 34 | for file in os.listdir(children_dir): 35 | if not file.endswith(".py"): 36 | continue 37 | module_name = file[:-3] 38 | parent = os.path.relpath(Tab.TABS_DIR, Tab.TABS_DIR).replace("/", ".") 39 | 40 | if parent.startswith("."): 41 | parent = parent[1:] 42 | if parent.endswith("."): 43 | parent = parent[:-1] 44 | 45 | children.append( 46 | importlib.import_module(f"modules.tabs.{parent}.{module_name}") 47 | ) 48 | 49 | children = sorted(children, key=lambda x: x.sort()) 50 | 51 | tabs = [] 52 | 53 | for child in children: 54 | attrs = child.__dict__ 55 | tab = [x for x in attrs.values() if issubclass(x, Tab)] 56 | if len(tab) > 0: 57 | tabs.append(tab[0]) 58 | 59 | def outlet(): 60 | with gr.Tabs(): 61 | for tab in tabs: 62 | with gr.Tab(tab.title()): 63 | tab() 64 | 65 | return self.ui(outlet) 66 | 67 | 68 | def load_tabs() -> List[Tab]: 69 | tabs = [] 70 | files = os.listdir(os.path.join(ROOT_DIR, "modules", "tabs")) 71 | 72 | for file in files: 73 | if not file.endswith(".py"): 74 | continue 75 | module_name = file[:-3] 76 | module = importlib.import_module(f"modules.tabs.{module_name}") 77 | attrs = module.__dict__ 78 | TabClass = [ 79 | x 80 | for x in attrs.values() 81 | if type(x) == type and issubclass(x, Tab) and not x == Tab 82 | ] 83 | if len(TabClass) > 0: 84 | tabs.append((file, TabClass[0])) 85 | 86 | tabs = sorted([TabClass(file) for file, TabClass in tabs], key=lambda x: x.sort()) 87 | return tabs 88 | 89 | 90 | def webpath(fn): 91 | if fn.startswith(ROOT_DIR): 92 | web_path = os.path.relpath(fn, ROOT_DIR).replace("\\", "/") 93 | else: 94 | web_path = os.path.abspath(fn) 95 | 96 | return f"file={web_path}?{os.path.getmtime(fn)}" 97 | 98 | 99 | def javascript_html(): 100 | script_js = os.path.join(ROOT_DIR, "script.js") 101 | head = f'\n' 102 | 103 | return head 104 | 105 | 106 | def css_html(): 107 | return f'' 108 | 109 | 110 | def create_head(): 111 | head = "" 112 | head += css_html() 113 | head += javascript_html() 114 | 115 | def template_response(*args, **kwargs): 116 | res = shared.gradio_template_response_original(*args, **kwargs) 117 | res.body = res.body.replace(b"", f"{head}".encode("utf8")) 118 | res.init_headers() 119 | return res 120 | 121 | gradio.routes.templates.TemplateResponse = template_response 122 | 123 | 124 | def create_ui(): 125 | preload() 126 | block = gr.Blocks() 127 | 128 | with block: 129 | with gr.Tabs(): 130 | tabs = load_tabs() 131 | for tab in tabs: 
132 | with gr.Tab(tab.title()): 133 | tab() 134 | 135 | create_head() 136 | 137 | return block 138 | 139 | 140 | def create_model_list_ui(speaker_id: bool = True, load: bool = True): 141 | speaker_id_info = { 142 | "visible": False, 143 | "maximum": 10000, 144 | } 145 | 146 | def reload_model(raw=False): 147 | model_list = models.get_models() 148 | if len(model_list) > 0: 149 | models.load_model(model_list[0]) 150 | 151 | if models.vc_model is not None: 152 | speaker_id_info["visible"] = True 153 | speaker_id_info["maximum"] = models.vc_model.n_spk 154 | 155 | return model_list if raw else gr.Dropdown.update(choices=model_list) 156 | 157 | model_list = reload_model(raw=True) 158 | 159 | def load_model(model_name): 160 | if load: 161 | models.load_model(model_name) 162 | speaker_id_info["visible"] = True 163 | speaker_id_info["maximum"] = models.vc_model.n_spk 164 | else: 165 | model = models.get_vc_model(model_name) 166 | speaker_id_info["visible"] = True 167 | speaker_id_info["maximum"] = model.n_spk 168 | del model 169 | torch.cuda.empty_cache() 170 | return gr.Slider.update( 171 | maximum=speaker_id_info["maximum"], visible=speaker_id_info["visible"] 172 | ) 173 | 174 | with gr.Row(equal_height=False): 175 | model = gr.Dropdown( 176 | choices=model_list, 177 | label="Model", 178 | value=model_list[0] if len(model_list) > 0 else None, 179 | ) 180 | speaker_id = gr.Slider( 181 | minimum=0, 182 | maximum=speaker_id_info["maximum"], 183 | step=1, 184 | label="Speaker ID", 185 | value=0, 186 | visible=speaker_id and speaker_id_info["visible"], 187 | interactive=True, 188 | ) 189 | reload_model_button = gr.Button("♻️") 190 | 191 | model.change(load_model, inputs=[model], outputs=[speaker_id]) 192 | reload_model_button.click(reload_model, outputs=[model]) 193 | 194 | return model, speaker_id 195 | 196 | 197 | if not hasattr(shared, "gradio_template_response_original"): 198 | shared.gradio_template_response_original = gradio.routes.templates.TemplateResponse 199 | -------------------------------------------------------------------------------- /modules/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import * 3 | 4 | import ffmpeg 5 | import numpy as np 6 | import requests 7 | import torch 8 | from tqdm import tqdm 9 | 10 | from lib.rvc.config import TrainConfig 11 | from modules.shared import ROOT_DIR 12 | 13 | 14 | def load_audio(file: str, sr): 15 | try: 16 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 17 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary. 18 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 
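Note: load_audio() defined here returns a mono float32 NumPy array resampled to the requested rate via the ffmpeg CLI; callers such as VoiceConvertModel.single() above pass 16000, the input rate expected by the embedders. A short usage sketch with a placeholder path:

    from modules.utils import load_audio

    audio = load_audio("path/to/source.wav", 16000)  # placeholder path; any ffmpeg-readable format works
    print(audio.dtype, audio.shape)                  # float32, (n_samples,)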
19 | file = ( 20 | file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 21 | ) # Prevent small white copy path head and tail with spaces and " and return 22 | out, _ = ( 23 | ffmpeg.input(file, threads=0) 24 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) 25 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) 26 | ) 27 | except Exception as e: 28 | raise RuntimeError(f"Failed to load audio: {e}") 29 | 30 | return np.frombuffer(out, np.float32).flatten() 31 | 32 | 33 | def get_gpus(): 34 | num_gpus = torch.cuda.device_count() 35 | return [torch.device(f"cuda:{i}") for i in range(num_gpus)] 36 | 37 | 38 | def download_file(url: str, out: str, position: int = 0, show: bool = True): 39 | req = requests.get(url, stream=True, allow_redirects=True) 40 | content_length = req.headers.get("content-length") 41 | if show: 42 | progress_bar = tqdm( 43 | total=int(content_length) if content_length is not None else None, 44 | leave=False, 45 | unit="B", 46 | unit_scale=True, 47 | unit_divisor=1024, 48 | position=position, 49 | ) 50 | 51 | # with tqdm 52 | with open(out, "wb") as f: 53 | for chunk in req.iter_content(chunk_size=1024): 54 | if chunk: 55 | if show: 56 | progress_bar.update(len(chunk)) 57 | f.write(chunk) 58 | 59 | 60 | def load_config( 61 | version: Literal["v1", "v2"], 62 | training_dir: str, 63 | sample_rate: str, 64 | emb_channels: int, 65 | fp16: bool, 66 | ): 67 | if emb_channels == 256: 68 | config_path = os.path.join(ROOT_DIR, "configs", f"{sample_rate}.json") 69 | else: 70 | config_path = os.path.join( 71 | ROOT_DIR, "configs", f"{sample_rate}-{emb_channels}.json" 72 | ) 73 | 74 | config = TrainConfig.parse_file(config_path) 75 | config.version = version 76 | config.train.fp16_run = fp16 77 | 78 | config_save_path = os.path.join(training_dir, "config.json") 79 | 80 | with open(config_save_path, "w") as f: 81 | f.write(config.json()) 82 | 83 | return config 84 | -------------------------------------------------------------------------------- /outputs/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements/main.txt -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | # -r main.txt 2 | 3 | black 4 | isort -------------------------------------------------------------------------------- /requirements/main.txt: -------------------------------------------------------------------------------- 1 | gradio==3.36.1 2 | tqdm==4.65.0 3 | numpy==1.23.5 4 | faiss-cpu==1.7.3 5 | fairseq==0.12.2 6 | matplotlib==3.7.1 7 | scipy==1.9.3 8 | librosa==0.9.1 9 | pyworld==0.3.2 10 | soundfile==0.12.1 11 | ffmpeg-python==0.2.0 12 | pydub==0.25.1 13 | soxr==0.3.5 14 | transformers==4.28.1 15 | torchcrepe==0.0.20 16 | Flask==2.3.2 17 | 18 | tensorboard 19 | tensorboardX 20 | requests -------------------------------------------------------------------------------- /script.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/script.js -------------------------------------------------------------------------------- /server.py: 
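Note: server.py, listed next, exposes the experimental Flask API with /ping, /upload_model and /convert_sound; the Server tab above drives it through the requests library, and a standalone client can do the same. A minimal sketch, assuming the server is already running on 127.0.0.1:5001 and using placeholder file paths:

    import io
    import json

    import requests
    import soundfile as sf
    from scipy.io.wavfile import write

    base = "http://127.0.0.1:5001"

    # register a model (absolute paths are expected)
    requests.post(f"{base}/upload_model", json={
        "rvc_model_file": "/abs/path/to/model.pth",   # placeholder
        "faiss_index_file": "",                       # optional .index file
    })

    # send a wav and receive the converted audio
    audio, sr = sf.read("/abs/path/to/input.wav")     # placeholder
    wav_buffer = io.BytesIO()
    write(wav_buffer, rate=sr, data=audio)
    wav_buffer.seek(0)                                # rewind so requests uploads the full payload
    params = {
        "speaker_id": 0,
        "transpose": 0,
        "pitch_extraction_algo": "crepe",
        "retrieval_feature_ratio": 1.0,
    }
    res = requests.post(
        f"{base}/convert_sound",
        files={
            "input_wav": wav_buffer,
            "params": io.BytesIO(json.dumps(params).encode("utf-8")),
        },
    )
    converted, out_sr = sf.read(io.BytesIO(res.content))
    sf.write("converted.wav", converted, out_sr)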
/outputs/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements/main.txt -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | # -r main.txt 2 | 3 | black 4 | isort -------------------------------------------------------------------------------- /requirements/main.txt: -------------------------------------------------------------------------------- 1 | gradio==3.36.1 2 | tqdm==4.65.0 3 | numpy==1.23.5 4 | faiss-cpu==1.7.3 5 | fairseq==0.12.2 6 | matplotlib==3.7.1 7 | scipy==1.9.3 8 | librosa==0.9.1 9 | pyworld==0.3.2 10 | soundfile==0.12.1 11 | ffmpeg-python==0.2.0 12 | pydub==0.25.1 13 | soxr==0.3.5 14 | transformers==4.28.1 15 | torchcrepe==0.0.20 16 | Flask==2.3.2 17 | 18 | tensorboard 19 | tensorboardX 20 | requests -------------------------------------------------------------------------------- /script.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/script.js -------------------------------------------------------------------------------- /server.py: -------------------------------------------------------------------------------- 1 | import io 2 | import json 3 | import os 4 | import traceback 5 | from typing import * 6 | 7 | import soundfile as sf 8 | from flask import Flask, make_response, request, send_file 9 | from scipy.io.wavfile import write 10 | 11 | from modules.server.model import VoiceServerModel 12 | 13 | model: Optional[VoiceServerModel] = None 14 | app = Flask(__name__) 15 | 16 | @app.route('/ping') 17 | def ping(): 18 | return make_response("server is alive", 200) 19 | 20 | @app.route('/upload_model', methods=['POST']) 21 | def upload_model(): 22 | """ 23 | input: 24 | json: 25 | rvc_model_file: str 26 | specify the RVC model's absolute path (.pt, .pth) 27 | faiss_index_file: Optional[str] 28 | specify the faiss index's absolute path (.index) 29 | """ 30 | global model 31 | if request.method == "POST": 32 | rvc_model_file = request.json["rvc_model_file"] 33 | faiss_index_file = request.json.get("faiss_index_file", "") 34 | try: 35 | model = VoiceServerModel(rvc_model_file, faiss_index_file) 36 | return make_response("model is loaded", 200) 37 | except Exception: 38 | traceback.print_exc() 39 | return make_response("model load error", 400) 40 | else: 41 | return make_response("use the POST method", 400) 42 | 43 | @app.route('/convert_sound', methods=['POST']) 44 | def convert_sound(): 45 | """ 46 | input: 47 | params: json 48 | speaker_id: int 49 | default: 0 50 | transpose: int 51 | default: 0 52 | pitch_extraction_algo: str 53 | default: dio 54 | value: ["dio", "harvest", "mangio-crepe", "crepe"] 55 | retrieval_feature_ratio: float 56 | default: 0 57 | value: 0.0 ~ 1.0 58 | input_wav: wav file 59 | 60 | output: 61 | wav file 62 | """ 63 | global model 64 | if model is None: 65 | return make_response("please upload a model first", 400) 66 | print("start") 67 | if request.method == "POST": 68 | input_buffer = io.BytesIO(request.files["input_wav"].stream.read()) 69 | audio, sr = sf.read(input_buffer) 70 | 71 | req_json = json.load(io.BytesIO(request.files["params"].stream.read())) 72 | sid = int(req_json.get("speaker_id", 0)) 73 | transpose = int(req_json.get("transpose", 0)) 74 | pitch_extraction_algo = req_json.get("pitch_extraction_algo", "dio") 75 | if pitch_extraction_algo not in ["dio", "harvest", "mangio-crepe", "crepe"]: 76 | return make_response("bad pitch extraction algo", 400) 77 | retrieval_feature_ratio = float(req_json.get("retrieval_feature_ratio", 0.)) 78 | 79 | out_audio = model(audio, sr, sid, transpose, pitch_extraction_algo, retrieval_feature_ratio) 80 | output_buffer = io.BytesIO() 81 | write(output_buffer, rate=model.tgt_sr, data=out_audio) 82 | output_buffer.seek(0) 83 | response = make_response(send_file(output_buffer, mimetype="audio/wav"), 200) 84 | return response 85 | else: 86 | return make_response("use the POST method", 400) 87 | 88 | if __name__ == "__main__": 89 | app.run() --------------------------------------------------------------------------------
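A client-side sketch for the HTTP API above (illustrative only, not part of the repository). It assumes the server is running locally on Flask's default port 5000; the model path, audio file names, and parameter values are placeholders, while the field names follow the docstrings in server.py.

import io
import json

import requests  # already pinned in requirements/main.txt

BASE = "http://127.0.0.1:5000"  # assumed local Flask default

# 1. Point the server at a model (absolute path, per the upload_model docstring).
requests.post(f"{BASE}/upload_model", json={"rvc_model_file": "/abs/path/to/model.pth"})

# 2. Convert audio: params travel as a JSON file part next to the wav file part.
params = {"speaker_id": 0, "transpose": 0, "pitch_extraction_algo": "dio", "retrieval_feature_ratio": 0.5}
with open("input.wav", "rb") as wav:
    res = requests.post(
        f"{BASE}/convert_sound",
        files={"input_wav": wav, "params": io.BytesIO(json.dumps(params).encode("utf-8"))},
    )
with open("output.wav", "wb") as out:
    out.write(res.content)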
/styles.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/styles.css -------------------------------------------------------------------------------- /update.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | if exist ".git" ( 4 | git fetch --prune 5 | git reset --hard origin/main 6 | ) else ( 7 | git init 8 | git remote add origin https://github.com/ddPn08/rvc-webui.git 9 | git fetch --prune 10 | git reset --hard origin/main 11 | ) 12 | 13 | pause -------------------------------------------------------------------------------- /update.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ -d .git ]; then 3 | git fetch --prune 4 | git reset --hard origin/main 5 | else 6 | git init 7 | git remote add origin https://github.com/ddPn08/rvc-webui.git 8 | git fetch --prune 9 | git reset --hard origin/main 10 | fi -------------------------------------------------------------------------------- /webui-macos-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #################################################################### 3 | # macOS defaults # 4 | # Please modify webui-user.sh to change these instead of this file # 5 | #################################################################### 6 | 7 | if [[ -x "$(command -v python3.10)" ]] 8 | then 9 | python_cmd="python3.10" 10 | fi 11 | 12 | export COMMANDLINE_ARGS="" 13 | export TORCH_COMMAND="pip install torch torchvision torchaudio" 14 | export PYTORCH_ENABLE_MPS_FALLBACK=1 15 | 16 | #################################################################### -------------------------------------------------------------------------------- /webui-user.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | set PYTHON= 4 | set GIT= 5 | set VENV_DIR= 6 | set COMMANDLINE_ARGS= 7 | 8 | call webui.bat -------------------------------------------------------------------------------- /webui-user.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ######################################################### 3 | # Uncomment and change the variables below to your needs:# 4 | ######################################################### 5 | 6 | # Commandline arguments for webui.py, for example: export COMMANDLINE_ARGS="--share" 7 | #export COMMANDLINE_ARGS="" 8 | 9 | # python3 executable 10 | #python_cmd="python3" 11 | 12 | # git executable 13 | #export GIT="git" 14 | 15 | # python3 venv without trailing slash (defaults to ${install_dir}/${clone_dir}/venv) 16 | #venv_dir="venv" 17 | 18 | # script to launch to start the app 19 | #export LAUNCH_SCRIPT="launch.py" 20 | 21 | # install command for torch 22 | #export TORCH_COMMAND="pip install torch --extra-index-url https://download.pytorch.org/whl/cu118" 23 | 24 | # Requirements file to use for the web UI 25 | #export REQS_FILE="requirements_versions.txt" 26 | 27 | ########################################### -------------------------------------------------------------------------------- /webui.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | if not defined PYTHON (set PYTHON=python) 4 | if not defined VENV_DIR (set "VENV_DIR=%~dp0%venv") 5 | 6 | 7 | set ERROR_REPORTING=FALSE 8 | 9 | mkdir tmp 2>NUL 10 | 11 | %PYTHON% -c "" >tmp/stdout.txt 2>tmp/stderr.txt 12 | if %ERRORLEVEL% == 0 goto :check_pip 13 | echo Couldn't launch python 14 | goto :show_stdout_stderr 15 | 16 | :check_pip 17 | %PYTHON% -mpip --help >tmp/stdout.txt 2>tmp/stderr.txt 18 | if %ERRORLEVEL% == 0 goto :start_venv 19 | if "%PIP_INSTALLER_LOCATION%" == "" goto :show_stdout_stderr 20 | %PYTHON% "%PIP_INSTALLER_LOCATION%" >tmp/stdout.txt 2>tmp/stderr.txt 21 | if %ERRORLEVEL% == 0 goto :start_venv 22 | echo Couldn't install pip 23 |
goto :show_stdout_stderr 24 | 25 | :start_venv 26 | if ["%VENV_DIR%"] == ["-"] goto :launch 27 | if ["%SKIP_VENV%"] == ["1"] goto :launch 28 | 29 | dir "%VENV_DIR%\Scripts\Python.exe" >tmp/stdout.txt 2>tmp/stderr.txt 30 | if %ERRORLEVEL% == 0 goto :activate_venv 31 | 32 | for /f "delims=" %%i in ('CALL %PYTHON% -c "import sys; print(sys.executable)"') do set PYTHON_FULLNAME="%%i" 33 | echo Creating venv in directory %VENV_DIR% using python %PYTHON_FULLNAME% 34 | %PYTHON_FULLNAME% -m venv "%VENV_DIR%" >tmp/stdout.txt 2>tmp/stderr.txt 35 | if %ERRORLEVEL% == 0 goto :activate_venv 36 | echo Unable to create venv in directory "%VENV_DIR%" 37 | goto :show_stdout_stderr 38 | 39 | :activate_venv 40 | set PYTHON="%VENV_DIR%\Scripts\Python.exe" 41 | echo venv %PYTHON% 42 | 43 | :launch 44 | %PYTHON% launch.py %* 45 | pause 46 | exit /b 47 | 48 | :show_stdout_stderr 49 | 50 | echo. 51 | echo exit code: %errorlevel% 52 | 53 | for /f %%i in ("tmp\stdout.txt") do set size=%%~zi 54 | if %size% equ 0 goto :show_stderr 55 | echo. 56 | echo stdout: 57 | type tmp\stdout.txt 58 | 59 | :show_stderr 60 | for /f %%i in ("tmp\stderr.txt") do set size=%%~zi 61 | if %size% equ 0 goto :endofscript 62 | echo. 63 | echo stderr: 64 | type tmp\stderr.txt 65 | 66 | :endofscript 67 | 68 | echo. 69 | echo Launch unsuccessful. Exiting. 70 | pause -------------------------------------------------------------------------------- /webui.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from modules import cmd_opts, ui 4 | 5 | # Ignore the ".DS_Store" files that somehow keep appearing out of nowhere. 6 | # Not sure this is the right place to put this code, though... 7 | _list_dir = os.listdir 8 | 9 | def listdir4mac(path): 10 | return [file for file in _list_dir(path) if not file.startswith(".")] 11 | 12 | os.listdir = listdir4mac 13 | 14 | 15 | def webui(): 16 | app = ui.create_ui() 17 | app.queue(64) 18 | app, local_url, share_url = app.launch( 19 | server_name=cmd_opts.opts.host, 20 | server_port=cmd_opts.opts.port, 21 | share=cmd_opts.opts.share, 22 | ) 23 | 24 | 25 | if __name__ == "__main__": 26 | webui() 27 | -------------------------------------------------------------------------------- /webui.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ################################################# 3 | # Please do not make any changes to this file, # 4 | # change the variables in webui-user.sh instead # 5 | ################################################# 6 | 7 | # If run from macOS, load defaults from webui-macos-env.sh 8 | if [[ "$OSTYPE" == "darwin"* ]]; then 9 | if [[ -f webui-macos-env.sh ]] 10 | then 11 | source ./webui-macos-env.sh 12 | fi 13 | fi 14 | 15 | # Read variables from webui-user.sh 16 | # shellcheck source=/dev/null 17 | if [[ -f webui-user.sh ]] 18 | then 19 | source ./webui-user.sh 20 | fi 21 | 22 | # python3 executable 23 | if [[ -z "${python_cmd}" ]] 24 | then 25 | python_cmd="python3" 26 | fi 27 | 28 | # git executable 29 | if [[ -z "${GIT}" ]] 30 | then 31 | export GIT="git" 32 | fi 33 | 34 | # python3 venv without trailing slash (defaults to ${install_dir}/${clone_dir}/venv) 35 | if [[ -z "${venv_dir}" ]] 36 | then 37 | venv_dir="venv" 38 | fi 39 | 40 | if [[ -z "${LAUNCH_SCRIPT}" ]] 41 | then 42 | LAUNCH_SCRIPT="launch.py" 43 | fi 44 | 45 | # this script cannot be run as root by default 46 | can_run_as_root=0 47 | 48 | # read any command line flags to the webui.sh script 49 | while getopts "f" flag > /dev/null 2>&1 50 | do 51 | case ${flag} in 52 | f)
can_run_as_root=1;; 53 | *) break;; 54 | esac 55 | done 56 | 57 | # Disable sentry logging 58 | export ERROR_REPORTING=FALSE 59 | 60 | # Do not reinstall existing pip packages on Debian/Ubuntu 61 | export PIP_IGNORE_INSTALLED=0 62 | 63 | # Pretty print 64 | delimiter="################################################################" 65 | 66 | # Do not run as root 67 | if [[ $(id -u) -eq 0 && can_run_as_root -eq 0 ]] 68 | then 69 | printf "\n%s\n" "${delimiter}" 70 | printf "\e[1m\e[31mERROR: This script must not be launched as root, aborting...\e[0m" 71 | printf "\n%s\n" "${delimiter}" 72 | exit 1 73 | else 74 | printf "\n%s\n" "${delimiter}" 75 | printf "Running on \e[1m\e[32m%s\e[0m user" "$(whoami)" 76 | printf "\n%s\n" "${delimiter}" 77 | fi 78 | gpu_info=$(lspci 2>/dev/null | grep -E "VGA|Display") # detect the GPU so the AMD/ROCm default below can apply 79 | if echo "$gpu_info" | grep -q "AMD" && [[ -z "${TORCH_COMMAND}" ]] 80 | then 81 | export TORCH_COMMAND="pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.2" 82 | fi 83 | 84 | for preq in "${GIT}" "${python_cmd}" 85 | do 86 | if ! hash "${preq}" &>/dev/null 87 | then 88 | printf "\n%s\n" "${delimiter}" 89 | printf "\e[1m\e[31mERROR: %s is not installed, aborting...\e[0m" "${preq}" 90 | printf "\n%s\n" "${delimiter}" 91 | exit 1 92 | fi 93 | done 94 | 95 | if ! "${python_cmd}" -c "import venv" &>/dev/null 96 | then 97 | printf "\n%s\n" "${delimiter}" 98 | printf "\e[1m\e[31mERROR: python3-venv is not installed, aborting...\e[0m" 99 | printf "\n%s\n" "${delimiter}" 100 | exit 1 101 | fi 102 | 103 | printf "\n%s\n" "${delimiter}" 104 | printf "Create and activate python venv" 105 | printf "\n%s\n" "${delimiter}" 106 | if [[ ! -d "${venv_dir}" ]] 107 | then 108 | "${python_cmd}" -m venv "${venv_dir}" 109 | first_launch=1 110 | fi 111 | # shellcheck source=/dev/null 112 | if [[ -f "${venv_dir}"/bin/activate ]] 113 | then 114 | source "${venv_dir}"/bin/activate 115 | else 116 | printf "\n%s\n" "${delimiter}" 117 | printf "\e[1m\e[31mERROR: Cannot activate python venv, aborting...\e[0m" 118 | printf "\n%s\n" "${delimiter}" 119 | exit 1 120 | fi 121 | 122 | printf "\n%s\n" "${delimiter}" 123 | printf "Launching launch.py..." 124 | printf "\n%s\n" "${delimiter}" 125 | exec "${python_cmd}" "${LAUNCH_SCRIPT}" "$@" --------------------------------------------------------------------------------