├── .gitignore
├── .vscode
│   └── settings.json
├── LICENSE
├── README-ja.md
├── README.md
├── bin
│   └── .gitignore
├── configs
│   ├── 32k-768.json
│   ├── 32k.json
│   ├── 40k-768.json
│   ├── 40k.json
│   ├── 48k-768.json
│   └── 48k.json
├── dev.py
├── launch.py
├── lib
│   └── rvc
│       ├── attentions.py
│       ├── checkpoints.py
│       ├── commons.py
│       ├── config.py
│       ├── data_utils.py
│       ├── losses.py
│       ├── mel_processing.py
│       ├── models.py
│       ├── modules.py
│       ├── pipeline.py
│       ├── preprocessing
│       │   ├── extract_f0.py
│       │   ├── extract_feature.py
│       │   ├── slicer.py
│       │   └── split.py
│       ├── train.py
│       ├── transforms.py
│       └── utils.py
├── models
│   ├── checkpoints
│   │   └── .gitignore
│   ├── embeddings
│   │   └── .gitignore
│   ├── pretrained
│   │   └── .gitignore
│   └── training
│       ├── .gitignore
│       ├── models
│       │   └── .gitignore
│       └── mute
│           ├── 0_gt_wavs
│           │   ├── mute32k.wav
│           │   ├── mute40k.wav
│           │   └── mute48k.wav
│           ├── 1_16k_wavs
│           │   └── mute.wav
│           ├── 2a_f0
│           │   └── mute.wav.npy
│           ├── 2b_f0nsf
│           │   └── mute.wav.npy
│           └── 3_feature256
│               └── mute.npy
├── modules
│   ├── cmd_opts.py
│   ├── core.py
│   ├── merge.py
│   ├── models.py
│   ├── separate.py
│   ├── server
│   │   └── model.py
│   ├── shared.py
│   ├── tabs
│   │   ├── inference.py
│   │   ├── merge.py
│   │   ├── server.py
│   │   ├── split.py
│   │   └── training.py
│   ├── ui.py
│   └── utils.py
├── outputs
│   └── .gitignore
├── requirements.txt
├── requirements
│   ├── dev.txt
│   └── main.txt
├── script.js
├── server.py
├── styles.css
├── update.bat
├── update.sh
├── webui-macos-env.sh
├── webui-user.bat
├── webui-user.sh
├── webui.bat
├── webui.py
└── webui.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 |
3 | tmp/
4 |
5 |
6 | ### Generated by gibo (https://github.com/simonwhitaker/gibo)
7 | ### https://raw.github.com/github/gitignore/4488915eec0b3a45b5c63ead28f286819c0917de/Global/VisualStudioCode.gitignore
8 |
9 | .vscode/*
10 | !.vscode/settings.json
11 | !.vscode/tasks.json
12 | !.vscode/launch.json
13 | !.vscode/extensions.json
14 | !.vscode/*.code-snippets
15 |
16 | # Local History for Visual Studio Code
17 | .history/
18 |
19 | # Built Visual Studio Code Extensions
20 | *.vsix
21 |
22 |
23 | ### https://raw.github.com/github/gitignore/4488915eec0b3a45b5c63ead28f286819c0917de/Python.gitignore
24 |
25 | # Byte-compiled / optimized / DLL files
26 | __pycache__/
27 | *.py[cod]
28 | *$py.class
29 |
30 | # C extensions
31 | *.so
32 |
33 | # Distribution / packaging
34 | .Python
35 | build/
36 | develop-eggs/
37 | dist/
38 | downloads/
39 | eggs/
40 | .eggs/
41 | # lib/
42 | lib64/
43 | parts/
44 | sdist/
45 | var/
46 | wheels/
47 | share/python-wheels/
48 | *.egg-info/
49 | .installed.cfg
50 | *.egg
51 | MANIFEST
52 |
53 | # PyInstaller
54 | # Usually these files are written by a python script from a template
55 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
56 | *.manifest
57 | *.spec
58 |
59 | # Installer logs
60 | pip-log.txt
61 | pip-delete-this-directory.txt
62 |
63 | # Unit test / coverage reports
64 | htmlcov/
65 | .tox/
66 | .nox/
67 | .coverage
68 | .coverage.*
69 | .cache
70 | nosetests.xml
71 | coverage.xml
72 | *.cover
73 | *.py,cover
74 | .hypothesis/
75 | .pytest_cache/
76 | cover/
77 |
78 | # Translations
79 | *.mo
80 | *.pot
81 |
82 | # Django stuff:
83 | *.log
84 | local_settings.py
85 | db.sqlite3
86 | db.sqlite3-journal
87 |
88 | # Flask stuff:
89 | instance/
90 | .webassets-cache
91 |
92 | # Scrapy stuff:
93 | .scrapy
94 |
95 | # Sphinx documentation
96 | docs/_build/
97 |
98 | # PyBuilder
99 | .pybuilder/
100 | target/
101 |
102 | # Jupyter Notebook
103 | .ipynb_checkpoints
104 |
105 | # IPython
106 | profile_default/
107 | ipython_config.py
108 |
109 | # pyenv
110 | # For a library or package, you might want to ignore these files since the code is
111 | # intended to run in multiple environments; otherwise, check them in:
112 | # .python-version
113 |
114 | # pipenv
115 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
116 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
117 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
118 | # install all needed dependencies.
119 | #Pipfile.lock
120 |
121 | # poetry
122 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
123 | # This is especially recommended for binary packages to ensure reproducibility, and is more
124 | # commonly ignored for libraries.
125 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
126 | #poetry.lock
127 |
128 | # pdm
129 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
130 | #pdm.lock
131 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
132 | # in version control.
133 | # https://pdm.fming.dev/#use-with-ide
134 | .pdm.toml
135 |
136 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
137 | __pypackages__/
138 |
139 | # Celery stuff
140 | celerybeat-schedule
141 | celerybeat.pid
142 |
143 | # SageMath parsed files
144 | *.sage.py
145 |
146 | # Environments
147 | .env
148 | .venv
149 | env/
150 | venv/
151 | ENV/
152 | env.bak/
153 | venv.bak/
154 |
155 | # Spyder project settings
156 | .spyderproject
157 | .spyproject
158 |
159 | # Rope project settings
160 | .ropeproject
161 |
162 | # mkdocs documentation
163 | /site
164 |
165 | # mypy
166 | .mypy_cache/
167 | .dmypy.json
168 | dmypy.json
169 |
170 | # Pyre type checker
171 | .pyre/
172 |
173 | # pytype static type analyzer
174 | .pytype/
175 |
176 | # Cython debug symbols
177 | cython_debug/
178 |
179 | # PyCharm
180 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
181 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
182 | # and can be added to the global gitignore or merged into this file. For a more nuclear
183 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
184 | #.idea/
185 |
186 |
187 |
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "python.formatting.provider": "black",
3 | "editor.codeActionsOnSave": {
4 | "source.organizeImports": true
5 | },
6 | "editor.formatOnSave": true,
7 | }
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 ddPn08
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README-ja.md:
--------------------------------------------------------------------------------
1 | RVC-WebUI
2 |
3 |
4 |
5 | [`liujing04/Retrieval-based-Voice-Conversion-WebUI`](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI) reconstruction project
6 |
7 |
8 |
9 |
10 | ---
11 |
12 |
13 |
14 |
15 | [日本語](README-ja.md) | [English](README.md)
16 |
17 |
18 |
19 |
20 |
21 |
22 | # Launch
23 |
24 | ## Windows
25 | Double click `webui-user.bat` to start the webui.
26 |
27 | ## Linux or Mac
28 | Run `webui.sh` to start the webui.
29 |
30 |
31 |
32 | ```
33 | Tested environment: Windows 10, Python 3.10.9, torch 2.0.0+cu118
34 | ```
35 |
36 |
37 |
38 | # Troubleshooting
39 |
40 | ## `error: Microsoft Visual C++ 14.0 or greater is required.`
41 |
42 | Microsoft C++ Build Tools must be installed.
43 |
44 | ### Step 1: Download the installer
45 | [Download](https://visualstudio.microsoft.com/ja/thank-you-downloading-visual-studio/?sku=BuildTools&rel=16)
46 |
47 | ### Step 2: Install `C++ Build Tools`
48 | Run the installer and select `C++ Build Tools` in the `Workloads` tab.
49 |
50 |
51 |
52 | # Credits
53 | - [`liujing04/Retrieval-based-Voice-Conversion-WebUI`](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI)
54 | - [`teftef6220/Voice_Separation_and_Selection`](https://github.com/teftef6220/Voice_Separation_and_Selection)
55 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | RVC-WebUI
2 |
3 |
4 |
5 | [`liujing04/Retrieval-based-Voice-Conversion-WebUI`](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI) reconstruction project
6 |
7 |
8 |
9 |
10 | ---
11 |
12 |
13 |
14 |
15 | [日本語](README-ja.md) | [English](README.md)
16 |
17 |
18 |
19 |
20 |
21 |
22 | # Launch
23 |
24 | ## Windows
25 | Double click `webui-user.bat` to start the webui.
26 |
27 | ## Linux or Mac
28 | Run `webui.sh` to start the webui.
29 |
30 |
31 |
32 | ```
33 | Tested environment: Windows 10, Python 3.10.9, torch 2.0.0+cu118
34 | ```
35 |
36 |
37 |
38 | # Troubleshooting
39 |
40 | ## `error: Microsoft Visual C++ 14.0 or greater is required.`
41 |
42 | Microsoft C++ Build Tools must be installed.
43 |
44 | ### Step 1: Download the installer
45 | [Download](https://visualstudio.microsoft.com/ja/thank-you-downloading-visual-studio/?sku=BuildTools&rel=16)
46 |
47 | ### Step 2: Install `C++ Build Tools`
48 | Run the installer and select `C++ Build Tools` in the `Workloads` tab.
49 |
50 |
51 |
52 | # Credits
53 | - [`liujing04/Retrieval-based-Voice-Conversion-WebUI`](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI)
54 | - [`teftef6220/Voice_Separation_and_Selection`](https://github.com/teftef6220/Voice_Separation_and_Selection)
55 |
--------------------------------------------------------------------------------
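Both start scripts ultimately hand control to `launch.py` (included below), which splits the `COMMANDLINE_ARGS` environment variable into `sys.argv`, installs missing requirements, and then runs `webui.py`. A minimal sketch of driving the same entry point programmatically, assuming it is executed from the repository root (`--skip-install`, `--reinstall-torch`, and `--ngrok` are the flags `launch.py` recognizes):

```python
import os
import subprocess
import sys

env = dict(os.environ)
# launch.py shlex-splits COMMANDLINE_ARGS and appends it to sys.argv.
env["COMMANDLINE_ARGS"] = "--skip-install"
subprocess.run([sys.executable, "launch.py"], env=env, check=True)
```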
/bin/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 |
--------------------------------------------------------------------------------
/configs/32k-768.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "seed": 1234,
5 | "epochs": 20000,
6 | "learning_rate": 1e-4,
7 | "betas": [0.8, 0.99],
8 | "eps": 1e-9,
9 | "batch_size": 4,
10 | "fp16_run": true,
11 | "lr_decay": 0.999875,
12 | "segment_size": 12800,
13 | "init_lr_ratio": 1,
14 | "warmup_epochs": 0,
15 | "c_mel": 45,
16 | "c_kl": 1.0
17 | },
18 | "data": {
19 | "max_wav_value": 32768.0,
20 | "sampling_rate": 32000,
21 | "filter_length": 1024,
22 | "hop_length": 320,
23 | "win_length": 1024,
24 | "n_mel_channels": 80,
25 | "mel_fmin": 0.0,
26 | "mel_fmax": null
27 | },
28 | "model": {
29 | "inter_channels": 192,
30 | "hidden_channels": 192,
31 | "filter_channels": 768,
32 | "n_heads": 2,
33 | "n_layers": 6,
34 | "kernel_size": 3,
35 | "p_dropout": 0,
36 | "resblock": "1",
37 | "resblock_kernel_sizes": [3,7,11],
38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39 | "upsample_rates": [10,4,2,2,2],
40 | "upsample_initial_channel": 512,
41 | "upsample_kernel_sizes": [16,16,4,4,4],
42 | "use_spectral_norm": false,
43 | "gin_channels": 256,
44 | "emb_channels": 768,
45 | "spk_embed_dim": 109
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/configs/32k.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "seed": 1234,
5 | "epochs": 20000,
6 | "learning_rate": 1e-4,
7 | "betas": [0.8, 0.99],
8 | "eps": 1e-9,
9 | "batch_size": 4,
10 | "fp16_run": true,
11 | "lr_decay": 0.999875,
12 | "segment_size": 12800,
13 | "init_lr_ratio": 1,
14 | "warmup_epochs": 0,
15 | "c_mel": 45,
16 | "c_kl": 1.0
17 | },
18 | "data": {
19 | "max_wav_value": 32768.0,
20 | "sampling_rate": 32000,
21 | "filter_length": 1024,
22 | "hop_length": 320,
23 | "win_length": 1024,
24 | "n_mel_channels": 80,
25 | "mel_fmin": 0.0,
26 | "mel_fmax": null
27 | },
28 | "model": {
29 | "inter_channels": 192,
30 | "hidden_channels": 192,
31 | "filter_channels": 768,
32 | "n_heads": 2,
33 | "n_layers": 6,
34 | "kernel_size": 3,
35 | "p_dropout": 0,
36 | "resblock": "1",
37 | "resblock_kernel_sizes": [3,7,11],
38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39 | "upsample_rates": [10,4,2,2,2],
40 | "upsample_initial_channel": 512,
41 | "upsample_kernel_sizes": [16,16,4,4,4],
42 | "use_spectral_norm": false,
43 | "gin_channels": 256,
44 | "emb_channels": 256,
45 | "spk_embed_dim": 109
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/configs/40k-768.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "seed": 1234,
5 | "epochs": 20000,
6 | "learning_rate": 1e-4,
7 | "betas": [0.8, 0.99],
8 | "eps": 1e-9,
9 | "batch_size": 4,
10 | "fp16_run": true,
11 | "lr_decay": 0.999875,
12 | "segment_size": 12800,
13 | "init_lr_ratio": 1,
14 | "warmup_epochs": 0,
15 | "c_mel": 45,
16 | "c_kl": 1.0
17 | },
18 | "data": {
19 | "max_wav_value": 32768.0,
20 | "sampling_rate": 40000,
21 | "filter_length": 2048,
22 | "hop_length": 400,
23 | "win_length": 2048,
24 | "n_mel_channels": 125,
25 | "mel_fmin": 0.0,
26 | "mel_fmax": null
27 | },
28 | "model": {
29 | "inter_channels": 192,
30 | "hidden_channels": 192,
31 | "filter_channels": 768,
32 | "n_heads": 2,
33 | "n_layers": 6,
34 | "kernel_size": 3,
35 | "p_dropout": 0,
36 | "resblock": "1",
37 | "resblock_kernel_sizes": [3,7,11],
38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39 | "upsample_rates": [10,10,2,2],
40 | "upsample_initial_channel": 512,
41 | "upsample_kernel_sizes": [16,16,4,4],
42 | "use_spectral_norm": false,
43 | "gin_channels": 256,
44 | "emb_channels": 768,
45 | "spk_embed_dim": 109
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/configs/40k.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "seed": 1234,
5 | "epochs": 20000,
6 | "learning_rate": 1e-4,
7 | "betas": [0.8, 0.99],
8 | "eps": 1e-9,
9 | "batch_size": 4,
10 | "fp16_run": true,
11 | "lr_decay": 0.999875,
12 | "segment_size": 12800,
13 | "init_lr_ratio": 1,
14 | "warmup_epochs": 0,
15 | "c_mel": 45,
16 | "c_kl": 1.0
17 | },
18 | "data": {
19 | "max_wav_value": 32768.0,
20 | "sampling_rate": 40000,
21 | "filter_length": 2048,
22 | "hop_length": 400,
23 | "win_length": 2048,
24 | "n_mel_channels": 125,
25 | "mel_fmin": 0.0,
26 | "mel_fmax": null
27 | },
28 | "model": {
29 | "inter_channels": 192,
30 | "hidden_channels": 192,
31 | "filter_channels": 768,
32 | "n_heads": 2,
33 | "n_layers": 6,
34 | "kernel_size": 3,
35 | "p_dropout": 0,
36 | "resblock": "1",
37 | "resblock_kernel_sizes": [3,7,11],
38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39 | "upsample_rates": [10,10,2,2],
40 | "upsample_initial_channel": 512,
41 | "upsample_kernel_sizes": [16,16,4,4],
42 | "use_spectral_norm": false,
43 | "gin_channels": 256,
44 | "emb_channels": 256,
45 | "spk_embed_dim": 109
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/configs/48k-768.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "seed": 1234,
5 | "epochs": 20000,
6 | "learning_rate": 1e-4,
7 | "betas": [0.8, 0.99],
8 | "eps": 1e-9,
9 | "batch_size": 4,
10 | "fp16_run": true,
11 | "lr_decay": 0.999875,
12 | "segment_size": 11520,
13 | "init_lr_ratio": 1,
14 | "warmup_epochs": 0,
15 | "c_mel": 45,
16 | "c_kl": 1.0
17 | },
18 | "data": {
19 | "max_wav_value": 32768.0,
20 | "sampling_rate": 48000,
21 | "filter_length": 2048,
22 | "hop_length": 480,
23 | "win_length": 2048,
24 | "n_mel_channels": 128,
25 | "mel_fmin": 0.0,
26 | "mel_fmax": null
27 | },
28 | "model": {
29 | "inter_channels": 192,
30 | "hidden_channels": 192,
31 | "filter_channels": 768,
32 | "n_heads": 2,
33 | "n_layers": 6,
34 | "kernel_size": 3,
35 | "p_dropout": 0,
36 | "resblock": "1",
37 | "resblock_kernel_sizes": [3,7,11],
38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39 | "upsample_rates": [10,6,2,2,2],
40 | "upsample_initial_channel": 512,
41 | "upsample_kernel_sizes": [16,16,4,4,4],
42 | "use_spectral_norm": false,
43 | "gin_channels": 256,
44 | "emb_channels": 768,
45 | "spk_embed_dim": 109
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/configs/48k.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "seed": 1234,
5 | "epochs": 20000,
6 | "learning_rate": 1e-4,
7 | "betas": [0.8, 0.99],
8 | "eps": 1e-9,
9 | "batch_size": 4,
10 | "fp16_run": true,
11 | "lr_decay": 0.999875,
12 | "segment_size": 11520,
13 | "init_lr_ratio": 1,
14 | "warmup_epochs": 0,
15 | "c_mel": 45,
16 | "c_kl": 1.0
17 | },
18 | "data": {
19 | "max_wav_value": 32768.0,
20 | "sampling_rate": 48000,
21 | "filter_length": 2048,
22 | "hop_length": 480,
23 | "win_length": 2048,
24 | "n_mel_channels": 128,
25 | "mel_fmin": 0.0,
26 | "mel_fmax": null
27 | },
28 | "model": {
29 | "inter_channels": 192,
30 | "hidden_channels": 192,
31 | "filter_channels": 768,
32 | "n_heads": 2,
33 | "n_layers": 6,
34 | "kernel_size": 3,
35 | "p_dropout": 0,
36 | "resblock": "1",
37 | "resblock_kernel_sizes": [3,7,11],
38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39 | "upsample_rates": [10,6,2,2,2],
40 | "upsample_initial_channel": 512,
41 | "upsample_kernel_sizes": [16,16,4,4,4],
42 | "use_spectral_norm": false,
43 | "gin_channels": 256,
44 | "emb_channels": 256,
45 | "spk_embed_dim": 109
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
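The six configs differ mainly in sampling rate, STFT parameters, and `emb_channels` (256 in the base configs, 768 in the `-768` variants). In every file the product of `upsample_rates` equals `hop_length`, so the decoder upsamples feature frames back to the waveform rate. A quick sanity check over the files above, run from the repository root:

```python
import json
from functools import reduce

for name in ("32k", "32k-768", "40k", "40k-768", "48k", "48k-768"):
    with open(f"configs/{name}.json") as f:
        cfg = json.load(f)
    hop = cfg["data"]["hop_length"]
    up = reduce(lambda a, b: a * b, cfg["model"]["upsample_rates"])
    assert up == hop, name  # decoder upsampling must undo the STFT hop
    print(name, cfg["data"]["sampling_rate"], hop, cfg["model"]["emb_channels"])
```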
/dev.py:
--------------------------------------------------------------------------------
1 | import modules.ui as ui
2 |
3 | demo = ui.create_ui()
4 |
--------------------------------------------------------------------------------
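`dev.py` only builds the Gradio interface without serving it, which suits a reload-style development workflow. A minimal sketch of serving the same interface directly, assuming `modules.ui.create_ui()` returns a Gradio `Blocks` object and the project dependencies are installed:

```python
import modules.ui as ui

demo = ui.create_ui()

if __name__ == "__main__":
    # For illustration only; webui.py is the project's actual entry point.
    demo.launch()
```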
/launch.py:
--------------------------------------------------------------------------------
1 | import importlib.util
2 | import os
3 | import shlex
4 | import subprocess
5 | import sys
6 |
7 | commandline_args = os.environ.get("COMMANDLINE_ARGS", "")
8 | sys.argv += shlex.split(commandline_args)
9 |
10 | python = sys.executable
11 | git = os.environ.get("GIT", "git")
12 | index_url = os.environ.get("INDEX_URL", "")
13 | stored_commit_hash = None
14 | skip_install = False
15 |
16 |
17 | def run(command, desc=None, errdesc=None, custom_env=None):
18 | if desc is not None:
19 | print(desc)
20 |
21 | result = subprocess.run(
22 | command,
23 | stdout=subprocess.PIPE,
24 | stderr=subprocess.PIPE,
25 | shell=True,
26 | env=os.environ if custom_env is None else custom_env,
27 | )
28 |
29 | if result.returncode != 0:
30 | message = f"""{errdesc or 'Error running command'}.
31 | Command: {command}
32 | Error code: {result.returncode}
33 | stdout: {result.stdout.decode(encoding="utf8", errors="ignore") if len(result.stdout)>0 else ''}
34 | stderr: {result.stderr.decode(encoding="utf8", errors="ignore") if len(result.stderr)>0 else ''}
35 | """
36 | raise RuntimeError(message)
37 |
38 | return result.stdout.decode(encoding="utf8", errors="ignore")
39 |
40 |
41 | def check_run(command):
42 | result = subprocess.run(
43 | command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
44 | )
45 | return result.returncode == 0
46 |
47 |
48 | def is_installed(package):
49 | try:
50 | spec = importlib.util.find_spec(package)
51 | except ModuleNotFoundError:
52 | return False
53 |
54 | return spec is not None
55 |
56 |
57 | def commit_hash():
58 | global stored_commit_hash
59 |
60 | if stored_commit_hash is not None:
61 | return stored_commit_hash
62 |
63 | try:
64 | stored_commit_hash = run(f"{git} rev-parse HEAD").strip()
65 | except Exception:
66 | stored_commit_hash = ""
67 |
68 | return stored_commit_hash
69 |
70 |
71 | def run_pip(args, desc=None):
72 | if skip_install:
73 | return
74 |
75 | index_url_line = f" --index-url {index_url}" if index_url != "" else ""
76 | return run(
77 | f'"{python}" -m pip {args} --prefer-binary{index_url_line}',
78 | desc=f"Installing {desc}",
79 | errdesc=f"Couldn't install {desc}",
80 | )
81 |
82 |
83 | def run_python(code, desc=None, errdesc=None):
84 | return run(f'"{python}" -c "{code}"', desc, errdesc)
85 |
86 |
87 | def extract_arg(args, name):
88 | return [x for x in args if x != name], name in args
89 |
90 |
91 | def prepare_environment():
92 | commit = commit_hash()
93 |
94 | print(f"Python {sys.version}")
95 | print(f"Commit hash: {commit}")
96 |
97 | torch_command = os.environ.get(
98 | "TORCH_COMMAND",
99 | "pip install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118",
100 | )
101 |
102 | sys.argv, skip_install = extract_arg(sys.argv, "--skip-install")
103 | if skip_install:
104 | return
105 |
106 | sys.argv, reinstall_torch = extract_arg(sys.argv, "--reinstall-torch")
107 | ngrok = "--ngrok" in sys.argv
108 |
109 | if reinstall_torch or not is_installed("torch") or not is_installed("torchaudio"):
110 | run(
111 | f'"{python}" -m {torch_command}',
112 | "Installing torch and torchaudio",
113 | "Couldn't install torch",
114 | )
115 |
116 | if not is_installed("pyngrok") and ngrok:
117 | run_pip("install pyngrok", "ngrok")
118 |
119 | run(
120 | f'"{python}" -m pip install -r requirements.txt',
121 | desc=f"Installing requirements",
122 | errdesc=f"Couldn't install requirements",
123 | )
124 |
125 |
126 | def start():
127 | os.environ["PATH"] = (
128 | os.path.join(os.path.dirname(__file__), "bin")
129 | + os.pathsep
130 | + os.environ.get("PATH", "")
131 | )
132 | subprocess.run(
133 | [python, "webui.py", *sys.argv[1:]],
134 | )
135 |
136 |
137 | if __name__ == "__main__":
138 | prepare_environment()
139 | start()
140 |
--------------------------------------------------------------------------------
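The helpers in `launch.py` are plain functions and can be exercised on their own; importing the module only appends `COMMANDLINE_ARGS` to `sys.argv`, since environment preparation runs only under `__main__`. A small usage sketch, run from the repository root:

```python
import launch

print(launch.commit_hash())            # current commit hash, or "" outside a git checkout
print(launch.is_installed("torch"))    # True if the package can be imported
launch.run("echo hello", desc="Running a shell command through launch.run")
```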
/lib/rvc/attentions.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 | from torch import nn
5 | from torch.nn import functional as F
6 |
7 | from . import commons
8 | from .modules import LayerNorm
9 |
10 |
11 | class Encoder(nn.Module):
12 | def __init__(
13 | self,
14 | hidden_channels,
15 | filter_channels,
16 | n_heads,
17 | n_layers,
18 | kernel_size=1,
19 | p_dropout=0.0,
20 | window_size=10,
21 | **kwargs
22 | ):
23 | super().__init__()
24 | self.hidden_channels = hidden_channels
25 | self.filter_channels = filter_channels
26 | self.n_heads = n_heads
27 | self.n_layers = n_layers
28 | self.kernel_size = kernel_size
29 | self.p_dropout = p_dropout
30 | self.window_size = window_size
31 |
32 | self.drop = nn.Dropout(p_dropout)
33 | self.attn_layers = nn.ModuleList()
34 | self.norm_layers_1 = nn.ModuleList()
35 | self.ffn_layers = nn.ModuleList()
36 | self.norm_layers_2 = nn.ModuleList()
37 | for i in range(self.n_layers):
38 | self.attn_layers.append(
39 | MultiHeadAttention(
40 | hidden_channels,
41 | hidden_channels,
42 | n_heads,
43 | p_dropout=p_dropout,
44 | window_size=window_size,
45 | )
46 | )
47 | self.norm_layers_1.append(LayerNorm(hidden_channels))
48 | self.ffn_layers.append(
49 | FFN(
50 | hidden_channels,
51 | hidden_channels,
52 | filter_channels,
53 | kernel_size,
54 | p_dropout=p_dropout,
55 | )
56 | )
57 | self.norm_layers_2.append(LayerNorm(hidden_channels))
58 |
59 | def forward(self, x, x_mask):
60 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
61 | x = x * x_mask
62 | for i in range(self.n_layers):
63 | y = self.attn_layers[i](x, x, attn_mask)
64 | y = self.drop(y)
65 | x = self.norm_layers_1[i](x + y)
66 |
67 | y = self.ffn_layers[i](x, x_mask)
68 | y = self.drop(y)
69 | x = self.norm_layers_2[i](x + y)
70 | x = x * x_mask
71 | return x
72 |
73 |
74 | class Decoder(nn.Module):
75 | def __init__(
76 | self,
77 | hidden_channels,
78 | filter_channels,
79 | n_heads,
80 | n_layers,
81 | kernel_size=1,
82 | p_dropout=0.0,
83 | proximal_bias=False,
84 | proximal_init=True,
85 | **kwargs
86 | ):
87 | super().__init__()
88 | self.hidden_channels = hidden_channels
89 | self.filter_channels = filter_channels
90 | self.n_heads = n_heads
91 | self.n_layers = n_layers
92 | self.kernel_size = kernel_size
93 | self.p_dropout = p_dropout
94 | self.proximal_bias = proximal_bias
95 | self.proximal_init = proximal_init
96 |
97 | self.drop = nn.Dropout(p_dropout)
98 | self.self_attn_layers = nn.ModuleList()
99 | self.norm_layers_0 = nn.ModuleList()
100 | self.encdec_attn_layers = nn.ModuleList()
101 | self.norm_layers_1 = nn.ModuleList()
102 | self.ffn_layers = nn.ModuleList()
103 | self.norm_layers_2 = nn.ModuleList()
104 | for i in range(self.n_layers):
105 | self.self_attn_layers.append(
106 | MultiHeadAttention(
107 | hidden_channels,
108 | hidden_channels,
109 | n_heads,
110 | p_dropout=p_dropout,
111 | proximal_bias=proximal_bias,
112 | proximal_init=proximal_init,
113 | )
114 | )
115 | self.norm_layers_0.append(LayerNorm(hidden_channels))
116 | self.encdec_attn_layers.append(
117 | MultiHeadAttention(
118 | hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
119 | )
120 | )
121 | self.norm_layers_1.append(LayerNorm(hidden_channels))
122 | self.ffn_layers.append(
123 | FFN(
124 | hidden_channels,
125 | hidden_channels,
126 | filter_channels,
127 | kernel_size,
128 | p_dropout=p_dropout,
129 | causal=True,
130 | )
131 | )
132 | self.norm_layers_2.append(LayerNorm(hidden_channels))
133 |
134 | def forward(self, x, x_mask, h, h_mask):
135 | """
136 | x: decoder input
137 | h: encoder output
138 | """
139 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
140 | device=x.device, dtype=x.dtype
141 | )
142 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
143 | x = x * x_mask
144 | for i in range(self.n_layers):
145 | y = self.self_attn_layers[i](x, x, self_attn_mask)
146 | y = self.drop(y)
147 | x = self.norm_layers_0[i](x + y)
148 |
149 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
150 | y = self.drop(y)
151 | x = self.norm_layers_1[i](x + y)
152 |
153 | y = self.ffn_layers[i](x, x_mask)
154 | y = self.drop(y)
155 | x = self.norm_layers_2[i](x + y)
156 | x = x * x_mask
157 | return x
158 |
159 |
160 | class MultiHeadAttention(nn.Module):
161 | def __init__(
162 | self,
163 | channels,
164 | out_channels,
165 | n_heads,
166 | p_dropout=0.0,
167 | window_size=None,
168 | heads_share=True,
169 | block_length=None,
170 | proximal_bias=False,
171 | proximal_init=False,
172 | ):
173 | super().__init__()
174 | assert channels % n_heads == 0
175 |
176 | self.channels = channels
177 | self.out_channels = out_channels
178 | self.n_heads = n_heads
179 | self.p_dropout = p_dropout
180 | self.window_size = window_size
181 | self.heads_share = heads_share
182 | self.block_length = block_length
183 | self.proximal_bias = proximal_bias
184 | self.proximal_init = proximal_init
185 | self.attn = None
186 |
187 | self.k_channels = channels // n_heads
188 | self.conv_q = nn.Conv1d(channels, channels, 1)
189 | self.conv_k = nn.Conv1d(channels, channels, 1)
190 | self.conv_v = nn.Conv1d(channels, channels, 1)
191 | self.conv_o = nn.Conv1d(channels, out_channels, 1)
192 | self.drop = nn.Dropout(p_dropout)
193 |
194 | if window_size is not None:
195 | n_heads_rel = 1 if heads_share else n_heads
196 | rel_stddev = self.k_channels**-0.5
197 | self.emb_rel_k = nn.Parameter(
198 | torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
199 | * rel_stddev
200 | )
201 | self.emb_rel_v = nn.Parameter(
202 | torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
203 | * rel_stddev
204 | )
205 |
206 | nn.init.xavier_uniform_(self.conv_q.weight)
207 | nn.init.xavier_uniform_(self.conv_k.weight)
208 | nn.init.xavier_uniform_(self.conv_v.weight)
209 | if proximal_init:
210 | with torch.no_grad():
211 | self.conv_k.weight.copy_(self.conv_q.weight)
212 | self.conv_k.bias.copy_(self.conv_q.bias)
213 |
214 | def forward(self, x, c, attn_mask=None):
215 | q = self.conv_q(x)
216 | k = self.conv_k(c)
217 | v = self.conv_v(c)
218 |
219 | x, self.attn = self.attention(q, k, v, mask=attn_mask)
220 |
221 | x = self.conv_o(x)
222 | return x
223 |
224 | def attention(self, query, key, value, mask=None):
225 | # reshape [b, d, t] -> [b, n_h, t, d_k]
226 | b, d, t_s, t_t = (*key.size(), query.size(2))
227 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
228 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
229 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
230 |
231 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
232 | if self.window_size is not None:
233 | assert (
234 | t_s == t_t
235 | ), "Relative attention is only available for self-attention."
236 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
237 | rel_logits = self._matmul_with_relative_keys(
238 | query / math.sqrt(self.k_channels), key_relative_embeddings
239 | )
240 | scores_local = self._relative_position_to_absolute_position(rel_logits)
241 | scores = scores + scores_local
242 | if self.proximal_bias:
243 | assert t_s == t_t, "Proximal bias is only available for self-attention."
244 | scores = scores + self._attention_bias_proximal(t_s).to(
245 | device=scores.device, dtype=scores.dtype
246 | )
247 | if mask is not None:
248 | scores = scores.masked_fill(mask == 0, -1e4)
249 | if self.block_length is not None:
250 | assert (
251 | t_s == t_t
252 | ), "Local attention is only available for self-attention."
253 | block_mask = (
254 | torch.ones_like(scores)
255 | .triu(-self.block_length)
256 | .tril(self.block_length)
257 | )
258 | scores = scores.masked_fill(block_mask == 0, -1e4)
259 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
260 | p_attn = self.drop(p_attn)
261 | output = torch.matmul(p_attn, value)
262 | if self.window_size is not None:
263 | relative_weights = self._absolute_position_to_relative_position(p_attn)
264 | value_relative_embeddings = self._get_relative_embeddings(
265 | self.emb_rel_v, t_s
266 | )
267 | output = output + self._matmul_with_relative_values(
268 | relative_weights, value_relative_embeddings
269 | )
270 | output = (
271 | output.transpose(2, 3).contiguous().view(b, d, t_t)
272 | ) # [b, n_h, t_t, d_k] -> [b, d, t_t]
273 | return output, p_attn
274 |
275 | def _matmul_with_relative_values(self, x, y):
276 | """
277 | x: [b, h, l, m]
278 | y: [h or 1, m, d]
279 | ret: [b, h, l, d]
280 | """
281 | ret = torch.matmul(x, y.unsqueeze(0))
282 | return ret
283 |
284 | def _matmul_with_relative_keys(self, x, y):
285 | """
286 | x: [b, h, l, d]
287 | y: [h or 1, m, d]
288 | ret: [b, h, l, m]
289 | """
290 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
291 | return ret
292 |
293 | def _get_relative_embeddings(self, relative_embeddings, length):
294 | max_relative_position = 2 * self.window_size + 1
295 | # Pad first before slice to avoid using cond ops.
296 | pad_length = max(length - (self.window_size + 1), 0)
297 | slice_start_position = max((self.window_size + 1) - length, 0)
298 | slice_end_position = slice_start_position + 2 * length - 1
299 | if pad_length > 0:
300 | padded_relative_embeddings = F.pad(
301 | relative_embeddings,
302 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
303 | )
304 | else:
305 | padded_relative_embeddings = relative_embeddings
306 | used_relative_embeddings = padded_relative_embeddings[
307 | :, slice_start_position:slice_end_position
308 | ]
309 | return used_relative_embeddings
310 |
311 | def _relative_position_to_absolute_position(self, x):
312 | """
313 | x: [b, h, l, 2*l-1]
314 | ret: [b, h, l, l]
315 | """
316 | batch, heads, length, _ = x.size()
317 | # Concat columns of pad to shift from relative to absolute indexing.
318 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
319 |
320 | # Concat extra elements so to add up to shape (len+1, 2*len-1).
321 | x_flat = x.view([batch, heads, length * 2 * length])
322 | x_flat = F.pad(
323 | x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
324 | )
325 |
326 | # Reshape and slice out the padded elements.
327 | x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
328 | :, :, :length, length - 1 :
329 | ]
330 | return x_final
331 |
332 | def _absolute_position_to_relative_position(self, x):
333 | """
334 | x: [b, h, l, l]
335 | ret: [b, h, l, 2*l-1]
336 | """
337 | batch, heads, length, _ = x.size()
338 | # pad along column
339 | x = F.pad(
340 | x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
341 | )
342 | x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
343 | # add 0's in the beginning that will skew the elements after reshape
344 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
345 | x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
346 | return x_final
347 |
348 | def _attention_bias_proximal(self, length):
349 | """Bias for self-attention to encourage attention to close positions.
350 | Args:
351 | length: an integer scalar.
352 | Returns:
353 | a Tensor with shape [1, 1, length, length]
354 | """
355 | r = torch.arange(length, dtype=torch.float32)
356 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
357 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
358 |
359 |
360 | class FFN(nn.Module):
361 | def __init__(
362 | self,
363 | in_channels,
364 | out_channels,
365 | filter_channels,
366 | kernel_size,
367 | p_dropout=0.0,
368 | activation=None,
369 | causal=False,
370 | ):
371 | super().__init__()
372 | self.in_channels = in_channels
373 | self.out_channels = out_channels
374 | self.filter_channels = filter_channels
375 | self.kernel_size = kernel_size
376 | self.p_dropout = p_dropout
377 | self.activation = activation
378 | self.causal = causal
379 |
380 | if causal:
381 | self.padding = self._causal_padding
382 | else:
383 | self.padding = self._same_padding
384 |
385 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
386 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
387 | self.drop = nn.Dropout(p_dropout)
388 |
389 | def forward(self, x, x_mask):
390 | x = self.conv_1(self.padding(x * x_mask))
391 | if self.activation == "gelu":
392 | x = x * torch.sigmoid(1.702 * x)
393 | else:
394 | x = torch.relu(x)
395 | x = self.drop(x)
396 | x = self.conv_2(self.padding(x * x_mask))
397 | return x * x_mask
398 |
399 | def _causal_padding(self, x):
400 | if self.kernel_size == 1:
401 | return x
402 | pad_l = self.kernel_size - 1
403 | pad_r = 0
404 | padding = [[0, 0], [0, 0], [pad_l, pad_r]]
405 | x = F.pad(x, commons.convert_pad_shape(padding))
406 | return x
407 |
408 | def _same_padding(self, x):
409 | if self.kernel_size == 1:
410 | return x
411 | pad_l = (self.kernel_size - 1) // 2
412 | pad_r = self.kernel_size // 2
413 | padding = [[0, 0], [0, 0], [pad_l, pad_r]]
414 | x = F.pad(x, commons.convert_pad_shape(padding))
415 | return x
416 |
--------------------------------------------------------------------------------
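For orientation, a minimal sketch of driving the `Encoder` above with the hyperparameters from `configs/40k.json`; the mask construction mirrors `commons.sequence_mask`. Run from the repository root with torch installed:

```python
import torch

from lib.rvc import commons
from lib.rvc.attentions import Encoder

enc = Encoder(
    hidden_channels=192, filter_channels=768, n_heads=2,
    n_layers=6, kernel_size=3, p_dropout=0.0,
)
x = torch.randn(2, 192, 100)                 # [batch, hidden_channels, frames]
lengths = torch.tensor([100, 80])
x_mask = commons.sequence_mask(lengths, 100).unsqueeze(1).to(x.dtype)  # [batch, 1, frames]
y = enc(x, x_mask)                           # same shape as x, padded frames zeroed
print(y.shape)
```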
/lib/rvc/checkpoints.py:
--------------------------------------------------------------------------------
1 | import os
2 | from collections import OrderedDict
3 | from typing import *
4 |
5 | import torch
6 |
7 |
8 | def write_config(state_dict: Dict[str, Any], cfg: Dict[str, Any]):
9 | state_dict["config"] = []
10 | for key, x in cfg.items():
11 | state_dict["config"].append(x)
12 | state_dict["params"] = cfg
13 |
14 |
15 | def create_trained_model(
16 | weights: Dict[str, Any],
17 | version: Literal["v1", "v2"],
18 | sr: str,
19 | f0: bool,
20 | emb_name: str,
21 | emb_ch: int,
22 | emb_output_layer: int,
23 | epoch: int,
24 | speaker_info: Optional[dict[str, int]]
25 | ):
26 | state_dict = OrderedDict()
27 | state_dict["weight"] = {}
28 | for key in weights.keys():
29 | if "enc_q" in key:
30 | continue
31 | state_dict["weight"][key] = weights[key].half()
32 | if sr == "40k":
33 | write_config(
34 | state_dict,
35 | {
36 | "spec_channels": 1025,
37 | "segment_size": 32,
38 | "inter_channels": 192,
39 | "hidden_channels": 192,
40 | "filter_channels": 768,
41 | "n_heads": 2,
42 | "n_layers": 6,
43 | "kernel_size": 3,
44 | "p_dropout": 0,
45 | "resblock": "1",
46 | "resblock_kernel_sizes": [3, 7, 11],
47 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
48 | "upsample_rates": [10, 10, 2, 2],
49 | "upsample_initial_channel": 512,
50 | "upsample_kernel_sizes": [16, 16, 4, 4],
51 | "spk_embed_dim": 109 if speaker_info is None else len(speaker_info),
52 | "gin_channels": 256,
53 | "emb_channels": emb_ch,
54 | "sr": 40000,
55 | },
56 | )
57 | elif sr == "48k":
58 | write_config(
59 | state_dict,
60 | {
61 | "spec_channels": 1025,
62 | "segment_size": 32,
63 | "inter_channels": 192,
64 | "hidden_channels": 192,
65 | "filter_channels": 768,
66 | "n_heads": 2,
67 | "n_layers": 6,
68 | "kernel_size": 3,
69 | "p_dropout": 0,
70 | "resblock": "1",
71 | "resblock_kernel_sizes": [3, 7, 11],
72 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
73 | "upsample_rates": [10, 6, 2, 2, 2],
74 | "upsample_initial_channel": 512,
75 | "upsample_kernel_sizes": [16, 16, 4, 4, 4],
76 | "spk_embed_dim": 109 if speaker_info is None else len(speaker_info),
77 | "gin_channels": 256,
78 | "emb_channels": emb_ch,
79 | "sr": 48000,
80 | },
81 | )
82 | elif sr == "32k":
83 | write_config(
84 | state_dict,
85 | {
86 | "spec_channels": 513,
87 | "segment_size": 32,
88 | "inter_channels": 192,
89 | "hidden_channels": 192,
90 | "filter_channels": 768,
91 | "n_heads": 2,
92 | "n_layers": 6,
93 | "kernel_size": 3,
94 | "p_dropout": 0,
95 | "resblock": "1",
96 | "resblock_kernel_sizes": [3, 7, 11],
97 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
98 | "upsample_rates": [10, 4, 2, 2, 2],
99 | "upsample_initial_channel": 512,
100 | "upsample_kernel_sizes": [16, 16, 4, 4, 4],
101 | "spk_embed_dim": 109 if speaker_info is None else len(speaker_info),
102 | "gin_channels": 256,
103 | "emb_channels": emb_ch,
104 | "sr": 32000,
105 | },
106 | )
107 | state_dict["version"] = version
108 | state_dict["info"] = f"{epoch}epoch"
109 | state_dict["sr"] = sr
110 | state_dict["f0"] = 1 if f0 else 0
111 | state_dict["embedder_name"] = emb_name
112 | state_dict["embedder_output_layer"] = emb_output_layer
113 | if speaker_info is not None:
114 | state_dict["speaker_info"] = {str(v): str(k) for k, v in speaker_info.items()}
115 | return state_dict
116 |
117 |
118 | def save(
119 | model,
120 | version: Literal["v1", "v2"],
121 | sr: str,
122 | f0: bool,
123 | emb_name: str,
124 | emb_ch: int,
125 | emb_output_layer: int,
126 | filepath: str,
127 | epoch: int,
128 | speaker_info: Optional[dict[str, int]]
129 | ):
130 | if hasattr(model, "module"):
131 | state_dict = model.module.state_dict()
132 | else:
133 | state_dict = model.state_dict()
134 |
135 | print(f"save: emb_name: {emb_name} {emb_ch}")
136 |
137 | state_dict = create_trained_model(
138 | state_dict,
139 | version,
140 | sr,
141 | f0,
142 | emb_name,
143 | emb_ch,
144 | emb_output_layer,
145 | epoch,
146 | speaker_info
147 | )
148 | os.makedirs(os.path.dirname(filepath), exist_ok=True)
149 | torch.save(state_dict, filepath)
150 |
--------------------------------------------------------------------------------
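The exported checkpoint is a plain `torch.save` dictionary, so the metadata written by `create_trained_model` can be inspected directly. A sketch, with a hypothetical checkpoint path:

```python
import torch

ckpt = torch.load("models/checkpoints/my_model.pth", map_location="cpu")  # hypothetical path
print(ckpt["version"], ckpt["sr"], ckpt["f0"])      # e.g. "v2", "40k", 1
print(ckpt["embedder_name"], ckpt["embedder_output_layer"])
print(ckpt["params"]["emb_channels"])               # 256 or 768 depending on the config
weights = ckpt["weight"]                            # fp16 state dict with enc_q.* keys dropped
```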
/lib/rvc/commons.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 | from torch.nn import functional as F
5 |
6 |
7 | def init_weights(m, mean=0.0, std=0.01):
8 | classname = m.__class__.__name__
9 | if classname.find("Conv") != -1:
10 | m.weight.data.normal_(mean, std)
11 |
12 |
13 | def get_padding(kernel_size, dilation=1):
14 | return int((kernel_size * dilation - dilation) / 2)
15 |
16 |
17 | def convert_pad_shape(pad_shape):
18 | l = pad_shape[::-1]
19 | pad_shape = [item for sublist in l for item in sublist]
20 | return pad_shape
21 |
22 |
23 | def kl_divergence(m_p, logs_p, m_q, logs_q):
24 | """KL(P||Q)"""
25 | kl = (logs_q - logs_p) - 0.5
26 | kl += (
27 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
28 | )
29 | return kl
30 |
31 |
32 | def rand_gumbel(shape):
33 | """Sample from the Gumbel distribution, protect from overflows."""
34 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
35 | return -torch.log(-torch.log(uniform_samples))
36 |
37 |
38 | def rand_gumbel_like(x):
39 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
40 | return g
41 |
42 |
43 | def slice_segments(x, ids_str, segment_size=4):
44 | ret = torch.zeros_like(x[:, :, :segment_size])
45 | for i in range(x.size(0)):
46 | idx_str = ids_str[i]
47 | idx_end = idx_str + segment_size
48 | ret[i] = x[i, :, idx_str:idx_end]
49 | return ret
50 |
51 |
52 | def slice_segments2(x, ids_str, segment_size=4):
53 | ret = torch.zeros_like(x[:, :segment_size])
54 | for i in range(x.size(0)):
55 | idx_str = ids_str[i]
56 | idx_end = idx_str + segment_size
57 | ret[i] = x[i, idx_str:idx_end]
58 | return ret
59 |
60 |
61 | def rand_slice_segments(x, x_lengths=None, segment_size=4):
62 | b, d, t = x.size()
63 | if x_lengths is None:
64 | x_lengths = t
65 | ids_str_max = x_lengths - segment_size + 1
66 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
67 | ret = slice_segments(x, ids_str, segment_size)
68 | return ret, ids_str
69 |
70 |
71 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
72 | position = torch.arange(length, dtype=torch.float)
73 | num_timescales = channels // 2
74 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
75 | num_timescales - 1
76 | )
77 | inv_timescales = min_timescale * torch.exp(
78 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
79 | )
80 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
81 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
82 | signal = F.pad(signal, [0, 0, 0, channels % 2])
83 | signal = signal.view(1, channels, length)
84 | return signal
85 |
86 |
87 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
88 | b, channels, length = x.size()
89 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
90 | return x + signal.to(dtype=x.dtype, device=x.device)
91 |
92 |
93 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
94 | b, channels, length = x.size()
95 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
96 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
97 |
98 |
99 | def subsequent_mask(length):
100 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
101 | return mask
102 |
103 |
104 | @torch.jit.script
105 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
106 | n_channels_int = n_channels[0]
107 | in_act = input_a + input_b
108 | t_act = torch.tanh(in_act[:, :n_channels_int, :])
109 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
110 | acts = t_act * s_act
111 | return acts
112 |
113 |
114 | def convert_pad_shape(pad_shape):
115 | l = pad_shape[::-1]
116 | pad_shape = [item for sublist in l for item in sublist]
117 | return pad_shape
118 |
119 |
120 | def shift_1d(x):
121 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
122 | return x
123 |
124 |
125 | def sequence_mask(length, max_length=None):
126 | if max_length is None:
127 | max_length = length.max()
128 | x = torch.arange(max_length, dtype=length.dtype, device=length.device)
129 | return x.unsqueeze(0) < length.unsqueeze(1)
130 |
131 |
132 | def generate_path(duration, mask):
133 | """
134 | duration: [b, 1, t_x]
135 | mask: [b, 1, t_y, t_x]
136 | """
137 | b, _, t_y, t_x = mask.shape
138 | cum_duration = torch.cumsum(duration, -1)
139 |
140 | cum_duration_flat = cum_duration.view(b * t_x)
141 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
142 | path = path.view(b, t_x, t_y)
143 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
144 | path = path.unsqueeze(1).transpose(2, 3) * mask
145 | return path
146 |
147 |
148 | def clip_grad_value_(parameters, clip_value, norm_type=2):
149 | if isinstance(parameters, torch.Tensor):
150 | parameters = [parameters]
151 | parameters = list(filter(lambda p: p.grad is not None, parameters))
152 | norm_type = float(norm_type)
153 | if clip_value is not None:
154 | clip_value = float(clip_value)
155 |
156 | total_norm = 0
157 | for p in parameters:
158 | param_norm = p.grad.data.norm(norm_type)
159 | total_norm += param_norm.item() ** norm_type
160 | if clip_value is not None:
161 | p.grad.data.clamp_(min=-clip_value, max=clip_value)
162 | total_norm = total_norm ** (1.0 / norm_type)
163 | return total_norm
164 |
--------------------------------------------------------------------------------
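A couple of the helpers above are easy to sanity check in isolation, for example `sequence_mask` and `rand_slice_segments` (`segment_size=32` matches the value written into exported checkpoints by `checkpoints.py` above):

```python
import torch

from lib.rvc import commons

x = torch.randn(2, 192, 400)                  # [batch, channels, frames]
lengths = torch.tensor([400, 350])
mask = commons.sequence_mask(lengths, 400)    # [2, 400] boolean mask
segments, ids_str = commons.rand_slice_segments(x, lengths, segment_size=32)
print(mask.shape, segments.shape, ids_str)    # [2, 400], [2, 192, 32], random start frames
```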
/lib/rvc/config.py:
--------------------------------------------------------------------------------
1 | from typing import *
2 |
3 | from pydantic import BaseModel
4 |
5 |
6 | class TrainConfigTrain(BaseModel):
7 | log_interval: int
8 | seed: int
9 | epochs: int
10 | learning_rate: float
11 | betas: List[float]
12 | eps: float
13 | batch_size: int
14 | fp16_run: bool
15 | lr_decay: float
16 | segment_size: int
17 | init_lr_ratio: int
18 | warmup_epochs: int
19 | c_mel: int
20 | c_kl: float
21 |
22 |
23 | class TrainConfigData(BaseModel):
24 | max_wav_value: float
25 | sampling_rate: int
26 | filter_length: int
27 | hop_length: int
28 | win_length: int
29 | n_mel_channels: int
30 | mel_fmin: float
31 | mel_fmax: Any
32 |
33 |
34 | class TrainConfigModel(BaseModel):
35 | inter_channels: int
36 | hidden_channels: int
37 | filter_channels: int
38 | n_heads: int
39 | n_layers: int
40 | kernel_size: int
41 | p_dropout: int
42 | resblock: str
43 | resblock_kernel_sizes: List[int]
44 | resblock_dilation_sizes: List[List[int]]
45 | upsample_rates: List[int]
46 | upsample_initial_channel: int
47 | upsample_kernel_sizes: List[int]
48 | use_spectral_norm: bool
49 | gin_channels: int
50 | emb_channels: int
51 | spk_embed_dim: int
52 |
53 |
54 | class TrainConfig(BaseModel):
55 | version: Literal["v1", "v2"] = "v2"
56 | train: TrainConfigTrain
57 | data: TrainConfigData
58 | model: TrainConfigModel
59 |
60 |
61 | class DatasetMetaItem(BaseModel):
62 | gt_wav: str
63 | co256: str
64 | f0: Optional[str]
65 | f0nsf: Optional[str]
66 | speaker_id: int
67 |
68 |
69 | class DatasetMetadata(BaseModel):
70 | files: Dict[str, DatasetMetaItem]
71 | # mute: DatasetMetaItem
72 |
--------------------------------------------------------------------------------
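These pydantic models mirror the JSON files in `configs/`, so a training config can be loaded and validated directly. A sketch assuming pydantic v1-style `parse_obj`; the `version` field falls back to its `"v2"` default when the JSON omits it:

```python
import json

from lib.rvc.config import TrainConfig

with open("configs/40k.json") as f:
    config = TrainConfig.parse_obj(json.load(f))

print(config.version, config.data.sampling_rate)    # v2 40000
print(config.model.emb_channels)                     # 256
```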
/lib/rvc/losses.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def feature_loss(fmap_r, fmap_g):
5 | loss = 0
6 | for dr, dg in zip(fmap_r, fmap_g):
7 | for rl, gl in zip(dr, dg):
8 | rl = rl.float().detach()
9 | gl = gl.float()
10 | loss += torch.mean(torch.abs(rl - gl))
11 |
12 | return loss * 2
13 |
14 |
15 | def discriminator_loss(disc_real_outputs, disc_generated_outputs):
16 | loss = 0
17 | r_losses = []
18 | g_losses = []
19 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
20 | dr = dr.float()
21 | dg = dg.float()
22 | r_loss = torch.mean((1 - dr) ** 2)
23 | g_loss = torch.mean(dg**2)
24 | loss += r_loss + g_loss
25 | r_losses.append(r_loss.item())
26 | g_losses.append(g_loss.item())
27 |
28 | return loss, r_losses, g_losses
29 |
30 |
31 | def generator_loss(disc_outputs):
32 | loss = 0
33 | gen_losses = []
34 | for dg in disc_outputs:
35 | dg = dg.float()
36 | l = torch.mean((1 - dg) ** 2)
37 | gen_losses.append(l)
38 | loss += l
39 |
40 | return loss, gen_losses
41 |
42 |
43 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
44 | """
45 | z_p, logs_q: [b, h, t_t]
46 | m_p, logs_p: [b, h, t_t]
47 | """
48 | z_p = z_p.float()
49 | logs_q = logs_q.float()
50 | m_p = m_p.float()
51 | logs_p = logs_p.float()
52 | z_mask = z_mask.float()
53 |
54 | kl = logs_p - logs_q - 0.5
55 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
56 | kl = torch.sum(kl * z_mask)
57 | l = kl / torch.sum(z_mask)
58 | return l
59 |
--------------------------------------------------------------------------------
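The losses operate on lists of discriminator outputs (one entry per sub-discriminator), so they can be smoke tested with dummy tensors; the actual wiring lives in `lib/rvc/train.py`, which is not reproduced here:

```python
import torch

from lib.rvc import losses

real = [torch.rand(4, 1), torch.rand(4, 1)]   # one output tensor per sub-discriminator (real audio)
fake = [torch.rand(4, 1), torch.rand(4, 1)]   # same for generated audio
d_loss, r_losses, g_losses = losses.discriminator_loss(real, fake)
g_loss, gen_losses = losses.generator_loss(fake)
print(float(d_loss), float(g_loss))
```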
/lib/rvc/mel_processing.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.utils.data
3 | from librosa.filters import mel as librosa_mel_fn
4 |
5 | MAX_WAV_VALUE = 32768.0
6 |
7 |
8 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
9 | """
10 | PARAMS
11 | ------
12 | C: compression factor
13 | """
14 | return torch.log(torch.clamp(x, min=clip_val) * C)
15 |
16 |
17 | def dynamic_range_decompression_torch(x, C=1):
18 | """
19 | PARAMS
20 | ------
21 | C: compression factor used to compress
22 | """
23 | return torch.exp(x) / C
24 |
25 |
26 | def spectral_normalize_torch(magnitudes):
27 | return dynamic_range_compression_torch(magnitudes)
28 |
29 |
30 | def spectral_de_normalize_torch(magnitudes):
31 | return dynamic_range_decompression_torch(magnitudes)
32 |
33 |
34 | mel_basis = {}
35 | hann_window = {}
36 |
37 |
38 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
39 | if torch.min(y) < -1.07:
40 | print("min value is ", torch.min(y))
41 | if torch.max(y) > 1.07:
42 | print("max value is ", torch.max(y))
43 |
44 | global hann_window
45 | dtype_device = str(y.dtype) + "_" + str(y.device)
46 | wnsize_dtype_device = str(win_size) + "_" + dtype_device
47 | if wnsize_dtype_device not in hann_window:
48 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
49 | dtype=y.dtype, device=y.device
50 | )
51 |
52 | y = torch.nn.functional.pad(
53 | y.unsqueeze(1),
54 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
55 | mode="reflect",
56 | )
57 | y = y.squeeze(1)
58 |
59 | # mps does not support torch.stft.
60 | if y.device.type == "mps":
61 | i = y.cpu()
62 | win = hann_window[wnsize_dtype_device].cpu()
63 | else:
64 | i = y
65 | win = hann_window[wnsize_dtype_device]
66 | spec = torch.stft(
67 | i,
68 | n_fft,
69 | hop_length=hop_size,
70 | win_length=win_size,
71 | window=win,
72 | center=center,
73 | pad_mode="reflect",
74 | normalized=False,
75 | onesided=True,
76 | return_complex=False,
77 | ).to(device=y.device)
78 |
79 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
80 | return spec
81 |
82 |
83 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
84 | global mel_basis
85 | dtype_device = str(spec.dtype) + "_" + str(spec.device)
86 | fmax_dtype_device = str(fmax) + "_" + dtype_device
87 | if fmax_dtype_device not in mel_basis:
88 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
89 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
90 | dtype=spec.dtype, device=spec.device
91 | )
92 | melspec = torch.matmul(mel_basis[fmax_dtype_device], spec)
93 | melspec = spectral_normalize_torch(melspec)
94 | return melspec
95 |
96 |
97 | def mel_spectrogram_torch(
98 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
99 | ):
100 | """Convert waveform into Mel-frequency Log-amplitude spectrogram.
101 |
102 | Args:
103 | y :: (B, T) - Waveforms
104 | Returns:
105 | melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram
106 | """
107 | # Linear-frequency Linear-amplitude spectrogram :: (B, T) -> (B, Freq, Frame)
108 | spec = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center)
109 |
110 | # Mel-frequency Log-amplitude spectrogram :: (B, Freq, Frame) -> (B, Freq=num_mels, Frame)
111 | melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax)
112 |
113 | return melspec
114 |
--------------------------------------------------------------------------------
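A minimal sketch of computing a mel spectrogram with the 40 kHz settings from `configs/40k.json`. Note that `spec_to_mel_torch` calls `librosa.filters.mel` positionally, which assumes an older librosa API (newer librosa versions require keyword arguments):

```python
import torch

from lib.rvc.mel_processing import mel_spectrogram_torch

wav = torch.rand(1, 40000) * 2 - 1      # one second of audio in [-1, 1] at 40 kHz
mel = mel_spectrogram_torch(
    wav, n_fft=2048, num_mels=125, sampling_rate=40000,
    hop_size=400, win_size=2048, fmin=0.0, fmax=None,
)
print(mel.shape)                        # (1, 125, frames), log-amplitude mel
```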
/lib/rvc/pipeline.py:
--------------------------------------------------------------------------------
1 | import os
2 | import traceback
3 | from typing import *
4 |
5 | import faiss
6 | import numpy as np
7 | import pyworld
8 | import scipy.signal as signal
9 | import torch
10 | import torch.nn.functional as F
11 | import torchcrepe
12 | from torch import Tensor
13 | # from faiss.swigfaiss_avx2 import IndexIVFFlat # cause crash on windows' faiss-cpu installed from pip
14 | from fairseq.models.hubert import HubertModel
15 |
16 | from .models import SynthesizerTrnMs256NSFSid
17 |
18 |
19 | class VocalConvertPipeline(object):
20 | def __init__(self, tgt_sr: int, device: Union[str, torch.device], is_half: bool):
21 | if isinstance(device, str):
22 | device = torch.device(device)
23 | if device.type == "cuda":
24 | vram = torch.cuda.get_device_properties(device).total_memory / 1024**3
25 | else:
26 | vram = None
27 |
28 | if vram is not None and vram <= 4:
29 | self.x_pad = 1
30 | self.x_query = 5
31 | self.x_center = 30
32 | self.x_max = 32
33 | elif vram is not None and vram <= 5:
34 | self.x_pad = 1
35 | self.x_query = 6
36 | self.x_center = 38
37 | self.x_max = 41
38 | else:
39 | self.x_pad = 3
40 | self.x_query = 10
41 | self.x_center = 60
42 | self.x_max = 65
43 |
44 | self.sr = 16000 # hubert input sample rate
45 | self.window = 160 # hubert input window
46 | self.t_pad = self.sr * self.x_pad # padding time for each utterance
47 | self.t_pad_tgt = tgt_sr * self.x_pad
48 | self.t_pad2 = self.t_pad * 2
49 | self.t_query = self.sr * self.x_query # query time before and after query point
50 | self.t_center = self.sr * self.x_center # query cut point position
51 | self.t_max = self.sr * self.x_max # max time for no query
52 | self.device = device
53 | self.is_half = is_half
54 |
55 | def get_optimal_torch_device(self, index: int = 0) -> torch.device:
56 | # Get cuda device
57 | if torch.cuda.is_available():
58 | return torch.device(f"cuda:{index % torch.cuda.device_count()}") # Very fast
59 | elif torch.backends.mps.is_available():
60 | return torch.device("mps")
61 | # Insert an else here to grab "xla" devices if available. TO DO later. Requires the torch_xla.core.xla_model library
62 | # Else wise return the "cpu" as a torch device,
63 | return torch.device("cpu")
64 |
65 | def get_f0_crepe_computation(
66 | self,
67 | x,
68 | f0_min,
69 | f0_max,
70 | p_len,
71 | hop_length=64, # 512 before. Hop length changes the speed that the voice jumps to a different dramatic pitch. Lower hop lengths means more pitch accuracy but longer inference time.
72 | model="full", # Either use crepe-tiny "tiny" or crepe "full". Default is full
73 | ):
74 | x = x.astype(np.float32) # fixes the F.conv2D exception. We needed to convert double to float.
75 | x /= np.quantile(np.abs(x), 0.999)
76 | torch_device = self.get_optimal_torch_device()
77 | audio = torch.from_numpy(x).to(torch_device, copy=True)
78 | audio = torch.unsqueeze(audio, dim=0)
79 | if audio.ndim == 2 and audio.shape[0] > 1:
80 | audio = torch.mean(audio, dim=0, keepdim=True).detach()
81 | audio = audio.detach()
82 | print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
83 | pitch: Tensor = torchcrepe.predict(
84 | audio,
85 | self.sr,
86 | hop_length,
87 | f0_min,
88 | f0_max,
89 | model,
90 | batch_size=hop_length * 2,
91 | device=torch_device,
92 | pad=True
93 | )
94 | p_len = p_len or x.shape[0] // hop_length
95 | # Resize the pitch for final f0
96 | source = np.array(pitch.squeeze(0).cpu().float().numpy())
97 | source[source < 0.001] = np.nan
98 | target = np.interp(
99 | np.arange(0, len(source) * p_len, len(source)) / p_len,
100 | np.arange(0, len(source)),
101 | source
102 | )
103 | f0 = np.nan_to_num(target)
104 | return f0 # Resized f0
105 |
106 | def get_f0_official_crepe_computation(
107 | self,
108 | x,
109 | f0_min,
110 | f0_max,
111 | model="full",
112 | ):
113 | # Pick a batch size that doesn't cause memory errors on your gpu
114 | batch_size = 512
115 | # Compute pitch on the configured device
116 | audio = torch.tensor(np.copy(x))[None].float()
117 | f0, pd = torchcrepe.predict(
118 | audio,
119 | self.sr,
120 | self.window,
121 | f0_min,
122 | f0_max,
123 | model,
124 | batch_size=batch_size,
125 | device=self.device,
126 | return_periodicity=True,
127 | )
128 | pd = torchcrepe.filter.median(pd, 3)
129 | f0 = torchcrepe.filter.mean(f0, 3)
130 | f0[pd < 0.1] = 0
131 | f0 = f0[0].cpu().numpy()
132 | return f0
133 |
134 | def get_f0(
135 | self,
136 | x: np.ndarray,
137 | p_len: int,
138 | f0_up_key: int,
139 | f0_method: str,
140 | inp_f0: np.ndarray = None,
141 | ):
142 | f0_min = 50
143 | f0_max = 1100
144 | f0_mel_min = 1127 * np.log(1 + f0_min / 700)
145 | f0_mel_max = 1127 * np.log(1 + f0_max / 700)
146 |
147 | if f0_method == "harvest":
148 | f0, t = pyworld.harvest(
149 | x.astype(np.double),
150 | fs=self.sr,
151 | f0_ceil=f0_max,
152 | f0_floor=f0_min,
153 | frame_period=10,
154 | )
155 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
156 | f0 = signal.medfilt(f0, 3)
157 | elif f0_method == "dio":
158 | f0, t = pyworld.dio(
159 | x.astype(np.double),
160 | fs=self.sr,
161 | f0_ceil=f0_max,
162 | f0_floor=f0_min,
163 | frame_period=10,
164 | )
165 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
166 | f0 = signal.medfilt(f0, 3)
167 | elif f0_method == "mangio-crepe":
168 | f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, 160, "full")
169 | elif f0_method == "crepe":
170 | f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "full")
171 |
172 | f0 *= pow(2, f0_up_key / 12)
173 | tf0 = self.sr // self.window # f0 points per second
174 | if inp_f0 is not None:
175 | delta_t = np.round(
176 | (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
177 | ).astype("int16")
178 | replace_f0 = np.interp(
179 | list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
180 | )
181 | shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
182 | f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
183 | :shape
184 | ]
185 |
186 | f0bak = f0.copy()
187 | f0_mel = 1127 * np.log(1 + f0 / 700)
188 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
189 | f0_mel_max - f0_mel_min
190 | ) + 1
191 | f0_mel[f0_mel <= 1] = 1
192 | f0_mel[f0_mel > 255] = 255
193 | f0_coarse = np.rint(f0_mel).astype(np.int64)  # np.int is removed in NumPy >= 1.24
194 | return f0_coarse, f0bak # 1-0
195 |
196 | def _convert(
197 | self,
198 | model: HubertModel,
199 | embedding_output_layer: int,
200 | net_g: SynthesizerTrnMs256NSFSid,
201 | sid: int,
202 | audio: np.ndarray,
203 | pitch: np.ndarray,
204 | pitchf: np.ndarray,
205 | index: faiss.IndexIVFFlat,
206 | big_npy: np.ndarray,
207 | index_rate: float,
208 | ):
209 | feats = torch.from_numpy(audio)
210 | if self.is_half:
211 | feats = feats.half()
212 | else:
213 | feats = feats.float()
214 | if feats.dim() == 2: # double channels
215 | feats = feats.mean(-1)
216 | assert feats.dim() == 1, feats.dim()
217 | feats = feats.view(1, -1)
218 | padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
219 |
220 | half_support = (
221 | self.device.type == "cuda"
222 | and torch.cuda.get_device_capability(self.device)[0] >= 5.3
223 | )
224 | is_feats_dim_768 = net_g.emb_channels == 768
225 |
226 | if isinstance(model, tuple):
227 | feats = model[0](
228 | feats.squeeze(0).squeeze(0).to(self.device),
229 | return_tensors="pt",
230 | sampling_rate=16000,
231 | )
232 | if self.is_half:
233 | feats = feats.input_values.to(self.device).half()
234 | else:
235 | feats = feats.input_values.to(self.device)
236 | with torch.no_grad():
237 | if is_feats_dim_768:
238 | feats = model[1](feats).last_hidden_state
239 | else:
240 | feats = model[1](feats).extract_features
241 | else:
242 | inputs = {
243 | "source": feats.half().to(self.device)
244 | if half_support
245 | else feats.to(self.device),
246 | "padding_mask": padding_mask.to(self.device),
247 | "output_layer": embedding_output_layer,
248 | }
249 |
250 | if not half_support:
251 | model = model.float()
252 | inputs["source"] = inputs["source"].float()
253 |
254 | with torch.no_grad():
255 | logits = model.extract_features(**inputs)
256 | if is_feats_dim_768:
257 | feats = logits[0]
258 | else:
259 | feats = model.final_proj(logits[0])
260 |
261 | if (
262 | index is not None
263 | and big_npy is not None
264 | and index_rate != 0
265 | ):
266 | npy = feats[0].cpu().numpy()
267 | if self.is_half:
268 | npy = npy.astype("float32")
269 |
270 | score, ix = index.search(npy, k=8)
271 | weight = np.square(1 / score)
272 | weight /= weight.sum(axis=1, keepdims=True)
273 | npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
274 |
275 | if self.is_half:
276 | npy = npy.astype("float16")
277 | feats = (
278 | torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
279 | + (1 - index_rate) * feats
280 | )
281 |
282 | feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
283 |
284 | p_len = audio.shape[0] // self.window
285 | if feats.shape[1] < p_len:
286 | p_len = feats.shape[1]
287 | if pitch is not None and pitchf is not None:
288 | pitch = pitch[:, :p_len]
289 | pitchf = pitchf[:, :p_len]
290 | p_len = torch.tensor([p_len], device=self.device).long()
291 | with torch.no_grad():
292 | if pitch is not None and pitchf is not None:
293 | audio1 = (
294 | (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768)
295 | .data.cpu()
296 | .float()
297 | .numpy()
298 | .astype(np.int16)
299 | )
300 | else:
301 | audio1 = (
302 | (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768)
303 | .data.cpu()
304 | .float()
305 | .numpy()
306 | .astype(np.int16)
307 | )
308 | del feats, p_len, padding_mask
309 | if torch.cuda.is_available():
310 | torch.cuda.empty_cache()
311 | return audio1
312 |
313 | def __call__(
314 | self,
315 | model: HubertModel,
316 | embedding_output_layer: int,
317 | net_g: SynthesizerTrnMs256NSFSid,
318 | sid: int,
319 | audio: np.ndarray,
320 | transpose: int,
321 | f0_method: str,
322 | file_index: str,
323 | index_rate: float,
324 | if_f0: bool,
325 | f0_file: str = None,
326 | ):
327 | if file_index != "" and os.path.exists(file_index) and index_rate != 0:
328 | try:
329 | index = faiss.read_index(file_index)
330 | # big_npy = np.load(file_big_npy)
331 | big_npy = index.reconstruct_n(0, index.ntotal)
332 | except:
333 | traceback.print_exc()
334 | index = big_npy = None
335 | else:
336 | index = big_npy = None
337 |
338 | bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
339 | audio = signal.filtfilt(bh, ah, audio)
340 |
341 | audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
342 | opt_ts = []
343 | if audio_pad.shape[0] > self.t_max:
344 | audio_sum = np.zeros_like(audio)
345 | for i in range(self.window):
346 | audio_sum += audio_pad[i : i - self.window]
347 | for t in range(self.t_center, audio.shape[0], self.t_center):
348 | opt_ts.append(
349 | t
350 | - self.t_query
351 | + np.where(
352 | np.abs(audio_sum[t - self.t_query : t + self.t_query])
353 | == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
354 | )[0][0]
355 | )
356 |
357 | audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
358 | p_len = audio_pad.shape[0] // self.window
359 | inp_f0 = None
360 | if hasattr(f0_file, "name"):
361 | try:
362 | with open(f0_file.name, "r") as f:
363 | lines = f.read().strip("\n").split("\n")
364 | inp_f0 = []
365 | for line in lines:
366 | inp_f0.append([float(i) for i in line.split(",")])
367 | inp_f0 = np.array(inp_f0, dtype="float32")
368 | except:
369 | traceback.print_exc()
370 | sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
371 | pitch, pitchf = None, None
372 | if if_f0 == 1:
373 | pitch, pitchf = self.get_f0(audio_pad, p_len, transpose, f0_method, inp_f0)
374 | pitch = pitch[:p_len]
375 | pitchf = pitchf[:p_len]
376 | if self.device.type == "mps":
377 | pitchf = pitchf.astype(np.float32)
378 | pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
379 | pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
380 |
381 | audio_opt = []
382 |
383 | s = 0
384 | t = None
385 |
386 | for t in opt_ts:
387 | t = t // self.window * self.window
388 | if if_f0 == 1:
389 | audio_opt.append(
390 | self._convert(
391 | model,
392 | embedding_output_layer,
393 | net_g,
394 | sid,
395 | audio_pad[s : t + self.t_pad2 + self.window],
396 | pitch[:, s // self.window : (t + self.t_pad2) // self.window],
397 | pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
398 | index,
399 | big_npy,
400 | index_rate,
401 | )[self.t_pad_tgt : -self.t_pad_tgt]
402 | )
403 | else:
404 | audio_opt.append(
405 | self._convert(
406 | model,
407 | embedding_output_layer,
408 | net_g,
409 | sid,
410 | audio_pad[s : t + self.t_pad2 + self.window],
411 | None,
412 | None,
413 | index,
414 | big_npy,
415 | index_rate,
416 | )[self.t_pad_tgt : -self.t_pad_tgt]
417 | )
418 | s = t
419 | if if_f0 == 1:
420 | audio_opt.append(
421 | self._convert(
422 | model,
423 | embedding_output_layer,
424 | net_g,
425 | sid,
426 | audio_pad[t:],
427 | pitch[:, t // self.window :] if t is not None else pitch,
428 | pitchf[:, t // self.window :] if t is not None else pitchf,
429 | index,
430 | big_npy,
431 | index_rate,
432 | )[self.t_pad_tgt : -self.t_pad_tgt]
433 | )
434 | else:
435 | audio_opt.append(
436 | self._convert(
437 | model,
438 | embedding_output_layer,
439 | net_g,
440 | sid,
441 | audio_pad[t:],
442 | None,
443 | None,
444 | index,
445 | big_npy,
446 | index_rate,
447 | )[self.t_pad_tgt : -self.t_pad_tgt]
448 | )
449 | audio_opt = np.concatenate(audio_opt)
450 | del pitch, pitchf, sid
451 | if torch.cuda.is_available():
452 | torch.cuda.empty_cache()
453 | return audio_opt
454 |
--------------------------------------------------------------------------------
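The pipeline above (lib/rvc/pipeline.py) is driven entirely through `VocalConvertPipeline.__call__`. A minimal invocation sketch follows; it assumes the HuBERT embedder (`hubert_model`) and the generator (`net_g`) have already been loaded elsewhere in the repo, and the input path and parameter values are illustrative only:

import numpy as np
import librosa

from lib.rvc.pipeline import VocalConvertPipeline

# Assumption: hubert_model (fairseq HubertModel) and net_g (SynthesizerTrnMs256NSFSid)
# are loaded beforehand; tgt_sr is the generator's output sample rate.
tgt_sr = 40000
vc = VocalConvertPipeline(tgt_sr=tgt_sr, device="cuda:0", is_half=True)

# The pipeline expects 16 kHz mono float audio.
audio, _ = librosa.load("input.wav", sr=16000, mono=True)  # hypothetical input file

out = vc(
    hubert_model,             # embedder model (or a tuple for transformers-style embedders)
    12,                       # embedding_output_layer
    net_g,                    # SynthesizerTrnMs256NSFSid
    0,                        # speaker id
    audio.astype(np.float32),
    0,                        # transpose (semitones)
    "harvest",                # f0_method: harvest / dio / crepe / mangio-crepe
    "",                       # file_index ("" disables faiss retrieval)
    0.0,                      # index_rate
    True,                     # if_f0: use pitch-conditioned inference
)
# `out` is int16 audio at tgt_sr.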
/lib/rvc/preprocessing/extract_f0.py:
--------------------------------------------------------------------------------
1 | import os
2 | import traceback
3 | from concurrent.futures import ProcessPoolExecutor
4 | from typing import *
5 | import multiprocessing as mp
6 |
7 | import numpy as np
8 | import pyworld
9 | import torch
10 | import torchcrepe
11 | from torch import Tensor
12 | from tqdm import tqdm
13 |
14 | from lib.rvc.utils import load_audio
15 |
16 | def get_optimal_torch_device(index: int = 0) -> torch.device:
17 | # Get cuda device
18 | if torch.cuda.is_available():
19 | return torch.device(f"cuda:{index % torch.cuda.device_count()}") # Very fast
20 | elif torch.backends.mps.is_available():
21 | return torch.device("mps")
22 | # TODO: also return "xla" devices here when available (requires the torch_xla.core.xla_model library)
23 | # Otherwise fall back to the CPU
24 | return torch.device("cpu")
25 |
26 | def get_f0_official_crepe_computation(
27 | x,
28 | sr,
29 | f0_min,
30 | f0_max,
31 | model="full",
32 | ):
33 | batch_size = 512
34 | torch_device = get_optimal_torch_device()
35 | audio = torch.tensor(np.copy(x))[None].float()
36 | f0, pd = torchcrepe.predict(
37 | audio,
38 | sr,
39 | 160,
40 | f0_min,
41 | f0_max,
42 | model,
43 | batch_size=batch_size,
44 | device=torch_device,
45 | return_periodicity=True,
46 | )
47 | pd = torchcrepe.filter.median(pd, 3)
48 | f0 = torchcrepe.filter.mean(f0, 3)
49 | f0[pd < 0.1] = 0
50 | f0 = f0[0].cpu().numpy()
51 | f0 = f0[1:] # Get rid of extra first frame
52 | return f0
53 |
54 | def get_f0_crepe_computation(
55 | x,
56 | sr,
57 | f0_min,
58 | f0_max,
59 | hop_length=160, # was 512. A lower hop length tracks pitch changes more closely but increases inference time.
60 | model="full", # Either use crepe-tiny "tiny" or crepe "full". Default is full
61 | ):
62 | x = x.astype(np.float32) # fixes the F.conv2D exception. We needed to convert double to float.
63 | x /= np.quantile(np.abs(x), 0.999)
64 | torch_device = get_optimal_torch_device()
65 | audio = torch.from_numpy(x).to(torch_device, copy=True)
66 | audio = torch.unsqueeze(audio, dim=0)
67 | if audio.ndim == 2 and audio.shape[0] > 1:
68 | audio = torch.mean(audio, dim=0, keepdim=True).detach()
69 | audio = audio.detach()
70 | print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
71 | pitch: Tensor = torchcrepe.predict(
72 | audio,
73 | sr,
74 | hop_length,
75 | f0_min,
76 | f0_max,
77 | model,
78 | batch_size=hop_length * 2,
79 | device=torch_device,
80 | pad=True
81 | )
82 | p_len = x.shape[0] // hop_length
83 | # Resize the pitch for final f0
84 | source = np.array(pitch.squeeze(0).cpu().float().numpy())
85 | source[source < 0.001] = np.nan
86 | target = np.interp(
87 | np.arange(0, len(source) * p_len, len(source)) / p_len,
88 | np.arange(0, len(source)),
89 | source
90 | )
91 | f0 = np.nan_to_num(target)
92 | f0 = f0[1:] # Get rid of extra first frame
93 | return f0 # Resized f0
94 |
95 |
96 | def compute_f0(
97 | path: str,
98 | f0_method: str,
99 | fs: int,
100 | hop: int,
101 | f0_max: float,
102 | f0_min: float,
103 | ):
104 | x = load_audio(path, fs)
105 | if f0_method == "harvest":
106 | f0, t = pyworld.harvest(
107 | x.astype(np.double),
108 | fs=fs,
109 | f0_ceil=f0_max,
110 | f0_floor=f0_min,
111 | frame_period=1000 * hop / fs,
112 | )
113 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, fs)
114 | elif f0_method == "dio":
115 | f0, t = pyworld.dio(
116 | x.astype(np.double),
117 | fs=fs,
118 | f0_ceil=f0_max,
119 | f0_floor=f0_min,
120 | frame_period=1000 * hop / fs,
121 | )
122 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, fs)
123 | elif f0_method == "mangio-crepe":
124 | f0 = get_f0_crepe_computation(x, fs, f0_min, f0_max, 160, "full")
125 | elif f0_method == "crepe":
126 | f0 = get_f0_official_crepe_computation(x.astype(np.double), fs, f0_min, f0_max, "full")
127 | return f0
128 |
129 |
130 | def coarse_f0(f0, f0_bin, f0_mel_min, f0_mel_max):
131 | f0_mel = 1127 * np.log(1 + f0 / 700)
132 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (
133 | f0_mel_max - f0_mel_min
134 | ) + 1
135 |
136 | # use 0 or 1
137 | f0_mel[f0_mel <= 1] = 1
138 | f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
139 | f0_coarse = np.rint(f0_mel).astype(np.int64)  # np.int is removed in NumPy >= 1.24
140 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
141 | f0_coarse.max(),
142 | f0_coarse.min(),
143 | )
144 | return f0_coarse
145 |
146 |
147 | def processor(paths, f0_method, samplerate=16000, hop_size=160, process_id=0):
148 | fs = samplerate
149 | hop = hop_size
150 |
151 | f0_bin = 256
152 | f0_max = 1100.0
153 | f0_min = 50.0
154 | f0_mel_min = 1127 * np.log(1 + f0_min / 700)
155 | f0_mel_max = 1127 * np.log(1 + f0_max / 700)
156 | if len(paths) != 0:
157 | for idx, (inp_path, opt_path1, opt_path2) in enumerate(
158 | tqdm(paths, position=1 + process_id)
159 | ):
160 | try:
161 | if (
162 | os.path.exists(opt_path1 + ".npy")
163 | and os.path.exists(opt_path2 + ".npy")
164 | ):
165 | continue
166 | featur_pit = compute_f0(inp_path, f0_method, fs, hop, f0_max, f0_min)
167 | np.save(
168 | opt_path2,
169 | featur_pit,
170 | allow_pickle=False,
171 | ) # nsf
172 | coarse_pit = coarse_f0(featur_pit, f0_bin, f0_mel_min, f0_mel_max)
173 | np.save(
174 | opt_path1,
175 | coarse_pit,
176 | allow_pickle=False,
177 | ) # ori
178 | except:
179 | print(f"f0 failed {idx}: {inp_path} {traceback.format_exc()}")
180 |
181 |
182 | def run(training_dir: str, num_processes: int, f0_method: str):
183 | paths = []
184 | dataset_dir = os.path.join(training_dir, "1_16k_wavs")
185 | opt_dir_f0 = os.path.join(training_dir, "2a_f0")
186 | opt_dir_f0_nsf = os.path.join(training_dir, "2b_f0nsf")
187 |
188 | if os.path.exists(opt_dir_f0) and os.path.exists(opt_dir_f0_nsf):
189 | return
190 |
191 | os.makedirs(opt_dir_f0, exist_ok=True)
192 | os.makedirs(opt_dir_f0_nsf, exist_ok=True)
193 |
194 | names = []
195 |
196 | for pathname in sorted(list(os.listdir(dataset_dir))):
197 | if os.path.isdir(os.path.join(dataset_dir, pathname)):
198 | for f in sorted(list(os.listdir(os.path.join(dataset_dir, pathname)))):
199 | if "spec" in f:
200 | continue
201 | names.append(os.path.join(pathname, f))
202 | else:
203 | names.append(pathname)
204 |
205 | for name in names: # dataset_dir/{05d}/file.ext
206 | filepath = os.path.join(dataset_dir, name)
207 | if "spec" in filepath:
208 | continue
209 | opt_filepath_f0 = os.path.join(opt_dir_f0, name)
210 | opt_filepath_f0_nsf = os.path.join(opt_dir_f0_nsf, name)
211 | paths.append([filepath, opt_filepath_f0, opt_filepath_f0_nsf])
212 |
213 | for dir in set([(os.path.dirname(p[1]), os.path.dirname(p[2])) for p in paths]):
214 | os.makedirs(dir[0], exist_ok=True)
215 | os.makedirs(dir[1], exist_ok=True)
216 |
217 | with ProcessPoolExecutor(mp_context=mp.get_context("spawn")) as executer:
218 | for i in range(num_processes):
219 | executer.submit(processor, paths[i::num_processes], f0_method, process_id=i)
220 |
221 | processor(paths, f0_method)
222 |
--------------------------------------------------------------------------------
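For a single file, the two arrays that `processor` writes per utterance (the continuous f0 saved under 2b_f0nsf and its coarse 1-255 quantization saved under 2a_f0) can be reproduced with the functions above. A small sketch, assuming a 16 kHz wav produced by the split step (the path is illustrative, and `load_audio` needs the ffmpeg CLI):

import numpy as np

from lib.rvc.preprocessing.extract_f0 import coarse_f0, compute_f0

f0_min, f0_max = 50.0, 1100.0
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)

# Continuous f0 (2b_f0nsf) and its coarse quantization (2a_f0).
f0 = compute_f0(
    "models/training/my-model/1_16k_wavs/00000/0_0.wav",  # hypothetical path
    "harvest", fs=16000, hop=160, f0_max=f0_max, f0_min=f0_min,
)
pitch = coarse_f0(f0, f0_bin=256, f0_mel_min=f0_mel_min, f0_mel_max=f0_mel_max)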
/lib/rvc/preprocessing/extract_feature.py:
--------------------------------------------------------------------------------
1 | import multiprocessing as mp
2 | import os
3 | import traceback
4 | from concurrent.futures import ProcessPoolExecutor
5 | from typing import *
6 |
7 | import numpy as np
8 | import soundfile as sf
9 | import torch
10 | import torch.nn.functional as F
11 | from fairseq import checkpoint_utils
12 | from tqdm import tqdm
13 |
14 | ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
15 | MODELS_DIR = os.path.join(ROOT_DIR, "models")
16 | EMBEDDINGS_LIST = {
17 | "hubert-base-japanese": (
18 | "rinna_hubert_base_jp.pt",
19 | "hubert-base-japanese",
20 | "local",
21 | ),
22 | "contentvec": ("checkpoint_best_legacy_500.pt", "contentvec", "local"),
23 | }
24 |
25 | def get_embedder(embedder_name):
26 | if embedder_name in EMBEDDINGS_LIST:
27 | return EMBEDDINGS_LIST[embedder_name]
28 | return None
29 |
30 |
31 | def load_embedder(embedder_path: str, device):
32 | try:
33 | models, cfg, _ = checkpoint_utils.load_model_ensemble_and_task(
34 | [embedder_path],
35 | suffix="",
36 | )
37 | embedder_model = models[0]
38 | embedder_model = embedder_model.to(device)
39 | if device != "cpu":
40 | embedder_model = embedder_model.half()
41 | else:
42 | embedder_model = embedder_model.float()
43 | embedder_model.eval()
44 | except Exception as e:
45 | print(f"Error: {e} {embedder_path}")
46 | traceback.print_exc()
47 |
48 | return embedder_model, cfg
49 |
50 |
51 | # wave must be 16k, hop_size=320
52 | def readwave(wav_path, normalize=False):
53 | wav, sr = sf.read(wav_path)
54 | assert sr == 16000
55 | feats = torch.from_numpy(wav).float()
56 | if feats.dim() == 2: # double channels
57 | feats = feats.mean(-1)
58 | assert feats.dim() == 1, feats.dim()
59 | if normalize:
60 | with torch.no_grad():
61 | feats = F.layer_norm(feats, feats.shape)
62 | feats = feats.view(1, -1)
63 | return feats
64 |
65 |
66 | def processor(
67 | todo: List[str],
68 | device: torch.device,
69 | embedder_path: str,
70 | embedder_load_from: str,
71 | embedding_channel: bool,
72 | embedding_output_layer: int,
73 | wav_dir: str,
74 | out_dir: str,
75 | process_id: int,
76 | ):
77 | half_support = (
78 | device.type == "cuda" and torch.cuda.get_device_capability(device)[0] >= 5.3
79 | )
80 | is_feats_dim_768 = embedding_channel == 768
81 |
82 | if embedder_load_from == "local" and not os.path.exists(embedder_path):
83 | return f"Embedder not found: {embedder_path}"
84 |
85 | model, cfg = load_embedder(embedder_path, device)
86 |
87 | for file in tqdm(todo, position=1 + process_id):
88 | try:
89 | if file.endswith(".wav"):
90 | wav_filepath = os.path.join(wav_dir, file)
91 | out_filepath = os.path.join(out_dir, file.replace("wav", "npy"))
92 |
93 | if os.path.exists(out_filepath):
94 | continue
95 |
96 | os.makedirs(os.path.dirname(out_filepath), exist_ok=True)
97 |
98 | is_normalize = False if cfg is None else cfg.task.normalize
99 | feats = readwave(wav_filepath, normalize=is_normalize)
100 | padding_mask = torch.BoolTensor(feats.shape).fill_(False)
101 | if isinstance(model, tuple):
102 | feats = model[0](
103 | feats.squeeze(0).squeeze(0).to(device),
104 | return_tensors="pt",
105 | sampling_rate=16000,
106 | )
107 | if half_support:
108 | feats = feats.input_values.to(device).half()
109 | else:
110 | feats = feats.input_values.to(device).float()
111 |
112 | with torch.no_grad():
113 | if half_support:
114 | if is_feats_dim_768:
115 | feats = model[1](feats).last_hidden_state
116 | else:
117 | feats = model[1](feats).extract_features
118 | else:
119 | if is_feats_dim_768:
120 | feats = model[1].float()(feats).last_hidden_state
121 | else:
122 | feats = model[1].float()(feats).extract_features
123 | else:
124 | inputs = {
125 | "source": feats.half().to(device)
126 | if half_support
127 | else feats.to(device),
128 | "padding_mask": padding_mask.to(device),
129 | "output_layer": embedding_output_layer,
130 | }
131 |
132 | # Somehow the features are still float16 at this point, so convert them again
133 | if not half_support:
134 | model = model.float()
135 | inputs["source"] = inputs["source"].float()
136 |
137 | with torch.no_grad():
138 | logits = model.extract_features(**inputs)
139 | if is_feats_dim_768:
140 | feats = logits[0]
141 | else:
142 | feats = model.final_proj(logits[0])
143 |
144 | feats = feats.squeeze(0).float().cpu().numpy()
145 | if np.isnan(feats).sum() == 0:
146 | np.save(out_filepath, feats, allow_pickle=False)
147 | else:
148 | print(f"{file} contains nan")
149 | except Exception as e:
150 | print(f"Error: {e} {file}")
151 | traceback.print_exc()
152 |
153 |
154 | def run(
155 | training_dir: str,
156 | embedder_path: str,
157 | embedder_load_from: str,
158 | embedding_channel: int,
159 | embedding_output_layer: int,
160 | gpu_ids: List[int],
161 | device: Optional[Union[torch.device, str]] = None,
162 | ):
163 | wav_dir = os.path.join(training_dir, "1_16k_wavs")
164 | out_dir = os.path.join(training_dir, "3_feature256")
165 |
166 | num_gpus = len(gpu_ids)
167 |
168 | for gpu_id in gpu_ids:
169 | if num_gpus < gpu_id + 1:
170 | print(f"GPU {gpu_id} is not available")
171 | return
172 |
173 | if os.path.exists(out_dir):
174 | return
175 |
176 | os.makedirs(out_dir, exist_ok=True)
177 |
178 | todo = [
179 | os.path.join(dir, f)
180 | for dir in sorted(list(os.listdir(wav_dir)))
181 | if os.path.isdir(os.path.join(wav_dir, dir))
182 | for f in sorted(list(os.listdir(os.path.join(wav_dir, dir))))
183 | ]
184 |
185 | if device is not None:
186 | if type(device) == str:
187 | device = torch.device(device)
188 | if device.type == "mps":
189 | device = torch.device(
190 | "cpu"
191 | ) # Mac(MPS) crashes when multiprocess, so change to CPU.
192 | processor(
193 | todo,
194 | device,
195 | embedder_path,
196 | embedder_load_from,
197 | embedding_channel,
198 | embedding_output_layer,
199 | wav_dir,
200 | out_dir,
201 | process_id=0,
202 | )
203 | else:
204 | with ProcessPoolExecutor(mp_context=mp.get_context("spawn")) as executor:
205 | for i, id in enumerate(gpu_ids):
206 | executor.submit(
207 | processor,
208 | todo[i::num_gpus],
209 | torch.device(f"cuda:{id}"),
210 | embedder_path,
211 | embedder_load_from,
212 | embedding_channel,
213 | embedding_output_layer,
214 | wav_dir,
215 | out_dir,
216 | process_id=i,
217 | )
218 |
--------------------------------------------------------------------------------
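A sketch of running feature extraction on a prepared training directory. `run` reads 1_16k_wavs/ and writes 3_feature256/ beneath `training_dir`; passing `device` forces single-process extraction (as the MPS branch does), otherwise one worker per GPU id is spawned. The directory and the layer/channel values are illustrative and must match the chosen embedder and model config:

import os

from lib.rvc.preprocessing import extract_feature

embedder_path = os.path.join(
    extract_feature.MODELS_DIR, "embeddings", "checkpoint_best_legacy_500.pt"
)

extract_feature.run(
    training_dir="models/training/my-model",  # hypothetical
    embedder_path=embedder_path,
    embedder_load_from="local",
    embedding_channel=256,
    embedding_output_layer=9,
    gpu_ids=[0],
    device="cuda:0",
)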
/lib/rvc/preprocessing/slicer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | # This function is obtained from librosa.
5 | def get_rms(
6 | y,
7 | frame_length=2048,
8 | hop_length=512,
9 | pad_mode="constant",
10 | ):
11 | padding = (int(frame_length // 2), int(frame_length // 2))
12 | y = np.pad(y, padding, mode=pad_mode)
13 |
14 | axis = -1
15 | # put our new within-frame axis at the end for now
16 | out_strides = y.strides + tuple([y.strides[axis]])
17 | # Reduce the shape on the framing axis
18 | x_shape_trimmed = list(y.shape)
19 | x_shape_trimmed[axis] -= frame_length - 1
20 | out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
21 | xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
22 | if axis < 0:
23 | target_axis = axis - 1
24 | else:
25 | target_axis = axis + 1
26 | xw = np.moveaxis(xw, -1, target_axis)
27 | # Downsample along the target axis
28 | slices = [slice(None)] * xw.ndim
29 | slices[axis] = slice(0, None, hop_length)
30 | x = xw[tuple(slices)]
31 |
32 | # Calculate power
33 | power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
34 |
35 | return np.sqrt(power)
36 |
37 |
38 | class Slicer:
39 | def __init__(
40 | self,
41 | sr: int,
42 | threshold: float = -40.0,
43 | min_length: int = 5000,
44 | min_interval: int = 300,
45 | hop_size: int = 20,
46 | max_sil_kept: int = 5000,
47 | ):
48 | if not min_length >= min_interval >= hop_size:
49 | raise ValueError(
50 | "The following condition must be satisfied: min_length >= min_interval >= hop_size"
51 | )
52 | if not max_sil_kept >= hop_size:
53 | raise ValueError(
54 | "The following condition must be satisfied: max_sil_kept >= hop_size"
55 | )
56 | min_interval = sr * min_interval / 1000
57 | self.threshold = 10 ** (threshold / 20.0)
58 | self.hop_size = round(sr * hop_size / 1000)
59 | self.win_size = min(round(min_interval), 4 * self.hop_size)
60 | self.min_length = round(sr * min_length / 1000 / self.hop_size)
61 | self.min_interval = round(min_interval / self.hop_size)
62 | self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
63 |
64 | def _apply_slice(self, waveform, begin, end):
65 | if len(waveform.shape) > 1:
66 | return waveform[
67 | :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)
68 | ]
69 | else:
70 | return waveform[
71 | begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)
72 | ]
73 |
74 | # @timeit
75 | def slice(self, waveform):
76 | if len(waveform.shape) > 1:
77 | samples = waveform.mean(axis=0)
78 | else:
79 | samples = waveform
80 | if samples.shape[0] <= self.min_length:
81 | return [waveform]
82 | rms_list = get_rms(
83 | y=samples, frame_length=self.win_size, hop_length=self.hop_size
84 | ).squeeze(0)
85 | sil_tags = []
86 | silence_start = None
87 | clip_start = 0
88 | for i, rms in enumerate(rms_list):
89 | # Keep looping while frame is silent.
90 | if rms < self.threshold:
91 | # Record start of silent frames.
92 | if silence_start is None:
93 | silence_start = i
94 | continue
95 | # Keep looping while frame is not silent and silence start has not been recorded.
96 | if silence_start is None:
97 | continue
98 | # Clear recorded silence start if interval is not enough or clip is too short
99 | is_leading_silence = silence_start == 0 and i > self.max_sil_kept
100 | need_slice_middle = (
101 | i - silence_start >= self.min_interval
102 | and i - clip_start >= self.min_length
103 | )
104 | if not is_leading_silence and not need_slice_middle:
105 | silence_start = None
106 | continue
107 | # Need slicing. Record the range of silent frames to be removed.
108 | if i - silence_start <= self.max_sil_kept:
109 | pos = rms_list[silence_start : i + 1].argmin() + silence_start
110 | if silence_start == 0:
111 | sil_tags.append((0, pos))
112 | else:
113 | sil_tags.append((pos, pos))
114 | clip_start = pos
115 | elif i - silence_start <= self.max_sil_kept * 2:
116 | pos = rms_list[
117 | i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
118 | ].argmin()
119 | pos += i - self.max_sil_kept
120 | pos_l = (
121 | rms_list[
122 | silence_start : silence_start + self.max_sil_kept + 1
123 | ].argmin()
124 | + silence_start
125 | )
126 | pos_r = (
127 | rms_list[i - self.max_sil_kept : i + 1].argmin()
128 | + i
129 | - self.max_sil_kept
130 | )
131 | if silence_start == 0:
132 | sil_tags.append((0, pos_r))
133 | clip_start = pos_r
134 | else:
135 | sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
136 | clip_start = max(pos_r, pos)
137 | else:
138 | pos_l = (
139 | rms_list[
140 | silence_start : silence_start + self.max_sil_kept + 1
141 | ].argmin()
142 | + silence_start
143 | )
144 | pos_r = (
145 | rms_list[i - self.max_sil_kept : i + 1].argmin()
146 | + i
147 | - self.max_sil_kept
148 | )
149 | if silence_start == 0:
150 | sil_tags.append((0, pos_r))
151 | else:
152 | sil_tags.append((pos_l, pos_r))
153 | clip_start = pos_r
154 | silence_start = None
155 | # Deal with trailing silence.
156 | total_frames = rms_list.shape[0]
157 | if (
158 | silence_start is not None
159 | and total_frames - silence_start >= self.min_interval
160 | ):
161 | silence_end = min(total_frames, silence_start + self.max_sil_kept)
162 | pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
163 | sil_tags.append((pos, total_frames + 1))
164 | # Apply and return slices.
165 | if len(sil_tags) == 0:
166 | return [waveform]
167 | else:
168 | chunks = []
169 | if sil_tags[0][0] > 0:
170 | chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
171 | for i in range(len(sil_tags) - 1):
172 | chunks.append(
173 | self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0])
174 | )
175 | if sil_tags[-1][1] < total_frames:
176 | chunks.append(
177 | self._apply_slice(waveform, sil_tags[-1][1], total_frames)
178 | )
179 | return chunks
180 |
--------------------------------------------------------------------------------
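A short usage sketch for the Slicer class above, using the same constructor values that split.py passes during preprocessing (the input path is illustrative):

import librosa

from lib.rvc.preprocessing.slicer import Slicer

audio, sr = librosa.load("vocals.wav", sr=40000, mono=True)  # hypothetical input
slicer = Slicer(
    sr=sr,
    threshold=-42,
    min_length=1500,
    min_interval=400,
    hop_size=15,
    max_sil_kept=500,
)
chunks = slicer.slice(audio)  # list of np.ndarray segments with long silences cut out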
/lib/rvc/preprocessing/split.py:
--------------------------------------------------------------------------------
1 | import operator
2 | import os
3 | from concurrent.futures import ProcessPoolExecutor
4 | from typing import *
5 |
6 | import librosa
7 | import numpy as np
8 | import scipy.signal as signal
9 | from scipy.io import wavfile
10 | from tqdm import tqdm
11 |
12 | from lib.rvc.utils import load_audio
13 |
14 | from .slicer import Slicer
15 |
16 |
17 | def norm_write(
18 | tmp_audio: np.ndarray,
19 | idx0: int,
20 | idx1: int,
21 | speaker_id: int,
22 | outdir: str,
23 | outdir_16k: str,
24 | sampling_rate: int,
25 | max: float,
26 | alpha: float,
27 | is_normalize: bool,
28 | ):
29 | if is_normalize:
30 | tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (max * alpha)) + (
31 | 1 - alpha
32 | ) * tmp_audio
33 | else:
34 | # clip the level to max (floating-point decoding can occasionally overshoot)
35 | audio_min = np.min(tmp_audio)
36 | if audio_min < -max:
37 | tmp_audio = tmp_audio / -audio_min * max
38 | audio_max = np.max(tmp_audio)
39 | if audio_max > max:
40 | tmp_audio = tmp_audio / audio_max * max
41 |
42 | wavfile.write(
43 | os.path.join(outdir, f"{speaker_id:05}", f"{idx0}_{idx1}.wav"),
44 | sampling_rate,
45 | tmp_audio.astype(np.float32),
46 | )
47 |
48 | tmp_audio = librosa.resample(
49 | tmp_audio, orig_sr=sampling_rate, target_sr=16000, res_type="soxr_vhq"
50 | )
51 | wavfile.write(
52 | os.path.join(outdir_16k, f"{speaker_id:05}", f"{idx0}_{idx1}.wav"),
53 | 16000,
54 | tmp_audio.astype(np.float32),
55 | )
56 |
57 |
58 | def write_mute(
59 | mute_wave_filename: str,
60 | speaker_id: int,
61 | outdir: str,
62 | outdir_16k: str,
63 | sampling_rate: int,
64 | ):
65 | tmp_audio = load_audio(mute_wave_filename, sampling_rate)
66 | wavfile.write(
67 | os.path.join(outdir, f"{speaker_id:05}", "mute.wav"),
68 | sampling_rate,
69 | tmp_audio.astype(np.float32),
70 | )
71 | tmp_audio = librosa.resample(
72 | tmp_audio, orig_sr=sampling_rate, target_sr=16000, res_type="soxr_vhq"
73 | )
74 | wavfile.write(
75 | os.path.join(outdir_16k, f"{speaker_id:05}", "mute.wav"),
76 | 16000,
77 | tmp_audio.astype(np.float32),
78 | )
79 |
80 |
81 | def pipeline(
82 | slicer: Slicer,
83 | datasets: List[Tuple[str, int]], # List[(path, speaker_id)]
84 | outdir: str,
85 | outdir_16k: str,
86 | sampling_rate: int,
87 | is_normalize: bool,
88 | process_id: int = 0,
89 | ):
90 | per = 3.7
91 | overlap = 0.3
92 | tail = per + overlap
93 | max = 0.95
94 | alpha = 0.8
95 |
96 | bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=sampling_rate)
97 |
98 | for index, (wave_filename, speaker_id) in tqdm(datasets, position=1 + process_id):
99 | audio = load_audio(wave_filename, sampling_rate)
100 | audio = signal.lfilter(bh, ah, audio)
101 |
102 | idx1 = 0
103 | for audio in slicer.slice(audio):
104 | i = 0
105 | while True:
106 | start = int(sampling_rate * (per - overlap) * i)
107 | i += 1
108 | if len(audio[start:]) > tail * sampling_rate:
109 | tmp_audio = audio[start : start + int(per * sampling_rate)]
110 | norm_write(
111 | tmp_audio,
112 | index,
113 | idx1,
114 | speaker_id,
115 | outdir,
116 | outdir_16k,
117 | sampling_rate,
118 | max,
119 | alpha,
120 | is_normalize,
121 | )
122 | idx1 += 1
123 | else:
124 | tmp_audio = audio[start:]
125 | break
126 | norm_write(
127 | tmp_audio,
128 | index,
129 | idx1,
130 | speaker_id,
131 | outdir,
132 | outdir_16k,
133 | sampling_rate,
134 | max,
135 | alpha,
136 | is_normalize,
137 | )
138 | idx1 += 1
139 |
140 |
141 | def preprocess_audio(
142 | datasets: List[Tuple[str, int]], # List[(path, speaker_id)]
143 | sampling_rate: int,
144 | num_processes: int,
145 | training_dir: str,
146 | is_normalize: bool,
147 | mute_wav_path: str,
148 | ):
149 | waves_dir = os.path.join(training_dir, "0_gt_wavs")
150 | waves16k_dir = os.path.join(training_dir, "1_16k_wavs")
151 | if os.path.exists(waves_dir) and os.path.exists(waves16k_dir):
152 | return
153 |
154 | for speaker_id in set([spk for _, spk in datasets]):
155 | os.makedirs(os.path.join(waves_dir, f"{speaker_id:05}"), exist_ok=True)
156 | os.makedirs(os.path.join(waves16k_dir, f"{speaker_id:05}"), exist_ok=True)
157 |
158 | all = [(i, x) for i, x in enumerate(sorted(datasets, key=operator.itemgetter(0)))]
159 |
160 | # number of datasets per process
161 | process_all_nums = [len(all) // num_processes] * num_processes
162 | # add residual datasets
163 | for i in range(len(all) % num_processes):
164 | process_all_nums[i] += 1
165 |
166 | assert len(all) == sum(process_all_nums), print(
167 | f"len(all): {len(all)}, sum(process_all_nums): {sum(process_all_nums)}"
168 | )
169 |
170 | with ProcessPoolExecutor(max_workers=num_processes) as executor:
171 | all_index = 0
172 | for i in range(num_processes):
173 | data = all[all_index : all_index + process_all_nums[i]]
174 | slicer = Slicer(
175 | sr=sampling_rate,
176 | threshold=-42,
177 | min_length=1500,
178 | min_interval=400,
179 | hop_size=15,
180 | max_sil_kept=500,
181 | )
182 | executor.submit(
183 | pipeline,
184 | slicer,
185 | data,
186 | waves_dir,
187 | waves16k_dir,
188 | sampling_rate,
189 | is_normalize,
190 | process_id=i,
191 | )
192 | all_index += process_all_nums[i]
193 |
194 | for speaker_id in set([spk for _, spk in datasets]):
195 | write_mute(mute_wav_path, speaker_id, waves_dir, waves16k_dir, sampling_rate)
196 |
--------------------------------------------------------------------------------
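A sketch of the top-level preprocessing call, assuming a single-speaker dataset at 40 kHz and the bundled mute wav (all paths are illustrative):

import os

from lib.rvc.preprocessing.split import preprocess_audio

sampling_rate = 40000
datasets = [  # (path, speaker_id)
    ("dataset/raw/take1.wav", 0),
    ("dataset/raw/take2.wav", 0),
]

preprocess_audio(
    datasets,
    sampling_rate=sampling_rate,
    num_processes=2,
    training_dir="models/training/my-model",  # hypothetical
    is_normalize=True,
    mute_wav_path=os.path.join(
        "models", "training", "mute", "0_gt_wavs", f"mute{sampling_rate // 1000}k.wav"
    ),
)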
/lib/rvc/transforms.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from torch.nn import functional as F
4 |
5 | DEFAULT_MIN_BIN_WIDTH = 1e-3
6 | DEFAULT_MIN_BIN_HEIGHT = 1e-3
7 | DEFAULT_MIN_DERIVATIVE = 1e-3
8 |
9 |
10 | def piecewise_rational_quadratic_transform(
11 | inputs,
12 | unnormalized_widths,
13 | unnormalized_heights,
14 | unnormalized_derivatives,
15 | inverse=False,
16 | tails=None,
17 | tail_bound=1.0,
18 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
19 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
20 | min_derivative=DEFAULT_MIN_DERIVATIVE,
21 | ):
22 | if tails is None:
23 | spline_fn = rational_quadratic_spline
24 | spline_kwargs = {}
25 | else:
26 | spline_fn = unconstrained_rational_quadratic_spline
27 | spline_kwargs = {"tails": tails, "tail_bound": tail_bound}
28 |
29 | outputs, logabsdet = spline_fn(
30 | inputs=inputs,
31 | unnormalized_widths=unnormalized_widths,
32 | unnormalized_heights=unnormalized_heights,
33 | unnormalized_derivatives=unnormalized_derivatives,
34 | inverse=inverse,
35 | min_bin_width=min_bin_width,
36 | min_bin_height=min_bin_height,
37 | min_derivative=min_derivative,
38 | **spline_kwargs
39 | )
40 | return outputs, logabsdet
41 |
42 |
43 | def searchsorted(bin_locations, inputs, eps=1e-6):
44 | bin_locations[..., -1] += eps
45 | return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
46 |
47 |
48 | def unconstrained_rational_quadratic_spline(
49 | inputs,
50 | unnormalized_widths,
51 | unnormalized_heights,
52 | unnormalized_derivatives,
53 | inverse=False,
54 | tails="linear",
55 | tail_bound=1.0,
56 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
57 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
58 | min_derivative=DEFAULT_MIN_DERIVATIVE,
59 | ):
60 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
61 | outside_interval_mask = ~inside_interval_mask
62 |
63 | outputs = torch.zeros_like(inputs)
64 | logabsdet = torch.zeros_like(inputs)
65 |
66 | if tails == "linear":
67 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
68 | constant = np.log(np.exp(1 - min_derivative) - 1)
69 | unnormalized_derivatives[..., 0] = constant
70 | unnormalized_derivatives[..., -1] = constant
71 |
72 | outputs[outside_interval_mask] = inputs[outside_interval_mask]
73 | logabsdet[outside_interval_mask] = 0
74 | else:
75 | raise RuntimeError("{} tails are not implemented.".format(tails))
76 |
77 | (
78 | outputs[inside_interval_mask],
79 | logabsdet[inside_interval_mask],
80 | ) = rational_quadratic_spline(
81 | inputs=inputs[inside_interval_mask],
82 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
83 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
84 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
85 | inverse=inverse,
86 | left=-tail_bound,
87 | right=tail_bound,
88 | bottom=-tail_bound,
89 | top=tail_bound,
90 | min_bin_width=min_bin_width,
91 | min_bin_height=min_bin_height,
92 | min_derivative=min_derivative,
93 | )
94 |
95 | return outputs, logabsdet
96 |
97 |
98 | def rational_quadratic_spline(
99 | inputs,
100 | unnormalized_widths,
101 | unnormalized_heights,
102 | unnormalized_derivatives,
103 | inverse=False,
104 | left=0.0,
105 | right=1.0,
106 | bottom=0.0,
107 | top=1.0,
108 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
109 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
110 | min_derivative=DEFAULT_MIN_DERIVATIVE,
111 | ):
112 | if torch.min(inputs) < left or torch.max(inputs) > right:
113 | raise ValueError("Input to a transform is not within its domain")
114 |
115 | num_bins = unnormalized_widths.shape[-1]
116 |
117 | if min_bin_width * num_bins > 1.0:
118 | raise ValueError("Minimal bin width too large for the number of bins")
119 | if min_bin_height * num_bins > 1.0:
120 | raise ValueError("Minimal bin height too large for the number of bins")
121 |
122 | widths = F.softmax(unnormalized_widths, dim=-1)
123 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
124 | cumwidths = torch.cumsum(widths, dim=-1)
125 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
126 | cumwidths = (right - left) * cumwidths + left
127 | cumwidths[..., 0] = left
128 | cumwidths[..., -1] = right
129 | widths = cumwidths[..., 1:] - cumwidths[..., :-1]
130 |
131 | derivatives = min_derivative + F.softplus(unnormalized_derivatives)
132 |
133 | heights = F.softmax(unnormalized_heights, dim=-1)
134 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
135 | cumheights = torch.cumsum(heights, dim=-1)
136 | cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
137 | cumheights = (top - bottom) * cumheights + bottom
138 | cumheights[..., 0] = bottom
139 | cumheights[..., -1] = top
140 | heights = cumheights[..., 1:] - cumheights[..., :-1]
141 |
142 | if inverse:
143 | bin_idx = searchsorted(cumheights, inputs)[..., None]
144 | else:
145 | bin_idx = searchsorted(cumwidths, inputs)[..., None]
146 |
147 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
148 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
149 |
150 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
151 | delta = heights / widths
152 | input_delta = delta.gather(-1, bin_idx)[..., 0]
153 |
154 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
155 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
156 |
157 | input_heights = heights.gather(-1, bin_idx)[..., 0]
158 |
159 | if inverse:
160 | a = (inputs - input_cumheights) * (
161 | input_derivatives + input_derivatives_plus_one - 2 * input_delta
162 | ) + input_heights * (input_delta - input_derivatives)
163 | b = input_heights * input_derivatives - (inputs - input_cumheights) * (
164 | input_derivatives + input_derivatives_plus_one - 2 * input_delta
165 | )
166 | c = -input_delta * (inputs - input_cumheights)
167 |
168 | discriminant = b.pow(2) - 4 * a * c
169 | assert (discriminant >= 0).all()
170 |
171 | root = (2 * c) / (-b - torch.sqrt(discriminant))
172 | outputs = root * input_bin_widths + input_cumwidths
173 |
174 | theta_one_minus_theta = root * (1 - root)
175 | denominator = input_delta + (
176 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
177 | * theta_one_minus_theta
178 | )
179 | derivative_numerator = input_delta.pow(2) * (
180 | input_derivatives_plus_one * root.pow(2)
181 | + 2 * input_delta * theta_one_minus_theta
182 | + input_derivatives * (1 - root).pow(2)
183 | )
184 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
185 |
186 | return outputs, -logabsdet
187 | else:
188 | theta = (inputs - input_cumwidths) / input_bin_widths
189 | theta_one_minus_theta = theta * (1 - theta)
190 |
191 | numerator = input_heights * (
192 | input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
193 | )
194 | denominator = input_delta + (
195 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
196 | * theta_one_minus_theta
197 | )
198 | outputs = input_cumheights + numerator / denominator
199 |
200 | derivative_numerator = input_delta.pow(2) * (
201 | input_derivatives_plus_one * theta.pow(2)
202 | + 2 * input_delta * theta_one_minus_theta
203 | + input_derivatives * (1 - theta).pow(2)
204 | )
205 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
206 |
207 | return outputs, logabsdet
208 |
--------------------------------------------------------------------------------
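The spline above is bijective on [-tail_bound, tail_bound], and the log-determinants returned by the forward and inverse passes cancel. A self-contained round-trip check, with shapes chosen to match the function's expectations (num_bins widths/heights and num_bins - 1 interior derivatives when linear tails are used):

import torch

from lib.rvc.transforms import piecewise_rational_quadratic_transform

torch.manual_seed(0)
batch, num_bins = (2, 6), 10
inputs = torch.rand(*batch) * 2 - 1  # inside the tail bound [-1, 1]
widths = torch.randn(*batch, num_bins)
heights = torch.randn(*batch, num_bins)
derivs = torch.randn(*batch, num_bins - 1)

y, logdet = piecewise_rational_quadratic_transform(
    inputs, widths, heights, derivs, inverse=False, tails="linear", tail_bound=1.0
)
x, inv_logdet = piecewise_rational_quadratic_transform(
    y, widths, heights, derivs, inverse=True, tails="linear", tail_bound=1.0
)
assert torch.allclose(x, inputs, atol=1e-4)
assert torch.allclose(logdet + inv_logdet, torch.zeros_like(logdet), atol=1e-4)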
/lib/rvc/utils.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import logging
3 | import os
4 | import shutil
5 | import socket
6 | import sys
7 |
8 | import ffmpeg
9 | import matplotlib
10 | import matplotlib.pylab as plt
11 | import numpy as np
12 | import torch
13 | from scipy.io.wavfile import read
14 | from torch.nn import functional as F
15 |
16 | from modules.shared import ROOT_DIR
17 |
18 | from .config import TrainConfig
19 |
20 | matplotlib.use("Agg")
21 | logging.getLogger("matplotlib").setLevel(logging.WARNING)
22 |
23 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
24 | logger = logging
25 |
26 |
27 | def load_audio(file: str, sr):
28 | try:
29 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
30 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
31 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
32 | file = (
33 | file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
34 | ) # strip stray spaces, quotes, and newlines that often surround copy-pasted paths
35 | out, _ = (
36 | ffmpeg.input(file, threads=0)
37 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
38 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
39 | )
40 | except Exception as e:
41 | raise RuntimeError(f"Failed to load audio: {e}")
42 |
43 | return np.frombuffer(out, np.float32).flatten()
44 |
45 |
46 | def find_empty_port():
47 | s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
48 | s.bind(("", 0))
49 | s.listen(1)
50 | port = s.getsockname()[1]
51 | s.close()
52 | return port
53 |
54 |
55 | def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1):
56 | assert os.path.isfile(checkpoint_path)
57 | checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
58 |
59 | saved_state_dict = checkpoint_dict["model"]
60 | if hasattr(model, "module"):
61 | state_dict = model.module.state_dict()
62 | else:
63 | state_dict = model.state_dict()
64 | new_state_dict = {}
65 | for k, v in state_dict.items():  # iterate over the shapes the current model expects
66 | try:
67 | new_state_dict[k] = saved_state_dict[k]
68 | if saved_state_dict[k].shape != state_dict[k].shape:
69 | print(
70 | f"shape-{k}-mismatch|need-{state_dict[k].shape}|get-{saved_state_dict[k].shape}"
71 | )
72 | if saved_state_dict[k].dim() == 2:  # NOTE: check whether this is OK
73 | # for embedded inputs, 256 <==> 768
74 | # this lets training resume from the original pretrained checkpoints even when the embedder outputs 768-dim features, etc.
75 | if saved_state_dict[k].dtype == torch.half:
76 | new_state_dict[k] = (
77 | F.interpolate(
78 | saved_state_dict[k].float().unsqueeze(0).unsqueeze(0),
79 | size=state_dict[k].shape,
80 | mode="bilinear",
81 | )
82 | .half()
83 | .squeeze(0)
84 | .squeeze(0)
85 | )
86 | else:
87 | new_state_dict[k] = (
88 | F.interpolate(
89 | saved_state_dict[k].unsqueeze(0).unsqueeze(0),
90 | size=state_dict[k].shape,
91 | mode="bilinear",
92 | )
93 | .squeeze(0)
94 | .squeeze(0)
95 | )
96 | print(
97 | "interpolated new_state_dict",
98 | k,
99 | "from",
100 | saved_state_dict[k].shape,
101 | "to",
102 | new_state_dict[k].shape,
103 | )
104 | else:
105 | raise KeyError
106 | except Exception as e:
107 | # print(traceback.format_exc())
108 | print(f"{k} is not in the checkpoint")
109 | print("error: %s" % e)
110 | new_state_dict[k] = v  # keep the model's own randomly initialized value
111 | if hasattr(model, "module"):
112 | model.module.load_state_dict(new_state_dict, strict=False)
113 | else:
114 | model.load_state_dict(new_state_dict, strict=False)
115 | print("Loaded model weights")
116 |
117 | epoch = checkpoint_dict["epoch"]
118 | learning_rate = checkpoint_dict["learning_rate"]
119 | if optimizer is not None and load_opt == 1:
120 | optimizer.load_state_dict(checkpoint_dict["optimizer"])
121 | print("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, epoch))
122 | return model, optimizer, learning_rate, epoch
123 |
124 |
125 | def save_state(model, optimizer, learning_rate, epoch, checkpoint_path):
126 | print(
127 | "Saving model and optimizer state at epoch {} to {}".format(
128 | epoch, checkpoint_path
129 | )
130 | )
131 | if hasattr(model, "module"):
132 | state_dict = model.module.state_dict()
133 | else:
134 | state_dict = model.state_dict()
135 | torch.save(
136 | {
137 | "model": state_dict,
138 | "epoch": epoch,
139 | "optimizer": optimizer.state_dict(),
140 | "learning_rate": learning_rate,
141 | },
142 | checkpoint_path,
143 | )
144 |
145 |
146 | def summarize(
147 | writer,
148 | global_step,
149 | scalars={},
150 | histograms={},
151 | images={},
152 | audios={},
153 | audio_sampling_rate=22050,
154 | ):
155 | for k, v in scalars.items():
156 | writer.add_scalar(k, v, global_step)
157 | for k, v in histograms.items():
158 | writer.add_histogram(k, v, global_step)
159 | for k, v in images.items():
160 | writer.add_image(k, v, global_step, dataformats="HWC")
161 | for k, v in audios.items():
162 | writer.add_audio(k, v, global_step, audio_sampling_rate)
163 |
164 |
165 | def latest_checkpoint_path(dir_path, regex="G_*.pth"):
166 | filelist = glob.glob(os.path.join(dir_path, regex))
167 | if len(filelist) == 0:
168 | return None
169 | filelist.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
170 | filepath = filelist[-1]
171 | return filepath
172 |
173 |
174 | def plot_spectrogram_to_numpy(spectrogram):
175 | fig, ax = plt.subplots(figsize=(10, 2))
176 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
177 | plt.colorbar(im, ax=ax)
178 | plt.xlabel("Frames")
179 | plt.ylabel("Channels")
180 | plt.tight_layout()
181 |
182 | fig.canvas.draw()
183 | data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)  # np.fromstring is deprecated
184 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
185 | plt.close()
186 | return data
187 |
188 |
189 | def plot_alignment_to_numpy(alignment, info=None):
190 | fig, ax = plt.subplots(figsize=(6, 4))
191 | im = ax.imshow(
192 | alignment.transpose(), aspect="auto", origin="lower", interpolation="none"
193 | )
194 | fig.colorbar(im, ax=ax)
195 | xlabel = "Decoder timestep"
196 | if info is not None:
197 | xlabel += "\n\n" + info
198 | plt.xlabel(xlabel)
199 | plt.ylabel("Encoder timestep")
200 | plt.tight_layout()
201 |
202 | fig.canvas.draw()
203 | data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)  # np.fromstring is deprecated
204 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
205 | plt.close()
206 | return data
207 |
208 |
209 | def load_wav_to_torch(full_path):
210 | sampling_rate, data = read(full_path)
211 | return torch.FloatTensor(data.astype(np.float32)), sampling_rate
212 |
213 |
214 | def load_config(training_dir: str, sample_rate: int, emb_channels: int):
215 | if emb_channels == 256:
216 | config_path = os.path.join(ROOT_DIR, "configs", f"{sample_rate}.json")
217 | else:
218 | config_path = os.path.join(
219 | ROOT_DIR, "configs", f"{sample_rate}-{emb_channels}.json"
220 | )
221 | config_save_path = os.path.join(training_dir, "config.json")
222 |
223 | shutil.copyfile(config_path, config_save_path)
224 |
225 | return TrainConfig.parse_file(config_save_path)
226 |
--------------------------------------------------------------------------------
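Two helpers above are used throughout the repo: `load_audio` (decodes any ffmpeg-readable file to mono float32 at the requested rate, and therefore needs the ffmpeg CLI on PATH) and `latest_checkpoint_path`. A brief sketch with illustrative paths:

from lib.rvc.utils import latest_checkpoint_path, load_audio

audio = load_audio("input.m4a", 16000)  # 1-D float32 numpy array at 16 kHz

g_path = latest_checkpoint_path("models/training/my-model", "G_*.pth")
d_path = latest_checkpoint_path("models/training/my-model", "D_*.pth")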
/models/checkpoints/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 |
--------------------------------------------------------------------------------
/models/embeddings/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 |
--------------------------------------------------------------------------------
/models/pretrained/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 |
--------------------------------------------------------------------------------
/models/training/.gitignore:
--------------------------------------------------------------------------------
1 | */**
2 |
3 | !mute/**/*
4 | !.gitignore
5 |
6 | mute/**/*.pt
7 |
--------------------------------------------------------------------------------
/models/training/models/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 |
--------------------------------------------------------------------------------
/models/training/mute/0_gt_wavs/mute32k.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/models/training/mute/0_gt_wavs/mute32k.wav
--------------------------------------------------------------------------------
/models/training/mute/0_gt_wavs/mute40k.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/models/training/mute/0_gt_wavs/mute40k.wav
--------------------------------------------------------------------------------
/models/training/mute/0_gt_wavs/mute48k.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/models/training/mute/0_gt_wavs/mute48k.wav
--------------------------------------------------------------------------------
/models/training/mute/1_16k_wavs/mute.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/models/training/mute/1_16k_wavs/mute.wav
--------------------------------------------------------------------------------
/models/training/mute/2a_f0/mute.wav.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/models/training/mute/2a_f0/mute.wav.npy
--------------------------------------------------------------------------------
/models/training/mute/2b_f0nsf/mute.wav.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/models/training/mute/2b_f0nsf/mute.wav.npy
--------------------------------------------------------------------------------
/models/training/mute/3_feature256/mute.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/models/training/mute/3_feature256/mute.npy
--------------------------------------------------------------------------------
/modules/cmd_opts.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | parser = argparse.ArgumentParser()
4 |
5 | parser.add_argument("--host", help="Host to connect to", type=str, default="127.0.0.1")
6 | parser.add_argument("--port", help="Port to connect to", type=int)
7 | parser.add_argument("--share", help="Enable gradio share", action="store_true")
8 | parser.add_argument(
9 | "--models-dir", help="Path to models directory", type=str, default=None
10 | )
11 | parser.add_argument(
12 | "--output-dir", help="Path to output directory", type=str, default=None
13 | )
14 | parser.add_argument(
15 | "--precision",
16 | help="Precision to use",
17 | type=str,
18 | default="fp16",
19 | choices=["fp32", "fp16"],
20 | )
21 |
22 | opts, _ = parser.parse_known_args()
23 |
--------------------------------------------------------------------------------
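These flags are parsed once at import time with `parse_known_args`, so unrecognized arguments are ignored rather than rejected, and other modules read the shared `opts` object. A sketch of how a caller might consume them (the precision check is illustrative):

from modules.cmd_opts import opts

# e.g. launched as: python webui.py --host 0.0.0.0 --port 7860 --precision fp32
print(opts.host, opts.port, opts.share)
is_half = opts.precision == "fp16"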
/modules/core.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import os
3 | import shutil
4 | import sys
5 | from concurrent.futures import ThreadPoolExecutor
6 |
7 | import requests
8 |
9 | from modules.models import MODELS_DIR
10 | from modules.shared import ROOT_DIR
11 | from modules.utils import download_file
12 |
13 |
14 | def get_hf_etag(url: str):
15 | r = requests.head(url)
16 |
17 | etag = r.headers["X-Linked-ETag"] if "X-Linked-ETag" in r.headers else ""
18 |
19 | if etag.startswith('"') and etag.endswith('"'):
20 | etag = etag[1:-1]
21 |
22 | return etag
23 |
24 |
25 | def calc_sha256(filepath: str):
26 | sha256 = hashlib.sha256()
27 | with open(filepath, "rb") as f:
28 | for chunk in iter(lambda: f.read(4096), b""):
29 | sha256.update(chunk)
30 | return sha256.hexdigest()
31 |
32 |
33 | def download_models():
34 | def hash_check(url: str, out: str):
35 | if not os.path.exists(out):
36 | return False
37 | etag = get_hf_etag(url)
38 | hash = calc_sha256(out)
39 | return etag == hash
40 |
41 | os.makedirs(os.path.join(MODELS_DIR, "pretrained", "v2"), exist_ok=True)
42 |
43 | tasks = []
44 | for template in [
45 | "D{}k",
46 | "G{}k",
47 | "f0D{}k",
48 | "f0G{}k",
49 | ]:
50 | basename = template.format("40")
51 | url = f"https://huggingface.co/ddPn08/rvc-webui-models/resolve/main/pretrained/v2/{basename}.pth"
52 | out = os.path.join(MODELS_DIR, "pretrained", "v2", f"{basename}.pth")
53 |
54 | if hash_check(url, out):
55 | continue
56 |
57 | tasks.append((url, out))
58 |
59 | for filename in [
60 | "checkpoint_best_legacy_500.pt",
61 | ]:
62 | out = os.path.join(MODELS_DIR, "embeddings", filename)
63 | url = f"https://huggingface.co/ddPn08/rvc-webui-models/resolve/main/embeddings/{filename}"
64 |
65 | if hash_check(url, out):
66 | continue
67 |
68 | tasks.append(
69 | (
70 | f"https://huggingface.co/ddPn08/rvc-webui-models/resolve/main/embeddings/{filename}",
71 | out,
72 | )
73 | )
74 |
75 | # japanese-hubert-base (Fairseq)
76 | # from official repo
77 | # NOTE: change filename?
78 | hubert_jp_url = "https://huggingface.co/rinna/japanese-hubert-base/resolve/main/fairseq/model.pt"
79 | out = os.path.join(MODELS_DIR, "embeddings", "rinna_hubert_base_jp.pt")
80 | if not hash_check(hubert_jp_url, out):
81 | tasks.append(
82 | (
83 | hubert_jp_url,
84 | out,
85 | )
86 | )
87 |
88 | if len(tasks) < 1:
89 | return
90 |
91 | with ThreadPoolExecutor() as pool:
92 | pool.map(
93 | download_file,
94 | *zip(
95 |                 *[(url, out, i, True) for i, (url, out) in enumerate(tasks)]
96 | ),
97 | )
98 |
99 |
100 | def install_ffmpeg():
101 | if os.path.exists(os.path.join(ROOT_DIR, "bin", "ffmpeg.exe")):
102 | return
103 | tmpdir = os.path.join(ROOT_DIR, "tmp")
104 | url = (
105 | "https://www.gyan.dev/ffmpeg/builds/packages/ffmpeg-5.1.2-essentials_build.zip"
106 | )
107 | out = os.path.join(tmpdir, "ffmpeg.zip")
108 | os.makedirs(os.path.dirname(out), exist_ok=True)
109 | download_file(url, out)
110 | shutil.unpack_archive(out, os.path.join(tmpdir, "ffmpeg"))
111 | shutil.copyfile(
112 | os.path.join(
113 | tmpdir, "ffmpeg", "ffmpeg-5.1.2-essentials_build", "bin", "ffmpeg.exe"
114 | ),
115 | os.path.join(ROOT_DIR, "bin", "ffmpeg.exe"),
116 | )
117 | os.remove(os.path.join(tmpdir, "ffmpeg.zip"))
118 | shutil.rmtree(os.path.join(tmpdir, "ffmpeg"))
119 |
120 |
121 | def update_modelnames():
122 | for sr in ["32k", "40k", "48k"]:
123 | files = [
124 | f"f0G{sr}",
125 | f"f0D{sr}",
126 | f"G{sr}",
127 | f"D{sr}",
128 | ]
129 | for file in files:
130 | filepath = os.path.join(MODELS_DIR, "pretrained", f"{file}.pth")
131 | if os.path.exists(filepath):
132 | os.rename(
133 | filepath,
134 | os.path.join(MODELS_DIR, "pretrained", f"{file}256.pth"),
135 | )
136 |
137 | if not os.path.exists(os.path.join(MODELS_DIR, "embeddings")):
138 | os.makedirs(os.path.join(MODELS_DIR, "embeddings"))
139 |
140 | if os.path.exists(os.path.join(MODELS_DIR, "hubert_base.pt")):
141 | os.rename(
142 | os.path.join(MODELS_DIR, "hubert_base.pt"),
143 | os.path.join(MODELS_DIR, "embeddings", "hubert_base.pt"),
144 | )
145 | if os.path.exists(os.path.join(MODELS_DIR, "checkpoint_best_legacy_500.pt")):
146 | os.rename(
147 | os.path.join(MODELS_DIR, "checkpoint_best_legacy_500.pt"),
148 | os.path.join(MODELS_DIR, "embeddings", "checkpoint_best_legacy_500.pt"),
149 | )
150 |
151 |
152 | def preload():
153 | update_modelnames()
154 | download_models()
155 | if sys.platform == "win32":
156 | install_ffmpeg()
157 |
--------------------------------------------------------------------------------
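
Note: `download_models()` above decides whether to re-download by comparing the Hugging Face `X-Linked-ETag` header (for LFS-backed files this is the SHA-256 of the blob) against the SHA-256 of the local copy. A stand-alone sketch of that same check, assuming the target is an LFS file that exposes this header; the URL and path passed in would be examples, not repo constants:

# Sketch of the ETag-vs-SHA256 check used by hash_check() above.
import hashlib
import os

import requests


def needs_download(url: str, out: str) -> bool:
    if not os.path.exists(out):
        return True
    etag = requests.head(url).headers.get("X-Linked-ETag", "").strip('"')
    sha256 = hashlib.sha256()
    with open(out, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            sha256.update(chunk)
    # A mismatch (or a missing header) means the local file should be refreshed.
    return etag != sha256.hexdigest()
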
/modules/merge.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | from typing import *
3 |
4 | import torch
5 | import tqdm
6 |
7 |
8 | def merge(
9 | path_a: str,
10 | path_b: str,
11 | path_c: str,
12 | alpha: float,
13 | weights: Dict[str, float],
14 | method: str,
15 | ):
16 | def extract(ckpt: Dict[str, Any]):
17 | a = ckpt["model"]
18 | opt = OrderedDict()
19 | opt["weight"] = {}
20 | for key in a.keys():
21 | if "enc_q" in key:
22 | continue
23 | opt["weight"][key] = a[key]
24 | return opt
25 |
26 | def load_weight(path: str):
27 | print(f"Loading {path}...")
28 | state_dict = torch.load(path, map_location="cpu")
29 | if "model" in state_dict:
30 | weight = extract(state_dict)
31 | else:
32 | weight = state_dict["weight"]
33 | return weight, state_dict
34 |
35 | def get_alpha(key: str):
36 | try:
37 | filtered = sorted(
38 | [x for x in weights.keys() if key.startswith(x)], key=len, reverse=True
39 | )
40 | if len(filtered) < 1:
41 | return alpha
42 | return weights[filtered[0]]
43 |         except Exception:
44 | return alpha
45 |
46 | weight_a, state_dict = load_weight(path_a)
47 | weight_b, _ = load_weight(path_b)
48 | if path_c is not None:
49 | weight_c, _ = load_weight(path_c)
50 |
51 | if sorted(list(weight_a.keys())) != sorted(list(weight_b.keys())):
52 |         raise RuntimeError("Failed to merge models: model A and model B have different keys.")
53 |
54 | merged = OrderedDict()
55 | merged["weight"] = {}
56 |
57 | def merge_weight(a, b, c, alpha):
58 | if method == "weight_sum":
59 | return (1 - alpha) * a + alpha * b
60 | elif method == "add_diff":
61 | return a + (b - c) * alpha
62 |
63 | for key in tqdm.tqdm(weight_a.keys()):
64 | a = get_alpha(key)
65 | if path_c is not None:
66 | merged["weight"][key] = merge_weight(
67 | weight_a[key], weight_b[key], weight_c[key], a
68 | )
69 | else:
70 | merged["weight"][key] = merge_weight(weight_a[key], weight_b[key], None, a)
71 | merged["config"] = state_dict["config"]
72 | merged["params"] = state_dict["params"] if "params" in state_dict else None
73 | merged["version"] = state_dict.get("version", "v1")
74 | merged["sr"] = state_dict["sr"]
75 | merged["f0"] = state_dict["f0"]
76 | merged["info"] = state_dict["info"]
77 | merged["embedder_name"] = (
78 | state_dict["embedder_name"] if "embedder_name" in state_dict else None
79 | )
80 | merged["embedder_output_layer"] = state_dict.get("embedder_output_layer", "12")
81 | return merged
82 |
--------------------------------------------------------------------------------
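
Note: `merge()` above applies one of two blend rules per tensor: weight_sum computes (1 - alpha) * A + alpha * B, and add_diff computes A + (B - C) * alpha; any entry in `weights` whose key is a prefix of a tensor name overrides the global alpha for that tensor. A hedged usage sketch with hypothetical checkpoint names:

import torch

from modules.merge import merge

merged = merge(
    path_a="models/checkpoints/voice_a.pth",
    path_b="models/checkpoints/voice_b.pth",
    path_c=None,                    # only needed for method="add_diff"
    alpha=0.3,                      # default blend: 0.7 * A + 0.3 * B
    weights={"dec.": 0.8},          # tensors whose name starts with "dec." use alpha=0.8
    method="weight_sum",
)
torch.save(merged, "models/checkpoints/voice_ab_mix.pth")
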
/modules/models.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | from typing import *
4 |
5 | import torch
6 | from fairseq import checkpoint_utils
7 | from fairseq.models.hubert.hubert import HubertModel
8 | from pydub import AudioSegment
9 |
10 | from lib.rvc.models import (SynthesizerTrnMs256NSFSid,
11 | SynthesizerTrnMs256NSFSidNono)
12 | from lib.rvc.pipeline import VocalConvertPipeline
13 |
14 | from .cmd_opts import opts
15 | from .shared import ROOT_DIR, device, is_half
16 | from .utils import load_audio
17 |
18 | AUDIO_OUT_DIR = opts.output_dir or os.path.join(ROOT_DIR, "outputs")
19 |
20 |
21 | EMBEDDINGS_LIST = {
22 | "hubert-base-japanese": (
23 | "rinna_hubert_base_jp.pt",
24 | "hubert-base-japanese",
25 | "local",
26 | ),
27 | "contentvec": ("checkpoint_best_legacy_500.pt", "contentvec", "local"),
28 | }
29 |
30 |
31 | def update_state_dict(state_dict):
32 | if "params" in state_dict and state_dict["params"] is not None:
33 | return
34 | keys = [
35 | "spec_channels",
36 | "segment_size",
37 | "inter_channels",
38 | "hidden_channels",
39 | "filter_channels",
40 | "n_heads",
41 | "n_layers",
42 | "kernel_size",
43 | "p_dropout",
44 | "resblock",
45 | "resblock_kernel_sizes",
46 | "resblock_dilation_sizes",
47 | "upsample_rates",
48 | "upsample_initial_channel",
49 | "upsample_kernel_sizes",
50 | "spk_embed_dim",
51 | "gin_channels",
52 | "emb_channels",
53 | "sr",
54 | ]
55 | state_dict["params"] = {}
56 | n = 0
57 | for i, key in enumerate(keys):
58 | i = i - n
59 | if len(state_dict["config"]) != 19 and key == "emb_channels":
60 | # backward compat.
61 | n += 1
62 | continue
63 | state_dict["params"][key] = state_dict["config"][i]
64 |
65 | if not "emb_channels" in state_dict["params"]:
66 | if state_dict.get("version", "v1") == "v1":
67 | state_dict["params"]["emb_channels"] = 256 # for backward compat.
68 | state_dict["embedder_output_layer"] = 9
69 | else:
70 | state_dict["params"]["emb_channels"] = 768 # for backward compat.
71 | state_dict["embedder_output_layer"] = 12
72 |
73 |
74 | class VoiceConvertModel:
75 | def __init__(self, model_name: str, state_dict: Dict[str, Any]) -> None:
76 | update_state_dict(state_dict)
77 | self.model_name = model_name
78 | self.state_dict = state_dict
79 | self.tgt_sr = state_dict["params"]["sr"]
80 | f0 = state_dict.get("f0", 1)
81 | state_dict["params"]["spk_embed_dim"] = state_dict["weight"][
82 | "emb_g.weight"
83 | ].shape[0]
84 | if not "emb_channels" in state_dict["params"]:
85 | state_dict["params"]["emb_channels"] = 256 # for backward compat.
86 |
87 | if f0 == 1:
88 | self.net_g = SynthesizerTrnMs256NSFSid(
89 | **state_dict["params"], is_half=is_half
90 | )
91 | else:
92 | self.net_g = SynthesizerTrnMs256NSFSidNono(**state_dict["params"])
93 |
94 | del self.net_g.enc_q
95 |
96 | self.net_g.load_state_dict(state_dict["weight"], strict=False)
97 | self.net_g.eval().to(device)
98 |
99 | if is_half:
100 | self.net_g = self.net_g.half()
101 | else:
102 | self.net_g = self.net_g.float()
103 |
104 | self.vc = VocalConvertPipeline(self.tgt_sr, device, is_half)
105 | self.n_spk = state_dict["params"]["spk_embed_dim"]
106 |
107 | def single(
108 | self,
109 | sid: int,
110 | input_audio: str,
111 | embedder_model_name: str,
112 | embedding_output_layer: str,
113 | f0_up_key: int,
114 | f0_file: str,
115 | f0_method: str,
116 | auto_load_index: bool,
117 | faiss_index_file: str,
118 | index_rate: float,
119 | output_dir: str = AUDIO_OUT_DIR,
120 | ):
121 | if not input_audio:
122 | raise Exception("You need to set Source Audio")
123 | f0_up_key = int(f0_up_key)
124 | audio = load_audio(input_audio, 16000)
125 |
126 | if embedder_model_name == "auto":
127 | embedder_model_name = (
128 | self.state_dict["embedder_name"]
129 | if "embedder_name" in self.state_dict
130 | else "hubert_base"
131 | )
132 | if embedder_model_name.endswith("768"):
133 | embedder_model_name = embedder_model_name[:-3]
134 |
135 | if embedder_model_name == "hubert_base":
136 | embedder_model_name = "contentvec"
137 |
138 | if not embedder_model_name in EMBEDDINGS_LIST.keys():
139 |             raise Exception(f"Unsupported embedder: {embedder_model_name}")
140 |
141 | if (
142 |             embedder_model is None
143 | or loaded_embedder_model != EMBEDDINGS_LIST[embedder_model_name][1]
144 | ):
145 | print(f"load {embedder_model_name} embedder")
146 | embedder_filename, embedder_name, load_from = get_embedder(
147 | embedder_model_name
148 | )
149 | load_embedder(embedder_filename, embedder_name)
150 |
151 | if embedding_output_layer == "auto":
152 | embedding_output_layer = (
153 | self.state_dict["embedding_output_layer"]
154 | if "embedding_output_layer" in self.state_dict
155 | else 12
156 | )
157 | else:
158 | embedding_output_layer = int(embedding_output_layer)
159 |
160 | f0 = self.state_dict.get("f0", 1)
161 |
162 | if not faiss_index_file and auto_load_index:
163 | faiss_index_file = self.get_index_path(sid)
164 |
165 | audio_opt = self.vc(
166 | embedder_model,
167 | embedding_output_layer,
168 | self.net_g,
169 | sid,
170 | audio,
171 | f0_up_key,
172 | f0_method,
173 | faiss_index_file,
174 | index_rate,
175 | f0,
176 | f0_file=f0_file,
177 | )
178 |
179 | audio = AudioSegment(
180 | audio_opt,
181 | frame_rate=self.tgt_sr,
182 | sample_width=2,
183 | channels=1,
184 | )
185 | os.makedirs(output_dir, exist_ok=True)
186 | input_audio_splitext = os.path.splitext(os.path.basename(input_audio))[0]
187 | model_splitext = os.path.splitext(self.model_name)[0]
188 | index = 0
189 | existing_files = os.listdir(output_dir)
190 | for existing_file in existing_files:
191 | result = re.match(r"\d+", existing_file)
192 | if result:
193 | prefix_num = int(result.group(0))
194 | if index < prefix_num:
195 | index = prefix_num
196 | audio.export(
197 | os.path.join(
198 | output_dir, f"{index+1}-{model_splitext}-{input_audio_splitext}.wav"
199 | ),
200 | format="wav",
201 | )
202 | return audio_opt
203 |
204 | def get_index_path(self, speaker_id: int):
205 | basename = os.path.splitext(self.model_name)[0]
206 | speaker_index_path = os.path.join(
207 | MODELS_DIR,
208 | "checkpoints",
209 | f"{basename}_index",
210 | f"{basename}.{speaker_id}.index",
211 | )
212 | if os.path.exists(speaker_index_path):
213 | return speaker_index_path
214 | return os.path.join(MODELS_DIR, "checkpoints", f"{basename}.index")
215 |
216 |
217 | MODELS_DIR = opts.models_dir or os.path.join(ROOT_DIR, "models")
218 | vc_model: Optional[VoiceConvertModel] = None
219 | embedder_model: Optional[HubertModel] = None
220 | loaded_embedder_model = ""
221 |
222 |
223 | def get_models():
224 | dir = os.path.join(ROOT_DIR, "models", "checkpoints")
225 | os.makedirs(dir, exist_ok=True)
226 | return [
227 | file
228 | for file in os.listdir(dir)
229 | if any([x for x in [".ckpt", ".pth"] if file.endswith(x)])
230 | ]
231 |
232 |
233 | def get_embedder(embedder_name):
234 | if embedder_name in EMBEDDINGS_LIST:
235 | return EMBEDDINGS_LIST[embedder_name]
236 | return None
237 |
238 |
239 | def load_embedder(emb_file: str, emb_name: str):
240 | global embedder_model, loaded_embedder_model
241 | emb_file = os.path.join(MODELS_DIR, "embeddings", emb_file)
242 | models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
243 | [emb_file],
244 | suffix="",
245 | )
246 | embedder_model = models[0]
247 | embedder_model = embedder_model.to(device)
248 |
249 | if is_half:
250 | embedder_model = embedder_model.half()
251 | else:
252 | embedder_model = embedder_model.float()
253 | embedder_model.eval()
254 |
255 | loaded_embedder_model = emb_name
256 |
257 |
258 | def get_vc_model(model_name: str):
259 | model_path = os.path.join(MODELS_DIR, "checkpoints", model_name)
260 | weight = torch.load(model_path, map_location="cpu")
261 | return VoiceConvertModel(model_name, weight)
262 |
263 |
264 | def load_model(model_name: str):
265 | global vc_model
266 | vc_model = get_vc_model(model_name)
267 |
--------------------------------------------------------------------------------
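
Note: a sketch of driving `VoiceConvertModel` above programmatically, outside the Gradio UI. It assumes the pretrained/embedder files have already been fetched (for example by `modules.core.preload()`); the checkpoint and audio paths shown are hypothetical:

from modules import models

models.load_model("my_voice.pth")       # loads models/checkpoints/my_voice.pth by default
model = models.vc_model

audio = model.single(
    sid=0,                              # speaker id
    input_audio="samples/input.wav",
    embedder_model_name="auto",         # fall back to the embedder stored in the checkpoint
    embedding_output_layer="auto",
    f0_up_key=0,                        # transpose in semitones
    f0_file=None,
    f0_method="crepe",                  # "dio", "harvest", "mangio-crepe" or "crepe"
    auto_load_index=False,
    faiss_index_file="",
    index_rate=1.0,
)
# single() also writes a numbered .wav into outputs/ (or the --output-dir folder).
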
/modules/separate.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import *
3 |
4 | import tqdm
5 | from pydub import AudioSegment
6 | from pydub.silence import split_on_silence
7 |
8 |
9 | def separate_audio(
10 | input: str,
11 | output: str,
12 | silence_thresh: int,
13 | min_silence_len: int = 1000,
14 | keep_silence: int = 100,
15 | margin: int = 0,
16 | padding: bool = False,
17 | min: Optional[int] = None,
18 | max: Optional[int] = None,
19 | ):
20 | if os.path.isfile(input):
21 | input = [input]
22 | elif os.path.isdir(input):
23 | input = [os.path.join(input, f) for f in os.listdir(input)]
24 | else:
25 | raise ValueError("input must be a file or directory")
26 |
27 | os.makedirs(output, exist_ok=True)
28 |
29 | for file in input:
30 | if os.path.splitext(file)[1] == ".mp3":
31 | audio = AudioSegment.from_mp3(file)
32 | elif os.path.splitext(file)[1] == ".wav":
33 | audio = AudioSegment.from_wav(file)
34 | elif os.path.splitext(file)[1] == ".flac":
35 | audio = AudioSegment.from_file(file, "flac")
36 | else:
37 | raise ValueError(
38 |                 "Invalid file format. Only MP3, WAV, and FLAC files are supported."
39 | )
40 |
41 | chunks = split_on_silence(
42 | audio,
43 | min_silence_len=min_silence_len,
44 | silence_thresh=silence_thresh,
45 | keep_silence=keep_silence,
46 | )
47 |
48 | output_chunks: List[AudioSegment] = []
49 |
50 | so_short = None
51 |
52 | for chunk in tqdm.tqdm(chunks):
53 | if so_short is not None:
54 | chunk = so_short + chunk
55 | so_short = None
56 | if min is None or len(chunk) > min:
57 | if max is not None and len(chunk) > max:
58 | sub_chunks = [
59 | chunk[i : i + max + margin]
60 | for i in range(0, len(chunk) - margin, max)
61 | ]
62 |
63 |                     if min is not None and len(sub_chunks[-1]) < min:
64 | if padding and len(sub_chunks) > 2:
65 | output_chunks.extend(sub_chunks[0:-2])
66 | output_chunks.append(sub_chunks[-2] + sub_chunks[-1])
67 | else:
68 | output_chunks.extend(sub_chunks[0:-1])
69 | else:
70 | output_chunks.extend(sub_chunks)
71 | else:
72 | output_chunks.append(chunk)
73 | else:
74 | if so_short is None:
75 | so_short = chunk
76 | else:
77 | so_short += chunk
78 | basename = os.path.splitext(os.path.basename(file))[0]
79 |
80 | for i, chunk in enumerate(output_chunks):
81 | filepath = os.path.join(output, f"{basename}_{i}.wav")
82 | chunk.export(filepath, format="wav")
83 |
--------------------------------------------------------------------------------
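
Note: `separate_audio()` above splits on silence with pydub, merges chunks shorter than `min` into the following chunk, and slices chunks longer than `max` into sub-chunks (with optional `margin`/`padding` handling of the tail). All durations are milliseconds. A hedged call sketch with example paths:

from modules.separate import separate_audio

separate_audio(
    input="raw/interview.wav",      # a file, or a directory of mp3/wav/flac files
    output="dataset/split",
    silence_thresh=-40,             # dBFS threshold passed to pydub's split_on_silence
    min_silence_len=750,
    keep_silence=750,
    margin=0,
    padding=True,
    min=1000,                       # chunks shorter than this are merged with the next one
    max=5000,                       # chunks longer than this are sliced into sub-chunks
)
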
/modules/server/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | from typing import *
4 |
5 | import faiss
6 | import numpy as np
7 | import pyworld
8 | import scipy.signal as signal
9 | import torch
10 | import torch.nn.functional as F
11 | import torchaudio
12 | import torchcrepe
13 | from fairseq import checkpoint_utils
14 | from fairseq.models.hubert.hubert import HubertModel
15 | from pydub import AudioSegment
16 | from torch import Tensor
17 |
18 | from lib.rvc.models import (SynthesizerTrnMs256NSFSid,
19 | SynthesizerTrnMs256NSFSidNono)
20 | from lib.rvc.pipeline import VocalConvertPipeline
21 | from modules.cmd_opts import opts
22 | from modules.models import (EMBEDDINGS_LIST, MODELS_DIR, get_embedder,
23 | get_vc_model, update_state_dict)
24 | from modules.shared import ROOT_DIR, device, is_half
25 |
26 | MODELS_DIR = opts.models_dir or os.path.join(ROOT_DIR, "models")
27 | vc_model: Optional["VoiceServerModel"] = None
28 | embedder_model: Optional[HubertModel] = None
29 | loaded_embedder_model = ""
30 |
31 |
32 | class VoiceServerModel:
33 | def __init__(self, rvc_model_file: str, faiss_index_file: str) -> None:
34 | # setting vram
35 | global device, is_half
36 | if isinstance(device, str):
37 | device = torch.device(device)
38 | if device.type == "cuda":
39 | vram = torch.cuda.get_device_properties(device).total_memory / 1024**3
40 | else:
41 | vram = None
42 | if vram is not None and vram <= 4:
43 | self.x_pad = 1
44 | self.x_query = 5
45 | self.x_center = 30
46 | self.x_max = 32
47 | elif vram is not None and vram <= 5:
48 | self.x_pad = 1
49 | self.x_query = 6
50 | self.x_center = 38
51 | self.x_max = 41
52 | else:
53 | self.x_pad = 3
54 | self.x_query = 10
55 | self.x_center = 60
56 | self.x_max = 65
57 |
58 | # load_model
59 | state_dict = torch.load(rvc_model_file, map_location="cpu")
60 | update_state_dict(state_dict)
61 | self.state_dict = state_dict
62 | self.tgt_sr = state_dict["params"]["sr"]
63 | self.f0 = state_dict.get("f0", 1)
64 | state_dict["params"]["spk_embed_dim"] = state_dict["weight"][
65 | "emb_g.weight"
66 | ].shape[0]
67 | if not "emb_channels" in state_dict["params"]:
68 | if state_dict.get("version", "v1") == "v1":
69 | state_dict["params"]["emb_channels"] = 256 # for backward compat.
70 | state_dict["embedder_output_layer"] = 9
71 | else:
72 | state_dict["params"]["emb_channels"] = 768 # for backward compat.
73 | state_dict["embedder_output_layer"] = 12
74 | if self.f0 == 1:
75 | self.net_g = SynthesizerTrnMs256NSFSid(
76 | **state_dict["params"], is_half=is_half
77 | )
78 | else:
79 | self.net_g = SynthesizerTrnMs256NSFSidNono(**state_dict["params"])
80 | del self.net_g.enc_q
81 | self.net_g.load_state_dict(state_dict["weight"], strict=False)
82 | self.net_g.eval().to(device)
83 | if is_half:
84 | self.net_g = self.net_g.half()
85 | else:
86 | self.net_g = self.net_g.float()
87 |
88 | emb_name = state_dict.get("embedder_name", "contentvec")
89 | if emb_name == "hubert_base":
90 | emb_name = "contentvec"
91 | emb_file = os.path.join(MODELS_DIR, "embeddings", EMBEDDINGS_LIST[emb_name][0])
92 | models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
93 | [emb_file],
94 | suffix="",
95 | )
96 | embedder_model = models[0]
97 | embedder_model = embedder_model.to(device)
98 |
99 | if is_half:
100 | embedder_model = embedder_model.half()
101 | else:
102 | embedder_model = embedder_model.float()
103 | embedder_model.eval()
104 | self.embedder_model = embedder_model
105 |
106 | self.embedder_output_layer = state_dict["embedder_output_layer"]
107 |
108 | self.index = None
109 | if faiss_index_file != "" and os.path.exists(faiss_index_file):
110 | self.index = faiss.read_index(faiss_index_file)
111 | self.big_npy = self.index.reconstruct_n(0, self.index.ntotal)
112 |
113 | self.n_spk = state_dict["params"]["spk_embed_dim"]
114 |
115 | self.sr = 16000 # hubert input sample rate
116 | self.window = 160 # hubert input window
117 | self.t_pad = self.sr * self.x_pad # padding time for each utterance
118 | self.t_pad_tgt = self.tgt_sr * self.x_pad
119 | self.t_pad2 = self.t_pad * 2
120 | self.t_query = self.sr * self.x_query # query time before and after query point
121 | self.t_center = self.sr * self.x_center # query cut point position
122 | self.t_max = self.sr * self.x_max # max time for no query
123 | self.device = device
124 | self.is_half = is_half
125 |
126 | def __call__(
127 | self,
128 | audio: np.ndarray,
129 | sr: int,
130 | sid: int,
131 | transpose: int,
132 | f0_method: str,
133 | index_rate: float,
134 | ):
135 | # bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
136 | # audio = signal.filtfilt(bh, ah, audio)
137 | if sr != self.sr:
138 | audio = torchaudio.functional.resample(torch.from_numpy(audio), sr, self.sr, rolloff=0.99).detach().cpu().numpy()
139 | audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect" if audio.shape[0] > self.window // 2 else "constant")
140 |
141 | opt_ts = []
142 | if audio_pad.shape[0] > self.t_max:
143 | audio_sum = np.zeros_like(audio)
144 | for i in range(self.window):
145 | audio_sum += audio_pad[i : i - self.window]
146 | for t in range(self.t_center, audio.shape[0], self.t_center):
147 | opt_ts.append(
148 | t
149 | - self.t_query
150 | + np.where(
151 | np.abs(audio_sum[t - self.t_query : t + self.t_query])
152 | == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
153 | )[0][0]
154 | )
155 | audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect" if audio.shape[0] > self.t_pad else "constant")
156 | p_len = audio_pad.shape[0] // self.window
157 |
158 | sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
159 | pitch, pitchf = None, None
160 | if self.f0 == 1:
161 | pitch, pitchf = get_f0(audio_pad, self.sr, p_len, transpose, f0_method)
162 | pitch = pitch[:p_len]
163 | pitchf = pitchf[:p_len]
164 | if self.device.type == "mps":
165 | pitchf = pitchf.astype(np.float32)
166 | pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
167 | pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
168 |
169 | audio_opt = []
170 |
171 | s = 0
172 | t = None
173 |
174 | for t in opt_ts:
175 | t = t // self.window * self.window
176 | if self.f0 == 1:
177 | audio_opt.append(
178 | self._convert(
179 | sid,
180 | audio_pad[s : t + self.t_pad2 + self.window],
181 | pitch[:, s // self.window : (t + self.t_pad2) // self.window],
182 | pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
183 | index_rate,
184 | )[self.t_pad_tgt : -self.t_pad_tgt]
185 | )
186 | else:
187 | audio_opt.append(
188 | self._convert(
189 | sid,
190 | audio_pad[s : t + self.t_pad2 + self.window],
191 | None,
192 | None,
193 | index_rate,
194 | )[self.t_pad_tgt : -self.t_pad_tgt]
195 | )
196 | s = t
197 | if self.f0 == 1:
198 | audio_opt.append(
199 | self._convert(
200 | sid,
201 | audio_pad[t:],
202 | pitch[:, t // self.window :] if t is not None else pitch,
203 | pitchf[:, t // self.window :] if t is not None else pitchf,
204 | index_rate,
205 | )[self.t_pad_tgt : -self.t_pad_tgt]
206 | )
207 | else:
208 | audio_opt.append(
209 | self._convert(
210 | sid,
211 | audio_pad[t:],
212 | None,
213 | None,
214 | index_rate,
215 | )[self.t_pad_tgt : -self.t_pad_tgt]
216 | )
217 | audio_opt = np.concatenate(audio_opt)
218 | del pitch, pitchf, sid
219 | if torch.cuda.is_available():
220 | torch.cuda.empty_cache()
221 | return audio_opt
222 |
223 |
224 | def _convert(
225 | self,
226 | sid: int,
227 | audio: np.ndarray,
228 | pitch: Optional[np.ndarray],
229 | pitchf: Optional[np.ndarray],
230 | index_rate: float,
231 | ):
232 | feats = torch.from_numpy(audio)
233 | if self.is_half:
234 | feats = feats.half()
235 | else:
236 | feats = feats.float()
237 | if feats.dim() == 2: # double channels
238 | feats = feats.mean(-1)
239 | assert feats.dim() == 1, feats.dim()
240 | feats = feats.view(1, -1)
241 | padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
242 |
243 | half_support = (
244 | self.device.type == "cuda"
245 | and torch.cuda.get_device_capability(self.device)[0] >= 5.3
246 | )
247 | is_feats_dim_768 = self.net_g.emb_channels == 768
248 |
249 | if isinstance(self.embedder_model, tuple):
250 | feats = self.embedder_model[0](
251 | feats.squeeze(0).squeeze(0).to(self.device),
252 | return_tensors="pt",
253 | sampling_rate=16000,
254 | )
255 | if self.is_half:
256 | feats = feats.input_values.to(self.device).half()
257 | else:
258 | feats = feats.input_values.to(self.device)
259 | with torch.no_grad():
260 | if is_feats_dim_768:
261 | feats = self.embedder_model[1](feats).last_hidden_state
262 | else:
263 | feats = self.embedder_model[1](feats).extract_features
264 | else:
265 | inputs = {
266 | "source": feats.half().to(self.device)
267 | if half_support
268 | else feats.to(self.device),
269 | "padding_mask": padding_mask.to(self.device),
270 | "output_layer": self.embedder_output_layer,
271 | }
272 |
273 | if not half_support:
274 | self.embedder_model = self.embedder_model.float()
275 | inputs["source"] = inputs["source"].float()
276 |
277 | with torch.no_grad():
278 | logits = self.embedder_model.extract_features(**inputs)
279 | if is_feats_dim_768:
280 | feats = logits[0]
281 | else:
282 | feats = self.embedder_model.final_proj(logits[0])
283 |
284 |         if (
285 |             self.index is not None
286 |             and self.big_npy is not None
287 |             and index_rate != 0
288 |         ):
289 | npy = feats[0].cpu().numpy()
290 | if self.is_half:
291 | npy = npy.astype("float32")
292 |
293 | _, ix = self.index.search(npy, k=1)
294 | npy = self.big_npy[ix[:, 0]]
295 |
296 | if self.is_half:
297 | npy = npy.astype("float16")
298 | feats = (
299 | torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
300 | + (1 - index_rate) * feats
301 | )
302 |
303 | feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
304 |
305 | p_len = audio.shape[0] // self.window
306 | if feats.shape[1] < p_len:
307 | p_len = feats.shape[1]
308 |         if pitch is not None and pitchf is not None:
309 |             pitch = pitch[:, :p_len]
310 |             pitchf = pitchf[:, :p_len]
311 |         p_len = torch.tensor([p_len], device=self.device).long()
312 |         with torch.no_grad():
313 |             if pitch is not None and pitchf is not None:
314 | audio1 = (
315 | (self.net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768)
316 | .data.cpu()
317 | .float()
318 | .numpy()
319 | .astype(np.int16)
320 | )
321 | else:
322 | audio1 = (
323 | (self.net_g.infer(feats, p_len, sid)[0][0, 0] * 32768)
324 | .data.cpu()
325 | .float()
326 | .numpy()
327 | .astype(np.int16)
328 | )
329 | del feats, p_len, padding_mask
330 | if torch.cuda.is_available():
331 | torch.cuda.empty_cache()
332 | return audio1
333 |
334 |
335 | # F0 computation
336 | def get_f0_crepe_computation(
337 | x,
338 | sr,
339 | f0_min,
340 | f0_max,
341 | p_len,
342 | model="full", # Either use crepe-tiny "tiny" or crepe "full". Default is full
343 | ):
344 | hop_length = sr // 100
345 | x = x.astype(np.float32) # fixes the F.conv2D exception. We needed to convert double to float.
346 | x /= np.quantile(np.abs(x), 0.999)
347 |     torch_device = device  # module-level device from modules.shared; this is not a method, so there is no self
348 | audio = torch.from_numpy(x).to(torch_device, copy=True)
349 | audio = torch.unsqueeze(audio, dim=0)
350 | if audio.ndim == 2 and audio.shape[0] > 1:
351 | audio = torch.mean(audio, dim=0, keepdim=True).detach()
352 | audio = audio.detach()
353 | print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
354 | pitch: Tensor = torchcrepe.predict(
355 | audio,
356 | sr,
357 | sr // 100,
358 | f0_min,
359 | f0_max,
360 | model,
361 | batch_size=hop_length * 2,
362 | device=torch_device,
363 | pad=True
364 | )
365 | p_len = p_len or x.shape[0] // hop_length
366 | # Resize the pitch for final f0
367 | source = np.array(pitch.squeeze(0).cpu().float().numpy())
368 | source[source < 0.001] = np.nan
369 | target = np.interp(
370 | np.arange(0, len(source) * p_len, len(source)) / p_len,
371 | np.arange(0, len(source)),
372 | source
373 | )
374 | f0 = np.nan_to_num(target)
375 | return f0 # Resized f0
376 |
377 | def get_f0_official_crepe_computation(
378 | x,
379 | sr,
380 | f0_min,
381 | f0_max,
382 | model="full",
383 | ):
384 | # Pick a batch size that doesn't cause memory errors on your gpu
385 | batch_size = 512
386 | # Compute pitch using first gpu
387 | audio = torch.tensor(np.copy(x))[None].float()
388 | f0, pd = torchcrepe.predict(
389 | audio,
390 | sr,
391 | sr // 100,
392 | f0_min,
393 | f0_max,
394 | model,
395 | batch_size=batch_size,
396 | device=device,
397 | return_periodicity=True,
398 | )
399 | pd = torchcrepe.filter.median(pd, 3)
400 | f0 = torchcrepe.filter.mean(f0, 3)
401 | f0[pd < 0.1] = 0
402 | f0 = f0[0].cpu().numpy()
403 | return f0
404 |
405 | def get_f0(
406 | x: np.ndarray,
407 | sr: int,
408 | p_len: int,
409 | f0_up_key: int,
410 | f0_method: str,
411 | ):
412 | f0_min = 50
413 | f0_max = 1100
414 | f0_mel_min = 1127 * np.log(1 + f0_min / 700)
415 | f0_mel_max = 1127 * np.log(1 + f0_max / 700)
416 |
417 | if f0_method == "harvest":
418 | f0, t = pyworld.harvest(
419 | x.astype(np.double),
420 | fs=sr,
421 | f0_ceil=f0_max,
422 | f0_floor=f0_min,
423 | frame_period=10,
424 | )
425 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, sr)
426 | f0 = signal.medfilt(f0, 3)
427 | elif f0_method == "dio":
428 | f0, t = pyworld.dio(
429 | x.astype(np.double),
430 | fs=sr,
431 | f0_ceil=f0_max,
432 | f0_floor=f0_min,
433 | frame_period=10,
434 | )
435 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, sr)
436 | f0 = signal.medfilt(f0, 3)
437 | elif f0_method == "mangio-crepe":
438 | f0 = get_f0_crepe_computation(x, sr, f0_min, f0_max, p_len, "full")
439 | elif f0_method == "crepe":
440 | f0 = get_f0_official_crepe_computation(x, sr, f0_min, f0_max, "full")
441 |
442 | f0 *= pow(2, f0_up_key / 12)
443 | f0bak = f0.copy()
444 | f0_mel = 1127 * np.log(1 + f0 / 700)
445 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
446 | f0_mel_max - f0_mel_min
447 | ) + 1
448 | f0_mel[f0_mel <= 1] = 1
449 | f0_mel[f0_mel > 255] = 255
450 | f0_coarse = np.rint(f0_mel).astype(np.int32)
451 | return f0_coarse, f0bak # 1-0
--------------------------------------------------------------------------------
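
Note: `VoiceServerModel` above bundles the RVC checkpoint, the embedder and an optional faiss index for the Flask server, but it can also be called directly. A hedged sketch, assuming the embedder referenced by the checkpoint is already present under models/embeddings and using hypothetical file paths; the input is expected to be a mono float array, and the output is int16 audio at `tgt_sr`:

import soundfile as sf

from modules.server.model import VoiceServerModel

model = VoiceServerModel(
    rvc_model_file="models/checkpoints/my_voice.pth",
    faiss_index_file="",                                    # optional; "" disables retrieval
)

audio, sr = sf.read("samples/input.wav", dtype="float32")   # expected to be mono
out = model(audio, sr, sid=0, transpose=0, f0_method="crepe", index_rate=0.0)
sf.write("outputs/converted.wav", out, model.tgt_sr)        # out is int16 at tgt_sr
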
/modules/shared.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | import torch
5 |
6 | from modules.cmd_opts import opts
7 |
8 | ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
9 | MODELS_DIR = os.path.join(ROOT_DIR, "models")
10 |
11 |
12 | def has_mps():
13 | if sys.platform != "darwin":
14 | return False
15 | else:
16 | if not getattr(torch, "has_mps", False):
17 | return False
18 | try:
19 | torch.zeros(1).to(torch.device("mps"))
20 | return True
21 | except Exception:
22 | return False
23 |
24 |
25 | is_half = opts.precision == "fp16"
26 | half_support = (
27 | torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 5.3
28 | )
29 |
30 | if not half_support:
31 | print("WARNING: FP16 is not supported on this GPU")
32 | is_half = False
33 |
34 | device = "cuda:0"
35 |
36 | if not torch.cuda.is_available():
37 | if has_mps():
38 | print("Using MPS")
39 | device = "mps"
40 | else:
41 | print("Using CPU")
42 | device = "cpu"
43 |
44 | device = torch.device(device)
45 |
--------------------------------------------------------------------------------
/modules/tabs/inference.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import traceback
4 |
5 | import gradio as gr
6 |
7 | from modules import models, ui
8 | from modules.ui import Tab
9 |
10 |
11 | def inference_options_ui(show_out_dir=True):
12 | with gr.Row(equal_height=False):
13 | with gr.Column():
14 | source_audio = gr.Textbox(label="Source Audio")
15 | out_dir = gr.Textbox(
16 | label="Out folder",
17 | visible=show_out_dir,
18 | placeholder=models.AUDIO_OUT_DIR,
19 | )
20 | with gr.Column():
21 | transpose = gr.Slider(
22 | minimum=-20, maximum=20, value=0, step=1, label="Transpose"
23 | )
24 | pitch_extraction_algo = gr.Radio(
25 | choices=["dio", "harvest", "mangio-crepe", "crepe"],
26 | value="crepe",
27 | label="Pitch Extraction Algorithm",
28 | )
29 | embedding_model = gr.Radio(
30 | choices=["auto", *models.EMBEDDINGS_LIST.keys()],
31 | value="auto",
32 | label="Embedder Model",
33 | )
34 | embedding_output_layer = gr.Radio(
35 | choices=["auto", "9", "12"],
36 | value="auto",
37 | label="Embedder Output Layer",
38 | )
39 | with gr.Column():
40 | auto_load_index = gr.Checkbox(value=False, label="Auto Load Index")
41 | faiss_index_file = gr.Textbox(value="", label="Faiss Index File Path")
42 | retrieval_feature_ratio = gr.Slider(
43 | minimum=0,
44 | maximum=1,
45 | value=1,
46 | step=0.01,
47 | label="Retrieval Feature Ratio",
48 | )
49 | with gr.Column():
50 | fo_curve_file = gr.File(label="F0 Curve File")
51 |
52 | return (
53 | source_audio,
54 | out_dir,
55 | transpose,
56 | embedding_model,
57 | embedding_output_layer,
58 | pitch_extraction_algo,
59 | auto_load_index,
60 | faiss_index_file,
61 | retrieval_feature_ratio,
62 | fo_curve_file,
63 | )
64 |
65 |
66 | class Inference(Tab):
67 | def title(self):
68 | return "Inference"
69 |
70 | def sort(self):
71 | return 1
72 |
73 | def ui(self, outlet):
74 | def infer(
75 | sid,
76 | input_audio,
77 | out_dir,
78 | embedder_model,
79 | embedding_output_layer,
80 | f0_up_key,
81 | f0_file,
82 | f0_method,
83 | auto_load_index,
84 | faiss_index_file,
85 | index_rate,
86 | ):
87 | model = models.vc_model
88 | try:
89 |                 yield "Inferring...", None
90 | if out_dir == "":
91 | out_dir = models.AUDIO_OUT_DIR
92 |
93 | if "*" in input_audio:
94 | assert (
95 | out_dir is not None
96 | ), "Out folder is required for batch processing"
97 | files = glob.glob(input_audio, recursive=True)
98 | elif os.path.isdir(input_audio):
99 | assert (
100 | out_dir is not None
101 | ), "Out folder is required for batch processing"
102 | files = glob.glob(
103 | os.path.join(input_audio, "**", "*.wav"), recursive=True
104 | )
105 | else:
106 | files = [input_audio]
107 | for file in files:
108 | audio = model.single(
109 | sid,
110 | file,
111 | embedder_model,
112 | embedding_output_layer,
113 | f0_up_key,
114 | f0_file,
115 | f0_method,
116 | auto_load_index,
117 | faiss_index_file,
118 | index_rate,
119 | output_dir=out_dir,
120 | )
121 | yield "Success", (model.tgt_sr, audio) if len(files) == 1 else None
122 | except:
123 | yield "Error: " + traceback.format_exc(), None
124 |
125 | with gr.Group():
126 | with gr.Box():
127 | with gr.Column():
128 | _, speaker_id = ui.create_model_list_ui()
129 |
130 | (
131 | source_audio,
132 | out_dir,
133 | transpose,
134 | embedder_model,
135 | embedding_output_layer,
136 | pitch_extraction_algo,
137 | auto_load_index,
138 | faiss_index_file,
139 | retrieval_feature_ratio,
140 | f0_curve_file,
141 | ) = inference_options_ui()
142 |
143 | with gr.Row(equal_height=False):
144 | with gr.Column():
145 | status = gr.Textbox(value="", label="Status")
146 | output = gr.Audio(label="Output", interactive=False)
147 |
148 | with gr.Row():
149 | infer_button = gr.Button("Infer", variant="primary")
150 |
151 | infer_button.click(
152 | infer,
153 | inputs=[
154 | speaker_id,
155 | source_audio,
156 | out_dir,
157 | embedder_model,
158 | embedding_output_layer,
159 | transpose,
160 | f0_curve_file,
161 | pitch_extraction_algo,
162 | auto_load_index,
163 | faiss_index_file,
164 | retrieval_feature_ratio,
165 | ],
166 | outputs=[status, output],
167 | queue=True,
168 | )
169 |
--------------------------------------------------------------------------------
/modules/tabs/merge.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | from typing import *
4 |
5 | import gradio as gr
6 | import torch
7 |
8 | from modules import models
9 | from modules.merge import merge
10 | from modules.tabs.inference import inference_options_ui
11 | from modules.ui import Tab
12 |
13 | MERGE_METHODS = {
14 | "weight_sum": "Weight sum:A*(1-alpha)+B*alpha",
15 | "add_diff": "Add difference:A+(B-C)*alpha",
16 | }
17 |
18 |
19 | class Merge(Tab):
20 | def title(self):
21 | return "Merge"
22 |
23 | def sort(self):
24 | return 3
25 |
26 | def ui(self, outlet):
27 | def merge_ckpt(model_a, model_b, model_c, weight_text, alpha, each_key, method):
28 | model_a = model_a if type(model_a) != list and model_a != "" else None
29 | model_b = model_b if type(model_b) != list and model_b != "" else None
30 | model_c = model_c if type(model_c) != list and model_c != "" else None
31 |
32 | if each_key:
33 | weights = json.loads(weight_text)
34 | else:
35 | weights = {}
36 |
37 | method = [k for k, v in MERGE_METHODS.items() if v == method][0]
38 | return merge(
39 | os.path.join(models.MODELS_DIR, "checkpoints", model_a),
40 | os.path.join(models.MODELS_DIR, "checkpoints", model_b),
41 | os.path.join(models.MODELS_DIR, "checkpoints", model_c)
42 | if model_c
43 | else None,
44 | alpha,
45 | weights,
46 | method,
47 | )
48 |
49 | def merge_and_save(
50 | model_a, model_b, model_c, alpha, each_key, weight_text, method, out_name
51 | ):
52 | print(each_key)
53 | out_path = os.path.join(models.MODELS_DIR, "checkpoints", out_name)
54 | if os.path.exists(out_path):
55 | return "Model name already exists."
56 | merged = merge_ckpt(
57 | model_a, model_b, model_c, weight_text, alpha, each_key, method
58 | )
59 | if not out_name.endswith(".pth"):
60 | out_name += ".pth"
61 | torch.save(merged, os.path.join(models.MODELS_DIR, "checkpoints", out_name))
62 | return "Success"
63 |
64 | def merge_and_gen(
65 | model_a,
66 | model_b,
67 | model_c,
68 | alpha,
69 | each_key,
70 | weight_text,
71 | method,
72 | speaker_id,
73 | source_audio,
74 | embedder_name,
75 | embedding_output_layer,
76 | transpose,
77 | fo_curve_file,
78 | pitch_extraction_algo,
79 | auto_load_index,
80 | faiss_index_file,
81 | retrieval_feature_ratio,
82 | ):
83 | merged = merge_ckpt(
84 | model_a, model_b, model_c, weight_text, alpha, each_key, method
85 | )
86 | model = models.VoiceConvertModel("merge", merged)
87 | audio = model.single(
88 | speaker_id,
89 | source_audio,
90 | embedder_name,
91 | embedding_output_layer,
92 | transpose,
93 | fo_curve_file,
94 | pitch_extraction_algo,
95 | auto_load_index,
96 | faiss_index_file,
97 | retrieval_feature_ratio,
98 | )
99 | tgt_sr = model.tgt_sr
100 | del merged
101 | del model
102 | torch.cuda.empty_cache()
103 | return "Success", (tgt_sr, audio)
104 |
105 | def reload_model():
106 | model_list = models.get_models()
107 | return (
108 | gr.Dropdown.update(choices=model_list),
109 | gr.Dropdown.update(choices=model_list),
110 | gr.Dropdown.update(choices=model_list),
111 | )
112 |
113 | def update_speaker_ids(model):
114 | if model == "":
115 | return gr.Slider.update(
116 | maximum=0,
117 | visible=False,
118 | )
119 | model = torch.load(
120 | os.path.join(models.MODELS_DIR, "checkpoints", model),
121 | map_location="cpu",
122 | )
123 | vc_model = models.VoiceConvertModel("merge", model)
124 | max = vc_model.n_spk
125 | del model
126 | del vc_model
127 | return gr.Slider.update(
128 | maximum=max,
129 | visible=True,
130 | )
131 |
132 | with gr.Group():
133 | with gr.Column():
134 | with gr.Row(equal_height=False):
135 | model_a = gr.Dropdown(choices=models.get_models(), label="Model A")
136 | model_b = gr.Dropdown(choices=models.get_models(), label="Model B")
137 | model_c = gr.Dropdown(choices=models.get_models(), label="Model C")
138 | reload_model_button = gr.Button("♻️")
139 | reload_model_button.click(
140 | reload_model, outputs=[model_a, model_b, model_c]
141 | )
142 | with gr.Row(equal_height=False):
143 | method = gr.Radio(
144 | label="Merge method",
145 | choices=list(MERGE_METHODS.values()),
146 | value="Weight sum:A*(1-alpha)+B*alpha",
147 | )
148 | output_name = gr.Textbox(label="Output name")
149 | each_key = gr.Checkbox(label="Each key merge")
150 | with gr.Row(equal_height=False):
151 | base_alpha = gr.Slider(
152 | label="Base alpha", minimum=0, maximum=1, value=0.5, step=0.01
153 | )
154 |
155 | default_weights = {}
156 | weights = {}
157 |
158 | def create_weight_ui(name: str, *keys_list: List[List[str]]):
159 | with gr.Accordion(label=name, open=False):
160 | with gr.Row(equal_height=False):
161 | for keys in keys_list:
162 | with gr.Column():
163 | for key in keys:
164 | default_weights[key] = 0.5
165 | weights[key] = gr.Slider(
166 | label=key,
167 | minimum=0,
168 | maximum=1,
169 | step=0.01,
170 | value=0.5,
171 | )
172 |
173 | with gr.Box(visible=False) as each_key_ui:
174 | with gr.Column():
175 | create_weight_ui(
176 | "enc_p",
177 | [
178 | "enc_p.encoder.attn_layers.0",
179 | "enc_p.encoder.attn_layers.1",
180 | "enc_p.encoder.attn_layers.2",
181 | "enc_p.encoder.attn_layers.3",
182 | "enc_p.encoder.attn_layers.4",
183 | "enc_p.encoder.attn_layers.5",
184 | "enc_p.encoder.norm_layers_1.0",
185 | "enc_p.encoder.norm_layers_1.1",
186 | "enc_p.encoder.norm_layers_1.2",
187 | "enc_p.encoder.norm_layers_1.3",
188 | "enc_p.encoder.norm_layers_1.4",
189 | "enc_p.encoder.norm_layers_1.5",
190 | ],
191 | [
192 | "enc_p.encoder.ffn_layers.0",
193 | "enc_p.encoder.ffn_layers.1",
194 | "enc_p.encoder.ffn_layers.2",
195 | "enc_p.encoder.ffn_layers.3",
196 | "enc_p.encoder.ffn_layers.4",
197 | "enc_p.encoder.ffn_layers.5",
198 | "enc_p.encoder.norm_layers_2.0",
199 | "enc_p.encoder.norm_layers_2.1",
200 | "enc_p.encoder.norm_layers_2.2",
201 | "enc_p.encoder.norm_layers_2.3",
202 | "enc_p.encoder.norm_layers_2.4",
203 | "enc_p.encoder.norm_layers_2.5",
204 | ],
205 | [
206 | "enc_p.emb_phone",
207 | "enc_p.emb_pitch",
208 | ],
209 | )
210 |
211 | create_weight_ui(
212 | "dec",
213 | [
214 | "dec.noise_convs.0",
215 | "dec.noise_convs.1",
216 | "dec.noise_convs.2",
217 | "dec.noise_convs.3",
218 | "dec.noise_convs.4",
219 | "dec.noise_convs.5",
220 | "dec.ups.0",
221 | "dec.ups.1",
222 | "dec.ups.2",
223 | "dec.ups.3",
224 | ],
225 | [
226 | "dec.resblocks.0",
227 | "dec.resblocks.1",
228 | "dec.resblocks.2",
229 | "dec.resblocks.3",
230 | "dec.resblocks.4",
231 | "dec.resblocks.5",
232 | "dec.resblocks.6",
233 | "dec.resblocks.7",
234 | "dec.resblocks.8",
235 | "dec.resblocks.9",
236 | "dec.resblocks.10",
237 | "dec.resblocks.11",
238 | ],
239 | [
240 | "dec.m_source.l_linear",
241 | "dec.conv_pre",
242 | "dec.conv_post",
243 | "dec.cond",
244 | ],
245 | )
246 |
247 | create_weight_ui(
248 | "flow",
249 | [
250 | "flow.flows.0",
251 | "flow.flows.1",
252 | "flow.flows.2",
253 | "flow.flows.3",
254 | "flow.flows.4",
255 | "flow.flows.5",
256 | "flow.flows.6",
257 | "emb_g.weight",
258 | ],
259 | )
260 |
261 | with gr.Accordion(label="JSON", open=False):
262 | weights_text = gr.TextArea(
263 | value=json.dumps(default_weights),
264 | )
265 |
266 | with gr.Accordion(label="Inference options", open=False):
267 | with gr.Row(equal_height=False):
268 | speaker_id = gr.Slider(
269 | minimum=0,
270 | maximum=2333,
271 | step=1,
272 | label="Speaker ID",
273 | value=0,
274 | visible=True,
275 | interactive=True,
276 | )
277 | (
278 | source_audio,
279 | _,
280 | transpose,
281 | embedder_name,
282 | embedding_output_layer,
283 | pitch_extraction_algo,
284 | auto_load_index,
285 | faiss_index_file,
286 | retrieval_feature_ratio,
287 | fo_curve_file,
288 | ) = inference_options_ui(show_out_dir=False)
289 |
290 | with gr.Row(equal_height=False):
291 | with gr.Column():
292 | status = gr.Textbox(value="", label="Status")
293 | audio_output = gr.Audio(label="Output", interactive=False)
294 |
295 | with gr.Row(equal_height=False):
296 | merge_and_save_button = gr.Button(
297 | "Merge and save", variant="primary"
298 | )
299 | merge_and_gen_button = gr.Button("Merge and gen", variant="primary")
300 |
301 | def each_key_on_change(each_key):
302 | return gr.update(visible=each_key)
303 |
304 | each_key.change(
305 | fn=each_key_on_change,
306 | inputs=[each_key],
307 | outputs=[each_key_ui],
308 | )
309 |
310 | def update_weights_text(data):
311 | d = {}
312 | for key in weights.keys():
313 | d[key] = data[weights[key]]
314 | return json.dumps(d)
315 |
316 | for w in weights.values():
317 | w.change(
318 | fn=update_weights_text,
319 | inputs={*weights.values()},
320 | outputs=[weights_text],
321 | )
322 |
323 | merge_data = [
324 | model_a,
325 | model_b,
326 | model_c,
327 | base_alpha,
328 | each_key,
329 | weights_text,
330 | method,
331 | ]
332 |
333 | inference_opts = [
334 | speaker_id,
335 | source_audio,
336 | embedder_name,
337 | embedding_output_layer,
338 | transpose,
339 | fo_curve_file,
340 | pitch_extraction_algo,
341 | auto_load_index,
342 | faiss_index_file,
343 | retrieval_feature_ratio,
344 | ]
345 |
346 | merge_and_save_button.click(
347 | fn=merge_and_save,
348 | inputs=[
349 | *merge_data,
350 | output_name,
351 | ],
352 | outputs=[status],
353 | )
354 | merge_and_gen_button.click(
355 | fn=merge_and_gen,
356 | inputs=[
357 | *merge_data,
358 | *inference_opts,
359 | ],
360 | outputs=[status, audio_output],
361 | )
362 |
363 | model_a.change(
364 | update_speaker_ids, inputs=[model_a], outputs=[speaker_id]
365 | )
366 |
--------------------------------------------------------------------------------
/modules/tabs/server.py:
--------------------------------------------------------------------------------
1 | import io
2 | import json
3 |
4 | import gradio as gr
5 | import requests
6 | import soundfile as sf
7 | import torch.multiprocessing as multiprocessing
8 | from scipy.io.wavfile import write
9 |
10 | from modules.ui import Tab
11 | from server import app
12 |
13 | proc = None
14 |
15 | def server_options_ui(show_out_dir=True):
16 | with gr.Row().style(equal_height=False):
17 | with gr.Row():
18 | host = gr.Textbox(value="127.0.0.1", label="host")
19 | port = gr.Textbox(value="5001", label="port")
20 | with gr.Row().style(equal_height=False):
21 | with gr.Row():
22 | rvc_model_file = gr.Textbox(value="", label="RVC model file path")
23 | faiss_index_file = gr.Textbox(value="", label="Faiss index file path")
24 | with gr.Row().style(equal_height=False):
25 | with gr.Row():
26 | input_voice_file = gr.Textbox(value="", label="input voice file path")
27 | speaker_id = gr.Number(
28 | value=0,
29 | label="speaker_id",
30 | )
31 | transpose = gr.Slider(
32 | minimum=-20, maximum=20, value=0, step=1, label="transpose"
33 | )
34 | pitch_extraction_algo = gr.Radio(
35 | choices=["dio", "harvest", "mangio-crepe", "crepe"],
36 | value="crepe",
37 | label="pitch_extraction_algo",
38 | )
39 | retrieval_feature_ratio = gr.Slider(
40 | minimum=0,
41 | maximum=1,
42 | value=1,
43 | step=0.01,
44 | label="retrieval_feature_ratio",
45 | )
46 | return (
47 | host,
48 | port,
49 | rvc_model_file,
50 | faiss_index_file,
51 | input_voice_file,
52 | speaker_id,
53 | transpose,
54 | pitch_extraction_algo,
55 | retrieval_feature_ratio,
56 | )
57 |
58 | def run(**kwargs):
59 | app.run(**kwargs)
60 |
61 | class Server(Tab):
62 | def title(self):
63 | return "Server(experimental)"
64 |
65 | def sort(self):
66 | return 6
67 |
68 | def ui(self, outlet):
69 | def start(host, port):
70 | if multiprocessing.get_start_method() == 'fork':
71 | multiprocessing.set_start_method('spawn', force=True)
72 | proc = multiprocessing.Process(target = run, kwargs = {'host': host, 'port': port})
73 | proc.start()
74 | yield "start server"
75 |
76 | def upload(host, port, rvc_model_file, faiss_index_file):
77 | file_names = {"rvc_model_file": rvc_model_file, "faiss_index_file": faiss_index_file}
78 | res = requests.post(f"http://{host}:{port}/upload_model", json=file_names)
79 | yield res.text
80 |
81 | def convert(host, port, input_voice_file, speaker_id, transpose, pitch_extraction_algo, retrieval_feature_ratio):
82 | params = {
83 | "speaker_id": speaker_id,
84 | "transpose": transpose,
85 | "pitch_extraction_algo": pitch_extraction_algo,
86 | "retrieval_feature_ratio": retrieval_feature_ratio
87 | }
88 |
89 | audio, sr = sf.read(input_voice_file)
90 | audio_buffer = io.BytesIO()
91 |             write(audio_buffer, rate=sr, data=audio); audio_buffer.seek(0)  # rewind so the upload contains the WAV
92 | json_buffer = io.BytesIO(json.dumps(params).encode('utf-8'))
93 | files = {
94 | "input_wav": audio_buffer,
95 | "params": json_buffer
96 | }
97 | res = requests.post(f"http://{host}:{port}/convert_sound", files=files)
98 | audio, sr = sf.read(io.BytesIO(res.content))
99 |             yield "Conversion succeeded", (sr, audio)
100 |
101 | with gr.Group():
102 | with gr.Box():
103 | with gr.Column():
104 | (
105 | host,
106 | port,
107 | rvc_model_file,
108 | faiss_index_file,
109 | input_voice_file,
110 | speaker_id,
111 | transpose,
112 | pitch_extraction_algo,
113 | retrieval_feature_ratio,
114 | ) = server_options_ui()
115 |
116 | with gr.Row().style(equal_height=False):
117 | with gr.Column():
118 | status = gr.Textbox(value="", label="Status")
119 | output = gr.Audio(label="Output", interactive=False)
120 |
121 | with gr.Row():
122 | start_button = gr.Button("Start server", variant="primary")
123 | upload_button = gr.Button("Upload Model")
124 | convert_button = gr.Button("Convert Voice")
125 |
126 | start_button.click(
127 | start,
128 | inputs=[
129 | host,
130 | port
131 | ],
132 | outputs=[status],
133 | queue=True,
134 | )
135 | upload_button.click(
136 | upload,
137 | inputs=[
138 | host,
139 | port,
140 | rvc_model_file,
141 | faiss_index_file
142 | ],
143 | outputs=[status],
144 | queue=True,
145 | )
146 | convert_button.click(
147 | convert,
148 | inputs=[
149 | host,
150 | port,
151 | input_voice_file,
152 | speaker_id,
153 | transpose,
154 | pitch_extraction_algo,
155 | retrieval_feature_ratio
156 | ],
157 | outputs=[status, output],
158 | queue=True,
159 | )
160 |
--------------------------------------------------------------------------------
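
Note: the tab above doubles as a reference client for the experimental Flask server (modules/server): `/upload_model` takes a JSON body with the checkpoint and index paths, and `/convert_sound` takes multipart form data with the WAV bytes plus a JSON `params` part. A hedged stand-alone sketch of the same two calls, with example host, port and paths:

import io
import json

import requests
import soundfile as sf

base = "http://127.0.0.1:5001"

# 1. Tell the server which checkpoint / faiss index to load.
requests.post(f"{base}/upload_model", json={
    "rvc_model_file": "models/checkpoints/my_voice.pth",
    "faiss_index_file": "",
})

# 2. Send the audio and the conversion parameters as multipart form data.
audio, sr = sf.read("samples/input.wav")
wav_buf = io.BytesIO()
sf.write(wav_buf, audio, sr, format="WAV")
wav_buf.seek(0)
params = {
    "speaker_id": 0,
    "transpose": 0,
    "pitch_extraction_algo": "crepe",
    "retrieval_feature_ratio": 1.0,
}
res = requests.post(f"{base}/convert_sound", files={
    "input_wav": wav_buf,
    "params": io.BytesIO(json.dumps(params).encode("utf-8")),
})
converted, out_sr = sf.read(io.BytesIO(res.content))
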
/modules/tabs/split.py:
--------------------------------------------------------------------------------
1 | import gradio as gr
2 |
3 | from modules.separate import separate_audio
4 | from modules.ui import Tab
5 |
6 |
7 | class Split(Tab):
8 | def title(self):
9 | return "Split Audio"
10 |
11 | def sort(self):
12 | return 5
13 |
14 | def ui(self, outlet):
15 | def separate(
16 | input_audio,
17 | output_dir,
18 | silence_thresh,
19 | min_silence_len,
20 | keep_silence,
21 | margin,
22 | padding,
23 | min,
24 | max,
25 | ):
26 | min = None if min == 0 else min
27 | max = None if max == 0 else max
28 | separate_audio(
29 | input_audio,
30 | output_dir,
31 | int(silence_thresh),
32 | int(min_silence_len),
33 | int(keep_silence),
34 | int(margin),
35 | padding,
36 |                 int(min) if min is not None else None,
37 |                 int(max) if max is not None else None,
38 | )
39 | return "Success"
40 |
41 | with gr.Group():
42 | with gr.Column():
43 | with gr.Row(equal_height=False):
44 | input_audio = gr.Textbox(label="Input Audio (File or Directory)")
45 | output_dir = gr.Textbox(label="Output Directory")
46 |
47 | with gr.Row(equal_height=False):
48 | silence_thresh = gr.Number(value=-40, label="Silence Threshold")
49 | min_silence_len = gr.Number(
50 | value=750, label="Minimum Silence Length"
51 | )
52 | keep_silence = gr.Number(value=750, label="Keep Silence")
53 | margin = gr.Number(value=0, label="Margin")
54 | padding = gr.Checkbox(value=True, label="Padding")
55 |
56 | with gr.Row(equal_height=False):
57 | min = gr.Number(value=1000, label="Minimum audio length")
58 | max = gr.Number(value=5000, label="Maximum audio length")
59 |
60 | with gr.Row(equal_height=False):
61 | status = gr.Textbox(value="", label="Status")
62 | with gr.Row(equal_height=False):
63 | separate_button = gr.Button("Separate", variant="primary")
64 |
65 | separate_button.click(
66 | separate,
67 | inputs=[
68 | input_audio,
69 | output_dir,
70 | silence_thresh,
71 | min_silence_len,
72 | keep_silence,
73 | margin,
74 | padding,
75 | min,
76 | max,
77 | ],
78 | outputs=[status],
79 | )
80 |
--------------------------------------------------------------------------------
/modules/ui.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import os
3 | from typing import *
4 |
5 | import gradio as gr
6 | import gradio.routes
7 | import torch
8 |
9 | from . import models, shared
10 | from .core import preload
11 | from .shared import ROOT_DIR
12 |
13 |
14 | class Tab:
15 | TABS_DIR = os.path.join(ROOT_DIR, "modules", "tabs")
16 |
17 | def __init__(self, filepath: str) -> None:
18 | self.filepath = filepath
19 |
20 | def sort(self):
21 | return 1
22 |
23 | def title(self):
24 | return ""
25 |
26 | def ui(self, outlet: Callable):
27 | pass
28 |
29 | def __call__(self):
30 | children_dir = self.filepath[:-3]
31 | children = []
32 |
33 | if os.path.isdir(children_dir):
34 | for file in os.listdir(children_dir):
35 | if not file.endswith(".py"):
36 | continue
37 | module_name = file[:-3]
38 |                 parent = os.path.relpath(children_dir, Tab.TABS_DIR).replace(os.sep, ".")
39 |
40 | if parent.startswith("."):
41 | parent = parent[1:]
42 | if parent.endswith("."):
43 | parent = parent[:-1]
44 |
45 | children.append(
46 | importlib.import_module(f"modules.tabs.{parent}.{module_name}")
47 | )
48 |
49 | children = sorted(children, key=lambda x: x.sort())
50 |
51 | tabs = []
52 |
53 | for child in children:
54 | attrs = child.__dict__
55 |             tab = [x for x in attrs.values() if type(x) == type and issubclass(x, Tab) and x is not Tab]
56 | if len(tab) > 0:
57 | tabs.append(tab[0])
58 |
59 | def outlet():
60 | with gr.Tabs():
61 | for tab in tabs:
62 | with gr.Tab(tab.title()):
63 | tab()
64 |
65 | return self.ui(outlet)
66 |
67 |
68 | def load_tabs() -> List[Tab]:
69 | tabs = []
70 | files = os.listdir(os.path.join(ROOT_DIR, "modules", "tabs"))
71 |
72 | for file in files:
73 | if not file.endswith(".py"):
74 | continue
75 | module_name = file[:-3]
76 | module = importlib.import_module(f"modules.tabs.{module_name}")
77 | attrs = module.__dict__
78 | TabClass = [
79 | x
80 | for x in attrs.values()
81 | if type(x) == type and issubclass(x, Tab) and not x == Tab
82 | ]
83 | if len(TabClass) > 0:
84 | tabs.append((file, TabClass[0]))
85 |
86 | tabs = sorted([TabClass(file) for file, TabClass in tabs], key=lambda x: x.sort())
87 | return tabs
88 |
89 |
90 | def webpath(fn):
91 | if fn.startswith(ROOT_DIR):
92 | web_path = os.path.relpath(fn, ROOT_DIR).replace("\\", "/")
93 | else:
94 | web_path = os.path.abspath(fn)
95 |
96 | return f"file={web_path}?{os.path.getmtime(fn)}"
97 |
98 |
99 | def javascript_html():
100 | script_js = os.path.join(ROOT_DIR, "script.js")
101 |     head = f'<script type="text/javascript" src="{webpath(script_js)}"></script>\n'
102 |
103 | return head
104 |
105 |
106 | def css_html():
107 |     return f'<link rel="stylesheet" href="{webpath(os.path.join(ROOT_DIR, "styles.css"))}">'
108 |
109 |
110 | def create_head():
111 | head = ""
112 | head += css_html()
113 | head += javascript_html()
114 |
115 | def template_response(*args, **kwargs):
116 | res = shared.gradio_template_response_original(*args, **kwargs)
117 |         res.body = res.body.replace(b"</head>", f"{head}</head>".encode("utf8"))
118 | res.init_headers()
119 | return res
120 |
121 | gradio.routes.templates.TemplateResponse = template_response
122 |
123 |
124 | def create_ui():
125 | preload()
126 | block = gr.Blocks()
127 |
128 | with block:
129 | with gr.Tabs():
130 | tabs = load_tabs()
131 | for tab in tabs:
132 | with gr.Tab(tab.title()):
133 | tab()
134 |
135 | create_head()
136 |
137 | return block
138 |
139 |
140 | def create_model_list_ui(speaker_id: bool = True, load: bool = True):
141 | speaker_id_info = {
142 | "visible": False,
143 | "maximum": 10000,
144 | }
145 |
146 | def reload_model(raw=False):
147 | model_list = models.get_models()
148 | if len(model_list) > 0:
149 | models.load_model(model_list[0])
150 |
151 | if models.vc_model is not None:
152 | speaker_id_info["visible"] = True
153 | speaker_id_info["maximum"] = models.vc_model.n_spk
154 |
155 | return model_list if raw else gr.Dropdown.update(choices=model_list)
156 |
157 | model_list = reload_model(raw=True)
158 |
159 | def load_model(model_name):
160 | if load:
161 | models.load_model(model_name)
162 | speaker_id_info["visible"] = True
163 | speaker_id_info["maximum"] = models.vc_model.n_spk
164 | else:
165 | model = models.get_vc_model(model_name)
166 | speaker_id_info["visible"] = True
167 | speaker_id_info["maximum"] = model.n_spk
168 | del model
169 | torch.cuda.empty_cache()
170 | return gr.Slider.update(
171 | maximum=speaker_id_info["maximum"], visible=speaker_id_info["visible"]
172 | )
173 |
174 | with gr.Row(equal_height=False):
175 | model = gr.Dropdown(
176 | choices=model_list,
177 | label="Model",
178 | value=model_list[0] if len(model_list) > 0 else None,
179 | )
180 | speaker_id = gr.Slider(
181 | minimum=0,
182 | maximum=speaker_id_info["maximum"],
183 | step=1,
184 | label="Speaker ID",
185 | value=0,
186 | visible=speaker_id and speaker_id_info["visible"],
187 | interactive=True,
188 | )
189 | reload_model_button = gr.Button("♻️")
190 |
191 | model.change(load_model, inputs=[model], outputs=[speaker_id])
192 | reload_model_button.click(reload_model, outputs=[model])
193 |
194 | return model, speaker_id
195 |
196 |
197 | if not hasattr(shared, "gradio_template_response_original"):
198 | shared.gradio_template_response_original = gradio.routes.templates.TemplateResponse
199 |
--------------------------------------------------------------------------------
/modules/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import *
3 |
4 | import ffmpeg
5 | import numpy as np
6 | import requests
7 | import torch
8 | from tqdm import tqdm
9 |
10 | from lib.rvc.config import TrainConfig
11 | from modules.shared import ROOT_DIR
12 |
13 |
14 | def load_audio(file: str, sr):
15 | try:
16 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
17 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
18 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
19 | file = (
20 | file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
21 |         )  # Strip stray spaces, double quotes, and newlines that are often copied along with the path
22 | out, _ = (
23 | ffmpeg.input(file, threads=0)
24 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
25 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
26 | )
27 | except Exception as e:
28 | raise RuntimeError(f"Failed to load audio: {e}")
29 |
30 | return np.frombuffer(out, np.float32).flatten()
31 |
32 |
33 | def get_gpus():
34 | num_gpus = torch.cuda.device_count()
35 | return [torch.device(f"cuda:{i}") for i in range(num_gpus)]
36 |
37 |
38 | def download_file(url: str, out: str, position: int = 0, show: bool = True):
39 | req = requests.get(url, stream=True, allow_redirects=True)
40 | content_length = req.headers.get("content-length")
41 | if show:
42 | progress_bar = tqdm(
43 | total=int(content_length) if content_length is not None else None,
44 | leave=False,
45 | unit="B",
46 | unit_scale=True,
47 | unit_divisor=1024,
48 | position=position,
49 | )
50 |
51 |     # Stream the response to disk in 1 KiB chunks, updating the progress bar if shown
52 | with open(out, "wb") as f:
53 | for chunk in req.iter_content(chunk_size=1024):
54 | if chunk:
55 | if show:
56 | progress_bar.update(len(chunk))
57 | f.write(chunk)
58 |
59 |
60 | def load_config(
61 | version: Literal["v1", "v2"],
62 | training_dir: str,
63 | sample_rate: str,
64 | emb_channels: int,
65 | fp16: bool,
66 | ):
67 | if emb_channels == 256:
68 | config_path = os.path.join(ROOT_DIR, "configs", f"{sample_rate}.json")
69 | else:
70 | config_path = os.path.join(
71 | ROOT_DIR, "configs", f"{sample_rate}-{emb_channels}.json"
72 | )
73 |
74 | config = TrainConfig.parse_file(config_path)
75 | config.version = version
76 | config.train.fp16_run = fp16
77 |
78 | config_save_path = os.path.join(training_dir, "config.json")
79 |
80 | with open(config_save_path, "w") as f:
81 | f.write(config.json())
82 |
83 | return config
84 |
--------------------------------------------------------------------------------
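
For orientation, here is a short usage sketch of the helpers in modules/utils.py above; the paths and URL are placeholders rather than values used anywhere in the project.

    # Illustrative only: the URL and file paths below are placeholders.
    from modules.utils import download_file, get_gpus, load_audio, load_config

    # Decode any ffmpeg-readable file to a mono float32 array at the requested rate.
    audio = load_audio("samples/input.wav", 16000)
    print(audio.shape, audio.dtype)  # (n_samples,), float32

    # Enumerate the CUDA devices training can be distributed across.
    print(get_gpus())  # e.g. [device(type='cuda', index=0)]

    # Stream a file to disk with a tqdm progress bar.
    download_file("https://example.com/pretrained.pth", "models/pretrained/example.pth")

    # Pick configs/40k-768.json for a 40 kHz, 768-channel-embedding run and write a
    # copy to <training_dir>/config.json (the directory must already exist).
    config = load_config("v2", "models/training/example-run", "40k", 768, fp16=True)
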
/outputs/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | -r requirements/main.txt
--------------------------------------------------------------------------------
/requirements/dev.txt:
--------------------------------------------------------------------------------
1 | # -r main.txt
2 |
3 | black
4 | isort
--------------------------------------------------------------------------------
/requirements/main.txt:
--------------------------------------------------------------------------------
1 | gradio==3.36.1
2 | tqdm==4.65.0
3 | numpy==1.23.5
4 | faiss-cpu==1.7.3
5 | fairseq==0.12.2
6 | matplotlib==3.7.1
7 | scipy==1.9.3
8 | librosa==0.9.1
9 | pyworld==0.3.2
10 | soundfile==0.12.1
11 | ffmpeg-python==0.2.0
12 | pydub==0.25.1
13 | soxr==0.3.5
14 | transformers==4.28.1
15 | torchcrepe==0.0.20
16 | Flask==2.3.2
17 |
18 | tensorboard
19 | tensorboardX
20 | requests
--------------------------------------------------------------------------------
/script.js:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/script.js
--------------------------------------------------------------------------------
/server.py:
--------------------------------------------------------------------------------
1 | import io
2 | import json
3 | import os
4 | import traceback
5 | from typing import *
6 |
7 | import soundfile as sf
8 | from flask import Flask, make_response, request, send_file
9 | from scipy.io.wavfile import write
10 |
11 | from modules.server.model import VoiceServerModel
12 |
13 | model: Optional[VoiceServerModel] = None
14 | app = Flask(__name__)
15 |
16 | @app.route('/ping')
17 | def ping():
18 | return make_response("server is alive", 200)
19 |
20 | @app.route('/upload_model', methods=['POST'])
21 | def upload_model():
22 | """
23 | input:
24 | json:
25 | rvc_model_file: str
26 |             specify the RVC model's absolute path (.pt, .pth)
27 |         faiss_index_file: Optional[str]
28 |             specify the faiss index's absolute path (.index)
29 | """
30 | global model
31 | if request.method == "POST":
32 | rvc_model_file = request.json["rvc_model_file"]
33 |         faiss_index_file = request.json.get("faiss_index_file", "")
34 | try:
35 | model = VoiceServerModel(rvc_model_file, faiss_index_file)
36 |             return make_response("model loaded", 200)
37 |         except Exception:
38 | traceback.print_exc()
39 | return make_response("model load error", 400)
40 | else:
41 | return make_response("use post method", 400)
42 |
43 | @app.route('/convert_sound', methods=['POST'])
44 | def convert_sound():
45 | """
46 | input:
47 | params: json
48 | speaker_id: int
49 | default: 0
50 | transpose: int
51 | default: 0
52 | pitch_extraction_algo: str
53 | default: dio
54 | value: ["dio", "harvest", "mangio-crepe", "crepe"]
55 | retrieval_feature_ratio: float
56 | default: 0
57 | value: 0. ~ 1.
58 | input_wav: wav file
59 |
60 | output:
61 | wavfile
62 | """
63 | global model
64 | if model is None:
65 | return make_response("please upload model", 400)
66 | print("start")
67 | if request.method == "POST":
68 | input_buffer = io.BytesIO(request.files["input_wav"].stream.read())
69 | audio, sr = sf.read(input_buffer)
70 |
71 | req_json = json.load(io.BytesIO(request.files["params"].stream.read()))
72 | sid = int(req_json.get("speaker_id", 0))
73 | transpose = int(req_json.get("transpose", 0))
74 | pitch_extraction_algo = req_json.get("pitch_extraction_algo", "dio")
75 |         if pitch_extraction_algo not in ["dio", "harvest", "mangio-crepe", "crepe"]:
76 | return make_response("bad pitch extraction algo", 400)
77 | retrieval_feature_ratio = float(req_json.get("retrieval_feature_ratio", 0.))
78 |
79 | out_audio = model(audio, sr, sid, transpose, pitch_extraction_algo, retrieval_feature_ratio)
80 | output_buffer = io.BytesIO()
81 | write(output_buffer, rate=model.tgt_sr, data=out_audio)
82 | output_buffer.seek(0)
83 | response = make_response(send_file(output_buffer, mimetype="audio/wav"), 200)
84 | return response
85 | else:
86 | return make_response("use post method", 400)
87 |
88 | if __name__ == "__main__":
89 | app.run()
--------------------------------------------------------------------------------
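
The Flask API in server.py above takes JSON on /upload_model and two multipart file fields, input_wav and params, on /convert_sound. A hedged client sketch using requests follows; the model path, input file, and port are placeholders (Flask's development server defaults to http://127.0.0.1:5000).

    # Illustrative client for server.py; all paths are placeholders.
    import io
    import json

    import requests

    BASE = "http://127.0.0.1:5000"

    # 1. Load a model on the server (faiss_index_file may be omitted).
    r = requests.post(
        f"{BASE}/upload_model",
        json={"rvc_model_file": "/absolute/path/to/model.pth"},
    )
    print(r.status_code, r.text)

    # 2. Convert a wav file. Both parts are sent as multipart file fields,
    #    mirroring request.files["input_wav"] / request.files["params"] above.
    params = {
        "speaker_id": 0,
        "transpose": 0,
        "pitch_extraction_algo": "dio",
        "retrieval_feature_ratio": 0.0,
    }
    with open("input.wav", "rb") as wav:
        r = requests.post(
            f"{BASE}/convert_sound",
            files={
                "input_wav": ("input.wav", wav, "audio/wav"),
                "params": ("params.json", io.BytesIO(json.dumps(params).encode()), "application/json"),
            },
        )
    with open("output.wav", "wb") as out:
        out.write(r.content)  # the response body is the converted wav
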
/styles.css:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/styles.css
--------------------------------------------------------------------------------
/update.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 |
3 | if exist ".git" (
4 | git fetch --prune
5 | git reset --hard origin/main
6 | ) else (
7 | git init
8 | git remote add origin https://github.com/ddPn08/rvc-webui.git
9 | git fetch --prune
10 | git reset --hard origin/main
11 | )
12 |
13 | pause
--------------------------------------------------------------------------------
/update.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | if [ -d .git ]; then
3 | git fetch --prune
4 | git reset --hard origin/main
5 | else
6 | git init
7 |     git remote add origin https://github.com/ddPn08/rvc-webui.git
8 |     git fetch --prune
9 |     git reset --hard origin/main
10 | fi
--------------------------------------------------------------------------------
/webui-macos-env.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | ####################################################################
3 | # macOS defaults #
4 | # Please modify webui-user.sh to change these instead of this file #
5 | ####################################################################
6 |
7 | if [[ -x "$(command -v python3.10)" ]]
8 | then
9 | python_cmd="python3.10"
10 | fi
11 |
12 | export COMMANDLINE_ARGS=""
13 | export TORCH_COMMAND="pip install torch torchvision torchaudio"
14 | export PYTORCH_ENABLE_MPS_FALLBACK=1
15 |
16 | ####################################################################
--------------------------------------------------------------------------------
/webui-user.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 |
3 | set PYTHON=
4 | set GIT=
5 | set VENV_DIR=
6 | set COMMANDLINE_ARGS=
7 |
8 | call webui.bat
--------------------------------------------------------------------------------
/webui-user.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #########################################################
3 | # Uncomment and change the variables below to your need:#
4 | #########################################################
5 |
6 | # Commandline arguments for webui.py (see modules/cmd_opts.py for the available flags)
7 | #export COMMANDLINE_ARGS=""
8 |
9 | # python3 executable
10 | #python_cmd="python3"
11 |
12 | # git executable
13 | #export GIT="git"
14 |
15 | # python3 venv without trailing slash (defaults to ${install_dir}/${clone_dir}/venv)
16 | #venv_dir="venv"
17 |
18 | # script to launch to start the app
19 | #export LAUNCH_SCRIPT="launch.py"
20 |
21 | # install command for torch
22 | #export TORCH_COMMAND="pip install torch --extra-index-url https://download.pytorch.org/whl/cu118"
23 |
24 | # Requirements file to use for the web UI
25 | #export REQS_FILE="requirements_versions.txt"
26 |
27 | ###########################################
--------------------------------------------------------------------------------
/webui.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 |
3 | if not defined PYTHON (set PYTHON=python)
4 | if not defined VENV_DIR (set "VENV_DIR=%~dp0%venv")
5 |
6 |
7 | set ERROR_REPORTING=FALSE
8 |
9 | mkdir tmp 2>NUL
10 |
11 | %PYTHON% -c "" >tmp/stdout.txt 2>tmp/stderr.txt
12 | if %ERRORLEVEL% == 0 goto :check_pip
13 | echo Couldn't launch python
14 | goto :show_stdout_stderr
15 |
16 | :check_pip
17 | %PYTHON% -mpip --help >tmp/stdout.txt 2>tmp/stderr.txt
18 | if %ERRORLEVEL% == 0 goto :start_venv
19 | if "%PIP_INSTALLER_LOCATION%" == "" goto :show_stdout_stderr
20 | %PYTHON% "%PIP_INSTALLER_LOCATION%" >tmp/stdout.txt 2>tmp/stderr.txt
21 | if %ERRORLEVEL% == 0 goto :start_venv
22 | echo Couldn't install pip
23 | goto :show_stdout_stderr
24 |
25 | :start_venv
26 | if ["%VENV_DIR%"] == ["-"] goto :launch
27 | if ["%SKIP_VENV%"] == ["1"] goto :launch
28 |
29 | dir "%VENV_DIR%\Scripts\Python.exe" >tmp/stdout.txt 2>tmp/stderr.txt
30 | if %ERRORLEVEL% == 0 goto :activate_venv
31 |
32 | for /f "delims=" %%i in ('CALL %PYTHON% -c "import sys; print(sys.executable)"') do set PYTHON_FULLNAME="%%i"
33 | echo Creating venv in directory %VENV_DIR% using python %PYTHON_FULLNAME%
34 | %PYTHON_FULLNAME% -m venv "%VENV_DIR%" >tmp/stdout.txt 2>tmp/stderr.txt
35 | if %ERRORLEVEL% == 0 goto :activate_venv
36 | echo Unable to create venv in directory "%VENV_DIR%"
37 | goto :show_stdout_stderr
38 |
39 | :activate_venv
40 | set PYTHON="%VENV_DIR%\Scripts\Python.exe"
41 | echo venv %PYTHON%
42 |
43 | :launch
44 | %PYTHON% launch.py %*
45 | pause
46 | exit /b
47 |
48 | :show_stdout_stderr
49 |
50 | echo.
51 | echo exit code: %errorlevel%
52 |
53 | for /f %%i in ("tmp\stdout.txt") do set size=%%~zi
54 | if %size% equ 0 goto :show_stderr
55 | echo.
56 | echo stdout:
57 | type tmp\stdout.txt
58 |
59 | :show_stderr
60 | for /f %%i in ("tmp\stderr.txt") do set size=%%~zi
61 | if %size% equ 0 goto :endofscript
62 | echo.
63 | echo stderr:
64 | type tmp\stderr.txt
65 |
66 | :endofscript
67 |
68 | echo.
69 | echo Launch unsuccessful. Exiting.
70 | pause
--------------------------------------------------------------------------------
/webui.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from modules import cmd_opts, ui
4 |
5 | # Ignore the ".DS_Store" files that keep appearing out of nowhere on macOS.
6 | # Not sure this is the right place for this code, though...
7 | _list_dir = os.listdir
8 |
9 | def listdir4mac(path):
10 | return [file for file in _list_dir(path) if not file.startswith(".")]
11 |
12 | os.listdir = listdir4mac
13 |
14 |
15 | def webui():
16 | app = ui.create_ui()
17 | app.queue(64)
18 | app, local_url, share_url = app.launch(
19 | server_name=cmd_opts.opts.host,
20 | server_port=cmd_opts.opts.port,
21 | share=cmd_opts.opts.share,
22 | )
23 |
24 |
25 | if __name__ == "__main__":
26 | webui()
27 |
--------------------------------------------------------------------------------
/webui.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #################################################
3 | # Please do not make any changes to this file, #
4 | # change the variables in webui-user.sh instead #
5 | #################################################
6 |
7 | # If run from macOS, load defaults from webui-macos-env.sh
8 | if [[ "$OSTYPE" == "darwin"* ]]; then
9 | if [[ -f webui-macos-env.sh ]]
10 | then
11 | source ./webui-macos-env.sh
12 | fi
13 | fi
14 |
15 | # Read variables from webui-user.sh
16 | # shellcheck source=/dev/null
17 | if [[ -f webui-user.sh ]]
18 | then
19 | source ./webui-user.sh
20 | fi
21 |
22 | # python3 executable
23 | if [[ -z "${python_cmd}" ]]
24 | then
25 | python_cmd="python3"
26 | fi
27 |
28 | # git executable
29 | if [[ -z "${GIT}" ]]
30 | then
31 | export GIT="git"
32 | fi
33 |
34 | # python3 venv without trailing slash (defaults to ${install_dir}/${clone_dir}/venv)
35 | if [[ -z "${venv_dir}" ]]
36 | then
37 | venv_dir="venv"
38 | fi
39 |
40 | if [[ -z "${LAUNCH_SCRIPT}" ]]
41 | then
42 | LAUNCH_SCRIPT="launch.py"
43 | fi
44 |
45 | # this script cannot be run as root by default
46 | can_run_as_root=0
47 |
48 | # read any command line flags to the webui.sh script
49 | while getopts "f" flag > /dev/null 2>&1
50 | do
51 | case ${flag} in
52 | f) can_run_as_root=1;;
53 | *) break;;
54 | esac
55 | done
56 |
57 | # Disable sentry logging
58 | export ERROR_REPORTING=FALSE
59 |
60 | # Do not reinstall existing pip packages on Debian/Ubuntu
61 | export PIP_IGNORE_INSTALLED=0
62 |
63 | # Pretty print
64 | delimiter="################################################################"
65 |
66 | # Do not run as root
67 | if [[ $(id -u) -eq 0 && can_run_as_root -eq 0 ]]
68 | then
69 | printf "\n%s\n" "${delimiter}"
70 | printf "\e[1m\e[31mERROR: This script must not be launched as root, aborting...\e[0m"
71 | printf "\n%s\n" "${delimiter}"
72 | exit 1
73 | else
74 | printf "\n%s\n" "${delimiter}"
75 | printf "Running on \e[1m\e[32m%s\e[0m user" "$(whoami)"
76 | printf "\n%s\n" "${delimiter}"
77 | fi
78 |
79 | if echo "$gpu_info" | grep -q "AMD" && [[ -z "${TORCH_COMMAND}" ]]
80 | then
81 | export TORCH_COMMAND="pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.2"
82 | fi
83 |
84 | for preq in "${GIT}" "${python_cmd}"
85 | do
86 | if ! hash "${preq}" &>/dev/null
87 | then
88 | printf "\n%s\n" "${delimiter}"
89 | printf "\e[1m\e[31mERROR: %s is not installed, aborting...\e[0m" "${preq}"
90 | printf "\n%s\n" "${delimiter}"
91 | exit 1
92 | fi
93 | done
94 |
95 | if ! "${python_cmd}" -c "import venv" &>/dev/null
96 | then
97 | printf "\n%s\n" "${delimiter}"
98 | printf "\e[1m\e[31mERROR: python3-venv is not installed, aborting...\e[0m"
99 | printf "\n%s\n" "${delimiter}"
100 | exit 1
101 | fi
102 |
103 | printf "\n%s\n" "${delimiter}"
104 | printf "Create and activate python venv"
105 | printf "\n%s\n" "${delimiter}"
106 | if [[ ! -d "${venv_dir}" ]]
107 | then
108 | "${python_cmd}" -m venv "${venv_dir}"
109 | first_launch=1
110 | fi
111 | # shellcheck source=/dev/null
112 | if [[ -f "${venv_dir}"/bin/activate ]]
113 | then
114 | source "${venv_dir}"/bin/activate
115 | else
116 | printf "\n%s\n" "${delimiter}"
117 | printf "\e[1m\e[31mERROR: Cannot activate python venv, aborting...\e[0m"
118 | printf "\n%s\n" "${delimiter}"
119 | exit 1
120 | fi
121 |
122 | printf "\n%s\n" "${delimiter}"
123 | printf "Launching launch.py..."
124 | printf "\n%s\n" "${delimiter}"
125 | exec "${python_cmd}" "${LAUNCH_SCRIPT}" "$@"
--------------------------------------------------------------------------------