├── .gitignore
├── .vscode
│   └── settings.json
├── LICENSE
├── README-ja.md
├── README.md
├── bin
│   └── .gitignore
├── configs
│   ├── 32k-768.json
│   ├── 32k.json
│   ├── 40k-768.json
│   ├── 40k.json
│   ├── 48k-768.json
│   └── 48k.json
├── dev.py
├── launch.py
├── lib
│   └── rvc
│       ├── attentions.py
│       ├── checkpoints.py
│       ├── commons.py
│       ├── config.py
│       ├── data_utils.py
│       ├── losses.py
│       ├── mel_processing.py
│       ├── models.py
│       ├── modules.py
│       ├── pipeline.py
│       ├── preprocessing
│       │   ├── extract_f0.py
│       │   ├── extract_feature.py
│       │   ├── slicer.py
│       │   └── split.py
│       ├── train.py
│       ├── transforms.py
│       └── utils.py
├── models
│   ├── checkpoints
│   │   └── .gitignore
│   ├── embeddings
│   │   └── .gitignore
│   ├── pretrained
│   │   └── .gitignore
│   └── training
│       ├── .gitignore
│       ├── models
│       │   └── .gitignore
│       └── mute
│           ├── 0_gt_wavs
│           │   ├── mute32k.wav
│           │   ├── mute40k.wav
│           │   └── mute48k.wav
│           ├── 1_16k_wavs
│           │   └── mute.wav
│           ├── 2a_f0
│           │   └── mute.wav.npy
│           ├── 2b_f0nsf
│           │   └── mute.wav.npy
│           └── 3_feature256
│               └── mute.npy
├── modules
│   ├── cmd_opts.py
│   ├── core.py
│   ├── merge.py
│   ├── models.py
│   ├── separate.py
│   ├── server
│   │   └── model.py
│   ├── shared.py
│   ├── tabs
│   │   ├── inference.py
│   │   ├── merge.py
│   │   ├── server.py
│   │   ├── split.py
│   │   └── training.py
│   ├── ui.py
│   └── utils.py
├── outputs
│   └── .gitignore
├── requirements.txt
├── requirements
│   ├── dev.txt
│   └── main.txt
├── script.js
├── server.py
├── styles.css
├── update.bat
├── update.sh
├── webui-macos-env.sh
├── webui-user.bat
├── webui-user.sh
├── webui.bat
├── webui.py
└── webui.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 |
3 | tmp/
4 |
5 |
6 | ### Generated by gibo (https://github.com/simonwhitaker/gibo)
7 | ### https://raw.github.com/github/gitignore/4488915eec0b3a45b5c63ead28f286819c0917de/Global/VisualStudioCode.gitignore
8 |
9 | .vscode/*
10 | !.vscode/settings.json
11 | !.vscode/tasks.json
12 | !.vscode/launch.json
13 | !.vscode/extensions.json
14 | !.vscode/*.code-snippets
15 |
16 | # Local History for Visual Studio Code
17 | .history/
18 |
19 | # Built Visual Studio Code Extensions
20 | *.vsix
21 |
22 |
23 | ### https://raw.github.com/github/gitignore/4488915eec0b3a45b5c63ead28f286819c0917de/Python.gitignore
24 |
25 | # Byte-compiled / optimized / DLL files
26 | __pycache__/
27 | *.py[cod]
28 | *$py.class
29 |
30 | # C extensions
31 | *.so
32 |
33 | # Distribution / packaging
34 | .Python
35 | build/
36 | develop-eggs/
37 | dist/
38 | downloads/
39 | eggs/
40 | .eggs/
41 | # lib/
42 | lib64/
43 | parts/
44 | sdist/
45 | var/
46 | wheels/
47 | share/python-wheels/
48 | *.egg-info/
49 | .installed.cfg
50 | *.egg
51 | MANIFEST
52 |
53 | # PyInstaller
54 | # Usually these files are written by a python script from a template
55 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
56 | *.manifest
57 | *.spec
58 |
59 | # Installer logs
60 | pip-log.txt
61 | pip-delete-this-directory.txt
62 |
63 | # Unit test / coverage reports
64 | htmlcov/
65 | .tox/
66 | .nox/
67 | .coverage
68 | .coverage.*
69 | .cache
70 | nosetests.xml
71 | coverage.xml
72 | *.cover
73 | *.py,cover
74 | .hypothesis/
75 | .pytest_cache/
76 | cover/
77 |
78 | # Translations
79 | *.mo
80 | *.pot
81 |
82 | # Django stuff:
83 | *.log
84 | local_settings.py
85 | db.sqlite3
86 | db.sqlite3-journal
87 |
88 | # Flask stuff:
89 | instance/
90 | .webassets-cache
91 |
92 | # Scrapy stuff:
93 | .scrapy
94 |
95 | # Sphinx documentation
96 | docs/_build/
97 |
98 | # PyBuilder
99 | .pybuilder/
100 | target/
101 |
102 | # Jupyter Notebook
103 | .ipynb_checkpoints
104 |
105 | # IPython
106 | profile_default/
107 | ipython_config.py
108 |
109 | # pyenv
110 | # For a library or package, you might want to ignore these files since the code is
111 | # intended to run in multiple environments; otherwise, check them in:
112 | # .python-version
113 |
114 | # pipenv
115 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
116 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
117 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
118 | # install all needed dependencies.
119 | #Pipfile.lock
120 |
121 | # poetry
122 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
123 | # This is especially recommended for binary packages to ensure reproducibility, and is more
124 | # commonly ignored for libraries.
125 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
126 | #poetry.lock
127 |
128 | # pdm
129 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
130 | #pdm.lock
131 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
132 | # in version control.
133 | # https://pdm.fming.dev/#use-with-ide
134 | .pdm.toml
135 |
136 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
137 | __pypackages__/
138 |
139 | # Celery stuff
140 | celerybeat-schedule
141 | celerybeat.pid
142 |
143 | # SageMath parsed files
144 | *.sage.py
145 |
146 | # Environments
147 | .env
148 | .venv
149 | env/
150 | venv/
151 | ENV/
152 | env.bak/
153 | venv.bak/
154 |
155 | # Spyder project settings
156 | .spyderproject
157 | .spyproject
158 |
159 | # Rope project settings
160 | .ropeproject
161 |
162 | # mkdocs documentation
163 | /site
164 |
165 | # mypy
166 | .mypy_cache/
167 | .dmypy.json
168 | dmypy.json
169 |
170 | # Pyre type checker
171 | .pyre/
172 |
173 | # pytype static type analyzer
174 | .pytype/
175 |
176 | # Cython debug symbols
177 | cython_debug/
178 |
179 | # PyCharm
180 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
181 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
182 | # and can be added to the global gitignore or merged into this file. For a more nuclear
183 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
184 | #.idea/
185 |
186 |
187 |
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "python.formatting.provider": "black",
3 | "editor.codeActionsOnSave": {
4 | "source.organizeImports": true
5 | },
6 | "editor.formatOnSave": true,
7 | }
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 ddPn08
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README-ja.md:
--------------------------------------------------------------------------------
1 | RVC-WebUI
2 |
3 |
4 |
5 | [`liujing04/Retrieval-based-Voice-Conversion-WebUI`](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI) reconstruction project
6 |
7 |
8 |
9 |
10 | ---
11 |
12 |
13 |
14 |
15 | [日本語](README-ja.md) | [English](README.md)
16 |
17 |
18 |
19 |
20 |
21 |
22 | # Launch
23 |
24 | ## Windows
25 | Double click `webui-user.bat` to start the webui.
26 |
27 | ## Linux or Mac
28 | Run `webui.sh` to start the webui.
29 |
30 |
31 |
32 | ```
33 | Tested environment: Windows 10, Python 3.10.9, torch 2.0.0+cu118
34 | ```
35 |
36 |
37 |
38 | # Troubleshooting
39 |
40 | ## `error: Microsoft Visual C++ 14.0 or greater is required.`
41 |
42 | Microsoft C++ Build Tools must be installed.
43 |
44 | ### Step 1: Download the installer
45 | [Download](https://visualstudio.microsoft.com/ja/thank-you-downloading-visual-studio/?sku=BuildTools&rel=16)
46 |
47 | ### Step 2: Install `C++ Build Tools`
48 | Run the installer and select `C++ Build Tools` in the `Workloads` tab.
49 |
50 |
51 |
52 | # Credits
53 | - [`liujing04/Retrieval-based-Voice-Conversion-WebUI`](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI)
54 | - [`teftef6220/Voice_Separation_and_Selection`](https://github.com/teftef6220/Voice_Separation_and_Selection)
55 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | RVC-WebUI
2 |
3 |
4 |
5 | [`liujing04/Retrieval-based-Voice-Conversion-WebUI`](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI) reconstruction project
6 |
7 |
8 |
9 |
10 | ---
11 |
12 |
13 |
14 |
15 | [日本語](README-ja.md) | [English](README.md)
16 |
17 |
18 |
19 |
20 |
21 |
22 | # Launch
23 |
24 | ## Windows
25 | Double click `webui-user.bat` to start the webui.
26 |
27 | ## Linux or Mac
28 | Run `webui.sh` to start the webui.
29 |
30 |
31 |
32 | ```
33 | Tested environment: Windows 10, Python 3.10.9, torch 2.0.0+cu118
34 | ```
35 |
36 |
37 |
38 | # Troubleshooting
39 |
40 | ## `error: Microsoft Visual C++ 14.0 or greater is required.`
41 |
42 | Microsoft C++ Build Tools must be installed.
43 |
44 | ### Step 1: Download the installer
45 | [Download](https://visualstudio.microsoft.com/ja/thank-you-downloading-visual-studio/?sku=BuildTools&rel=16)
46 |
47 | ### Step 2: Install `C++ Build Tools`
48 | Run the installer and select `C++ Build Tools` in the `Workloads` tab.
49 |
50 |
51 |
52 | # Credits
53 | - [`liujing04/Retrieval-based-Voice-Conversion-WebUI`](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI)
54 | - [`teftef6220/Voice_Separation_and_Selection`](https://github.com/teftef6220/Voice_Separation_and_Selection)
55 |
--------------------------------------------------------------------------------
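Both start scripts ultimately hand control to `launch.py` (included below), which splits the `COMMANDLINE_ARGS` environment variable into `sys.argv`, installs missing requirements, and then runs `webui.py`. A minimal sketch of driving the same entry point programmatically, assuming it is executed from the repository root (`--skip-install`, `--reinstall-torch`, and `--ngrok` are the flags `launch.py` recognizes):

```python
import os
import subprocess
import sys

env = dict(os.environ)
# launch.py shlex-splits COMMANDLINE_ARGS and appends it to sys.argv.
env["COMMANDLINE_ARGS"] = "--skip-install"
subprocess.run([sys.executable, "launch.py"], env=env, check=True)
```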
/bin/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 |
--------------------------------------------------------------------------------
/configs/32k-768.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "seed": 1234,
5 | "epochs": 20000,
6 | "learning_rate": 1e-4,
7 | "betas": [0.8, 0.99],
8 | "eps": 1e-9,
9 | "batch_size": 4,
10 | "fp16_run": true,
11 | "lr_decay": 0.999875,
12 | "segment_size": 12800,
13 | "init_lr_ratio": 1,
14 | "warmup_epochs": 0,
15 | "c_mel": 45,
16 | "c_kl": 1.0
17 | },
18 | "data": {
19 | "max_wav_value": 32768.0,
20 | "sampling_rate": 32000,
21 | "filter_length": 1024,
22 | "hop_length": 320,
23 | "win_length": 1024,
24 | "n_mel_channels": 80,
25 | "mel_fmin": 0.0,
26 | "mel_fmax": null
27 | },
28 | "model": {
29 | "inter_channels": 192,
30 | "hidden_channels": 192,
31 | "filter_channels": 768,
32 | "n_heads": 2,
33 | "n_layers": 6,
34 | "kernel_size": 3,
35 | "p_dropout": 0,
36 | "resblock": "1",
37 | "resblock_kernel_sizes": [3,7,11],
38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39 | "upsample_rates": [10,4,2,2,2],
40 | "upsample_initial_channel": 512,
41 | "upsample_kernel_sizes": [16,16,4,4,4],
42 | "use_spectral_norm": false,
43 | "gin_channels": 256,
44 | "emb_channels": 768,
45 | "spk_embed_dim": 109
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/configs/32k.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "seed": 1234,
5 | "epochs": 20000,
6 | "learning_rate": 1e-4,
7 | "betas": [0.8, 0.99],
8 | "eps": 1e-9,
9 | "batch_size": 4,
10 | "fp16_run": true,
11 | "lr_decay": 0.999875,
12 | "segment_size": 12800,
13 | "init_lr_ratio": 1,
14 | "warmup_epochs": 0,
15 | "c_mel": 45,
16 | "c_kl": 1.0
17 | },
18 | "data": {
19 | "max_wav_value": 32768.0,
20 | "sampling_rate": 32000,
21 | "filter_length": 1024,
22 | "hop_length": 320,
23 | "win_length": 1024,
24 | "n_mel_channels": 80,
25 | "mel_fmin": 0.0,
26 | "mel_fmax": null
27 | },
28 | "model": {
29 | "inter_channels": 192,
30 | "hidden_channels": 192,
31 | "filter_channels": 768,
32 | "n_heads": 2,
33 | "n_layers": 6,
34 | "kernel_size": 3,
35 | "p_dropout": 0,
36 | "resblock": "1",
37 | "resblock_kernel_sizes": [3,7,11],
38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39 | "upsample_rates": [10,4,2,2,2],
40 | "upsample_initial_channel": 512,
41 | "upsample_kernel_sizes": [16,16,4,4,4],
42 | "use_spectral_norm": false,
43 | "gin_channels": 256,
44 | "emb_channels": 256,
45 | "spk_embed_dim": 109
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/configs/40k-768.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "seed": 1234,
5 | "epochs": 20000,
6 | "learning_rate": 1e-4,
7 | "betas": [0.8, 0.99],
8 | "eps": 1e-9,
9 | "batch_size": 4,
10 | "fp16_run": true,
11 | "lr_decay": 0.999875,
12 | "segment_size": 12800,
13 | "init_lr_ratio": 1,
14 | "warmup_epochs": 0,
15 | "c_mel": 45,
16 | "c_kl": 1.0
17 | },
18 | "data": {
19 | "max_wav_value": 32768.0,
20 | "sampling_rate": 40000,
21 | "filter_length": 2048,
22 | "hop_length": 400,
23 | "win_length": 2048,
24 | "n_mel_channels": 125,
25 | "mel_fmin": 0.0,
26 | "mel_fmax": null
27 | },
28 | "model": {
29 | "inter_channels": 192,
30 | "hidden_channels": 192,
31 | "filter_channels": 768,
32 | "n_heads": 2,
33 | "n_layers": 6,
34 | "kernel_size": 3,
35 | "p_dropout": 0,
36 | "resblock": "1",
37 | "resblock_kernel_sizes": [3,7,11],
38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39 | "upsample_rates": [10,10,2,2],
40 | "upsample_initial_channel": 512,
41 | "upsample_kernel_sizes": [16,16,4,4],
42 | "use_spectral_norm": false,
43 | "gin_channels": 256,
44 | "emb_channels": 768,
45 | "spk_embed_dim": 109
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/configs/40k.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "seed": 1234,
5 | "epochs": 20000,
6 | "learning_rate": 1e-4,
7 | "betas": [0.8, 0.99],
8 | "eps": 1e-9,
9 | "batch_size": 4,
10 | "fp16_run": true,
11 | "lr_decay": 0.999875,
12 | "segment_size": 12800,
13 | "init_lr_ratio": 1,
14 | "warmup_epochs": 0,
15 | "c_mel": 45,
16 | "c_kl": 1.0
17 | },
18 | "data": {
19 | "max_wav_value": 32768.0,
20 | "sampling_rate": 40000,
21 | "filter_length": 2048,
22 | "hop_length": 400,
23 | "win_length": 2048,
24 | "n_mel_channels": 125,
25 | "mel_fmin": 0.0,
26 | "mel_fmax": null
27 | },
28 | "model": {
29 | "inter_channels": 192,
30 | "hidden_channels": 192,
31 | "filter_channels": 768,
32 | "n_heads": 2,
33 | "n_layers": 6,
34 | "kernel_size": 3,
35 | "p_dropout": 0,
36 | "resblock": "1",
37 | "resblock_kernel_sizes": [3,7,11],
38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39 | "upsample_rates": [10,10,2,2],
40 | "upsample_initial_channel": 512,
41 | "upsample_kernel_sizes": [16,16,4,4],
42 | "use_spectral_norm": false,
43 | "gin_channels": 256,
44 | "emb_channels": 256,
45 | "spk_embed_dim": 109
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/configs/48k-768.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "seed": 1234,
5 | "epochs": 20000,
6 | "learning_rate": 1e-4,
7 | "betas": [0.8, 0.99],
8 | "eps": 1e-9,
9 | "batch_size": 4,
10 | "fp16_run": true,
11 | "lr_decay": 0.999875,
12 | "segment_size": 11520,
13 | "init_lr_ratio": 1,
14 | "warmup_epochs": 0,
15 | "c_mel": 45,
16 | "c_kl": 1.0
17 | },
18 | "data": {
19 | "max_wav_value": 32768.0,
20 | "sampling_rate": 48000,
21 | "filter_length": 2048,
22 | "hop_length": 480,
23 | "win_length": 2048,
24 | "n_mel_channels": 128,
25 | "mel_fmin": 0.0,
26 | "mel_fmax": null
27 | },
28 | "model": {
29 | "inter_channels": 192,
30 | "hidden_channels": 192,
31 | "filter_channels": 768,
32 | "n_heads": 2,
33 | "n_layers": 6,
34 | "kernel_size": 3,
35 | "p_dropout": 0,
36 | "resblock": "1",
37 | "resblock_kernel_sizes": [3,7,11],
38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39 | "upsample_rates": [10,6,2,2,2],
40 | "upsample_initial_channel": 512,
41 | "upsample_kernel_sizes": [16,16,4,4,4],
42 | "use_spectral_norm": false,
43 | "gin_channels": 256,
44 | "emb_channels": 768,
45 | "spk_embed_dim": 109
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/configs/48k.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "seed": 1234,
5 | "epochs": 20000,
6 | "learning_rate": 1e-4,
7 | "betas": [0.8, 0.99],
8 | "eps": 1e-9,
9 | "batch_size": 4,
10 | "fp16_run": true,
11 | "lr_decay": 0.999875,
12 | "segment_size": 11520,
13 | "init_lr_ratio": 1,
14 | "warmup_epochs": 0,
15 | "c_mel": 45,
16 | "c_kl": 1.0
17 | },
18 | "data": {
19 | "max_wav_value": 32768.0,
20 | "sampling_rate": 48000,
21 | "filter_length": 2048,
22 | "hop_length": 480,
23 | "win_length": 2048,
24 | "n_mel_channels": 128,
25 | "mel_fmin": 0.0,
26 | "mel_fmax": null
27 | },
28 | "model": {
29 | "inter_channels": 192,
30 | "hidden_channels": 192,
31 | "filter_channels": 768,
32 | "n_heads": 2,
33 | "n_layers": 6,
34 | "kernel_size": 3,
35 | "p_dropout": 0,
36 | "resblock": "1",
37 | "resblock_kernel_sizes": [3,7,11],
38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39 | "upsample_rates": [10,6,2,2,2],
40 | "upsample_initial_channel": 512,
41 | "upsample_kernel_sizes": [16,16,4,4,4],
42 | "use_spectral_norm": false,
43 | "gin_channels": 256,
44 | "emb_channels": 256,
45 | "spk_embed_dim": 109
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
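The six configs differ mainly in sampling rate, STFT parameters, and `emb_channels` (256 in the base configs, 768 in the `-768` variants). In every file the product of `upsample_rates` equals `hop_length`, so the decoder upsamples feature frames back to the waveform rate. A quick sanity check over the files above, run from the repository root:

```python
import json
from functools import reduce

for name in ("32k", "32k-768", "40k", "40k-768", "48k", "48k-768"):
    with open(f"configs/{name}.json") as f:
        cfg = json.load(f)
    hop = cfg["data"]["hop_length"]
    up = reduce(lambda a, b: a * b, cfg["model"]["upsample_rates"])
    assert up == hop, name  # decoder upsampling must undo the STFT hop
    print(name, cfg["data"]["sampling_rate"], hop, cfg["model"]["emb_channels"])
```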
/dev.py:
--------------------------------------------------------------------------------
1 | import modules.ui as ui
2 |
3 | demo = ui.create_ui()
4 |
--------------------------------------------------------------------------------
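`dev.py` only builds the Gradio interface without serving it, which suits a reload-style development workflow. A minimal sketch of serving the same interface directly, assuming `modules.ui.create_ui()` returns a Gradio `Blocks` object and the project dependencies are installed:

```python
import modules.ui as ui

demo = ui.create_ui()

if __name__ == "__main__":
    # For illustration only; webui.py is the project's actual entry point.
    demo.launch()
```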
/launch.py:
--------------------------------------------------------------------------------
1 | import importlib.util
2 | import os
3 | import shlex
4 | import subprocess
5 | import sys
6 |
7 | commandline_args = os.environ.get("COMMANDLINE_ARGS", "")
8 | sys.argv += shlex.split(commandline_args)
9 |
10 | python = sys.executable
11 | git = os.environ.get("GIT", "git")
12 | index_url = os.environ.get("INDEX_URL", "")
13 | stored_commit_hash = None
14 | skip_install = False
15 |
16 |
17 | def run(command, desc=None, errdesc=None, custom_env=None):
18 | if desc is not None:
19 | print(desc)
20 |
21 | result = subprocess.run(
22 | command,
23 | stdout=subprocess.PIPE,
24 | stderr=subprocess.PIPE,
25 | shell=True,
26 | env=os.environ if custom_env is None else custom_env,
27 | )
28 |
29 | if result.returncode != 0:
30 | message = f"""{errdesc or 'Error running command'}.
31 | Command: {command}
32 | Error code: {result.returncode}
33 | stdout: {result.stdout.decode(encoding="utf8", errors="ignore") if len(result.stdout)>0 else ''}
34 | stderr: {result.stderr.decode(encoding="utf8", errors="ignore") if len(result.stderr)>0 else ''}
35 | """
36 | raise RuntimeError(message)
37 |
38 | return result.stdout.decode(encoding="utf8", errors="ignore")
39 |
40 |
41 | def check_run(command):
42 | result = subprocess.run(
43 | command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
44 | )
45 | return result.returncode == 0
46 |
47 |
48 | def is_installed(package):
49 | try:
50 | spec = importlib.util.find_spec(package)
51 | except ModuleNotFoundError:
52 | return False
53 |
54 | return spec is not None
55 |
56 |
57 | def commit_hash():
58 | global stored_commit_hash
59 |
60 | if stored_commit_hash is not None:
61 | return stored_commit_hash
62 |
63 | try:
64 | stored_commit_hash = run(f"{git} rev-parse HEAD").strip()
65 | except Exception:
66 | stored_commit_hash = ""
67 |
68 | return stored_commit_hash
69 |
70 |
71 | def run_pip(args, desc=None):
72 | if skip_install:
73 | return
74 |
75 | index_url_line = f" --index-url {index_url}" if index_url != "" else ""
76 | return run(
77 | f'"{python}" -m pip {args} --prefer-binary{index_url_line}',
78 | desc=f"Installing {desc}",
79 | errdesc=f"Couldn't install {desc}",
80 | )
81 |
82 |
83 | def run_python(code, desc=None, errdesc=None):
84 | return run(f'"{python}" -c "{code}"', desc, errdesc)
85 |
86 |
87 | def extract_arg(args, name):
88 | return [x for x in args if x != name], name in args
89 |
90 |
91 | def prepare_environment():
92 | commit = commit_hash()
93 |
94 | print(f"Python {sys.version}")
95 | print(f"Commit hash: {commit}")
96 |
97 | torch_command = os.environ.get(
98 | "TORCH_COMMAND",
99 | "pip install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118",
100 | )
101 |
102 | sys.argv, skip_install = extract_arg(sys.argv, "--skip-install")
103 | if skip_install:
104 | return
105 |
106 | sys.argv, reinstall_torch = extract_arg(sys.argv, "--reinstall-torch")
107 | ngrok = "--ngrok" in sys.argv
108 |
109 | if reinstall_torch or not is_installed("torch") or not is_installed("torchaudio"):
110 | run(
111 | f'"{python}" -m {torch_command}',
112 | "Installing torch and torchaudio",
113 | "Couldn't install torch",
114 | )
115 |
116 | if not is_installed("pyngrok") and ngrok:
117 | run_pip("install pyngrok", "ngrok")
118 |
119 | run(
120 | f'"{python}" -m pip install -r requirements.txt',
121 | desc=f"Installing requirements",
122 | errdesc=f"Couldn't install requirements",
123 | )
124 |
125 |
126 | def start():
127 | os.environ["PATH"] = (
128 | os.path.join(os.path.dirname(__file__), "bin")
129 | + os.pathsep
130 | + os.environ.get("PATH", "")
131 | )
132 | subprocess.run(
133 | [python, "webui.py", *sys.argv[1:]],
134 | )
135 |
136 |
137 | if __name__ == "__main__":
138 | prepare_environment()
139 | start()
140 |
--------------------------------------------------------------------------------
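The helpers in `launch.py` are plain functions and can be exercised on their own; importing the module only appends `COMMANDLINE_ARGS` to `sys.argv`, since environment preparation runs only under `__main__`. A small usage sketch, run from the repository root:

```python
import launch

print(launch.commit_hash())            # current commit hash, or "" outside a git checkout
print(launch.is_installed("torch"))    # True if the package can be imported
launch.run("echo hello", desc="Running a shell command through launch.run")
```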
/lib/rvc/attentions.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 | from torch import nn
5 | from torch.nn import functional as F
6 |
7 | from . import commons
8 | from .modules import LayerNorm
9 |
10 |
11 | class Encoder(nn.Module):
12 | def __init__(
13 | self,
14 | hidden_channels,
15 | filter_channels,
16 | n_heads,
17 | n_layers,
18 | kernel_size=1,
19 | p_dropout=0.0,
20 | window_size=10,
21 | **kwargs
22 | ):
23 | super().__init__()
24 | self.hidden_channels = hidden_channels
25 | self.filter_channels = filter_channels
26 | self.n_heads = n_heads
27 | self.n_layers = n_layers
28 | self.kernel_size = kernel_size
29 | self.p_dropout = p_dropout
30 | self.window_size = window_size
31 |
32 | self.drop = nn.Dropout(p_dropout)
33 | self.attn_layers = nn.ModuleList()
34 | self.norm_layers_1 = nn.ModuleList()
35 | self.ffn_layers = nn.ModuleList()
36 | self.norm_layers_2 = nn.ModuleList()
37 | for i in range(self.n_layers):
38 | self.attn_layers.append(
39 | MultiHeadAttention(
40 | hidden_channels,
41 | hidden_channels,
42 | n_heads,
43 | p_dropout=p_dropout,
44 | window_size=window_size,
45 | )
46 | )
47 | self.norm_layers_1.append(LayerNorm(hidden_channels))
48 | self.ffn_layers.append(
49 | FFN(
50 | hidden_channels,
51 | hidden_channels,
52 | filter_channels,
53 | kernel_size,
54 | p_dropout=p_dropout,
55 | )
56 | )
57 | self.norm_layers_2.append(LayerNorm(hidden_channels))
58 |
59 | def forward(self, x, x_mask):
60 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
61 | x = x * x_mask
62 | for i in range(self.n_layers):
63 | y = self.attn_layers[i](x, x, attn_mask)
64 | y = self.drop(y)
65 | x = self.norm_layers_1[i](x + y)
66 |
67 | y = self.ffn_layers[i](x, x_mask)
68 | y = self.drop(y)
69 | x = self.norm_layers_2[i](x + y)
70 | x = x * x_mask
71 | return x
72 |
73 |
74 | class Decoder(nn.Module):
75 | def __init__(
76 | self,
77 | hidden_channels,
78 | filter_channels,
79 | n_heads,
80 | n_layers,
81 | kernel_size=1,
82 | p_dropout=0.0,
83 | proximal_bias=False,
84 | proximal_init=True,
85 | **kwargs
86 | ):
87 | super().__init__()
88 | self.hidden_channels = hidden_channels
89 | self.filter_channels = filter_channels
90 | self.n_heads = n_heads
91 | self.n_layers = n_layers
92 | self.kernel_size = kernel_size
93 | self.p_dropout = p_dropout
94 | self.proximal_bias = proximal_bias
95 | self.proximal_init = proximal_init
96 |
97 | self.drop = nn.Dropout(p_dropout)
98 | self.self_attn_layers = nn.ModuleList()
99 | self.norm_layers_0 = nn.ModuleList()
100 | self.encdec_attn_layers = nn.ModuleList()
101 | self.norm_layers_1 = nn.ModuleList()
102 | self.ffn_layers = nn.ModuleList()
103 | self.norm_layers_2 = nn.ModuleList()
104 | for i in range(self.n_layers):
105 | self.self_attn_layers.append(
106 | MultiHeadAttention(
107 | hidden_channels,
108 | hidden_channels,
109 | n_heads,
110 | p_dropout=p_dropout,
111 | proximal_bias=proximal_bias,
112 | proximal_init=proximal_init,
113 | )
114 | )
115 | self.norm_layers_0.append(LayerNorm(hidden_channels))
116 | self.encdec_attn_layers.append(
117 | MultiHeadAttention(
118 | hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
119 | )
120 | )
121 | self.norm_layers_1.append(LayerNorm(hidden_channels))
122 | self.ffn_layers.append(
123 | FFN(
124 | hidden_channels,
125 | hidden_channels,
126 | filter_channels,
127 | kernel_size,
128 | p_dropout=p_dropout,
129 | causal=True,
130 | )
131 | )
132 | self.norm_layers_2.append(LayerNorm(hidden_channels))
133 |
134 | def forward(self, x, x_mask, h, h_mask):
135 | """
136 | x: decoder input
137 | h: encoder output
138 | """
139 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
140 | device=x.device, dtype=x.dtype
141 | )
142 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
143 | x = x * x_mask
144 | for i in range(self.n_layers):
145 | y = self.self_attn_layers[i](x, x, self_attn_mask)
146 | y = self.drop(y)
147 | x = self.norm_layers_0[i](x + y)
148 |
149 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
150 | y = self.drop(y)
151 | x = self.norm_layers_1[i](x + y)
152 |
153 | y = self.ffn_layers[i](x, x_mask)
154 | y = self.drop(y)
155 | x = self.norm_layers_2[i](x + y)
156 | x = x * x_mask
157 | return x
158 |
159 |
160 | class MultiHeadAttention(nn.Module):
161 | def __init__(
162 | self,
163 | channels,
164 | out_channels,
165 | n_heads,
166 | p_dropout=0.0,
167 | window_size=None,
168 | heads_share=True,
169 | block_length=None,
170 | proximal_bias=False,
171 | proximal_init=False,
172 | ):
173 | super().__init__()
174 | assert channels % n_heads == 0
175 |
176 | self.channels = channels
177 | self.out_channels = out_channels
178 | self.n_heads = n_heads
179 | self.p_dropout = p_dropout
180 | self.window_size = window_size
181 | self.heads_share = heads_share
182 | self.block_length = block_length
183 | self.proximal_bias = proximal_bias
184 | self.proximal_init = proximal_init
185 | self.attn = None
186 |
187 | self.k_channels = channels // n_heads
188 | self.conv_q = nn.Conv1d(channels, channels, 1)
189 | self.conv_k = nn.Conv1d(channels, channels, 1)
190 | self.conv_v = nn.Conv1d(channels, channels, 1)
191 | self.conv_o = nn.Conv1d(channels, out_channels, 1)
192 | self.drop = nn.Dropout(p_dropout)
193 |
194 | if window_size is not None:
195 | n_heads_rel = 1 if heads_share else n_heads
196 | rel_stddev = self.k_channels**-0.5
197 | self.emb_rel_k = nn.Parameter(
198 | torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
199 | * rel_stddev
200 | )
201 | self.emb_rel_v = nn.Parameter(
202 | torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
203 | * rel_stddev
204 | )
205 |
206 | nn.init.xavier_uniform_(self.conv_q.weight)
207 | nn.init.xavier_uniform_(self.conv_k.weight)
208 | nn.init.xavier_uniform_(self.conv_v.weight)
209 | if proximal_init:
210 | with torch.no_grad():
211 | self.conv_k.weight.copy_(self.conv_q.weight)
212 | self.conv_k.bias.copy_(self.conv_q.bias)
213 |
214 | def forward(self, x, c, attn_mask=None):
215 | q = self.conv_q(x)
216 | k = self.conv_k(c)
217 | v = self.conv_v(c)
218 |
219 | x, self.attn = self.attention(q, k, v, mask=attn_mask)
220 |
221 | x = self.conv_o(x)
222 | return x
223 |
224 | def attention(self, query, key, value, mask=None):
225 | # reshape [b, d, t] -> [b, n_h, t, d_k]
226 | b, d, t_s, t_t = (*key.size(), query.size(2))
227 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
228 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
229 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
230 |
231 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
232 | if self.window_size is not None:
233 | assert (
234 | t_s == t_t
235 | ), "Relative attention is only available for self-attention."
236 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
237 | rel_logits = self._matmul_with_relative_keys(
238 | query / math.sqrt(self.k_channels), key_relative_embeddings
239 | )
240 | scores_local = self._relative_position_to_absolute_position(rel_logits)
241 | scores = scores + scores_local
242 | if self.proximal_bias:
243 | assert t_s == t_t, "Proximal bias is only available for self-attention."
244 | scores = scores + self._attention_bias_proximal(t_s).to(
245 | device=scores.device, dtype=scores.dtype
246 | )
247 | if mask is not None:
248 | scores = scores.masked_fill(mask == 0, -1e4)
249 | if self.block_length is not None:
250 | assert (
251 | t_s == t_t
252 | ), "Local attention is only available for self-attention."
253 | block_mask = (
254 | torch.ones_like(scores)
255 | .triu(-self.block_length)
256 | .tril(self.block_length)
257 | )
258 | scores = scores.masked_fill(block_mask == 0, -1e4)
259 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
260 | p_attn = self.drop(p_attn)
261 | output = torch.matmul(p_attn, value)
262 | if self.window_size is not None:
263 | relative_weights = self._absolute_position_to_relative_position(p_attn)
264 | value_relative_embeddings = self._get_relative_embeddings(
265 | self.emb_rel_v, t_s
266 | )
267 | output = output + self._matmul_with_relative_values(
268 | relative_weights, value_relative_embeddings
269 | )
270 | output = (
271 | output.transpose(2, 3).contiguous().view(b, d, t_t)
272 | ) # [b, n_h, t_t, d_k] -> [b, d, t_t]
273 | return output, p_attn
274 |
275 | def _matmul_with_relative_values(self, x, y):
276 | """
277 | x: [b, h, l, m]
278 | y: [h or 1, m, d]
279 | ret: [b, h, l, d]
280 | """
281 | ret = torch.matmul(x, y.unsqueeze(0))
282 | return ret
283 |
284 | def _matmul_with_relative_keys(self, x, y):
285 | """
286 | x: [b, h, l, d]
287 | y: [h or 1, m, d]
288 | ret: [b, h, l, m]
289 | """
290 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
291 | return ret
292 |
293 | def _get_relative_embeddings(self, relative_embeddings, length):
294 | max_relative_position = 2 * self.window_size + 1
295 | # Pad first before slice to avoid using cond ops.
296 | pad_length = max(length - (self.window_size + 1), 0)
297 | slice_start_position = max((self.window_size + 1) - length, 0)
298 | slice_end_position = slice_start_position + 2 * length - 1
299 | if pad_length > 0:
300 | padded_relative_embeddings = F.pad(
301 | relative_embeddings,
302 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
303 | )
304 | else:
305 | padded_relative_embeddings = relative_embeddings
306 | used_relative_embeddings = padded_relative_embeddings[
307 | :, slice_start_position:slice_end_position
308 | ]
309 | return used_relative_embeddings
310 |
311 | def _relative_position_to_absolute_position(self, x):
312 | """
313 | x: [b, h, l, 2*l-1]
314 | ret: [b, h, l, l]
315 | """
316 | batch, heads, length, _ = x.size()
317 | # Concat columns of pad to shift from relative to absolute indexing.
318 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
319 |
320 | # Concat extra elements so to add up to shape (len+1, 2*len-1).
321 | x_flat = x.view([batch, heads, length * 2 * length])
322 | x_flat = F.pad(
323 | x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
324 | )
325 |
326 | # Reshape and slice out the padded elements.
327 | x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
328 | :, :, :length, length - 1 :
329 | ]
330 | return x_final
331 |
332 | def _absolute_position_to_relative_position(self, x):
333 | """
334 | x: [b, h, l, l]
335 | ret: [b, h, l, 2*l-1]
336 | """
337 | batch, heads, length, _ = x.size()
338 | # pad along column
339 | x = F.pad(
340 | x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
341 | )
342 | x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
343 | # add 0's in the beginning that will skew the elements after reshape
344 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
345 | x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
346 | return x_final
347 |
348 | def _attention_bias_proximal(self, length):
349 | """Bias for self-attention to encourage attention to close positions.
350 | Args:
351 | length: an integer scalar.
352 | Returns:
353 | a Tensor with shape [1, 1, length, length]
354 | """
355 | r = torch.arange(length, dtype=torch.float32)
356 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
357 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
358 |
359 |
360 | class FFN(nn.Module):
361 | def __init__(
362 | self,
363 | in_channels,
364 | out_channels,
365 | filter_channels,
366 | kernel_size,
367 | p_dropout=0.0,
368 | activation=None,
369 | causal=False,
370 | ):
371 | super().__init__()
372 | self.in_channels = in_channels
373 | self.out_channels = out_channels
374 | self.filter_channels = filter_channels
375 | self.kernel_size = kernel_size
376 | self.p_dropout = p_dropout
377 | self.activation = activation
378 | self.causal = causal
379 |
380 | if causal:
381 | self.padding = self._causal_padding
382 | else:
383 | self.padding = self._same_padding
384 |
385 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
386 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
387 | self.drop = nn.Dropout(p_dropout)
388 |
389 | def forward(self, x, x_mask):
390 | x = self.conv_1(self.padding(x * x_mask))
391 | if self.activation == "gelu":
392 | x = x * torch.sigmoid(1.702 * x)
393 | else:
394 | x = torch.relu(x)
395 | x = self.drop(x)
396 | x = self.conv_2(self.padding(x * x_mask))
397 | return x * x_mask
398 |
399 | def _causal_padding(self, x):
400 | if self.kernel_size == 1:
401 | return x
402 | pad_l = self.kernel_size - 1
403 | pad_r = 0
404 | padding = [[0, 0], [0, 0], [pad_l, pad_r]]
405 | x = F.pad(x, commons.convert_pad_shape(padding))
406 | return x
407 |
408 | def _same_padding(self, x):
409 | if self.kernel_size == 1:
410 | return x
411 | pad_l = (self.kernel_size - 1) // 2
412 | pad_r = self.kernel_size // 2
413 | padding = [[0, 0], [0, 0], [pad_l, pad_r]]
414 | x = F.pad(x, commons.convert_pad_shape(padding))
415 | return x
416 |
--------------------------------------------------------------------------------
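For orientation, a minimal sketch of driving the `Encoder` above with the hyperparameters from `configs/40k.json`; the mask construction mirrors `commons.sequence_mask`. Run from the repository root with torch installed:

```python
import torch

from lib.rvc import commons
from lib.rvc.attentions import Encoder

enc = Encoder(
    hidden_channels=192, filter_channels=768, n_heads=2,
    n_layers=6, kernel_size=3, p_dropout=0.0,
)
x = torch.randn(2, 192, 100)                 # [batch, hidden_channels, frames]
lengths = torch.tensor([100, 80])
x_mask = commons.sequence_mask(lengths, 100).unsqueeze(1).to(x.dtype)  # [batch, 1, frames]
y = enc(x, x_mask)                           # same shape as x, padded frames zeroed
print(y.shape)
```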
/lib/rvc/checkpoints.py:
--------------------------------------------------------------------------------
1 | import os
2 | from collections import OrderedDict
3 | from typing import *
4 |
5 | import torch
6 |
7 |
8 | def write_config(state_dict: Dict[str, Any], cfg: Dict[str, Any]):
9 | state_dict["config"] = []
10 | for key, x in cfg.items():
11 | state_dict["config"].append(x)
12 | state_dict["params"] = cfg
13 |
14 |
15 | def create_trained_model(
16 | weights: Dict[str, Any],
17 | version: Literal["v1", "v2"],
18 | sr: str,
19 | f0: bool,
20 | emb_name: str,
21 | emb_ch: int,
22 | emb_output_layer: int,
23 | epoch: int,
24 | speaker_info: Optional[dict[str, int]]
25 | ):
26 | state_dict = OrderedDict()
27 | state_dict["weight"] = {}
28 | for key in weights.keys():
29 | if "enc_q" in key:
30 | continue
31 | state_dict["weight"][key] = weights[key].half()
32 | if sr == "40k":
33 | write_config(
34 | state_dict,
35 | {
36 | "spec_channels": 1025,
37 | "segment_size": 32,
38 | "inter_channels": 192,
39 | "hidden_channels": 192,
40 | "filter_channels": 768,
41 | "n_heads": 2,
42 | "n_layers": 6,
43 | "kernel_size": 3,
44 | "p_dropout": 0,
45 | "resblock": "1",
46 | "resblock_kernel_sizes": [3, 7, 11],
47 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
48 | "upsample_rates": [10, 10, 2, 2],
49 | "upsample_initial_channel": 512,
50 | "upsample_kernel_sizes": [16, 16, 4, 4],
51 | "spk_embed_dim": 109 if speaker_info is None else len(speaker_info),
52 | "gin_channels": 256,
53 | "emb_channels": emb_ch,
54 | "sr": 40000,
55 | },
56 | )
57 | elif sr == "48k":
58 | write_config(
59 | state_dict,
60 | {
61 | "spec_channels": 1025,
62 | "segment_size": 32,
63 | "inter_channels": 192,
64 | "hidden_channels": 192,
65 | "filter_channels": 768,
66 | "n_heads": 2,
67 | "n_layers": 6,
68 | "kernel_size": 3,
69 | "p_dropout": 0,
70 | "resblock": "1",
71 | "resblock_kernel_sizes": [3, 7, 11],
72 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
73 | "upsample_rates": [10, 6, 2, 2, 2],
74 | "upsample_initial_channel": 512,
75 | "upsample_kernel_sizes": [16, 16, 4, 4, 4],
76 | "spk_embed_dim": 109 if speaker_info is None else len(speaker_info),
77 | "gin_channels": 256,
78 | "emb_channels": emb_ch,
79 | "sr": 48000,
80 | },
81 | )
82 | elif sr == "32k":
83 | write_config(
84 | state_dict,
85 | {
86 | "spec_channels": 513,
87 | "segment_size": 32,
88 | "inter_channels": 192,
89 | "hidden_channels": 192,
90 | "filter_channels": 768,
91 | "n_heads": 2,
92 | "n_layers": 6,
93 | "kernel_size": 3,
94 | "p_dropout": 0,
95 | "resblock": "1",
96 | "resblock_kernel_sizes": [3, 7, 11],
97 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
98 | "upsample_rates": [10, 4, 2, 2, 2],
99 | "upsample_initial_channel": 512,
100 | "upsample_kernel_sizes": [16, 16, 4, 4, 4],
101 | "spk_embed_dim": 109 if speaker_info is None else len(speaker_info),
102 | "gin_channels": 256,
103 | "emb_channels": emb_ch,
104 | "sr": 32000,
105 | },
106 | )
107 | state_dict["version"] = version
108 | state_dict["info"] = f"{epoch}epoch"
109 | state_dict["sr"] = sr
110 | state_dict["f0"] = 1 if f0 else 0
111 | state_dict["embedder_name"] = emb_name
112 | state_dict["embedder_output_layer"] = emb_output_layer
113 | if speaker_info is not None:
114 | state_dict["speaker_info"] = {str(v): str(k) for k, v in speaker_info.items()}
115 | return state_dict
116 |
117 |
118 | def save(
119 | model,
120 | version: Literal["v1", "v2"],
121 | sr: str,
122 | f0: bool,
123 | emb_name: str,
124 | emb_ch: int,
125 | emb_output_layer: int,
126 | filepath: str,
127 | epoch: int,
128 | speaker_info: Optional[dict[str, int]]
129 | ):
130 | if hasattr(model, "module"):
131 | state_dict = model.module.state_dict()
132 | else:
133 | state_dict = model.state_dict()
134 |
135 | print(f"save: emb_name: {emb_name} {emb_ch}")
136 |
137 | state_dict = create_trained_model(
138 | state_dict,
139 | version,
140 | sr,
141 | f0,
142 | emb_name,
143 | emb_ch,
144 | emb_output_layer,
145 | epoch,
146 | speaker_info
147 | )
148 | os.makedirs(os.path.dirname(filepath), exist_ok=True)
149 | torch.save(state_dict, filepath)
150 |
--------------------------------------------------------------------------------
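The exported checkpoint is a plain `torch.save` dictionary, so the metadata written by `create_trained_model` can be inspected directly. A sketch, with a hypothetical checkpoint path:

```python
import torch

ckpt = torch.load("models/checkpoints/my_model.pth", map_location="cpu")  # hypothetical path
print(ckpt["version"], ckpt["sr"], ckpt["f0"])      # e.g. "v2", "40k", 1
print(ckpt["embedder_name"], ckpt["embedder_output_layer"])
print(ckpt["params"]["emb_channels"])               # 256 or 768 depending on the config
weights = ckpt["weight"]                            # fp16 state dict with enc_q.* keys dropped
```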
/lib/rvc/commons.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 | from torch.nn import functional as F
5 |
6 |
7 | def init_weights(m, mean=0.0, std=0.01):
8 | classname = m.__class__.__name__
9 | if classname.find("Conv") != -1:
10 | m.weight.data.normal_(mean, std)
11 |
12 |
13 | def get_padding(kernel_size, dilation=1):
14 | return int((kernel_size * dilation - dilation) / 2)
15 |
16 |
17 | def convert_pad_shape(pad_shape):
18 | l = pad_shape[::-1]
19 | pad_shape = [item for sublist in l for item in sublist]
20 | return pad_shape
21 |
22 |
23 | def kl_divergence(m_p, logs_p, m_q, logs_q):
24 | """KL(P||Q)"""
25 | kl = (logs_q - logs_p) - 0.5
26 | kl += (
27 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
28 | )
29 | return kl
30 |
31 |
32 | def rand_gumbel(shape):
33 | """Sample from the Gumbel distribution, protect from overflows."""
34 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
35 | return -torch.log(-torch.log(uniform_samples))
36 |
37 |
38 | def rand_gumbel_like(x):
39 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
40 | return g
41 |
42 |
43 | def slice_segments(x, ids_str, segment_size=4):
44 | ret = torch.zeros_like(x[:, :, :segment_size])
45 | for i in range(x.size(0)):
46 | idx_str = ids_str[i]
47 | idx_end = idx_str + segment_size
48 | ret[i] = x[i, :, idx_str:idx_end]
49 | return ret
50 |
51 |
52 | def slice_segments2(x, ids_str, segment_size=4):
53 | ret = torch.zeros_like(x[:, :segment_size])
54 | for i in range(x.size(0)):
55 | idx_str = ids_str[i]
56 | idx_end = idx_str + segment_size
57 | ret[i] = x[i, idx_str:idx_end]
58 | return ret
59 |
60 |
61 | def rand_slice_segments(x, x_lengths=None, segment_size=4):
62 | b, d, t = x.size()
63 | if x_lengths is None:
64 | x_lengths = t
65 | ids_str_max = x_lengths - segment_size + 1
66 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
67 | ret = slice_segments(x, ids_str, segment_size)
68 | return ret, ids_str
69 |
70 |
71 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
72 | position = torch.arange(length, dtype=torch.float)
73 | num_timescales = channels // 2
74 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
75 | num_timescales - 1
76 | )
77 | inv_timescales = min_timescale * torch.exp(
78 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
79 | )
80 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
81 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
82 | signal = F.pad(signal, [0, 0, 0, channels % 2])
83 | signal = signal.view(1, channels, length)
84 | return signal
85 |
86 |
87 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
88 | b, channels, length = x.size()
89 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
90 | return x + signal.to(dtype=x.dtype, device=x.device)
91 |
92 |
93 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
94 | b, channels, length = x.size()
95 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
96 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
97 |
98 |
99 | def subsequent_mask(length):
100 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
101 | return mask
102 |
103 |
104 | @torch.jit.script
105 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
106 | n_channels_int = n_channels[0]
107 | in_act = input_a + input_b
108 | t_act = torch.tanh(in_act[:, :n_channels_int, :])
109 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
110 | acts = t_act * s_act
111 | return acts
112 |
113 |
114 | def convert_pad_shape(pad_shape):
115 | l = pad_shape[::-1]
116 | pad_shape = [item for sublist in l for item in sublist]
117 | return pad_shape
118 |
119 |
120 | def shift_1d(x):
121 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
122 | return x
123 |
124 |
125 | def sequence_mask(length, max_length=None):
126 | if max_length is None:
127 | max_length = length.max()
128 | x = torch.arange(max_length, dtype=length.dtype, device=length.device)
129 | return x.unsqueeze(0) < length.unsqueeze(1)
130 |
131 |
132 | def generate_path(duration, mask):
133 | """
134 | duration: [b, 1, t_x]
135 | mask: [b, 1, t_y, t_x]
136 | """
137 | b, _, t_y, t_x = mask.shape
138 | cum_duration = torch.cumsum(duration, -1)
139 |
140 | cum_duration_flat = cum_duration.view(b * t_x)
141 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
142 | path = path.view(b, t_x, t_y)
143 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
144 | path = path.unsqueeze(1).transpose(2, 3) * mask
145 | return path
146 |
147 |
148 | def clip_grad_value_(parameters, clip_value, norm_type=2):
149 | if isinstance(parameters, torch.Tensor):
150 | parameters = [parameters]
151 | parameters = list(filter(lambda p: p.grad is not None, parameters))
152 | norm_type = float(norm_type)
153 | if clip_value is not None:
154 | clip_value = float(clip_value)
155 |
156 | total_norm = 0
157 | for p in parameters:
158 | param_norm = p.grad.data.norm(norm_type)
159 | total_norm += param_norm.item() ** norm_type
160 | if clip_value is not None:
161 | p.grad.data.clamp_(min=-clip_value, max=clip_value)
162 | total_norm = total_norm ** (1.0 / norm_type)
163 | return total_norm
164 |
--------------------------------------------------------------------------------
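A couple of the helpers above are easy to sanity check in isolation, for example `sequence_mask` and `rand_slice_segments` (`segment_size=32` matches the value written into exported checkpoints by `checkpoints.py` above):

```python
import torch

from lib.rvc import commons

x = torch.randn(2, 192, 400)                  # [batch, channels, frames]
lengths = torch.tensor([400, 350])
mask = commons.sequence_mask(lengths, 400)    # [2, 400] boolean mask
segments, ids_str = commons.rand_slice_segments(x, lengths, segment_size=32)
print(mask.shape, segments.shape, ids_str)    # [2, 400], [2, 192, 32], random start frames
```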
/lib/rvc/config.py:
--------------------------------------------------------------------------------
1 | from typing import *
2 |
3 | from pydantic import BaseModel
4 |
5 |
6 | class TrainConfigTrain(BaseModel):
7 | log_interval: int
8 | seed: int
9 | epochs: int
10 | learning_rate: float
11 | betas: List[float]
12 | eps: float
13 | batch_size: int
14 | fp16_run: bool
15 | lr_decay: float
16 | segment_size: int
17 | init_lr_ratio: int
18 | warmup_epochs: int
19 | c_mel: int
20 | c_kl: float
21 |
22 |
23 | class TrainConfigData(BaseModel):
24 | max_wav_value: float
25 | sampling_rate: int
26 | filter_length: int
27 | hop_length: int
28 | win_length: int
29 | n_mel_channels: int
30 | mel_fmin: float
31 | mel_fmax: Any
32 |
33 |
34 | class TrainConfigModel(BaseModel):
35 | inter_channels: int
36 | hidden_channels: int
37 | filter_channels: int
38 | n_heads: int
39 | n_layers: int
40 | kernel_size: int
41 | p_dropout: int
42 | resblock: str
43 | resblock_kernel_sizes: List[int]
44 | resblock_dilation_sizes: List[List[int]]
45 | upsample_rates: List[int]
46 | upsample_initial_channel: int
47 | upsample_kernel_sizes: List[int]
48 | use_spectral_norm: bool
49 | gin_channels: int
50 | emb_channels: int
51 | spk_embed_dim: int
52 |
53 |
54 | class TrainConfig(BaseModel):
55 | version: Literal["v1", "v2"] = "v2"
56 | train: TrainConfigTrain
57 | data: TrainConfigData
58 | model: TrainConfigModel
59 |
60 |
61 | class DatasetMetaItem(BaseModel):
62 | gt_wav: str
63 | co256: str
64 | f0: Optional[str]
65 | f0nsf: Optional[str]
66 | speaker_id: int
67 |
68 |
69 | class DatasetMetadata(BaseModel):
70 | files: Dict[str, DatasetMetaItem]
71 | # mute: DatasetMetaItem
72 |
--------------------------------------------------------------------------------
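These pydantic models mirror the JSON files in `configs/`, so a training config can be loaded and validated directly. A sketch assuming pydantic v1-style `parse_obj`; the `version` field falls back to its `"v2"` default when the JSON omits it:

```python
import json

from lib.rvc.config import TrainConfig

with open("configs/40k.json") as f:
    config = TrainConfig.parse_obj(json.load(f))

print(config.version, config.data.sampling_rate)    # v2 40000
print(config.model.emb_channels)                     # 256
```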
/lib/rvc/losses.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def feature_loss(fmap_r, fmap_g):
5 | loss = 0
6 | for dr, dg in zip(fmap_r, fmap_g):
7 | for rl, gl in zip(dr, dg):
8 | rl = rl.float().detach()
9 | gl = gl.float()
10 | loss += torch.mean(torch.abs(rl - gl))
11 |
12 | return loss * 2
13 |
14 |
15 | def discriminator_loss(disc_real_outputs, disc_generated_outputs):
16 | loss = 0
17 | r_losses = []
18 | g_losses = []
19 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
20 | dr = dr.float()
21 | dg = dg.float()
22 | r_loss = torch.mean((1 - dr) ** 2)
23 | g_loss = torch.mean(dg**2)
24 | loss += r_loss + g_loss
25 | r_losses.append(r_loss.item())
26 | g_losses.append(g_loss.item())
27 |
28 | return loss, r_losses, g_losses
29 |
30 |
31 | def generator_loss(disc_outputs):
32 | loss = 0
33 | gen_losses = []
34 | for dg in disc_outputs:
35 | dg = dg.float()
36 | l = torch.mean((1 - dg) ** 2)
37 | gen_losses.append(l)
38 | loss += l
39 |
40 | return loss, gen_losses
41 |
42 |
43 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
44 | """
45 | z_p, logs_q: [b, h, t_t]
46 | m_p, logs_p: [b, h, t_t]
47 | """
48 | z_p = z_p.float()
49 | logs_q = logs_q.float()
50 | m_p = m_p.float()
51 | logs_p = logs_p.float()
52 | z_mask = z_mask.float()
53 |
54 | kl = logs_p - logs_q - 0.5
55 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
56 | kl = torch.sum(kl * z_mask)
57 | l = kl / torch.sum(z_mask)
58 | return l
59 |
--------------------------------------------------------------------------------
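The losses operate on lists of discriminator outputs (one entry per sub-discriminator), so they can be smoke tested with dummy tensors; the actual wiring lives in `lib/rvc/train.py`, which is not reproduced here:

```python
import torch

from lib.rvc import losses

real = [torch.rand(4, 1), torch.rand(4, 1)]   # one output tensor per sub-discriminator (real audio)
fake = [torch.rand(4, 1), torch.rand(4, 1)]   # same for generated audio
d_loss, r_losses, g_losses = losses.discriminator_loss(real, fake)
g_loss, gen_losses = losses.generator_loss(fake)
print(float(d_loss), float(g_loss))
```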
/lib/rvc/mel_processing.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.utils.data
3 | from librosa.filters import mel as librosa_mel_fn
4 |
5 | MAX_WAV_VALUE = 32768.0
6 |
7 |
8 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
9 | """
10 | PARAMS
11 | ------
12 | C: compression factor
13 | """
14 | return torch.log(torch.clamp(x, min=clip_val) * C)
15 |
16 |
17 | def dynamic_range_decompression_torch(x, C=1):
18 | """
19 | PARAMS
20 | ------
21 | C: compression factor used to compress
22 | """
23 | return torch.exp(x) / C
24 |
25 |
26 | def spectral_normalize_torch(magnitudes):
27 | return dynamic_range_compression_torch(magnitudes)
28 |
29 |
30 | def spectral_de_normalize_torch(magnitudes):
31 | return dynamic_range_decompression_torch(magnitudes)
32 |
33 |
34 | mel_basis = {}
35 | hann_window = {}
36 |
37 |
38 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
39 | if torch.min(y) < -1.07:
40 | print("min value is ", torch.min(y))
41 | if torch.max(y) > 1.07:
42 | print("max value is ", torch.max(y))
43 |
44 | global hann_window
45 | dtype_device = str(y.dtype) + "_" + str(y.device)
46 | wnsize_dtype_device = str(win_size) + "_" + dtype_device
47 | if wnsize_dtype_device not in hann_window:
48 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
49 | dtype=y.dtype, device=y.device
50 | )
51 |
52 | y = torch.nn.functional.pad(
53 | y.unsqueeze(1),
54 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
55 | mode="reflect",
56 | )
57 | y = y.squeeze(1)
58 |
59 | # mps does not support torch.stft.
60 | if y.device.type == "mps":
61 | i = y.cpu()
62 | win = hann_window[wnsize_dtype_device].cpu()
63 | else:
64 | i = y
65 | win = hann_window[wnsize_dtype_device]
66 | spec = torch.stft(
67 | i,
68 | n_fft,
69 | hop_length=hop_size,
70 | win_length=win_size,
71 | window=win,
72 | center=center,
73 | pad_mode="reflect",
74 | normalized=False,
75 | onesided=True,
76 | return_complex=False,
77 | ).to(device=y.device)
78 |
79 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
80 | return spec
81 |
82 |
83 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
84 | global mel_basis
85 | dtype_device = str(spec.dtype) + "_" + str(spec.device)
86 | fmax_dtype_device = str(fmax) + "_" + dtype_device
87 | if fmax_dtype_device not in mel_basis:
88 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
89 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
90 | dtype=spec.dtype, device=spec.device
91 | )
92 | melspec = torch.matmul(mel_basis[fmax_dtype_device], spec)
93 | melspec = spectral_normalize_torch(melspec)
94 | return melspec
95 |
96 |
97 | def mel_spectrogram_torch(
98 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
99 | ):
100 | """Convert waveform into Mel-frequency Log-amplitude spectrogram.
101 |
102 | Args:
103 | y :: (B, T) - Waveforms
104 | Returns:
105 | melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram
106 | """
107 | # Linear-frequency Linear-amplitude spectrogram :: (B, T) -> (B, Freq, Frame)
108 | spec = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center)
109 |
110 | # Mel-frequency Log-amplitude spectrogram :: (B, Freq, Frame) -> (B, Freq=num_mels, Frame)
111 | melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax)
112 |
113 | return melspec
114 |
--------------------------------------------------------------------------------
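A minimal sketch of computing a mel spectrogram with the 40 kHz settings from `configs/40k.json`. Note that `spec_to_mel_torch` calls `librosa.filters.mel` positionally, which assumes an older librosa API (newer librosa versions require keyword arguments):

```python
import torch

from lib.rvc.mel_processing import mel_spectrogram_torch

wav = torch.rand(1, 40000) * 2 - 1      # one second of audio in [-1, 1] at 40 kHz
mel = mel_spectrogram_torch(
    wav, n_fft=2048, num_mels=125, sampling_rate=40000,
    hop_size=400, win_size=2048, fmin=0.0, fmax=None,
)
print(mel.shape)                        # (1, 125, frames), log-amplitude mel
```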
/lib/rvc/pipeline.py:
--------------------------------------------------------------------------------
1 | import os
2 | import traceback
3 | from typing import *
4 |
5 | import faiss
6 | import numpy as np
7 | import pyworld
8 | import scipy.signal as signal
9 | import torch
10 | import torch.nn.functional as F
11 | import torchcrepe
12 | from torch import Tensor
13 | # from faiss.swigfaiss_avx2 import IndexIVFFlat # cause crash on windows' faiss-cpu installed from pip
14 | from fairseq.models.hubert import HubertModel
15 |
16 | from .models import SynthesizerTrnMs256NSFSid
17 |
18 |
19 | class VocalConvertPipeline(object):
20 | def __init__(self, tgt_sr: int, device: Union[str, torch.device], is_half: bool):
21 | if isinstance(device, str):
22 | device = torch.device(device)
23 | if device.type == "cuda":
24 | vram = torch.cuda.get_device_properties(device).total_memory / 1024**3
25 | else:
26 | vram = None
27 |
28 | if vram is not None and vram <= 4:
29 | self.x_pad = 1
30 | self.x_query = 5
31 | self.x_center = 30
32 | self.x_max = 32
33 | elif vram is not None and vram <= 5:
34 | self.x_pad = 1
35 | self.x_query = 6
36 | self.x_center = 38
37 | self.x_max = 41
38 | else:
39 | self.x_pad = 3
40 | self.x_query = 10
41 | self.x_center = 60
42 | self.x_max = 65
43 |
44 | self.sr = 16000 # hubert input sample rate
45 | self.window = 160 # hubert input window
46 | self.t_pad = self.sr * self.x_pad # padding time for each utterance
47 | self.t_pad_tgt = tgt_sr * self.x_pad
48 | self.t_pad2 = self.t_pad * 2
49 | self.t_query = self.sr * self.x_query # query time before and after query point
50 | self.t_center = self.sr * self.x_center # query cut point position
51 | self.t_max = self.sr * self.x_max # max time for no query
52 | self.device = device
53 | self.is_half = is_half
54 |
55 | def get_optimal_torch_device(self, index: int = 0) -> torch.device:
56 | # Get cuda device
57 | if torch.cuda.is_available():
58 | return torch.device(f"cuda:{index % torch.cuda.device_count()}") # Very fast
59 | elif torch.backends.mps.is_available():
60 | return torch.device("mps")
61 | # Insert an else here to grab "xla" devices if available. TO DO later. Requires the torch_xla.core.xla_model library
62 | # Else wise return the "cpu" as a torch device,
63 | return torch.device("cpu")
64 |
65 | def get_f0_crepe_computation(
66 | self,
67 | x,
68 | f0_min,
69 | f0_max,
70 | p_len,
71 | hop_length=64, # 512 before. Hop length changes the speed that the voice jumps to a different dramatic pitch. Lower hop lengths means more pitch accuracy but longer inference time.
72 | model="full", # Either use crepe-tiny "tiny" or crepe "full". Default is full
73 | ):
74 | x = x.astype(np.float32) # fixes the F.conv2D exception. We needed to convert double to float.
75 | x /= np.quantile(np.abs(x), 0.999)
76 | torch_device = self.get_optimal_torch_device()
77 | audio = torch.from_numpy(x).to(torch_device, copy=True)
78 | audio = torch.unsqueeze(audio, dim=0)
79 | if audio.ndim == 2 and audio.shape[0] > 1:
80 | audio = torch.mean(audio, dim=0, keepdim=True).detach()
81 | audio = audio.detach()
82 | print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
83 | pitch: Tensor = torchcrepe.predict(
84 | audio,
85 | self.sr,
86 | hop_length,
87 | f0_min,
88 | f0_max,
89 | model,
90 | batch_size=hop_length * 2,
91 | device=torch_device,
92 | pad=True
93 | )
94 | p_len = p_len or x.shape[0] // hop_length
95 | # Resize the pitch for final f0
96 | source = np.array(pitch.squeeze(0).cpu().float().numpy())
97 | source[source < 0.001] = np.nan
98 | target = np.interp(
99 | np.arange(0, len(source) * p_len, len(source)) / p_len,
100 | np.arange(0, len(source)),
101 | source
102 | )
103 | f0 = np.nan_to_num(target)
104 | return f0 # Resized f0
105 |
106 | def get_f0_official_crepe_computation(
107 | self,
108 | x,
109 | f0_min,
110 | f0_max,
111 | model="full",
112 | ):
113 | # Pick a batch size that doesn't cause memory errors on your gpu
114 | batch_size = 512
115 | # Compute pitch on the configured device
116 | audio = torch.tensor(np.copy(x))[None].float()
117 | f0, pd = torchcrepe.predict(
118 | audio,
119 | self.sr,
120 | self.window,
121 | f0_min,
122 | f0_max,
123 | model,
124 | batch_size=batch_size,
125 | device=self.device,
126 | return_periodicity=True,
127 | )
128 | pd = torchcrepe.filter.median(pd, 3)
129 | f0 = torchcrepe.filter.mean(f0, 3)
130 | f0[pd < 0.1] = 0
131 | f0 = f0[0].cpu().numpy()
132 | return f0
133 |
134 | def get_f0(
135 | self,
136 | x: np.ndarray,
137 | p_len: int,
138 | f0_up_key: int,
139 | f0_method: str,
140 | inp_f0: np.ndarray = None,
141 | ):
142 | f0_min = 50
143 | f0_max = 1100
144 | f0_mel_min = 1127 * np.log(1 + f0_min / 700)
145 | f0_mel_max = 1127 * np.log(1 + f0_max / 700)
146 |
147 | if f0_method == "harvest":
148 | f0, t = pyworld.harvest(
149 | x.astype(np.double),
150 | fs=self.sr,
151 | f0_ceil=f0_max,
152 | f0_floor=f0_min,
153 | frame_period=10,
154 | )
155 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
156 | f0 = signal.medfilt(f0, 3)
157 | elif f0_method == "dio":
158 | f0, t = pyworld.dio(
159 | x.astype(np.double),
160 | fs=self.sr,
161 | f0_ceil=f0_max,
162 | f0_floor=f0_min,
163 | frame_period=10,
164 | )
165 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
166 | f0 = signal.medfilt(f0, 3)
167 | elif f0_method == "mangio-crepe":
168 | f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, 160, "full")
169 | elif f0_method == "crepe":
170 | f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "full")
171 |
172 | f0 *= pow(2, f0_up_key / 12)
173 | tf0 = self.sr // self.window # f0 points per second
174 | if inp_f0 is not None:
175 | delta_t = np.round(
176 | (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
177 | ).astype("int16")
178 | replace_f0 = np.interp(
179 | list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
180 | )
181 | shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
182 | f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
183 | :shape
184 | ]
185 |
186 | f0bak = f0.copy()
187 | f0_mel = 1127 * np.log(1 + f0 / 700)
188 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
189 | f0_mel_max - f0_mel_min
190 | ) + 1
191 | f0_mel[f0_mel <= 1] = 1
192 | f0_mel[f0_mel > 255] = 255
193 | f0_coarse = np.rint(f0_mel).astype(np.int64)  # np.int is removed in NumPy >= 1.24
194 | return f0_coarse, f0bak # 1-0
195 |
196 | def _convert(
197 | self,
198 | model: HubertModel,
199 | embedding_output_layer: int,
200 | net_g: SynthesizerTrnMs256NSFSid,
201 | sid: int,
202 | audio: np.ndarray,
203 | pitch: np.ndarray,
204 | pitchf: np.ndarray,
205 | index: faiss.IndexIVFFlat,
206 | big_npy: np.ndarray,
207 | index_rate: float,
208 | ):
209 | feats = torch.from_numpy(audio)
210 | if self.is_half:
211 | feats = feats.half()
212 | else:
213 | feats = feats.float()
214 | if feats.dim() == 2: # double channels
215 | feats = feats.mean(-1)
216 | assert feats.dim() == 1, feats.dim()
217 | feats = feats.view(1, -1)
218 | padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
219 |
220 | half_support = (
221 | self.device.type == "cuda"
222 | and torch.cuda.get_device_capability(self.device)[0] >= 5.3
223 | )
224 | is_feats_dim_768 = net_g.emb_channels == 768
225 |
226 | if isinstance(model, tuple):
227 | feats = model[0](
228 | feats.squeeze(0).squeeze(0).to(self.device),
229 | return_tensors="pt",
230 | sampling_rate=16000,
231 | )
232 | if self.is_half:
233 | feats = feats.input_values.to(self.device).half()
234 | else:
235 | feats = feats.input_values.to(self.device)
236 | with torch.no_grad():
237 | if is_feats_dim_768:
238 | feats = model[1](feats).last_hidden_state
239 | else:
240 | feats = model[1](feats).extract_features
241 | else:
242 | inputs = {
243 | "source": feats.half().to(self.device)
244 | if half_support
245 | else feats.to(self.device),
246 | "padding_mask": padding_mask.to(self.device),
247 | "output_layer": embedding_output_layer,
248 | }
249 |
250 | if not half_support:
251 | model = model.float()
252 | inputs["source"] = inputs["source"].float()
253 |
254 | with torch.no_grad():
255 | logits = model.extract_features(**inputs)
256 | if is_feats_dim_768:
257 | feats = logits[0]
258 | else:
259 | feats = model.final_proj(logits[0])
260 |
261 | if (
262 | index is not None
263 | and big_npy is not None
264 | and index_rate != 0
265 | ):
266 | npy = feats[0].cpu().numpy()
267 | if self.is_half:
268 | npy = npy.astype("float32")
269 |
270 | score, ix = index.search(npy, k=8)
271 | weight = np.square(1 / score)
272 | weight /= weight.sum(axis=1, keepdims=True)
273 | npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
274 |
275 | if self.is_half:
276 | npy = npy.astype("float16")
277 | feats = (
278 | torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
279 | + (1 - index_rate) * feats
280 | )
281 |
282 | feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
283 |
284 | p_len = audio.shape[0] // self.window
285 | if feats.shape[1] < p_len:
286 | p_len = feats.shape[1]
287 | if pitch is not None and pitchf is not None:
288 | pitch = pitch[:, :p_len]
289 | pitchf = pitchf[:, :p_len]
290 | p_len = torch.tensor([p_len], device=self.device).long()
291 | with torch.no_grad():
292 | if pitch is not None and pitchf is not None:
293 | audio1 = (
294 | (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768)
295 | .data.cpu()
296 | .float()
297 | .numpy()
298 | .astype(np.int16)
299 | )
300 | else:
301 | audio1 = (
302 | (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768)
303 | .data.cpu()
304 | .float()
305 | .numpy()
306 | .astype(np.int16)
307 | )
308 | del feats, p_len, padding_mask
309 | if torch.cuda.is_available():
310 | torch.cuda.empty_cache()
311 | return audio1
312 |
313 | def __call__(
314 | self,
315 | model: HubertModel,
316 | embedding_output_layer: int,
317 | net_g: SynthesizerTrnMs256NSFSid,
318 | sid: int,
319 | audio: np.ndarray,
320 | transpose: int,
321 | f0_method: str,
322 | file_index: str,
323 | index_rate: float,
324 | if_f0: bool,
325 | f0_file: str = None,
326 | ):
327 | if file_index != "" and os.path.exists(file_index) and index_rate != 0:
328 | try:
329 | index = faiss.read_index(file_index)
330 | # big_npy = np.load(file_big_npy)
331 | big_npy = index.reconstruct_n(0, index.ntotal)
332 | except:
333 | traceback.print_exc()
334 | index = big_npy = None
335 | else:
336 | index = big_npy = None
337 |
338 | bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
339 | audio = signal.filtfilt(bh, ah, audio)
340 |
341 | audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
342 | opt_ts = []
343 | if audio_pad.shape[0] > self.t_max:
344 | audio_sum = np.zeros_like(audio)
345 | for i in range(self.window):
346 | audio_sum += audio_pad[i : i - self.window]
347 | for t in range(self.t_center, audio.shape[0], self.t_center):
348 | opt_ts.append(
349 | t
350 | - self.t_query
351 | + np.where(
352 | np.abs(audio_sum[t - self.t_query : t + self.t_query])
353 | == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
354 | )[0][0]
355 | )
356 |
357 | audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
358 | p_len = audio_pad.shape[0] // self.window
359 | inp_f0 = None
360 | if hasattr(f0_file, "name"):
361 | try:
362 | with open(f0_file.name, "r") as f:
363 | lines = f.read().strip("\n").split("\n")
364 | inp_f0 = []
365 | for line in lines:
366 | inp_f0.append([float(i) for i in line.split(",")])
367 | inp_f0 = np.array(inp_f0, dtype="float32")
368 | except:
369 | traceback.print_exc()
370 | sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
371 | pitch, pitchf = None, None
372 | if if_f0 == 1:
373 | pitch, pitchf = self.get_f0(audio_pad, p_len, transpose, f0_method, inp_f0)
374 | pitch = pitch[:p_len]
375 | pitchf = pitchf[:p_len]
376 | if self.device.type == "mps":
377 | pitchf = pitchf.astype(np.float32)
378 | pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
379 | pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
380 |
381 | audio_opt = []
382 |
383 | s = 0
384 | t = None
385 |
386 | for t in opt_ts:
387 | t = t // self.window * self.window
388 | if if_f0 == 1:
389 | audio_opt.append(
390 | self._convert(
391 | model,
392 | embedding_output_layer,
393 | net_g,
394 | sid,
395 | audio_pad[s : t + self.t_pad2 + self.window],
396 | pitch[:, s // self.window : (t + self.t_pad2) // self.window],
397 | pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
398 | index,
399 | big_npy,
400 | index_rate,
401 | )[self.t_pad_tgt : -self.t_pad_tgt]
402 | )
403 | else:
404 | audio_opt.append(
405 | self._convert(
406 | model,
407 | embedding_output_layer,
408 | net_g,
409 | sid,
410 | audio_pad[s : t + self.t_pad2 + self.window],
411 | None,
412 | None,
413 | index,
414 | big_npy,
415 | index_rate,
416 | )[self.t_pad_tgt : -self.t_pad_tgt]
417 | )
418 | s = t
419 | if if_f0 == 1:
420 | audio_opt.append(
421 | self._convert(
422 | model,
423 | embedding_output_layer,
424 | net_g,
425 | sid,
426 | audio_pad[t:],
427 | pitch[:, t // self.window :] if t is not None else pitch,
428 | pitchf[:, t // self.window :] if t is not None else pitchf,
429 | index,
430 | big_npy,
431 | index_rate,
432 | )[self.t_pad_tgt : -self.t_pad_tgt]
433 | )
434 | else:
435 | audio_opt.append(
436 | self._convert(
437 | model,
438 | embedding_output_layer,
439 | net_g,
440 | sid,
441 | audio_pad[t:],
442 | None,
443 | None,
444 | index,
445 | big_npy,
446 | index_rate,
447 | )[self.t_pad_tgt : -self.t_pad_tgt]
448 | )
449 | audio_opt = np.concatenate(audio_opt)
450 | del pitch, pitchf, sid
451 | if torch.cuda.is_available():
452 | torch.cuda.empty_cache()
453 | return audio_opt
454 |
--------------------------------------------------------------------------------
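The pipeline above (lib/rvc/pipeline.py) is driven entirely through `VocalConvertPipeline.__call__`. A minimal invocation sketch follows; it assumes the HuBERT embedder (`hubert_model`) and the generator (`net_g`) have already been loaded elsewhere in the repo, and the input path and parameter values are illustrative only:

import numpy as np
import librosa

from lib.rvc.pipeline import VocalConvertPipeline

# Assumption: hubert_model (fairseq HubertModel) and net_g (SynthesizerTrnMs256NSFSid)
# are loaded beforehand; tgt_sr is the generator's output sample rate.
tgt_sr = 40000
vc = VocalConvertPipeline(tgt_sr=tgt_sr, device="cuda:0", is_half=True)

# The pipeline expects 16 kHz mono float audio.
audio, _ = librosa.load("input.wav", sr=16000, mono=True)  # hypothetical input file

out = vc(
    hubert_model,             # embedder model (or a tuple for transformers-style embedders)
    12,                       # embedding_output_layer
    net_g,                    # SynthesizerTrnMs256NSFSid
    0,                        # speaker id
    audio.astype(np.float32),
    0,                        # transpose (semitones)
    "harvest",                # f0_method: harvest / dio / crepe / mangio-crepe
    "",                       # file_index ("" disables faiss retrieval)
    0.0,                      # index_rate
    True,                     # if_f0: use pitch-conditioned inference
)
# `out` is int16 audio at tgt_sr.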
/lib/rvc/preprocessing/extract_f0.py:
--------------------------------------------------------------------------------
1 | import os
2 | import traceback
3 | from concurrent.futures import ProcessPoolExecutor
4 | from typing import *
5 | import multiprocessing as mp
6 |
7 | import numpy as np
8 | import pyworld
9 | import torch
10 | import torchcrepe
11 | from torch import Tensor
12 | from tqdm import tqdm
13 |
14 | from lib.rvc.utils import load_audio
15 |
16 | def get_optimal_torch_device(index: int = 0) -> torch.device:
17 | # Get cuda device
18 | if torch.cuda.is_available():
19 | return torch.device(f"cuda:{index % torch.cuda.device_count()}") # Very fast
20 | elif torch.backends.mps.is_available():
21 | return torch.device("mps")
22 | # TODO: also return "xla" devices here when available (requires the torch_xla.core.xla_model library)
23 | # Otherwise fall back to the CPU
24 | return torch.device("cpu")
25 |
26 | def get_f0_official_crepe_computation(
27 | x,
28 | sr,
29 | f0_min,
30 | f0_max,
31 | model="full",
32 | ):
33 | batch_size = 512
34 | torch_device = get_optimal_torch_device()
35 | audio = torch.tensor(np.copy(x))[None].float()
36 | f0, pd = torchcrepe.predict(
37 | audio,
38 | sr,
39 | 160,
40 | f0_min,
41 | f0_max,
42 | model,
43 | batch_size=batch_size,
44 | device=torch_device,
45 | return_periodicity=True,
46 | )
47 | pd = torchcrepe.filter.median(pd, 3)
48 | f0 = torchcrepe.filter.mean(f0, 3)
49 | f0[pd < 0.1] = 0
50 | f0 = f0[0].cpu().numpy()
51 | f0 = f0[1:] # Get rid of extra first frame
52 | return f0
53 |
54 | def get_f0_crepe_computation(
55 | x,
56 | sr,
57 | f0_min,
58 | f0_max,
59 | hop_length=160, # was 512. A lower hop length tracks pitch changes more closely but increases inference time.
60 | model="full", # Either use crepe-tiny "tiny" or crepe "full". Default is full
61 | ):
62 | x = x.astype(np.float32) # fixes the F.conv2D exception. We needed to convert double to float.
63 | x /= np.quantile(np.abs(x), 0.999)
64 | torch_device = get_optimal_torch_device()
65 | audio = torch.from_numpy(x).to(torch_device, copy=True)
66 | audio = torch.unsqueeze(audio, dim=0)
67 | if audio.ndim == 2 and audio.shape[0] > 1:
68 | audio = torch.mean(audio, dim=0, keepdim=True).detach()
69 | audio = audio.detach()
70 | print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
71 | pitch: Tensor = torchcrepe.predict(
72 | audio,
73 | sr,
74 | hop_length,
75 | f0_min,
76 | f0_max,
77 | model,
78 | batch_size=hop_length * 2,
79 | device=torch_device,
80 | pad=True
81 | )
82 | p_len = x.shape[0] // hop_length
83 | # Resize the pitch for final f0
84 | source = np.array(pitch.squeeze(0).cpu().float().numpy())
85 | source[source < 0.001] = np.nan
86 | target = np.interp(
87 | np.arange(0, len(source) * p_len, len(source)) / p_len,
88 | np.arange(0, len(source)),
89 | source
90 | )
91 | f0 = np.nan_to_num(target)
92 | f0 = f0[1:] # Get rid of extra first frame
93 | return f0 # Resized f0
94 |
95 |
96 | def compute_f0(
97 | path: str,
98 | f0_method: str,
99 | fs: int,
100 | hop: int,
101 | f0_max: float,
102 | f0_min: float,
103 | ):
104 | x = load_audio(path, fs)
105 | if f0_method == "harvest":
106 | f0, t = pyworld.harvest(
107 | x.astype(np.double),
108 | fs=fs,
109 | f0_ceil=f0_max,
110 | f0_floor=f0_min,
111 | frame_period=1000 * hop / fs,
112 | )
113 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, fs)
114 | elif f0_method == "dio":
115 | f0, t = pyworld.dio(
116 | x.astype(np.double),
117 | fs=fs,
118 | f0_ceil=f0_max,
119 | f0_floor=f0_min,
120 | frame_period=1000 * hop / fs,
121 | )
122 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, fs)
123 | elif f0_method == "mangio-crepe":
124 | f0 = get_f0_crepe_computation(x, fs, f0_min, f0_max, 160, "full")
125 | elif f0_method == "crepe":
126 | f0 = get_f0_official_crepe_computation(x.astype(np.double), fs, f0_min, f0_max, "full")
127 | return f0
128 |
129 |
130 | def coarse_f0(f0, f0_bin, f0_mel_min, f0_mel_max):
131 | f0_mel = 1127 * np.log(1 + f0 / 700)
132 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (
133 | f0_mel_max - f0_mel_min
134 | ) + 1
135 |
136 | # use 0 or 1
137 | f0_mel[f0_mel <= 1] = 1
138 | f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
139 | f0_coarse = np.rint(f0_mel).astype(np.int64)  # np.int is removed in NumPy >= 1.24
140 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
141 | f0_coarse.max(),
142 | f0_coarse.min(),
143 | )
144 | return f0_coarse
145 |
146 |
147 | def processor(paths, f0_method, samplerate=16000, hop_size=160, process_id=0):
148 | fs = samplerate
149 | hop = hop_size
150 |
151 | f0_bin = 256
152 | f0_max = 1100.0
153 | f0_min = 50.0
154 | f0_mel_min = 1127 * np.log(1 + f0_min / 700)
155 | f0_mel_max = 1127 * np.log(1 + f0_max / 700)
156 | if len(paths) != 0:
157 | for idx, (inp_path, opt_path1, opt_path2) in enumerate(
158 | tqdm(paths, position=1 + process_id)
159 | ):
160 | try:
161 | if (
162 | os.path.exists(opt_path1 + ".npy")
163 | and os.path.exists(opt_path2 + ".npy")
164 | ):
165 | continue
166 | featur_pit = compute_f0(inp_path, f0_method, fs, hop, f0_max, f0_min)
167 | np.save(
168 | opt_path2,
169 | featur_pit,
170 | allow_pickle=False,
171 | ) # nsf
172 | coarse_pit = coarse_f0(featur_pit, f0_bin, f0_mel_min, f0_mel_max)
173 | np.save(
174 | opt_path1,
175 | coarse_pit,
176 | allow_pickle=False,
177 | ) # ori
178 | except:
179 | print(f"f0 failed {idx}: {inp_path} {traceback.format_exc()}")
180 |
181 |
182 | def run(training_dir: str, num_processes: int, f0_method: str):
183 | paths = []
184 | dataset_dir = os.path.join(training_dir, "1_16k_wavs")
185 | opt_dir_f0 = os.path.join(training_dir, "2a_f0")
186 | opt_dir_f0_nsf = os.path.join(training_dir, "2b_f0nsf")
187 |
188 | if os.path.exists(opt_dir_f0) and os.path.exists(opt_dir_f0_nsf):
189 | return
190 |
191 | os.makedirs(opt_dir_f0, exist_ok=True)
192 | os.makedirs(opt_dir_f0_nsf, exist_ok=True)
193 |
194 | names = []
195 |
196 | for pathname in sorted(list(os.listdir(dataset_dir))):
197 | if os.path.isdir(os.path.join(dataset_dir, pathname)):
198 | for f in sorted(list(os.listdir(os.path.join(dataset_dir, pathname)))):
199 | if "spec" in f:
200 | continue
201 | names.append(os.path.join(pathname, f))
202 | else:
203 | names.append(pathname)
204 |
205 | for name in names: # dataset_dir/{05d}/file.ext
206 | filepath = os.path.join(dataset_dir, name)
207 | if "spec" in filepath:
208 | continue
209 | opt_filepath_f0 = os.path.join(opt_dir_f0, name)
210 | opt_filepath_f0_nsf = os.path.join(opt_dir_f0_nsf, name)
211 | paths.append([filepath, opt_filepath_f0, opt_filepath_f0_nsf])
212 |
213 | for dir in set([(os.path.dirname(p[1]), os.path.dirname(p[2])) for p in paths]):
214 | os.makedirs(dir[0], exist_ok=True)
215 | os.makedirs(dir[1], exist_ok=True)
216 |
217 | with ProcessPoolExecutor(mp_context=mp.get_context("spawn")) as executer:
218 | for i in range(num_processes):
219 | executer.submit(processor, paths[i::num_processes], f0_method, process_id=i)
220 |
221 | processor(paths, f0_method)
222 |
--------------------------------------------------------------------------------
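For a single file, the two arrays that `processor` writes per utterance (the continuous f0 saved under 2b_f0nsf and its coarse 1-255 quantization saved under 2a_f0) can be reproduced with the functions above. A small sketch, assuming a 16 kHz wav produced by the split step (the path is illustrative, and `load_audio` needs the ffmpeg CLI):

import numpy as np

from lib.rvc.preprocessing.extract_f0 import coarse_f0, compute_f0

f0_min, f0_max = 50.0, 1100.0
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)

# Continuous f0 (2b_f0nsf) and its coarse quantization (2a_f0).
f0 = compute_f0(
    "models/training/my-model/1_16k_wavs/00000/0_0.wav",  # hypothetical path
    "harvest", fs=16000, hop=160, f0_max=f0_max, f0_min=f0_min,
)
pitch = coarse_f0(f0, f0_bin=256, f0_mel_min=f0_mel_min, f0_mel_max=f0_mel_max)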
/lib/rvc/preprocessing/extract_feature.py:
--------------------------------------------------------------------------------
1 | import multiprocessing as mp
2 | import os
3 | import traceback
4 | from concurrent.futures import ProcessPoolExecutor
5 | from typing import *
6 |
7 | import numpy as np
8 | import soundfile as sf
9 | import torch
10 | import torch.nn.functional as F
11 | from fairseq import checkpoint_utils
12 | from tqdm import tqdm
13 |
14 | ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
15 | MODELS_DIR = os.path.join(ROOT_DIR, "models")
16 | EMBEDDINGS_LIST = {
17 | "hubert-base-japanese": (
18 | "rinna_hubert_base_jp.pt",
19 | "hubert-base-japanese",
20 | "local",
21 | ),
22 | "contentvec": ("checkpoint_best_legacy_500.pt", "contentvec", "local"),
23 | }
24 |
25 | def get_embedder(embedder_name):
26 | if embedder_name in EMBEDDINGS_LIST:
27 | return EMBEDDINGS_LIST[embedder_name]
28 | return None
29 |
30 |
31 | def load_embedder(embedder_path: str, device):
32 | try:
33 | models, cfg, _ = checkpoint_utils.load_model_ensemble_and_task(
34 | [embedder_path],
35 | suffix="",
36 | )
37 | embedder_model = models[0]
38 | embedder_model = embedder_model.to(device)
39 | if device != "cpu":
40 | embedder_model = embedder_model.half()
41 | else:
42 | embedder_model = embedder_model.float()
43 | embedder_model.eval()
44 | except Exception as e:
45 | print(f"Error: {e} {embedder_path}")
46 | traceback.print_exc()
47 |
48 | return embedder_model, cfg
49 |
50 |
51 | # wave must be 16k, hop_size=320
52 | def readwave(wav_path, normalize=False):
53 | wav, sr = sf.read(wav_path)
54 | assert sr == 16000
55 | feats = torch.from_numpy(wav).float()
56 | if feats.dim() == 2: # double channels
57 | feats = feats.mean(-1)
58 | assert feats.dim() == 1, feats.dim()
59 | if normalize:
60 | with torch.no_grad():
61 | feats = F.layer_norm(feats, feats.shape)
62 | feats = feats.view(1, -1)
63 | return feats
64 |
65 |
66 | def processor(
67 | todo: List[str],
68 | device: torch.device,
69 | embedder_path: str,
70 | embedder_load_from: str,
71 | embedding_channel: bool,
72 | embedding_output_layer: int,
73 | wav_dir: str,
74 | out_dir: str,
75 | process_id: int,
76 | ):
77 | half_support = (
78 | device.type == "cuda" and torch.cuda.get_device_capability(device)[0] >= 5.3
79 | )
80 | is_feats_dim_768 = embedding_channel == 768
81 |
82 | if embedder_load_from == "local" and not os.path.exists(embedder_path):
83 | return f"Embedder not found: {embedder_path}"
84 |
85 | model, cfg = load_embedder(embedder_path, device)
86 |
87 | for file in tqdm(todo, position=1 + process_id):
88 | try:
89 | if file.endswith(".wav"):
90 | wav_filepath = os.path.join(wav_dir, file)
91 | out_filepath = os.path.join(out_dir, file.replace("wav", "npy"))
92 |
93 | if os.path.exists(out_filepath):
94 | continue
95 |
96 | os.makedirs(os.path.dirname(out_filepath), exist_ok=True)
97 |
98 | is_normalize = False if cfg is None else cfg.task.normalize
99 | feats = readwave(wav_filepath, normalize=is_normalize)
100 | padding_mask = torch.BoolTensor(feats.shape).fill_(False)
101 | if isinstance(model, tuple):
102 | feats = model[0](
103 | feats.squeeze(0).squeeze(0).to(device),
104 | return_tensors="pt",
105 | sampling_rate=16000,
106 | )
107 | if half_support:
108 | feats = feats.input_values.to(device).half()
109 | else:
110 | feats = feats.input_values.to(device).float()
111 |
112 | with torch.no_grad():
113 | if half_support:
114 | if is_feats_dim_768:
115 | feats = model[1](feats).last_hidden_state
116 | else:
117 | feats = model[1](feats).extract_features
118 | else:
119 | if is_feats_dim_768:
120 | feats = model[1].float()(feats).last_hidden_state
121 | else:
122 | feats = model[1].float()(feats).extract_features
123 | else:
124 | inputs = {
125 | "source": feats.half().to(device)
126 | if half_support
127 | else feats.to(device),
128 | "padding_mask": padding_mask.to(device),
129 | "output_layer": embedding_output_layer,
130 | }
131 |
132 | # Somehow the features are still float16 at this point, so convert them again
133 | if not half_support:
134 | model = model.float()
135 | inputs["source"] = inputs["source"].float()
136 |
137 | with torch.no_grad():
138 | logits = model.extract_features(**inputs)
139 | if is_feats_dim_768:
140 | feats = logits[0]
141 | else:
142 | feats = model.final_proj(logits[0])
143 |
144 | feats = feats.squeeze(0).float().cpu().numpy()
145 | if np.isnan(feats).sum() == 0:
146 | np.save(out_filepath, feats, allow_pickle=False)
147 | else:
148 | print(f"{file} contains nan")
149 | except Exception as e:
150 | print(f"Error: {e} {file}")
151 | traceback.print_exc()
152 |
153 |
154 | def run(
155 | training_dir: str,
156 | embedder_path: str,
157 | embedder_load_from: str,
158 | embedding_channel: int,
159 | embedding_output_layer: int,
160 | gpu_ids: List[int],
161 | device: Optional[Union[torch.device, str]] = None,
162 | ):
163 | wav_dir = os.path.join(training_dir, "1_16k_wavs")
164 | out_dir = os.path.join(training_dir, "3_feature256")
165 |
166 | num_gpus = len(gpu_ids)
167 |
168 | for gpu_id in gpu_ids:
169 | if num_gpus < gpu_id + 1:
170 | print(f"GPU {gpu_id} is not available")
171 | return
172 |
173 | if os.path.exists(out_dir):
174 | return
175 |
176 | os.makedirs(out_dir, exist_ok=True)
177 |
178 | todo = [
179 | os.path.join(dir, f)
180 | for dir in sorted(list(os.listdir(wav_dir)))
181 | if os.path.isdir(os.path.join(wav_dir, dir))
182 | for f in sorted(list(os.listdir(os.path.join(wav_dir, dir))))
183 | ]
184 |
185 | if device is not None:
186 | if type(device) == str:
187 | device = torch.device(device)
188 | if device.type == "mps":
189 | device = torch.device(
190 | "cpu"
191 | ) # Mac(MPS) crashes when multiprocess, so change to CPU.
192 | processor(
193 | todo,
194 | device,
195 | embedder_path,
196 | embedder_load_from,
197 | embedding_channel,
198 | embedding_output_layer,
199 | wav_dir,
200 | out_dir,
201 | process_id=0,
202 | )
203 | else:
204 | with ProcessPoolExecutor(mp_context=mp.get_context("spawn")) as executor:
205 | for i, id in enumerate(gpu_ids):
206 | executor.submit(
207 | processor,
208 | todo[i::num_gpus],
209 | torch.device(f"cuda:{id}"),
210 | embedder_path,
211 | embedder_load_from,
212 | embedding_channel,
213 | embedding_output_layer,
214 | wav_dir,
215 | out_dir,
216 | process_id=i,
217 | )
218 |
--------------------------------------------------------------------------------
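A sketch of running feature extraction on a prepared training directory. `run` reads 1_16k_wavs/ and writes 3_feature256/ beneath `training_dir`; passing `device` forces single-process extraction (as the MPS branch does), otherwise one worker per GPU id is spawned. The directory and the layer/channel values are illustrative and must match the chosen embedder and model config:

import os

from lib.rvc.preprocessing import extract_feature

embedder_path = os.path.join(
    extract_feature.MODELS_DIR, "embeddings", "checkpoint_best_legacy_500.pt"
)

extract_feature.run(
    training_dir="models/training/my-model",  # hypothetical
    embedder_path=embedder_path,
    embedder_load_from="local",
    embedding_channel=256,
    embedding_output_layer=9,
    gpu_ids=[0],
    device="cuda:0",
)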
/lib/rvc/preprocessing/slicer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | # This function is obtained from librosa.
5 | def get_rms(
6 | y,
7 | frame_length=2048,
8 | hop_length=512,
9 | pad_mode="constant",
10 | ):
11 | padding = (int(frame_length // 2), int(frame_length // 2))
12 | y = np.pad(y, padding, mode=pad_mode)
13 |
14 | axis = -1
15 | # put our new within-frame axis at the end for now
16 | out_strides = y.strides + tuple([y.strides[axis]])
17 | # Reduce the shape on the framing axis
18 | x_shape_trimmed = list(y.shape)
19 | x_shape_trimmed[axis] -= frame_length - 1
20 | out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
21 | xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
22 | if axis < 0:
23 | target_axis = axis - 1
24 | else:
25 | target_axis = axis + 1
26 | xw = np.moveaxis(xw, -1, target_axis)
27 | # Downsample along the target axis
28 | slices = [slice(None)] * xw.ndim
29 | slices[axis] = slice(0, None, hop_length)
30 | x = xw[tuple(slices)]
31 |
32 | # Calculate power
33 | power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
34 |
35 | return np.sqrt(power)
36 |
37 |
38 | class Slicer:
39 | def __init__(
40 | self,
41 | sr: int,
42 | threshold: float = -40.0,
43 | min_length: int = 5000,
44 | min_interval: int = 300,
45 | hop_size: int = 20,
46 | max_sil_kept: int = 5000,
47 | ):
48 | if not min_length >= min_interval >= hop_size:
49 | raise ValueError(
50 | "The following condition must be satisfied: min_length >= min_interval >= hop_size"
51 | )
52 | if not max_sil_kept >= hop_size:
53 | raise ValueError(
54 | "The following condition must be satisfied: max_sil_kept >= hop_size"
55 | )
56 | min_interval = sr * min_interval / 1000
57 | self.threshold = 10 ** (threshold / 20.0)
58 | self.hop_size = round(sr * hop_size / 1000)
59 | self.win_size = min(round(min_interval), 4 * self.hop_size)
60 | self.min_length = round(sr * min_length / 1000 / self.hop_size)
61 | self.min_interval = round(min_interval / self.hop_size)
62 | self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
63 |
64 | def _apply_slice(self, waveform, begin, end):
65 | if len(waveform.shape) > 1:
66 | return waveform[
67 | :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)
68 | ]
69 | else:
70 | return waveform[
71 | begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)
72 | ]
73 |
74 | # @timeit
75 | def slice(self, waveform):
76 | if len(waveform.shape) > 1:
77 | samples = waveform.mean(axis=0)
78 | else:
79 | samples = waveform
80 | if samples.shape[0] <= self.min_length:
81 | return [waveform]
82 | rms_list = get_rms(
83 | y=samples, frame_length=self.win_size, hop_length=self.hop_size
84 | ).squeeze(0)
85 | sil_tags = []
86 | silence_start = None
87 | clip_start = 0
88 | for i, rms in enumerate(rms_list):
89 | # Keep looping while frame is silent.
90 | if rms < self.threshold:
91 | # Record start of silent frames.
92 | if silence_start is None:
93 | silence_start = i
94 | continue
95 | # Keep looping while frame is not silent and silence start has not been recorded.
96 | if silence_start is None:
97 | continue
98 | # Clear recorded silence start if interval is not enough or clip is too short
99 | is_leading_silence = silence_start == 0 and i > self.max_sil_kept
100 | need_slice_middle = (
101 | i - silence_start >= self.min_interval
102 | and i - clip_start >= self.min_length
103 | )
104 | if not is_leading_silence and not need_slice_middle:
105 | silence_start = None
106 | continue
107 | # Need slicing. Record the range of silent frames to be removed.
108 | if i - silence_start <= self.max_sil_kept:
109 | pos = rms_list[silence_start : i + 1].argmin() + silence_start
110 | if silence_start == 0:
111 | sil_tags.append((0, pos))
112 | else:
113 | sil_tags.append((pos, pos))
114 | clip_start = pos
115 | elif i - silence_start <= self.max_sil_kept * 2:
116 | pos = rms_list[
117 | i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
118 | ].argmin()
119 | pos += i - self.max_sil_kept
120 | pos_l = (
121 | rms_list[
122 | silence_start : silence_start + self.max_sil_kept + 1
123 | ].argmin()
124 | + silence_start
125 | )
126 | pos_r = (
127 | rms_list[i - self.max_sil_kept : i + 1].argmin()
128 | + i
129 | - self.max_sil_kept
130 | )
131 | if silence_start == 0:
132 | sil_tags.append((0, pos_r))
133 | clip_start = pos_r
134 | else:
135 | sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
136 | clip_start = max(pos_r, pos)
137 | else:
138 | pos_l = (
139 | rms_list[
140 | silence_start : silence_start + self.max_sil_kept + 1
141 | ].argmin()
142 | + silence_start
143 | )
144 | pos_r = (
145 | rms_list[i - self.max_sil_kept : i + 1].argmin()
146 | + i
147 | - self.max_sil_kept
148 | )
149 | if silence_start == 0:
150 | sil_tags.append((0, pos_r))
151 | else:
152 | sil_tags.append((pos_l, pos_r))
153 | clip_start = pos_r
154 | silence_start = None
155 | # Deal with trailing silence.
156 | total_frames = rms_list.shape[0]
157 | if (
158 | silence_start is not None
159 | and total_frames - silence_start >= self.min_interval
160 | ):
161 | silence_end = min(total_frames, silence_start + self.max_sil_kept)
162 | pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
163 | sil_tags.append((pos, total_frames + 1))
164 | # Apply and return slices.
165 | if len(sil_tags) == 0:
166 | return [waveform]
167 | else:
168 | chunks = []
169 | if sil_tags[0][0] > 0:
170 | chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
171 | for i in range(len(sil_tags) - 1):
172 | chunks.append(
173 | self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0])
174 | )
175 | if sil_tags[-1][1] < total_frames:
176 | chunks.append(
177 | self._apply_slice(waveform, sil_tags[-1][1], total_frames)
178 | )
179 | return chunks
180 |
--------------------------------------------------------------------------------
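A short usage sketch for the Slicer class above, using the same constructor values that split.py passes during preprocessing (the input path is illustrative):

import librosa

from lib.rvc.preprocessing.slicer import Slicer

audio, sr = librosa.load("vocals.wav", sr=40000, mono=True)  # hypothetical input
slicer = Slicer(
    sr=sr,
    threshold=-42,
    min_length=1500,
    min_interval=400,
    hop_size=15,
    max_sil_kept=500,
)
chunks = slicer.slice(audio)  # list of np.ndarray segments with long silences cut out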
/lib/rvc/preprocessing/split.py:
--------------------------------------------------------------------------------
1 | import operator
2 | import os
3 | from concurrent.futures import ProcessPoolExecutor
4 | from typing import *
5 |
6 | import librosa
7 | import numpy as np
8 | import scipy.signal as signal
9 | from scipy.io import wavfile
10 | from tqdm import tqdm
11 |
12 | from lib.rvc.utils import load_audio
13 |
14 | from .slicer import Slicer
15 |
16 |
17 | def norm_write(
18 | tmp_audio: np.ndarray,
19 | idx0: int,
20 | idx1: int,
21 | speaker_id: int,
22 | outdir: str,
23 | outdir_16k: str,
24 | sampling_rate: int,
25 | max: float,
26 | alpha: float,
27 | is_normalize: bool,
28 | ):
29 | if is_normalize:
30 | tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (max * alpha)) + (
31 | 1 - alpha
32 | ) * tmp_audio
33 | else:
34 | # clip the level to max (floating-point decoding can occasionally overshoot)
35 | audio_min = np.min(tmp_audio)
36 | if audio_min < -max:
37 | tmp_audio = tmp_audio / -audio_min * max
38 | audio_max = np.max(tmp_audio)
39 | if audio_max > max:
40 | tmp_audio = tmp_audio / audio_max * max
41 |
42 | wavfile.write(
43 | os.path.join(outdir, f"{speaker_id:05}", f"{idx0}_{idx1}.wav"),
44 | sampling_rate,
45 | tmp_audio.astype(np.float32),
46 | )
47 |
48 | tmp_audio = librosa.resample(
49 | tmp_audio, orig_sr=sampling_rate, target_sr=16000, res_type="soxr_vhq"
50 | )
51 | wavfile.write(
52 | os.path.join(outdir_16k, f"{speaker_id:05}", f"{idx0}_{idx1}.wav"),
53 | 16000,
54 | tmp_audio.astype(np.float32),
55 | )
56 |
57 |
58 | def write_mute(
59 | mute_wave_filename: str,
60 | speaker_id: int,
61 | outdir: str,
62 | outdir_16k: str,
63 | sampling_rate: int,
64 | ):
65 | tmp_audio = load_audio(mute_wave_filename, sampling_rate)
66 | wavfile.write(
67 | os.path.join(outdir, f"{speaker_id:05}", "mute.wav"),
68 | sampling_rate,
69 | tmp_audio.astype(np.float32),
70 | )
71 | tmp_audio = librosa.resample(
72 | tmp_audio, orig_sr=sampling_rate, target_sr=16000, res_type="soxr_vhq"
73 | )
74 | wavfile.write(
75 | os.path.join(outdir_16k, f"{speaker_id:05}", "mute.wav"),
76 | 16000,
77 | tmp_audio.astype(np.float32),
78 | )
79 |
80 |
81 | def pipeline(
82 | slicer: Slicer,
83 | datasets: List[Tuple[str, int]], # List[(path, speaker_id)]
84 | outdir: str,
85 | outdir_16k: str,
86 | sampling_rate: int,
87 | is_normalize: bool,
88 | process_id: int = 0,
89 | ):
90 | per = 3.7
91 | overlap = 0.3
92 | tail = per + overlap
93 | max = 0.95
94 | alpha = 0.8
95 |
96 | bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=sampling_rate)
97 |
98 | for index, (wave_filename, speaker_id) in tqdm(datasets, position=1 + process_id):
99 | audio = load_audio(wave_filename, sampling_rate)
100 | audio = signal.lfilter(bh, ah, audio)
101 |
102 | idx1 = 0
103 | for audio in slicer.slice(audio):
104 | i = 0
105 | while True:
106 | start = int(sampling_rate * (per - overlap) * i)
107 | i += 1
108 | if len(audio[start:]) > tail * sampling_rate:
109 | tmp_audio = audio[start : start + int(per * sampling_rate)]
110 | norm_write(
111 | tmp_audio,
112 | index,
113 | idx1,
114 | speaker_id,
115 | outdir,
116 | outdir_16k,
117 | sampling_rate,
118 | max,
119 | alpha,
120 | is_normalize,
121 | )
122 | idx1 += 1
123 | else:
124 | tmp_audio = audio[start:]
125 | break
126 | norm_write(
127 | tmp_audio,
128 | index,
129 | idx1,
130 | speaker_id,
131 | outdir,
132 | outdir_16k,
133 | sampling_rate,
134 | max,
135 | alpha,
136 | is_normalize,
137 | )
138 | idx1 += 1
139 |
140 |
141 | def preprocess_audio(
142 | datasets: List[Tuple[str, int]], # List[(path, speaker_id)]
143 | sampling_rate: int,
144 | num_processes: int,
145 | training_dir: str,
146 | is_normalize: bool,
147 | mute_wav_path: str,
148 | ):
149 | waves_dir = os.path.join(training_dir, "0_gt_wavs")
150 | waves16k_dir = os.path.join(training_dir, "1_16k_wavs")
151 | if os.path.exists(waves_dir) and os.path.exists(waves16k_dir):
152 | return
153 |
154 | for speaker_id in set([spk for _, spk in datasets]):
155 | os.makedirs(os.path.join(waves_dir, f"{speaker_id:05}"), exist_ok=True)
156 | os.makedirs(os.path.join(waves16k_dir, f"{speaker_id:05}"), exist_ok=True)
157 |
158 | all = [(i, x) for i, x in enumerate(sorted(datasets, key=operator.itemgetter(0)))]
159 |
160 | # number of datasets per process
161 | process_all_nums = [len(all) // num_processes] * num_processes
162 | # add residual datasets
163 | for i in range(len(all) % num_processes):
164 | process_all_nums[i] += 1
165 |
166 | assert len(all) == sum(process_all_nums), print(
167 | f"len(all): {len(all)}, sum(process_all_nums): {sum(process_all_nums)}"
168 | )
169 |
170 | with ProcessPoolExecutor(max_workers=num_processes) as executor:
171 | all_index = 0
172 | for i in range(num_processes):
173 | data = all[all_index : all_index + process_all_nums[i]]
174 | slicer = Slicer(
175 | sr=sampling_rate,
176 | threshold=-42,
177 | min_length=1500,
178 | min_interval=400,
179 | hop_size=15,
180 | max_sil_kept=500,
181 | )
182 | executor.submit(
183 | pipeline,
184 | slicer,
185 | data,
186 | waves_dir,
187 | waves16k_dir,
188 | sampling_rate,
189 | is_normalize,
190 | process_id=i,
191 | )
192 | all_index += process_all_nums[i]
193 |
194 | for speaker_id in set([spk for _, spk in datasets]):
195 | write_mute(mute_wav_path, speaker_id, waves_dir, waves16k_dir, sampling_rate)
196 |
--------------------------------------------------------------------------------
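A sketch of the top-level preprocessing call, assuming a single-speaker dataset at 40 kHz and the bundled mute wav (all paths are illustrative):

import os

from lib.rvc.preprocessing.split import preprocess_audio

sampling_rate = 40000
datasets = [  # (path, speaker_id)
    ("dataset/raw/take1.wav", 0),
    ("dataset/raw/take2.wav", 0),
]

preprocess_audio(
    datasets,
    sampling_rate=sampling_rate,
    num_processes=2,
    training_dir="models/training/my-model",  # hypothetical
    is_normalize=True,
    mute_wav_path=os.path.join(
        "models", "training", "mute", "0_gt_wavs", f"mute{sampling_rate // 1000}k.wav"
    ),
)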
/lib/rvc/transforms.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from torch.nn import functional as F
4 |
5 | DEFAULT_MIN_BIN_WIDTH = 1e-3
6 | DEFAULT_MIN_BIN_HEIGHT = 1e-3
7 | DEFAULT_MIN_DERIVATIVE = 1e-3
8 |
9 |
10 | def piecewise_rational_quadratic_transform(
11 | inputs,
12 | unnormalized_widths,
13 | unnormalized_heights,
14 | unnormalized_derivatives,
15 | inverse=False,
16 | tails=None,
17 | tail_bound=1.0,
18 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
19 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
20 | min_derivative=DEFAULT_MIN_DERIVATIVE,
21 | ):
22 | if tails is None:
23 | spline_fn = rational_quadratic_spline
24 | spline_kwargs = {}
25 | else:
26 | spline_fn = unconstrained_rational_quadratic_spline
27 | spline_kwargs = {"tails": tails, "tail_bound": tail_bound}
28 |
29 | outputs, logabsdet = spline_fn(
30 | inputs=inputs,
31 | unnormalized_widths=unnormalized_widths,
32 | unnormalized_heights=unnormalized_heights,
33 | unnormalized_derivatives=unnormalized_derivatives,
34 | inverse=inverse,
35 | min_bin_width=min_bin_width,
36 | min_bin_height=min_bin_height,
37 | min_derivative=min_derivative,
38 | **spline_kwargs
39 | )
40 | return outputs, logabsdet
41 |
42 |
43 | def searchsorted(bin_locations, inputs, eps=1e-6):
44 | bin_locations[..., -1] += eps
45 | return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
46 |
47 |
48 | def unconstrained_rational_quadratic_spline(
49 | inputs,
50 | unnormalized_widths,
51 | unnormalized_heights,
52 | unnormalized_derivatives,
53 | inverse=False,
54 | tails="linear",
55 | tail_bound=1.0,
56 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
57 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
58 | min_derivative=DEFAULT_MIN_DERIVATIVE,
59 | ):
60 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
61 | outside_interval_mask = ~inside_interval_mask
62 |
63 | outputs = torch.zeros_like(inputs)
64 | logabsdet = torch.zeros_like(inputs)
65 |
66 | if tails == "linear":
67 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
68 | constant = np.log(np.exp(1 - min_derivative) - 1)
69 | unnormalized_derivatives[..., 0] = constant
70 | unnormalized_derivatives[..., -1] = constant
71 |
72 | outputs[outside_interval_mask] = inputs[outside_interval_mask]
73 | logabsdet[outside_interval_mask] = 0
74 | else:
75 | raise RuntimeError("{} tails are not implemented.".format(tails))
76 |
77 | (
78 | outputs[inside_interval_mask],
79 | logabsdet[inside_interval_mask],
80 | ) = rational_quadratic_spline(
81 | inputs=inputs[inside_interval_mask],
82 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
83 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
84 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
85 | inverse=inverse,
86 | left=-tail_bound,
87 | right=tail_bound,
88 | bottom=-tail_bound,
89 | top=tail_bound,
90 | min_bin_width=min_bin_width,
91 | min_bin_height=min_bin_height,
92 | min_derivative=min_derivative,
93 | )
94 |
95 | return outputs, logabsdet
96 |
97 |
98 | def rational_quadratic_spline(
99 | inputs,
100 | unnormalized_widths,
101 | unnormalized_heights,
102 | unnormalized_derivatives,
103 | inverse=False,
104 | left=0.0,
105 | right=1.0,
106 | bottom=0.0,
107 | top=1.0,
108 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
109 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
110 | min_derivative=DEFAULT_MIN_DERIVATIVE,
111 | ):
112 | if torch.min(inputs) < left or torch.max(inputs) > right:
113 | raise ValueError("Input to a transform is not within its domain")
114 |
115 | num_bins = unnormalized_widths.shape[-1]
116 |
117 | if min_bin_width * num_bins > 1.0:
118 | raise ValueError("Minimal bin width too large for the number of bins")
119 | if min_bin_height * num_bins > 1.0:
120 | raise ValueError("Minimal bin height too large for the number of bins")
121 |
122 | widths = F.softmax(unnormalized_widths, dim=-1)
123 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
124 | cumwidths = torch.cumsum(widths, dim=-1)
125 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
126 | cumwidths = (right - left) * cumwidths + left
127 | cumwidths[..., 0] = left
128 | cumwidths[..., -1] = right
129 | widths = cumwidths[..., 1:] - cumwidths[..., :-1]
130 |
131 | derivatives = min_derivative + F.softplus(unnormalized_derivatives)
132 |
133 | heights = F.softmax(unnormalized_heights, dim=-1)
134 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
135 | cumheights = torch.cumsum(heights, dim=-1)
136 | cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
137 | cumheights = (top - bottom) * cumheights + bottom
138 | cumheights[..., 0] = bottom
139 | cumheights[..., -1] = top
140 | heights = cumheights[..., 1:] - cumheights[..., :-1]
141 |
142 | if inverse:
143 | bin_idx = searchsorted(cumheights, inputs)[..., None]
144 | else:
145 | bin_idx = searchsorted(cumwidths, inputs)[..., None]
146 |
147 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
148 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
149 |
150 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
151 | delta = heights / widths
152 | input_delta = delta.gather(-1, bin_idx)[..., 0]
153 |
154 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
155 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
156 |
157 | input_heights = heights.gather(-1, bin_idx)[..., 0]
158 |
159 | if inverse:
160 | a = (inputs - input_cumheights) * (
161 | input_derivatives + input_derivatives_plus_one - 2 * input_delta
162 | ) + input_heights * (input_delta - input_derivatives)
163 | b = input_heights * input_derivatives - (inputs - input_cumheights) * (
164 | input_derivatives + input_derivatives_plus_one - 2 * input_delta
165 | )
166 | c = -input_delta * (inputs - input_cumheights)
167 |
168 | discriminant = b.pow(2) - 4 * a * c
169 | assert (discriminant >= 0).all()
170 |
171 | root = (2 * c) / (-b - torch.sqrt(discriminant))
172 | outputs = root * input_bin_widths + input_cumwidths
173 |
174 | theta_one_minus_theta = root * (1 - root)
175 | denominator = input_delta + (
176 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
177 | * theta_one_minus_theta
178 | )
179 | derivative_numerator = input_delta.pow(2) * (
180 | input_derivatives_plus_one * root.pow(2)
181 | + 2 * input_delta * theta_one_minus_theta
182 | + input_derivatives * (1 - root).pow(2)
183 | )
184 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
185 |
186 | return outputs, -logabsdet
187 | else:
188 | theta = (inputs - input_cumwidths) / input_bin_widths
189 | theta_one_minus_theta = theta * (1 - theta)
190 |
191 | numerator = input_heights * (
192 | input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
193 | )
194 | denominator = input_delta + (
195 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
196 | * theta_one_minus_theta
197 | )
198 | outputs = input_cumheights + numerator / denominator
199 |
200 | derivative_numerator = input_delta.pow(2) * (
201 | input_derivatives_plus_one * theta.pow(2)
202 | + 2 * input_delta * theta_one_minus_theta
203 | + input_derivatives * (1 - theta).pow(2)
204 | )
205 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
206 |
207 | return outputs, logabsdet
208 |
--------------------------------------------------------------------------------
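The spline above is bijective on [-tail_bound, tail_bound], and the log-determinants returned by the forward and inverse passes cancel. A self-contained round-trip check, with shapes chosen to match the function's expectations (num_bins widths/heights and num_bins - 1 interior derivatives when linear tails are used):

import torch

from lib.rvc.transforms import piecewise_rational_quadratic_transform

torch.manual_seed(0)
batch, num_bins = (2, 6), 10
inputs = torch.rand(*batch) * 2 - 1  # inside the tail bound [-1, 1]
widths = torch.randn(*batch, num_bins)
heights = torch.randn(*batch, num_bins)
derivs = torch.randn(*batch, num_bins - 1)

y, logdet = piecewise_rational_quadratic_transform(
    inputs, widths, heights, derivs, inverse=False, tails="linear", tail_bound=1.0
)
x, inv_logdet = piecewise_rational_quadratic_transform(
    y, widths, heights, derivs, inverse=True, tails="linear", tail_bound=1.0
)
assert torch.allclose(x, inputs, atol=1e-4)
assert torch.allclose(logdet + inv_logdet, torch.zeros_like(logdet), atol=1e-4)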
/lib/rvc/utils.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import logging
3 | import os
4 | import shutil
5 | import socket
6 | import sys
7 |
8 | import ffmpeg
9 | import matplotlib
10 | import matplotlib.pylab as plt
11 | import numpy as np
12 | import torch
13 | from scipy.io.wavfile import read
14 | from torch.nn import functional as F
15 |
16 | from modules.shared import ROOT_DIR
17 |
18 | from .config import TrainConfig
19 |
20 | matplotlib.use("Agg")
21 | logging.getLogger("matplotlib").setLevel(logging.WARNING)
22 |
23 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
24 | logger = logging
25 |
26 |
27 | def load_audio(file: str, sr):
28 | try:
29 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
30 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
31 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
32 | file = (
33 | file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
34 | ) # strip stray spaces, quotes, and newlines that often surround copy-pasted paths
35 | out, _ = (
36 | ffmpeg.input(file, threads=0)
37 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
38 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
39 | )
40 | except Exception as e:
41 | raise RuntimeError(f"Failed to load audio: {e}")
42 |
43 | return np.frombuffer(out, np.float32).flatten()
44 |
45 |
46 | def find_empty_port():
47 | s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
48 | s.bind(("", 0))
49 | s.listen(1)
50 | port = s.getsockname()[1]
51 | s.close()
52 | return port
53 |
54 |
55 | def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1):
56 | assert os.path.isfile(checkpoint_path)
57 | checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
58 |
59 | saved_state_dict = checkpoint_dict["model"]
60 | if hasattr(model, "module"):
61 | state_dict = model.module.state_dict()
62 | else:
63 | state_dict = model.state_dict()
64 | new_state_dict = {}
65 | for k, v in state_dict.items():  # iterate over the shapes the current model expects
66 | try:
67 | new_state_dict[k] = saved_state_dict[k]
68 | if saved_state_dict[k].shape != state_dict[k].shape:
69 | print(
70 | f"shape-{k}-mismatch|need-{state_dict[k].shape}|get-{saved_state_dict[k].shape}"
71 | )
72 | if saved_state_dict[k].dim() == 2:  # NOTE: check whether this is OK
73 | # for embedded inputs, 256 <==> 768
74 | # this lets training resume from the original pretrained checkpoints even when the embedder outputs 768-dim features, etc.
75 | if saved_state_dict[k].dtype == torch.half:
76 | new_state_dict[k] = (
77 | F.interpolate(
78 | saved_state_dict[k].float().unsqueeze(0).unsqueeze(0),
79 | size=state_dict[k].shape,
80 | mode="bilinear",
81 | )
82 | .half()
83 | .squeeze(0)
84 | .squeeze(0)
85 | )
86 | else:
87 | new_state_dict[k] = (
88 | F.interpolate(
89 | saved_state_dict[k].unsqueeze(0).unsqueeze(0),
90 | size=state_dict[k].shape,
91 | mode="bilinear",
92 | )
93 | .squeeze(0)
94 | .squeeze(0)
95 | )
96 | print(
97 | "interpolated new_state_dict",
98 | k,
99 | "from",
100 | saved_state_dict[k].shape,
101 | "to",
102 | new_state_dict[k].shape,
103 | )
104 | else:
105 | raise KeyError
106 | except Exception as e:
107 | # print(traceback.format_exc())
108 | print(f"{k} is not in the checkpoint")
109 | print("error: %s" % e)
110 | new_state_dict[k] = v  # keep the model's own randomly initialized value
111 | if hasattr(model, "module"):
112 | model.module.load_state_dict(new_state_dict, strict=False)
113 | else:
114 | model.load_state_dict(new_state_dict, strict=False)
115 | print("Loaded model weights")
116 |
117 | epoch = checkpoint_dict["epoch"]
118 | learning_rate = checkpoint_dict["learning_rate"]
119 | if optimizer is not None and load_opt == 1:
120 | optimizer.load_state_dict(checkpoint_dict["optimizer"])
121 | print("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, epoch))
122 | return model, optimizer, learning_rate, epoch
123 |
124 |
125 | def save_state(model, optimizer, learning_rate, epoch, checkpoint_path):
126 | print(
127 | "Saving model and optimizer state at epoch {} to {}".format(
128 | epoch, checkpoint_path
129 | )
130 | )
131 | if hasattr(model, "module"):
132 | state_dict = model.module.state_dict()
133 | else:
134 | state_dict = model.state_dict()
135 | torch.save(
136 | {
137 | "model": state_dict,
138 | "epoch": epoch,
139 | "optimizer": optimizer.state_dict(),
140 | "learning_rate": learning_rate,
141 | },
142 | checkpoint_path,
143 | )
144 |
145 |
146 | def summarize(
147 | writer,
148 | global_step,
149 | scalars={},
150 | histograms={},
151 | images={},
152 | audios={},
153 | audio_sampling_rate=22050,
154 | ):
155 | for k, v in scalars.items():
156 | writer.add_scalar(k, v, global_step)
157 | for k, v in histograms.items():
158 | writer.add_histogram(k, v, global_step)
159 | for k, v in images.items():
160 | writer.add_image(k, v, global_step, dataformats="HWC")
161 | for k, v in audios.items():
162 | writer.add_audio(k, v, global_step, audio_sampling_rate)
163 |
164 |
165 | def latest_checkpoint_path(dir_path, regex="G_*.pth"):
166 | filelist = glob.glob(os.path.join(dir_path, regex))
167 | if len(filelist) == 0:
168 | return None
169 | filelist.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
170 | filepath = filelist[-1]
171 | return filepath
172 |
173 |
174 | def plot_spectrogram_to_numpy(spectrogram):
175 | fig, ax = plt.subplots(figsize=(10, 2))
176 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
177 | plt.colorbar(im, ax=ax)
178 | plt.xlabel("Frames")
179 | plt.ylabel("Channels")
180 | plt.tight_layout()
181 |
182 | fig.canvas.draw()
183 | data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)  # np.fromstring is deprecated
184 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
185 | plt.close()
186 | return data
187 |
188 |
189 | def plot_alignment_to_numpy(alignment, info=None):
190 | fig, ax = plt.subplots(figsize=(6, 4))
191 | im = ax.imshow(
192 | alignment.transpose(), aspect="auto", origin="lower", interpolation="none"
193 | )
194 | fig.colorbar(im, ax=ax)
195 | xlabel = "Decoder timestep"
196 | if info is not None:
197 | xlabel += "\n\n" + info
198 | plt.xlabel(xlabel)
199 | plt.ylabel("Encoder timestep")
200 | plt.tight_layout()
201 |
202 | fig.canvas.draw()
203 | data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)  # np.fromstring is deprecated
204 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
205 | plt.close()
206 | return data
207 |
208 |
209 | def load_wav_to_torch(full_path):
210 | sampling_rate, data = read(full_path)
211 | return torch.FloatTensor(data.astype(np.float32)), sampling_rate
212 |
213 |
214 | def load_config(training_dir: str, sample_rate: int, emb_channels: int):
215 | if emb_channels == 256:
216 | config_path = os.path.join(ROOT_DIR, "configs", f"{sample_rate}.json")
217 | else:
218 | config_path = os.path.join(
219 | ROOT_DIR, "configs", f"{sample_rate}-{emb_channels}.json"
220 | )
221 | config_save_path = os.path.join(training_dir, "config.json")
222 |
223 | shutil.copyfile(config_path, config_save_path)
224 |
225 | return TrainConfig.parse_file(config_save_path)
226 |
--------------------------------------------------------------------------------
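Two helpers above are used throughout the repo: `load_audio` (decodes any ffmpeg-readable file to mono float32 at the requested rate, and therefore needs the ffmpeg CLI on PATH) and `latest_checkpoint_path`. A brief sketch with illustrative paths:

from lib.rvc.utils import latest_checkpoint_path, load_audio

audio = load_audio("input.m4a", 16000)  # 1-D float32 numpy array at 16 kHz

g_path = latest_checkpoint_path("models/training/my-model", "G_*.pth")
d_path = latest_checkpoint_path("models/training/my-model", "D_*.pth")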
/models/checkpoints/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 |
--------------------------------------------------------------------------------
/models/embeddings/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 |
--------------------------------------------------------------------------------
/models/pretrained/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 |
--------------------------------------------------------------------------------
/models/training/.gitignore:
--------------------------------------------------------------------------------
1 | */**
2 |
3 | !mute/**/*
4 | !.gitignore
5 |
6 | mute/**/*.pt
7 |
--------------------------------------------------------------------------------
/models/training/models/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 |
--------------------------------------------------------------------------------
/models/training/mute/0_gt_wavs/mute32k.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/models/training/mute/0_gt_wavs/mute32k.wav
--------------------------------------------------------------------------------
/models/training/mute/0_gt_wavs/mute40k.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/models/training/mute/0_gt_wavs/mute40k.wav
--------------------------------------------------------------------------------
/models/training/mute/0_gt_wavs/mute48k.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/models/training/mute/0_gt_wavs/mute48k.wav
--------------------------------------------------------------------------------
/models/training/mute/1_16k_wavs/mute.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/models/training/mute/1_16k_wavs/mute.wav
--------------------------------------------------------------------------------
/models/training/mute/2a_f0/mute.wav.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/models/training/mute/2a_f0/mute.wav.npy
--------------------------------------------------------------------------------
/models/training/mute/2b_f0nsf/mute.wav.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/models/training/mute/2b_f0nsf/mute.wav.npy
--------------------------------------------------------------------------------
/models/training/mute/3_feature256/mute.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/models/training/mute/3_feature256/mute.npy
--------------------------------------------------------------------------------
/modules/cmd_opts.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | parser = argparse.ArgumentParser()
4 |
5 | parser.add_argument("--host", help="Host to connect to", type=str, default="127.0.0.1")
6 | parser.add_argument("--port", help="Port to connect to", type=int)
7 | parser.add_argument("--share", help="Enable gradio share", action="store_true")
8 | parser.add_argument(
9 | "--models-dir", help="Path to models directory", type=str, default=None
10 | )
11 | parser.add_argument(
12 | "--output-dir", help="Path to output directory", type=str, default=None
13 | )
14 | parser.add_argument(
15 | "--precision",
16 | help="Precision to use",
17 | type=str,
18 | default="fp16",
19 | choices=["fp32", "fp16"],
20 | )
21 |
22 | opts, _ = parser.parse_known_args()
23 |
--------------------------------------------------------------------------------
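These flags are parsed once at import time with `parse_known_args`, so unrecognized arguments are ignored rather than rejected, and other modules read the shared `opts` object. A sketch of how a caller might consume them (the precision check is illustrative):

from modules.cmd_opts import opts

# e.g. launched as: python webui.py --host 0.0.0.0 --port 7860 --precision fp32
print(opts.host, opts.port, opts.share)
is_half = opts.precision == "fp16"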
/modules/core.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import os
3 | import shutil
4 | import sys
5 | from concurrent.futures import ThreadPoolExecutor
6 |
7 | import requests
8 |
9 | from modules.models import MODELS_DIR
10 | from modules.shared import ROOT_DIR
11 | from modules.utils import download_file
12 |
13 |
14 | def get_hf_etag(url: str):
15 | r = requests.head(url)
16 |
17 | etag = r.headers["X-Linked-ETag"] if "X-Linked-ETag" in r.headers else ""
18 |
19 | if etag.startswith('"') and etag.endswith('"'):
20 | etag = etag[1:-1]
21 |
22 | return etag
23 |
24 |
25 | def calc_sha256(filepath: str):
26 | sha256 = hashlib.sha256()
27 | with open(filepath, "rb") as f:
28 | for chunk in iter(lambda: f.read(4096), b""):
29 | sha256.update(chunk)
30 | return sha256.hexdigest()
31 |
32 |
33 | def download_models():
34 | def hash_check(url: str, out: str):
35 | if not os.path.exists(out):
36 | return False
37 | etag = get_hf_etag(url)
38 | hash = calc_sha256(out)
39 | return etag == hash
40 |
41 | os.makedirs(os.path.join(MODELS_DIR, "pretrained", "v2"), exist_ok=True)
42 |
43 | tasks = []
44 | for template in [
45 | "D{}k",
46 | "G{}k",
47 | "f0D{}k",
48 | "f0G{}k",
49 | ]:
50 | basename = template.format("40")
51 | url = f"https://huggingface.co/ddPn08/rvc-webui-models/resolve/main/pretrained/v2/{basename}.pth"
52 | out = os.path.join(MODELS_DIR, "pretrained", "v2", f"{basename}.pth")
53 |
54 | if hash_check(url, out):
55 | continue
56 |
57 | tasks.append((url, out))
58 |
59 | for filename in [
60 | "checkpoint_best_legacy_500.pt",
61 | ]:
62 | out = os.path.join(MODELS_DIR, "embeddings", filename)
63 | url = f"https://huggingface.co/ddPn08/rvc-webui-models/resolve/main/embeddings/{filename}"
64 |
65 | if hash_check(url, out):
66 | continue
67 |
68 | tasks.append(
69 | (
70 | f"https://huggingface.co/ddPn08/rvc-webui-models/resolve/main/embeddings/{filename}",
71 | out,
72 | )
73 | )
74 |
75 | # japanese-hubert-base (Fairseq)
76 | # from official repo
77 | # NOTE: change filename?
78 | hubert_jp_url = "https://huggingface.co/rinna/japanese-hubert-base/resolve/main/fairseq/model.pt"
79 | out = os.path.join(MODELS_DIR, "embeddings", "rinna_hubert_base_jp.pt")
80 | if not hash_check(hubert_jp_url, out):
81 | tasks.append(
82 | (
83 | hubert_jp_url,
84 | out,
85 | )
86 | )
87 |
88 | if len(tasks) < 1:
89 | return
90 |
91 | with ThreadPoolExecutor() as pool:
92 | pool.map(
93 | download_file,
94 | *zip(
95 |                 *[(url, out, i, True) for i, (url, out) in enumerate(tasks)]
96 | ),
97 | )
98 |
99 |
100 | def install_ffmpeg():
101 | if os.path.exists(os.path.join(ROOT_DIR, "bin", "ffmpeg.exe")):
102 | return
103 | tmpdir = os.path.join(ROOT_DIR, "tmp")
104 | url = (
105 | "https://www.gyan.dev/ffmpeg/builds/packages/ffmpeg-5.1.2-essentials_build.zip"
106 | )
107 | out = os.path.join(tmpdir, "ffmpeg.zip")
108 | os.makedirs(os.path.dirname(out), exist_ok=True)
109 | download_file(url, out)
110 | shutil.unpack_archive(out, os.path.join(tmpdir, "ffmpeg"))
111 | shutil.copyfile(
112 | os.path.join(
113 | tmpdir, "ffmpeg", "ffmpeg-5.1.2-essentials_build", "bin", "ffmpeg.exe"
114 | ),
115 | os.path.join(ROOT_DIR, "bin", "ffmpeg.exe"),
116 | )
117 | os.remove(os.path.join(tmpdir, "ffmpeg.zip"))
118 | shutil.rmtree(os.path.join(tmpdir, "ffmpeg"))
119 |
120 |
121 | def update_modelnames():
122 | for sr in ["32k", "40k", "48k"]:
123 | files = [
124 | f"f0G{sr}",
125 | f"f0D{sr}",
126 | f"G{sr}",
127 | f"D{sr}",
128 | ]
129 | for file in files:
130 | filepath = os.path.join(MODELS_DIR, "pretrained", f"{file}.pth")
131 | if os.path.exists(filepath):
132 | os.rename(
133 | filepath,
134 | os.path.join(MODELS_DIR, "pretrained", f"{file}256.pth"),
135 | )
136 |
137 | if not os.path.exists(os.path.join(MODELS_DIR, "embeddings")):
138 | os.makedirs(os.path.join(MODELS_DIR, "embeddings"))
139 |
140 | if os.path.exists(os.path.join(MODELS_DIR, "hubert_base.pt")):
141 | os.rename(
142 | os.path.join(MODELS_DIR, "hubert_base.pt"),
143 | os.path.join(MODELS_DIR, "embeddings", "hubert_base.pt"),
144 | )
145 | if os.path.exists(os.path.join(MODELS_DIR, "checkpoint_best_legacy_500.pt")):
146 | os.rename(
147 | os.path.join(MODELS_DIR, "checkpoint_best_legacy_500.pt"),
148 | os.path.join(MODELS_DIR, "embeddings", "checkpoint_best_legacy_500.pt"),
149 | )
150 |
151 |
152 | def preload():
153 | update_modelnames()
154 | download_models()
155 | if sys.platform == "win32":
156 | install_ffmpeg()
157 |
--------------------------------------------------------------------------------
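
Note: `download_models()` above decides whether to re-download by comparing the Hugging Face `X-Linked-ETag` header (for LFS-backed files this is the SHA-256 of the blob) against the SHA-256 of the local copy. A stand-alone sketch of that same check, assuming the target is an LFS file that exposes this header; the URL and path passed in would be examples, not repo constants:

# Sketch of the ETag-vs-SHA256 check used by hash_check() above.
import hashlib
import os

import requests


def needs_download(url: str, out: str) -> bool:
    if not os.path.exists(out):
        return True
    etag = requests.head(url).headers.get("X-Linked-ETag", "").strip('"')
    sha256 = hashlib.sha256()
    with open(out, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            sha256.update(chunk)
    # A mismatch (or a missing header) means the local file should be refreshed.
    return etag != sha256.hexdigest()
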
/modules/merge.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | from typing import *
3 |
4 | import torch
5 | import tqdm
6 |
7 |
8 | def merge(
9 | path_a: str,
10 | path_b: str,
11 | path_c: str,
12 | alpha: float,
13 | weights: Dict[str, float],
14 | method: str,
15 | ):
16 | def extract(ckpt: Dict[str, Any]):
17 | a = ckpt["model"]
18 | opt = OrderedDict()
19 | opt["weight"] = {}
20 | for key in a.keys():
21 | if "enc_q" in key:
22 | continue
23 | opt["weight"][key] = a[key]
24 | return opt
25 |
26 | def load_weight(path: str):
27 | print(f"Loading {path}...")
28 | state_dict = torch.load(path, map_location="cpu")
29 | if "model" in state_dict:
30 | weight = extract(state_dict)
31 | else:
32 | weight = state_dict["weight"]
33 | return weight, state_dict
34 |
35 | def get_alpha(key: str):
36 | try:
37 | filtered = sorted(
38 | [x for x in weights.keys() if key.startswith(x)], key=len, reverse=True
39 | )
40 | if len(filtered) < 1:
41 | return alpha
42 | return weights[filtered[0]]
43 |         except Exception:
44 | return alpha
45 |
46 | weight_a, state_dict = load_weight(path_a)
47 | weight_b, _ = load_weight(path_b)
48 | if path_c is not None:
49 | weight_c, _ = load_weight(path_c)
50 |
51 | if sorted(list(weight_a.keys())) != sorted(list(weight_b.keys())):
52 |         raise RuntimeError("Failed to merge models: model A and model B have different keys.")
53 |
54 | merged = OrderedDict()
55 | merged["weight"] = {}
56 |
57 | def merge_weight(a, b, c, alpha):
58 | if method == "weight_sum":
59 | return (1 - alpha) * a + alpha * b
60 | elif method == "add_diff":
61 | return a + (b - c) * alpha
62 |
63 | for key in tqdm.tqdm(weight_a.keys()):
64 | a = get_alpha(key)
65 | if path_c is not None:
66 | merged["weight"][key] = merge_weight(
67 | weight_a[key], weight_b[key], weight_c[key], a
68 | )
69 | else:
70 | merged["weight"][key] = merge_weight(weight_a[key], weight_b[key], None, a)
71 | merged["config"] = state_dict["config"]
72 | merged["params"] = state_dict["params"] if "params" in state_dict else None
73 | merged["version"] = state_dict.get("version", "v1")
74 | merged["sr"] = state_dict["sr"]
75 | merged["f0"] = state_dict["f0"]
76 | merged["info"] = state_dict["info"]
77 | merged["embedder_name"] = (
78 | state_dict["embedder_name"] if "embedder_name" in state_dict else None
79 | )
80 | merged["embedder_output_layer"] = state_dict.get("embedder_output_layer", "12")
81 | return merged
82 |
--------------------------------------------------------------------------------
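
Note: `merge()` above applies one of two blend rules per tensor: weight_sum computes (1 - alpha) * A + alpha * B, and add_diff computes A + (B - C) * alpha; any entry in `weights` whose key is a prefix of a tensor name overrides the global alpha for that tensor. A hedged usage sketch with hypothetical checkpoint names:

import torch

from modules.merge import merge

merged = merge(
    path_a="models/checkpoints/voice_a.pth",
    path_b="models/checkpoints/voice_b.pth",
    path_c=None,                    # only needed for method="add_diff"
    alpha=0.3,                      # default blend: 0.7 * A + 0.3 * B
    weights={"dec.": 0.8},          # tensors whose name starts with "dec." use alpha=0.8
    method="weight_sum",
)
torch.save(merged, "models/checkpoints/voice_ab_mix.pth")
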
/modules/models.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | from typing import *
4 |
5 | import torch
6 | from fairseq import checkpoint_utils
7 | from fairseq.models.hubert.hubert import HubertModel
8 | from pydub import AudioSegment
9 |
10 | from lib.rvc.models import (SynthesizerTrnMs256NSFSid,
11 | SynthesizerTrnMs256NSFSidNono)
12 | from lib.rvc.pipeline import VocalConvertPipeline
13 |
14 | from .cmd_opts import opts
15 | from .shared import ROOT_DIR, device, is_half
16 | from .utils import load_audio
17 |
18 | AUDIO_OUT_DIR = opts.output_dir or os.path.join(ROOT_DIR, "outputs")
19 |
20 |
21 | EMBEDDINGS_LIST = {
22 | "hubert-base-japanese": (
23 | "rinna_hubert_base_jp.pt",
24 | "hubert-base-japanese",
25 | "local",
26 | ),
27 | "contentvec": ("checkpoint_best_legacy_500.pt", "contentvec", "local"),
28 | }
29 |
30 |
31 | def update_state_dict(state_dict):
32 | if "params" in state_dict and state_dict["params"] is not None:
33 | return
34 | keys = [
35 | "spec_channels",
36 | "segment_size",
37 | "inter_channels",
38 | "hidden_channels",
39 | "filter_channels",
40 | "n_heads",
41 | "n_layers",
42 | "kernel_size",
43 | "p_dropout",
44 | "resblock",
45 | "resblock_kernel_sizes",
46 | "resblock_dilation_sizes",
47 | "upsample_rates",
48 | "upsample_initial_channel",
49 | "upsample_kernel_sizes",
50 | "spk_embed_dim",
51 | "gin_channels",
52 | "emb_channels",
53 | "sr",
54 | ]
55 | state_dict["params"] = {}
56 | n = 0
57 | for i, key in enumerate(keys):
58 | i = i - n
59 | if len(state_dict["config"]) != 19 and key == "emb_channels":
60 | # backward compat.
61 | n += 1
62 | continue
63 | state_dict["params"][key] = state_dict["config"][i]
64 |
65 | if not "emb_channels" in state_dict["params"]:
66 | if state_dict.get("version", "v1") == "v1":
67 | state_dict["params"]["emb_channels"] = 256 # for backward compat.
68 | state_dict["embedder_output_layer"] = 9
69 | else:
70 | state_dict["params"]["emb_channels"] = 768 # for backward compat.
71 | state_dict["embedder_output_layer"] = 12
72 |
73 |
74 | class VoiceConvertModel:
75 | def __init__(self, model_name: str, state_dict: Dict[str, Any]) -> None:
76 | update_state_dict(state_dict)
77 | self.model_name = model_name
78 | self.state_dict = state_dict
79 | self.tgt_sr = state_dict["params"]["sr"]
80 | f0 = state_dict.get("f0", 1)
81 | state_dict["params"]["spk_embed_dim"] = state_dict["weight"][
82 | "emb_g.weight"
83 | ].shape[0]
84 | if not "emb_channels" in state_dict["params"]:
85 | state_dict["params"]["emb_channels"] = 256 # for backward compat.
86 |
87 | if f0 == 1:
88 | self.net_g = SynthesizerTrnMs256NSFSid(
89 | **state_dict["params"], is_half=is_half
90 | )
91 | else:
92 | self.net_g = SynthesizerTrnMs256NSFSidNono(**state_dict["params"])
93 |
94 | del self.net_g.enc_q
95 |
96 | self.net_g.load_state_dict(state_dict["weight"], strict=False)
97 | self.net_g.eval().to(device)
98 |
99 | if is_half:
100 | self.net_g = self.net_g.half()
101 | else:
102 | self.net_g = self.net_g.float()
103 |
104 | self.vc = VocalConvertPipeline(self.tgt_sr, device, is_half)
105 | self.n_spk = state_dict["params"]["spk_embed_dim"]
106 |
107 | def single(
108 | self,
109 | sid: int,
110 | input_audio: str,
111 | embedder_model_name: str,
112 | embedding_output_layer: str,
113 | f0_up_key: int,
114 | f0_file: str,
115 | f0_method: str,
116 | auto_load_index: bool,
117 | faiss_index_file: str,
118 | index_rate: float,
119 | output_dir: str = AUDIO_OUT_DIR,
120 | ):
121 | if not input_audio:
122 | raise Exception("You need to set Source Audio")
123 | f0_up_key = int(f0_up_key)
124 | audio = load_audio(input_audio, 16000)
125 |
126 | if embedder_model_name == "auto":
127 | embedder_model_name = (
128 | self.state_dict["embedder_name"]
129 | if "embedder_name" in self.state_dict
130 | else "hubert_base"
131 | )
132 | if embedder_model_name.endswith("768"):
133 | embedder_model_name = embedder_model_name[:-3]
134 |
135 | if embedder_model_name == "hubert_base":
136 | embedder_model_name = "contentvec"
137 |
138 | if not embedder_model_name in EMBEDDINGS_LIST.keys():
139 |             raise Exception(f"Unsupported embedder: {embedder_model_name}")
140 |
141 | if (
142 |             embedder_model is None
143 | or loaded_embedder_model != EMBEDDINGS_LIST[embedder_model_name][1]
144 | ):
145 | print(f"load {embedder_model_name} embedder")
146 | embedder_filename, embedder_name, load_from = get_embedder(
147 | embedder_model_name
148 | )
149 | load_embedder(embedder_filename, embedder_name)
150 |
151 | if embedding_output_layer == "auto":
152 | embedding_output_layer = (
153 | self.state_dict["embedding_output_layer"]
154 | if "embedding_output_layer" in self.state_dict
155 | else 12
156 | )
157 | else:
158 | embedding_output_layer = int(embedding_output_layer)
159 |
160 | f0 = self.state_dict.get("f0", 1)
161 |
162 | if not faiss_index_file and auto_load_index:
163 | faiss_index_file = self.get_index_path(sid)
164 |
165 | audio_opt = self.vc(
166 | embedder_model,
167 | embedding_output_layer,
168 | self.net_g,
169 | sid,
170 | audio,
171 | f0_up_key,
172 | f0_method,
173 | faiss_index_file,
174 | index_rate,
175 | f0,
176 | f0_file=f0_file,
177 | )
178 |
179 | audio = AudioSegment(
180 | audio_opt,
181 | frame_rate=self.tgt_sr,
182 | sample_width=2,
183 | channels=1,
184 | )
185 | os.makedirs(output_dir, exist_ok=True)
186 | input_audio_splitext = os.path.splitext(os.path.basename(input_audio))[0]
187 | model_splitext = os.path.splitext(self.model_name)[0]
188 | index = 0
189 | existing_files = os.listdir(output_dir)
190 | for existing_file in existing_files:
191 | result = re.match(r"\d+", existing_file)
192 | if result:
193 | prefix_num = int(result.group(0))
194 | if index < prefix_num:
195 | index = prefix_num
196 | audio.export(
197 | os.path.join(
198 | output_dir, f"{index+1}-{model_splitext}-{input_audio_splitext}.wav"
199 | ),
200 | format="wav",
201 | )
202 | return audio_opt
203 |
204 | def get_index_path(self, speaker_id: int):
205 | basename = os.path.splitext(self.model_name)[0]
206 | speaker_index_path = os.path.join(
207 | MODELS_DIR,
208 | "checkpoints",
209 | f"{basename}_index",
210 | f"{basename}.{speaker_id}.index",
211 | )
212 | if os.path.exists(speaker_index_path):
213 | return speaker_index_path
214 | return os.path.join(MODELS_DIR, "checkpoints", f"{basename}.index")
215 |
216 |
217 | MODELS_DIR = opts.models_dir or os.path.join(ROOT_DIR, "models")
218 | vc_model: Optional[VoiceConvertModel] = None
219 | embedder_model: Optional[HubertModel] = None
220 | loaded_embedder_model = ""
221 |
222 |
223 | def get_models():
224 | dir = os.path.join(ROOT_DIR, "models", "checkpoints")
225 | os.makedirs(dir, exist_ok=True)
226 | return [
227 | file
228 | for file in os.listdir(dir)
229 | if any([x for x in [".ckpt", ".pth"] if file.endswith(x)])
230 | ]
231 |
232 |
233 | def get_embedder(embedder_name):
234 | if embedder_name in EMBEDDINGS_LIST:
235 | return EMBEDDINGS_LIST[embedder_name]
236 | return None
237 |
238 |
239 | def load_embedder(emb_file: str, emb_name: str):
240 | global embedder_model, loaded_embedder_model
241 | emb_file = os.path.join(MODELS_DIR, "embeddings", emb_file)
242 | models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
243 | [emb_file],
244 | suffix="",
245 | )
246 | embedder_model = models[0]
247 | embedder_model = embedder_model.to(device)
248 |
249 | if is_half:
250 | embedder_model = embedder_model.half()
251 | else:
252 | embedder_model = embedder_model.float()
253 | embedder_model.eval()
254 |
255 | loaded_embedder_model = emb_name
256 |
257 |
258 | def get_vc_model(model_name: str):
259 | model_path = os.path.join(MODELS_DIR, "checkpoints", model_name)
260 | weight = torch.load(model_path, map_location="cpu")
261 | return VoiceConvertModel(model_name, weight)
262 |
263 |
264 | def load_model(model_name: str):
265 | global vc_model
266 | vc_model = get_vc_model(model_name)
267 |
--------------------------------------------------------------------------------
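
Note: a sketch of driving `VoiceConvertModel` above programmatically, outside the Gradio UI. It assumes the pretrained/embedder files have already been fetched (for example by `modules.core.preload()`); the checkpoint and audio paths shown are hypothetical:

from modules import models

models.load_model("my_voice.pth")       # loads models/checkpoints/my_voice.pth by default
model = models.vc_model

audio = model.single(
    sid=0,                              # speaker id
    input_audio="samples/input.wav",
    embedder_model_name="auto",         # fall back to the embedder stored in the checkpoint
    embedding_output_layer="auto",
    f0_up_key=0,                        # transpose in semitones
    f0_file=None,
    f0_method="crepe",                  # "dio", "harvest", "mangio-crepe" or "crepe"
    auto_load_index=False,
    faiss_index_file="",
    index_rate=1.0,
)
# single() also writes a numbered .wav into outputs/ (or the --output-dir folder).
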
/modules/separate.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import *
3 |
4 | import tqdm
5 | from pydub import AudioSegment
6 | from pydub.silence import split_on_silence
7 |
8 |
9 | def separate_audio(
10 | input: str,
11 | output: str,
12 | silence_thresh: int,
13 | min_silence_len: int = 1000,
14 | keep_silence: int = 100,
15 | margin: int = 0,
16 | padding: bool = False,
17 | min: Optional[int] = None,
18 | max: Optional[int] = None,
19 | ):
20 | if os.path.isfile(input):
21 | input = [input]
22 | elif os.path.isdir(input):
23 | input = [os.path.join(input, f) for f in os.listdir(input)]
24 | else:
25 | raise ValueError("input must be a file or directory")
26 |
27 | os.makedirs(output, exist_ok=True)
28 |
29 | for file in input:
30 | if os.path.splitext(file)[1] == ".mp3":
31 | audio = AudioSegment.from_mp3(file)
32 | elif os.path.splitext(file)[1] == ".wav":
33 | audio = AudioSegment.from_wav(file)
34 | elif os.path.splitext(file)[1] == ".flac":
35 | audio = AudioSegment.from_file(file, "flac")
36 | else:
37 | raise ValueError(
38 |                 "Invalid file format. Only MP3, WAV, and FLAC files are supported."
39 | )
40 |
41 | chunks = split_on_silence(
42 | audio,
43 | min_silence_len=min_silence_len,
44 | silence_thresh=silence_thresh,
45 | keep_silence=keep_silence,
46 | )
47 |
48 | output_chunks: List[AudioSegment] = []
49 |
50 | so_short = None
51 |
52 | for chunk in tqdm.tqdm(chunks):
53 | if so_short is not None:
54 | chunk = so_short + chunk
55 | so_short = None
56 | if min is None or len(chunk) > min:
57 | if max is not None and len(chunk) > max:
58 | sub_chunks = [
59 | chunk[i : i + max + margin]
60 | for i in range(0, len(chunk) - margin, max)
61 | ]
62 |
63 |                     if min is not None and len(sub_chunks[-1]) < min:
64 | if padding and len(sub_chunks) > 2:
65 | output_chunks.extend(sub_chunks[0:-2])
66 | output_chunks.append(sub_chunks[-2] + sub_chunks[-1])
67 | else:
68 | output_chunks.extend(sub_chunks[0:-1])
69 | else:
70 | output_chunks.extend(sub_chunks)
71 | else:
72 | output_chunks.append(chunk)
73 | else:
74 | if so_short is None:
75 | so_short = chunk
76 | else:
77 | so_short += chunk
78 | basename = os.path.splitext(os.path.basename(file))[0]
79 |
80 | for i, chunk in enumerate(output_chunks):
81 | filepath = os.path.join(output, f"{basename}_{i}.wav")
82 | chunk.export(filepath, format="wav")
83 |
--------------------------------------------------------------------------------
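
Note: `separate_audio()` above splits on silence with pydub, merges chunks shorter than `min` into the following chunk, and slices chunks longer than `max` into sub-chunks (with optional `margin`/`padding` handling of the tail). All durations are milliseconds. A hedged call sketch with example paths:

from modules.separate import separate_audio

separate_audio(
    input="raw/interview.wav",      # a file, or a directory of mp3/wav/flac files
    output="dataset/split",
    silence_thresh=-40,             # dBFS threshold passed to pydub's split_on_silence
    min_silence_len=750,
    keep_silence=750,
    margin=0,
    padding=True,
    min=1000,                       # chunks shorter than this are merged with the next one
    max=5000,                       # chunks longer than this are sliced into sub-chunks
)
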
/modules/server/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | from typing import *
4 |
5 | import faiss
6 | import numpy as np
7 | import pyworld
8 | import scipy.signal as signal
9 | import torch
10 | import torch.nn.functional as F
11 | import torchaudio
12 | import torchcrepe
13 | from fairseq import checkpoint_utils
14 | from fairseq.models.hubert.hubert import HubertModel
15 | from pydub import AudioSegment
16 | from torch import Tensor
17 |
18 | from lib.rvc.models import (SynthesizerTrnMs256NSFSid,
19 | SynthesizerTrnMs256NSFSidNono)
20 | from lib.rvc.pipeline import VocalConvertPipeline
21 | from modules.cmd_opts import opts
22 | from modules.models import (EMBEDDINGS_LIST, MODELS_DIR, get_embedder,
23 | get_vc_model, update_state_dict)
24 | from modules.shared import ROOT_DIR, device, is_half
25 |
26 | MODELS_DIR = opts.models_dir or os.path.join(ROOT_DIR, "models")
27 | vc_model: Optional["VoiceServerModel"] = None
28 | embedder_model: Optional[HubertModel] = None
29 | loaded_embedder_model = ""
30 |
31 |
32 | class VoiceServerModel:
33 | def __init__(self, rvc_model_file: str, faiss_index_file: str) -> None:
34 | # setting vram
35 | global device, is_half
36 | if isinstance(device, str):
37 | device = torch.device(device)
38 | if device.type == "cuda":
39 | vram = torch.cuda.get_device_properties(device).total_memory / 1024**3
40 | else:
41 | vram = None
42 | if vram is not None and vram <= 4:
43 | self.x_pad = 1
44 | self.x_query = 5
45 | self.x_center = 30
46 | self.x_max = 32
47 | elif vram is not None and vram <= 5:
48 | self.x_pad = 1
49 | self.x_query = 6
50 | self.x_center = 38
51 | self.x_max = 41
52 | else:
53 | self.x_pad = 3
54 | self.x_query = 10
55 | self.x_center = 60
56 | self.x_max = 65
57 |
58 | # load_model
59 | state_dict = torch.load(rvc_model_file, map_location="cpu")
60 | update_state_dict(state_dict)
61 | self.state_dict = state_dict
62 | self.tgt_sr = state_dict["params"]["sr"]
63 | self.f0 = state_dict.get("f0", 1)
64 | state_dict["params"]["spk_embed_dim"] = state_dict["weight"][
65 | "emb_g.weight"
66 | ].shape[0]
67 | if not "emb_channels" in state_dict["params"]:
68 | if state_dict.get("version", "v1") == "v1":
69 | state_dict["params"]["emb_channels"] = 256 # for backward compat.
70 | state_dict["embedder_output_layer"] = 9
71 | else:
72 | state_dict["params"]["emb_channels"] = 768 # for backward compat.
73 | state_dict["embedder_output_layer"] = 12
74 | if self.f0 == 1:
75 | self.net_g = SynthesizerTrnMs256NSFSid(
76 | **state_dict["params"], is_half=is_half
77 | )
78 | else:
79 | self.net_g = SynthesizerTrnMs256NSFSidNono(**state_dict["params"])
80 | del self.net_g.enc_q
81 | self.net_g.load_state_dict(state_dict["weight"], strict=False)
82 | self.net_g.eval().to(device)
83 | if is_half:
84 | self.net_g = self.net_g.half()
85 | else:
86 | self.net_g = self.net_g.float()
87 |
88 | emb_name = state_dict.get("embedder_name", "contentvec")
89 | if emb_name == "hubert_base":
90 | emb_name = "contentvec"
91 | emb_file = os.path.join(MODELS_DIR, "embeddings", EMBEDDINGS_LIST[emb_name][0])
92 | models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
93 | [emb_file],
94 | suffix="",
95 | )
96 | embedder_model = models[0]
97 | embedder_model = embedder_model.to(device)
98 |
99 | if is_half:
100 | embedder_model = embedder_model.half()
101 | else:
102 | embedder_model = embedder_model.float()
103 | embedder_model.eval()
104 | self.embedder_model = embedder_model
105 |
106 | self.embedder_output_layer = state_dict["embedder_output_layer"]
107 |
108 | self.index = None
109 | if faiss_index_file != "" and os.path.exists(faiss_index_file):
110 | self.index = faiss.read_index(faiss_index_file)
111 | self.big_npy = self.index.reconstruct_n(0, self.index.ntotal)
112 |
113 | self.n_spk = state_dict["params"]["spk_embed_dim"]
114 |
115 | self.sr = 16000 # hubert input sample rate
116 | self.window = 160 # hubert input window
117 | self.t_pad = self.sr * self.x_pad # padding time for each utterance
118 | self.t_pad_tgt = self.tgt_sr * self.x_pad
119 | self.t_pad2 = self.t_pad * 2
120 | self.t_query = self.sr * self.x_query # query time before and after query point
121 | self.t_center = self.sr * self.x_center # query cut point position
122 | self.t_max = self.sr * self.x_max # max time for no query
123 | self.device = device
124 | self.is_half = is_half
125 |
126 | def __call__(
127 | self,
128 | audio: np.ndarray,
129 | sr: int,
130 | sid: int,
131 | transpose: int,
132 | f0_method: str,
133 | index_rate: float,
134 | ):
135 | # bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
136 | # audio = signal.filtfilt(bh, ah, audio)
137 | if sr != self.sr:
138 | audio = torchaudio.functional.resample(torch.from_numpy(audio), sr, self.sr, rolloff=0.99).detach().cpu().numpy()
139 | audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect" if audio.shape[0] > self.window // 2 else "constant")
140 |
141 | opt_ts = []
142 | if audio_pad.shape[0] > self.t_max:
143 | audio_sum = np.zeros_like(audio)
144 | for i in range(self.window):
145 | audio_sum += audio_pad[i : i - self.window]
146 | for t in range(self.t_center, audio.shape[0], self.t_center):
147 | opt_ts.append(
148 | t
149 | - self.t_query
150 | + np.where(
151 | np.abs(audio_sum[t - self.t_query : t + self.t_query])
152 | == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
153 | )[0][0]
154 | )
155 | audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect" if audio.shape[0] > self.t_pad else "constant")
156 | p_len = audio_pad.shape[0] // self.window
157 |
158 | sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
159 | pitch, pitchf = None, None
160 | if self.f0 == 1:
161 | pitch, pitchf = get_f0(audio_pad, self.sr, p_len, transpose, f0_method)
162 | pitch = pitch[:p_len]
163 | pitchf = pitchf[:p_len]
164 | if self.device.type == "mps":
165 | pitchf = pitchf.astype(np.float32)
166 | pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
167 | pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
168 |
169 | audio_opt = []
170 |
171 | s = 0
172 | t = None
173 |
174 | for t in opt_ts:
175 | t = t // self.window * self.window
176 | if self.f0 == 1:
177 | audio_opt.append(
178 | self._convert(
179 | sid,
180 | audio_pad[s : t + self.t_pad2 + self.window],
181 | pitch[:, s // self.window : (t + self.t_pad2) // self.window],
182 | pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
183 | index_rate,
184 | )[self.t_pad_tgt : -self.t_pad_tgt]
185 | )
186 | else:
187 | audio_opt.append(
188 | self._convert(
189 | sid,
190 | audio_pad[s : t + self.t_pad2 + self.window],
191 | None,
192 | None,
193 | index_rate,
194 | )[self.t_pad_tgt : -self.t_pad_tgt]
195 | )
196 | s = t
197 | if self.f0 == 1:
198 | audio_opt.append(
199 | self._convert(
200 | sid,
201 | audio_pad[t:],
202 | pitch[:, t // self.window :] if t is not None else pitch,
203 | pitchf[:, t // self.window :] if t is not None else pitchf,
204 | index_rate,
205 | )[self.t_pad_tgt : -self.t_pad_tgt]
206 | )
207 | else:
208 | audio_opt.append(
209 | self._convert(
210 | sid,
211 | audio_pad[t:],
212 | None,
213 | None,
214 | index_rate,
215 | )[self.t_pad_tgt : -self.t_pad_tgt]
216 | )
217 | audio_opt = np.concatenate(audio_opt)
218 | del pitch, pitchf, sid
219 | if torch.cuda.is_available():
220 | torch.cuda.empty_cache()
221 | return audio_opt
222 |
223 |
224 | def _convert(
225 | self,
226 | sid: int,
227 | audio: np.ndarray,
228 | pitch: Optional[np.ndarray],
229 | pitchf: Optional[np.ndarray],
230 | index_rate: float,
231 | ):
232 | feats = torch.from_numpy(audio)
233 | if self.is_half:
234 | feats = feats.half()
235 | else:
236 | feats = feats.float()
237 | if feats.dim() == 2: # double channels
238 | feats = feats.mean(-1)
239 | assert feats.dim() == 1, feats.dim()
240 | feats = feats.view(1, -1)
241 | padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
242 |
243 | half_support = (
244 | self.device.type == "cuda"
245 | and torch.cuda.get_device_capability(self.device)[0] >= 5.3
246 | )
247 | is_feats_dim_768 = self.net_g.emb_channels == 768
248 |
249 | if isinstance(self.embedder_model, tuple):
250 | feats = self.embedder_model[0](
251 | feats.squeeze(0).squeeze(0).to(self.device),
252 | return_tensors="pt",
253 | sampling_rate=16000,
254 | )
255 | if self.is_half:
256 | feats = feats.input_values.to(self.device).half()
257 | else:
258 | feats = feats.input_values.to(self.device)
259 | with torch.no_grad():
260 | if is_feats_dim_768:
261 | feats = self.embedder_model[1](feats).last_hidden_state
262 | else:
263 | feats = self.embedder_model[1](feats).extract_features
264 | else:
265 | inputs = {
266 | "source": feats.half().to(self.device)
267 | if half_support
268 | else feats.to(self.device),
269 | "padding_mask": padding_mask.to(self.device),
270 | "output_layer": self.embedder_output_layer,
271 | }
272 |
273 | if not half_support:
274 | self.embedder_model = self.embedder_model.float()
275 | inputs["source"] = inputs["source"].float()
276 |
277 | with torch.no_grad():
278 | logits = self.embedder_model.extract_features(**inputs)
279 | if is_feats_dim_768:
280 | feats = logits[0]
281 | else:
282 | feats = self.embedder_model.final_proj(logits[0])
283 |
284 |         if (
285 |             self.index is not None
286 |             and self.big_npy is not None
287 |             and index_rate != 0
288 |         ):
289 | npy = feats[0].cpu().numpy()
290 | if self.is_half:
291 | npy = npy.astype("float32")
292 |
293 | _, ix = self.index.search(npy, k=1)
294 | npy = self.big_npy[ix[:, 0]]
295 |
296 | if self.is_half:
297 | npy = npy.astype("float16")
298 | feats = (
299 | torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
300 | + (1 - index_rate) * feats
301 | )
302 |
303 | feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
304 |
305 | p_len = audio.shape[0] // self.window
306 | if feats.shape[1] < p_len:
307 | p_len = feats.shape[1]
308 |         if pitch is not None and pitchf is not None:
309 |             pitch = pitch[:, :p_len]
310 |             pitchf = pitchf[:, :p_len]
311 |         p_len = torch.tensor([p_len], device=self.device).long()
312 |         with torch.no_grad():
313 |             if pitch is not None and pitchf is not None:
314 | audio1 = (
315 | (self.net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768)
316 | .data.cpu()
317 | .float()
318 | .numpy()
319 | .astype(np.int16)
320 | )
321 | else:
322 | audio1 = (
323 | (self.net_g.infer(feats, p_len, sid)[0][0, 0] * 32768)
324 | .data.cpu()
325 | .float()
326 | .numpy()
327 | .astype(np.int16)
328 | )
329 | del feats, p_len, padding_mask
330 | if torch.cuda.is_available():
331 | torch.cuda.empty_cache()
332 | return audio1
333 |
334 |
335 | # F0 computation
336 | def get_f0_crepe_computation(
337 | x,
338 | sr,
339 | f0_min,
340 | f0_max,
341 | p_len,
342 | model="full", # Either use crepe-tiny "tiny" or crepe "full". Default is full
343 | ):
344 | hop_length = sr // 100
345 | x = x.astype(np.float32) # fixes the F.conv2D exception. We needed to convert double to float.
346 | x /= np.quantile(np.abs(x), 0.999)
347 |     torch_device = device  # module-level device from modules.shared; this is not a method, so there is no self
348 | audio = torch.from_numpy(x).to(torch_device, copy=True)
349 | audio = torch.unsqueeze(audio, dim=0)
350 | if audio.ndim == 2 and audio.shape[0] > 1:
351 | audio = torch.mean(audio, dim=0, keepdim=True).detach()
352 | audio = audio.detach()
353 | print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
354 | pitch: Tensor = torchcrepe.predict(
355 | audio,
356 | sr,
357 | sr // 100,
358 | f0_min,
359 | f0_max,
360 | model,
361 | batch_size=hop_length * 2,
362 | device=torch_device,
363 | pad=True
364 | )
365 | p_len = p_len or x.shape[0] // hop_length
366 | # Resize the pitch for final f0
367 | source = np.array(pitch.squeeze(0).cpu().float().numpy())
368 | source[source < 0.001] = np.nan
369 | target = np.interp(
370 | np.arange(0, len(source) * p_len, len(source)) / p_len,
371 | np.arange(0, len(source)),
372 | source
373 | )
374 | f0 = np.nan_to_num(target)
375 | return f0 # Resized f0
376 |
377 | def get_f0_official_crepe_computation(
378 | x,
379 | sr,
380 | f0_min,
381 | f0_max,
382 | model="full",
383 | ):
384 | # Pick a batch size that doesn't cause memory errors on your gpu
385 | batch_size = 512
386 | # Compute pitch using first gpu
387 | audio = torch.tensor(np.copy(x))[None].float()
388 | f0, pd = torchcrepe.predict(
389 | audio,
390 | sr,
391 | sr // 100,
392 | f0_min,
393 | f0_max,
394 | model,
395 | batch_size=batch_size,
396 | device=device,
397 | return_periodicity=True,
398 | )
399 | pd = torchcrepe.filter.median(pd, 3)
400 | f0 = torchcrepe.filter.mean(f0, 3)
401 | f0[pd < 0.1] = 0
402 | f0 = f0[0].cpu().numpy()
403 | return f0
404 |
405 | def get_f0(
406 | x: np.ndarray,
407 | sr: int,
408 | p_len: int,
409 | f0_up_key: int,
410 | f0_method: str,
411 | ):
412 | f0_min = 50
413 | f0_max = 1100
414 | f0_mel_min = 1127 * np.log(1 + f0_min / 700)
415 | f0_mel_max = 1127 * np.log(1 + f0_max / 700)
416 |
417 | if f0_method == "harvest":
418 | f0, t = pyworld.harvest(
419 | x.astype(np.double),
420 | fs=sr,
421 | f0_ceil=f0_max,
422 | f0_floor=f0_min,
423 | frame_period=10,
424 | )
425 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, sr)
426 | f0 = signal.medfilt(f0, 3)
427 | elif f0_method == "dio":
428 | f0, t = pyworld.dio(
429 | x.astype(np.double),
430 | fs=sr,
431 | f0_ceil=f0_max,
432 | f0_floor=f0_min,
433 | frame_period=10,
434 | )
435 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, sr)
436 | f0 = signal.medfilt(f0, 3)
437 | elif f0_method == "mangio-crepe":
438 | f0 = get_f0_crepe_computation(x, sr, f0_min, f0_max, p_len, "full")
439 | elif f0_method == "crepe":
440 | f0 = get_f0_official_crepe_computation(x, sr, f0_min, f0_max, "full")
441 |
442 | f0 *= pow(2, f0_up_key / 12)
443 | f0bak = f0.copy()
444 | f0_mel = 1127 * np.log(1 + f0 / 700)
445 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
446 | f0_mel_max - f0_mel_min
447 | ) + 1
448 | f0_mel[f0_mel <= 1] = 1
449 | f0_mel[f0_mel > 255] = 255
450 | f0_coarse = np.rint(f0_mel).astype(np.int32)
451 | return f0_coarse, f0bak # 1-0
--------------------------------------------------------------------------------
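
Note: `VoiceServerModel` above bundles the RVC checkpoint, the embedder and an optional faiss index for the Flask server, but it can also be called directly. A hedged sketch, assuming the embedder referenced by the checkpoint is already present under models/embeddings and using hypothetical file paths; the input is expected to be a mono float array, and the output is int16 audio at `tgt_sr`:

import soundfile as sf

from modules.server.model import VoiceServerModel

model = VoiceServerModel(
    rvc_model_file="models/checkpoints/my_voice.pth",
    faiss_index_file="",                                    # optional; "" disables retrieval
)

audio, sr = sf.read("samples/input.wav", dtype="float32")   # expected to be mono
out = model(audio, sr, sid=0, transpose=0, f0_method="crepe", index_rate=0.0)
sf.write("outputs/converted.wav", out, model.tgt_sr)        # out is int16 at tgt_sr
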
/modules/shared.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | import torch
5 |
6 | from modules.cmd_opts import opts
7 |
8 | ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
9 | MODELS_DIR = os.path.join(ROOT_DIR, "models")
10 |
11 |
12 | def has_mps():
13 | if sys.platform != "darwin":
14 | return False
15 | else:
16 | if not getattr(torch, "has_mps", False):
17 | return False
18 | try:
19 | torch.zeros(1).to(torch.device("mps"))
20 | return True
21 | except Exception:
22 | return False
23 |
24 |
25 | is_half = opts.precision == "fp16"
26 | half_support = (
27 | torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 5.3
28 | )
29 |
30 | if not half_support:
31 | print("WARNING: FP16 is not supported on this GPU")
32 | is_half = False
33 |
34 | device = "cuda:0"
35 |
36 | if not torch.cuda.is_available():
37 | if has_mps():
38 | print("Using MPS")
39 | device = "mps"
40 | else:
41 | print("Using CPU")
42 | device = "cpu"
43 |
44 | device = torch.device(device)
45 |
--------------------------------------------------------------------------------
/modules/tabs/inference.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import traceback
4 |
5 | import gradio as gr
6 |
7 | from modules import models, ui
8 | from modules.ui import Tab
9 |
10 |
11 | def inference_options_ui(show_out_dir=True):
12 | with gr.Row(equal_height=False):
13 | with gr.Column():
14 | source_audio = gr.Textbox(label="Source Audio")
15 | out_dir = gr.Textbox(
16 | label="Out folder",
17 | visible=show_out_dir,
18 | placeholder=models.AUDIO_OUT_DIR,
19 | )
20 | with gr.Column():
21 | transpose = gr.Slider(
22 | minimum=-20, maximum=20, value=0, step=1, label="Transpose"
23 | )
24 | pitch_extraction_algo = gr.Radio(
25 | choices=["dio", "harvest", "mangio-crepe", "crepe"],
26 | value="crepe",
27 | label="Pitch Extraction Algorithm",
28 | )
29 | embedding_model = gr.Radio(
30 | choices=["auto", *models.EMBEDDINGS_LIST.keys()],
31 | value="auto",
32 | label="Embedder Model",
33 | )
34 | embedding_output_layer = gr.Radio(
35 | choices=["auto", "9", "12"],
36 | value="auto",
37 | label="Embedder Output Layer",
38 | )
39 | with gr.Column():
40 | auto_load_index = gr.Checkbox(value=False, label="Auto Load Index")
41 | faiss_index_file = gr.Textbox(value="", label="Faiss Index File Path")
42 | retrieval_feature_ratio = gr.Slider(
43 | minimum=0,
44 | maximum=1,
45 | value=1,
46 | step=0.01,
47 | label="Retrieval Feature Ratio",
48 | )
49 | with gr.Column():
50 | fo_curve_file = gr.File(label="F0 Curve File")
51 |
52 | return (
53 | source_audio,
54 | out_dir,
55 | transpose,
56 | embedding_model,
57 | embedding_output_layer,
58 | pitch_extraction_algo,
59 | auto_load_index,
60 | faiss_index_file,
61 | retrieval_feature_ratio,
62 | fo_curve_file,
63 | )
64 |
65 |
66 | class Inference(Tab):
67 | def title(self):
68 | return "Inference"
69 |
70 | def sort(self):
71 | return 1
72 |
73 | def ui(self, outlet):
74 | def infer(
75 | sid,
76 | input_audio,
77 | out_dir,
78 | embedder_model,
79 | embedding_output_layer,
80 | f0_up_key,
81 | f0_file,
82 | f0_method,
83 | auto_load_index,
84 | faiss_index_file,
85 | index_rate,
86 | ):
87 | model = models.vc_model
88 | try:
89 |                 yield "Inferring...", None
90 | if out_dir == "":
91 | out_dir = models.AUDIO_OUT_DIR
92 |
93 | if "*" in input_audio:
94 | assert (
95 | out_dir is not None
96 | ), "Out folder is required for batch processing"
97 | files = glob.glob(input_audio, recursive=True)
98 | elif os.path.isdir(input_audio):
99 | assert (
100 | out_dir is not None
101 | ), "Out folder is required for batch processing"
102 | files = glob.glob(
103 | os.path.join(input_audio, "**", "*.wav"), recursive=True
104 | )
105 | else:
106 | files = [input_audio]
107 | for file in files:
108 | audio = model.single(
109 | sid,
110 | file,
111 | embedder_model,
112 | embedding_output_layer,
113 | f0_up_key,
114 | f0_file,
115 | f0_method,
116 | auto_load_index,
117 | faiss_index_file,
118 | index_rate,
119 | output_dir=out_dir,
120 | )
121 | yield "Success", (model.tgt_sr, audio) if len(files) == 1 else None
122 | except:
123 | yield "Error: " + traceback.format_exc(), None
124 |
125 | with gr.Group():
126 | with gr.Box():
127 | with gr.Column():
128 | _, speaker_id = ui.create_model_list_ui()
129 |
130 | (
131 | source_audio,
132 | out_dir,
133 | transpose,
134 | embedder_model,
135 | embedding_output_layer,
136 | pitch_extraction_algo,
137 | auto_load_index,
138 | faiss_index_file,
139 | retrieval_feature_ratio,
140 | f0_curve_file,
141 | ) = inference_options_ui()
142 |
143 | with gr.Row(equal_height=False):
144 | with gr.Column():
145 | status = gr.Textbox(value="", label="Status")
146 | output = gr.Audio(label="Output", interactive=False)
147 |
148 | with gr.Row():
149 | infer_button = gr.Button("Infer", variant="primary")
150 |
151 | infer_button.click(
152 | infer,
153 | inputs=[
154 | speaker_id,
155 | source_audio,
156 | out_dir,
157 | embedder_model,
158 | embedding_output_layer,
159 | transpose,
160 | f0_curve_file,
161 | pitch_extraction_algo,
162 | auto_load_index,
163 | faiss_index_file,
164 | retrieval_feature_ratio,
165 | ],
166 | outputs=[status, output],
167 | queue=True,
168 | )
169 |
--------------------------------------------------------------------------------
/modules/tabs/merge.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | from typing import *
4 |
5 | import gradio as gr
6 | import torch
7 |
8 | from modules import models
9 | from modules.merge import merge
10 | from modules.tabs.inference import inference_options_ui
11 | from modules.ui import Tab
12 |
13 | MERGE_METHODS = {
14 | "weight_sum": "Weight sum:A*(1-alpha)+B*alpha",
15 | "add_diff": "Add difference:A+(B-C)*alpha",
16 | }
17 |
18 |
19 | class Merge(Tab):
20 | def title(self):
21 | return "Merge"
22 |
23 | def sort(self):
24 | return 3
25 |
26 | def ui(self, outlet):
27 | def merge_ckpt(model_a, model_b, model_c, weight_text, alpha, each_key, method):
28 | model_a = model_a if type(model_a) != list and model_a != "" else None
29 | model_b = model_b if type(model_b) != list and model_b != "" else None
30 | model_c = model_c if type(model_c) != list and model_c != "" else None
31 |
32 | if each_key:
33 | weights = json.loads(weight_text)
34 | else:
35 | weights = {}
36 |
37 | method = [k for k, v in MERGE_METHODS.items() if v == method][0]
38 | return merge(
39 | os.path.join(models.MODELS_DIR, "checkpoints", model_a),
40 | os.path.join(models.MODELS_DIR, "checkpoints", model_b),
41 | os.path.join(models.MODELS_DIR, "checkpoints", model_c)
42 | if model_c
43 | else None,
44 | alpha,
45 | weights,
46 | method,
47 | )
48 |
49 | def merge_and_save(
50 | model_a, model_b, model_c, alpha, each_key, weight_text, method, out_name
51 | ):
52 | print(each_key)
53 | out_path = os.path.join(models.MODELS_DIR, "checkpoints", out_name)
54 | if os.path.exists(out_path):
55 | return "Model name already exists."
56 | merged = merge_ckpt(
57 | model_a, model_b, model_c, weight_text, alpha, each_key, method
58 | )
59 | if not out_name.endswith(".pth"):
60 | out_name += ".pth"
61 | torch.save(merged, os.path.join(models.MODELS_DIR, "checkpoints", out_name))
62 | return "Success"
63 |
64 | def merge_and_gen(
65 | model_a,
66 | model_b,
67 | model_c,
68 | alpha,
69 | each_key,
70 | weight_text,
71 | method,
72 | speaker_id,
73 | source_audio,
74 | embedder_name,
75 | embedding_output_layer,
76 | transpose,
77 | fo_curve_file,
78 | pitch_extraction_algo,
79 | auto_load_index,
80 | faiss_index_file,
81 | retrieval_feature_ratio,
82 | ):
83 | merged = merge_ckpt(
84 | model_a, model_b, model_c, weight_text, alpha, each_key, method
85 | )
86 | model = models.VoiceConvertModel("merge", merged)
87 | audio = model.single(
88 | speaker_id,
89 | source_audio,
90 | embedder_name,
91 | embedding_output_layer,
92 | transpose,
93 | fo_curve_file,
94 | pitch_extraction_algo,
95 | auto_load_index,
96 | faiss_index_file,
97 | retrieval_feature_ratio,
98 | )
99 | tgt_sr = model.tgt_sr
100 | del merged
101 | del model
102 | torch.cuda.empty_cache()
103 | return "Success", (tgt_sr, audio)
104 |
105 | def reload_model():
106 | model_list = models.get_models()
107 | return (
108 | gr.Dropdown.update(choices=model_list),
109 | gr.Dropdown.update(choices=model_list),
110 | gr.Dropdown.update(choices=model_list),
111 | )
112 |
113 | def update_speaker_ids(model):
114 | if model == "":
115 | return gr.Slider.update(
116 | maximum=0,
117 | visible=False,
118 | )
119 | model = torch.load(
120 | os.path.join(models.MODELS_DIR, "checkpoints", model),
121 | map_location="cpu",
122 | )
123 | vc_model = models.VoiceConvertModel("merge", model)
124 | max = vc_model.n_spk
125 | del model
126 | del vc_model
127 | return gr.Slider.update(
128 | maximum=max,
129 | visible=True,
130 | )
131 |
132 | with gr.Group():
133 | with gr.Column():
134 | with gr.Row(equal_height=False):
135 | model_a = gr.Dropdown(choices=models.get_models(), label="Model A")
136 | model_b = gr.Dropdown(choices=models.get_models(), label="Model B")
137 | model_c = gr.Dropdown(choices=models.get_models(), label="Model C")
138 | reload_model_button = gr.Button("♻️")
139 | reload_model_button.click(
140 | reload_model, outputs=[model_a, model_b, model_c]
141 | )
142 | with gr.Row(equal_height=False):
143 | method = gr.Radio(
144 | label="Merge method",
145 | choices=list(MERGE_METHODS.values()),
146 | value="Weight sum:A*(1-alpha)+B*alpha",
147 | )
148 | output_name = gr.Textbox(label="Output name")
149 | each_key = gr.Checkbox(label="Each key merge")
150 | with gr.Row(equal_height=False):
151 | base_alpha = gr.Slider(
152 | label="Base alpha", minimum=0, maximum=1, value=0.5, step=0.01
153 | )
154 |
155 | default_weights = {}
156 | weights = {}
157 |
158 | def create_weight_ui(name: str, *keys_list: List[List[str]]):
159 | with gr.Accordion(label=name, open=False):
160 | with gr.Row(equal_height=False):
161 | for keys in keys_list:
162 | with gr.Column():
163 | for key in keys:
164 | default_weights[key] = 0.5
165 | weights[key] = gr.Slider(
166 | label=key,
167 | minimum=0,
168 | maximum=1,
169 | step=0.01,
170 | value=0.5,
171 | )
172 |
173 | with gr.Box(visible=False) as each_key_ui:
174 | with gr.Column():
175 | create_weight_ui(
176 | "enc_p",
177 | [
178 | "enc_p.encoder.attn_layers.0",
179 | "enc_p.encoder.attn_layers.1",
180 | "enc_p.encoder.attn_layers.2",
181 | "enc_p.encoder.attn_layers.3",
182 | "enc_p.encoder.attn_layers.4",
183 | "enc_p.encoder.attn_layers.5",
184 | "enc_p.encoder.norm_layers_1.0",
185 | "enc_p.encoder.norm_layers_1.1",
186 | "enc_p.encoder.norm_layers_1.2",
187 | "enc_p.encoder.norm_layers_1.3",
188 | "enc_p.encoder.norm_layers_1.4",
189 | "enc_p.encoder.norm_layers_1.5",
190 | ],
191 | [
192 | "enc_p.encoder.ffn_layers.0",
193 | "enc_p.encoder.ffn_layers.1",
194 | "enc_p.encoder.ffn_layers.2",
195 | "enc_p.encoder.ffn_layers.3",
196 | "enc_p.encoder.ffn_layers.4",
197 | "enc_p.encoder.ffn_layers.5",
198 | "enc_p.encoder.norm_layers_2.0",
199 | "enc_p.encoder.norm_layers_2.1",
200 | "enc_p.encoder.norm_layers_2.2",
201 | "enc_p.encoder.norm_layers_2.3",
202 | "enc_p.encoder.norm_layers_2.4",
203 | "enc_p.encoder.norm_layers_2.5",
204 | ],
205 | [
206 | "enc_p.emb_phone",
207 | "enc_p.emb_pitch",
208 | ],
209 | )
210 |
211 | create_weight_ui(
212 | "dec",
213 | [
214 | "dec.noise_convs.0",
215 | "dec.noise_convs.1",
216 | "dec.noise_convs.2",
217 | "dec.noise_convs.3",
218 | "dec.noise_convs.4",
219 | "dec.noise_convs.5",
220 | "dec.ups.0",
221 | "dec.ups.1",
222 | "dec.ups.2",
223 | "dec.ups.3",
224 | ],
225 | [
226 | "dec.resblocks.0",
227 | "dec.resblocks.1",
228 | "dec.resblocks.2",
229 | "dec.resblocks.3",
230 | "dec.resblocks.4",
231 | "dec.resblocks.5",
232 | "dec.resblocks.6",
233 | "dec.resblocks.7",
234 | "dec.resblocks.8",
235 | "dec.resblocks.9",
236 | "dec.resblocks.10",
237 | "dec.resblocks.11",
238 | ],
239 | [
240 | "dec.m_source.l_linear",
241 | "dec.conv_pre",
242 | "dec.conv_post",
243 | "dec.cond",
244 | ],
245 | )
246 |
247 | create_weight_ui(
248 | "flow",
249 | [
250 | "flow.flows.0",
251 | "flow.flows.1",
252 | "flow.flows.2",
253 | "flow.flows.3",
254 | "flow.flows.4",
255 | "flow.flows.5",
256 | "flow.flows.6",
257 | "emb_g.weight",
258 | ],
259 | )
260 |
261 | with gr.Accordion(label="JSON", open=False):
262 | weights_text = gr.TextArea(
263 | value=json.dumps(default_weights),
264 | )
265 |
266 | with gr.Accordion(label="Inference options", open=False):
267 | with gr.Row(equal_height=False):
268 | speaker_id = gr.Slider(
269 | minimum=0,
270 | maximum=2333,
271 | step=1,
272 | label="Speaker ID",
273 | value=0,
274 | visible=True,
275 | interactive=True,
276 | )
277 | (
278 | source_audio,
279 | _,
280 | transpose,
281 | embedder_name,
282 | embedding_output_layer,
283 | pitch_extraction_algo,
284 | auto_load_index,
285 | faiss_index_file,
286 | retrieval_feature_ratio,
287 | fo_curve_file,
288 | ) = inference_options_ui(show_out_dir=False)
289 |
290 | with gr.Row(equal_height=False):
291 | with gr.Column():
292 | status = gr.Textbox(value="", label="Status")
293 | audio_output = gr.Audio(label="Output", interactive=False)
294 |
295 | with gr.Row(equal_height=False):
296 | merge_and_save_button = gr.Button(
297 | "Merge and save", variant="primary"
298 | )
299 | merge_and_gen_button = gr.Button("Merge and gen", variant="primary")
300 |
301 | def each_key_on_change(each_key):
302 | return gr.update(visible=each_key)
303 |
304 | each_key.change(
305 | fn=each_key_on_change,
306 | inputs=[each_key],
307 | outputs=[each_key_ui],
308 | )
309 |
310 | def update_weights_text(data):
311 | d = {}
312 | for key in weights.keys():
313 | d[key] = data[weights[key]]
314 | return json.dumps(d)
315 |
316 | for w in weights.values():
317 | w.change(
318 | fn=update_weights_text,
319 | inputs={*weights.values()},
320 | outputs=[weights_text],
321 | )
322 |
323 | merge_data = [
324 | model_a,
325 | model_b,
326 | model_c,
327 | base_alpha,
328 | each_key,
329 | weights_text,
330 | method,
331 | ]
332 |
333 | inference_opts = [
334 | speaker_id,
335 | source_audio,
336 | embedder_name,
337 | embedding_output_layer,
338 | transpose,
339 | fo_curve_file,
340 | pitch_extraction_algo,
341 | auto_load_index,
342 | faiss_index_file,
343 | retrieval_feature_ratio,
344 | ]
345 |
346 | merge_and_save_button.click(
347 | fn=merge_and_save,
348 | inputs=[
349 | *merge_data,
350 | output_name,
351 | ],
352 | outputs=[status],
353 | )
354 | merge_and_gen_button.click(
355 | fn=merge_and_gen,
356 | inputs=[
357 | *merge_data,
358 | *inference_opts,
359 | ],
360 | outputs=[status, audio_output],
361 | )
362 |
363 | model_a.change(
364 | update_speaker_ids, inputs=[model_a], outputs=[speaker_id]
365 | )
366 |
--------------------------------------------------------------------------------
/modules/tabs/server.py:
--------------------------------------------------------------------------------
1 | import io
2 | import json
3 |
4 | import gradio as gr
5 | import requests
6 | import soundfile as sf
7 | import torch.multiprocessing as multiprocessing
8 | from scipy.io.wavfile import write
9 |
10 | from modules.ui import Tab
11 | from server import app
12 |
13 | proc = None
14 |
15 | def server_options_ui(show_out_dir=True):
16 | with gr.Row().style(equal_height=False):
17 | with gr.Row():
18 | host = gr.Textbox(value="127.0.0.1", label="host")
19 | port = gr.Textbox(value="5001", label="port")
20 | with gr.Row().style(equal_height=False):
21 | with gr.Row():
22 | rvc_model_file = gr.Textbox(value="", label="RVC model file path")
23 | faiss_index_file = gr.Textbox(value="", label="Faiss index file path")
24 | with gr.Row().style(equal_height=False):
25 | with gr.Row():
26 | input_voice_file = gr.Textbox(value="", label="input voice file path")
27 | speaker_id = gr.Number(
28 | value=0,
29 | label="speaker_id",
30 | )
31 | transpose = gr.Slider(
32 | minimum=-20, maximum=20, value=0, step=1, label="transpose"
33 | )
34 | pitch_extraction_algo = gr.Radio(
35 | choices=["dio", "harvest", "mangio-crepe", "crepe"],
36 | value="crepe",
37 | label="pitch_extraction_algo",
38 | )
39 | retrieval_feature_ratio = gr.Slider(
40 | minimum=0,
41 | maximum=1,
42 | value=1,
43 | step=0.01,
44 | label="retrieval_feature_ratio",
45 | )
46 | return (
47 | host,
48 | port,
49 | rvc_model_file,
50 | faiss_index_file,
51 | input_voice_file,
52 | speaker_id,
53 | transpose,
54 | pitch_extraction_algo,
55 | retrieval_feature_ratio,
56 | )
57 |
58 | def run(**kwargs):
59 | app.run(**kwargs)
60 |
61 | class Server(Tab):
62 | def title(self):
63 | return "Server(experimental)"
64 |
65 | def sort(self):
66 | return 6
67 |
68 | def ui(self, outlet):
69 | def start(host, port):
70 | if multiprocessing.get_start_method() == 'fork':
71 | multiprocessing.set_start_method('spawn', force=True)
72 | proc = multiprocessing.Process(target = run, kwargs = {'host': host, 'port': port})
73 | proc.start()
74 | yield "start server"
75 |
76 | def upload(host, port, rvc_model_file, faiss_index_file):
77 | file_names = {"rvc_model_file": rvc_model_file, "faiss_index_file": faiss_index_file}
78 | res = requests.post(f"http://{host}:{port}/upload_model", json=file_names)
79 | yield res.text
80 |
81 | def convert(host, port, input_voice_file, speaker_id, transpose, pitch_extraction_algo, retrieval_feature_ratio):
82 | params = {
83 | "speaker_id": speaker_id,
84 | "transpose": transpose,
85 | "pitch_extraction_algo": pitch_extraction_algo,
86 | "retrieval_feature_ratio": retrieval_feature_ratio
87 | }
88 |
89 | audio, sr = sf.read(input_voice_file)
90 | audio_buffer = io.BytesIO()
91 |             write(audio_buffer, rate=sr, data=audio); audio_buffer.seek(0)  # rewind so the upload contains the WAV
92 | json_buffer = io.BytesIO(json.dumps(params).encode('utf-8'))
93 | files = {
94 | "input_wav": audio_buffer,
95 | "params": json_buffer
96 | }
97 | res = requests.post(f"http://{host}:{port}/convert_sound", files=files)
98 | audio, sr = sf.read(io.BytesIO(res.content))
99 |             yield "Conversion succeeded", (sr, audio)
100 |
101 | with gr.Group():
102 | with gr.Box():
103 | with gr.Column():
104 | (
105 | host,
106 | port,
107 | rvc_model_file,
108 | faiss_index_file,
109 | input_voice_file,
110 | speaker_id,
111 | transpose,
112 | pitch_extraction_algo,
113 | retrieval_feature_ratio,
114 | ) = server_options_ui()
115 |
116 | with gr.Row().style(equal_height=False):
117 | with gr.Column():
118 | status = gr.Textbox(value="", label="Status")
119 | output = gr.Audio(label="Output", interactive=False)
120 |
121 | with gr.Row():
122 | start_button = gr.Button("Start server", variant="primary")
123 | upload_button = gr.Button("Upload Model")
124 | convert_button = gr.Button("Convert Voice")
125 |
126 | start_button.click(
127 | start,
128 | inputs=[
129 | host,
130 | port
131 | ],
132 | outputs=[status],
133 | queue=True,
134 | )
135 | upload_button.click(
136 | upload,
137 | inputs=[
138 | host,
139 | port,
140 | rvc_model_file,
141 | faiss_index_file
142 | ],
143 | outputs=[status],
144 | queue=True,
145 | )
146 | convert_button.click(
147 | convert,
148 | inputs=[
149 | host,
150 | port,
151 | input_voice_file,
152 | speaker_id,
153 | transpose,
154 | pitch_extraction_algo,
155 | retrieval_feature_ratio
156 | ],
157 | outputs=[status, output],
158 | queue=True,
159 | )
160 |
--------------------------------------------------------------------------------
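
Note: the tab above doubles as a reference client for the experimental Flask server (modules/server): `/upload_model` takes a JSON body with the checkpoint and index paths, and `/convert_sound` takes multipart form data with the WAV bytes plus a JSON `params` part. A hedged stand-alone sketch of the same two calls, with example host, port and paths:

import io
import json

import requests
import soundfile as sf

base = "http://127.0.0.1:5001"

# 1. Tell the server which checkpoint / faiss index to load.
requests.post(f"{base}/upload_model", json={
    "rvc_model_file": "models/checkpoints/my_voice.pth",
    "faiss_index_file": "",
})

# 2. Send the audio and the conversion parameters as multipart form data.
audio, sr = sf.read("samples/input.wav")
wav_buf = io.BytesIO()
sf.write(wav_buf, audio, sr, format="WAV")
wav_buf.seek(0)
params = {
    "speaker_id": 0,
    "transpose": 0,
    "pitch_extraction_algo": "crepe",
    "retrieval_feature_ratio": 1.0,
}
res = requests.post(f"{base}/convert_sound", files={
    "input_wav": wav_buf,
    "params": io.BytesIO(json.dumps(params).encode("utf-8")),
})
converted, out_sr = sf.read(io.BytesIO(res.content))
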
/modules/tabs/split.py:
--------------------------------------------------------------------------------
1 | import gradio as gr
2 |
3 | from modules.separate import separate_audio
4 | from modules.ui import Tab
5 |
6 |
7 | class Split(Tab):
8 | def title(self):
9 | return "Split Audio"
10 |
11 | def sort(self):
12 | return 5
13 |
14 | def ui(self, outlet):
15 | def separate(
16 | input_audio,
17 | output_dir,
18 | silence_thresh,
19 | min_silence_len,
20 | keep_silence,
21 | margin,
22 | padding,
23 | min,
24 | max,
25 | ):
26 | min = None if min == 0 else min
27 | max = None if max == 0 else max
28 | separate_audio(
29 | input_audio,
30 | output_dir,
31 | int(silence_thresh),
32 | int(min_silence_len),
33 | int(keep_silence),
34 | int(margin),
35 | padding,
36 |                 int(min) if min is not None else None,
37 |                 int(max) if max is not None else None,
38 | )
39 | return "Success"
40 |
41 | with gr.Group():
42 | with gr.Column():
43 | with gr.Row(equal_height=False):
44 | input_audio = gr.Textbox(label="Input Audio (File or Directory)")
45 | output_dir = gr.Textbox(label="Output Directory")
46 |
47 | with gr.Row(equal_height=False):
48 | silence_thresh = gr.Number(value=-40, label="Silence Threshold")
49 | min_silence_len = gr.Number(
50 | value=750, label="Minimum Silence Length"
51 | )
52 | keep_silence = gr.Number(value=750, label="Keep Silence")
53 | margin = gr.Number(value=0, label="Margin")
54 | padding = gr.Checkbox(value=True, label="Padding")
55 |
56 | with gr.Row(equal_height=False):
57 | min = gr.Number(value=1000, label="Minimum audio length")
58 | max = gr.Number(value=5000, label="Maximum audio length")
59 |
60 | with gr.Row(equal_height=False):
61 | status = gr.Textbox(value="", label="Status")
62 | with gr.Row(equal_height=False):
63 | separate_button = gr.Button("Separate", variant="primary")
64 |
65 | separate_button.click(
66 | separate,
67 | inputs=[
68 | input_audio,
69 | output_dir,
70 | silence_thresh,
71 | min_silence_len,
72 | keep_silence,
73 | margin,
74 | padding,
75 | min,
76 | max,
77 | ],
78 | outputs=[status],
79 | )
80 |
--------------------------------------------------------------------------------
/modules/ui.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import os
3 | from typing import *
4 |
5 | import gradio as gr
6 | import gradio.routes
7 | import torch
8 |
9 | from . import models, shared
10 | from .core import preload
11 | from .shared import ROOT_DIR
12 |
13 |
14 | class Tab:
15 | TABS_DIR = os.path.join(ROOT_DIR, "modules", "tabs")
16 |
17 | def __init__(self, filepath: str) -> None:
18 | self.filepath = filepath
19 |
20 | def sort(self):
21 | return 1
22 |
23 | def title(self):
24 | return ""
25 |
26 | def ui(self, outlet: Callable):
27 | pass
28 |
29 | def __call__(self):
30 | children_dir = self.filepath[:-3]
31 | children = []
32 |
33 | if os.path.isdir(children_dir):
34 | for file in os.listdir(children_dir):
35 | if not file.endswith(".py"):
36 | continue
37 | module_name = file[:-3]
38 |                 parent = os.path.relpath(children_dir, Tab.TABS_DIR).replace(os.sep, ".")
39 |
40 | if parent.startswith("."):
41 | parent = parent[1:]
42 | if parent.endswith("."):
43 | parent = parent[:-1]
44 |
45 | children.append(
46 | importlib.import_module(f"modules.tabs.{parent}.{module_name}")
47 | )
48 |
49 | children = sorted(children, key=lambda x: x.sort())
50 |
51 | tabs = []
52 |
53 | for child in children:
54 | attrs = child.__dict__
55 |             tab = [x for x in attrs.values() if type(x) == type and issubclass(x, Tab) and x is not Tab]
56 | if len(tab) > 0:
57 | tabs.append(tab[0])
58 |
59 | def outlet():
60 | with gr.Tabs():
61 | for tab in tabs:
62 | with gr.Tab(tab.title()):
63 | tab()
64 |
65 | return self.ui(outlet)
66 |
67 |
68 | def load_tabs() -> List[Tab]:
69 | tabs = []
70 | files = os.listdir(os.path.join(ROOT_DIR, "modules", "tabs"))
71 |
72 | for file in files:
73 | if not file.endswith(".py"):
74 | continue
75 | module_name = file[:-3]
76 | module = importlib.import_module(f"modules.tabs.{module_name}")
77 | attrs = module.__dict__
78 | TabClass = [
79 | x
80 | for x in attrs.values()
81 | if type(x) == type and issubclass(x, Tab) and not x == Tab
82 | ]
83 | if len(TabClass) > 0:
84 | tabs.append((file, TabClass[0]))
85 |
86 | tabs = sorted([TabClass(file) for file, TabClass in tabs], key=lambda x: x.sort())
87 | return tabs
88 |
89 |
90 | def webpath(fn):
91 | if fn.startswith(ROOT_DIR):
92 | web_path = os.path.relpath(fn, ROOT_DIR).replace("\\", "/")
93 | else:
94 | web_path = os.path.abspath(fn)
95 |
96 | return f"file={web_path}?{os.path.getmtime(fn)}"
97 |
98 |
99 | def javascript_html():
100 | script_js = os.path.join(ROOT_DIR, "script.js")
101 |     head = f'<script type="text/javascript" src="{webpath(script_js)}"></script>\n'
102 |
103 | return head
104 |
105 |
106 | def css_html():
107 |     return f'<link rel="stylesheet" href="{webpath(os.path.join(ROOT_DIR, "styles.css"))}">'
108 |
109 |
110 | def create_head():
111 | head = ""
112 | head += css_html()
113 | head += javascript_html()
114 |
115 | def template_response(*args, **kwargs):
116 | res = shared.gradio_template_response_original(*args, **kwargs)
117 |         res.body = res.body.replace(b"</head>", f"{head}</head>".encode("utf8"))
118 | res.init_headers()
119 | return res
120 |
121 | gradio.routes.templates.TemplateResponse = template_response
122 |
123 |
124 | def create_ui():
125 | preload()
126 | block = gr.Blocks()
127 |
128 | with block:
129 | with gr.Tabs():
130 | tabs = load_tabs()
131 | for tab in tabs:
132 | with gr.Tab(tab.title()):
133 | tab()
134 |
135 | create_head()
136 |
137 | return block
138 |
139 |
140 | def create_model_list_ui(speaker_id: bool = True, load: bool = True):
141 | speaker_id_info = {
142 | "visible": False,
143 | "maximum": 10000,
144 | }
145 |
146 | def reload_model(raw=False):
147 | model_list = models.get_models()
148 | if len(model_list) > 0:
149 | models.load_model(model_list[0])
150 |
151 | if models.vc_model is not None:
152 | speaker_id_info["visible"] = True
153 | speaker_id_info["maximum"] = models.vc_model.n_spk
154 |
155 | return model_list if raw else gr.Dropdown.update(choices=model_list)
156 |
157 | model_list = reload_model(raw=True)
158 |
159 | def load_model(model_name):
160 | if load:
161 | models.load_model(model_name)
162 | speaker_id_info["visible"] = True
163 | speaker_id_info["maximum"] = models.vc_model.n_spk
164 | else:
165 | model = models.get_vc_model(model_name)
166 | speaker_id_info["visible"] = True
167 | speaker_id_info["maximum"] = model.n_spk
168 | del model
169 | torch.cuda.empty_cache()
170 | return gr.Slider.update(
171 | maximum=speaker_id_info["maximum"], visible=speaker_id_info["visible"]
172 | )
173 |
174 | with gr.Row(equal_height=False):
175 | model = gr.Dropdown(
176 | choices=model_list,
177 | label="Model",
178 | value=model_list[0] if len(model_list) > 0 else None,
179 | )
180 | speaker_id = gr.Slider(
181 | minimum=0,
182 | maximum=speaker_id_info["maximum"],
183 | step=1,
184 | label="Speaker ID",
185 | value=0,
186 | visible=speaker_id and speaker_id_info["visible"],
187 | interactive=True,
188 | )
189 | reload_model_button = gr.Button("♻️")
190 |
191 | model.change(load_model, inputs=[model], outputs=[speaker_id])
192 | reload_model_button.click(reload_model, outputs=[model])
193 |
194 | return model, speaker_id
195 |
196 |
197 | if not hasattr(shared, "gradio_template_response_original"):
198 | shared.gradio_template_response_original = gradio.routes.templates.TemplateResponse
199 |
--------------------------------------------------------------------------------
/modules/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import *
3 |
4 | import ffmpeg
5 | import numpy as np
6 | import requests
7 | import torch
8 | from tqdm import tqdm
9 |
10 | from lib.rvc.config import TrainConfig
11 | from modules.shared import ROOT_DIR
12 |
13 |
14 | def load_audio(file: str, sr):
15 | try:
16 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
17 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
18 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
19 | file = (
20 | file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
21 |         )  # Strip stray spaces, double quotes, and newlines that are often copied along with the path
22 | out, _ = (
23 | ffmpeg.input(file, threads=0)
24 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
25 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
26 | )
27 | except Exception as e:
28 | raise RuntimeError(f"Failed to load audio: {e}")
29 |
30 | return np.frombuffer(out, np.float32).flatten()
31 |
32 |
33 | def get_gpus():
34 | num_gpus = torch.cuda.device_count()
35 | return [torch.device(f"cuda:{i}") for i in range(num_gpus)]
36 |
37 |
38 | def download_file(url: str, out: str, position: int = 0, show: bool = True):
39 | req = requests.get(url, stream=True, allow_redirects=True)
40 | content_length = req.headers.get("content-length")
41 | if show:
42 | progress_bar = tqdm(
43 | total=int(content_length) if content_length is not None else None,
44 | leave=False,
45 | unit="B",
46 | unit_scale=True,
47 | unit_divisor=1024,
48 | position=position,
49 | )
50 |
51 |     # Stream the response to disk in 1 KiB chunks, updating the progress bar if shown
52 | with open(out, "wb") as f:
53 | for chunk in req.iter_content(chunk_size=1024):
54 | if chunk:
55 | if show:
56 | progress_bar.update(len(chunk))
57 | f.write(chunk)
58 |
59 |
60 | def load_config(
61 | version: Literal["v1", "v2"],
62 | training_dir: str,
63 | sample_rate: str,
64 | emb_channels: int,
65 | fp16: bool,
66 | ):
67 | if emb_channels == 256:
68 | config_path = os.path.join(ROOT_DIR, "configs", f"{sample_rate}.json")
69 | else:
70 | config_path = os.path.join(
71 | ROOT_DIR, "configs", f"{sample_rate}-{emb_channels}.json"
72 | )
73 |
74 | config = TrainConfig.parse_file(config_path)
75 | config.version = version
76 | config.train.fp16_run = fp16
77 |
78 | config_save_path = os.path.join(training_dir, "config.json")
79 |
80 | with open(config_save_path, "w") as f:
81 | f.write(config.json())
82 |
83 | return config
84 |
--------------------------------------------------------------------------------
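
For orientation, here is a short usage sketch of the helpers in modules/utils.py above; the paths and URL are placeholders rather than values used anywhere in the project.

    # Illustrative only: the URL and file paths below are placeholders.
    from modules.utils import download_file, get_gpus, load_audio, load_config

    # Decode any ffmpeg-readable file to a mono float32 array at the requested rate.
    audio = load_audio("samples/input.wav", 16000)
    print(audio.shape, audio.dtype)  # (n_samples,), float32

    # Enumerate the CUDA devices training can be distributed across.
    print(get_gpus())  # e.g. [device(type='cuda', index=0)]

    # Stream a file to disk with a tqdm progress bar.
    download_file("https://example.com/pretrained.pth", "models/pretrained/example.pth")

    # Pick configs/40k-768.json for a 40 kHz, 768-channel-embedding run and write a
    # copy to <training_dir>/config.json (the directory must already exist).
    config = load_config("v2", "models/training/example-run", "40k", 768, fp16=True)
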
/outputs/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | -r requirements/main.txt
--------------------------------------------------------------------------------
/requirements/dev.txt:
--------------------------------------------------------------------------------
1 | # -r main.txt
2 |
3 | black
4 | isort
--------------------------------------------------------------------------------
/requirements/main.txt:
--------------------------------------------------------------------------------
1 | gradio==3.36.1
2 | tqdm==4.65.0
3 | numpy==1.23.5
4 | faiss-cpu==1.7.3
5 | fairseq==0.12.2
6 | matplotlib==3.7.1
7 | scipy==1.9.3
8 | librosa==0.9.1
9 | pyworld==0.3.2
10 | soundfile==0.12.1
11 | ffmpeg-python==0.2.0
12 | pydub==0.25.1
13 | soxr==0.3.5
14 | transformers==4.28.1
15 | torchcrepe==0.0.20
16 | Flask==2.3.2
17 |
18 | tensorboard
19 | tensorboardX
20 | requests
--------------------------------------------------------------------------------
/script.js:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/script.js
--------------------------------------------------------------------------------
/server.py:
--------------------------------------------------------------------------------
1 | import io
2 | import json
3 | import os
4 | import traceback
5 | from typing import *
6 |
7 | import soundfile as sf
8 | from flask import Flask, make_response, request, send_file
9 | from scipy.io.wavfile import write
10 |
11 | from modules.server.model import VoiceServerModel
12 |
13 | model: Optional[VoiceServerModel] = None
14 | app = Flask(__name__)
15 |
16 | @app.route('/ping')
17 | def ping():
18 | return make_response("server is alive", 200)
19 |
20 | @app.route('/upload_model', methods=['POST'])
21 | def upload_model():
22 | """
23 | input:
24 | json:
25 | rvc_model_file: str
26 |             specify the RVC model's absolute path (.pt, .pth)
27 |         faiss_index_file: Optional[str]
28 |             specify the faiss index's absolute path (.index)
29 | """
30 | global model
31 | if request.method == "POST":
32 | rvc_model_file = request.json["rvc_model_file"]
33 |         faiss_index_file = request.json.get("faiss_index_file", "")
34 | try:
35 | model = VoiceServerModel(rvc_model_file, faiss_index_file)
36 |             return make_response("model loaded", 200)
37 |         except Exception:
38 | traceback.print_exc()
39 | return make_response("model load error", 400)
40 | else:
41 | return make_response("use post method", 400)
42 |
43 | @app.route('/convert_sound', methods=['POST'])
44 | def convert_sound():
45 | """
46 | input:
47 | params: json
48 | speaker_id: int
49 | default: 0
50 | transpose: int
51 | default: 0
52 | pitch_extraction_algo: str
53 | default: dio
54 | value: ["dio", "harvest", "mangio-crepe", "crepe"]
55 | retrieval_feature_ratio: float
56 | default: 0
57 | value: 0. ~ 1.
58 | input_wav: wav file
59 |
60 | output:
61 | wavfile
62 | """
63 | global model
64 | if model is None:
65 | return make_response("please upload model", 400)
66 | print("start")
67 | if request.method == "POST":
68 | input_buffer = io.BytesIO(request.files["input_wav"].stream.read())
69 | audio, sr = sf.read(input_buffer)
70 |
71 | req_json = json.load(io.BytesIO(request.files["params"].stream.read()))
72 | sid = int(req_json.get("speaker_id", 0))
73 | transpose = int(req_json.get("transpose", 0))
74 | pitch_extraction_algo = req_json.get("pitch_extraction_algo", "dio")
75 |         if pitch_extraction_algo not in ["dio", "harvest", "mangio-crepe", "crepe"]:
76 | return make_response("bad pitch extraction algo", 400)
77 | retrieval_feature_ratio = float(req_json.get("retrieval_feature_ratio", 0.))
78 |
79 | out_audio = model(audio, sr, sid, transpose, pitch_extraction_algo, retrieval_feature_ratio)
80 | output_buffer = io.BytesIO()
81 | write(output_buffer, rate=model.tgt_sr, data=out_audio)
82 | output_buffer.seek(0)
83 | response = make_response(send_file(output_buffer, mimetype="audio/wav"), 200)
84 | return response
85 | else:
86 | return make_response("use post method", 400)
87 |
88 | if __name__ == "__main__":
89 | app.run()
--------------------------------------------------------------------------------
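
The Flask API in server.py above takes JSON on /upload_model and two multipart file fields, input_wav and params, on /convert_sound. A hedged client sketch using requests follows; the model path, input file, and port are placeholders (Flask's development server defaults to http://127.0.0.1:5000).

    # Illustrative client for server.py; all paths are placeholders.
    import io
    import json

    import requests

    BASE = "http://127.0.0.1:5000"

    # 1. Load a model on the server (faiss_index_file may be omitted).
    r = requests.post(
        f"{BASE}/upload_model",
        json={"rvc_model_file": "/absolute/path/to/model.pth"},
    )
    print(r.status_code, r.text)

    # 2. Convert a wav file. Both parts are sent as multipart file fields,
    #    mirroring request.files["input_wav"] / request.files["params"] above.
    params = {
        "speaker_id": 0,
        "transpose": 0,
        "pitch_extraction_algo": "dio",
        "retrieval_feature_ratio": 0.0,
    }
    with open("input.wav", "rb") as wav:
        r = requests.post(
            f"{BASE}/convert_sound",
            files={
                "input_wav": ("input.wav", wav, "audio/wav"),
                "params": ("params.json", io.BytesIO(json.dumps(params).encode()), "application/json"),
            },
        )
    with open("output.wav", "wb") as out:
        out.write(r.content)  # the response body is the converted wav
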
/styles.css:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddPn08/rvc-webui/b71742809a24cd89eb18081b831c0b1ac11ccb2a/styles.css
--------------------------------------------------------------------------------
/update.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 |
3 | if exist ".git" (
4 | git fetch --prune
5 | git reset --hard origin/main
6 | ) else (
7 | git init
8 | git remote add origin https://github.com/ddPn08/rvc-webui.git
9 | git fetch --prune
10 | git reset --hard origin/main
11 | )
12 |
13 | pause
--------------------------------------------------------------------------------
/update.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | if [ -d .git ]; then
3 | git fetch --prune
4 | git reset --hard origin/main
5 | else
6 | git init
7 |     git remote add origin https://github.com/ddPn08/rvc-webui.git
8 |     git fetch --prune
9 |     git reset --hard origin/main
10 | fi
--------------------------------------------------------------------------------
/webui-macos-env.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | ####################################################################
3 | # macOS defaults #
4 | # Please modify webui-user.sh to change these instead of this file #
5 | ####################################################################
6 |
7 | if [[ -x "$(command -v python3.10)" ]]
8 | then
9 | python_cmd="python3.10"
10 | fi
11 |
12 | export COMMANDLINE_ARGS=""
13 | export TORCH_COMMAND="pip install torch torchvision torchaudio"
14 | export PYTORCH_ENABLE_MPS_FALLBACK=1
15 |
16 | ####################################################################
--------------------------------------------------------------------------------
/webui-user.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 |
3 | set PYTHON=
4 | set GIT=
5 | set VENV_DIR=
6 | set COMMANDLINE_ARGS=
7 |
8 | call webui.bat
--------------------------------------------------------------------------------
/webui-user.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #########################################################
3 | # Uncomment and change the variables below to your need:#
4 | #########################################################
5 |
6 | # Commandline arguments for webui.py (see modules/cmd_opts.py for the available flags)
7 | #export COMMANDLINE_ARGS=""
8 |
9 | # python3 executable
10 | #python_cmd="python3"
11 |
12 | # git executable
13 | #export GIT="git"
14 |
15 | # python3 venv without trailing slash (defaults to ${install_dir}/${clone_dir}/venv)
16 | #venv_dir="venv"
17 |
18 | # script to launch to start the app
19 | #export LAUNCH_SCRIPT="launch.py"
20 |
21 | # install command for torch
22 | #export TORCH_COMMAND="pip install torch --extra-index-url https://download.pytorch.org/whl/cu118"
23 |
24 | # Requirements file to use for the web UI
25 | #export REQS_FILE="requirements_versions.txt"
26 |
27 | ###########################################
--------------------------------------------------------------------------------
/webui.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 |
3 | if not defined PYTHON (set PYTHON=python)
4 | if not defined VENV_DIR (set "VENV_DIR=%~dp0%venv")
5 |
6 |
7 | set ERROR_REPORTING=FALSE
8 |
9 | mkdir tmp 2>NUL
10 |
11 | %PYTHON% -c "" >tmp/stdout.txt 2>tmp/stderr.txt
12 | if %ERRORLEVEL% == 0 goto :check_pip
13 | echo Couldn't launch python
14 | goto :show_stdout_stderr
15 |
16 | :check_pip
17 | %PYTHON% -mpip --help >tmp/stdout.txt 2>tmp/stderr.txt
18 | if %ERRORLEVEL% == 0 goto :start_venv
19 | if "%PIP_INSTALLER_LOCATION%" == "" goto :show_stdout_stderr
20 | %PYTHON% "%PIP_INSTALLER_LOCATION%" >tmp/stdout.txt 2>tmp/stderr.txt
21 | if %ERRORLEVEL% == 0 goto :start_venv
22 | echo Couldn't install pip
23 | goto :show_stdout_stderr
24 |
25 | :start_venv
26 | if ["%VENV_DIR%"] == ["-"] goto :launch
27 | if ["%SKIP_VENV%"] == ["1"] goto :launch
28 |
29 | dir "%VENV_DIR%\Scripts\Python.exe" >tmp/stdout.txt 2>tmp/stderr.txt
30 | if %ERRORLEVEL% == 0 goto :activate_venv
31 |
32 | for /f "delims=" %%i in ('CALL %PYTHON% -c "import sys; print(sys.executable)"') do set PYTHON_FULLNAME="%%i"
33 | echo Creating venv in directory %VENV_DIR% using python %PYTHON_FULLNAME%
34 | %PYTHON_FULLNAME% -m venv "%VENV_DIR%" >tmp/stdout.txt 2>tmp/stderr.txt
35 | if %ERRORLEVEL% == 0 goto :activate_venv
36 | echo Unable to create venv in directory "%VENV_DIR%"
37 | goto :show_stdout_stderr
38 |
39 | :activate_venv
40 | set PYTHON="%VENV_DIR%\Scripts\Python.exe"
41 | echo venv %PYTHON%
42 |
43 | :launch
44 | %PYTHON% launch.py %*
45 | pause
46 | exit /b
47 |
48 | :show_stdout_stderr
49 |
50 | echo.
51 | echo exit code: %errorlevel%
52 |
53 | for /f %%i in ("tmp\stdout.txt") do set size=%%~zi
54 | if %size% equ 0 goto :show_stderr
55 | echo.
56 | echo stdout:
57 | type tmp\stdout.txt
58 |
59 | :show_stderr
60 | for /f %%i in ("tmp\stderr.txt") do set size=%%~zi
61 | if %size% equ 0 goto :endofscript
62 | echo.
63 | echo stderr:
64 | type tmp\stderr.txt
65 |
66 | :endofscript
67 |
68 | echo.
69 | echo Launch unsuccessful. Exiting.
70 | pause
--------------------------------------------------------------------------------
/webui.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from modules import cmd_opts, ui
4 |
5 | # Ignore the ".DS_Store" files that keep appearing out of nowhere on macOS.
6 | # Not sure this is the right place for this code, though...
7 | _list_dir = os.listdir
8 |
9 | def listdir4mac(path):
10 | return [file for file in _list_dir(path) if not file.startswith(".")]
11 |
12 | os.listdir = listdir4mac
13 |
14 |
15 | def webui():
16 | app = ui.create_ui()
17 | app.queue(64)
18 | app, local_url, share_url = app.launch(
19 | server_name=cmd_opts.opts.host,
20 | server_port=cmd_opts.opts.port,
21 | share=cmd_opts.opts.share,
22 | )
23 |
24 |
25 | if __name__ == "__main__":
26 | webui()
27 |
--------------------------------------------------------------------------------
/webui.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #################################################
3 | # Please do not make any changes to this file, #
4 | # change the variables in webui-user.sh instead #
5 | #################################################
6 |
7 | # If run from macOS, load defaults from webui-macos-env.sh
8 | if [[ "$OSTYPE" == "darwin"* ]]; then
9 | if [[ -f webui-macos-env.sh ]]
10 | then
11 | source ./webui-macos-env.sh
12 | fi
13 | fi
14 |
15 | # Read variables from webui-user.sh
16 | # shellcheck source=/dev/null
17 | if [[ -f webui-user.sh ]]
18 | then
19 | source ./webui-user.sh
20 | fi
21 |
22 | # python3 executable
23 | if [[ -z "${python_cmd}" ]]
24 | then
25 | python_cmd="python3"
26 | fi
27 |
28 | # git executable
29 | if [[ -z "${GIT}" ]]
30 | then
31 | export GIT="git"
32 | fi
33 |
34 | # python3 venv without trailing slash (defaults to ${install_dir}/${clone_dir}/venv)
35 | if [[ -z "${venv_dir}" ]]
36 | then
37 | venv_dir="venv"
38 | fi
39 |
40 | if [[ -z "${LAUNCH_SCRIPT}" ]]
41 | then
42 | LAUNCH_SCRIPT="launch.py"
43 | fi
44 |
45 | # this script cannot be run as root by default
46 | can_run_as_root=0
47 |
48 | # read any command line flags to the webui.sh script
49 | while getopts "f" flag > /dev/null 2>&1
50 | do
51 | case ${flag} in
52 | f) can_run_as_root=1;;
53 | *) break;;
54 | esac
55 | done
56 |
57 | # Disable sentry logging
58 | export ERROR_REPORTING=FALSE
59 |
60 | # Do not reinstall existing pip packages on Debian/Ubuntu
61 | export PIP_IGNORE_INSTALLED=0
62 |
63 | # Pretty print
64 | delimiter="################################################################"
65 |
66 | # Do not run as root
67 | if [[ $(id -u) -eq 0 && can_run_as_root -eq 0 ]]
68 | then
69 | printf "\n%s\n" "${delimiter}"
70 | printf "\e[1m\e[31mERROR: This script must not be launched as root, aborting...\e[0m"
71 | printf "\n%s\n" "${delimiter}"
72 | exit 1
73 | else
74 | printf "\n%s\n" "${delimiter}"
75 | printf "Running on \e[1m\e[32m%s\e[0m user" "$(whoami)"
76 | printf "\n%s\n" "${delimiter}"
77 | fi
78 |
79 | if echo "$gpu_info" | grep -q "AMD" && [[ -z "${TORCH_COMMAND}" ]]
80 | then
81 | export TORCH_COMMAND="pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.2"
82 | fi
83 |
84 | for preq in "${GIT}" "${python_cmd}"
85 | do
86 | if ! hash "${preq}" &>/dev/null
87 | then
88 | printf "\n%s\n" "${delimiter}"
89 | printf "\e[1m\e[31mERROR: %s is not installed, aborting...\e[0m" "${preq}"
90 | printf "\n%s\n" "${delimiter}"
91 | exit 1
92 | fi
93 | done
94 |
95 | if ! "${python_cmd}" -c "import venv" &>/dev/null
96 | then
97 | printf "\n%s\n" "${delimiter}"
98 | printf "\e[1m\e[31mERROR: python3-venv is not installed, aborting...\e[0m"
99 | printf "\n%s\n" "${delimiter}"
100 | exit 1
101 | fi
102 |
103 | printf "\n%s\n" "${delimiter}"
104 | printf "Create and activate python venv"
105 | printf "\n%s\n" "${delimiter}"
106 | if [[ ! -d "${venv_dir}" ]]
107 | then
108 | "${python_cmd}" -m venv "${venv_dir}"
109 | first_launch=1
110 | fi
111 | # shellcheck source=/dev/null
112 | if [[ -f "${venv_dir}"/bin/activate ]]
113 | then
114 | source "${venv_dir}"/bin/activate
115 | else
116 | printf "\n%s\n" "${delimiter}"
117 | printf "\e[1m\e[31mERROR: Cannot activate python venv, aborting...\e[0m"
118 | printf "\n%s\n" "${delimiter}"
119 | exit 1
120 | fi
121 |
122 | printf "\n%s\n" "${delimiter}"
123 | printf "Launching launch.py..."
124 | printf "\n%s\n" "${delimiter}"
125 | exec "${python_cmd}" "${LAUNCH_SCRIPT}" "$@"
--------------------------------------------------------------------------------