├── LICENSE
├── README.md
├── __init__.py
├── donate.jpg
├── download_models.py
├── nodes.py
├── requirements.txt
├── rvc
├── __init__.py
├── configs
│ ├── __pycache__
│ │ └── config.cpython-310.pyc
│ ├── config.json
│ ├── config.py
│ ├── inuse
│ │ ├── .gitignore
│ │ ├── v1
│ │ │ └── .gitignore
│ │ └── v2
│ │ │ └── .gitignore
│ ├── v1
│ │ ├── 32k.json
│ │ ├── 40k.json
│ │ └── 48k.json
│ └── v2
│ │ ├── 32k.json
│ │ └── 48k.json
├── i18n
│ ├── __pycache__
│ │ └── i18n.cpython-310.pyc
│ ├── i18n.py
│ ├── locale
│ │ ├── en_US.json
│ │ ├── es_ES.json
│ │ ├── fr_FR.json
│ │ ├── it_IT.json
│ │ ├── ja_JP.json
│ │ ├── ko_KR.json
│ │ ├── pt_BR.json
│ │ ├── ru_RU.json
│ │ ├── tr_TR.json
│ │ ├── zh_CN.json
│ │ ├── zh_HK.json
│ │ ├── zh_SG.json
│ │ └── zh_TW.json
│ ├── locale_diff.py
│ └── scan_i18n.py
├── infer
│ ├── lib
│ │ ├── __pycache__
│ │ │ ├── audio.cpython-310.pyc
│ │ │ ├── rmvpe.cpython-310.pyc
│ │ │ ├── rvcmd.cpython-310.pyc
│ │ │ └── slicer2.cpython-310.pyc
│ │ ├── audio.py
│ │ ├── infer_pack
│ │ │ ├── __pycache__
│ │ │ │ ├── attentions.cpython-310.pyc
│ │ │ │ ├── commons.cpython-310.pyc
│ │ │ │ ├── models.cpython-310.pyc
│ │ │ │ ├── modules.cpython-310.pyc
│ │ │ │ └── transforms.cpython-310.pyc
│ │ │ ├── attentions.py
│ │ │ ├── attentions_onnx.py
│ │ │ ├── commons.py
│ │ │ ├── models.py
│ │ │ ├── models_onnx.py
│ │ │ ├── modules.py
│ │ │ ├── modules
│ │ │ │ └── F0Predictor
│ │ │ │ │ ├── DioF0Predictor.py
│ │ │ │ │ ├── F0Predictor.py
│ │ │ │ │ ├── HarvestF0Predictor.py
│ │ │ │ │ ├── PMF0Predictor.py
│ │ │ │ │ └── __init__.py
│ │ │ ├── onnx_inference.py
│ │ │ └── transforms.py
│ │ ├── jit
│ │ │ ├── __init__.py
│ │ │ ├── __pycache__
│ │ │ │ └── __init__.cpython-310.pyc
│ │ │ ├── get_hubert.py
│ │ │ ├── get_rmvpe.py
│ │ │ └── get_synthesizer.py
│ │ ├── rmvpe.py
│ │ ├── rtrvc.py
│ │ ├── rvcmd.py
│ │ ├── slicer2.py
│ │ ├── train
│ │ │ ├── __pycache__
│ │ │ │ ├── data_utils.cpython-310.pyc
│ │ │ │ ├── losses.cpython-310.pyc
│ │ │ │ ├── mel_processing.cpython-310.pyc
│ │ │ │ ├── process_ckpt.cpython-310.pyc
│ │ │ │ └── utils.cpython-310.pyc
│ │ │ ├── data_utils.py
│ │ │ ├── losses.py
│ │ │ ├── mel_processing.py
│ │ │ ├── process_ckpt.py
│ │ │ └── utils.py
│ │ └── uvr5_pack
│ │ │ ├── lib_v5
│ │ │ ├── dataset.py
│ │ │ ├── layers.py
│ │ │ ├── layers_123812KB .py
│ │ │ ├── layers_123821KB.py
│ │ │ ├── layers_33966KB.py
│ │ │ ├── layers_537227KB.py
│ │ │ ├── layers_537238KB.py
│ │ │ ├── layers_new.py
│ │ │ ├── model_param_init.py
│ │ │ ├── modelparams
│ │ │ │ ├── 1band_sr16000_hl512.json
│ │ │ │ ├── 1band_sr32000_hl512.json
│ │ │ │ ├── 1band_sr33075_hl384.json
│ │ │ │ ├── 1band_sr44100_hl1024.json
│ │ │ │ ├── 1band_sr44100_hl256.json
│ │ │ │ ├── 1band_sr44100_hl512.json
│ │ │ │ ├── 1band_sr44100_hl512_cut.json
│ │ │ │ ├── 2band_32000.json
│ │ │ │ ├── 2band_44100_lofi.json
│ │ │ │ ├── 2band_48000.json
│ │ │ │ ├── 3band_44100.json
│ │ │ │ ├── 3band_44100_mid.json
│ │ │ │ ├── 3band_44100_msb2.json
│ │ │ │ ├── 4band_44100.json
│ │ │ │ ├── 4band_44100_mid.json
│ │ │ │ ├── 4band_44100_msb.json
│ │ │ │ ├── 4band_44100_msb2.json
│ │ │ │ ├── 4band_44100_reverse.json
│ │ │ │ ├── 4band_44100_sw.json
│ │ │ │ ├── 4band_v2.json
│ │ │ │ ├── 4band_v2_sn.json
│ │ │ │ ├── 4band_v3.json
│ │ │ │ └── ensemble.json
│ │ │ ├── nets.py
│ │ │ ├── nets_123812KB.py
│ │ │ ├── nets_123821KB.py
│ │ │ ├── nets_33966KB.py
│ │ │ ├── nets_537227KB.py
│ │ │ ├── nets_537238KB.py
│ │ │ ├── nets_61968KB.py
│ │ │ ├── nets_new.py
│ │ │ └── spec_utils.py
│ │ │ ├── name_params.json
│ │ │ └── utils.py
│ └── modules
│ │ ├── gui
│ │ ├── __init__.py
│ │ ├── torchgate.py
│ │ └── utils.py
│ │ ├── ipex
│ │ ├── __init__.py
│ │ ├── attention.py
│ │ ├── gradscaler.py
│ │ └── hijacks.py
│ │ ├── onnx
│ │ └── export.py
│ │ ├── train
│ │ ├── extract
│ │ │ ├── extract_f0_print.py
│ │ │ ├── extract_f0_rmvpe.py
│ │ │ └── extract_f0_rmvpe_dml.py
│ │ ├── extract_feature_print.py
│ │ ├── preprocess.py
│ │ └── train.py
│ │ ├── uvr5
│ │ ├── mdxnet.py
│ │ ├── modules.py
│ │ └── vr.py
│ │ └── vc
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ ├── __init__.cpython-310.pyc
│ │ ├── modules.cpython-310.pyc
│ │ ├── pipeline.cpython-310.pyc
│ │ └── utils.cpython-310.pyc
│ │ ├── modules.py
│ │ ├── pipeline.py
│ │ └── utils.py
├── logs
│ └── mute
│ │ ├── 0_gt_wavs
│ │ ├── mute32k.wav
│ │ ├── mute40k.spec.pt
│ │ ├── mute40k.wav
│ │ ├── mute48k.spec.pt
│ │ └── mute48k.wav
│ │ ├── 1_16k_wavs
│ │ └── mute.wav
│ │ ├── 2a_f0
│ │ └── mute.wav.npy
│ │ ├── 2b-f0nsf
│ │ └── mute.wav.npy
│ │ ├── 3_feature256
│ │ └── mute.npy
│ │ └── 3_feature768
│ │ └── mute.npy
└── train.py
├── web.png
├── web
└── js
│ ├── alertMSG.js
│ ├── previewAudio.js
│ ├── refreshPath.js
│ └── uploadAudio.js
└── wechat.jpg
/LICENSE:
--------------------------------------------------------------------------------
1 | This software and its related code are open-sourced under the MIT license. The author has no control over the software; those who use the software, or distribute audio exported by it, bear full responsibility.
2 | If you do not accept these terms, you may not use or reference any code or files in this package.
3 |
4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and its associated documentation files (the "Software") to use, copy, modify, merge, publish, distribute, sublicense, and/or sell the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | The Software is provided "as is", without warranty of any kind, express or implied, including but not limited to the warranties of merchantability, fitness for a particular purpose, and noninfringement. In no event shall the authors or copyright holders be liable for any claim, damages, or other liability, whether in an action of contract, tort, or otherwise, arising from, out of, or in connection with the Software or the use of or other dealings in the Software.
7 |
8 | MIT License
9 |
10 | Copyright (c) 2024 AIFSH
11 |
12 | Permission is hereby granted, free of charge, to any person obtaining a copy
13 | of this software and associated documentation files (the "Software"), to deal
14 | in the Software without restriction, including without limitation the rights
15 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 | copies of the Software, and to permit persons to whom the Software is
17 | furnished to do so, subject to the following conditions:
18 |
19 | The above copyright notice and this permission notice shall be included in all
20 | copies or substantial portions of the Software.
21 |
22 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 | SOFTWARE.
29 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ComfyUI-RVC
2 | A ComfyUI custom node for [Retrieval-based-Voice-Conversion-WebUI](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI.git), so you can run voice conversion right inside ComfyUI!
3 |
4 | ## How to use
5 | make sure `ffmpeg` works in your command line
6 | for Linux:
7 | ```
8 | apt update
9 | apt install ffmpeg
10 | ```
11 | for Windows, you can install `ffmpeg` automatically with [WingetUI](https://github.com/marticliment/WingetUI)
12 |
13 | then:
14 | ```
15 | git clone https://github.com/AIFSH/ComfyUI-RVC.git
16 | cd ComfyUI-RVC
17 | pip install -r requirements.txt
18 | ```
19 | `weights` will be downloaded from Hugging Face automatically! If you are in China, make sure your network can reach Hugging Face,
20 | or, if you still struggle with Hugging Face, you can follow [hf-mirror](https://hf-mirror.com/) to configure your environment, for example as shown below.
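
A minimal sketch (assuming the standard `HF_ENDPOINT` environment variable, which `huggingface_hub` reads when it is imported):

```python
# Hypothetical sketch: route huggingface_hub downloads through hf-mirror.
# The variable must be set before huggingface_hub is imported.
import os
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

from huggingface_hub import hf_hub_download  # now resolves against the mirror
```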
21 |
22 | Alternatively, download [rvc_assets.zip](https://pan.quark.cn/s/039c8d2d59ac), extract it, and place the contents in the `ComfyUI-RVC/rvc` directory
23 |
24 | ## Tutorial
25 | [Demo](https://www.bilibili.com/video/BV1bH4y1P7n9/)
26 |
27 | ## WeChat Group && Donate
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 | ## Thanks
36 | - [Retrieval-based-Voice-Conversion-WebUI](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI.git)
37 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys,site
3 | from subprocess import Popen
4 | from server import PromptServer
5 | now_dir = os.path.dirname(os.path.abspath(__file__))
6 |
7 | site_packages_roots = []
8 | for path in site.getsitepackages():
9 | if "packages" in path:
10 | site_packages_roots.append(path)
11 | if not site_packages_roots: site_packages_roots = ["%s/runtime/Lib/site-packages" % now_dir]
12 | #os.environ["OPENBLAS_NUM_THREADS"] = "4"
13 | for site_packages_root in site_packages_roots:
14 | if os.path.exists(site_packages_root):
15 | try:
16 | with open("%s/users.pth" % (site_packages_root), "a") as f:
17 | f.write(
18 | "%s\n%s/rvc\n%s/rvc/infer"
19 | % (now_dir,now_dir,now_dir)
20 | )
21 | break
22 |         except PermissionError:
23 |             raise
24 |
25 | if os.path.isfile("%s/users.pth" % (site_packages_root)):
26 |     print("!!!RVC path was added to " + "%s/users.pth" % (site_packages_root)
27 |           + "\n if you meet a `No module` error, try `python main.py` again")
28 |
29 | model_path = os.path.join(now_dir,"rvc", "assets")
30 |
31 | if not os.path.exists(os.path.join(model_path, "pretrained_v2")):
32 | cmd = "python %s/download_models.py" % (now_dir)
33 | p = Popen(cmd, shell=True, cwd=now_dir)
34 | p.wait()
35 | else:
36 |     print("!!!RVC is using cached models; make sure your 'assets' folder is complete")
37 |
38 |
39 | WEB_DIRECTORY = "./web"
40 | from .nodes import LoadAudio, PreViewAudio,RVC_Train,RVC_Infer,CombineAudio
41 |
42 | # Set the web directory, any .js file in that directory will be loaded by the frontend as a frontend extension
43 | # WEB_DIRECTORY = "./somejs"
44 |
45 | # A dictionary that contains all nodes you want to export with their names
46 | # NOTE: names should be globally unique
47 | NODE_CLASS_MAPPINGS = {
48 | "LoadAudio": LoadAudio,
49 | "PreViewAudio": PreViewAudio,
50 | "RVC_Train": RVC_Train,
51 | "RVC_Infer": RVC_Infer,
52 | "CombineAudio": CombineAudio
53 | }
54 |
55 | # A dictionary that contains the friendly/humanly readable titles for the nodes
56 | NODE_DISPLAY_NAME_MAPPINGS = {
57 | "LoadAudio": "AudioLoader",
58 | "PreViewAudio": "PreView Audio",
59 | "RVC_Train": "RVC Train",
60 | "RVC_Infer": "RVC Inference",
61 | "CombineAudio": "CombineAudio"
62 | }
63 |
64 | @PromptServer.instance.routes.get("/rvc/reboot")
65 | def restart(self):
66 | try:
67 | sys.stdout.close_log()
68 | except Exception as e:
69 | pass
70 |
71 | return os.execv(sys.executable, [sys.executable] + sys.argv)
--------------------------------------------------------------------------------
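The `users.pth` trick in `__init__.py` above relies on CPython's `site` machinery: at interpreter startup, every line of a `*.pth` file found in site-packages is appended to `sys.path`. A minimal sketch of the effect (the paths are illustrative, not the real install location):

```python
# Sketch of what the users.pth entries written above accomplish (illustrative paths).
# Python's site module reads each line of a .pth file at startup and appends it
# to sys.path, so modules under rvc/ and rvc/infer/ import without installation.
import sys

for entry in ("/path/to/ComfyUI-RVC",
              "/path/to/ComfyUI-RVC/rvc",
              "/path/to/ComfyUI-RVC/rvc/infer"):
    if entry not in sys.path:
        sys.path.append(entry)
```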
/donate.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/donate.jpg
--------------------------------------------------------------------------------
/download_models.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 | from huggingface_hub import hf_hub_download
4 |
5 | now_dir = os.path.dirname(os.path.abspath(__file__))
6 | BASE_DIR = os.path.join(now_dir, "rvc")
7 |
8 |
9 | if __name__ == "__main__":
10 | os.makedirs(os.path.join(BASE_DIR ,"assets","weights"), exist_ok=True)
11 | weights_path = os.path.join(BASE_DIR ,"assets")
12 | print("Downloading hubert_base.pt...")
13 | hf_hub_download(repo_id="lj1995/VoiceConversionWebUI",
14 | filename="hubert_base.pt",
15 | subfolder= "",
16 | local_dir= os.path.join(weights_path, "hubert"))
17 | print("Downloading rmvpe.pt...")
18 | hf_hub_download(repo_id="lj1995/VoiceConversionWebUI",
19 | filename="rmvpe.pt",
20 | subfolder= "",
21 | local_dir= os.path.join(weights_path, "rmvpe"))
22 |
23 |
24 | print("Downloading pretrained models:")
25 |
26 | model_names = [
27 | "D40k.pth",
28 | "D48k.pth",
29 | "G32k.pth",
30 | "G40k.pth",
31 | "G48k.pth",
32 | "f0D32k.pth",
33 | "f0D40k.pth",
34 | "f0D48k.pth",
35 | "f0G32k.pth",
36 | "f0G40k.pth",
37 | "f0G48k.pth",
38 | ]
39 | for model in model_names:
40 | print(f"Downloading {model}...")
41 | hf_hub_download(repo_id="lj1995/VoiceConversionWebUI",
42 | filename=model,
43 | subfolder= "pretrained",
44 | local_dir= weights_path)
45 |
46 |
47 | print("Downloading pretrained models v2:")
48 |
49 | for model in model_names:
50 | print(f"Downloading {model}...")
51 | hf_hub_download(repo_id="lj1995/VoiceConversionWebUI",
52 | filename=model,
53 | subfolder= "pretrained_v2",
54 | local_dir= weights_path)
55 |
56 | print("All models downloaded!")
57 |
--------------------------------------------------------------------------------
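With `local_dir` set, `hf_hub_download` mirrors the repo's `subfolder` layout under that directory, so after `download_models.py` finishes the assets should roughly match the sketch below (a best-guess check, not part of the script; exact behavior can vary across `huggingface_hub` versions):

```python
# Hypothetical layout check for the downloads performed above.
import os

BASE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "rvc")
expected = [
    os.path.join(BASE_DIR, "assets", "hubert", "hubert_base.pt"),
    os.path.join(BASE_DIR, "assets", "rmvpe", "rmvpe.pt"),
    os.path.join(BASE_DIR, "assets", "pretrained", "f0G40k.pth"),
    os.path.join(BASE_DIR, "assets", "pretrained_v2", "f0G40k.pth"),
]
for path in expected:
    print(("OK   " if os.path.exists(path) else "MISS ") + path)
```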
/requirements.txt:
--------------------------------------------------------------------------------
1 | joblib>=1.1.0
2 | numba
3 | numpy==1.23.5
4 | scipy
5 | librosa==0.9.1
6 | llvmlite
7 | fairseq
8 | faiss-cpu
9 | Cython
10 | pydub>=0.25.1
11 | soundfile>=0.12.1
12 | ffmpeg-python>=0.2.0
13 | tensorboardX
14 | Jinja2>=3.1.2
15 | json5
16 | Markdown
17 | matplotlib>=3.7.0
18 | matplotlib-inline>=0.1.3
19 | praat-parselmouth>=0.4.2
20 | Pillow>=9.1.1
21 | resampy>=0.4.2
22 | scikit-learn
23 | tensorboard
24 | tqdm>=4.63.1
25 | tornado>=6.1
26 | Werkzeug>=2.2.3
27 | uc-micro-py>=1.0.1
28 | sympy>=1.11.1
29 | tabulate>=0.8.10
30 | PyYAML>=6.0
31 | pyasn1>=0.4.8
32 | pyasn1-modules>=0.2.8
33 | fsspec>=2022.11.0
34 | absl-py>=1.2.0
35 | audioread
36 | uvicorn>=0.21.1
37 | colorama>=0.4.5
38 | pyworld==0.3.2
39 | httpx
40 | onnxruntime; sys_platform == 'darwin'
41 | onnxruntime-gpu; sys_platform != 'darwin'
42 | torchcrepe==0.0.20
43 | fastapi
44 | torchfcpe
45 | ffmpy==0.3.1
46 | python-dotenv>=1.0.0
47 | av
48 |
--------------------------------------------------------------------------------
/rvc/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/__init__.py
--------------------------------------------------------------------------------
/rvc/configs/__pycache__/config.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/configs/__pycache__/config.cpython-310.pyc
--------------------------------------------------------------------------------
/rvc/configs/config.json:
--------------------------------------------------------------------------------
1 | {"pth_path": "assets/weights/kikiV1.pth", "index_path": "logs/kikiV1.index", "sg_hostapi": "MME", "sg_wasapi_exclusive": false, "sg_input_device": "VoiceMeeter Output (VB-Audio Vo", "sg_output_device": "VoiceMeeter Input (VB-Audio Voi", "sr_type": "sr_device", "threhold": -60.0, "pitch": 12.0, "formant": 0.0, "rms_mix_rate": 0.5, "index_rate": 0.0, "block_time": 0.15, "crossfade_length": 0.08, "extra_time": 2.0, "n_cpu": 4.0, "use_jit": false, "use_pv": false, "f0method": "fcpe"}
--------------------------------------------------------------------------------
/rvc/configs/inuse/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 | !v1
4 | !v2
5 |
--------------------------------------------------------------------------------
/rvc/configs/inuse/v1/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 |
--------------------------------------------------------------------------------
/rvc/configs/inuse/v2/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 |
--------------------------------------------------------------------------------
/rvc/configs/v1/32k.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "seed": 1234,
5 | "epochs": 20000,
6 | "learning_rate": 1e-4,
7 | "betas": [0.8, 0.99],
8 | "eps": 1e-9,
9 | "batch_size": 4,
10 | "fp16_run": true,
11 | "lr_decay": 0.999875,
12 | "segment_size": 12800,
13 | "init_lr_ratio": 1,
14 | "warmup_epochs": 0,
15 | "c_mel": 45,
16 | "c_kl": 1.0
17 | },
18 | "data": {
19 | "max_wav_value": 32768.0,
20 | "sampling_rate": 32000,
21 | "filter_length": 1024,
22 | "hop_length": 320,
23 | "win_length": 1024,
24 | "n_mel_channels": 80,
25 | "mel_fmin": 0.0,
26 | "mel_fmax": null
27 | },
28 | "model": {
29 | "inter_channels": 192,
30 | "hidden_channels": 192,
31 | "filter_channels": 768,
32 | "n_heads": 2,
33 | "n_layers": 6,
34 | "kernel_size": 3,
35 | "p_dropout": 0,
36 | "resblock": "1",
37 | "resblock_kernel_sizes": [3,7,11],
38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39 | "upsample_rates": [10,4,2,2,2],
40 | "upsample_initial_channel": 512,
41 | "upsample_kernel_sizes": [16,16,4,4,4],
42 | "use_spectral_norm": false,
43 | "gin_channels": 256,
44 | "spk_embed_dim": 109
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/rvc/configs/v1/40k.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "seed": 1234,
5 | "epochs": 20000,
6 | "learning_rate": 1e-4,
7 | "betas": [0.8, 0.99],
8 | "eps": 1e-9,
9 | "batch_size": 4,
10 | "fp16_run": true,
11 | "lr_decay": 0.999875,
12 | "segment_size": 12800,
13 | "init_lr_ratio": 1,
14 | "warmup_epochs": 0,
15 | "c_mel": 45,
16 | "c_kl": 1.0
17 | },
18 | "data": {
19 | "max_wav_value": 32768.0,
20 | "sampling_rate": 40000,
21 | "filter_length": 2048,
22 | "hop_length": 400,
23 | "win_length": 2048,
24 | "n_mel_channels": 125,
25 | "mel_fmin": 0.0,
26 | "mel_fmax": null
27 | },
28 | "model": {
29 | "inter_channels": 192,
30 | "hidden_channels": 192,
31 | "filter_channels": 768,
32 | "n_heads": 2,
33 | "n_layers": 6,
34 | "kernel_size": 3,
35 | "p_dropout": 0,
36 | "resblock": "1",
37 | "resblock_kernel_sizes": [3,7,11],
38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39 | "upsample_rates": [10,10,2,2],
40 | "upsample_initial_channel": 512,
41 | "upsample_kernel_sizes": [16,16,4,4],
42 | "use_spectral_norm": false,
43 | "gin_channels": 256,
44 | "spk_embed_dim": 109
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/rvc/configs/v1/48k.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "seed": 1234,
5 | "epochs": 20000,
6 | "learning_rate": 1e-4,
7 | "betas": [0.8, 0.99],
8 | "eps": 1e-9,
9 | "batch_size": 4,
10 | "fp16_run": true,
11 | "lr_decay": 0.999875,
12 | "segment_size": 11520,
13 | "init_lr_ratio": 1,
14 | "warmup_epochs": 0,
15 | "c_mel": 45,
16 | "c_kl": 1.0
17 | },
18 | "data": {
19 | "max_wav_value": 32768.0,
20 | "sampling_rate": 48000,
21 | "filter_length": 2048,
22 | "hop_length": 480,
23 | "win_length": 2048,
24 | "n_mel_channels": 128,
25 | "mel_fmin": 0.0,
26 | "mel_fmax": null
27 | },
28 | "model": {
29 | "inter_channels": 192,
30 | "hidden_channels": 192,
31 | "filter_channels": 768,
32 | "n_heads": 2,
33 | "n_layers": 6,
34 | "kernel_size": 3,
35 | "p_dropout": 0,
36 | "resblock": "1",
37 | "resblock_kernel_sizes": [3,7,11],
38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39 | "upsample_rates": [10,6,2,2,2],
40 | "upsample_initial_channel": 512,
41 | "upsample_kernel_sizes": [16,16,4,4,4],
42 | "use_spectral_norm": false,
43 | "gin_channels": 256,
44 | "spk_embed_dim": 109
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/rvc/configs/v2/32k.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "seed": 1234,
5 | "epochs": 20000,
6 | "learning_rate": 1e-4,
7 | "betas": [0.8, 0.99],
8 | "eps": 1e-9,
9 | "batch_size": 4,
10 | "fp16_run": true,
11 | "lr_decay": 0.999875,
12 | "segment_size": 12800,
13 | "init_lr_ratio": 1,
14 | "warmup_epochs": 0,
15 | "c_mel": 45,
16 | "c_kl": 1.0
17 | },
18 | "data": {
19 | "max_wav_value": 32768.0,
20 | "sampling_rate": 32000,
21 | "filter_length": 1024,
22 | "hop_length": 320,
23 | "win_length": 1024,
24 | "n_mel_channels": 80,
25 | "mel_fmin": 0.0,
26 | "mel_fmax": null
27 | },
28 | "model": {
29 | "inter_channels": 192,
30 | "hidden_channels": 192,
31 | "filter_channels": 768,
32 | "n_heads": 2,
33 | "n_layers": 6,
34 | "kernel_size": 3,
35 | "p_dropout": 0,
36 | "resblock": "1",
37 | "resblock_kernel_sizes": [3,7,11],
38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39 | "upsample_rates": [10,8,2,2],
40 | "upsample_initial_channel": 512,
41 | "upsample_kernel_sizes": [20,16,4,4],
42 | "use_spectral_norm": false,
43 | "gin_channels": 256,
44 | "spk_embed_dim": 109
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/rvc/configs/v2/48k.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "seed": 1234,
5 | "epochs": 20000,
6 | "learning_rate": 1e-4,
7 | "betas": [0.8, 0.99],
8 | "eps": 1e-9,
9 | "batch_size": 4,
10 | "fp16_run": true,
11 | "lr_decay": 0.999875,
12 | "segment_size": 17280,
13 | "init_lr_ratio": 1,
14 | "warmup_epochs": 0,
15 | "c_mel": 45,
16 | "c_kl": 1.0
17 | },
18 | "data": {
19 | "max_wav_value": 32768.0,
20 | "sampling_rate": 48000,
21 | "filter_length": 2048,
22 | "hop_length": 480,
23 | "win_length": 2048,
24 | "n_mel_channels": 128,
25 | "mel_fmin": 0.0,
26 | "mel_fmax": null
27 | },
28 | "model": {
29 | "inter_channels": 192,
30 | "hidden_channels": 192,
31 | "filter_channels": 768,
32 | "n_heads": 2,
33 | "n_layers": 6,
34 | "kernel_size": 3,
35 | "p_dropout": 0,
36 | "resblock": "1",
37 | "resblock_kernel_sizes": [3,7,11],
38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39 | "upsample_rates": [12,10,2,2],
40 | "upsample_initial_channel": 512,
41 | "upsample_kernel_sizes": [24,20,4,4],
42 | "use_spectral_norm": false,
43 | "gin_channels": 256,
44 | "spk_embed_dim": 109
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/rvc/i18n/__pycache__/i18n.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/i18n/__pycache__/i18n.cpython-310.pyc
--------------------------------------------------------------------------------
/rvc/i18n/i18n.py:
--------------------------------------------------------------------------------
1 | import json
2 | import locale
3 | import os
4 |
5 | now_dir = os.path.dirname(os.path.abspath(__file__))
6 | def load_language_list(language):
7 | with open(f"{now_dir}/locale/{language}.json", "r", encoding="utf-8") as f:
8 | language_list = json.load(f)
9 | return language_list
10 |
11 |
12 | class I18nAuto:
13 | def __init__(self, language=None):
14 | if language in ["Auto", None]:
15 | language = locale.getdefaultlocale()[
16 | 0
17 | ] # getlocale can't identify the system's language ((None, None))
18 | if not os.path.exists(f"{now_dir}/locale/{language}.json"):
19 | language = "en_US"
20 | self.language = language
21 | self.language_map = load_language_list(language)
22 |
23 | def __call__(self, key):
24 | return self.language_map.get(key, key)
25 |
26 | def __repr__(self):
27 | return "Use Language: " + self.language
28 |
--------------------------------------------------------------------------------
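A minimal usage sketch for `I18nAuto` (assuming `rvc/` is on `sys.path`, which the package `__init__.py` arranges via `users.pth`; the key below is illustrative):

```python
# Illustrative use of I18nAuto; unknown keys are echoed back unchanged.
from i18n.i18n import I18nAuto

i18n = I18nAuto()        # picks the system locale, falling back to en_US
print(i18n)              # e.g. "Use Language: en_US"
print(i18n("模型推理"))   # translated if the key exists in the locale file
```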
/rvc/i18n/locale/zh_CN.json:
--------------------------------------------------------------------------------
1 | {
2 | ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音",
3 | "A模型权重": "A模型权重",
4 | "A模型路径": "A模型路径",
5 | "B模型路径": "B模型路径",
6 | "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src",
7 | "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调",
8 | "Index Rate": "Index Rate",
9 | "Onnx导出": "Onnx导出",
10 | "Onnx输出路径": "Onnx输出路径",
11 | "RVC模型路径": "RVC模型路径",
12 | "ckpt处理": "ckpt处理",
13 | "harvest进程数": "harvest进程数",
14 | "index文件路径不可包含中文": "index文件路径不可包含中文",
15 | "pth文件路径不可包含中文": "pth文件路径不可包含中文",
16 | "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程",
17 | "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ",
18 | "step1:正在处理数据": "step1:正在处理数据",
19 | "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征",
20 | "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ",
21 | "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)",
22 | "step3: 填写训练设置, 开始训练模型和索引": "step3: 填写训练设置, 开始训练模型和索引",
23 | "step3a:正在训练模型": "step3a:正在训练模型",
24 | "一键训练": "一键训练",
25 | "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹",
26 |     "人声伴奏分离批量处理, 使用UVR5模型。\n合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。\n模型分为三类:\n1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;\n2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;\n3、去混响、去延迟模型(by FoxJoy):\n(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;\n(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。\n去混响/去延迟,附:\n1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;\n2、MDX-Net-Dereverb模型挺慢的;\n3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "人声伴奏分离批量处理, 使用UVR5模型。\n合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。\n模型分为三类:\n1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;\n2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;\n3、去混响、去延迟模型(by FoxJoy):\n(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;\n(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。\n去混响/去延迟,附:\n1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;\n2、MDX-Net-Dereverb模型挺慢的;\n3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。",
27 | "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2",
28 | "伴奏人声分离&去混响&去回声": "伴奏人声分离&去混响&去回声",
29 | "使用模型采样率": "使用模型采样率",
30 | "使用设备采样率": "使用设备采样率",
31 | "保存名": "保存名",
32 | "保存的文件名, 默认空为和源文件同名": "保存的文件名, 默认空为和源文件同名",
33 | "保存的模型名不带后缀": "保存的模型名不带后缀",
34 | "保存频率save_every_epoch": "保存频率save_every_epoch",
35 | "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果",
36 | "修改": "修改",
37 | "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型信息(仅支持weights文件夹下提取的小模型文件)",
38 | "停止音频转换": "停止音频转换",
39 | "全流程结束!": "全流程结束!",
40 | "共振偏移": "共振偏移",
41 | "刷新音色列表和索引路径": "刷新音色列表和索引路径",
42 | "加载模型": "加载模型",
43 | "加载预训练底模D路径": "加载预训练底模D路径",
44 | "加载预训练底模G路径": "加载预训练底模G路径",
45 | "单次推理": "单次推理",
46 | "卸载音色省显存": "卸载音色省显存",
47 | "变调(整数, 半音数量, 升八度12降八度-12)": "变调(整数, 半音数量, 升八度12降八度-12)",
48 | "后处理重采样至最终采样率,0为不进行重采样": "后处理重采样至最终采样率,0为不进行重采样",
49 | "否": "否",
50 | "启用相位声码器": "启用相位声码器",
51 | "响应阈值": "响应阈值",
52 | "响度因子": "响度因子",
53 | "处理数据": "处理数据",
54 | "导出Onnx模型": "导出Onnx模型",
55 | "导出文件格式": "导出文件格式",
56 | "常见问题解答": "常见问题解答",
57 | "常规设置": "常规设置",
58 | "开始音频转换": "开始音频转换",
59 | "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练",
60 | "性能设置": "性能设置",
61 | "总训练轮数total_epoch": "总训练轮数total_epoch",
62 | "批量推理": "批量推理",
63 | "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ",
64 | "指定输出主人声文件夹": "指定输出主人声文件夹",
65 | "指定输出文件夹": "指定输出文件夹",
66 | "指定输出非主人声文件夹": "指定输出非主人声文件夹",
67 | "推理时间(ms):": "推理时间(ms):",
68 | "推理音色": "推理音色",
69 | "提取": "提取",
70 | "提取音高和处理数据使用的CPU进程数": "提取音高和处理数据使用的CPU进程数",
71 | "是": "是",
72 | "是否仅保存最新的ckpt文件以节省硬盘空间": "是否仅保存最新的ckpt文件以节省硬盘空间",
73 | "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存时间点将最终小模型保存至weights文件夹",
74 | "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速",
75 | "显卡信息": "显卡信息",
76 |     "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. \n如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. \n如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.",
77 | "查看": "查看",
78 | "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型信息(仅支持weights文件夹下提取的小模型文件)",
79 | "检索特征占比": "检索特征占比",
80 | "模型": "模型",
81 | "模型推理": "模型推理",
82 | "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况",
83 | "模型是否带音高指导": "模型是否带音高指导",
84 | "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否带音高指导(唱歌一定要, 语音可以不要)",
85 | "模型是否带音高指导,1是0否": "模型是否带音高指导,1是0否",
86 | "模型版本型号": "模型版本型号",
87 | "模型融合, 可用于测试音色融合": "模型融合, 可用于测试音色融合",
88 | "模型路径": "模型路径",
89 | "每张显卡的batch_size": "每张显卡的batch_size",
90 | "淡入淡出长度": "淡入淡出长度",
91 | "版本": "版本",
92 | "特征提取": "特征提取",
93 | "特征检索库文件路径,为空则使用下拉的选择结果": "特征检索库文件路径,为空则使用下拉的选择结果",
94 | "独占 WASAPI 设备": "独占 WASAPI 设备",
95 | "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ",
96 | "目标采样率": "目标采样率",
97 | "算法延迟(ms):": "算法延迟(ms):",
98 | "自动检测index路径,下拉式选择(dropdown)": "自动检测index路径,下拉式选择(dropdown)",
99 | "融合": "融合",
100 | "要改的模型信息": "要改的模型信息",
101 | "要置入的模型信息": "要置入的模型信息",
102 | "训练": "训练",
103 | "训练模型": "训练模型",
104 | "训练特征索引": "训练特征索引",
105 | "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log",
106 | "设备类型": "设备类型",
107 | "请指定说话人id": "请指定说话人id",
108 | "请选择index文件": "请选择index文件",
109 | "请选择pth文件": "请选择pth文件",
110 | "请选择说话人id": "请选择说话人id",
111 | "转换": "转换",
112 | "输入实验名": "输入实验名",
113 | "输入待处理音频文件夹路径": "输入待处理音频文件夹路径",
114 | "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)",
115 | "输入待处理音频文件路径(默认是正确格式示例)": "输入待处理音频文件路径(默认是正确格式示例)",
116 | "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络",
117 | "输入监听": "输入监听",
118 | "输入训练文件夹路径": "输入训练文件夹路径",
119 | "输入设备": "输入设备",
120 | "输入降噪": "输入降噪",
121 | "输出信息": "输出信息",
122 | "输出变声": "输出变声",
123 | "输出设备": "输出设备",
124 | "输出降噪": "输出降噪",
125 | "输出音频(右下角三个点,点了可以下载)": "输出音频(右下角三个点,点了可以下载)",
126 | "选择.index文件": "选择.index文件",
127 | "选择.pth文件": "选择.pth文件",
128 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU",
129 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU",
130 | "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU",
131 | "采样率:": "采样率:",
132 | "采样长度": "采样长度",
133 | "重载设备列表": "重载设备列表",
134 | "音调设置": "音调设置",
135 | "音频设备": "音频设备",
136 | "音高算法": "音高算法",
137 | "额外推理时长": "额外推理时长"
138 | }
139 |
--------------------------------------------------------------------------------
/rvc/i18n/locale_diff.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | from collections import OrderedDict
4 |
5 | # Define the standard file name
6 | standard_file = "locale/zh_CN.json"
7 |
8 | # Find all JSON files in the directory
9 | dir_path = "locale/"
10 | languages = [
11 | os.path.join(dir_path, f)
12 | for f in os.listdir(dir_path)
13 | if f.endswith(".json") and f != standard_file
14 | ]
15 |
16 | # Load the standard file
17 | with open(standard_file, "r", encoding="utf-8") as f:
18 | standard_data = json.load(f, object_pairs_hook=OrderedDict)
19 |
20 | # Loop through each language file
21 | for lang_file in languages:
22 | # Load the language file
23 | with open(lang_file, "r", encoding="utf-8") as f:
24 | lang_data = json.load(f, object_pairs_hook=OrderedDict)
25 |
26 | # Find the difference between the language file and the standard file
27 | diff = set(standard_data.keys()) - set(lang_data.keys())
28 |
29 | miss = set(lang_data.keys()) - set(standard_data.keys())
30 |
31 | # Add any missing keys to the language file
32 | for key in diff:
33 | lang_data[key] = key
34 |
35 |     # Delete any extra keys from the language file
36 | for key in miss:
37 | del lang_data[key]
38 |
39 | # Sort the keys of the language file to match the order of the standard file
40 | lang_data = OrderedDict(
41 | sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0]))
42 | )
43 |
44 | # Save the updated language file
45 | with open(lang_file, "w", encoding="utf-8") as f:
46 |         json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True)  # note: sort_keys=True re-sorts alphabetically, overriding the ordering built above
47 | f.write("\n")
48 |
--------------------------------------------------------------------------------
/rvc/i18n/scan_i18n.py:
--------------------------------------------------------------------------------
1 | import ast
2 | import glob
3 | import json
4 | from collections import OrderedDict
5 |
6 |
7 | def extract_i18n_strings(node):
8 | i18n_strings = []
9 |
10 | if (
11 | isinstance(node, ast.Call)
12 | and isinstance(node.func, ast.Name)
13 | and node.func.id == "i18n"
14 | ):
15 | for arg in node.args:
16 | if isinstance(arg, ast.Str):
17 | i18n_strings.append(arg.s)
18 |
19 | for child_node in ast.iter_child_nodes(node):
20 | i18n_strings.extend(extract_i18n_strings(child_node))
21 |
22 | return i18n_strings
23 |
24 |
25 | # scan the directory for all .py files (recursively)
26 | # for each file, parse the code into an AST
27 | # for each AST, extract the i18n strings
28 |
29 | strings = []
30 | for filename in glob.iglob("**/*.py", recursive=True):
31 | with open(filename, "r") as f:
32 | code = f.read()
33 | if "I18nAuto" in code:
34 | tree = ast.parse(code)
35 | i18n_strings = extract_i18n_strings(tree)
36 | print(filename, len(i18n_strings))
37 | strings.extend(i18n_strings)
38 | code_keys = set(strings)
39 | """
40 | n_i18n.py
41 | gui_v1.py 26
42 | app.py 16
43 | infer-web.py 147
44 | scan_i18n.py 0
45 | i18n.py 0
46 | lib/train/process_ckpt.py 1
47 | """
48 | print()
49 | print("Total unique:", len(code_keys))
50 |
51 |
52 | standard_file = "i18n/locale/zh_CN.json"
53 | with open(standard_file, "r", encoding="utf-8") as f:
54 | standard_data = json.load(f, object_pairs_hook=OrderedDict)
55 | standard_keys = set(standard_data.keys())
56 |
57 | # Define the standard file name
58 | unused_keys = standard_keys - code_keys
59 | print("Unused keys:", len(unused_keys))
60 | for unused_key in unused_keys:
61 | print("\t", unused_key)
62 |
63 | missing_keys = code_keys - standard_keys
64 | print("Missing keys:", len(missing_keys))
65 | for missing_key in missing_keys:
66 | print("\t", missing_key)
67 |
68 | code_keys_dict = OrderedDict()
69 | for s in strings:
70 | code_keys_dict[s] = s
71 |
72 | # write back
73 | with open(standard_file, "w", encoding="utf-8") as f:
74 | json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True)
75 | f.write("\n")
76 |
--------------------------------------------------------------------------------
/rvc/infer/lib/__pycache__/audio.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/__pycache__/audio.cpython-310.pyc
--------------------------------------------------------------------------------
/rvc/infer/lib/__pycache__/rmvpe.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/__pycache__/rmvpe.cpython-310.pyc
--------------------------------------------------------------------------------
/rvc/infer/lib/__pycache__/rvcmd.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/__pycache__/rvcmd.cpython-310.pyc
--------------------------------------------------------------------------------
/rvc/infer/lib/__pycache__/slicer2.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/__pycache__/slicer2.cpython-310.pyc
--------------------------------------------------------------------------------
/rvc/infer/lib/audio.py:
--------------------------------------------------------------------------------
1 | import platform
2 | import ffmpeg
3 | import numpy as np
4 | import av
5 |
6 |
7 | def wav2(i, o, format):
8 | inp = av.open(i, "r")
9 | if format == "m4a":
10 | format = "mp4"
11 | out = av.open(o, "w", format=format)
12 | if format == "ogg":
13 | format = "libvorbis"
14 | if format == "mp4":
15 | format = "aac"
16 |
17 | ostream = out.add_stream(format)
18 |
19 | for frame in inp.decode(audio=0):
20 | for p in ostream.encode(frame):
21 | out.mux(p)
22 |
23 | for p in ostream.encode(None):
24 | out.mux(p)
25 |
26 | out.close()
27 | inp.close()
28 |
29 |
30 | def load_audio(file, sr):
31 | try:
32 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
33 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
34 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
35 |         file = clean_path(file)  # guard against pasted paths carrying stray spaces, quotes, or newlines
36 | out, _ = (
37 | ffmpeg.input(file, threads=0)
38 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
39 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
40 | )
41 | except Exception as e:
42 | raise RuntimeError(f"Failed to load audio: {e}")
43 |
44 | return np.frombuffer(out, np.float32).flatten()
45 |
46 |
47 | def clean_path(path_str):
48 | if platform.system() == "Windows":
49 | path_str = path_str.replace("/", "\\")
50 | return path_str.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
51 |
--------------------------------------------------------------------------------
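A usage sketch for the two helpers above (file names are placeholders; `load_audio` shells out to the `ffmpeg` CLI, so it must be on `PATH`):

```python
# Illustrative usage of load_audio/wav2; input.mp3 and out.m4a are placeholders.
import soundfile as sf
from infer.lib.audio import load_audio, wav2

audio = load_audio("input.mp3", 16000)   # mono float32 ndarray at 16 kHz
sf.write("out.wav", audio, 16000)

with open("out.wav", "rb") as i, open("out.m4a", "wb") as o:
    wav2(i, o, "m4a")                    # re-encode to AAC-in-MP4 via PyAV
```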
/rvc/infer/lib/infer_pack/__pycache__/attentions.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/infer_pack/__pycache__/attentions.cpython-310.pyc
--------------------------------------------------------------------------------
/rvc/infer/lib/infer_pack/__pycache__/commons.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/infer_pack/__pycache__/commons.cpython-310.pyc
--------------------------------------------------------------------------------
/rvc/infer/lib/infer_pack/__pycache__/models.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/infer_pack/__pycache__/models.cpython-310.pyc
--------------------------------------------------------------------------------
/rvc/infer/lib/infer_pack/__pycache__/modules.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/infer_pack/__pycache__/modules.cpython-310.pyc
--------------------------------------------------------------------------------
/rvc/infer/lib/infer_pack/__pycache__/transforms.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/infer_pack/__pycache__/transforms.cpython-310.pyc
--------------------------------------------------------------------------------
/rvc/infer/lib/infer_pack/commons.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional
2 | import math
3 |
4 | import numpy as np
5 | import torch
6 | from torch import nn
7 | from torch.nn import functional as F
8 |
9 |
10 | def init_weights(m, mean=0.0, std=0.01):
11 | classname = m.__class__.__name__
12 | if classname.find("Conv") != -1:
13 | m.weight.data.normal_(mean, std)
14 |
15 |
16 | def get_padding(kernel_size, dilation=1):
17 | return int((kernel_size * dilation - dilation) / 2)
18 |
19 |
20 | # def convert_pad_shape(pad_shape):
21 | # l = pad_shape[::-1]
22 | # pad_shape = [item for sublist in l for item in sublist]
23 | # return pad_shape
24 |
25 |
26 | def kl_divergence(m_p, logs_p, m_q, logs_q):
27 | """KL(P||Q)"""
28 | kl = (logs_q - logs_p) - 0.5
29 | kl += (
30 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
31 | )
32 | return kl
33 |
34 |
35 | def rand_gumbel(shape):
36 | """Sample from the Gumbel distribution, protect from overflows."""
37 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
38 | return -torch.log(-torch.log(uniform_samples))
39 |
40 |
41 | def rand_gumbel_like(x):
42 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
43 | return g
44 |
45 |
46 | def slice_segments(x, ids_str, segment_size=4):
47 | ret = torch.zeros_like(x[:, :, :segment_size])
48 | for i in range(x.size(0)):
49 | idx_str = ids_str[i]
50 | idx_end = idx_str + segment_size
51 | ret[i] = x[i, :, idx_str:idx_end]
52 | return ret
53 |
54 |
55 | def slice_segments2(x, ids_str, segment_size=4):
56 | ret = torch.zeros_like(x[:, :segment_size])
57 | for i in range(x.size(0)):
58 | idx_str = ids_str[i]
59 | idx_end = idx_str + segment_size
60 | ret[i] = x[i, idx_str:idx_end]
61 | return ret
62 |
63 |
64 | def rand_slice_segments(x, x_lengths=None, segment_size=4):
65 | b, d, t = x.size()
66 | if x_lengths is None:
67 | x_lengths = t
68 | ids_str_max = x_lengths - segment_size + 1
69 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
70 | ret = slice_segments(x, ids_str, segment_size)
71 | return ret, ids_str
72 |
73 |
74 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
75 | position = torch.arange(length, dtype=torch.float)
76 | num_timescales = channels // 2
77 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
78 | num_timescales - 1
79 | )
80 | inv_timescales = min_timescale * torch.exp(
81 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
82 | )
83 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
84 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
85 | signal = F.pad(signal, [0, 0, 0, channels % 2])
86 | signal = signal.view(1, channels, length)
87 | return signal
88 |
89 |
90 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
91 | b, channels, length = x.size()
92 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
93 | return x + signal.to(dtype=x.dtype, device=x.device)
94 |
95 |
96 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
97 | b, channels, length = x.size()
98 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
99 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
100 |
101 |
102 | def subsequent_mask(length):
103 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
104 | return mask
105 |
106 |
107 | @torch.jit.script
108 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
109 | n_channels_int = n_channels[0]
110 | in_act = input_a + input_b
111 | t_act = torch.tanh(in_act[:, :n_channels_int, :])
112 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
113 | acts = t_act * s_act
114 | return acts
115 |
116 |
117 | # def convert_pad_shape(pad_shape):
118 | # l = pad_shape[::-1]
119 | # pad_shape = [item for sublist in l for item in sublist]
120 | # return pad_shape
121 |
122 |
123 | def convert_pad_shape(pad_shape: List[List[int]]) -> List[int]:
124 | return torch.tensor(pad_shape).flip(0).reshape(-1).int().tolist()
125 |
126 |
127 | def shift_1d(x):
128 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
129 | return x
130 |
131 |
132 | def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None):
133 | if max_length is None:
134 | max_length = length.max()
135 | x = torch.arange(max_length, dtype=length.dtype, device=length.device)
136 | return x.unsqueeze(0) < length.unsqueeze(1)
137 |
138 |
139 | def generate_path(duration, mask):
140 | """
141 | duration: [b, 1, t_x]
142 | mask: [b, 1, t_y, t_x]
143 | """
144 | device = duration.device
145 |
146 | b, _, t_y, t_x = mask.shape
147 | cum_duration = torch.cumsum(duration, -1)
148 |
149 | cum_duration_flat = cum_duration.view(b * t_x)
150 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
151 | path = path.view(b, t_x, t_y)
152 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
153 | path = path.unsqueeze(1).transpose(2, 3) * mask
154 | return path
155 |
156 |
157 | def clip_grad_value_(parameters, clip_value, norm_type=2):
158 | if isinstance(parameters, torch.Tensor):
159 | parameters = [parameters]
160 | parameters = list(filter(lambda p: p.grad is not None, parameters))
161 | norm_type = float(norm_type)
162 | if clip_value is not None:
163 | clip_value = float(clip_value)
164 |
165 | total_norm = 0
166 | for p in parameters:
167 | param_norm = p.grad.data.norm(norm_type)
168 | total_norm += param_norm.item() ** norm_type
169 | if clip_value is not None:
170 | p.grad.data.clamp_(min=-clip_value, max=clip_value)
171 | total_norm = total_norm ** (1.0 / norm_type)
172 | return total_norm
173 |
--------------------------------------------------------------------------------
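A small sketch of the masking and slicing helpers above:

```python
# Illustrative check of sequence_mask and slice_segments from commons.py.
import torch
from infer.lib.infer_pack import commons

lengths = torch.tensor([2, 4])
print(commons.sequence_mask(lengths))
# tensor([[ True,  True, False, False],
#         [ True,  True,  True,  True]])

x = torch.arange(24.0).view(2, 2, 6)              # [batch, channels, time]
seg = commons.slice_segments(x, torch.tensor([0, 2]), segment_size=3)
print(seg.shape)                                   # torch.Size([2, 2, 3])
```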
/rvc/infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pyworld
3 |
4 | from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
5 |
6 |
7 | class DioF0Predictor(F0Predictor):
8 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
9 | self.hop_length = hop_length
10 | self.f0_min = f0_min
11 | self.f0_max = f0_max
12 | self.sampling_rate = sampling_rate
13 |
14 | def interpolate_f0(self, f0):
15 |         """
16 |         Interpolate over the unvoiced (zero-valued) frames of an F0 contour.
17 |         """
18 |
19 | data = np.reshape(f0, (f0.size, 1))
20 |
21 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
22 | vuv_vector[data > 0.0] = 1.0
23 | vuv_vector[data <= 0.0] = 0.0
24 |
25 | ip_data = data
26 |
27 | frame_number = data.size
28 | last_value = 0.0
29 | for i in range(frame_number):
30 | if data[i] <= 0.0:
31 | j = i + 1
32 | for j in range(i + 1, frame_number):
33 | if data[j] > 0.0:
34 | break
35 | if j < frame_number - 1:
36 | if last_value > 0.0:
37 | step = (data[j] - data[i - 1]) / float(j - i)
38 | for k in range(i, j):
39 | ip_data[k] = data[i - 1] + step * (k - i + 1)
40 | else:
41 | for k in range(i, j):
42 | ip_data[k] = data[j]
43 | else:
44 | for k in range(i, frame_number):
45 | ip_data[k] = last_value
46 | else:
47 |                 ip_data[i] = data[i]  # note: possibly an unnecessary copy, since ip_data aliases data
48 | last_value = data[i]
49 |
50 | return ip_data[:, 0], vuv_vector[:, 0]
51 |
52 | def resize_f0(self, x, target_len):
53 | source = np.array(x)
54 | source[source < 0.001] = np.nan
55 | target = np.interp(
56 | np.arange(0, len(source) * target_len, len(source)) / target_len,
57 | np.arange(0, len(source)),
58 | source,
59 | )
60 | res = np.nan_to_num(target)
61 | return res
62 |
63 | def compute_f0(self, wav, p_len=None):
64 | if p_len is None:
65 | p_len = wav.shape[0] // self.hop_length
66 | f0, t = pyworld.dio(
67 | wav.astype(np.double),
68 | fs=self.sampling_rate,
69 | f0_floor=self.f0_min,
70 | f0_ceil=self.f0_max,
71 | frame_period=1000 * self.hop_length / self.sampling_rate,
72 | )
73 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
74 | for index, pitch in enumerate(f0):
75 | f0[index] = round(pitch, 1)
76 | return self.interpolate_f0(self.resize_f0(f0, p_len))[0]
77 |
78 | def compute_f0_uv(self, wav, p_len=None):
79 | if p_len is None:
80 | p_len = wav.shape[0] // self.hop_length
81 | f0, t = pyworld.dio(
82 | wav.astype(np.double),
83 | fs=self.sampling_rate,
84 | f0_floor=self.f0_min,
85 | f0_ceil=self.f0_max,
86 | frame_period=1000 * self.hop_length / self.sampling_rate,
87 | )
88 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
89 | for index, pitch in enumerate(f0):
90 | f0[index] = round(pitch, 1)
91 | return self.interpolate_f0(self.resize_f0(f0, p_len))
92 |
--------------------------------------------------------------------------------
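A quick sanity-check sketch for the predictor above (synthetic input; exact frame values depend on pyworld):

```python
# Illustrative sanity check: DioF0Predictor on a synthetic 220 Hz tone.
import numpy as np
from infer.lib.infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor

sr = 16000
t = np.arange(sr) / sr                            # one second of audio
wav = 0.5 * np.sin(2 * np.pi * 220.0 * t)

predictor = DioF0Predictor(hop_length=160, sampling_rate=sr)
f0 = predictor.compute_f0(wav)                    # ~100 frames (10 ms hop)
print(f0.shape, float(np.median(f0[f0 > 0])))     # median should sit near 220
```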
/rvc/infer/lib/infer_pack/modules/F0Predictor/F0Predictor.py:
--------------------------------------------------------------------------------
1 | class F0Predictor(object):
2 | def compute_f0(self, wav, p_len):
3 | """
4 | input: wav:[signal_length]
5 | p_len:int
6 | output: f0:[signal_length//hop_length]
7 | """
8 | pass
9 |
10 | def compute_f0_uv(self, wav, p_len):
11 | """
12 | input: wav:[signal_length]
13 | p_len:int
14 | output: f0:[signal_length//hop_length],uv:[signal_length//hop_length]
15 | """
16 | pass
17 |
--------------------------------------------------------------------------------
/rvc/infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pyworld
3 |
4 | from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
5 |
6 |
7 | class HarvestF0Predictor(F0Predictor):
8 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
9 | self.hop_length = hop_length
10 | self.f0_min = f0_min
11 | self.f0_max = f0_max
12 | self.sampling_rate = sampling_rate
13 |
14 | def interpolate_f0(self, f0):
15 |         """
16 |         Interpolate over the unvoiced (zero-valued) frames of an F0 contour.
17 |         """
18 |
19 | data = np.reshape(f0, (f0.size, 1))
20 |
21 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
22 | vuv_vector[data > 0.0] = 1.0
23 | vuv_vector[data <= 0.0] = 0.0
24 |
25 | ip_data = data
26 |
27 | frame_number = data.size
28 | last_value = 0.0
29 | for i in range(frame_number):
30 | if data[i] <= 0.0:
31 | j = i + 1
32 | for j in range(i + 1, frame_number):
33 | if data[j] > 0.0:
34 | break
35 | if j < frame_number - 1:
36 | if last_value > 0.0:
37 | step = (data[j] - data[i - 1]) / float(j - i)
38 | for k in range(i, j):
39 | ip_data[k] = data[i - 1] + step * (k - i + 1)
40 | else:
41 | for k in range(i, j):
42 | ip_data[k] = data[j]
43 | else:
44 | for k in range(i, frame_number):
45 | ip_data[k] = last_value
46 | else:
47 |                 ip_data[i] = data[i]  # note: possibly an unnecessary copy, since ip_data aliases data
48 | last_value = data[i]
49 |
50 | return ip_data[:, 0], vuv_vector[:, 0]
51 |
52 | def resize_f0(self, x, target_len):
53 | source = np.array(x)
54 | source[source < 0.001] = np.nan
55 | target = np.interp(
56 | np.arange(0, len(source) * target_len, len(source)) / target_len,
57 | np.arange(0, len(source)),
58 | source,
59 | )
60 | res = np.nan_to_num(target)
61 | return res
62 |
63 | def compute_f0(self, wav, p_len=None):
64 | if p_len is None:
65 | p_len = wav.shape[0] // self.hop_length
66 | f0, t = pyworld.harvest(
67 | wav.astype(np.double),
68 | fs=self.sampling_rate,
69 | f0_ceil=self.f0_max,
70 | f0_floor=self.f0_min,
71 | frame_period=1000 * self.hop_length / self.sampling_rate,
72 | )
73 |         f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
74 | return self.interpolate_f0(self.resize_f0(f0, p_len))[0]
75 |
76 | def compute_f0_uv(self, wav, p_len=None):
77 | if p_len is None:
78 | p_len = wav.shape[0] // self.hop_length
79 | f0, t = pyworld.harvest(
80 | wav.astype(np.double),
81 | fs=self.sampling_rate,
82 | f0_floor=self.f0_min,
83 | f0_ceil=self.f0_max,
84 | frame_period=1000 * self.hop_length / self.sampling_rate,
85 | )
86 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
87 | return self.interpolate_f0(self.resize_f0(f0, p_len))
88 |
--------------------------------------------------------------------------------
/rvc/infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import parselmouth
3 |
4 | from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
5 |
6 |
7 | class PMF0Predictor(F0Predictor):
8 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
9 | self.hop_length = hop_length
10 | self.f0_min = f0_min
11 | self.f0_max = f0_max
12 | self.sampling_rate = sampling_rate
13 |
14 | def interpolate_f0(self, f0):
15 |         """
16 |         Interpolate over the unvoiced (zero-valued) frames of an F0 contour.
17 |         """
18 |
19 | data = np.reshape(f0, (f0.size, 1))
20 |
21 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
22 | vuv_vector[data > 0.0] = 1.0
23 | vuv_vector[data <= 0.0] = 0.0
24 |
25 | ip_data = data
26 |
27 | frame_number = data.size
28 | last_value = 0.0
29 | for i in range(frame_number):
30 | if data[i] <= 0.0:
31 | j = i + 1
32 | for j in range(i + 1, frame_number):
33 | if data[j] > 0.0:
34 | break
35 | if j < frame_number - 1:
36 | if last_value > 0.0:
37 | step = (data[j] - data[i - 1]) / float(j - i)
38 | for k in range(i, j):
39 | ip_data[k] = data[i - 1] + step * (k - i + 1)
40 | else:
41 | for k in range(i, j):
42 | ip_data[k] = data[j]
43 | else:
44 | for k in range(i, frame_number):
45 | ip_data[k] = last_value
46 | else:
47 |                 ip_data[i] = data[i]  # note: possibly an unnecessary copy, since ip_data aliases data
48 | last_value = data[i]
49 |
50 | return ip_data[:, 0], vuv_vector[:, 0]
51 |
52 | def compute_f0(self, wav, p_len=None):
53 | x = wav
54 | if p_len is None:
55 | p_len = x.shape[0] // self.hop_length
56 | else:
57 | assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error"
58 | time_step = self.hop_length / self.sampling_rate * 1000
59 | f0 = (
60 | parselmouth.Sound(x, self.sampling_rate)
61 | .to_pitch_ac(
62 | time_step=time_step / 1000,
63 | voicing_threshold=0.6,
64 | pitch_floor=self.f0_min,
65 | pitch_ceiling=self.f0_max,
66 | )
67 | .selected_array["frequency"]
68 | )
69 |
70 | pad_size = (p_len - len(f0) + 1) // 2
71 | if pad_size > 0 or p_len - len(f0) - pad_size > 0:
72 | f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
73 | f0, uv = self.interpolate_f0(f0)
74 | return f0
75 |
76 | def compute_f0_uv(self, wav, p_len=None):
77 | x = wav
78 | if p_len is None:
79 | p_len = x.shape[0] // self.hop_length
80 | else:
81 | assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error"
82 | time_step = self.hop_length / self.sampling_rate * 1000
83 | f0 = (
84 | parselmouth.Sound(x, self.sampling_rate)
85 | .to_pitch_ac(
86 | time_step=time_step / 1000,
87 | voicing_threshold=0.6,
88 | pitch_floor=self.f0_min,
89 | pitch_ceiling=self.f0_max,
90 | )
91 | .selected_array["frequency"]
92 | )
93 |
94 | pad_size = (p_len - len(f0) + 1) // 2
95 | if pad_size > 0 or p_len - len(f0) - pad_size > 0:
96 | f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
97 | f0, uv = self.interpolate_f0(f0)
98 | return f0, uv
99 |
--------------------------------------------------------------------------------
/rvc/infer/lib/infer_pack/modules/F0Predictor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/infer_pack/modules/F0Predictor/__init__.py
--------------------------------------------------------------------------------
/rvc/infer/lib/infer_pack/onnx_inference.py:
--------------------------------------------------------------------------------
1 | import librosa
2 | import numpy as np
3 | import onnxruntime
4 | import soundfile
5 |
6 | import logging
7 |
8 | logger = logging.getLogger(__name__)
9 |
10 |
11 | class ContentVec:
12 | def __init__(self, vec_path="pretrained/vec-768-layer-12.onnx", device=None):
13 | logger.info("Load model(s) from {}".format(vec_path))
14 | if device == "cpu" or device is None:
15 | providers = ["CPUExecutionProvider"]
16 | elif device == "cuda":
17 | providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
18 | elif device == "dml":
19 | providers = ["DmlExecutionProvider"]
20 | else:
21 |             raise RuntimeError("Unsupported device")
22 | self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
23 |
24 | def __call__(self, wav):
25 | return self.forward(wav)
26 |
27 | def forward(self, wav):
28 | feats = wav
29 | if feats.ndim == 2: # double channels
30 | feats = feats.mean(-1)
31 | assert feats.ndim == 1, feats.ndim
32 | feats = np.expand_dims(np.expand_dims(feats, 0), 0)
33 | onnx_input = {self.model.get_inputs()[0].name: feats}
34 | logits = self.model.run(None, onnx_input)[0]
35 | return logits.transpose(0, 2, 1)
36 |
37 |
38 | def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs):
39 | if f0_predictor == "pm":
40 | from lib.infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor
41 |
42 | f0_predictor_object = PMF0Predictor(
43 | hop_length=hop_length, sampling_rate=sampling_rate
44 | )
45 | elif f0_predictor == "harvest":
46 | from lib.infer_pack.modules.F0Predictor.HarvestF0Predictor import (
47 | HarvestF0Predictor,
48 | )
49 |
50 | f0_predictor_object = HarvestF0Predictor(
51 | hop_length=hop_length, sampling_rate=sampling_rate
52 | )
53 | elif f0_predictor == "dio":
54 | from lib.infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor
55 |
56 | f0_predictor_object = DioF0Predictor(
57 | hop_length=hop_length, sampling_rate=sampling_rate
58 | )
59 | else:
60 | raise ValueError(f"Unknown f0 predictor: {f0_predictor}")
61 | return f0_predictor_object
62 |
63 |
64 | class OnnxRVC:
65 | def __init__(
66 | self,
67 | model_path,
68 | sr=40000,
69 | hop_size=512,
70 | vec_path="vec-768-layer-12",
71 | device="cpu",
72 | ):
73 | vec_path = f"pretrained/{vec_path}.onnx"
74 | self.vec_model = ContentVec(vec_path, device)
75 | if device == "cpu" or device is None:
76 | providers = ["CPUExecutionProvider"]
77 | elif device == "cuda":
78 | providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
79 | elif device == "dml":
80 | providers = ["DmlExecutionProvider"]
81 | else:
82 | raise RuntimeError("Unsupported device")
83 | self.model = onnxruntime.InferenceSession(model_path, providers=providers)
84 | self.sampling_rate = sr
85 | self.hop_size = hop_size
86 |
87 | def forward(self, hubert, hubert_length, pitch, pitchf, ds, rnd):
88 | onnx_input = {
89 | self.model.get_inputs()[0].name: hubert,
90 | self.model.get_inputs()[1].name: hubert_length,
91 | self.model.get_inputs()[2].name: pitch,
92 | self.model.get_inputs()[3].name: pitchf,
93 | self.model.get_inputs()[4].name: ds,
94 | self.model.get_inputs()[5].name: rnd,
95 | }
96 | return (self.model.run(None, onnx_input)[0] * 32767).astype(np.int16)
97 |
98 | def inference(
99 | self,
100 | raw_path,
101 | sid,
102 | f0_method="dio",
103 | f0_up_key=0,
104 | pad_time=0.5,
105 | cr_threshold=0.02,
106 | ):
107 | f0_min = 50
108 | f0_max = 1100
109 | f0_mel_min = 1127 * np.log(1 + f0_min / 700)
110 | f0_mel_max = 1127 * np.log(1 + f0_max / 700)
111 | f0_predictor = get_f0_predictor(
112 | f0_method,
113 | hop_length=self.hop_size,
114 | sampling_rate=self.sampling_rate,
115 | threshold=cr_threshold,
116 | )
117 | wav, sr = librosa.load(raw_path, sr=self.sampling_rate)
118 | org_length = len(wav)
119 | if org_length / sr > 50.0:
120 | raise RuntimeError("Input longer than 50 s is not supported")
121 |
122 | wav16k = librosa.resample(wav, orig_sr=self.sampling_rate, target_sr=16000)
123 | # 16 kHz mono waveform for the ContentVec encoder below
124 |
125 | hubert = self.vec_model(wav16k)
126 | hubert = np.repeat(hubert, 2, axis=2).transpose(0, 2, 1).astype(np.float32)
127 | hubert_length = hubert.shape[1]
128 |
129 | pitchf = f0_predictor.compute_f0(wav, hubert_length)
130 | pitchf = pitchf * 2 ** (f0_up_key / 12)
131 | pitch = pitchf.copy()
132 | f0_mel = 1127 * np.log(1 + pitch / 700)
133 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
134 | f0_mel_max - f0_mel_min
135 | ) + 1
136 | f0_mel[f0_mel <= 1] = 1
137 | f0_mel[f0_mel > 255] = 255
138 | pitch = np.rint(f0_mel).astype(np.int64)
139 |
140 | pitchf = pitchf.reshape(1, len(pitchf)).astype(np.float32)
141 | pitch = pitch.reshape(1, len(pitch))
142 | ds = np.array([sid]).astype(np.int64)
143 |
144 | rnd = np.random.randn(1, 192, hubert_length).astype(np.float32)
145 | hubert_length = np.array([hubert_length]).astype(np.int64)
146 |
147 | out_wav = self.forward(hubert, hubert_length, pitch, pitchf, ds, rnd).squeeze()
148 | out_wav = np.pad(out_wav, (0, 2 * self.hop_size), "constant")
149 | return out_wav[0:org_length]
150 |
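A hedged usage sketch of OnnxRVC; the file names are hypothetical placeholders, and a real RVC ONNX export plus the matching ContentVec model under pretrained/ are required for this to actually run:

import soundfile

rvc = OnnxRVC("rvc_model.onnx", sr=40000, hop_size=512,
              vec_path="vec-768-layer-12", device="cpu")
out_wav = rvc.inference("input.wav", sid=0, f0_method="dio", f0_up_key=0)
soundfile.write("output.wav", out_wav, 40000)  # int16 samples at the model rate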
--------------------------------------------------------------------------------
/rvc/infer/lib/jit/__init__.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO
2 | import pickle
3 | import time
4 | import torch
5 | from tqdm import tqdm
6 | from collections import OrderedDict
7 |
8 |
9 | def load_inputs(path, device, is_half=False):
10 | parm = torch.load(path, map_location=torch.device("cpu"))
11 | for key in parm.keys():
12 | parm[key] = parm[key].to(device)
13 | if is_half and parm[key].dtype == torch.float32:
14 | parm[key] = parm[key].half()
15 | elif not is_half and parm[key].dtype == torch.float16:
16 | parm[key] = parm[key].float()
17 | return parm
18 |
19 |
20 | def benchmark(
21 | model, inputs_path, device=torch.device("cpu"), epoch=1000, is_half=False
22 | ):
23 | parm = load_inputs(inputs_path, device, is_half)
24 | total_ts = 0.0
25 | bar = tqdm(range(epoch))
26 | for i in bar:
27 | start_time = time.perf_counter()
28 | o = model(**parm)
29 | total_ts += time.perf_counter() - start_time
30 | print(f"num_epoch: {epoch} | avg time(ms): {(total_ts*1000)/epoch}")
31 |
32 |
33 | def jit_warm_up(model, inputs_path, device=torch.device("cpu"), epoch=5, is_half=False):
34 | benchmark(model, inputs_path, device, epoch=epoch, is_half=is_half)
35 |
36 |
37 | def to_jit_model(
38 | model_path,
39 | model_type: str,
40 | mode: str = "trace",
41 | inputs_path: str = None,
42 | device=torch.device("cpu"),
43 | is_half=False,
44 | ):
45 | model = None
46 | if model_type.lower() == "synthesizer":
47 | from .get_synthesizer import get_synthesizer
48 |
49 | model, _ = get_synthesizer(model_path, device)
50 | model.forward = model.infer
51 | elif model_type.lower() == "rmvpe":
52 | from .get_rmvpe import get_rmvpe
53 |
54 | model = get_rmvpe(model_path, device)
55 | elif model_type.lower() == "hubert":
56 | from .get_hubert import get_hubert_model
57 |
58 | model = get_hubert_model(model_path, device)
59 | model.forward = model.infer
60 | else:
61 | raise ValueError(f"No model type named {model_type}")
62 | model = model.eval()
63 | model = model.half() if is_half else model.float()
64 | if mode == "trace":
65 | assert inputs_path is not None  # tracing needs recorded example inputs
66 | inputs = load_inputs(inputs_path, device, is_half)
67 | model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs)
68 | elif mode == "script":
69 | model_jit = torch.jit.script(model)
70 | model_jit.to(device)
71 | model_jit = model_jit.half() if is_half else model_jit.float()
72 | # model = model.half() if is_half else model.float()
73 | return (model, model_jit)
74 |
75 |
76 | def export(
77 | model: torch.nn.Module,
78 | mode: str = "trace",
79 | inputs: dict = None,
80 | device=torch.device("cpu"),
81 | is_half: bool = False,
82 | ) -> dict:
83 | model = model.half() if is_half else model.float()
84 | model.eval()
85 | if mode == "trace":
86 | assert inputs is not None
87 | model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs)
88 | elif mode == "script":
89 | model_jit = torch.jit.script(model)
90 | model_jit.to(device)
91 | model_jit = model_jit.half() if is_half else model_jit.float()
92 | buffer = BytesIO()
93 | # model_jit=model_jit.cpu()
94 | torch.jit.save(model_jit, buffer)
95 | del model_jit
96 | cpt = OrderedDict()
97 | cpt["model"] = buffer.getvalue()
98 | cpt["is_half"] = is_half
99 | return cpt
100 |
101 |
102 | def load(path: str):
103 | with open(path, "rb") as f:
104 | return pickle.load(f)
105 |
106 |
107 | def save(ckpt: dict, save_path: str):
108 | with open(save_path, "wb") as f:
109 | pickle.dump(ckpt, f)
110 |
111 |
112 | def rmvpe_jit_export(
113 | model_path: str,
114 | mode: str = "script",
115 | inputs_path: str = None,
116 | save_path: str = None,
117 | device=torch.device("cpu"),
118 | is_half=False,
119 | ):
120 | if not save_path:
121 | save_path = model_path.rsplit(".", 1)[0]  # strip the .pt/.pth extension
122 | save_path += ".half.jit" if is_half else ".jit"
123 | if "cuda" in str(device) and ":" not in str(device):
124 | device = torch.device("cuda:0")
125 | from .get_rmvpe import get_rmvpe
126 |
127 | model = get_rmvpe(model_path, device)
128 | inputs = None
129 | if mode == "trace":
130 | inputs = load_inputs(inputs_path, device, is_half)
131 | ckpt = export(model, mode, inputs, device, is_half)
132 | ckpt["device"] = str(device)
133 | save(ckpt, save_path)
134 | return ckpt
135 |
136 |
137 | def synthesizer_jit_export(
138 | model_path: str,
139 | mode: str = "script",
140 | inputs_path: str = None,
141 | save_path: str = None,
142 | device=torch.device("cpu"),
143 | is_half=False,
144 | ):
145 | if not save_path:
146 | save_path = model_path.rsplit(".", 1)[0]  # strip the .pth extension
147 | save_path += ".half.jit" if is_half else ".jit"
148 | if "cuda" in str(device) and ":" not in str(device):
149 | device = torch.device("cuda:0")
150 | from .get_synthesizer import get_synthesizer
151 |
152 | model, cpt = get_synthesizer(model_path, device)
153 | assert isinstance(cpt, dict)
154 | model.forward = model.infer
155 | inputs = None
156 | if mode == "trace":
157 | inputs = load_inputs(inputs_path, device, is_half)
158 | ckpt = export(model, mode, inputs, device, is_half)
159 | cpt.pop("weight")
160 | cpt["model"] = ckpt["model"]
161 | cpt["device"] = device
162 | save(cpt, save_path)
163 | return cpt
164 |
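The checkpoints written by save() above are plain pickles whose cpt["model"] entry holds TorchScript bytes; a round-trip sketch using this module's own load() (the .jit path is hypothetical):

import torch
from io import BytesIO

ckpt = load("assets/rmvpe/rmvpe.jit")            # dict produced by rmvpe_jit_export
model = torch.jit.load(BytesIO(ckpt["model"]))   # rebuild the ScriptModule
model = model.half() if ckpt["is_half"] else model.float()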
--------------------------------------------------------------------------------
/rvc/infer/lib/jit/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/jit/__pycache__/__init__.cpython-310.pyc
--------------------------------------------------------------------------------
/rvc/infer/lib/jit/get_rmvpe.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def get_rmvpe(model_path="assets/rmvpe/rmvpe.pt", device=torch.device("cpu")):
5 | from infer.lib.rmvpe import E2E
6 |
7 | model = E2E(4, 1, (2, 2))
8 | ckpt = torch.load(model_path, map_location=device)
9 | model.load_state_dict(ckpt)
10 | model.eval()
11 | model = model.to(device)
12 | return model
13 |
--------------------------------------------------------------------------------
/rvc/infer/lib/jit/get_synthesizer.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def get_synthesizer(pth_path, device=torch.device("cpu")):
5 | from infer.lib.infer_pack.models import (
6 | SynthesizerTrnMs256NSFsid,
7 | SynthesizerTrnMs256NSFsid_nono,
8 | SynthesizerTrnMs768NSFsid,
9 | SynthesizerTrnMs768NSFsid_nono,
10 | )
11 |
12 | cpt = torch.load(pth_path, map_location=torch.device("cpu"))
13 | # tgt_sr = cpt["config"][-1]
14 | cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
15 | if_f0 = cpt.get("f0", 1)
16 | version = cpt.get("version", "v1")
17 | if version == "v1":
18 | if if_f0 == 1:
19 | net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=False)
20 | else:
21 | net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
22 | elif version == "v2":
23 | if if_f0 == 1:
24 | net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=False)
25 | else:
26 | net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
27 | del net_g.enc_q
28 | # net_g.forward = net_g.infer
29 | # ckpt = {}
30 | # ckpt["config"] = cpt["config"]
31 | # ckpt["f0"] = if_f0
32 | # ckpt["version"] = version
33 | # ckpt["info"] = cpt.get("info", "0epoch")
34 | net_g.load_state_dict(cpt["weight"], strict=False)
35 | net_g = net_g.float()
36 | net_g.eval().to(device)
37 | net_g.remove_weight_norm()
38 | return net_g, cpt
39 |
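A short usage sketch (the .pth path is hypothetical); note the returned cpt still carries "config", "f0" and "version", which synthesizer_jit_export above relies on after popping the raw weights:

import torch

net_g, cpt = get_synthesizer("weights/my_voice.pth", torch.device("cpu"))
print(cpt.get("version", "v1"), cpt.get("f0", 1))  # v1/v2 and f0-conditioning flags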
--------------------------------------------------------------------------------
/rvc/infer/lib/train/__pycache__/data_utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/train/__pycache__/data_utils.cpython-310.pyc
--------------------------------------------------------------------------------
/rvc/infer/lib/train/__pycache__/losses.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/train/__pycache__/losses.cpython-310.pyc
--------------------------------------------------------------------------------
/rvc/infer/lib/train/__pycache__/mel_processing.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/train/__pycache__/mel_processing.cpython-310.pyc
--------------------------------------------------------------------------------
/rvc/infer/lib/train/__pycache__/process_ckpt.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/train/__pycache__/process_ckpt.cpython-310.pyc
--------------------------------------------------------------------------------
/rvc/infer/lib/train/__pycache__/utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/train/__pycache__/utils.cpython-310.pyc
--------------------------------------------------------------------------------
/rvc/infer/lib/train/losses.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def feature_loss(fmap_r, fmap_g):
5 | loss = 0
6 | for dr, dg in zip(fmap_r, fmap_g):
7 | for rl, gl in zip(dr, dg):
8 | rl = rl.float().detach()
9 | gl = gl.float()
10 | loss += torch.mean(torch.abs(rl - gl))
11 |
12 | return loss * 2
13 |
14 |
15 | def discriminator_loss(disc_real_outputs, disc_generated_outputs):
16 | loss = 0
17 | r_losses = []
18 | g_losses = []
19 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
20 | dr = dr.float()
21 | dg = dg.float()
22 | r_loss = torch.mean((1 - dr) ** 2)
23 | g_loss = torch.mean(dg**2)
24 | loss += r_loss + g_loss
25 | r_losses.append(r_loss.item())
26 | g_losses.append(g_loss.item())
27 |
28 | return loss, r_losses, g_losses
29 |
30 |
31 | def generator_loss(disc_outputs):
32 | loss = 0
33 | gen_losses = []
34 | for dg in disc_outputs:
35 | dg = dg.float()
36 | l = torch.mean((1 - dg) ** 2)
37 | gen_losses.append(l)
38 | loss += l
39 |
40 | return loss, gen_losses
41 |
42 |
43 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
44 | """
45 | z_p, logs_q: [b, h, t_t]
46 | m_p, logs_p: [b, h, t_t]
47 | """
48 | z_p = z_p.float()
49 | logs_q = logs_q.float()
50 | m_p = m_p.float()
51 | logs_p = logs_p.float()
52 | z_mask = z_mask.float()
53 |
54 | kl = logs_p - logs_q - 0.5
55 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
56 | kl = torch.sum(kl * z_mask)
57 | l = kl / torch.sum(z_mask)
58 | return l
59 |
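For reference: discriminator_loss and generator_loss above are the least-squares GAN objectives, and kl_loss is the masked mean of the closed-form per-element KL term between the posterior sample statistics and the prior Gaussian (the logs_* tensors hold log standard deviations):

kl = \log\sigma_p - \log\sigma_q - \tfrac{1}{2}
     + \frac{(z_p - m_p)^2}{2\sigma_p^{2}},
\qquad
L_{kl} = \frac{\sum kl \cdot z_{mask}}{\sum z_{mask}}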
--------------------------------------------------------------------------------
/rvc/infer/lib/train/mel_processing.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.utils.data
3 | from librosa.filters import mel as librosa_mel_fn
4 | import logging
5 |
6 | logger = logging.getLogger(__name__)
7 |
8 | MAX_WAV_VALUE = 32768.0
9 |
10 |
11 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
12 | """
13 | PARAMS
14 | ------
15 | C: compression factor
16 | """
17 | return torch.log(torch.clamp(x, min=clip_val) * C)
18 |
19 |
20 | def dynamic_range_decompression_torch(x, C=1):
21 | """
22 | PARAMS
23 | ------
24 | C: compression factor used to compress
25 | """
26 | return torch.exp(x) / C
27 |
28 |
29 | def spectral_normalize_torch(magnitudes):
30 | return dynamic_range_compression_torch(magnitudes)
31 |
32 |
33 | def spectral_de_normalize_torch(magnitudes):
34 | return dynamic_range_decompression_torch(magnitudes)
35 |
36 |
37 | # Reusable banks
38 | mel_basis = {}
39 | hann_window = {}
40 |
41 |
42 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
43 | """Convert waveform into Linear-frequency Linear-amplitude spectrogram.
44 |
45 | Args:
46 | y :: (B, T) - Audio waveforms
47 | n_fft
48 | sampling_rate
49 | hop_size
50 | win_size
51 | center
52 | Returns:
53 | :: (B, Freq, Frame) - Linear-frequency Linear-amplitude spectrogram
54 | """
55 |
56 | # Window - Cache if needed
57 | global hann_window
58 | dtype_device = str(y.dtype) + "_" + str(y.device)
59 | wnsize_dtype_device = str(win_size) + "_" + dtype_device
60 | if wnsize_dtype_device not in hann_window:
61 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
62 | dtype=y.dtype, device=y.device
63 | )
64 |
65 | # Padding
66 | y = torch.nn.functional.pad(
67 | y.unsqueeze(1),
68 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
69 | mode="reflect",
70 | )
71 | y = y.squeeze(1)
72 |
73 | # Complex Spectrogram :: (B, T) -> (B, Freq, Frame, RealComplex=2)
74 | spec = torch.stft(
75 | y,
76 | n_fft,
77 | hop_length=hop_size,
78 | win_length=win_size,
79 | window=hann_window[wnsize_dtype_device],
80 | center=center,
81 | pad_mode="reflect",
82 | normalized=False,
83 | onesided=True,
84 | return_complex=True,
85 | )
86 |
87 | # Linear-frequency Linear-amplitude spectrogram :: (B, Freq, Frame, RealComplex=2) -> (B, Freq, Frame)
88 | spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6)
89 | return spec
90 |
91 |
92 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
93 | # MelBasis - Cache if needed
94 | global mel_basis
95 | dtype_device = str(spec.dtype) + "_" + str(spec.device)
96 | fmax_dtype_device = str(fmax) + "_" + dtype_device
97 | if fmax_dtype_device not in mel_basis:
98 | mel = librosa_mel_fn(
99 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
100 | )
101 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
102 | dtype=spec.dtype, device=spec.device
103 | )
104 |
105 | # Mel-frequency Log-amplitude spectrogram :: (B, Freq=num_mels, Frame)
106 | melspec = torch.matmul(mel_basis[fmax_dtype_device], spec)
107 | melspec = spectral_normalize_torch(melspec)
108 | return melspec
109 |
110 |
111 | def mel_spectrogram_torch(
112 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
113 | ):
114 | """Convert waveform into Mel-frequency Log-amplitude spectrogram.
115 |
116 | Args:
117 | y :: (B, T) - Waveforms
118 | Returns:
119 | melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram
120 | """
121 | # Linear-frequency Linear-amplitude spectrogram :: (B, T) -> (B, Freq, Frame)
122 | spec = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center)
123 |
124 | # Mel-frequency Log-amplitude spectrogram :: (B, Freq, Frame) -> (B, Freq=num_mels, Frame)
125 | melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax)
126 |
127 | return melspec
128 |
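A minimal sketch of calling the helper above; the 40 kHz / 2048-FFT / 400-hop / 128-mel settings are assumptions for illustration, not values read from the repo's configs:

import torch

y = torch.randn(1, 40000)  # (B, T): one second of audio at 40 kHz
mel = mel_spectrogram_torch(
    y, n_fft=2048, num_mels=128, sampling_rate=40000,
    hop_size=400, win_size=2048, fmin=0, fmax=None,
)
print(mel.shape)  # torch.Size([1, 128, 100])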
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/dataset.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 |
4 | import numpy as np
5 | import torch
6 | import torch.utils.data
7 | from tqdm import tqdm
8 |
9 | from . import spec_utils
10 |
11 |
12 | class VocalRemoverValidationSet(torch.utils.data.Dataset):
13 | def __init__(self, patch_list):
14 | self.patch_list = patch_list
15 |
16 | def __len__(self):
17 | return len(self.patch_list)
18 |
19 | def __getitem__(self, idx):
20 | path = self.patch_list[idx]
21 | data = np.load(path)
22 |
23 | X, y = data["X"], data["y"]
24 |
25 | X_mag = np.abs(X)
26 | y_mag = np.abs(y)
27 |
28 | return X_mag, y_mag
29 |
30 |
31 | def make_pair(mix_dir, inst_dir):
32 | input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"]
33 |
34 | X_list = sorted(
35 | [
36 | os.path.join(mix_dir, fname)
37 | for fname in os.listdir(mix_dir)
38 | if os.path.splitext(fname)[1] in input_exts
39 | ]
40 | )
41 | y_list = sorted(
42 | [
43 | os.path.join(inst_dir, fname)
44 | for fname in os.listdir(inst_dir)
45 | if os.path.splitext(fname)[1] in input_exts
46 | ]
47 | )
48 |
49 | filelist = list(zip(X_list, y_list))
50 |
51 | return filelist
52 |
53 |
54 | def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
55 | if split_mode == "random":
56 | filelist = make_pair(
57 | os.path.join(dataset_dir, "mixtures"),
58 | os.path.join(dataset_dir, "instruments"),
59 | )
60 |
61 | random.shuffle(filelist)
62 |
63 | if len(val_filelist) == 0:
64 | val_size = int(len(filelist) * val_rate)
65 | train_filelist = filelist[:-val_size]
66 | val_filelist = filelist[-val_size:]
67 | else:
68 | train_filelist = [
69 | pair for pair in filelist if list(pair) not in val_filelist
70 | ]
71 | elif split_mode == "subdirs":
72 | if len(val_filelist) != 0:
73 | raise ValueError(
74 | "The `val_filelist` option is not available in `subdirs` mode"
75 | )
76 |
77 | train_filelist = make_pair(
78 | os.path.join(dataset_dir, "training/mixtures"),
79 | os.path.join(dataset_dir, "training/instruments"),
80 | )
81 |
82 | val_filelist = make_pair(
83 | os.path.join(dataset_dir, "validation/mixtures"),
84 | os.path.join(dataset_dir, "validation/instruments"),
85 | )
86 |
87 | return train_filelist, val_filelist
88 |
89 |
90 | def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha):
91 | perm = np.random.permutation(len(X))
92 | for i, idx in enumerate(tqdm(perm)):
93 | if np.random.uniform() < reduction_rate:
94 | y[idx] = spec_utils.reduce_vocal_aggressively(
95 | X[idx], y[idx], reduction_mask
96 | )
97 |
98 | if np.random.uniform() < 0.5:
99 | # swap channel
100 | X[idx] = X[idx, ::-1]
101 | y[idx] = y[idx, ::-1]
102 | if np.random.uniform() < 0.02:
103 | # mono
104 | X[idx] = X[idx].mean(axis=0, keepdims=True)
105 | y[idx] = y[idx].mean(axis=0, keepdims=True)
106 | if np.random.uniform() < 0.02:
107 | # inst
108 | X[idx] = y[idx]
109 |
110 | if np.random.uniform() < mixup_rate and i < len(perm) - 1:
111 | lam = np.random.beta(mixup_alpha, mixup_alpha)
112 | X[idx] = lam * X[idx] + (1 - lam) * X[perm[i + 1]]
113 | y[idx] = lam * y[idx] + (1 - lam) * y[perm[i + 1]]
114 |
115 | return X, y
116 |
117 |
118 | def make_padding(width, cropsize, offset):
119 | left = offset
120 | roi_size = cropsize - left * 2
121 | if roi_size == 0:
122 | roi_size = cropsize
123 | right = roi_size - (width % roi_size) + left
124 |
125 | return left, right, roi_size
126 |
127 |
128 | def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset):
129 | len_dataset = patches * len(filelist)
130 |
131 | X_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
132 | y_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
133 |
134 | for i, (X_path, y_path) in enumerate(tqdm(filelist)):
135 | X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
136 | coef = np.max([np.abs(X).max(), np.abs(y).max()])
137 | X, y = X / coef, y / coef
138 |
139 | l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
140 | X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
141 | y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")
142 |
143 | starts = np.random.randint(0, X_pad.shape[2] - cropsize, patches)
144 | ends = starts + cropsize
145 | for j in range(patches):
146 | idx = i * patches + j
147 | X_dataset[idx] = X_pad[:, :, starts[j] : ends[j]]
148 | y_dataset[idx] = y_pad[:, :, starts[j] : ends[j]]
149 |
150 | return X_dataset, y_dataset
151 |
152 |
153 | def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
154 | patch_list = []
155 | patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(
156 | cropsize, sr, hop_length, n_fft, offset
157 | )
158 | os.makedirs(patch_dir, exist_ok=True)
159 |
160 | for i, (X_path, y_path) in enumerate(tqdm(filelist)):
161 | basename = os.path.splitext(os.path.basename(X_path))[0]
162 |
163 | X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
164 | coef = np.max([np.abs(X).max(), np.abs(y).max()])
165 | X, y = X / coef, y / coef
166 |
167 | l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
168 | X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
169 | y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")
170 |
171 | len_dataset = int(np.ceil(X.shape[2] / roi_size))
172 | for j in range(len_dataset):
173 | outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j))
174 | start = j * roi_size
175 | if not os.path.exists(outpath):
176 | np.savez(
177 | outpath,
178 | X=X_pad[:, :, start : start + cropsize],
179 | y=y_pad[:, :, start : start + cropsize],
180 | )
181 | patch_list.append(outpath)
182 |
183 | return VocalRemoverValidationSet(patch_list)
184 |
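A worked check of make_padding above (numbers illustrative): the padded width left + width + right is a whole number of roi_size windows plus the two offset margins:

left, right, roi_size = make_padding(width=1000, cropsize=256, offset=16)
# left = 16, roi_size = 256 - 2*16 = 224, right = 224 - (1000 % 224) + 16 = 136
assert (left + 1000 + right - 2 * 16) % roi_size == 0  # 1120 == 5 * 224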
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/layers.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import spec_utils
6 |
7 |
8 | class Conv2DBNActiv(nn.Module):
9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10 | super(Conv2DBNActiv, self).__init__()
11 | self.conv = nn.Sequential(
12 | nn.Conv2d(
13 | nin,
14 | nout,
15 | kernel_size=ksize,
16 | stride=stride,
17 | padding=pad,
18 | dilation=dilation,
19 | bias=False,
20 | ),
21 | nn.BatchNorm2d(nout),
22 | activ(),
23 | )
24 |
25 | def __call__(self, x):
26 | return self.conv(x)
27 |
28 |
29 | class SeperableConv2DBNActiv(nn.Module):
30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31 | super(SeperableConv2DBNActiv, self).__init__()
32 | self.conv = nn.Sequential(
33 | nn.Conv2d(
34 | nin,
35 | nin,
36 | kernel_size=ksize,
37 | stride=stride,
38 | padding=pad,
39 | dilation=dilation,
40 | groups=nin,
41 | bias=False,
42 | ),
43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44 | nn.BatchNorm2d(nout),
45 | activ(),
46 | )
47 |
48 | def __call__(self, x):
49 | return self.conv(x)
50 |
51 |
52 | class Encoder(nn.Module):
53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54 | super(Encoder, self).__init__()
55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57 |
58 | def __call__(self, x):
59 | skip = self.conv1(x)
60 | h = self.conv2(skip)
61 |
62 | return h, skip
63 |
64 |
65 | class Decoder(nn.Module):
66 | def __init__(
67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
68 | ):
69 | super(Decoder, self).__init__()
70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
71 | self.dropout = nn.Dropout2d(0.1) if dropout else None
72 |
73 | def __call__(self, x, skip=None):
74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
75 | if skip is not None:
76 | skip = spec_utils.crop_center(skip, x)
77 | x = torch.cat([x, skip], dim=1)
78 | h = self.conv(x)
79 |
80 | if self.dropout is not None:
81 | h = self.dropout(h)
82 |
83 | return h
84 |
85 |
86 | class ASPPModule(nn.Module):
87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
88 | super(ASPPModule, self).__init__()
89 | self.conv1 = nn.Sequential(
90 | nn.AdaptiveAvgPool2d((1, None)),
91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
92 | )
93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
94 | self.conv3 = SeperableConv2DBNActiv(
95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
96 | )
97 | self.conv4 = SeperableConv2DBNActiv(
98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
99 | )
100 | self.conv5 = SeperableConv2DBNActiv(
101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
102 | )
103 | self.bottleneck = nn.Sequential(
104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
105 | )
106 |
107 | def forward(self, x):
108 | _, _, h, w = x.size()
109 | feat1 = F.interpolate(
110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
111 | )
112 | feat2 = self.conv2(x)
113 | feat3 = self.conv3(x)
114 | feat4 = self.conv4(x)
115 | feat5 = self.conv5(x)
116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
117 | bottle = self.bottleneck(out)
118 | return bottle
119 |
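A quick shape check for the Encoder/Decoder pair above (sizes arbitrary): the stride-2 encoder halves the map while returning the pre-stride activation as the skip, and the decoder's bilinear upsample restores the resolution before the concat:

import torch

enc = Encoder(2, 32, ksize=3, stride=2, pad=1)
dec = Decoder(32 + 32, 16)
x = torch.randn(1, 2, 64, 128)    # (N, C, freq_bins, frames)
h, skip = enc(x)                  # h: (1, 32, 32, 64), skip: (1, 32, 64, 128)
out = dec(h, skip)                # upsample + concat -> (1, 16, 64, 128)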
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import spec_utils
6 |
7 |
8 | class Conv2DBNActiv(nn.Module):
9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10 | super(Conv2DBNActiv, self).__init__()
11 | self.conv = nn.Sequential(
12 | nn.Conv2d(
13 | nin,
14 | nout,
15 | kernel_size=ksize,
16 | stride=stride,
17 | padding=pad,
18 | dilation=dilation,
19 | bias=False,
20 | ),
21 | nn.BatchNorm2d(nout),
22 | activ(),
23 | )
24 |
25 | def __call__(self, x):
26 | return self.conv(x)
27 |
28 |
29 | class SeperableConv2DBNActiv(nn.Module):
30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31 | super(SeperableConv2DBNActiv, self).__init__()
32 | self.conv = nn.Sequential(
33 | nn.Conv2d(
34 | nin,
35 | nin,
36 | kernel_size=ksize,
37 | stride=stride,
38 | padding=pad,
39 | dilation=dilation,
40 | groups=nin,
41 | bias=False,
42 | ),
43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44 | nn.BatchNorm2d(nout),
45 | activ(),
46 | )
47 |
48 | def __call__(self, x):
49 | return self.conv(x)
50 |
51 |
52 | class Encoder(nn.Module):
53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54 | super(Encoder, self).__init__()
55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57 |
58 | def __call__(self, x):
59 | skip = self.conv1(x)
60 | h = self.conv2(skip)
61 |
62 | return h, skip
63 |
64 |
65 | class Decoder(nn.Module):
66 | def __init__(
67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
68 | ):
69 | super(Decoder, self).__init__()
70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
71 | self.dropout = nn.Dropout2d(0.1) if dropout else None
72 |
73 | def __call__(self, x, skip=None):
74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
75 | if skip is not None:
76 | skip = spec_utils.crop_center(skip, x)
77 | x = torch.cat([x, skip], dim=1)
78 | h = self.conv(x)
79 |
80 | if self.dropout is not None:
81 | h = self.dropout(h)
82 |
83 | return h
84 |
85 |
86 | class ASPPModule(nn.Module):
87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
88 | super(ASPPModule, self).__init__()
89 | self.conv1 = nn.Sequential(
90 | nn.AdaptiveAvgPool2d((1, None)),
91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
92 | )
93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
94 | self.conv3 = SeperableConv2DBNActiv(
95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
96 | )
97 | self.conv4 = SeperableConv2DBNActiv(
98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
99 | )
100 | self.conv5 = SeperableConv2DBNActiv(
101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
102 | )
103 | self.bottleneck = nn.Sequential(
104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
105 | )
106 |
107 | def forward(self, x):
108 | _, _, h, w = x.size()
109 | feat1 = F.interpolate(
110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
111 | )
112 | feat2 = self.conv2(x)
113 | feat3 = self.conv3(x)
114 | feat4 = self.conv4(x)
115 | feat5 = self.conv5(x)
116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
117 | bottle = self.bottleneck(out)
118 | return bottle
119 |
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import spec_utils
6 |
7 |
8 | class Conv2DBNActiv(nn.Module):
9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10 | super(Conv2DBNActiv, self).__init__()
11 | self.conv = nn.Sequential(
12 | nn.Conv2d(
13 | nin,
14 | nout,
15 | kernel_size=ksize,
16 | stride=stride,
17 | padding=pad,
18 | dilation=dilation,
19 | bias=False,
20 | ),
21 | nn.BatchNorm2d(nout),
22 | activ(),
23 | )
24 |
25 | def __call__(self, x):
26 | return self.conv(x)
27 |
28 |
29 | class SeperableConv2DBNActiv(nn.Module):
30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31 | super(SeperableConv2DBNActiv, self).__init__()
32 | self.conv = nn.Sequential(
33 | nn.Conv2d(
34 | nin,
35 | nin,
36 | kernel_size=ksize,
37 | stride=stride,
38 | padding=pad,
39 | dilation=dilation,
40 | groups=nin,
41 | bias=False,
42 | ),
43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44 | nn.BatchNorm2d(nout),
45 | activ(),
46 | )
47 |
48 | def __call__(self, x):
49 | return self.conv(x)
50 |
51 |
52 | class Encoder(nn.Module):
53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54 | super(Encoder, self).__init__()
55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57 |
58 | def __call__(self, x):
59 | skip = self.conv1(x)
60 | h = self.conv2(skip)
61 |
62 | return h, skip
63 |
64 |
65 | class Decoder(nn.Module):
66 | def __init__(
67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
68 | ):
69 | super(Decoder, self).__init__()
70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
71 | self.dropout = nn.Dropout2d(0.1) if dropout else None
72 |
73 | def __call__(self, x, skip=None):
74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
75 | if skip is not None:
76 | skip = spec_utils.crop_center(skip, x)
77 | x = torch.cat([x, skip], dim=1)
78 | h = self.conv(x)
79 |
80 | if self.dropout is not None:
81 | h = self.dropout(h)
82 |
83 | return h
84 |
85 |
86 | class ASPPModule(nn.Module):
87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
88 | super(ASPPModule, self).__init__()
89 | self.conv1 = nn.Sequential(
90 | nn.AdaptiveAvgPool2d((1, None)),
91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
92 | )
93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
94 | self.conv3 = SeperableConv2DBNActiv(
95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
96 | )
97 | self.conv4 = SeperableConv2DBNActiv(
98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
99 | )
100 | self.conv5 = SeperableConv2DBNActiv(
101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
102 | )
103 | self.bottleneck = nn.Sequential(
104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
105 | )
106 |
107 | def forward(self, x):
108 | _, _, h, w = x.size()
109 | feat1 = F.interpolate(
110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
111 | )
112 | feat2 = self.conv2(x)
113 | feat3 = self.conv3(x)
114 | feat4 = self.conv4(x)
115 | feat5 = self.conv5(x)
116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
117 | bottle = self.bottleneck(out)
118 | return bottle
119 |
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import spec_utils
6 |
7 |
8 | class Conv2DBNActiv(nn.Module):
9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10 | super(Conv2DBNActiv, self).__init__()
11 | self.conv = nn.Sequential(
12 | nn.Conv2d(
13 | nin,
14 | nout,
15 | kernel_size=ksize,
16 | stride=stride,
17 | padding=pad,
18 | dilation=dilation,
19 | bias=False,
20 | ),
21 | nn.BatchNorm2d(nout),
22 | activ(),
23 | )
24 |
25 | def __call__(self, x):
26 | return self.conv(x)
27 |
28 |
29 | class SeperableConv2DBNActiv(nn.Module):
30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31 | super(SeperableConv2DBNActiv, self).__init__()
32 | self.conv = nn.Sequential(
33 | nn.Conv2d(
34 | nin,
35 | nin,
36 | kernel_size=ksize,
37 | stride=stride,
38 | padding=pad,
39 | dilation=dilation,
40 | groups=nin,
41 | bias=False,
42 | ),
43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44 | nn.BatchNorm2d(nout),
45 | activ(),
46 | )
47 |
48 | def __call__(self, x):
49 | return self.conv(x)
50 |
51 |
52 | class Encoder(nn.Module):
53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54 | super(Encoder, self).__init__()
55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57 |
58 | def __call__(self, x):
59 | skip = self.conv1(x)
60 | h = self.conv2(skip)
61 |
62 | return h, skip
63 |
64 |
65 | class Decoder(nn.Module):
66 | def __init__(
67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
68 | ):
69 | super(Decoder, self).__init__()
70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
71 | self.dropout = nn.Dropout2d(0.1) if dropout else None
72 |
73 | def __call__(self, x, skip=None):
74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
75 | if skip is not None:
76 | skip = spec_utils.crop_center(skip, x)
77 | x = torch.cat([x, skip], dim=1)
78 | h = self.conv(x)
79 |
80 | if self.dropout is not None:
81 | h = self.dropout(h)
82 |
83 | return h
84 |
85 |
86 | class ASPPModule(nn.Module):
87 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):  # conv6/conv7 below reuse dilations[2]; indices 3-4 go unused
88 | super(ASPPModule, self).__init__()
89 | self.conv1 = nn.Sequential(
90 | nn.AdaptiveAvgPool2d((1, None)),
91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
92 | )
93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
94 | self.conv3 = SeperableConv2DBNActiv(
95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
96 | )
97 | self.conv4 = SeperableConv2DBNActiv(
98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
99 | )
100 | self.conv5 = SeperableConv2DBNActiv(
101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
102 | )
103 | self.conv6 = SeperableConv2DBNActiv(
104 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
105 | )
106 | self.conv7 = SeperableConv2DBNActiv(
107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
108 | )
109 | self.bottleneck = nn.Sequential(
110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
111 | )
112 |
113 | def forward(self, x):
114 | _, _, h, w = x.size()
115 | feat1 = F.interpolate(
116 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
117 | )
118 | feat2 = self.conv2(x)
119 | feat3 = self.conv3(x)
120 | feat4 = self.conv4(x)
121 | feat5 = self.conv5(x)
122 | feat6 = self.conv6(x)
123 | feat7 = self.conv7(x)
124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
125 | bottle = self.bottleneck(out)
126 | return bottle
127 |
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import spec_utils
6 |
7 |
8 | class Conv2DBNActiv(nn.Module):
9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10 | super(Conv2DBNActiv, self).__init__()
11 | self.conv = nn.Sequential(
12 | nn.Conv2d(
13 | nin,
14 | nout,
15 | kernel_size=ksize,
16 | stride=stride,
17 | padding=pad,
18 | dilation=dilation,
19 | bias=False,
20 | ),
21 | nn.BatchNorm2d(nout),
22 | activ(),
23 | )
24 |
25 | def __call__(self, x):
26 | return self.conv(x)
27 |
28 |
29 | class SeperableConv2DBNActiv(nn.Module):
30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31 | super(SeperableConv2DBNActiv, self).__init__()
32 | self.conv = nn.Sequential(
33 | nn.Conv2d(
34 | nin,
35 | nin,
36 | kernel_size=ksize,
37 | stride=stride,
38 | padding=pad,
39 | dilation=dilation,
40 | groups=nin,
41 | bias=False,
42 | ),
43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44 | nn.BatchNorm2d(nout),
45 | activ(),
46 | )
47 |
48 | def __call__(self, x):
49 | return self.conv(x)
50 |
51 |
52 | class Encoder(nn.Module):
53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54 | super(Encoder, self).__init__()
55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57 |
58 | def __call__(self, x):
59 | skip = self.conv1(x)
60 | h = self.conv2(skip)
61 |
62 | return h, skip
63 |
64 |
65 | class Decoder(nn.Module):
66 | def __init__(
67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
68 | ):
69 | super(Decoder, self).__init__()
70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
71 | self.dropout = nn.Dropout2d(0.1) if dropout else None
72 |
73 | def __call__(self, x, skip=None):
74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
75 | if skip is not None:
76 | skip = spec_utils.crop_center(skip, x)
77 | x = torch.cat([x, skip], dim=1)
78 | h = self.conv(x)
79 |
80 | if self.dropout is not None:
81 | h = self.dropout(h)
82 |
83 | return h
84 |
85 |
86 | class ASPPModule(nn.Module):
87 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
88 | super(ASPPModule, self).__init__()
89 | self.conv1 = nn.Sequential(
90 | nn.AdaptiveAvgPool2d((1, None)),
91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
92 | )
93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
94 | self.conv3 = SeperableConv2DBNActiv(
95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
96 | )
97 | self.conv4 = SeperableConv2DBNActiv(
98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
99 | )
100 | self.conv5 = SeperableConv2DBNActiv(
101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
102 | )
103 | self.conv6 = SeperableConv2DBNActiv(
104 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
105 | )
106 | self.conv7 = SeperableConv2DBNActiv(
107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
108 | )
109 | self.bottleneck = nn.Sequential(
110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
111 | )
112 |
113 | def forward(self, x):
114 | _, _, h, w = x.size()
115 | feat1 = F.interpolate(
116 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
117 | )
118 | feat2 = self.conv2(x)
119 | feat3 = self.conv3(x)
120 | feat4 = self.conv4(x)
121 | feat5 = self.conv5(x)
122 | feat6 = self.conv6(x)
123 | feat7 = self.conv7(x)
124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
125 | bottle = self.bottleneck(out)
126 | return bottle
127 |
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import spec_utils
6 |
7 |
8 | class Conv2DBNActiv(nn.Module):
9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10 | super(Conv2DBNActiv, self).__init__()
11 | self.conv = nn.Sequential(
12 | nn.Conv2d(
13 | nin,
14 | nout,
15 | kernel_size=ksize,
16 | stride=stride,
17 | padding=pad,
18 | dilation=dilation,
19 | bias=False,
20 | ),
21 | nn.BatchNorm2d(nout),
22 | activ(),
23 | )
24 |
25 | def __call__(self, x):
26 | return self.conv(x)
27 |
28 |
29 | class SeperableConv2DBNActiv(nn.Module):
30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31 | super(SeperableConv2DBNActiv, self).__init__()
32 | self.conv = nn.Sequential(
33 | nn.Conv2d(
34 | nin,
35 | nin,
36 | kernel_size=ksize,
37 | stride=stride,
38 | padding=pad,
39 | dilation=dilation,
40 | groups=nin,
41 | bias=False,
42 | ),
43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44 | nn.BatchNorm2d(nout),
45 | activ(),
46 | )
47 |
48 | def __call__(self, x):
49 | return self.conv(x)
50 |
51 |
52 | class Encoder(nn.Module):
53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54 | super(Encoder, self).__init__()
55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57 |
58 | def __call__(self, x):
59 | skip = self.conv1(x)
60 | h = self.conv2(skip)
61 |
62 | return h, skip
63 |
64 |
65 | class Decoder(nn.Module):
66 | def __init__(
67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
68 | ):
69 | super(Decoder, self).__init__()
70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
71 | self.dropout = nn.Dropout2d(0.1) if dropout else None
72 |
73 | def __call__(self, x, skip=None):
74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
75 | if skip is not None:
76 | skip = spec_utils.crop_center(skip, x)
77 | x = torch.cat([x, skip], dim=1)
78 | h = self.conv(x)
79 |
80 | if self.dropout is not None:
81 | h = self.dropout(h)
82 |
83 | return h
84 |
85 |
86 | class ASPPModule(nn.Module):
87 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
88 | super(ASPPModule, self).__init__()
89 | self.conv1 = nn.Sequential(
90 | nn.AdaptiveAvgPool2d((1, None)),
91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
92 | )
93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
94 | self.conv3 = SeperableConv2DBNActiv(
95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
96 | )
97 | self.conv4 = SeperableConv2DBNActiv(
98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
99 | )
100 | self.conv5 = SeperableConv2DBNActiv(
101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
102 | )
103 | self.conv6 = SeperableConv2DBNActiv(
104 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
105 | )
106 | self.conv7 = SeperableConv2DBNActiv(
107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
108 | )
109 | self.bottleneck = nn.Sequential(
110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
111 | )
112 |
113 | def forward(self, x):
114 | _, _, h, w = x.size()
115 | feat1 = F.interpolate(
116 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
117 | )
118 | feat2 = self.conv2(x)
119 | feat3 = self.conv3(x)
120 | feat4 = self.conv4(x)
121 | feat5 = self.conv5(x)
122 | feat6 = self.conv6(x)
123 | feat7 = self.conv7(x)
124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
125 | bottle = self.bottleneck(out)
126 | return bottle
127 |
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/layers_new.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import spec_utils
6 |
7 |
8 | class Conv2DBNActiv(nn.Module):
9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10 | super(Conv2DBNActiv, self).__init__()
11 | self.conv = nn.Sequential(
12 | nn.Conv2d(
13 | nin,
14 | nout,
15 | kernel_size=ksize,
16 | stride=stride,
17 | padding=pad,
18 | dilation=dilation,
19 | bias=False,
20 | ),
21 | nn.BatchNorm2d(nout),
22 | activ(),
23 | )
24 |
25 | def __call__(self, x):
26 | return self.conv(x)
27 |
28 |
29 | class Encoder(nn.Module):
30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
31 | super(Encoder, self).__init__()
32 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)
33 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
34 |
35 | def __call__(self, x):
36 | h = self.conv1(x)
37 | h = self.conv2(h)
38 |
39 | return h
40 |
41 |
42 | class Decoder(nn.Module):
43 | def __init__(
44 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
45 | ):
46 | super(Decoder, self).__init__()
47 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
48 | # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
49 | self.dropout = nn.Dropout2d(0.1) if dropout else None
50 |
51 | def __call__(self, x, skip=None):
52 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
53 |
54 | if skip is not None:
55 | skip = spec_utils.crop_center(skip, x)
56 | x = torch.cat([x, skip], dim=1)
57 |
58 | h = self.conv1(x)
59 | # h = self.conv2(h)
60 |
61 | if self.dropout is not None:
62 | h = self.dropout(h)
63 |
64 | return h
65 |
66 |
67 | class ASPPModule(nn.Module):
68 | def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False):
69 | super(ASPPModule, self).__init__()
70 | self.conv1 = nn.Sequential(
71 | nn.AdaptiveAvgPool2d((1, None)),
72 | Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ),
73 | )
74 | self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)
75 | self.conv3 = Conv2DBNActiv(
76 | nin, nout, 3, 1, dilations[0], dilations[0], activ=activ
77 | )
78 | self.conv4 = Conv2DBNActiv(
79 | nin, nout, 3, 1, dilations[1], dilations[1], activ=activ
80 | )
81 | self.conv5 = Conv2DBNActiv(
82 | nin, nout, 3, 1, dilations[2], dilations[2], activ=activ
83 | )
84 | self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ)
85 | self.dropout = nn.Dropout2d(0.1) if dropout else None
86 |
87 | def forward(self, x):
88 | _, _, h, w = x.size()
89 | feat1 = F.interpolate(
90 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
91 | )
92 | feat2 = self.conv2(x)
93 | feat3 = self.conv3(x)
94 | feat4 = self.conv4(x)
95 | feat5 = self.conv5(x)
96 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
97 | out = self.bottleneck(out)
98 |
99 | if self.dropout is not None:
100 | out = self.dropout(out)
101 |
102 | return out
103 |
104 |
105 | class LSTMModule(nn.Module):
106 | def __init__(self, nin_conv, nin_lstm, nout_lstm):
107 | super(LSTMModule, self).__init__()
108 | self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
109 | self.lstm = nn.LSTM(
110 | input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True
111 | )
112 | self.dense = nn.Sequential(
113 | nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU()
114 | )
115 |
116 | def forward(self, x):
117 | N, _, nbins, nframes = x.size()
118 | h = self.conv(x)[:, 0] # N, nbins, nframes
119 | h = h.permute(2, 0, 1) # nframes, N, nbins
120 | h, _ = self.lstm(h)
121 | h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins
122 | h = h.reshape(nframes, N, 1, nbins)
123 | h = h.permute(1, 2, 3, 0)
124 |
125 | return h
126 |
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/model_param_init.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import pathlib
4 |
5 | default_param = {}
6 | default_param["bins"] = 768
7 | default_param["unstable_bins"] = 9 # training only
8 | default_param["reduction_bins"] = 762 # training only
9 | default_param["sr"] = 44100
10 | default_param["pre_filter_start"] = 757
11 | default_param["pre_filter_stop"] = 768
12 | default_param["band"] = {}
13 |
14 |
15 | default_param["band"][1] = {
16 | "sr": 11025,
17 | "hl": 128,
18 | "n_fft": 960,
19 | "crop_start": 0,
20 | "crop_stop": 245,
21 | "lpf_start": 61, # inference only
22 | "res_type": "polyphase",
23 | }
24 |
25 | default_param["band"][2] = {
26 | "sr": 44100,
27 | "hl": 512,
28 | "n_fft": 1536,
29 | "crop_start": 24,
30 | "crop_stop": 547,
31 | "hpf_start": 81, # inference only
32 | "res_type": "sinc_best",
33 | }
34 |
35 |
36 | def int_keys(d):
37 | r = {}
38 | for k, v in d:
39 | if k.isdigit():
40 | k = int(k)
41 | r[k] = v
42 | return r
43 |
44 |
45 | class ModelParameters(object):
46 | def __init__(self, config_path=""):
47 | if ".pth" == pathlib.Path(config_path).suffix:
48 | import zipfile
49 |
50 | with zipfile.ZipFile(config_path, "r") as zip:
51 | self.param = json.loads(
52 | zip.read("param.json"), object_pairs_hook=int_keys
53 | )
54 | elif ".json" == pathlib.Path(config_path).suffix:
55 | with open(config_path, "r") as f:
56 | self.param = json.loads(f.read(), object_pairs_hook=int_keys)
57 | else:
58 | self.param = default_param
59 |
60 | for k in [
61 | "mid_side",
62 | "mid_side_b",
63 | "mid_side_b2",
64 | "stereo_w",
65 | "stereo_n",
66 | "reverse",
67 | ]:
68 | if k not in self.param:
69 | self.param[k] = False
70 |
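int_keys is written as a json object_pairs_hook, so it receives each object as a list of (key, value) pairs and converts digit-string keys such as the band indices to ints; a sketch with inline JSON (contents illustrative):

import json

param = json.loads('{"bins": 768, "band": {"1": {"sr": 11025}}}',
                   object_pairs_hook=int_keys)
assert 1 in param["band"] and "1" not in param["band"]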
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 1024,
3 | "unstable_bins": 0,
4 | "reduction_bins": 0,
5 | "band": {
6 | "1": {
7 | "sr": 16000,
8 | "hl": 512,
9 | "n_fft": 2048,
10 | "crop_start": 0,
11 | "crop_stop": 1024,
12 | "hpf_start": -1,
13 | "res_type": "sinc_best"
14 | }
15 | },
16 | "sr": 16000,
17 | "pre_filter_start": 1023,
18 | "pre_filter_stop": 1024
19 | }
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 1024,
3 | "unstable_bins": 0,
4 | "reduction_bins": 0,
5 | "band": {
6 | "1": {
7 | "sr": 32000,
8 | "hl": 512,
9 | "n_fft": 2048,
10 | "crop_start": 0,
11 | "crop_stop": 1024,
12 | "hpf_start": -1,
13 | "res_type": "kaiser_fast"
14 | }
15 | },
16 | "sr": 32000,
17 | "pre_filter_start": 1000,
18 | "pre_filter_stop": 1021
19 | }
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 1024,
3 | "unstable_bins": 0,
4 | "reduction_bins": 0,
5 | "band": {
6 | "1": {
7 | "sr": 33075,
8 | "hl": 384,
9 | "n_fft": 2048,
10 | "crop_start": 0,
11 | "crop_stop": 1024,
12 | "hpf_start": -1,
13 | "res_type": "sinc_best"
14 | }
15 | },
16 | "sr": 33075,
17 | "pre_filter_start": 1000,
18 | "pre_filter_stop": 1021
19 | }
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 1024,
3 | "unstable_bins": 0,
4 | "reduction_bins": 0,
5 | "band": {
6 | "1": {
7 | "sr": 44100,
8 | "hl": 1024,
9 | "n_fft": 2048,
10 | "crop_start": 0,
11 | "crop_stop": 1024,
12 | "hpf_start": -1,
13 | "res_type": "sinc_best"
14 | }
15 | },
16 | "sr": 44100,
17 | "pre_filter_start": 1023,
18 | "pre_filter_stop": 1024
19 | }
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 256,
3 | "unstable_bins": 0,
4 | "reduction_bins": 0,
5 | "band": {
6 | "1": {
7 | "sr": 44100,
8 | "hl": 256,
9 | "n_fft": 512,
10 | "crop_start": 0,
11 | "crop_stop": 256,
12 | "hpf_start": -1,
13 | "res_type": "sinc_best"
14 | }
15 | },
16 | "sr": 44100,
17 | "pre_filter_start": 256,
18 | "pre_filter_stop": 256
19 | }
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 1024,
3 | "unstable_bins": 0,
4 | "reduction_bins": 0,
5 | "band": {
6 | "1": {
7 | "sr": 44100,
8 | "hl": 512,
9 | "n_fft": 2048,
10 | "crop_start": 0,
11 | "crop_stop": 1024,
12 | "hpf_start": -1,
13 | "res_type": "sinc_best"
14 | }
15 | },
16 | "sr": 44100,
17 | "pre_filter_start": 1023,
18 | "pre_filter_stop": 1024
19 | }
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 1024,
3 | "unstable_bins": 0,
4 | "reduction_bins": 0,
5 | "band": {
6 | "1": {
7 | "sr": 44100,
8 | "hl": 512,
9 | "n_fft": 2048,
10 | "crop_start": 0,
11 | "crop_stop": 700,
12 | "hpf_start": -1,
13 | "res_type": "sinc_best"
14 | }
15 | },
16 | "sr": 44100,
17 | "pre_filter_start": 1023,
18 | "pre_filter_stop": 700
19 | }
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 768,
3 | "unstable_bins": 7,
4 | "reduction_bins": 705,
5 | "band": {
6 | "1": {
7 | "sr": 6000,
8 | "hl": 66,
9 | "n_fft": 512,
10 | "crop_start": 0,
11 | "crop_stop": 240,
12 | "lpf_start": 60,
13 | "lpf_stop": 118,
14 | "res_type": "sinc_fastest"
15 | },
16 | "2": {
17 | "sr": 32000,
18 | "hl": 352,
19 | "n_fft": 1024,
20 | "crop_start": 22,
21 | "crop_stop": 505,
22 | "hpf_start": 44,
23 | "hpf_stop": 23,
24 | "res_type": "sinc_medium"
25 | }
26 | },
27 | "sr": 32000,
28 | "pre_filter_start": 710,
29 | "pre_filter_stop": 731
30 | }
31 |
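The multiband files add overlapping lpf_*/hpf_* bin ranges where neighbouring bands are cross-faded into each other. A hypothetical sketch of the linear taper such ranges imply (the actual blending lives elsewhere in lib_v5, presumably spec_utils.py; this is not repo code):

    import numpy as np

    def lpf_weights(n_bins: int, lpf_start: int, lpf_stop: int) -> np.ndarray:
        # keep bins below lpf_start, ramp 1 -> 0 across [lpf_start, lpf_stop),
        # and zero out everything above
        w = np.ones(n_bins)
        w[lpf_start:lpf_stop] = np.linspace(1.0, 0.0, lpf_stop - lpf_start)
        w[lpf_stop:] = 0.0
        return w

    w = lpf_weights(240, 60, 118)  # band "1" of 2band_32000.json above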
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 512,
3 | "unstable_bins": 7,
4 | "reduction_bins": 510,
5 | "band": {
6 | "1": {
7 | "sr": 11025,
8 | "hl": 160,
9 | "n_fft": 768,
10 | "crop_start": 0,
11 | "crop_stop": 192,
12 | "lpf_start": 41,
13 | "lpf_stop": 139,
14 | "res_type": "sinc_fastest"
15 | },
16 | "2": {
17 | "sr": 44100,
18 | "hl": 640,
19 | "n_fft": 1024,
20 | "crop_start": 10,
21 | "crop_stop": 320,
22 | "hpf_start": 47,
23 | "hpf_stop": 15,
24 | "res_type": "sinc_medium"
25 | }
26 | },
27 | "sr": 44100,
28 | "pre_filter_start": 510,
29 | "pre_filter_stop": 512
30 | }
31 |
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 768,
3 | "unstable_bins": 7,
4 | "reduction_bins": 705,
5 | "band": {
6 | "1": {
7 | "sr": 6000,
8 | "hl": 66,
9 | "n_fft": 512,
10 | "crop_start": 0,
11 | "crop_stop": 240,
12 | "lpf_start": 60,
13 | "lpf_stop": 240,
14 | "res_type": "sinc_fastest"
15 | },
16 | "2": {
17 | "sr": 48000,
18 | "hl": 528,
19 | "n_fft": 1536,
20 | "crop_start": 22,
21 | "crop_stop": 505,
22 | "hpf_start": 82,
23 | "hpf_stop": 22,
24 | "res_type": "sinc_medium"
25 | }
26 | },
27 | "sr": 48000,
28 | "pre_filter_start": 710,
29 | "pre_filter_stop": 731
30 | }
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 768,
3 | "unstable_bins": 5,
4 | "reduction_bins": 733,
5 | "band": {
6 | "1": {
7 | "sr": 11025,
8 | "hl": 128,
9 | "n_fft": 768,
10 | "crop_start": 0,
11 | "crop_stop": 278,
12 | "lpf_start": 28,
13 | "lpf_stop": 140,
14 | "res_type": "polyphase"
15 | },
16 | "2": {
17 | "sr": 22050,
18 | "hl": 256,
19 | "n_fft": 768,
20 | "crop_start": 14,
21 | "crop_stop": 322,
22 | "hpf_start": 70,
23 | "hpf_stop": 14,
24 | "lpf_start": 283,
25 | "lpf_stop": 314,
26 | "res_type": "polyphase"
27 | },
28 | "3": {
29 | "sr": 44100,
30 | "hl": 512,
31 | "n_fft": 768,
32 | "crop_start": 131,
33 | "crop_stop": 313,
34 | "hpf_start": 154,
35 | "hpf_stop": 141,
36 | "res_type": "sinc_medium"
37 | }
38 | },
39 | "sr": 44100,
40 | "pre_filter_start": 757,
41 | "pre_filter_stop": 768
42 | }
43 |
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json:
--------------------------------------------------------------------------------
1 | {
2 | "mid_side": true,
3 | "bins": 768,
4 | "unstable_bins": 5,
5 | "reduction_bins": 733,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 128,
10 | "n_fft": 768,
11 | "crop_start": 0,
12 | "crop_stop": 278,
13 | "lpf_start": 28,
14 | "lpf_stop": 140,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 22050,
19 | "hl": 256,
20 | "n_fft": 768,
21 | "crop_start": 14,
22 | "crop_stop": 322,
23 | "hpf_start": 70,
24 | "hpf_stop": 14,
25 | "lpf_start": 283,
26 | "lpf_stop": 314,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 44100,
31 | "hl": 512,
32 | "n_fft": 768,
33 | "crop_start": 131,
34 | "crop_stop": 313,
35 | "hpf_start": 154,
36 | "hpf_stop": 141,
37 | "res_type": "sinc_medium"
38 | }
39 | },
40 | "sr": 44100,
41 | "pre_filter_start": 757,
42 | "pre_filter_stop": 768
43 | }
44 |
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json:
--------------------------------------------------------------------------------
1 | {
2 | "mid_side_b2": true,
3 | "bins": 640,
4 | "unstable_bins": 7,
5 | "reduction_bins": 565,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 108,
10 | "n_fft": 1024,
11 | "crop_start": 0,
12 | "crop_stop": 187,
13 | "lpf_start": 92,
14 | "lpf_stop": 186,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 22050,
19 | "hl": 216,
20 | "n_fft": 768,
21 | "crop_start": 0,
22 | "crop_stop": 212,
23 | "hpf_start": 68,
24 | "hpf_stop": 34,
25 | "lpf_start": 174,
26 | "lpf_stop": 209,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 44100,
31 | "hl": 432,
32 | "n_fft": 640,
33 | "crop_start": 66,
34 | "crop_stop": 307,
35 | "hpf_start": 86,
36 | "hpf_stop": 72,
37 | "res_type": "kaiser_fast"
38 | }
39 | },
40 | "sr": 44100,
41 | "pre_filter_start": 639,
42 | "pre_filter_stop": 640
43 | }
44 |
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 768,
3 | "unstable_bins": 7,
4 | "reduction_bins": 668,
5 | "band": {
6 | "1": {
7 | "sr": 11025,
8 | "hl": 128,
9 | "n_fft": 1024,
10 | "crop_start": 0,
11 | "crop_stop": 186,
12 | "lpf_start": 37,
13 | "lpf_stop": 73,
14 | "res_type": "polyphase"
15 | },
16 | "2": {
17 | "sr": 11025,
18 | "hl": 128,
19 | "n_fft": 512,
20 | "crop_start": 4,
21 | "crop_stop": 185,
22 | "hpf_start": 36,
23 | "hpf_stop": 18,
24 | "lpf_start": 93,
25 | "lpf_stop": 185,
26 | "res_type": "polyphase"
27 | },
28 | "3": {
29 | "sr": 22050,
30 | "hl": 256,
31 | "n_fft": 512,
32 | "crop_start": 46,
33 | "crop_stop": 186,
34 | "hpf_start": 93,
35 | "hpf_stop": 46,
36 | "lpf_start": 164,
37 | "lpf_stop": 186,
38 | "res_type": "polyphase"
39 | },
40 | "4": {
41 | "sr": 44100,
42 | "hl": 512,
43 | "n_fft": 768,
44 | "crop_start": 121,
45 | "crop_stop": 382,
46 | "hpf_start": 138,
47 | "hpf_stop": 123,
48 | "res_type": "sinc_medium"
49 | }
50 | },
51 | "sr": 44100,
52 | "pre_filter_start": 740,
53 | "pre_filter_stop": 768
54 | }
55 |
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 768,
3 | "unstable_bins": 7,
4 | "mid_side": true,
5 | "reduction_bins": 668,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 128,
10 | "n_fft": 1024,
11 | "crop_start": 0,
12 | "crop_stop": 186,
13 | "lpf_start": 37,
14 | "lpf_stop": 73,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 11025,
19 | "hl": 128,
20 | "n_fft": 512,
21 | "crop_start": 4,
22 | "crop_stop": 185,
23 | "hpf_start": 36,
24 | "hpf_stop": 18,
25 | "lpf_start": 93,
26 | "lpf_stop": 185,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 22050,
31 | "hl": 256,
32 | "n_fft": 512,
33 | "crop_start": 46,
34 | "crop_stop": 186,
35 | "hpf_start": 93,
36 | "hpf_stop": 46,
37 | "lpf_start": 164,
38 | "lpf_stop": 186,
39 | "res_type": "polyphase"
40 | },
41 | "4": {
42 | "sr": 44100,
43 | "hl": 512,
44 | "n_fft": 768,
45 | "crop_start": 121,
46 | "crop_stop": 382,
47 | "hpf_start": 138,
48 | "hpf_stop": 123,
49 | "res_type": "sinc_medium"
50 | }
51 | },
52 | "sr": 44100,
53 | "pre_filter_start": 740,
54 | "pre_filter_stop": 768
55 | }
56 |
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json:
--------------------------------------------------------------------------------
1 | {
2 | "mid_side_b": true,
3 | "bins": 768,
4 | "unstable_bins": 7,
5 | "reduction_bins": 668,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 128,
10 | "n_fft": 1024,
11 | "crop_start": 0,
12 | "crop_stop": 186,
13 | "lpf_start": 37,
14 | "lpf_stop": 73,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 11025,
19 | "hl": 128,
20 | "n_fft": 512,
21 | "crop_start": 4,
22 | "crop_stop": 185,
23 | "hpf_start": 36,
24 | "hpf_stop": 18,
25 | "lpf_start": 93,
26 | "lpf_stop": 185,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 22050,
31 | "hl": 256,
32 | "n_fft": 512,
33 | "crop_start": 46,
34 | "crop_stop": 186,
35 | "hpf_start": 93,
36 | "hpf_stop": 46,
37 | "lpf_start": 164,
38 | "lpf_stop": 186,
39 | "res_type": "polyphase"
40 | },
41 | "4": {
42 | "sr": 44100,
43 | "hl": 512,
44 | "n_fft": 768,
45 | "crop_start": 121,
46 | "crop_stop": 382,
47 | "hpf_start": 138,
48 | "hpf_stop": 123,
49 | "res_type": "sinc_medium"
50 | }
51 | },
52 | "sr": 44100,
53 | "pre_filter_start": 740,
54 | "pre_filter_stop": 768
55 | }
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json:
--------------------------------------------------------------------------------
1 | {
2 | "mid_side_b": true,
3 | "bins": 768,
4 | "unstable_bins": 7,
5 | "reduction_bins": 668,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 128,
10 | "n_fft": 1024,
11 | "crop_start": 0,
12 | "crop_stop": 186,
13 | "lpf_start": 37,
14 | "lpf_stop": 73,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 11025,
19 | "hl": 128,
20 | "n_fft": 512,
21 | "crop_start": 4,
22 | "crop_stop": 185,
23 | "hpf_start": 36,
24 | "hpf_stop": 18,
25 | "lpf_start": 93,
26 | "lpf_stop": 185,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 22050,
31 | "hl": 256,
32 | "n_fft": 512,
33 | "crop_start": 46,
34 | "crop_stop": 186,
35 | "hpf_start": 93,
36 | "hpf_stop": 46,
37 | "lpf_start": 164,
38 | "lpf_stop": 186,
39 | "res_type": "polyphase"
40 | },
41 | "4": {
42 | "sr": 44100,
43 | "hl": 512,
44 | "n_fft": 768,
45 | "crop_start": 121,
46 | "crop_stop": 382,
47 | "hpf_start": 138,
48 | "hpf_stop": 123,
49 | "res_type": "sinc_medium"
50 | }
51 | },
52 | "sr": 44100,
53 | "pre_filter_start": 740,
54 | "pre_filter_stop": 768
55 | }
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json:
--------------------------------------------------------------------------------
1 | {
2 | "reverse": true,
3 | "bins": 768,
4 | "unstable_bins": 7,
5 | "reduction_bins": 668,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 128,
10 | "n_fft": 1024,
11 | "crop_start": 0,
12 | "crop_stop": 186,
13 | "lpf_start": 37,
14 | "lpf_stop": 73,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 11025,
19 | "hl": 128,
20 | "n_fft": 512,
21 | "crop_start": 4,
22 | "crop_stop": 185,
23 | "hpf_start": 36,
24 | "hpf_stop": 18,
25 | "lpf_start": 93,
26 | "lpf_stop": 185,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 22050,
31 | "hl": 256,
32 | "n_fft": 512,
33 | "crop_start": 46,
34 | "crop_stop": 186,
35 | "hpf_start": 93,
36 | "hpf_stop": 46,
37 | "lpf_start": 164,
38 | "lpf_stop": 186,
39 | "res_type": "polyphase"
40 | },
41 | "4": {
42 | "sr": 44100,
43 | "hl": 512,
44 | "n_fft": 768,
45 | "crop_start": 121,
46 | "crop_stop": 382,
47 | "hpf_start": 138,
48 | "hpf_stop": 123,
49 | "res_type": "sinc_medium"
50 | }
51 | },
52 | "sr": 44100,
53 | "pre_filter_start": 740,
54 | "pre_filter_stop": 768
55 | }
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json:
--------------------------------------------------------------------------------
1 | {
2 | "stereo_w": true,
3 | "bins": 768,
4 | "unstable_bins": 7,
5 | "reduction_bins": 668,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 128,
10 | "n_fft": 1024,
11 | "crop_start": 0,
12 | "crop_stop": 186,
13 | "lpf_start": 37,
14 | "lpf_stop": 73,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 11025,
19 | "hl": 128,
20 | "n_fft": 512,
21 | "crop_start": 4,
22 | "crop_stop": 185,
23 | "hpf_start": 36,
24 | "hpf_stop": 18,
25 | "lpf_start": 93,
26 | "lpf_stop": 185,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 22050,
31 | "hl": 256,
32 | "n_fft": 512,
33 | "crop_start": 46,
34 | "crop_stop": 186,
35 | "hpf_start": 93,
36 | "hpf_stop": 46,
37 | "lpf_start": 164,
38 | "lpf_stop": 186,
39 | "res_type": "polyphase"
40 | },
41 | "4": {
42 | "sr": 44100,
43 | "hl": 512,
44 | "n_fft": 768,
45 | "crop_start": 121,
46 | "crop_stop": 382,
47 | "hpf_start": 138,
48 | "hpf_stop": 123,
49 | "res_type": "sinc_medium"
50 | }
51 | },
52 | "sr": 44100,
53 | "pre_filter_start": 740,
54 | "pre_filter_stop": 768
55 | }
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 672,
3 | "unstable_bins": 8,
4 | "reduction_bins": 637,
5 | "band": {
6 | "1": {
7 | "sr": 7350,
8 | "hl": 80,
9 | "n_fft": 640,
10 | "crop_start": 0,
11 | "crop_stop": 85,
12 | "lpf_start": 25,
13 | "lpf_stop": 53,
14 | "res_type": "polyphase"
15 | },
16 | "2": {
17 | "sr": 7350,
18 | "hl": 80,
19 | "n_fft": 320,
20 | "crop_start": 4,
21 | "crop_stop": 87,
22 | "hpf_start": 25,
23 | "hpf_stop": 12,
24 | "lpf_start": 31,
25 | "lpf_stop": 62,
26 | "res_type": "polyphase"
27 | },
28 | "3": {
29 | "sr": 14700,
30 | "hl": 160,
31 | "n_fft": 512,
32 | "crop_start": 17,
33 | "crop_stop": 216,
34 | "hpf_start": 48,
35 | "hpf_stop": 24,
36 | "lpf_start": 139,
37 | "lpf_stop": 210,
38 | "res_type": "polyphase"
39 | },
40 | "4": {
41 | "sr": 44100,
42 | "hl": 480,
43 | "n_fft": 960,
44 | "crop_start": 78,
45 | "crop_stop": 383,
46 | "hpf_start": 130,
47 | "hpf_stop": 86,
48 | "res_type": "kaiser_fast"
49 | }
50 | },
51 | "sr": 44100,
52 | "pre_filter_start": 668,
53 | "pre_filter_stop": 672
54 | }
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 672,
3 | "unstable_bins": 8,
4 | "reduction_bins": 637,
5 | "band": {
6 | "1": {
7 | "sr": 7350,
8 | "hl": 80,
9 | "n_fft": 640,
10 | "crop_start": 0,
11 | "crop_stop": 85,
12 | "lpf_start": 25,
13 | "lpf_stop": 53,
14 | "res_type": "polyphase"
15 | },
16 | "2": {
17 | "sr": 7350,
18 | "hl": 80,
19 | "n_fft": 320,
20 | "crop_start": 4,
21 | "crop_stop": 87,
22 | "hpf_start": 25,
23 | "hpf_stop": 12,
24 | "lpf_start": 31,
25 | "lpf_stop": 62,
26 | "res_type": "polyphase"
27 | },
28 | "3": {
29 | "sr": 14700,
30 | "hl": 160,
31 | "n_fft": 512,
32 | "crop_start": 17,
33 | "crop_stop": 216,
34 | "hpf_start": 48,
35 | "hpf_stop": 24,
36 | "lpf_start": 139,
37 | "lpf_stop": 210,
38 | "res_type": "polyphase"
39 | },
40 | "4": {
41 | "sr": 44100,
42 | "hl": 480,
43 | "n_fft": 960,
44 | "crop_start": 78,
45 | "crop_stop": 383,
46 | "hpf_start": 130,
47 | "hpf_stop": 86,
48 | "convert_channels": "stereo_n",
49 | "res_type": "kaiser_fast"
50 | }
51 | },
52 | "sr": 44100,
53 | "pre_filter_start": 668,
54 | "pre_filter_stop": 672
55 | }
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 672,
3 | "unstable_bins": 8,
4 | "reduction_bins": 530,
5 | "band": {
6 | "1": {
7 | "sr": 7350,
8 | "hl": 80,
9 | "n_fft": 640,
10 | "crop_start": 0,
11 | "crop_stop": 85,
12 | "lpf_start": 25,
13 | "lpf_stop": 53,
14 | "res_type": "polyphase"
15 | },
16 | "2": {
17 | "sr": 7350,
18 | "hl": 80,
19 | "n_fft": 320,
20 | "crop_start": 4,
21 | "crop_stop": 87,
22 | "hpf_start": 25,
23 | "hpf_stop": 12,
24 | "lpf_start": 31,
25 | "lpf_stop": 62,
26 | "res_type": "polyphase"
27 | },
28 | "3": {
29 | "sr": 14700,
30 | "hl": 160,
31 | "n_fft": 512,
32 | "crop_start": 17,
33 | "crop_stop": 216,
34 | "hpf_start": 48,
35 | "hpf_stop": 24,
36 | "lpf_start": 139,
37 | "lpf_stop": 210,
38 | "res_type": "polyphase"
39 | },
40 | "4": {
41 | "sr": 44100,
42 | "hl": 480,
43 | "n_fft": 960,
44 | "crop_start": 78,
45 | "crop_stop": 383,
46 | "hpf_start": 130,
47 | "hpf_stop": 86,
48 | "res_type": "kaiser_fast"
49 | }
50 | },
51 | "sr": 44100,
52 | "pre_filter_start": 668,
53 | "pre_filter_stop": 672
54 | }
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json:
--------------------------------------------------------------------------------
1 | {
2 | "mid_side_b2": true,
3 | "bins": 1280,
4 | "unstable_bins": 7,
5 | "reduction_bins": 565,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 108,
10 | "n_fft": 2048,
11 | "crop_start": 0,
12 | "crop_stop": 374,
13 | "lpf_start": 92,
14 | "lpf_stop": 186,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 22050,
19 | "hl": 216,
20 | "n_fft": 1536,
21 | "crop_start": 0,
22 | "crop_stop": 424,
23 | "hpf_start": 68,
24 | "hpf_stop": 34,
25 | "lpf_start": 348,
26 | "lpf_stop": 418,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 44100,
31 | "hl": 432,
32 | "n_fft": 1280,
33 | "crop_start": 132,
34 | "crop_stop": 614,
35 | "hpf_start": 172,
36 | "hpf_stop": 144,
37 | "res_type": "polyphase"
38 | }
39 | },
40 | "sr": 44100,
41 | "pre_filter_start": 1280,
42 | "pre_filter_stop": 1280
43 | }
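
Across all of these parameter files, the per-band "res_type" strings (sinc_best, kaiser_fast, polyphase, ...) match resampler names accepted by librosa.resample, which suggests each band is obtained by resampling the mix down to the band's "sr" before taking its STFT. A minimal sketch under that assumption (librosa and the file names are assumptions, not repo code):

    import json
    import librosa

    with open("ensemble.json") as fp:
        mp = json.load(fp)

    y, sr = librosa.load("mix.wav", sr=mp["sr"], mono=True)
    for key in sorted(mp["band"]):
        band = mp["band"][key]
        y_band = librosa.resample(
            y, orig_sr=sr, target_sr=band["sr"], res_type=band["res_type"]
        )
        # each band then gets its own STFT with the band's n_fft and hl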
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/nets.py:
--------------------------------------------------------------------------------
1 | from . import layers  # package-relative import, matching "from . import spec_utils" below
2 | import torch
3 | import torch.nn.functional as F
4 | from torch import nn
5 |
6 | from . import spec_utils
7 |
8 |
9 | class BaseASPPNet(nn.Module):
10 | def __init__(self, nin, ch, dilations=(4, 8, 16)):
11 | super(BaseASPPNet, self).__init__()
12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
16 |
17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
18 |
19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
23 |
24 | def __call__(self, x):
25 | h, e1 = self.enc1(x)
26 | h, e2 = self.enc2(h)
27 | h, e3 = self.enc3(h)
28 | h, e4 = self.enc4(h)
29 |
30 | h = self.aspp(h)
31 |
32 | h = self.dec4(h, e4)
33 | h = self.dec3(h, e3)
34 | h = self.dec2(h, e2)
35 | h = self.dec1(h, e1)
36 |
37 | return h
38 |
39 |
40 | class CascadedASPPNet(nn.Module):
41 | def __init__(self, n_fft):
42 | super(CascadedASPPNet, self).__init__()
43 | self.stg1_low_band_net = BaseASPPNet(2, 16)
44 | self.stg1_high_band_net = BaseASPPNet(2, 16)
45 |
46 | self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
47 | self.stg2_full_band_net = BaseASPPNet(8, 16)
48 |
49 | self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
50 | self.stg3_full_band_net = BaseASPPNet(16, 32)
51 |
52 | self.out = nn.Conv2d(32, 2, 1, bias=False)
53 | self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
54 | self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)
55 |
56 | self.max_bin = n_fft // 2
57 | self.output_bin = n_fft // 2 + 1
58 |
59 | self.offset = 128
60 |
61 | def forward(self, x, aggressiveness=None):
62 | mix = x.detach()
63 | x = x.clone()
64 |
65 | x = x[:, :, : self.max_bin]
66 |
67 | bandw = x.size()[2] // 2
68 | aux1 = torch.cat(
69 | [
70 | self.stg1_low_band_net(x[:, :, :bandw]),
71 | self.stg1_high_band_net(x[:, :, bandw:]),
72 | ],
73 | dim=2,
74 | )
75 |
76 | h = torch.cat([x, aux1], dim=1)
77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
78 |
79 | h = torch.cat([x, aux1, aux2], dim=1)
80 | h = self.stg3_full_band_net(self.stg3_bridge(h))
81 |
82 | mask = torch.sigmoid(self.out(h))
83 | mask = F.pad(
84 | input=mask,
85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]),
86 | mode="replicate",
87 | )
88 |
89 | if self.training:
90 | aux1 = torch.sigmoid(self.aux1_out(aux1))
91 | aux1 = F.pad(
92 | input=aux1,
93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
94 | mode="replicate",
95 | )
96 | aux2 = torch.sigmoid(self.aux2_out(aux2))
97 | aux2 = F.pad(
98 | input=aux2,
99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
100 | mode="replicate",
101 | )
102 | return mask * mix, aux1 * mix, aux2 * mix
103 | else:
104 | if aggressiveness:
105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
106 | mask[:, :, : aggressiveness["split_bin"]],
107 | 1 + aggressiveness["value"] / 3,
108 | )
109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
110 | mask[:, :, aggressiveness["split_bin"] :],
111 | 1 + aggressiveness["value"],
112 | )
113 |
114 | return mask * mix
115 |
116 | def predict(self, x_mag, aggressiveness=None):
117 | h = self.forward(x_mag, aggressiveness)
118 |
119 | if self.offset > 0:
120 | h = h[:, :, :, self.offset : -self.offset]
121 | assert h.size()[3] > 0
122 |
123 | return h
124 |
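At inference time the optional aggressiveness dict sharpens the sigmoid mask by exponentiation: bins below split_bin get the gentler exponent 1 + value/3, bins above it get 1 + value, pushing uncertain mask values toward zero. A hypothetical smoke test with random weights (shapes chosen so the 128-frame offset trim leaves something; not repo code):

    model = CascadedASPPNet(n_fft=2048)
    model.eval()
    x_mag = torch.rand(1, 2, 1025, 512)  # (batch, channels, n_fft // 2 + 1 bins, frames)
    out = model.predict(x_mag, aggressiveness={"split_bin": 510, "value": 0.3})
    print(out.shape)  # offset=128 trims both edges: (1, 2, 1025, 256)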
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import layers_123821KB as layers
6 |
7 |
8 | class BaseASPPNet(nn.Module):
9 | def __init__(self, nin, ch, dilations=(4, 8, 16)):
10 | super(BaseASPPNet, self).__init__()
11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
15 |
16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
17 |
18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
22 |
23 | def __call__(self, x):
24 | h, e1 = self.enc1(x)
25 | h, e2 = self.enc2(h)
26 | h, e3 = self.enc3(h)
27 | h, e4 = self.enc4(h)
28 |
29 | h = self.aspp(h)
30 |
31 | h = self.dec4(h, e4)
32 | h = self.dec3(h, e3)
33 | h = self.dec2(h, e2)
34 | h = self.dec1(h, e1)
35 |
36 | return h
37 |
38 |
39 | class CascadedASPPNet(nn.Module):
40 | def __init__(self, n_fft):
41 | super(CascadedASPPNet, self).__init__()
42 | self.stg1_low_band_net = BaseASPPNet(2, 32)
43 | self.stg1_high_band_net = BaseASPPNet(2, 32)
44 |
45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
46 | self.stg2_full_band_net = BaseASPPNet(16, 32)
47 |
48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
49 | self.stg3_full_band_net = BaseASPPNet(32, 64)
50 |
51 | self.out = nn.Conv2d(64, 2, 1, bias=False)
52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)
54 |
55 | self.max_bin = n_fft // 2
56 | self.output_bin = n_fft // 2 + 1
57 |
58 | self.offset = 128
59 |
60 | def forward(self, x, aggressiveness=None):
61 | mix = x.detach()
62 | x = x.clone()
63 |
64 | x = x[:, :, : self.max_bin]
65 |
66 | bandw = x.size()[2] // 2
67 | aux1 = torch.cat(
68 | [
69 | self.stg1_low_band_net(x[:, :, :bandw]),
70 | self.stg1_high_band_net(x[:, :, bandw:]),
71 | ],
72 | dim=2,
73 | )
74 |
75 | h = torch.cat([x, aux1], dim=1)
76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
77 |
78 | h = torch.cat([x, aux1, aux2], dim=1)
79 | h = self.stg3_full_band_net(self.stg3_bridge(h))
80 |
81 | mask = torch.sigmoid(self.out(h))
82 | mask = F.pad(
83 | input=mask,
84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]),
85 | mode="replicate",
86 | )
87 |
88 | if self.training:
89 | aux1 = torch.sigmoid(self.aux1_out(aux1))
90 | aux1 = F.pad(
91 | input=aux1,
92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
93 | mode="replicate",
94 | )
95 | aux2 = torch.sigmoid(self.aux2_out(aux2))
96 | aux2 = F.pad(
97 | input=aux2,
98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
99 | mode="replicate",
100 | )
101 | return mask * mix, aux1 * mix, aux2 * mix
102 | else:
103 | if aggressiveness:
104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
105 | mask[:, :, : aggressiveness["split_bin"]],
106 | 1 + aggressiveness["value"] / 3,
107 | )
108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
109 | mask[:, :, aggressiveness["split_bin"] :],
110 | 1 + aggressiveness["value"],
111 | )
112 |
113 | return mask * mix
114 |
115 | def predict(self, x_mag, aggressiveness=None):
116 | h = self.forward(x_mag, aggressiveness)
117 |
118 | if self.offset > 0:
119 | h = h[:, :, :, self.offset : -self.offset]
120 | assert h.size()[3] > 0
121 |
122 | return h
123 |
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import layers_123821KB as layers
6 |
7 |
8 | class BaseASPPNet(nn.Module):
9 | def __init__(self, nin, ch, dilations=(4, 8, 16)):
10 | super(BaseASPPNet, self).__init__()
11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
15 |
16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
17 |
18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
22 |
23 | def __call__(self, x):
24 | h, e1 = self.enc1(x)
25 | h, e2 = self.enc2(h)
26 | h, e3 = self.enc3(h)
27 | h, e4 = self.enc4(h)
28 |
29 | h = self.aspp(h)
30 |
31 | h = self.dec4(h, e4)
32 | h = self.dec3(h, e3)
33 | h = self.dec2(h, e2)
34 | h = self.dec1(h, e1)
35 |
36 | return h
37 |
38 |
39 | class CascadedASPPNet(nn.Module):
40 | def __init__(self, n_fft):
41 | super(CascadedASPPNet, self).__init__()
42 | self.stg1_low_band_net = BaseASPPNet(2, 32)
43 | self.stg1_high_band_net = BaseASPPNet(2, 32)
44 |
45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
46 | self.stg2_full_band_net = BaseASPPNet(16, 32)
47 |
48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
49 | self.stg3_full_band_net = BaseASPPNet(32, 64)
50 |
51 | self.out = nn.Conv2d(64, 2, 1, bias=False)
52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)
54 |
55 | self.max_bin = n_fft // 2
56 | self.output_bin = n_fft // 2 + 1
57 |
58 | self.offset = 128
59 |
60 | def forward(self, x, aggressiveness=None):
61 | mix = x.detach()
62 | x = x.clone()
63 |
64 | x = x[:, :, : self.max_bin]
65 |
66 | bandw = x.size()[2] // 2
67 | aux1 = torch.cat(
68 | [
69 | self.stg1_low_band_net(x[:, :, :bandw]),
70 | self.stg1_high_band_net(x[:, :, bandw:]),
71 | ],
72 | dim=2,
73 | )
74 |
75 | h = torch.cat([x, aux1], dim=1)
76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
77 |
78 | h = torch.cat([x, aux1, aux2], dim=1)
79 | h = self.stg3_full_band_net(self.stg3_bridge(h))
80 |
81 | mask = torch.sigmoid(self.out(h))
82 | mask = F.pad(
83 | input=mask,
84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]),
85 | mode="replicate",
86 | )
87 |
88 | if self.training:
89 | aux1 = torch.sigmoid(self.aux1_out(aux1))
90 | aux1 = F.pad(
91 | input=aux1,
92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
93 | mode="replicate",
94 | )
95 | aux2 = torch.sigmoid(self.aux2_out(aux2))
96 | aux2 = F.pad(
97 | input=aux2,
98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
99 | mode="replicate",
100 | )
101 | return mask * mix, aux1 * mix, aux2 * mix
102 | else:
103 | if aggressiveness:
104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
105 | mask[:, :, : aggressiveness["split_bin"]],
106 | 1 + aggressiveness["value"] / 3,
107 | )
108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
109 | mask[:, :, aggressiveness["split_bin"] :],
110 | 1 + aggressiveness["value"],
111 | )
112 |
113 | return mask * mix
114 |
115 | def predict(self, x_mag, aggressiveness=None):
116 | h = self.forward(x_mag, aggressiveness)
117 |
118 | if self.offset > 0:
119 | h = h[:, :, :, self.offset : -self.offset]
120 | assert h.size()[3] > 0
121 |
122 | return h
123 |
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import layers_33966KB as layers
6 |
7 |
8 | class BaseASPPNet(nn.Module):
9 | def __init__(self, nin, ch, dilations=(4, 8, 16, 32)):
10 | super(BaseASPPNet, self).__init__()
11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
15 |
16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
17 |
18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
22 |
23 | def __call__(self, x):
24 | h, e1 = self.enc1(x)
25 | h, e2 = self.enc2(h)
26 | h, e3 = self.enc3(h)
27 | h, e4 = self.enc4(h)
28 |
29 | h = self.aspp(h)
30 |
31 | h = self.dec4(h, e4)
32 | h = self.dec3(h, e3)
33 | h = self.dec2(h, e2)
34 | h = self.dec1(h, e1)
35 |
36 | return h
37 |
38 |
39 | class CascadedASPPNet(nn.Module):
40 | def __init__(self, n_fft):
41 | super(CascadedASPPNet, self).__init__()
42 | self.stg1_low_band_net = BaseASPPNet(2, 16)
43 | self.stg1_high_band_net = BaseASPPNet(2, 16)
44 |
45 | self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
46 | self.stg2_full_band_net = BaseASPPNet(8, 16)
47 |
48 | self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
49 | self.stg3_full_band_net = BaseASPPNet(16, 32)
50 |
51 | self.out = nn.Conv2d(32, 2, 1, bias=False)
52 | self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
53 | self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)
54 |
55 | self.max_bin = n_fft // 2
56 | self.output_bin = n_fft // 2 + 1
57 |
58 | self.offset = 128
59 |
60 | def forward(self, x, aggressiveness=None):
61 | mix = x.detach()
62 | x = x.clone()
63 |
64 | x = x[:, :, : self.max_bin]
65 |
66 | bandw = x.size()[2] // 2
67 | aux1 = torch.cat(
68 | [
69 | self.stg1_low_band_net(x[:, :, :bandw]),
70 | self.stg1_high_band_net(x[:, :, bandw:]),
71 | ],
72 | dim=2,
73 | )
74 |
75 | h = torch.cat([x, aux1], dim=1)
76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
77 |
78 | h = torch.cat([x, aux1, aux2], dim=1)
79 | h = self.stg3_full_band_net(self.stg3_bridge(h))
80 |
81 | mask = torch.sigmoid(self.out(h))
82 | mask = F.pad(
83 | input=mask,
84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]),
85 | mode="replicate",
86 | )
87 |
88 | if self.training:
89 | aux1 = torch.sigmoid(self.aux1_out(aux1))
90 | aux1 = F.pad(
91 | input=aux1,
92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
93 | mode="replicate",
94 | )
95 | aux2 = torch.sigmoid(self.aux2_out(aux2))
96 | aux2 = F.pad(
97 | input=aux2,
98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
99 | mode="replicate",
100 | )
101 | return mask * mix, aux1 * mix, aux2 * mix
102 | else:
103 | if aggressiveness:
104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
105 | mask[:, :, : aggressiveness["split_bin"]],
106 | 1 + aggressiveness["value"] / 3,
107 | )
108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
109 | mask[:, :, aggressiveness["split_bin"] :],
110 | 1 + aggressiveness["value"],
111 | )
112 |
113 | return mask * mix
114 |
115 | def predict(self, x_mag, aggressiveness=None):
116 | h = self.forward(x_mag, aggressiveness)
117 |
118 | if self.offset > 0:
119 | h = h[:, :, :, self.offset : -self.offset]
120 | assert h.size()[3] > 0
121 |
122 | return h
123 |
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn.functional as F
4 | from torch import nn
5 |
6 | from . import layers_537238KB as layers
7 |
8 |
9 | class BaseASPPNet(nn.Module):
10 | def __init__(self, nin, ch, dilations=(4, 8, 16)):
11 | super(BaseASPPNet, self).__init__()
12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
16 |
17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
18 |
19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
23 |
24 | def __call__(self, x):
25 | h, e1 = self.enc1(x)
26 | h, e2 = self.enc2(h)
27 | h, e3 = self.enc3(h)
28 | h, e4 = self.enc4(h)
29 |
30 | h = self.aspp(h)
31 |
32 | h = self.dec4(h, e4)
33 | h = self.dec3(h, e3)
34 | h = self.dec2(h, e2)
35 | h = self.dec1(h, e1)
36 |
37 | return h
38 |
39 |
40 | class CascadedASPPNet(nn.Module):
41 | def __init__(self, n_fft):
42 | super(CascadedASPPNet, self).__init__()
43 | self.stg1_low_band_net = BaseASPPNet(2, 64)
44 | self.stg1_high_band_net = BaseASPPNet(2, 64)
45 |
46 | self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
47 | self.stg2_full_band_net = BaseASPPNet(32, 64)
48 |
49 | self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
50 | self.stg3_full_band_net = BaseASPPNet(64, 128)
51 |
52 | self.out = nn.Conv2d(128, 2, 1, bias=False)
53 | self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
54 | self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)
55 |
56 | self.max_bin = n_fft // 2
57 | self.output_bin = n_fft // 2 + 1
58 |
59 | self.offset = 128
60 |
61 | def forward(self, x, aggressiveness=None):
62 | mix = x.detach()
63 | x = x.clone()
64 |
65 | x = x[:, :, : self.max_bin]
66 |
67 | bandw = x.size()[2] // 2
68 | aux1 = torch.cat(
69 | [
70 | self.stg1_low_band_net(x[:, :, :bandw]),
71 | self.stg1_high_band_net(x[:, :, bandw:]),
72 | ],
73 | dim=2,
74 | )
75 |
76 | h = torch.cat([x, aux1], dim=1)
77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
78 |
79 | h = torch.cat([x, aux1, aux2], dim=1)
80 | h = self.stg3_full_band_net(self.stg3_bridge(h))
81 |
82 | mask = torch.sigmoid(self.out(h))
83 | mask = F.pad(
84 | input=mask,
85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]),
86 | mode="replicate",
87 | )
88 |
89 | if self.training:
90 | aux1 = torch.sigmoid(self.aux1_out(aux1))
91 | aux1 = F.pad(
92 | input=aux1,
93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
94 | mode="replicate",
95 | )
96 | aux2 = torch.sigmoid(self.aux2_out(aux2))
97 | aux2 = F.pad(
98 | input=aux2,
99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
100 | mode="replicate",
101 | )
102 | return mask * mix, aux1 * mix, aux2 * mix
103 | else:
104 | if aggressiveness:
105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
106 | mask[:, :, : aggressiveness["split_bin"]],
107 | 1 + aggressiveness["value"] / 3,
108 | )
109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
110 | mask[:, :, aggressiveness["split_bin"] :],
111 | 1 + aggressiveness["value"],
112 | )
113 |
114 | return mask * mix
115 |
116 | def predict(self, x_mag, aggressiveness=None):
117 | h = self.forward(x_mag, aggressiveness)
118 |
119 | if self.offset > 0:
120 | h = h[:, :, :, self.offset : -self.offset]
121 | assert h.size()[3] > 0
122 |
123 | return h
124 |
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn.functional as F
4 | from torch import nn
5 |
6 | from . import layers_537238KB as layers
7 |
8 |
9 | class BaseASPPNet(nn.Module):
10 | def __init__(self, nin, ch, dilations=(4, 8, 16)):
11 | super(BaseASPPNet, self).__init__()
12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
16 |
17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
18 |
19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
23 |
24 | def __call__(self, x):
25 | h, e1 = self.enc1(x)
26 | h, e2 = self.enc2(h)
27 | h, e3 = self.enc3(h)
28 | h, e4 = self.enc4(h)
29 |
30 | h = self.aspp(h)
31 |
32 | h = self.dec4(h, e4)
33 | h = self.dec3(h, e3)
34 | h = self.dec2(h, e2)
35 | h = self.dec1(h, e1)
36 |
37 | return h
38 |
39 |
40 | class CascadedASPPNet(nn.Module):
41 | def __init__(self, n_fft):
42 | super(CascadedASPPNet, self).__init__()
43 | self.stg1_low_band_net = BaseASPPNet(2, 64)
44 | self.stg1_high_band_net = BaseASPPNet(2, 64)
45 |
46 | self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
47 | self.stg2_full_band_net = BaseASPPNet(32, 64)
48 |
49 | self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
50 | self.stg3_full_band_net = BaseASPPNet(64, 128)
51 |
52 | self.out = nn.Conv2d(128, 2, 1, bias=False)
53 | self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
54 | self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)
55 |
56 | self.max_bin = n_fft // 2
57 | self.output_bin = n_fft // 2 + 1
58 |
59 | self.offset = 128
60 |
61 | def forward(self, x, aggressiveness=None):
62 | mix = x.detach()
63 | x = x.clone()
64 |
65 | x = x[:, :, : self.max_bin]
66 |
67 | bandw = x.size()[2] // 2
68 | aux1 = torch.cat(
69 | [
70 | self.stg1_low_band_net(x[:, :, :bandw]),
71 | self.stg1_high_band_net(x[:, :, bandw:]),
72 | ],
73 | dim=2,
74 | )
75 |
76 | h = torch.cat([x, aux1], dim=1)
77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
78 |
79 | h = torch.cat([x, aux1, aux2], dim=1)
80 | h = self.stg3_full_band_net(self.stg3_bridge(h))
81 |
82 | mask = torch.sigmoid(self.out(h))
83 | mask = F.pad(
84 | input=mask,
85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]),
86 | mode="replicate",
87 | )
88 |
89 | if self.training:
90 | aux1 = torch.sigmoid(self.aux1_out(aux1))
91 | aux1 = F.pad(
92 | input=aux1,
93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
94 | mode="replicate",
95 | )
96 | aux2 = torch.sigmoid(self.aux2_out(aux2))
97 | aux2 = F.pad(
98 | input=aux2,
99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
100 | mode="replicate",
101 | )
102 | return mask * mix, aux1 * mix, aux2 * mix
103 | else:
104 | if aggressiveness:
105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
106 | mask[:, :, : aggressiveness["split_bin"]],
107 | 1 + aggressiveness["value"] / 3,
108 | )
109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
110 | mask[:, :, aggressiveness["split_bin"] :],
111 | 1 + aggressiveness["value"],
112 | )
113 |
114 | return mask * mix
115 |
116 | def predict(self, x_mag, aggressiveness=None):
117 | h = self.forward(x_mag, aggressiveness)
118 |
119 | if self.offset > 0:
120 | h = h[:, :, :, self.offset : -self.offset]
121 | assert h.size()[3] > 0
122 |
123 | return h
124 |
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import layers_123821KB as layers
6 |
7 |
8 | class BaseASPPNet(nn.Module):
9 | def __init__(self, nin, ch, dilations=(4, 8, 16)):
10 | super(BaseASPPNet, self).__init__()
11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
15 |
16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
17 |
18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
22 |
23 | def __call__(self, x):
24 | h, e1 = self.enc1(x)
25 | h, e2 = self.enc2(h)
26 | h, e3 = self.enc3(h)
27 | h, e4 = self.enc4(h)
28 |
29 | h = self.aspp(h)
30 |
31 | h = self.dec4(h, e4)
32 | h = self.dec3(h, e3)
33 | h = self.dec2(h, e2)
34 | h = self.dec1(h, e1)
35 |
36 | return h
37 |
38 |
39 | class CascadedASPPNet(nn.Module):
40 | def __init__(self, n_fft):
41 | super(CascadedASPPNet, self).__init__()
42 | self.stg1_low_band_net = BaseASPPNet(2, 32)
43 | self.stg1_high_band_net = BaseASPPNet(2, 32)
44 |
45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
46 | self.stg2_full_band_net = BaseASPPNet(16, 32)
47 |
48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
49 | self.stg3_full_band_net = BaseASPPNet(32, 64)
50 |
51 | self.out = nn.Conv2d(64, 2, 1, bias=False)
52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)
54 |
55 | self.max_bin = n_fft // 2
56 | self.output_bin = n_fft // 2 + 1
57 |
58 | self.offset = 128
59 |
60 | def forward(self, x, aggressiveness=None):
61 | mix = x.detach()
62 | x = x.clone()
63 |
64 | x = x[:, :, : self.max_bin]
65 |
66 | bandw = x.size()[2] // 2
67 | aux1 = torch.cat(
68 | [
69 | self.stg1_low_band_net(x[:, :, :bandw]),
70 | self.stg1_high_band_net(x[:, :, bandw:]),
71 | ],
72 | dim=2,
73 | )
74 |
75 | h = torch.cat([x, aux1], dim=1)
76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
77 |
78 | h = torch.cat([x, aux1, aux2], dim=1)
79 | h = self.stg3_full_band_net(self.stg3_bridge(h))
80 |
81 | mask = torch.sigmoid(self.out(h))
82 | mask = F.pad(
83 | input=mask,
84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]),
85 | mode="replicate",
86 | )
87 |
88 | if self.training:
89 | aux1 = torch.sigmoid(self.aux1_out(aux1))
90 | aux1 = F.pad(
91 | input=aux1,
92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
93 | mode="replicate",
94 | )
95 | aux2 = torch.sigmoid(self.aux2_out(aux2))
96 | aux2 = F.pad(
97 | input=aux2,
98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
99 | mode="replicate",
100 | )
101 | return mask * mix, aux1 * mix, aux2 * mix
102 | else:
103 | if aggressiveness:
104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
105 | mask[:, :, : aggressiveness["split_bin"]],
106 | 1 + aggressiveness["value"] / 3,
107 | )
108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
109 | mask[:, :, aggressiveness["split_bin"] :],
110 | 1 + aggressiveness["value"],
111 | )
112 |
113 | return mask * mix
114 |
115 | def predict(self, x_mag, aggressiveness=None):
116 | h = self.forward(x_mag, aggressiveness)
117 |
118 | if self.offset > 0:
119 | h = h[:, :, :, self.offset : -self.offset]
120 | assert h.size()[3] > 0
121 |
122 | return h
123 |
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/lib_v5/nets_new.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import layers_new
6 |
7 |
8 | class BaseNet(nn.Module):
9 | def __init__(
10 | self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6))
11 | ):
12 | super(BaseNet, self).__init__()
13 | self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1)
14 | self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1)
15 | self.enc3 = layers_new.Encoder(nout * 2, nout * 4, 3, 2, 1)
16 | self.enc4 = layers_new.Encoder(nout * 4, nout * 6, 3, 2, 1)
17 | self.enc5 = layers_new.Encoder(nout * 6, nout * 8, 3, 2, 1)
18 |
19 | self.aspp = layers_new.ASPPModule(nout * 8, nout * 8, dilations, dropout=True)
20 |
21 | self.dec4 = layers_new.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1)
22 | self.dec3 = layers_new.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1)
23 | self.dec2 = layers_new.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1)
24 | self.lstm_dec2 = layers_new.LSTMModule(nout * 2, nin_lstm, nout_lstm)
25 | self.dec1 = layers_new.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1)
26 |
27 | def __call__(self, x):
28 | e1 = self.enc1(x)
29 | e2 = self.enc2(e1)
30 | e3 = self.enc3(e2)
31 | e4 = self.enc4(e3)
32 | e5 = self.enc5(e4)
33 |
34 | h = self.aspp(e5)
35 |
36 | h = self.dec4(h, e4)
37 | h = self.dec3(h, e3)
38 | h = self.dec2(h, e2)
39 | h = torch.cat([h, self.lstm_dec2(h)], dim=1)
40 | h = self.dec1(h, e1)
41 |
42 | return h
43 |
44 |
45 | class CascadedNet(nn.Module):
46 | def __init__(self, n_fft, nout=32, nout_lstm=128):
47 | super(CascadedNet, self).__init__()
48 |
49 | self.max_bin = n_fft // 2
50 | self.output_bin = n_fft // 2 + 1
51 | self.nin_lstm = self.max_bin // 2
52 | self.offset = 64
53 |
54 | self.stg1_low_band_net = nn.Sequential(
55 | BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm),
56 | layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0),
57 | )
58 |
59 | self.stg1_high_band_net = BaseNet(
60 | 2, nout // 4, self.nin_lstm // 2, nout_lstm // 2
61 | )
62 |
63 | self.stg2_low_band_net = nn.Sequential(
64 | BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm),
65 | layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0),
66 | )
67 | self.stg2_high_band_net = BaseNet(
68 | nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2
69 | )
70 |
71 | self.stg3_full_band_net = BaseNet(
72 | 3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm
73 | )
74 |
75 | self.out = nn.Conv2d(nout, 2, 1, bias=False)
76 | self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False)
77 |
78 | def forward(self, x):
79 | x = x[:, :, : self.max_bin]
80 |
81 | bandw = x.size()[2] // 2
82 | l1_in = x[:, :, :bandw]
83 | h1_in = x[:, :, bandw:]
84 | l1 = self.stg1_low_band_net(l1_in)
85 | h1 = self.stg1_high_band_net(h1_in)
86 | aux1 = torch.cat([l1, h1], dim=2)
87 |
88 | l2_in = torch.cat([l1_in, l1], dim=1)
89 | h2_in = torch.cat([h1_in, h1], dim=1)
90 | l2 = self.stg2_low_band_net(l2_in)
91 | h2 = self.stg2_high_band_net(h2_in)
92 | aux2 = torch.cat([l2, h2], dim=2)
93 |
94 | f3_in = torch.cat([x, aux1, aux2], dim=1)
95 | f3 = self.stg3_full_band_net(f3_in)
96 |
97 | mask = torch.sigmoid(self.out(f3))
98 | mask = F.pad(
99 | input=mask,
100 | pad=(0, 0, 0, self.output_bin - mask.size()[2]),
101 | mode="replicate",
102 | )
103 |
104 | if self.training:
105 | aux = torch.cat([aux1, aux2], dim=1)
106 | aux = torch.sigmoid(self.aux_out(aux))
107 | aux = F.pad(
108 | input=aux,
109 | pad=(0, 0, 0, self.output_bin - aux.size()[2]),
110 | mode="replicate",
111 | )
112 | return mask, aux
113 | else:
114 | return mask
115 |
116 | def predict_mask(self, x):
117 | mask = self.forward(x)
118 |
119 | if self.offset > 0:
120 | mask = mask[:, :, :, self.offset : -self.offset]
121 | assert mask.size()[3] > 0
122 |
123 | return mask
124 |
125 | def predict(self, x, aggressiveness=None):
126 | mask = self.forward(x)
127 | pred_mag = x * mask
128 |
129 | if self.offset > 0:
130 | pred_mag = pred_mag[:, :, :, self.offset : -self.offset]
131 | assert pred_mag.size()[3] > 0
132 |
133 | return pred_mag
134 |
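Unlike the CascadedASPPNet variants above, CascadedNet's forward() returns the mask itself; predict() multiplies it back onto the input magnitudes and then trims offset frames from each edge. A small shape check with random weights (dimensions are illustrative, not repo code):

    net = CascadedNet(n_fft=2048)
    net.eval()
    x = torch.rand(1, 2, 1025, 256)  # (batch, channels, n_fft // 2 + 1 bins, frames)
    pred_mag = net.predict(x)
    print(pred_mag.shape)  # offset=64 trims both edges: (1, 2, 1025, 128)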
--------------------------------------------------------------------------------
/rvc/infer/lib/uvr5_pack/utils.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import numpy as np
4 | import torch
5 | from tqdm import tqdm
6 |
7 |
8 | def load_data(file_name: str = "./infer/lib/uvr5_pack/name_params.json") -> dict:
9 | with open(file_name, "r") as f:
10 | data = json.load(f)
11 |
12 | return data
13 |
14 |
15 | def make_padding(width, cropsize, offset):
16 | left = offset
17 | roi_size = cropsize - left * 2
18 | if roi_size == 0:
19 | roi_size = cropsize
20 | right = roi_size - (width % roi_size) + left
21 |
22 | return left, right, roi_size
23 |
24 |
25 | def inference(X_spec, device, model, aggressiveness, data):
26 | """
27 | data : dict of configs
28 | """
29 |
30 | def _execute(
31 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True
32 | ):
33 | model.eval()
34 | with torch.no_grad():
35 | preds = []
36 |
37 | iterations = [n_window]
38 |
39 | total_iterations = sum(iterations)
40 | for i in tqdm(range(n_window)):
41 | start = i * roi_size
42 | X_mag_window = X_mag_pad[
43 | None, :, :, start : start + data["window_size"]
44 | ]
45 | X_mag_window = torch.from_numpy(X_mag_window)
46 | if is_half:
47 | X_mag_window = X_mag_window.half()
48 | X_mag_window = X_mag_window.to(device)
49 |
50 | pred = model.predict(X_mag_window, aggressiveness)
51 |
52 | pred = pred.detach().cpu().numpy()
53 | preds.append(pred[0])
54 |
55 | pred = np.concatenate(preds, axis=2)
56 | return pred
57 |
58 | def preprocess(X_spec):
59 | X_mag = np.abs(X_spec)
60 | X_phase = np.angle(X_spec)
61 |
62 | return X_mag, X_phase
63 |
64 | X_mag, X_phase = preprocess(X_spec)
65 |
66 | coef = X_mag.max()
67 | X_mag_pre = X_mag / coef
68 |
69 | n_frame = X_mag_pre.shape[2]
70 | pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset)
71 | n_window = int(np.ceil(n_frame / roi_size))
72 |
73 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")
74 |
75 | if list(model.state_dict().values())[0].dtype == torch.float16:
76 | is_half = True
77 | else:
78 | is_half = False
79 | pred = _execute(
80 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
81 | )
82 | pred = pred[:, :, :n_frame]
83 |
84 | if data["tta"]:
85 | pad_l += roi_size // 2
86 | pad_r += roi_size // 2
87 | n_window += 1
88 |
89 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")
90 |
91 | pred_tta = _execute(
92 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
93 | )
94 | pred_tta = pred_tta[:, :, roi_size // 2 :]
95 | pred_tta = pred_tta[:, :, :n_frame]
96 |
97 | return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase)
98 | else:
99 | return pred * coef, X_mag, np.exp(1.0j * X_phase)
100 |
101 |
102 | def _get_name_params(model_path, model_hash):
103 | data = load_data()
104 | flag = False
105 | ModelName = model_path
106 | for type in list(data):
107 | for model in list(data[type][0]):
108 | for i in range(len(data[type][0][model])):
109 | if str(data[type][0][model][i]["hash_name"]) == model_hash:
110 | flag = True
111 | elif str(data[type][0][model][i]["hash_name"]) in ModelName:
112 | flag = True
113 |
114 | if flag:
115 | model_params_auto = data[type][0][model][i]["model_params"]
116 | param_name_auto = data[type][0][model][i]["param_name"]
117 | if type == "equivalent":
118 | return param_name_auto, model_params_auto
119 | else:
120 | flag = False
121 | return param_name_auto, model_params_auto
122 |
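make_padding() sizes the usable region of each inference window: with a crop of window_size frames and a model offset trimmed from both edges, only the middle roi_size frames of every window survive, so windows are hopped by roi_size. A worked example (the numbers are illustrative):

    left, right, roi_size = make_padding(1000, 512, 128)
    # roi_size = 512 - 2 * 128 = 256 usable frames per window
    # left = 128; right = 256 - (1000 % 256) + 128 = 152
    # 1000 frames thus pad to 1280 = 4 windows of 512 frames hopped by 256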
--------------------------------------------------------------------------------
/rvc/infer/modules/gui/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | TorchGating is a PyTorch-based implementation of Spectral Gating
3 | ================================================
4 | Author: Asaf Zorea
5 |
6 | Contents
7 | --------
8 | torchgate imports all the functions from PyTorch, and in addition provides:
9 | TorchGating --- A PyTorch module that applies a spectral gate to an input signal
10 |
11 | """
12 |
13 | from .torchgate import TorchGate
14 |
--------------------------------------------------------------------------------
/rvc/infer/modules/gui/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.types import Number
3 |
4 |
5 | @torch.no_grad()
6 | def amp_to_db(
7 | x: torch.Tensor, eps=torch.finfo(torch.float64).eps, top_db=40
8 | ) -> torch.Tensor:
9 | """
10 | Convert the input tensor from amplitude to decibel scale.
11 |
12 | Arguments:
13 | x {[torch.Tensor]} -- [Input tensor.]
14 |
15 | Keyword Arguments:
16 | eps {[float]} -- [Small value to avoid numerical instability.]
17 | (default: {torch.finfo(torch.float64).eps})
18 | top_db {[float]} -- [threshold the output at ``top_db`` below the peak]
19 | (default: {40})
20 |
21 | Returns:
22 | [torch.Tensor] -- [Output tensor in decibel scale.]
23 | """
24 | x_db = 20 * torch.log10(x.abs() + eps)
25 | return torch.max(x_db, (x_db.max(-1).values - top_db).unsqueeze(-1))
26 |
27 |
28 | @torch.no_grad()
29 | def temperature_sigmoid(x: torch.Tensor, x0: float, temp_coeff: float) -> torch.Tensor:
30 | """
31 | Apply a sigmoid function with temperature scaling.
32 |
33 | Arguments:
34 | x {[torch.Tensor]} -- [Input tensor.]
35 | x0 {[float]} -- [Parameter that controls the threshold of the sigmoid.]
36 | temp_coeff {[float]} -- [Parameter that controls the slope of the sigmoid.]
37 |
38 | Returns:
39 | [torch.Tensor] -- [Output tensor after applying the sigmoid with temperature scaling.]
40 | """
41 | return torch.sigmoid((x - x0) / temp_coeff)
42 |
43 |
44 | @torch.no_grad()
45 | def linspace(
46 | start: Number, stop: Number, num: int = 50, endpoint: bool = True, **kwargs
47 | ) -> torch.Tensor:
48 | """
49 | Generate a linearly spaced 1-D tensor.
50 |
51 | Arguments:
52 | start {[Number]} -- [The starting value of the sequence.]
53 | stop {[Number]} -- [The end value of the sequence, unless `endpoint` is set to False.
54 | In that case, the sequence consists of all but the last of ``num + 1``
55 | evenly spaced samples, so that `stop` is excluded. Note that the step
56 | size changes when `endpoint` is False.]
57 |
58 | Keyword Arguments:
59 | num {[int]} -- [Number of samples to generate. Default is 50. Must be non-negative.]
60 | endpoint {[bool]} -- [If True, `stop` is the last sample. Otherwise, it is not included.
61 | Default is True.]
62 | **kwargs -- [Additional arguments to be passed to the underlying PyTorch `linspace` function.]
63 |
64 | Returns:
65 | [torch.Tensor] -- [1-D tensor of `num` equally spaced samples from `start` to `stop`.]
66 | """
67 | if endpoint:
68 | return torch.linspace(start, stop, num, **kwargs)
69 | else:
70 | return torch.linspace(start, stop, num + 1, **kwargs)[:-1]
71 |
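A quick usage sketch for the helpers above (outputs worked out by hand): amp_to_db clamps everything more than top_db below the loudest value, and linspace(..., endpoint=False) mimics NumPy's half-open behaviour:

    import torch

    x = torch.tensor([1.0, 0.1, 0.001])
    amp_to_db(x)  # tensor([  0., -20., -40.]); the raw -60 dB clamps to peak - 40
    linspace(0.0, 1.0, 5, endpoint=False)  # tensor([0.0, 0.2, 0.4, 0.6, 0.8])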
--------------------------------------------------------------------------------
/rvc/infer/modules/onnx/export.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM
4 |
5 |
6 | def export_onnx(ModelPath, ExportedPath):
7 | cpt = torch.load(ModelPath, map_location="cpu")
8 | cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
9 | vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768
10 |
11 | test_phone = torch.rand(1, 200, vec_channels) # hidden unit
12 | test_phone_lengths = torch.tensor([200]).long() # hidden unit length (seemingly unused)
13 | test_pitch = torch.randint(size=(1, 200), low=5, high=255) # fundamental frequency (in Hz)
14 | test_pitchf = torch.rand(1, 200) # NSF fundamental frequency
15 | test_ds = torch.LongTensor([0]) # speaker ID
16 | test_rnd = torch.rand(1, 192, 200) # noise (adds a random factor)
17 |
18 | device = "cpu" # device used for export (does not affect how the model is used)
19 |
20 | net_g = SynthesizerTrnMsNSFsidM(
21 | *cpt["config"], is_half=False, encoder_dim=vec_channels
22 | ) # export in fp32 (fp16 support in C++ would require manually rearranging memory, so fp16 is not used for now)
23 | net_g.load_state_dict(cpt["weight"], strict=False)
24 | input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
25 | output_names = [
26 | "audio",
27 | ]
28 | # net_g.construct_spkmixmap()  # export a multi-speaker mix track
29 | torch.onnx.export(
30 | net_g,
31 | (
32 | test_phone.to(device),
33 | test_phone_lengths.to(device),
34 | test_pitch.to(device),
35 | test_pitchf.to(device),
36 | test_ds.to(device),
37 | test_rnd.to(device),
38 | ),
39 | ExportedPath,
40 | dynamic_axes={
41 | "phone": [1],
42 | "pitch": [1],
43 | "pitchf": [1],
44 | "rnd": [2],
45 | },
46 | do_constant_folding=False,
47 | opset_version=17,
48 | verbose=False,
49 | input_names=input_names,
50 | output_names=output_names,
51 | )
52 | return "Finished"
53 |
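Once exported, the model can be driven with onnxruntime. A sketch with input names, dtypes, and shapes matching the dummy tensors above (the model path is hypothetical; T is the dynamic frame axis):

import numpy as np
import onnxruntime as ort

T, vec_channels = 200, 768  # 768 for v2 checkpoints, 256 for v1
sess = ort.InferenceSession("rvc_model.onnx", providers=["CPUExecutionProvider"])
audio = sess.run(
    ["audio"],
    {
        "phone": np.random.rand(1, T, vec_channels).astype(np.float32),
        "phone_lengths": np.array([T], dtype=np.int64),
        "pitch": np.random.randint(5, 255, size=(1, T)).astype(np.int64),
        "pitchf": np.random.rand(1, T).astype(np.float32),
        "ds": np.array([0], dtype=np.int64),
        "rnd": np.random.rand(1, 192, T).astype(np.float32),
    },
)[0]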
--------------------------------------------------------------------------------
/rvc/infer/modules/train/extract/extract_f0_print.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import traceback
4 |
5 | import parselmouth
6 |
7 | now_dir = os.getcwd()
8 | sys.path.append(now_dir)
9 | import logging
10 |
11 | import numpy as np
12 | import pyworld
13 |
14 | from infer.lib.audio import load_audio
15 |
16 | logging.getLogger("numba").setLevel(logging.WARNING)
17 | from multiprocessing import Process
18 |
19 | exp_dir = sys.argv[1]
20 | f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
21 |
22 |
23 | def printt(strr):
24 | print(strr)
25 | f.write("%s\n" % strr)
26 | f.flush()
27 |
28 |
29 | n_p = int(sys.argv[2])
30 | f0method = sys.argv[3]
31 |
32 |
33 | class FeatureInput(object):
34 | def __init__(self, samplerate=16000, hop_size=160):
35 | self.fs = samplerate
36 | self.hop = hop_size
37 |
38 | self.f0_bin = 256
39 | self.f0_max = 1100.0
40 | self.f0_min = 50.0
41 | self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
42 | self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
43 |
44 | def compute_f0(self, path, f0_method):
45 | x = load_audio(path, self.fs)
46 | p_len = x.shape[0] // self.hop
47 | if f0_method == "pm":
48 | time_step = 160 / 16000 * 1000
49 | f0_min = 50
50 | f0_max = 1100
51 | f0 = (
52 | parselmouth.Sound(x, self.fs)
53 | .to_pitch_ac(
54 | time_step=time_step / 1000,
55 | voicing_threshold=0.6,
56 | pitch_floor=f0_min,
57 | pitch_ceiling=f0_max,
58 | )
59 | .selected_array["frequency"]
60 | )
61 | pad_size = (p_len - len(f0) + 1) // 2
62 | if pad_size > 0 or p_len - len(f0) - pad_size > 0:
63 | f0 = np.pad(
64 | f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
65 | )
66 | elif f0_method == "harvest":
67 | f0, t = pyworld.harvest(
68 | x.astype(np.double),
69 | fs=self.fs,
70 | f0_ceil=self.f0_max,
71 | f0_floor=self.f0_min,
72 | frame_period=1000 * self.hop / self.fs,
73 | )
74 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
75 | elif f0_method == "dio":
76 | f0, t = pyworld.dio(
77 | x.astype(np.double),
78 | fs=self.fs,
79 | f0_ceil=self.f0_max,
80 | f0_floor=self.f0_min,
81 | frame_period=1000 * self.hop / self.fs,
82 | )
83 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
84 | elif f0_method == "rmvpe":
85 | if not hasattr(self, "model_rmvpe"):
86 | from infer.lib.rmvpe import RMVPE
87 |
88 | print("Loading rmvpe model")
89 | self.model_rmvpe = RMVPE(
90 | "assets/rmvpe/rmvpe.pt", is_half=False, device="cpu"
91 | )
92 | f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
93 | return f0
94 |
95 | def coarse_f0(self, f0):
96 | f0_mel = 1127 * np.log(1 + f0 / 700)
97 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
98 | self.f0_bin - 2
99 | ) / (self.f0_mel_max - self.f0_mel_min) + 1
100 |
101 | # use 0 or 1
102 | f0_mel[f0_mel <= 1] = 1
103 | f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
104 | f0_coarse = np.rint(f0_mel).astype(int)
105 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
106 | f0_coarse.max(),
107 | f0_coarse.min(),
108 | )
109 | return f0_coarse
110 |
111 | def go(self, paths, f0_method):
112 | if len(paths) == 0:
113 | printt("no-f0-todo")
114 | else:
115 | printt("todo-f0-%s" % len(paths))
116 | n = max(len(paths) // 5, 1)  # each process prints at most 5 progress lines
117 | for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
118 | try:
119 | if idx % n == 0:
120 | printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path))
121 | if (
122 | os.path.exists(opt_path1 + ".npy")
123 | and os.path.exists(opt_path2 + ".npy")
124 | ):
125 | continue
126 | featur_pit = self.compute_f0(inp_path, f0_method)
127 | np.save(
128 | opt_path2,
129 | featur_pit,
130 | allow_pickle=False,
131 | ) # nsf
132 | coarse_pit = self.coarse_f0(featur_pit)
133 | np.save(
134 | opt_path1,
135 | coarse_pit,
136 | allow_pickle=False,
137 | ) # ori
138 | except Exception:
139 | printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc()))
140 |
141 |
142 | if __name__ == "__main__":
143 | # exp_dir=r"E:\codes\py39\dataset\mi-test"
144 | # n_p=16
145 | # f = open("%s/log_extract_f0.log"%exp_dir, "w")
146 | printt(" ".join(sys.argv))
147 | featureInput = FeatureInput()
148 | paths = []
149 | inp_root = "%s/1_16k_wavs" % (exp_dir)
150 | opt_root1 = "%s/2a_f0" % (exp_dir)
151 | opt_root2 = "%s/2b-f0nsf" % (exp_dir)
152 |
153 | os.makedirs(opt_root1, exist_ok=True)
154 | os.makedirs(opt_root2, exist_ok=True)
155 | for name in sorted(list(os.listdir(inp_root))):
156 | inp_path = "%s/%s" % (inp_root, name)
157 | if "spec" in inp_path:
158 | continue
159 | opt_path1 = "%s/%s" % (opt_root1, name)
160 | opt_path2 = "%s/%s" % (opt_root2, name)
161 | paths.append([inp_path, opt_path1, opt_path2])
162 |
163 | ps = []
164 | for i in range(n_p):
165 | p = Process(
166 | target=featureInput.go,
167 | args=(
168 | paths[i::n_p],
169 | f0method,
170 | ),
171 | )
172 | ps.append(p)
173 | p.start()
174 | for i in range(n_p):
175 | ps[i].join()
176 |
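The coarse_f0 step above quantizes f0 in Hz onto 255 mel-spaced bins (unvoiced frames collapse to bin 1). A standalone sketch of the same arithmetic, with approximate outputs:

import numpy as np

f0_min, f0_max, f0_bin = 50.0, 1100.0, 256
f0_mel_min = 1127 * np.log(1 + f0_min / 700)  # ~77.8
f0_mel_max = 1127 * np.log(1 + f0_max / 700)  # ~1064.4

f0 = np.array([0.0, 50.0, 220.0, 1100.0])  # Hz; 0.0 = unvoiced
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (
    f0_mel_max - f0_mel_min
) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
print(np.rint(f0_mel).astype(int))  # ~ [  1   1  60 255]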
--------------------------------------------------------------------------------
/rvc/infer/modules/train/extract/extract_f0_rmvpe.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import traceback
4 |
5 | import parselmouth
6 |
7 | now_dir = os.getcwd()
8 | sys.path.append(now_dir)
9 | import logging
10 |
11 | import numpy as np
12 | import pyworld
13 |
14 | from infer.lib.audio import load_audio
15 |
16 | logging.getLogger("numba").setLevel(logging.WARNING)
17 |
18 | n_part = int(sys.argv[1])
19 | i_part = int(sys.argv[2])
20 | i_gpu = sys.argv[3]
21 | os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu)
22 | exp_dir = sys.argv[4]
23 | is_half = sys.argv[5].lower() == "true"
24 | f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
25 |
26 |
27 | def printt(strr):
28 | print(strr)
29 | f.write("%s\n" % strr)
30 | f.flush()
31 |
32 |
33 | class FeatureInput(object):
34 | def __init__(self, samplerate=16000, hop_size=160):
35 | self.fs = samplerate
36 | self.hop = hop_size
37 |
38 | self.f0_bin = 256
39 | self.f0_max = 1100.0
40 | self.f0_min = 50.0
41 | self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
42 | self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
43 |
44 | def compute_f0(self, path, f0_method):
45 | x = load_audio(path, self.fs)
46 | # p_len = x.shape[0] // self.hop
47 | if f0_method == "rmvpe":
48 | if not hasattr(self, "model_rmvpe"):
49 | from infer.lib.rmvpe import RMVPE
50 |
51 | print("Loading rmvpe model")
52 | self.model_rmvpe = RMVPE(
53 | "assets/rmvpe/rmvpe.pt", is_half=is_half, device="cuda"
54 | )
55 | f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
56 | return f0
57 |
58 | def coarse_f0(self, f0):
59 | f0_mel = 1127 * np.log(1 + f0 / 700)
60 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
61 | self.f0_bin - 2
62 | ) / (self.f0_mel_max - self.f0_mel_min) + 1
63 |
64 | # use 0 or 1
65 | f0_mel[f0_mel <= 1] = 1
66 | f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
67 | f0_coarse = np.rint(f0_mel).astype(int)
68 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
69 | f0_coarse.max(),
70 | f0_coarse.min(),
71 | )
72 | return f0_coarse
73 |
74 | def go(self, paths, f0_method):
75 | if len(paths) == 0:
76 | printt("no-f0-todo")
77 | else:
78 | printt("todo-f0-%s" % len(paths))
79 | n = max(len(paths) // 5, 1)  # each process prints at most 5 progress lines
80 | for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
81 | try:
82 | if idx % n == 0:
83 | printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path))
84 | if (
85 | os.path.exists(opt_path1 + ".npy")
86 | and os.path.exists(opt_path2 + ".npy")
87 | ):
88 | continue
89 | featur_pit = self.compute_f0(inp_path, f0_method)
90 | np.save(
91 | opt_path2,
92 | featur_pit,
93 | allow_pickle=False,
94 | ) # nsf
95 | coarse_pit = self.coarse_f0(featur_pit)
96 | np.save(
97 | opt_path1,
98 | coarse_pit,
99 | allow_pickle=False,
100 | ) # ori
101 | except Exception:
102 | printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc()))
103 |
104 |
105 | if __name__ == "__main__":
106 | # exp_dir=r"E:\codes\py39\dataset\mi-test"
107 | # n_p=16
108 | # f = open("%s/log_extract_f0.log"%exp_dir, "w")
109 | printt(" ".join(sys.argv))
110 | featureInput = FeatureInput()
111 | paths = []
112 | inp_root = "%s/1_16k_wavs" % (exp_dir)
113 | opt_root1 = "%s/2a_f0" % (exp_dir)
114 | opt_root2 = "%s/2b-f0nsf" % (exp_dir)
115 |
116 | os.makedirs(opt_root1, exist_ok=True)
117 | os.makedirs(opt_root2, exist_ok=True)
118 | for name in sorted(list(os.listdir(inp_root))):
119 | inp_path = "%s/%s" % (inp_root, name)
120 | if "spec" in inp_path:
121 | continue
122 | opt_path1 = "%s/%s" % (opt_root1, name)
123 | opt_path2 = "%s/%s" % (opt_root2, name)
124 | paths.append([inp_path, opt_path1, opt_path2])
125 | try:
126 | featureInput.go(paths[i_part::n_part], "rmvpe")
127 | except Exception:
128 | printt("f0_all_fail-%s" % (traceback.format_exc()))
129 | # ps = []
130 | # for i in range(n_p):
131 | # p = Process(
132 | # target=featureInput.go,
133 | # args=(
134 | # paths[i::n_p],
135 | # f0method,
136 | # ),
137 | # )
138 | # ps.append(p)
139 | # p.start()
140 | # for i in range(n_p):
141 | # ps[i].join()
142 |
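This script shards its file list as paths[i_part::n_part], one process per GPU. A launcher sketch matching the argv layout above (experiment path and GPU list are hypothetical):

import subprocess
import sys

exp_dir = "logs/my-experiment"
gpus = ["0", "1"]  # one shard per GPU index
procs = [
    subprocess.Popen([
        sys.executable,
        "infer/modules/train/extract/extract_f0_rmvpe.py",
        str(len(gpus)),  # n_part
        str(i),          # i_part: this shard handles paths[i::n_part]
        gpu,             # i_gpu -> CUDA_VISIBLE_DEVICES
        exp_dir,
        "True",          # is_half
    ])
    for i, gpu in enumerate(gpus)
]
for p in procs:
    p.wait()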
--------------------------------------------------------------------------------
/rvc/infer/modules/train/extract/extract_f0_rmvpe_dml.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import traceback
4 |
5 | import parselmouth
6 |
7 | now_dir = os.getcwd()
8 | sys.path.append(now_dir)
9 | import logging
10 |
11 | import numpy as np
12 | import pyworld
13 |
14 | from infer.lib.audio import load_audio
15 |
16 | logging.getLogger("numba").setLevel(logging.WARNING)
17 |
18 | exp_dir = sys.argv[1]
19 | import torch_directml
20 |
21 | device = torch_directml.device(torch_directml.default_device())
22 | f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
23 |
24 |
25 | def printt(strr):
26 | print(strr)
27 | f.write("%s\n" % strr)
28 | f.flush()
29 |
30 |
31 | class FeatureInput(object):
32 | def __init__(self, samplerate=16000, hop_size=160):
33 | self.fs = samplerate
34 | self.hop = hop_size
35 |
36 | self.f0_bin = 256
37 | self.f0_max = 1100.0
38 | self.f0_min = 50.0
39 | self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
40 | self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
41 |
42 | def compute_f0(self, path, f0_method):
43 | x = load_audio(path, self.fs)
44 | # p_len = x.shape[0] // self.hop
45 | if f0_method == "rmvpe":
46 | if not hasattr(self, "model_rmvpe"):
47 | from infer.lib.rmvpe import RMVPE
48 |
49 | print("Loading rmvpe model")
50 | self.model_rmvpe = RMVPE(
51 | "assets/rmvpe/rmvpe.pt", is_half=False, device=device
52 | )
53 | f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
54 | return f0
55 |
56 | def coarse_f0(self, f0):
57 | f0_mel = 1127 * np.log(1 + f0 / 700)
58 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
59 | self.f0_bin - 2
60 | ) / (self.f0_mel_max - self.f0_mel_min) + 1
61 |
62 | # use 0 or 1
63 | f0_mel[f0_mel <= 1] = 1
64 | f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
65 | f0_coarse = np.rint(f0_mel).astype(int)
66 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
67 | f0_coarse.max(),
68 | f0_coarse.min(),
69 | )
70 | return f0_coarse
71 |
72 | def go(self, paths, f0_method):
73 | if len(paths) == 0:
74 | printt("no-f0-todo")
75 | else:
76 | printt("todo-f0-%s" % len(paths))
77 | n = max(len(paths) // 5, 1)  # each process prints at most 5 progress lines
78 | for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
79 | try:
80 | if idx % n == 0:
81 | printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path))
82 | if (
83 | os.path.exists(opt_path1 + ".npy")
84 | and os.path.exists(opt_path2 + ".npy")
85 | ):
86 | continue
87 | featur_pit = self.compute_f0(inp_path, f0_method)
88 | np.save(
89 | opt_path2,
90 | featur_pit,
91 | allow_pickle=False,
92 | ) # nsf
93 | coarse_pit = self.coarse_f0(featur_pit)
94 | np.save(
95 | opt_path1,
96 | coarse_pit,
97 | allow_pickle=False,
98 | ) # ori
99 | except Exception:
100 | printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc()))
101 |
102 |
103 | if __name__ == "__main__":
104 | # exp_dir=r"E:\codes\py39\dataset\mi-test"
105 | # n_p=16
106 | # f = open("%s/log_extract_f0.log"%exp_dir, "w")
107 | printt(" ".join(sys.argv))
108 | featureInput = FeatureInput()
109 | paths = []
110 | inp_root = "%s/1_16k_wavs" % (exp_dir)
111 | opt_root1 = "%s/2a_f0" % (exp_dir)
112 | opt_root2 = "%s/2b-f0nsf" % (exp_dir)
113 |
114 | os.makedirs(opt_root1, exist_ok=True)
115 | os.makedirs(opt_root2, exist_ok=True)
116 | for name in sorted(list(os.listdir(inp_root))):
117 | inp_path = "%s/%s" % (inp_root, name)
118 | if "spec" in inp_path:
119 | continue
120 | opt_path1 = "%s/%s" % (opt_root1, name)
121 | opt_path2 = "%s/%s" % (opt_root2, name)
122 | paths.append([inp_path, opt_path1, opt_path2])
123 | try:
124 | featureInput.go(paths, "rmvpe")
125 | except Exception:
126 | printt("f0_all_fail-%s" % (traceback.format_exc()))
127 | # ps = []
128 | # for i in range(n_p):
129 | # p = Process(
130 | # target=featureInput.go,
131 | # args=(
132 | # paths[i::n_p],
133 | # f0method,
134 | # ),
135 | # )
136 | # ps.append(p)
137 | # p.start()
138 | # for i in range(n_p):
139 | # ps[i].join()
140 |
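The only difference from the CUDA variant above is the device handle. A minimal sketch of the torch_directml calls this file relies on (assumes the torch-directml package is installed):

import torch
import torch_directml

dml = torch_directml.device(torch_directml.default_device())
x = torch.randn(4).to(dml)  # tensors (and the RMVPE model) move to the DML device
print(x.device)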
--------------------------------------------------------------------------------
/rvc/infer/modules/train/extract_feature_print.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import traceback
4 |
5 | os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
6 | os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
7 |
8 | device = sys.argv[1]
9 | n_part = int(sys.argv[2])
10 | i_part = int(sys.argv[3])
11 | if len(sys.argv) == 7:
12 | exp_dir = sys.argv[4]
13 | version = sys.argv[5]
14 | is_half = sys.argv[6].lower() == "true"
15 | else:
16 | i_gpu = sys.argv[4]
17 | exp_dir = sys.argv[5]
18 | os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu)
19 | version = sys.argv[6]
20 | is_half = sys.argv[7].lower() == "true"
21 | import fairseq
22 | import numpy as np
23 | import soundfile as sf
24 | import torch
25 | import torch.nn.functional as F
26 |
27 | if "privateuseone" not in device:
28 | device = "cpu"
29 | if torch.cuda.is_available():
30 | device = "cuda"
31 | elif torch.backends.mps.is_available():
32 | device = "mps"
33 | else:
34 | import torch_directml
35 |
36 | device = torch_directml.device(torch_directml.default_device())
37 |
38 | def forward_dml(ctx, x, scale):
39 | ctx.scale = scale
40 | res = x.clone().detach()
41 | return res
42 |
43 | fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml  # bypass GradMultiply's custom autograd op, which DirectML does not support
44 |
45 | f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
46 |
47 |
48 | def printt(strr):
49 | print(strr)
50 | f.write("%s\n" % strr)
51 | f.flush()
52 |
53 |
54 | printt(" ".join(sys.argv))
55 | model_path = "assets/hubert/hubert_base.pt"
56 |
57 | printt("exp_dir: " + exp_dir)
58 | wavPath = "%s/1_16k_wavs" % exp_dir
59 | outPath = (
60 | "%s/3_feature256" % exp_dir if version == "v1" else "%s/3_feature768" % exp_dir
61 | )
62 | os.makedirs(outPath, exist_ok=True)
63 |
64 |
65 | # wave must be 16k, hop_size=320
66 | def readwave(wav_path, normalize=False):
67 | wav, sr = sf.read(wav_path)
68 | assert sr == 16000
69 | feats = torch.from_numpy(wav).float()
70 | if feats.dim() == 2: # double channels
71 | feats = feats.mean(-1)
72 | assert feats.dim() == 1, feats.dim()
73 | if normalize:
74 | with torch.no_grad():
75 | feats = F.layer_norm(feats, feats.shape)
76 | feats = feats.view(1, -1)
77 | return feats
78 |
79 |
80 | # HuBERT model
81 | printt("load model(s) from {}".format(model_path))
82 | # skip extraction if the HuBERT checkpoint is missing
83 | if not os.access(model_path, os.F_OK):
84 | printt(
85 | "Error: extraction aborted because %s does not exist; you can download it from https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main"
86 | % model_path
87 | )
88 | exit(0)
89 | models, saved_cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task(
90 | [model_path],
91 | suffix="",
92 | )
93 | model = models[0]
94 | model = model.to(device)
95 | printt("move model to %s" % device)
96 | if is_half:
97 | if device not in ["mps", "cpu"]:
98 | model = model.half()
99 | model.eval()
100 |
101 | todo = sorted(list(os.listdir(wavPath)))[i_part::n_part]
102 | n = max(1, len(todo) // 10)  # print at most 10 progress lines
103 | if len(todo) == 0:
104 | printt("no-feature-todo")
105 | else:
106 | printt("all-feature-%s" % len(todo))
107 | for idx, file in enumerate(todo):
108 | try:
109 | if file.endswith(".wav"):
110 | wav_path = "%s/%s" % (wavPath, file)
111 | out_path = "%s/%s" % (outPath, file.replace("wav", "npy"))
112 |
113 | if os.path.exists(out_path):
114 | continue
115 |
116 | feats = readwave(wav_path, normalize=saved_cfg.task.normalize)
117 | padding_mask = torch.BoolTensor(feats.shape).fill_(False)
118 | inputs = {
119 | "source": (
120 | feats.half().to(device)
121 | if is_half and device not in ["mps", "cpu"]
122 | else feats.to(device)
123 | ),
124 | "padding_mask": padding_mask.to(device),
125 | "output_layer": 9 if version == "v1" else 12,  # layer 9 (v1) or 12 (v2)
126 | }
127 | with torch.no_grad():
128 | logits = model.extract_features(**inputs)
129 | feats = (
130 | model.final_proj(logits[0]) if version == "v1" else logits[0]
131 | )
132 |
133 | feats = feats.squeeze(0).float().cpu().numpy()
134 | if np.isnan(feats).sum() == 0:
135 | np.save(out_path, feats, allow_pickle=False)
136 | else:
137 | printt("%s-contains nan" % file)
138 | if idx % n == 0:
139 | printt("now-%s,all-%s,%s,%s" % (idx, len(todo), file, feats.shape))
140 | except Exception:
141 | printt(traceback.format_exc())
142 | printt("all-feature-done")
143 |
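For orientation: v1 saves 256-dim features (HuBERT layer 9 through final_proj), v2 saves 768-dim features (layer 12). HuBERT's hop is 320 samples at 16 kHz (20 ms) while the f0 extractors above use 160 (10 ms), so the f0 arrays have roughly twice as many frames; later pipeline stages are expected to reconcile the two rates. A quick check over hypothetical output paths:

import numpy as np

feats = np.load("logs/exp/3_feature768/0_0.npy")  # (n_frames, 768) for v2
f0 = np.load("logs/exp/2a_f0/0_0.wav.npy")        # coarse f0 at a 10 ms hop
print(feats.shape, f0.shape)  # f0 has ~2x the frames of feats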
--------------------------------------------------------------------------------
/rvc/infer/modules/train/preprocess.py:
--------------------------------------------------------------------------------
1 | import multiprocessing
2 | import os
3 | import sys
4 |
5 | from scipy import signal
6 |
7 | now_dir = os.getcwd()
8 | sys.path.append(now_dir)
9 | print(*sys.argv[1:])
10 | inp_root = sys.argv[1]
11 | sr = int(sys.argv[2])
12 | n_p = int(sys.argv[3])
13 | exp_dir = sys.argv[4]
14 | noparallel = sys.argv[5] == "True"
15 | per = float(sys.argv[6])
16 | import os
17 | import traceback
18 |
19 | import librosa
20 | import numpy as np
21 | from scipy.io import wavfile
22 |
23 | from infer.lib.audio import load_audio
24 | from infer.lib.slicer2 import Slicer
25 |
26 | f = open("%s/preprocess.log" % exp_dir, "a+")
27 |
28 |
29 | def println(strr):
30 | print(strr)
31 | f.write("%s\n" % strr)
32 | f.flush()
33 |
34 |
35 | class PreProcess:
36 | def __init__(self, sr, exp_dir, per=3.7):
37 | self.slicer = Slicer(
38 | sr=sr,
39 | threshold=-42,
40 | min_length=1500,
41 | min_interval=400,
42 | hop_size=15,
43 | max_sil_kept=500,
44 | )
45 | self.sr = sr
46 | self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr)  # 5th-order 48 Hz high-pass to strip DC and rumble
47 | self.per = per
48 | self.overlap = 0.3
49 | self.tail = self.per + self.overlap
50 | self.max = 0.9
51 | self.alpha = 0.75
52 | self.exp_dir = exp_dir
53 | self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir
54 | self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir
55 | os.makedirs(self.exp_dir, exist_ok=True)
56 | os.makedirs(self.gt_wavs_dir, exist_ok=True)
57 | os.makedirs(self.wavs16k_dir, exist_ok=True)
58 |
59 | def norm_write(self, tmp_audio, idx0, idx1):
60 | tmp_max = np.abs(tmp_audio).max()
61 | if tmp_max > 2.5:
62 | println("%s-%s-%s-filtered" % (idx0, idx1, tmp_max))
63 | return
64 | tmp_audio = (tmp_audio / tmp_max * (self.max * self.alpha)) + (
65 | 1 - self.alpha
66 | ) * tmp_audio
67 | wavfile.write(
68 | "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1),
69 | self.sr,
70 | tmp_audio.astype(np.float32),
71 | )
72 | tmp_audio = librosa.resample(
73 | tmp_audio, orig_sr=self.sr, target_sr=16000
74 | ) # , res_type="soxr_vhq"
75 | wavfile.write(
76 | "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
77 | 16000,
78 | tmp_audio.astype(np.float32),
79 | )
80 |
81 | def pipeline(self, path, idx0):
82 | try:
83 | audio = load_audio(path, self.sr)
84 | # zero phased digital filter cause pre-ringing noise...
85 | # audio = signal.filtfilt(self.bh, self.ah, audio)
86 | audio = signal.lfilter(self.bh, self.ah, audio)
87 |
88 | idx1 = 0
89 | for audio in self.slicer.slice(audio):
90 | i = 0
91 | while True:
92 | start = int(self.sr * (self.per - self.overlap) * i)
93 | i += 1
94 | if len(audio[start:]) > self.tail * self.sr:
95 | tmp_audio = audio[start : start + int(self.per * self.sr)]
96 | self.norm_write(tmp_audio, idx0, idx1)
97 | idx1 += 1
98 | else:
99 | tmp_audio = audio[start:]
100 | idx1 += 1
101 | break
102 | self.norm_write(tmp_audio, idx0, idx1)
103 | println("%s\t-> Success" % path)
104 | except Exception:
105 | println("%s\t-> %s" % (path, traceback.format_exc()))
106 |
107 | def pipeline_mp(self, infos):
108 | for path, idx0 in infos:
109 | self.pipeline(path, idx0)
110 |
111 | def pipeline_mp_inp_dir(self, inp_root, n_p):
112 | try:
113 | infos = [
114 | ("%s/%s" % (inp_root, name), idx)
115 | for idx, name in enumerate(sorted(list(os.listdir(inp_root))))
116 | ]
117 | if noparallel:
118 | for i in range(n_p):
119 | self.pipeline_mp(infos[i::n_p])
120 | else:
121 | ps = []
122 | for i in range(n_p):
123 | p = multiprocessing.Process(
124 | target=self.pipeline_mp, args=(infos[i::n_p],)
125 | )
126 | ps.append(p)
127 | p.start()
128 | for i in range(n_p):
129 | ps[i].join()
130 | except Exception:
131 | println("Fail. %s" % traceback.format_exc())
132 |
133 |
134 | def preprocess_trainset(inp_root, sr, n_p, exp_dir, per):
135 | pp = PreProcess(sr, exp_dir, per)
136 | println("start preprocess")
137 | pp.pipeline_mp_inp_dir(inp_root, n_p)
138 | println("end preprocess")
139 |
140 |
141 | if __name__ == "__main__":
142 | preprocess_trainset(inp_root, sr, n_p, exp_dir, per)
143 |
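The slicing loop in pipeline() cuts each voiced chunk into windows of per seconds starting every per - overlap seconds, emitting one short tail segment once less than per + overlap seconds remain. The same arithmetic in isolation:

# hypothetical 12-second chunk with the defaults per=3.7, overlap=0.3
per, overlap, audio_len = 3.7, 0.3, 12.0
segments, i = [], 0
while True:
    start = (per - overlap) * i
    i += 1
    if audio_len - start > per + overlap:
        segments.append((start, start + per))
    else:
        segments.append((start, audio_len))  # tail segment
        break
print(segments)  # [(0.0, 3.7), (3.4, 7.1), (6.8, 10.5), (10.2, 12.0)] up to float rounding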
--------------------------------------------------------------------------------
/rvc/infer/modules/uvr5/modules.py:
--------------------------------------------------------------------------------
1 | import os
2 | import traceback
3 | import logging
4 |
5 | logger = logging.getLogger(__name__)
6 |
7 | import ffmpeg
8 | import torch
9 |
10 | from configs.config import Config
11 | from infer.modules.uvr5.mdxnet import MDXNetDereverb
12 | from infer.modules.uvr5.vr import AudioPre, AudioPreDeEcho
13 |
14 | config = Config()
15 |
16 |
17 | def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0):
18 | infos = []
19 | try:
20 | inp_root = inp_root.strip(' "\n')
21 | save_root_vocal = (
22 | save_root_vocal.strip(' "\n')
23 | )
24 | save_root_ins = (
25 | save_root_ins.strip(' "\n')
26 | )
27 | if model_name == "onnx_dereverb_By_FoxJoy":
28 | pre_fun = MDXNetDereverb(15, config.device)
29 | else:
30 | func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho
31 | pre_fun = func(
32 | agg=int(agg),
33 | model_path=os.path.join(
34 | os.getenv("weight_uvr5_root"), model_name + ".pth"
35 | ),
36 | device=config.device,
37 | is_half=config.is_half,
38 | )
39 | is_hp3 = "HP3" in model_name
40 | if inp_root != "":
41 | paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)]
42 | else:
43 | paths = [path.name for path in paths]
44 | for path in paths:
45 | inp_path = os.path.join(inp_root, path)
46 | need_reformat = 1
47 | done = 0
48 | try:
49 | info = ffmpeg.probe(inp_path, cmd="ffprobe")
50 | if (
51 | info["streams"][0]["channels"] == 2
52 | and info["streams"][0]["sample_rate"] == "44100"
53 | ):
54 | need_reformat = 0
55 | pre_fun._path_audio_(
56 | inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3
57 | )
58 | done = 1
59 | except Exception:
60 | need_reformat = 1
61 | traceback.print_exc()
62 | if need_reformat == 1:
63 | tmp_path = "%s/%s.reformatted.wav" % (
64 | os.environ["TEMP"],
65 | os.path.basename(inp_path),
66 | )
67 | os.system(
68 | 'ffmpeg -i "%s" -vn -acodec pcm_s16le -ac 2 -ar 44100 "%s" -y'
69 | % (inp_path, tmp_path)
70 | )
71 | inp_path = tmp_path
72 | try:
73 | if done == 0:
74 | pre_fun._path_audio_(
75 | inp_path, save_root_ins, save_root_vocal, format0
76 | )
77 | infos.append("%s->Success" % (os.path.basename(inp_path)))
78 | yield "\n".join(infos)
79 | except Exception:
80 | try:
81 | if done == 0:
82 | pre_fun._path_audio_(
83 | inp_path, save_root_ins, save_root_vocal, format0
84 | )
85 | infos.append("%s->Success" % (os.path.basename(inp_path)))
86 | yield "\n".join(infos)
87 | except Exception:
88 | infos.append(
89 | "%s->%s" % (os.path.basename(inp_path), traceback.format_exc())
90 | )
91 | yield "\n".join(infos)
92 | except Exception:
93 | infos.append(traceback.format_exc())
94 | yield "\n".join(infos)
95 | finally:
96 | try:
97 | if model_name == "onnx_dereverb_By_FoxJoy":
98 | del pre_fun.pred.model
99 | del pre_fun.pred.model_
100 | else:
101 | del pre_fun.model
102 | del pre_fun
103 | except Exception:
104 | traceback.print_exc()
105 | if torch.cuda.is_available():
106 | torch.cuda.empty_cache()
107 | logger.info("Executed torch.cuda.empty_cache()")
108 | yield "\n".join(infos)
109 |
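The reformat check above requires stereo 44.1 kHz input before separation. The probe logic in isolation (the file name is hypothetical):

import ffmpeg  # ffmpeg-python, as imported by the module above

def needs_reformat(path: str) -> bool:
    stream = ffmpeg.probe(path, cmd="ffprobe")["streams"][0]
    return not (stream.get("channels") == 2 and stream.get("sample_rate") == "44100")

print(needs_reformat("song.mp3"))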
--------------------------------------------------------------------------------
/rvc/infer/modules/vc/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/modules/vc/__init__.py
--------------------------------------------------------------------------------
/rvc/infer/modules/vc/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from fairseq import checkpoint_utils
4 |
5 |
6 | def get_index_path_from_model(sid):
7 | return next(
8 | (
9 | f
10 | for f in [
11 | os.path.join(root, name)
12 | for root, _, files in os.walk(os.getenv("index_root"), topdown=False)
13 | for name in files
14 | if name.endswith(".index") and "trained" not in name
15 | ]
16 | if sid.split(".")[0] in f
17 | ),
18 | "",
19 | )
20 |
21 |
22 | def load_hubert(config):
23 | models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
24 | [os.getenv("hubert_base")],
25 | suffix="",
26 | )
27 | hubert_model = models[0]
28 | hubert_model = hubert_model.to(config.device)
29 | if config.is_half:
30 | hubert_model = hubert_model.half()
31 | else:
32 | hubert_model = hubert_model.float()
33 | return hubert_model.eval()
34 |
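get_index_path_from_model walks index_root for the first .index file (excluding "trained" checkpoints) whose path contains the model's base name. A usage sketch with a hypothetical layout:

import os

os.environ["index_root"] = "logs"  # normally set by the application
# for sid "mi-test.pth", a file such as
#   logs/mi-test/added_IVF256_Flat_nprobe_1_mi-test_v2.index
# would be returned; "" if nothing matches
print(get_index_path_from_model("mi-test.pth"))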
--------------------------------------------------------------------------------
/rvc/logs/mute/0_gt_wavs/mute32k.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/0_gt_wavs/mute32k.wav
--------------------------------------------------------------------------------
/rvc/logs/mute/0_gt_wavs/mute40k.spec.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/0_gt_wavs/mute40k.spec.pt
--------------------------------------------------------------------------------
/rvc/logs/mute/0_gt_wavs/mute40k.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/0_gt_wavs/mute40k.wav
--------------------------------------------------------------------------------
/rvc/logs/mute/0_gt_wavs/mute48k.spec.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/0_gt_wavs/mute48k.spec.pt
--------------------------------------------------------------------------------
/rvc/logs/mute/0_gt_wavs/mute48k.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/0_gt_wavs/mute48k.wav
--------------------------------------------------------------------------------
/rvc/logs/mute/1_16k_wavs/mute.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/1_16k_wavs/mute.wav
--------------------------------------------------------------------------------
/rvc/logs/mute/2a_f0/mute.wav.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/2a_f0/mute.wav.npy
--------------------------------------------------------------------------------
/rvc/logs/mute/2b-f0nsf/mute.wav.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/2b-f0nsf/mute.wav.npy
--------------------------------------------------------------------------------
/rvc/logs/mute/3_feature256/mute.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/3_feature256/mute.npy
--------------------------------------------------------------------------------
/rvc/logs/mute/3_feature768/mute.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/3_feature768/mute.npy
--------------------------------------------------------------------------------
/web.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/web.png
--------------------------------------------------------------------------------
/web/js/alertMSG.js:
--------------------------------------------------------------------------------
1 | import { app } from "../../../scripts/app.js";
2 | import { api } from "../../../scripts/api.js";
3 | app.registerExtension({
4 | name: "RVC.alertMSG",
5 | async beforeRegisterNodeDef(nodeType, nodeData, app) {
6 | if (nodeData?.name == "RVC_Train") {
7 | nodeType.prototype.onExecuted = function (data) {
8 | // alert("Success!you can find weights in:\n" + data.finetune[0] + "\n" + data.finetune[1] + "\n Now you can tts or inference");
9 | let msg = "Success! You can find the weights in:\n" + data.train[0] + "\nWould you like to reboot the server to run inference?"
10 | if (confirm(msg)) {
11 | try {
12 | api.fetchApi("/rvc/reboot");
13 | }
14 | catch(exception) {
15 | console.log(exception);
16 | }
17 | }
18 | }
19 | }
20 | },
21 | });
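The confirm dialog above calls api.fetchApi("/rvc/reboot"); the actual handler lives in the Python node code. A minimal sketch of what such a route could look like, assuming ComfyUI's PromptServer aiohttp routes:

import os
import sys

from server import PromptServer  # ComfyUI's embedded aiohttp server

@PromptServer.instance.routes.get("/rvc/reboot")
async def rvc_reboot(request):
    # restart the current Python process in place; the request never gets a reply
    os.execv(sys.executable, [sys.executable] + sys.argv)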
--------------------------------------------------------------------------------
/web/js/previewAudio.js:
--------------------------------------------------------------------------------
1 | import { app } from "../../../scripts/app.js";
2 | import { api } from '../../../scripts/api.js'
3 |
4 | function fitHeight(node) {
5 | node.setSize([node.size[0], node.computeSize([node.size[0], node.size[1]])[1]])
6 | node?.graph?.setDirtyCanvas(true);
7 | }
8 | function chainCallback(object, property, callback) {
9 | if (object == undefined) {
10 | //This should not happen.
11 | console.error("Tried to add callback to non-existent object")
12 | return;
13 | }
14 | if (property in object) {
15 | const callback_orig = object[property]
16 | object[property] = function () {
17 | const r = callback_orig.apply(this, arguments);
18 | callback.apply(this, arguments);
19 | return r
20 | };
21 | } else {
22 | object[property] = callback;
23 | }
24 | }
25 |
26 | function addPreviewOptions(nodeType) {
27 | chainCallback(nodeType.prototype, "getExtraMenuOptions", function(_, options) {
28 | // The intended way of appending options is returning a list of extra options,
29 | // but this isn't used in widgetInputs.js and would require
30 | // less generalization of chainCallback
31 | let optNew = []
32 | try {
33 | const previewWidget = this.widgets.find((w) => w.name === "audiopreview");
34 |
35 | let url = null
36 | if (previewWidget.audioEl?.hidden == false && previewWidget.audioEl.src) {
37 | //Use full quality audio
38 | //url = api.apiURL('/view?' + new URLSearchParams(previewWidget.value.params));
39 | url = previewWidget.audioEl.src
40 | }
41 | if (url) {
42 | optNew.push(
43 | {
44 | content: "Open preview",
45 | callback: () => {
46 | window.open(url, "_blank")
47 | },
48 | },
49 | {
50 | content: "Save preview",
51 | callback: () => {
52 | const a = document.createElement("a");
53 | a.href = url;
54 | a.setAttribute("download", new URLSearchParams(previewWidget.value.params).get("filename"));
55 | document.body.append(a);
56 | a.click();
57 | requestAnimationFrame(() => a.remove());
58 | },
59 | }
60 | );
61 | }
62 | if(options.length > 0 && options[0] != null && optNew.length > 0) {
63 | optNew.push(null);
64 | }
65 | options.unshift(...optNew);
66 |
67 | } catch (error) {
68 | console.log(error);
69 | }
70 |
71 | });
72 | }
73 | function previewAudio(node,file,type){
74 | var element = document.createElement("div");
75 | const previewNode = node;
76 | var previewWidget = node.addDOMWidget("audiopreview", "preview", element, {
77 | serialize: false,
78 | hideOnZoom: false,
79 | getValue() {
80 | return element.value;
81 | },
82 | setValue(v) {
83 | element.value = v;
84 | },
85 | });
86 | previewWidget.computeSize = function(width) {
87 | if (this.aspectRatio && !this.parentEl.hidden) {
88 | let height = (previewNode.size[0]-20)/ this.aspectRatio + 10;
89 | if (!(height > 0)) {
90 | height = 0;
91 | }
92 | this.computedHeight = height + 10;
93 | return [width, height];
94 | }
95 | return [width, -4];//no loaded src, widget should not display
96 | }
97 | // element.style['pointer-events'] = "none"
98 | previewWidget.value = {hidden: false, paused: false, params: {}}
99 | previewWidget.parentEl = document.createElement("div");
100 | previewWidget.parentEl.className = "audio_preview";
101 | previewWidget.parentEl.style['width'] = "100%"
102 | element.appendChild(previewWidget.parentEl);
103 | previewWidget.audioEl = document.createElement("audio");
104 | previewWidget.audioEl.controls = true;
105 | previewWidget.audioEl.loop = false;
106 | previewWidget.audioEl.muted = false;
107 | previewWidget.audioEl.style['width'] = "100%"
108 | previewWidget.audioEl.addEventListener("loadedmetadata", () => {
109 |
110 | previewWidget.aspectRatio = previewWidget.audioEl.audioWidth / previewWidget.audioEl.audioHeight;
111 | fitHeight(previewNode);
112 | });
113 | previewWidget.audioEl.addEventListener("error", () => {
114 | //TODO: consider a way to properly notify the user why a preview isn't shown.
115 | previewWidget.parentEl.hidden = true;
116 | fitHeight(previewNode);
117 | });
118 |
119 | let params = {
120 | "filename": file,
121 | "type": type,
122 | }
123 |
124 | previewWidget.parentEl.hidden = previewWidget.value.hidden;
125 | previewWidget.audioEl.autoplay = !previewWidget.value.paused && !previewWidget.value.hidden;
126 | let target_width = 256
127 | if (element.style?.width) {
128 | //overscale to allow scrolling. Endpoint won't return higher than native
129 | target_width = element.style.width.slice(0,-2)*2;
130 | }
131 | if (!params.force_size || params.force_size.includes("?") || params.force_size == "Disabled") {
132 | params.force_size = target_width+"x?"
133 | } else {
134 | let size = params.force_size.split("x")
135 | let ar = parseInt(size[0])/parseInt(size[1])
136 | params.force_size = target_width+"x"+(target_width/ar)
137 | }
138 |
139 | previewWidget.audioEl.src = api.apiURL('/view?' + new URLSearchParams(params));
140 |
141 | previewWidget.audioEl.hidden = false;
142 | previewWidget.parentEl.appendChild(previewWidget.audioEl)
143 | }
144 |
145 | app.registerExtension({
146 | name: "RVC.AudioPreviewer",
147 | async beforeRegisterNodeDef(nodeType, nodeData, app) {
148 | if (nodeData?.name == "PreViewAudio") {
149 | nodeType.prototype.onExecuted = function (data) {
150 | previewAudio(this, data.audio[0], data.audio[1]);
151 | }
152 | addPreviewOptions(nodeType)
153 | }
154 | }
155 | });
--------------------------------------------------------------------------------
/web/js/refreshPath.js:
--------------------------------------------------------------------------------
1 | import { app } from "../../../scripts/app.js";
2 | import { api } from '../../../scripts/api.js'
3 | import { ComfyWidgets } from "../../../scripts/widgets.js"
4 | function rebootAPI() {
5 | if (confirm("Are you sure you'd like to reboot the server to refresh weights path?")) {
6 | try {
7 | api.fetchApi("/rvc/reboot");
8 | }
9 | catch(exception) {
10 | console.log(exception);
11 | }
12 | return true;
13 | }
14 |
15 | return false;
16 | }
17 | function pathRefresh(node, inputName, inputData, app) {
18 | // Create the button widget for selecting the files
19 | let refreshWidget = node.addWidget("button", "REBOOT TO REFRESH SID LIST", "refresh", () => {
20 | rebootAPI()
21 | });
22 |
23 | refreshWidget.serialize = false;
24 |
25 | return { widget: refreshWidget };
26 | }
27 | ComfyWidgets.PATHREFRESH = pathRefresh;
28 |
29 | app.registerExtension({
30 | name: "RVC.RefreshPath",
31 | async beforeRegisterNodeDef(nodeType, nodeData, app) {
32 | if (nodeData?.name == "RVC_Infer") {
33 | nodeData.input.required.upload = ["PATHREFRESH"];
34 | }
35 | },
36 | });
--------------------------------------------------------------------------------
/wechat.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/wechat.jpg
--------------------------------------------------------------------------------