├── LICENSE ├── README.md ├── __init__.py ├── donate.jpg ├── download_models.py ├── nodes.py ├── requirements.txt ├── rvc ├── __init__.py ├── configs │ ├── __pycache__ │ │ └── config.cpython-310.pyc │ ├── config.json │ ├── config.py │ ├── inuse │ │ ├── .gitignore │ │ ├── v1 │ │ │ └── .gitignore │ │ └── v2 │ │ │ └── .gitignore │ ├── v1 │ │ ├── 32k.json │ │ ├── 40k.json │ │ └── 48k.json │ └── v2 │ │ ├── 32k.json │ │ └── 48k.json ├── i18n │ ├── __pycache__ │ │ └── i18n.cpython-310.pyc │ ├── i18n.py │ ├── locale │ │ ├── en_US.json │ │ ├── es_ES.json │ │ ├── fr_FR.json │ │ ├── it_IT.json │ │ ├── ja_JP.json │ │ ├── ko_KR.json │ │ ├── pt_BR.json │ │ ├── ru_RU.json │ │ ├── tr_TR.json │ │ ├── zh_CN.json │ │ ├── zh_HK.json │ │ ├── zh_SG.json │ │ └── zh_TW.json │ ├── locale_diff.py │ └── scan_i18n.py ├── infer │ ├── lib │ │ ├── __pycache__ │ │ │ ├── audio.cpython-310.pyc │ │ │ ├── rmvpe.cpython-310.pyc │ │ │ ├── rvcmd.cpython-310.pyc │ │ │ └── slicer2.cpython-310.pyc │ │ ├── audio.py │ │ ├── infer_pack │ │ │ ├── __pycache__ │ │ │ │ ├── attentions.cpython-310.pyc │ │ │ │ ├── commons.cpython-310.pyc │ │ │ │ ├── models.cpython-310.pyc │ │ │ │ ├── modules.cpython-310.pyc │ │ │ │ └── transforms.cpython-310.pyc │ │ │ ├── attentions.py │ │ │ ├── attentions_onnx.py │ │ │ ├── commons.py │ │ │ ├── models.py │ │ │ ├── models_onnx.py │ │ │ ├── modules.py │ │ │ ├── modules │ │ │ │ └── F0Predictor │ │ │ │ │ ├── DioF0Predictor.py │ │ │ │ │ ├── F0Predictor.py │ │ │ │ │ ├── HarvestF0Predictor.py │ │ │ │ │ ├── PMF0Predictor.py │ │ │ │ │ └── __init__.py │ │ │ ├── onnx_inference.py │ │ │ └── transforms.py │ │ ├── jit │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ └── __init__.cpython-310.pyc │ │ │ ├── get_hubert.py │ │ │ ├── get_rmvpe.py │ │ │ └── get_synthesizer.py │ │ ├── rmvpe.py │ │ ├── rtrvc.py │ │ ├── rvcmd.py │ │ ├── slicer2.py │ │ ├── train │ │ │ ├── __pycache__ │ │ │ │ ├── data_utils.cpython-310.pyc │ │ │ │ ├── losses.cpython-310.pyc │ │ │ │ ├── mel_processing.cpython-310.pyc │ │ │ │ ├── process_ckpt.cpython-310.pyc │ │ │ │ └── utils.cpython-310.pyc │ │ │ ├── data_utils.py │ │ │ ├── losses.py │ │ │ ├── mel_processing.py │ │ │ ├── process_ckpt.py │ │ │ └── utils.py │ │ └── uvr5_pack │ │ │ ├── lib_v5 │ │ │ ├── dataset.py │ │ │ ├── layers.py │ │ │ ├── layers_123812KB .py │ │ │ ├── layers_123821KB.py │ │ │ ├── layers_33966KB.py │ │ │ ├── layers_537227KB.py │ │ │ ├── layers_537238KB.py │ │ │ ├── layers_new.py │ │ │ ├── model_param_init.py │ │ │ ├── modelparams │ │ │ │ ├── 1band_sr16000_hl512.json │ │ │ │ ├── 1band_sr32000_hl512.json │ │ │ │ ├── 1band_sr33075_hl384.json │ │ │ │ ├── 1band_sr44100_hl1024.json │ │ │ │ ├── 1band_sr44100_hl256.json │ │ │ │ ├── 1band_sr44100_hl512.json │ │ │ │ ├── 1band_sr44100_hl512_cut.json │ │ │ │ ├── 2band_32000.json │ │ │ │ ├── 2band_44100_lofi.json │ │ │ │ ├── 2band_48000.json │ │ │ │ ├── 3band_44100.json │ │ │ │ ├── 3band_44100_mid.json │ │ │ │ ├── 3band_44100_msb2.json │ │ │ │ ├── 4band_44100.json │ │ │ │ ├── 4band_44100_mid.json │ │ │ │ ├── 4band_44100_msb.json │ │ │ │ ├── 4band_44100_msb2.json │ │ │ │ ├── 4band_44100_reverse.json │ │ │ │ ├── 4band_44100_sw.json │ │ │ │ ├── 4band_v2.json │ │ │ │ ├── 4band_v2_sn.json │ │ │ │ ├── 4band_v3.json │ │ │ │ └── ensemble.json │ │ │ ├── nets.py │ │ │ ├── nets_123812KB.py │ │ │ ├── nets_123821KB.py │ │ │ ├── nets_33966KB.py │ │ │ ├── nets_537227KB.py │ │ │ ├── nets_537238KB.py │ │ │ ├── nets_61968KB.py │ │ │ ├── nets_new.py │ │ │ └── spec_utils.py │ │ │ ├── name_params.json │ │ │ └── utils.py │ └── modules │ │ ├── gui │ │ ├── 
__init__.py │ │ ├── torchgate.py │ │ └── utils.py │ │ ├── ipex │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── gradscaler.py │ │ └── hijacks.py │ │ ├── onnx │ │ └── export.py │ │ ├── train │ │ ├── extract │ │ │ ├── extract_f0_print.py │ │ │ ├── extract_f0_rmvpe.py │ │ │ └── extract_f0_rmvpe_dml.py │ │ ├── extract_feature_print.py │ │ ├── preprocess.py │ │ └── train.py │ │ ├── uvr5 │ │ ├── mdxnet.py │ │ ├── modules.py │ │ └── vr.py │ │ └── vc │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── modules.cpython-310.pyc │ │ ├── pipeline.cpython-310.pyc │ │ └── utils.cpython-310.pyc │ │ ├── modules.py │ │ ├── pipeline.py │ │ └── utils.py ├── logs │ └── mute │ │ ├── 0_gt_wavs │ │ ├── mute32k.wav │ │ ├── mute40k.spec.pt │ │ ├── mute40k.wav │ │ ├── mute48k.spec.pt │ │ └── mute48k.wav │ │ ├── 1_16k_wavs │ │ └── mute.wav │ │ ├── 2a_f0 │ │ └── mute.wav.npy │ │ ├── 2b-f0nsf │ │ └── mute.wav.npy │ │ ├── 3_feature256 │ │ └── mute.npy │ │ └── 3_feature768 │ │ └── mute.npy └── train.py ├── web.png ├── web └── js │ ├── alertMSG.js │ ├── previewAudio.js │ ├── refreshPath.js │ └── uploadAudio.js └── wechat.jpg /LICENSE: -------------------------------------------------------------------------------- 1 | 本软件及其相关代码以MIT协议开源,作者不对软件具备任何控制力,使用软件者、传播软件导出的声音者自负全责。 2 | 如不认可该条款,则不能使用或引用软件包内任何代码和文件。 3 | 4 | 特此授予任何获得本软件和相关文档文件(以下简称“软件”)副本的人免费使用、复制、修改、合并、出版、分发、再授权和/或销售本软件的权利,以及授予本软件所提供的人使用本软件的权利,但须符合以下条件: 5 | 上述版权声明和本许可声明应包含在软件的所有副本或实质部分中。 6 | 软件是“按原样”提供的,没有任何明示或暗示的保证,包括但不限于适销性、适用于特定目的和不侵权的保证。在任何情况下,作者或版权持有人均不承担因软件或软件的使用或其他交易而产生、产生或与之相关的任何索赔、损害赔偿或其他责任,无论是在合同诉讼、侵权诉讼还是其他诉讼中。 7 | 8 | MIT License 9 | 10 | Copyright (c) 2024 AIFSH 11 | 12 | Permission is hereby granted, free of charge, to any person obtaining a copy 13 | of this software and associated documentation files (the "Software"), to deal 14 | in the Software without restriction, including without limitation the rights 15 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 16 | copies of the Software, and to permit persons to whom the Software is 17 | furnished to do so, subject to the following conditions: 18 | 19 | The above copyright notice and this permission notice shall be included in all 20 | copies or substantial portions of the Software. 21 | 22 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 23 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 24 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 25 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 26 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 27 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 28 | SOFTWARE. 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ComfyUI-RVC 2 | a comfyui custom node for [Retrieval-based-Voice-Conversion-WebUI](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI.git),you can Voice-Conversion in comfyui now! 3 | 4 | ## How to use 5 | make sure `ffmpeg` is worked in your commandline 6 | for Linux 7 | ``` 8 | apt update 9 | apt install ffmpeg 10 | ``` 11 | for Windows,you can install `ffmpeg` by [WingetUI](https://github.com/marticliment/WingetUI) automatically 12 | 13 | then! 
14 | ``` 15 | git clone https://github.com/AIFSH/ComfyUI-RVC.git 16 | cd ComfyUI-RVC 17 | pip install -r requirements.txt 18 | ``` 19 | `weights` will be downloaded from Hugging Face automatically! If you are in China, make sure your network can reach Hugging Face, 20 | or, if you still struggle with Hugging Face, you may follow [hf-mirror](https://hf-mirror.com/) to configure your environment. 21 | 22 | Alternatively, download [rvc_assets.zip](https://pan.quark.cn/s/039c8d2d59ac), extract it, and place the contents in the `ComfyUI-RVC/rvc` directory. 23 |
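If you prefer to fetch the assets up front, the bundled download script can also be run by hand (a minimal sketch, assuming `huggingface_hub` is available in your ComfyUI Python environment; `__init__.py` runs this same script automatically on first load when `rvc/assets/pretrained_v2` is missing):
```
cd ComfyUI-RVC
python download_models.py
```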
24 | ## Tutorial 25 | [Demo](https://www.bilibili.com/video/BV1bH4y1P7n9/) 26 | 27 | ## WeChat Group && Donate 28 | 29 | 30 | ![Wechat](wechat.jpg) 31 | ![donate](donate.jpg) 32 | 33 |
34 | 35 | ## Thanks 36 | - [Retrieval-based-Voice-Conversion-WebUI](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI.git) 37 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys,site 3 | from subprocess import Popen 4 | from server import PromptServer 5 | now_dir = os.path.dirname(os.path.abspath(__file__)) 6 | 7 | site_packages_roots = [] 8 | for path in site.getsitepackages(): 9 | if "packages" in path: 10 | site_packages_roots.append(path) 11 | if(site_packages_roots==[]):site_packages_roots=["%s/runtime/Lib/site-packages" % now_dir] 12 | #os.environ["OPENBLAS_NUM_THREADS"] = "4" 13 | for site_packages_root in site_packages_roots: 14 | if os.path.exists(site_packages_root): 15 | try: 16 | with open("%s/users.pth" % (site_packages_root), "a") as f: 17 | f.write( 18 | "%s\n%s/rvc\n%s/rvc/infer" 19 | % (now_dir,now_dir,now_dir) 20 | ) 21 | break 22 | except PermissionError: 23 | raise PermissionError 24 | 25 | if os.path.isfile("%s/users.pth" % (site_packages_root)): 26 | print("!!!RVC path was added to " + "%s/users.pth" % (site_packages_root) 27 | + "\n if meet `No module` error,try `python main.py` again") 28 | 29 | model_path = os.path.join(now_dir,"rvc", "assets") 30 | 31 | if not os.path.exists(os.path.join(model_path, "pretrained_v2")): 32 | cmd = "python %s/download_models.py" % (now_dir) 33 | p = Popen(cmd, shell=True, cwd=now_dir) 34 | p.wait() 35 | else: 36 | print("!!!RVC use cache models,make sure your 'assets' complete") 37 | 38 | 39 | WEB_DIRECTORY = "./web" 40 | from .nodes import LoadAudio, PreViewAudio,RVC_Train,RVC_Infer,CombineAudio 41 | 42 | # Set the web directory, any .js file in that directory will be loaded by the frontend as a frontend extension 43 | # WEB_DIRECTORY = "./somejs" 44 | 45 | # A dictionary that contains all nodes you want to export with their names 46 | # NOTE: names should be globally unique 47 | NODE_CLASS_MAPPINGS = { 48 | "LoadAudio": LoadAudio, 49 | "PreViewAudio": PreViewAudio, 50 | "RVC_Train": RVC_Train, 51 | "RVC_Infer": RVC_Infer, 52 | "CombineAudio": CombineAudio 53 | } 54 | 55 | # A dictionary that contains the friendly/humanly readable titles for the nodes 56 | NODE_DISPLAY_NAME_MAPPINGS = { 57 | "LoadAudio": "AudioLoader", 58 | "PreViewAudio": "PreView Audio", 59 | "RVC_Train": "RVC Train", 60 | "RVC_Infer": "RVC Inference", 61 | "CombineAudio": "CombineAudio" 62 | } 63 | 64 | @PromptServer.instance.routes.get("/rvc/reboot") 65 | def restart(self): 66 | try: 67 | sys.stdout.close_log() 68 | except Exception as e: 69 | pass 70 | 71 | return os.execv(sys.executable, [sys.executable] + sys.argv) -------------------------------------------------------------------------------- /donate.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/donate.jpg -------------------------------------------------------------------------------- /download_models.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from huggingface_hub import hf_hub_download 4 | 5 | now_dir = os.path.dirname(os.path.abspath(__file__)) 6 | BASE_DIR = os.path.join(now_dir, "rvc") 7 | 8 | 9 | if __name__ == "__main__": 10 | os.makedirs(os.path.join(BASE_DIR ,"assets","weights"), exist_ok=True) 11 | 
weights_path = os.path.join(BASE_DIR ,"assets") 12 | print("Downloading hubert_base.pt...") 13 | hf_hub_download(repo_id="lj1995/VoiceConversionWebUI", 14 | filename="hubert_base.pt", 15 | subfolder= "", 16 | local_dir= os.path.join(weights_path, "hubert")) 17 | print("Downloading rmvpe.pt...") 18 | hf_hub_download(repo_id="lj1995/VoiceConversionWebUI", 19 | filename="rmvpe.pt", 20 | subfolder= "", 21 | local_dir= os.path.join(weights_path, "rmvpe")) 22 | 23 | 24 | print("Downloading pretrained models:") 25 | 26 | model_names = [ 27 | "D40k.pth", 28 | "D48k.pth", 29 | "G32k.pth", 30 | "G40k.pth", 31 | "G48k.pth", 32 | "f0D32k.pth", 33 | "f0D40k.pth", 34 | "f0D48k.pth", 35 | "f0G32k.pth", 36 | "f0G40k.pth", 37 | "f0G48k.pth", 38 | ] 39 | for model in model_names: 40 | print(f"Downloading {model}...") 41 | hf_hub_download(repo_id="lj1995/VoiceConversionWebUI", 42 | filename=model, 43 | subfolder= "pretrained", 44 | local_dir= weights_path) 45 | 46 | 47 | print("Downloading pretrained models v2:") 48 | 49 | for model in model_names: 50 | print(f"Downloading {model}...") 51 | hf_hub_download(repo_id="lj1995/VoiceConversionWebUI", 52 | filename=model, 53 | subfolder= "pretrained_v2", 54 | local_dir= weights_path) 55 | 56 | print("All models downloaded!") 57 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | joblib>=1.1.0 2 | numba 3 | numpy==1.23.5 4 | scipy 5 | librosa==0.9.1 6 | llvmlite 7 | fairseq 8 | faiss-cpu 9 | Cython 10 | pydub>=0.25.1 11 | soundfile>=0.12.1 12 | ffmpeg-python>=0.2.0 13 | tensorboardX 14 | Jinja2>=3.1.2 15 | json5 16 | Markdown 17 | matplotlib>=3.7.0 18 | matplotlib-inline>=0.1.3 19 | praat-parselmouth>=0.4.2 20 | Pillow>=9.1.1 21 | resampy>=0.4.2 22 | scikit-learn 23 | tensorboard 24 | tqdm>=4.63.1 25 | tornado>=6.1 26 | Werkzeug>=2.2.3 27 | uc-micro-py>=1.0.1 28 | sympy>=1.11.1 29 | tabulate>=0.8.10 30 | PyYAML>=6.0 31 | pyasn1>=0.4.8 32 | pyasn1-modules>=0.2.8 33 | fsspec>=2022.11.0 34 | absl-py>=1.2.0 35 | audioread 36 | uvicorn>=0.21.1 37 | colorama>=0.4.5 38 | pyworld==0.3.2 39 | httpx 40 | onnxruntime; sys_platform == 'darwin' 41 | onnxruntime-gpu; sys_platform != 'darwin' 42 | torchcrepe==0.0.20 43 | fastapi 44 | torchfcpe 45 | ffmpy==0.3.1 46 | python-dotenv>=1.0.0 47 | av 48 | -------------------------------------------------------------------------------- /rvc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/__init__.py -------------------------------------------------------------------------------- /rvc/configs/__pycache__/config.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/configs/__pycache__/config.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/configs/config.json: -------------------------------------------------------------------------------- 1 | {"pth_path": "assets/weights/kikiV1.pth", "index_path": "logs/kikiV1.index", "sg_hostapi": "MME", "sg_wasapi_exclusive": false, "sg_input_device": "VoiceMeeter Output (VB-Audio Vo", "sg_output_device": "VoiceMeeter Input (VB-Audio Voi", "sr_type": "sr_device", "threhold": -60.0, "pitch": 12.0, "formant": 0.0, "rms_mix_rate": 0.5, 
"index_rate": 0.0, "block_time": 0.15, "crossfade_length": 0.08, "extra_time": 2.0, "n_cpu": 4.0, "use_jit": false, "use_pv": false, "f0method": "fcpe"} -------------------------------------------------------------------------------- /rvc/configs/inuse/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | !v1 4 | !v2 5 | -------------------------------------------------------------------------------- /rvc/configs/inuse/v1/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /rvc/configs/inuse/v2/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /rvc/configs/v1/32k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": true, 11 | "lr_decay": 0.999875, 12 | "segment_size": 12800, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 32000, 21 | "filter_length": 1024, 22 | "hop_length": 320, 23 | "win_length": 1024, 24 | "n_mel_channels": 80, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,4,2,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16,16,4,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /rvc/configs/v1/40k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": true, 11 | "lr_decay": 0.999875, 12 | "segment_size": 12800, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 40000, 21 | "filter_length": 2048, 22 | "hop_length": 400, 23 | "win_length": 2048, 24 | "n_mel_channels": 125, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,10,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16,16,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /rvc/configs/v1/48k.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": true, 11 | "lr_decay": 0.999875, 12 | "segment_size": 11520, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 48000, 21 | "filter_length": 2048, 22 | "hop_length": 480, 23 | "win_length": 2048, 24 | "n_mel_channels": 128, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,6,2,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16,16,4,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /rvc/configs/v2/32k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": true, 11 | "lr_decay": 0.999875, 12 | "segment_size": 12800, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 32000, 21 | "filter_length": 1024, 22 | "hop_length": 320, 23 | "win_length": 1024, 24 | "n_mel_channels": 80, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,8,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [20,16,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /rvc/configs/v2/48k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": true, 11 | "lr_decay": 0.999875, 12 | "segment_size": 17280, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 48000, 21 | "filter_length": 2048, 22 | "hop_length": 480, 23 | "win_length": 2048, 24 | "n_mel_channels": 128, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], 
[1,3,5]], 39 | "upsample_rates": [12,10,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [24,20,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /rvc/i18n/__pycache__/i18n.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/i18n/__pycache__/i18n.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/i18n/i18n.py: -------------------------------------------------------------------------------- 1 | import json 2 | import locale 3 | import os 4 | 5 | now_dir = os.path.dirname(os.path.abspath(__file__)) 6 | def load_language_list(language): 7 | with open(f"{now_dir}/locale/{language}.json", "r", encoding="utf-8") as f: 8 | language_list = json.load(f) 9 | return language_list 10 | 11 | 12 | class I18nAuto: 13 | def __init__(self, language=None): 14 | if language in ["Auto", None]: 15 | language = locale.getdefaultlocale()[ 16 | 0 17 | ] # getlocale can't identify the system's language ((None, None)) 18 | if not os.path.exists(f"{now_dir}/locale/{language}.json"): 19 | language = "en_US" 20 | self.language = language 21 | self.language_map = load_language_list(language) 22 | 23 | def __call__(self, key): 24 | return self.language_map.get(key, key) 25 | 26 | def __repr__(self): 27 | return "Use Language: " + self.language 28 | -------------------------------------------------------------------------------- /rvc/i18n/locale/zh_CN.json: -------------------------------------------------------------------------------- 1 | { 2 | ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音", 3 | "A模型权重": "A模型权重", 4 | "A模型路径": "A模型路径", 5 | "B模型路径": "B模型路径", 6 | "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", 7 | "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调", 8 | "Index Rate": "Index Rate", 9 | "Onnx导出": "Onnx导出", 10 | "Onnx输出路径": "Onnx输出路径", 11 | "RVC模型路径": "RVC模型路径", 12 | "ckpt处理": "ckpt处理", 13 | "harvest进程数": "harvest进程数", 14 | "index文件路径不可包含中文": "index文件路径不可包含中文", 15 | "pth文件路径不可包含中文": "pth文件路径不可包含中文", 16 | "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程", 17 | "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ", 18 | "step1:正在处理数据": "step1:正在处理数据", 19 | "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", 20 | "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ", 21 | "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)", 22 | "step3: 填写训练设置, 开始训练模型和索引": "step3: 填写训练设置, 开始训练模型和索引", 23 | "step3a:正在训练模型": "step3a:正在训练模型", 24 | "一键训练": "一键训练", 25 | "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", 26 | "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。", 27 | "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2", 28 | "伴奏人声分离&去混响&去回声": "伴奏人声分离&去混响&去回声", 29 | "使用模型采样率": "使用模型采样率", 30 | "使用设备采样率": "使用设备采样率", 31 | "保存名": "保存名", 32 | "保存的文件名, 默认空为和源文件同名": "保存的文件名, 默认空为和源文件同名", 33 | "保存的模型名不带后缀": "保存的模型名不带后缀", 34 | "保存频率save_every_epoch": "保存频率save_every_epoch", 35 | "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果", 36 | "修改": "修改", 37 | "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型信息(仅支持weights文件夹下提取的小模型文件)", 38 | "停止音频转换": "停止音频转换", 39 | "全流程结束!": "全流程结束!", 40 | "共振偏移": "共振偏移", 41 | "刷新音色列表和索引路径": "刷新音色列表和索引路径", 42 | "加载模型": "加载模型", 43 | "加载预训练底模D路径": "加载预训练底模D路径", 44 | "加载预训练底模G路径": "加载预训练底模G路径", 45 | "单次推理": "单次推理", 46 | "卸载音色省显存": "卸载音色省显存", 47 | "变调(整数, 半音数量, 升八度12降八度-12)": "变调(整数, 半音数量, 升八度12降八度-12)", 48 | "后处理重采样至最终采样率,0为不进行重采样": "后处理重采样至最终采样率,0为不进行重采样", 49 | "否": "否", 50 | "启用相位声码器": "启用相位声码器", 51 | "响应阈值": "响应阈值", 52 | "响度因子": "响度因子", 53 | "处理数据": "处理数据", 54 | "导出Onnx模型": "导出Onnx模型", 55 | "导出文件格式": "导出文件格式", 56 | "常见问题解答": "常见问题解答", 57 | "常规设置": "常规设置", 58 | "开始音频转换": "开始音频转换", 59 | "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", 60 | "性能设置": "性能设置", 61 | "总训练轮数total_epoch": "总训练轮数total_epoch", 62 | "批量推理": "批量推理", 63 | "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ", 64 | "指定输出主人声文件夹": "指定输出主人声文件夹", 65 | "指定输出文件夹": "指定输出文件夹", 66 | "指定输出非主人声文件夹": "指定输出非主人声文件夹", 67 | "推理时间(ms):": "推理时间(ms):", 68 | "推理音色": "推理音色", 69 | "提取": "提取", 70 | "提取音高和处理数据使用的CPU进程数": "提取音高和处理数据使用的CPU进程数", 71 | "是": "是", 72 | "是否仅保存最新的ckpt文件以节省硬盘空间": "是否仅保存最新的ckpt文件以节省硬盘空间", 73 | "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存时间点将最终小模型保存至weights文件夹", 74 | "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速", 75 | "显卡信息": "显卡信息", 76 | "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.", 77 | "查看": "查看", 78 | "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型信息(仅支持weights文件夹下提取的小模型文件)", 79 | "检索特征占比": "检索特征占比", 80 | "模型": "模型", 81 | "模型推理": "模型推理", 82 | "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况", 83 | "模型是否带音高指导": "模型是否带音高指导", 84 | "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否带音高指导(唱歌一定要, 语音可以不要)", 85 | "模型是否带音高指导,1是0否": "模型是否带音高指导,1是0否", 86 | "模型版本型号": "模型版本型号", 87 | "模型融合, 可用于测试音色融合": "模型融合, 可用于测试音色融合", 88 | "模型路径": "模型路径", 89 | "每张显卡的batch_size": "每张显卡的batch_size", 90 | "淡入淡出长度": "淡入淡出长度", 91 | "版本": "版本", 92 | "特征提取": "特征提取", 93 | "特征检索库文件路径,为空则使用下拉的选择结果": "特征检索库文件路径,为空则使用下拉的选择结果", 94 | "独占 WASAPI 设备": "独占 WASAPI 设备", 95 | "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ", 96 | "目标采样率": "目标采样率", 97 | "算法延迟(ms):": "算法延迟(ms):", 98 | "自动检测index路径,下拉式选择(dropdown)": "自动检测index路径,下拉式选择(dropdown)", 99 | "融合": "融合", 100 | "要改的模型信息": "要改的模型信息", 101 | "要置入的模型信息": "要置入的模型信息", 102 | "训练": "训练", 103 | "训练模型": "训练模型", 104 | "训练特征索引": "训练特征索引", 105 | "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", 106 | "设备类型": "设备类型", 107 | "请指定说话人id": "请指定说话人id", 108 | "请选择index文件": "请选择index文件", 109 | "请选择pth文件": "请选择pth文件", 110 | "请选择说话人id": "请选择说话人id", 111 | "转换": "转换", 112 | "输入实验名": "输入实验名", 113 | "输入待处理音频文件夹路径": "输入待处理音频文件夹路径", 114 | "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)", 115 | "输入待处理音频文件路径(默认是正确格式示例)": "输入待处理音频文件路径(默认是正确格式示例)", 116 | "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络", 117 | "输入监听": "输入监听", 118 | "输入训练文件夹路径": "输入训练文件夹路径", 119 | "输入设备": "输入设备", 120 | "输入降噪": "输入降噪", 121 | "输出信息": "输出信息", 122 | "输出变声": "输出变声", 123 | "输出设备": "输出设备", 124 | "输出降噪": "输出降噪", 125 | "输出音频(右下角三个点,点了可以下载)": "输出音频(右下角三个点,点了可以下载)", 126 | "选择.index文件": "选择.index文件", 127 | "选择.pth文件": "选择.pth文件", 128 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", 129 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", 130 | "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", 131 | "采样率:": "采样率:", 132 | "采样长度": "采样长度", 133 | "重载设备列表": "重载设备列表", 134 | "音调设置": "音调设置", 135 | "音频设备": "音频设备", 136 | "音高算法": "音高算法", 137 | "额外推理时长": "额外推理时长" 138 | } 139 | -------------------------------------------------------------------------------- /rvc/i18n/locale_diff.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import OrderedDict 4 | 5 | # Define the standard file name 6 | standard_file = "locale/zh_CN.json" 7 | 8 | # Find all JSON files in the directory 9 | dir_path = "locale/" 10 | languages = [ 11 | os.path.join(dir_path, f) 12 | for f in os.listdir(dir_path) 13 | if f.endswith(".json") and f != standard_file 14 | ] 15 | 16 | # Load the standard file 17 | with open(standard_file, "r", encoding="utf-8") as f: 18 | standard_data = json.load(f, object_pairs_hook=OrderedDict) 19 | 20 | # Loop through each language file 21 | for lang_file in languages: 22 | # Load the language file 23 | with open(lang_file, "r", encoding="utf-8") as f: 24 | lang_data = json.load(f, object_pairs_hook=OrderedDict) 25 | 26 | # Find the difference between the language file and the standard file 27 
| diff = set(standard_data.keys()) - set(lang_data.keys()) 28 | 29 | miss = set(lang_data.keys()) - set(standard_data.keys()) 30 | 31 | # Add any missing keys to the language file 32 | for key in diff: 33 | lang_data[key] = key 34 | 35 | # Del any extra keys to the language file 36 | for key in miss: 37 | del lang_data[key] 38 | 39 | # Sort the keys of the language file to match the order of the standard file 40 | lang_data = OrderedDict( 41 | sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0])) 42 | ) 43 | 44 | # Save the updated language file 45 | with open(lang_file, "w", encoding="utf-8") as f: 46 | json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True) 47 | f.write("\n") 48 | -------------------------------------------------------------------------------- /rvc/i18n/scan_i18n.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import glob 3 | import json 4 | from collections import OrderedDict 5 | 6 | 7 | def extract_i18n_strings(node): 8 | i18n_strings = [] 9 | 10 | if ( 11 | isinstance(node, ast.Call) 12 | and isinstance(node.func, ast.Name) 13 | and node.func.id == "i18n" 14 | ): 15 | for arg in node.args: 16 | if isinstance(arg, ast.Str): 17 | i18n_strings.append(arg.s) 18 | 19 | for child_node in ast.iter_child_nodes(node): 20 | i18n_strings.extend(extract_i18n_strings(child_node)) 21 | 22 | return i18n_strings 23 | 24 | 25 | # scan the directory for all .py files (recursively) 26 | # for each file, parse the code into an AST 27 | # for each AST, extract the i18n strings 28 | 29 | strings = [] 30 | for filename in glob.iglob("**/*.py", recursive=True): 31 | with open(filename, "r") as f: 32 | code = f.read() 33 | if "I18nAuto" in code: 34 | tree = ast.parse(code) 35 | i18n_strings = extract_i18n_strings(tree) 36 | print(filename, len(i18n_strings)) 37 | strings.extend(i18n_strings) 38 | code_keys = set(strings) 39 | """ 40 | n_i18n.py 41 | gui_v1.py 26 42 | app.py 16 43 | infer-web.py 147 44 | scan_i18n.py 0 45 | i18n.py 0 46 | lib/train/process_ckpt.py 1 47 | """ 48 | print() 49 | print("Total unique:", len(code_keys)) 50 | 51 | 52 | standard_file = "i18n/locale/zh_CN.json" 53 | with open(standard_file, "r", encoding="utf-8") as f: 54 | standard_data = json.load(f, object_pairs_hook=OrderedDict) 55 | standard_keys = set(standard_data.keys()) 56 | 57 | # Define the standard file name 58 | unused_keys = standard_keys - code_keys 59 | print("Unused keys:", len(unused_keys)) 60 | for unused_key in unused_keys: 61 | print("\t", unused_key) 62 | 63 | missing_keys = code_keys - standard_keys 64 | print("Missing keys:", len(missing_keys)) 65 | for missing_key in missing_keys: 66 | print("\t", missing_key) 67 | 68 | code_keys_dict = OrderedDict() 69 | for s in strings: 70 | code_keys_dict[s] = s 71 | 72 | # write back 73 | with open(standard_file, "w", encoding="utf-8") as f: 74 | json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True) 75 | f.write("\n") 76 | -------------------------------------------------------------------------------- /rvc/infer/lib/__pycache__/audio.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/__pycache__/audio.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/__pycache__/rmvpe.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/__pycache__/rmvpe.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/__pycache__/rvcmd.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/__pycache__/rvcmd.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/__pycache__/slicer2.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/__pycache__/slicer2.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/audio.py: -------------------------------------------------------------------------------- 1 | import platform 2 | import ffmpeg 3 | import numpy as np 4 | import av 5 | 6 | 7 | def wav2(i, o, format): 8 | inp = av.open(i, "r") 9 | if format == "m4a": 10 | format = "mp4" 11 | out = av.open(o, "w", format=format) 12 | if format == "ogg": 13 | format = "libvorbis" 14 | if format == "mp4": 15 | format = "aac" 16 | 17 | ostream = out.add_stream(format) 18 | 19 | for frame in inp.decode(audio=0): 20 | for p in ostream.encode(frame): 21 | out.mux(p) 22 | 23 | for p in ostream.encode(None): 24 | out.mux(p) 25 | 26 | out.close() 27 | inp.close() 28 | 29 | 30 | def load_audio(file, sr): 31 | try: 32 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 33 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary. 34 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 
35 | file = clean_path(file) # 防止小白拷路径头尾带了空格和"和回车 36 | out, _ = ( 37 | ffmpeg.input(file, threads=0) 38 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) 39 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) 40 | ) 41 | except Exception as e: 42 | raise RuntimeError(f"Failed to load audio: {e}") 43 | 44 | return np.frombuffer(out, np.float32).flatten() 45 | 46 | 47 | def clean_path(path_str): 48 | if platform.system() == "Windows": 49 | path_str = path_str.replace("/", "\\") 50 | return path_str.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 51 | -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/__pycache__/attentions.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/infer_pack/__pycache__/attentions.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/__pycache__/commons.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/infer_pack/__pycache__/commons.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/__pycache__/models.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/infer_pack/__pycache__/models.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/__pycache__/modules.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/infer_pack/__pycache__/modules.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/__pycache__/transforms.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/infer_pack/__pycache__/transforms.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/commons.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | import math 3 | 4 | import numpy as np 5 | import torch 6 | from torch import nn 7 | from torch.nn import functional as F 8 | 9 | 10 | def init_weights(m, mean=0.0, std=0.01): 11 | classname = m.__class__.__name__ 12 | if classname.find("Conv") != -1: 13 | m.weight.data.normal_(mean, std) 14 | 15 | 16 | def get_padding(kernel_size, dilation=1): 17 | return int((kernel_size * dilation - dilation) / 2) 18 | 19 | 20 | # def convert_pad_shape(pad_shape): 21 | # l = pad_shape[::-1] 22 | # pad_shape = [item for sublist in l for item in sublist] 23 | # return pad_shape 24 | 25 | 26 | def kl_divergence(m_p, logs_p, m_q, logs_q): 27 | """KL(P||Q)""" 28 | kl = (logs_q - logs_p) - 0.5 29 | kl += ( 30 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) 31 | ) 32 | return kl 33 
| 34 | 35 | def rand_gumbel(shape): 36 | """Sample from the Gumbel distribution, protect from overflows.""" 37 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 38 | return -torch.log(-torch.log(uniform_samples)) 39 | 40 | 41 | def rand_gumbel_like(x): 42 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 43 | return g 44 | 45 | 46 | def slice_segments(x, ids_str, segment_size=4): 47 | ret = torch.zeros_like(x[:, :, :segment_size]) 48 | for i in range(x.size(0)): 49 | idx_str = ids_str[i] 50 | idx_end = idx_str + segment_size 51 | ret[i] = x[i, :, idx_str:idx_end] 52 | return ret 53 | 54 | 55 | def slice_segments2(x, ids_str, segment_size=4): 56 | ret = torch.zeros_like(x[:, :segment_size]) 57 | for i in range(x.size(0)): 58 | idx_str = ids_str[i] 59 | idx_end = idx_str + segment_size 60 | ret[i] = x[i, idx_str:idx_end] 61 | return ret 62 | 63 | 64 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 65 | b, d, t = x.size() 66 | if x_lengths is None: 67 | x_lengths = t 68 | ids_str_max = x_lengths - segment_size + 1 69 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 70 | ret = slice_segments(x, ids_str, segment_size) 71 | return ret, ids_str 72 | 73 | 74 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): 75 | position = torch.arange(length, dtype=torch.float) 76 | num_timescales = channels // 2 77 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( 78 | num_timescales - 1 79 | ) 80 | inv_timescales = min_timescale * torch.exp( 81 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment 82 | ) 83 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 84 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) 85 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 86 | signal = signal.view(1, channels, length) 87 | return signal 88 | 89 | 90 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 91 | b, channels, length = x.size() 92 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 93 | return x + signal.to(dtype=x.dtype, device=x.device) 94 | 95 | 96 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 97 | b, channels, length = x.size() 98 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 99 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 100 | 101 | 102 | def subsequent_mask(length): 103 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 104 | return mask 105 | 106 | 107 | @torch.jit.script 108 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 109 | n_channels_int = n_channels[0] 110 | in_act = input_a + input_b 111 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 112 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 113 | acts = t_act * s_act 114 | return acts 115 | 116 | 117 | # def convert_pad_shape(pad_shape): 118 | # l = pad_shape[::-1] 119 | # pad_shape = [item for sublist in l for item in sublist] 120 | # return pad_shape 121 | 122 | 123 | def convert_pad_shape(pad_shape: List[List[int]]) -> List[int]: 124 | return torch.tensor(pad_shape).flip(0).reshape(-1).int().tolist() 125 | 126 | 127 | def shift_1d(x): 128 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 129 | return x 130 | 131 | 132 | def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None): 133 | if max_length is None: 134 | 
max_length = length.max() 135 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 136 | return x.unsqueeze(0) < length.unsqueeze(1) 137 | 138 | 139 | def generate_path(duration, mask): 140 | """ 141 | duration: [b, 1, t_x] 142 | mask: [b, 1, t_y, t_x] 143 | """ 144 | device = duration.device 145 | 146 | b, _, t_y, t_x = mask.shape 147 | cum_duration = torch.cumsum(duration, -1) 148 | 149 | cum_duration_flat = cum_duration.view(b * t_x) 150 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 151 | path = path.view(b, t_x, t_y) 152 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 153 | path = path.unsqueeze(1).transpose(2, 3) * mask 154 | return path 155 | 156 | 157 | def clip_grad_value_(parameters, clip_value, norm_type=2): 158 | if isinstance(parameters, torch.Tensor): 159 | parameters = [parameters] 160 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 161 | norm_type = float(norm_type) 162 | if clip_value is not None: 163 | clip_value = float(clip_value) 164 | 165 | total_norm = 0 166 | for p in parameters: 167 | param_norm = p.grad.data.norm(norm_type) 168 | total_norm += param_norm.item() ** norm_type 169 | if clip_value is not None: 170 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 171 | total_norm = total_norm ** (1.0 / norm_type) 172 | return total_norm 173 | -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pyworld 3 | 4 | from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor 5 | 6 | 7 | class DioF0Predictor(F0Predictor): 8 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): 9 | self.hop_length = hop_length 10 | self.f0_min = f0_min 11 | self.f0_max = f0_max 12 | self.sampling_rate = sampling_rate 13 | 14 | def interpolate_f0(self, f0): 15 | """ 16 | 对F0进行插值处理 17 | """ 18 | 19 | data = np.reshape(f0, (f0.size, 1)) 20 | 21 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32) 22 | vuv_vector[data > 0.0] = 1.0 23 | vuv_vector[data <= 0.0] = 0.0 24 | 25 | ip_data = data 26 | 27 | frame_number = data.size 28 | last_value = 0.0 29 | for i in range(frame_number): 30 | if data[i] <= 0.0: 31 | j = i + 1 32 | for j in range(i + 1, frame_number): 33 | if data[j] > 0.0: 34 | break 35 | if j < frame_number - 1: 36 | if last_value > 0.0: 37 | step = (data[j] - data[i - 1]) / float(j - i) 38 | for k in range(i, j): 39 | ip_data[k] = data[i - 1] + step * (k - i + 1) 40 | else: 41 | for k in range(i, j): 42 | ip_data[k] = data[j] 43 | else: 44 | for k in range(i, frame_number): 45 | ip_data[k] = last_value 46 | else: 47 | ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝 48 | last_value = data[i] 49 | 50 | return ip_data[:, 0], vuv_vector[:, 0] 51 | 52 | def resize_f0(self, x, target_len): 53 | source = np.array(x) 54 | source[source < 0.001] = np.nan 55 | target = np.interp( 56 | np.arange(0, len(source) * target_len, len(source)) / target_len, 57 | np.arange(0, len(source)), 58 | source, 59 | ) 60 | res = np.nan_to_num(target) 61 | return res 62 | 63 | def compute_f0(self, wav, p_len=None): 64 | if p_len is None: 65 | p_len = wav.shape[0] // self.hop_length 66 | f0, t = pyworld.dio( 67 | wav.astype(np.double), 68 | fs=self.sampling_rate, 69 | f0_floor=self.f0_min, 70 | f0_ceil=self.f0_max, 71 | frame_period=1000 * self.hop_length / self.sampling_rate, 
72 | ) 73 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 74 | for index, pitch in enumerate(f0): 75 | f0[index] = round(pitch, 1) 76 | return self.interpolate_f0(self.resize_f0(f0, p_len))[0] 77 | 78 | def compute_f0_uv(self, wav, p_len=None): 79 | if p_len is None: 80 | p_len = wav.shape[0] // self.hop_length 81 | f0, t = pyworld.dio( 82 | wav.astype(np.double), 83 | fs=self.sampling_rate, 84 | f0_floor=self.f0_min, 85 | f0_ceil=self.f0_max, 86 | frame_period=1000 * self.hop_length / self.sampling_rate, 87 | ) 88 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 89 | for index, pitch in enumerate(f0): 90 | f0[index] = round(pitch, 1) 91 | return self.interpolate_f0(self.resize_f0(f0, p_len)) 92 | -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/modules/F0Predictor/F0Predictor.py: -------------------------------------------------------------------------------- 1 | class F0Predictor(object): 2 | def compute_f0(self, wav, p_len): 3 | """ 4 | input: wav:[signal_length] 5 | p_len:int 6 | output: f0:[signal_length//hop_length] 7 | """ 8 | pass 9 | 10 | def compute_f0_uv(self, wav, p_len): 11 | """ 12 | input: wav:[signal_length] 13 | p_len:int 14 | output: f0:[signal_length//hop_length],uv:[signal_length//hop_length] 15 | """ 16 | pass 17 | -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pyworld 3 | 4 | from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor 5 | 6 | 7 | class HarvestF0Predictor(F0Predictor): 8 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): 9 | self.hop_length = hop_length 10 | self.f0_min = f0_min 11 | self.f0_max = f0_max 12 | self.sampling_rate = sampling_rate 13 | 14 | def interpolate_f0(self, f0): 15 | """ 16 | 对F0进行插值处理 17 | """ 18 | 19 | data = np.reshape(f0, (f0.size, 1)) 20 | 21 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32) 22 | vuv_vector[data > 0.0] = 1.0 23 | vuv_vector[data <= 0.0] = 0.0 24 | 25 | ip_data = data 26 | 27 | frame_number = data.size 28 | last_value = 0.0 29 | for i in range(frame_number): 30 | if data[i] <= 0.0: 31 | j = i + 1 32 | for j in range(i + 1, frame_number): 33 | if data[j] > 0.0: 34 | break 35 | if j < frame_number - 1: 36 | if last_value > 0.0: 37 | step = (data[j] - data[i - 1]) / float(j - i) 38 | for k in range(i, j): 39 | ip_data[k] = data[i - 1] + step * (k - i + 1) 40 | else: 41 | for k in range(i, j): 42 | ip_data[k] = data[j] 43 | else: 44 | for k in range(i, frame_number): 45 | ip_data[k] = last_value 46 | else: 47 | ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝 48 | last_value = data[i] 49 | 50 | return ip_data[:, 0], vuv_vector[:, 0] 51 | 52 | def resize_f0(self, x, target_len): 53 | source = np.array(x) 54 | source[source < 0.001] = np.nan 55 | target = np.interp( 56 | np.arange(0, len(source) * target_len, len(source)) / target_len, 57 | np.arange(0, len(source)), 58 | source, 59 | ) 60 | res = np.nan_to_num(target) 61 | return res 62 | 63 | def compute_f0(self, wav, p_len=None): 64 | if p_len is None: 65 | p_len = wav.shape[0] // self.hop_length 66 | f0, t = pyworld.harvest( 67 | wav.astype(np.double), 68 | fs=self.sampling_rate, 69 | f0_ceil=self.f0_max, 70 | f0_floor=self.f0_min, 71 | frame_period=1000 * self.hop_length / self.sampling_rate, 
72 | ) 73 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 74 | return self.interpolate_f0(self.resize_f0(f0, p_len))[0] 75 | 76 | def compute_f0_uv(self, wav, p_len=None): 77 | if p_len is None: 78 | p_len = wav.shape[0] // self.hop_length 79 | f0, t = pyworld.harvest( 80 | wav.astype(np.double), 81 | fs=self.sampling_rate, 82 | f0_floor=self.f0_min, 83 | f0_ceil=self.f0_max, 84 | frame_period=1000 * self.hop_length / self.sampling_rate, 85 | ) 86 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 87 | return self.interpolate_f0(self.resize_f0(f0, p_len)) 88 | -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import parselmouth 3 | 4 | from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor 5 | 6 | 7 | class PMF0Predictor(F0Predictor): 8 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): 9 | self.hop_length = hop_length 10 | self.f0_min = f0_min 11 | self.f0_max = f0_max 12 | self.sampling_rate = sampling_rate 13 | 14 | def interpolate_f0(self, f0): 15 | """ 16 | Interpolate F0 over unvoiced frames. 17 | """ 18 | 19 | data = np.reshape(f0, (f0.size, 1)) 20 | 21 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32) 22 | vuv_vector[data > 0.0] = 1.0 23 | vuv_vector[data <= 0.0] = 0.0 24 | 25 | ip_data = data 26 | 27 | frame_number = data.size 28 | last_value = 0.0 29 | for i in range(frame_number): 30 | if data[i] <= 0.0: 31 | j = i + 1 32 | for j in range(i + 1, frame_number): 33 | if data[j] > 0.0: 34 | break 35 | if j < frame_number - 1: 36 | if last_value > 0.0: 37 | step = (data[j] - data[i - 1]) / float(j - i) 38 | for k in range(i, j): 39 | ip_data[k] = data[i - 1] + step * (k - i + 1) 40 | else: 41 | for k in range(i, j): 42 | ip_data[k] = data[j] 43 | else: 44 | for k in range(i, frame_number): 45 | ip_data[k] = last_value 46 | else: 47 | ip_data[i] = data[i] # this copy may be unnecessary 48 | last_value = data[i] 49 | 50 | return ip_data[:, 0], vuv_vector[:, 0] 51 | 52 | def compute_f0(self, wav, p_len=None): 53 | x = wav 54 | if p_len is None: 55 | p_len = x.shape[0] // self.hop_length 56 | else: 57 | assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" 58 | time_step = self.hop_length / self.sampling_rate * 1000 59 | f0 = ( 60 | parselmouth.Sound(x, self.sampling_rate) 61 | .to_pitch_ac( 62 | time_step=time_step / 1000, 63 | voicing_threshold=0.6, 64 | pitch_floor=self.f0_min, 65 | pitch_ceiling=self.f0_max, 66 | ) 67 | .selected_array["frequency"] 68 | ) 69 | 70 | pad_size = (p_len - len(f0) + 1) // 2 71 | if pad_size > 0 or p_len - len(f0) - pad_size > 0: 72 | f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") 73 | f0, uv = self.interpolate_f0(f0) 74 | return f0 75 | 76 | def compute_f0_uv(self, wav, p_len=None): 77 | x = wav 78 | if p_len is None: 79 | p_len = x.shape[0] // self.hop_length 80 | else: 81 | assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" 82 | time_step = self.hop_length / self.sampling_rate * 1000 83 | f0 = ( 84 | parselmouth.Sound(x, self.sampling_rate) 85 | .to_pitch_ac( 86 | time_step=time_step / 1000, 87 | voicing_threshold=0.6, 88 | pitch_floor=self.f0_min, 89 | pitch_ceiling=self.f0_max, 90 | ) 91 | .selected_array["frequency"] 92 | ) 93 | 94 | pad_size = (p_len - len(f0) + 1) // 2 95 | if pad_size > 0 or p_len - len(f0) -
pad_size > 0: 96 | f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") 97 | f0, uv = self.interpolate_f0(f0) 98 | return f0, uv 99 | -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/modules/F0Predictor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/infer_pack/modules/F0Predictor/__init__.py -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/onnx_inference.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | import onnxruntime 4 | import soundfile 5 | 6 | import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class ContentVec: 12 | def __init__(self, vec_path="pretrained/vec-768-layer-12.onnx", device=None): 13 | logger.info("Load model(s) from {}".format(vec_path)) 14 | if device == "cpu" or device is None: 15 | providers = ["CPUExecutionProvider"] 16 | elif device == "cuda": 17 | providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] 18 | elif device == "dml": 19 | providers = ["DmlExecutionProvider"] 20 | else: 21 | raise RuntimeError("Unsportted Device") 22 | self.model = onnxruntime.InferenceSession(vec_path, providers=providers) 23 | 24 | def __call__(self, wav): 25 | return self.forward(wav) 26 | 27 | def forward(self, wav): 28 | feats = wav 29 | if feats.ndim == 2: # double channels 30 | feats = feats.mean(-1) 31 | assert feats.ndim == 1, feats.ndim 32 | feats = np.expand_dims(np.expand_dims(feats, 0), 0) 33 | onnx_input = {self.model.get_inputs()[0].name: feats} 34 | logits = self.model.run(None, onnx_input)[0] 35 | return logits.transpose(0, 2, 1) 36 | 37 | 38 | def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs): 39 | if f0_predictor == "pm": 40 | from lib.infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor 41 | 42 | f0_predictor_object = PMF0Predictor( 43 | hop_length=hop_length, sampling_rate=sampling_rate 44 | ) 45 | elif f0_predictor == "harvest": 46 | from lib.infer_pack.modules.F0Predictor.HarvestF0Predictor import ( 47 | HarvestF0Predictor, 48 | ) 49 | 50 | f0_predictor_object = HarvestF0Predictor( 51 | hop_length=hop_length, sampling_rate=sampling_rate 52 | ) 53 | elif f0_predictor == "dio": 54 | from lib.infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor 55 | 56 | f0_predictor_object = DioF0Predictor( 57 | hop_length=hop_length, sampling_rate=sampling_rate 58 | ) 59 | else: 60 | raise Exception("Unknown f0 predictor") 61 | return f0_predictor_object 62 | 63 | 64 | class OnnxRVC: 65 | def __init__( 66 | self, 67 | model_path, 68 | sr=40000, 69 | hop_size=512, 70 | vec_path="vec-768-layer-12", 71 | device="cpu", 72 | ): 73 | vec_path = f"pretrained/{vec_path}.onnx" 74 | self.vec_model = ContentVec(vec_path, device) 75 | if device == "cpu" or device is None: 76 | providers = ["CPUExecutionProvider"] 77 | elif device == "cuda": 78 | providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] 79 | elif device == "dml": 80 | providers = ["DmlExecutionProvider"] 81 | else: 82 | raise RuntimeError("Unsportted Device") 83 | self.model = onnxruntime.InferenceSession(model_path, providers=providers) 84 | self.sampling_rate = sr 85 | self.hop_size = hop_size 86 | 87 | def forward(self, hubert, hubert_length, pitch, pitchf, ds, rnd): 88 | 
onnx_input = { 89 | self.model.get_inputs()[0].name: hubert, 90 | self.model.get_inputs()[1].name: hubert_length, 91 | self.model.get_inputs()[2].name: pitch, 92 | self.model.get_inputs()[3].name: pitchf, 93 | self.model.get_inputs()[4].name: ds, 94 | self.model.get_inputs()[5].name: rnd, 95 | } 96 | return (self.model.run(None, onnx_input)[0] * 32767).astype(np.int16) 97 | 98 | def inference( 99 | self, 100 | raw_path, 101 | sid, 102 | f0_method="dio", 103 | f0_up_key=0, 104 | pad_time=0.5, 105 | cr_threshold=0.02, 106 | ): 107 | f0_min = 50 108 | f0_max = 1100 109 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 110 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 111 | f0_predictor = get_f0_predictor( 112 | f0_method, 113 | hop_length=self.hop_size, 114 | sampling_rate=self.sampling_rate, 115 | threshold=cr_threshold, 116 | ) 117 | wav, sr = librosa.load(raw_path, sr=self.sampling_rate) 118 | org_length = len(wav) 119 | if org_length / sr > 50.0: 120 | raise RuntimeError("Reached Max Length") 121 | 122 | wav16k = librosa.resample(wav, orig_sr=self.sampling_rate, target_sr=16000) 123 | wav16k = wav16k 124 | 125 | hubert = self.vec_model(wav16k) 126 | hubert = np.repeat(hubert, 2, axis=2).transpose(0, 2, 1).astype(np.float32) 127 | hubert_length = hubert.shape[1] 128 | 129 | pitchf = f0_predictor.compute_f0(wav, hubert_length) 130 | pitchf = pitchf * 2 ** (f0_up_key / 12) 131 | pitch = pitchf.copy() 132 | f0_mel = 1127 * np.log(1 + pitch / 700) 133 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( 134 | f0_mel_max - f0_mel_min 135 | ) + 1 136 | f0_mel[f0_mel <= 1] = 1 137 | f0_mel[f0_mel > 255] = 255 138 | pitch = np.rint(f0_mel).astype(np.int64) 139 | 140 | pitchf = pitchf.reshape(1, len(pitchf)).astype(np.float32) 141 | pitch = pitch.reshape(1, len(pitch)) 142 | ds = np.array([sid]).astype(np.int64) 143 | 144 | rnd = np.random.randn(1, 192, hubert_length).astype(np.float32) 145 | hubert_length = np.array([hubert_length]).astype(np.int64) 146 | 147 | out_wav = self.forward(hubert, hubert_length, pitch, pitchf, ds, rnd).squeeze() 148 | out_wav = np.pad(out_wav, (0, 2 * self.hop_size), "constant") 149 | return out_wav[0:org_length] 150 | -------------------------------------------------------------------------------- /rvc/infer/lib/jit/__init__.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | import pickle 3 | import time 4 | import torch 5 | from tqdm import tqdm 6 | from collections import OrderedDict 7 | 8 | 9 | def load_inputs(path, device, is_half=False): 10 | parm = torch.load(path, map_location=torch.device("cpu")) 11 | for key in parm.keys(): 12 | parm[key] = parm[key].to(device) 13 | if is_half and parm[key].dtype == torch.float32: 14 | parm[key] = parm[key].half() 15 | elif not is_half and parm[key].dtype == torch.float16: 16 | parm[key] = parm[key].float() 17 | return parm 18 | 19 | 20 | def benchmark( 21 | model, inputs_path, device=torch.device("cpu"), epoch=1000, is_half=False 22 | ): 23 | parm = load_inputs(inputs_path, device, is_half) 24 | total_ts = 0.0 25 | bar = tqdm(range(epoch)) 26 | for i in bar: 27 | start_time = time.perf_counter() 28 | o = model(**parm) 29 | total_ts += time.perf_counter() - start_time 30 | print(f"num_epoch: {epoch} | avg time(ms): {(total_ts*1000)/epoch}") 31 | 32 | 33 | def jit_warm_up(model, inputs_path, device=torch.device("cpu"), epoch=5, is_half=False): 34 | benchmark(model, inputs_path, device, epoch=epoch, is_half=is_half) 35 | 36 | 37 | def to_jit_model( 38 | 
model_path, 39 | model_type: str, 40 | mode: str = "trace", 41 | inputs_path: str = None, 42 | device=torch.device("cpu"), 43 | is_half=False, 44 | ): 45 | model = None 46 | if model_type.lower() == "synthesizer": 47 | from .get_synthesizer import get_synthesizer 48 | 49 | model, _ = get_synthesizer(model_path, device) 50 | model.forward = model.infer 51 | elif model_type.lower() == "rmvpe": 52 | from .get_rmvpe import get_rmvpe 53 | 54 | model = get_rmvpe(model_path, device) 55 | elif model_type.lower() == "hubert": 56 | from .get_hubert import get_hubert_model 57 | 58 | model = get_hubert_model(model_path, device) 59 | model.forward = model.infer 60 | else: 61 | raise ValueError(f"No model type named {model_type}") 62 | model = model.eval() 63 | model = model.half() if is_half else model.float() 64 | if mode == "trace": 65 | assert not inputs_path 66 | inputs = load_inputs(inputs_path, device, is_half) 67 | model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs) 68 | elif mode == "script": 69 | model_jit = torch.jit.script(model) 70 | model_jit.to(device) 71 | model_jit = model_jit.half() if is_half else model_jit.float() 72 | # model = model.half() if is_half else model.float() 73 | return (model, model_jit) 74 | 75 | 76 | def export( 77 | model: torch.nn.Module, 78 | mode: str = "trace", 79 | inputs: dict = None, 80 | device=torch.device("cpu"), 81 | is_half: bool = False, 82 | ) -> dict: 83 | model = model.half() if is_half else model.float() 84 | model.eval() 85 | if mode == "trace": 86 | assert inputs is not None 87 | model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs) 88 | elif mode == "script": 89 | model_jit = torch.jit.script(model) 90 | model_jit.to(device) 91 | model_jit = model_jit.half() if is_half else model_jit.float() 92 | buffer = BytesIO() 93 | # model_jit=model_jit.cpu() 94 | torch.jit.save(model_jit, buffer) 95 | del model_jit 96 | cpt = OrderedDict() 97 | cpt["model"] = buffer.getvalue() 98 | cpt["is_half"] = is_half 99 | return cpt 100 | 101 | 102 | def load(path: str): 103 | with open(path, "rb") as f: 104 | return pickle.load(f) 105 | 106 | 107 | def save(ckpt: dict, save_path: str): 108 | with open(save_path, "wb") as f: 109 | pickle.dump(ckpt, f) 110 | 111 | 112 | def rmvpe_jit_export( 113 | model_path: str, 114 | mode: str = "script", 115 | inputs_path: str = None, 116 | save_path: str = None, 117 | device=torch.device("cpu"), 118 | is_half=False, 119 | ): 120 | if not save_path: 121 | save_path = model_path.rstrip(".pth") 122 | save_path += ".half.jit" if is_half else ".jit" 123 | if "cuda" in str(device) and ":" not in str(device): 124 | device = torch.device("cuda:0") 125 | from .get_rmvpe import get_rmvpe 126 | 127 | model = get_rmvpe(model_path, device) 128 | inputs = None 129 | if mode == "trace": 130 | inputs = load_inputs(inputs_path, device, is_half) 131 | ckpt = export(model, mode, inputs, device, is_half) 132 | ckpt["device"] = str(device) 133 | save(ckpt, save_path) 134 | return ckpt 135 | 136 | 137 | def synthesizer_jit_export( 138 | model_path: str, 139 | mode: str = "script", 140 | inputs_path: str = None, 141 | save_path: str = None, 142 | device=torch.device("cpu"), 143 | is_half=False, 144 | ): 145 | if not save_path: 146 | save_path = model_path.rstrip(".pth") 147 | save_path += ".half.jit" if is_half else ".jit" 148 | if "cuda" in str(device) and ":" not in str(device): 149 | device = torch.device("cuda:0") 150 | from .get_synthesizer import get_synthesizer 151 | 152 | model, cpt = get_synthesizer(model_path, device) 153 
| assert isinstance(cpt, dict) 154 | model.forward = model.infer 155 | inputs = None 156 | if mode == "trace": 157 | inputs = load_inputs(inputs_path, device, is_half) 158 | ckpt = export(model, mode, inputs, device, is_half) 159 | cpt.pop("weight") 160 | cpt["model"] = ckpt["model"] 161 | cpt["device"] = device 162 | save(cpt, save_path) 163 | return cpt 164 | -------------------------------------------------------------------------------- /rvc/infer/lib/jit/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/jit/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/jit/get_rmvpe.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_rmvpe(model_path="assets/rmvpe/rmvpe.pt", device=torch.device("cpu")): 5 | from infer.lib.rmvpe import E2E 6 | 7 | model = E2E(4, 1, (2, 2)) 8 | ckpt = torch.load(model_path, map_location=device) 9 | model.load_state_dict(ckpt) 10 | model.eval() 11 | model = model.to(device) 12 | return model 13 | -------------------------------------------------------------------------------- /rvc/infer/lib/jit/get_synthesizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_synthesizer(pth_path, device=torch.device("cpu")): 5 | from infer.lib.infer_pack.models import ( 6 | SynthesizerTrnMs256NSFsid, 7 | SynthesizerTrnMs256NSFsid_nono, 8 | SynthesizerTrnMs768NSFsid, 9 | SynthesizerTrnMs768NSFsid_nono, 10 | ) 11 | 12 | cpt = torch.load(pth_path, map_location=torch.device("cpu")) 13 | # tgt_sr = cpt["config"][-1] 14 | cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] 15 | if_f0 = cpt.get("f0", 1) 16 | version = cpt.get("version", "v1") 17 | if version == "v1": 18 | if if_f0 == 1: 19 | net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=False) 20 | else: 21 | net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) 22 | elif version == "v2": 23 | if if_f0 == 1: 24 | net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=False) 25 | else: 26 | net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) 27 | del net_g.enc_q 28 | # net_g.forward = net_g.infer 29 | # ckpt = {} 30 | # ckpt["config"] = cpt["config"] 31 | # ckpt["f0"] = if_f0 32 | # ckpt["version"] = version 33 | # ckpt["info"] = cpt.get("info", "0epoch") 34 | net_g.load_state_dict(cpt["weight"], strict=False) 35 | net_g = net_g.float() 36 | net_g.eval().to(device) 37 | net_g.remove_weight_norm() 38 | return net_g, cpt 39 | -------------------------------------------------------------------------------- /rvc/infer/lib/train/__pycache__/data_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/train/__pycache__/data_utils.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/train/__pycache__/losses.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/train/__pycache__/losses.cpython-310.pyc 
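As a quick orientation to the TorchScript helpers in `rvc/infer/lib/jit/` above (`export`, `load`, `save`, `synthesizer_jit_export`, together with `get_synthesizer`): they wrap a checkpoint into a pickled dict whose `"model"` entry holds the serialized TorchScript bytes. The sketch below is illustrative only — the checkpoint path and the `rvc.` import prefix are assumptions (it presumes the directory containing `rvc` is on `sys.path`), `"script"` mode may or may not script a given synthesizer cleanly, and `"trace"` mode would additionally need an `inputs_path`.

```python
# Hedged usage sketch for the jit export helpers above; the paths and the
# import prefix are assumptions, not something this repo ships or documents.
from io import BytesIO

import torch

from rvc.infer.lib.jit import load, synthesizer_jit_export  # assumes repo parent dir on sys.path

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Export a synthesizer .pth to TorchScript; "script" mode needs no example inputs,
# "trace" mode would also require inputs_path pointing at saved example tensors.
cpt = synthesizer_jit_export(
    model_path="assets/weights/my_voice.pth",  # hypothetical checkpoint path
    mode="script",
    device=device,
    is_half=False,
)

# The helper pickles a dict whose "model" field is the TorchScript buffer;
# load() unpickles it and torch.jit.load() rebuilds the scripted module.
ckpt = load("assets/weights/my_voice.jit")
scripted = torch.jit.load(BytesIO(ckpt["model"]), map_location=device)
scripted.eval()
```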
-------------------------------------------------------------------------------- /rvc/infer/lib/train/__pycache__/mel_processing.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/train/__pycache__/mel_processing.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/train/__pycache__/process_ckpt.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/train/__pycache__/process_ckpt.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/train/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/train/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/train/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def feature_loss(fmap_r, fmap_g): 5 | loss = 0 6 | for dr, dg in zip(fmap_r, fmap_g): 7 | for rl, gl in zip(dr, dg): 8 | rl = rl.float().detach() 9 | gl = gl.float() 10 | loss += torch.mean(torch.abs(rl - gl)) 11 | 12 | return loss * 2 13 | 14 | 15 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 16 | loss = 0 17 | r_losses = [] 18 | g_losses = [] 19 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 20 | dr = dr.float() 21 | dg = dg.float() 22 | r_loss = torch.mean((1 - dr) ** 2) 23 | g_loss = torch.mean(dg**2) 24 | loss += r_loss + g_loss 25 | r_losses.append(r_loss.item()) 26 | g_losses.append(g_loss.item()) 27 | 28 | return loss, r_losses, g_losses 29 | 30 | 31 | def generator_loss(disc_outputs): 32 | loss = 0 33 | gen_losses = [] 34 | for dg in disc_outputs: 35 | dg = dg.float() 36 | l = torch.mean((1 - dg) ** 2) 37 | gen_losses.append(l) 38 | loss += l 39 | 40 | return loss, gen_losses 41 | 42 | 43 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 44 | """ 45 | z_p, logs_q: [b, h, t_t] 46 | m_p, logs_p: [b, h, t_t] 47 | """ 48 | z_p = z_p.float() 49 | logs_q = logs_q.float() 50 | m_p = m_p.float() 51 | logs_p = logs_p.float() 52 | z_mask = z_mask.float() 53 | 54 | kl = logs_p - logs_q - 0.5 55 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 56 | kl = torch.sum(kl * z_mask) 57 | l = kl / torch.sum(z_mask) 58 | return l 59 | -------------------------------------------------------------------------------- /rvc/infer/lib/train/mel_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data 3 | from librosa.filters import mel as librosa_mel_fn 4 | import logging 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | MAX_WAV_VALUE = 32768.0 9 | 10 | 11 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 12 | """ 13 | PARAMS 14 | ------ 15 | C: compression factor 16 | """ 17 | return torch.log(torch.clamp(x, min=clip_val) * C) 18 | 19 | 20 | def dynamic_range_decompression_torch(x, C=1): 21 | """ 22 | PARAMS 23 | ------ 24 | C: compression factor used to compress 25 | """ 26 | return torch.exp(x) / C 27 | 28 | 29 | def 
spectral_normalize_torch(magnitudes): 30 | return dynamic_range_compression_torch(magnitudes) 31 | 32 | 33 | def spectral_de_normalize_torch(magnitudes): 34 | return dynamic_range_decompression_torch(magnitudes) 35 | 36 | 37 | # Reusable banks 38 | mel_basis = {} 39 | hann_window = {} 40 | 41 | 42 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 43 | """Convert waveform into Linear-frequency Linear-amplitude spectrogram. 44 | 45 | Args: 46 | y :: (B, T) - Audio waveforms 47 | n_fft 48 | sampling_rate 49 | hop_size 50 | win_size 51 | center 52 | Returns: 53 | :: (B, Freq, Frame) - Linear-frequency Linear-amplitude spectrogram 54 | """ 55 | 56 | # Window - Cache if needed 57 | global hann_window 58 | dtype_device = str(y.dtype) + "_" + str(y.device) 59 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 60 | if wnsize_dtype_device not in hann_window: 61 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 62 | dtype=y.dtype, device=y.device 63 | ) 64 | 65 | # Padding 66 | y = torch.nn.functional.pad( 67 | y.unsqueeze(1), 68 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 69 | mode="reflect", 70 | ) 71 | y = y.squeeze(1) 72 | 73 | # Complex Spectrogram :: (B, T) -> (B, Freq, Frame, RealComplex=2) 74 | spec = torch.stft( 75 | y, 76 | n_fft, 77 | hop_length=hop_size, 78 | win_length=win_size, 79 | window=hann_window[wnsize_dtype_device], 80 | center=center, 81 | pad_mode="reflect", 82 | normalized=False, 83 | onesided=True, 84 | return_complex=True, 85 | ) 86 | 87 | # Linear-frequency Linear-amplitude spectrogram :: (B, Freq, Frame, RealComplex=2) -> (B, Freq, Frame) 88 | spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6) 89 | return spec 90 | 91 | 92 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 93 | # MelBasis - Cache if needed 94 | global mel_basis 95 | dtype_device = str(spec.dtype) + "_" + str(spec.device) 96 | fmax_dtype_device = str(fmax) + "_" + dtype_device 97 | if fmax_dtype_device not in mel_basis: 98 | mel = librosa_mel_fn( 99 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax 100 | ) 101 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 102 | dtype=spec.dtype, device=spec.device 103 | ) 104 | 105 | # Mel-frequency Log-amplitude spectrogram :: (B, Freq=num_mels, Frame) 106 | melspec = torch.matmul(mel_basis[fmax_dtype_device], spec) 107 | melspec = spectral_normalize_torch(melspec) 108 | return melspec 109 | 110 | 111 | def mel_spectrogram_torch( 112 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False 113 | ): 114 | """Convert waveform into Mel-frequency Log-amplitude spectrogram. 
115 | 116 | Args: 117 | y :: (B, T) - Waveforms 118 | Returns: 119 | melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram 120 | """ 121 | # Linear-frequency Linear-amplitude spectrogram :: (B, T) -> (B, Freq, Frame) 122 | spec = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center) 123 | 124 | # Mel-frequency Log-amplitude spectrogram :: (B, Freq, Frame) -> (B, Freq=num_mels, Frame) 125 | melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax) 126 | 127 | return melspec 128 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import numpy as np 5 | import torch 6 | import torch.utils.data 7 | from tqdm import tqdm 8 | 9 | from . import spec_utils 10 | 11 | 12 | class VocalRemoverValidationSet(torch.utils.data.Dataset): 13 | def __init__(self, patch_list): 14 | self.patch_list = patch_list 15 | 16 | def __len__(self): 17 | return len(self.patch_list) 18 | 19 | def __getitem__(self, idx): 20 | path = self.patch_list[idx] 21 | data = np.load(path) 22 | 23 | X, y = data["X"], data["y"] 24 | 25 | X_mag = np.abs(X) 26 | y_mag = np.abs(y) 27 | 28 | return X_mag, y_mag 29 | 30 | 31 | def make_pair(mix_dir, inst_dir): 32 | input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"] 33 | 34 | X_list = sorted( 35 | [ 36 | os.path.join(mix_dir, fname) 37 | for fname in os.listdir(mix_dir) 38 | if os.path.splitext(fname)[1] in input_exts 39 | ] 40 | ) 41 | y_list = sorted( 42 | [ 43 | os.path.join(inst_dir, fname) 44 | for fname in os.listdir(inst_dir) 45 | if os.path.splitext(fname)[1] in input_exts 46 | ] 47 | ) 48 | 49 | filelist = list(zip(X_list, y_list)) 50 | 51 | return filelist 52 | 53 | 54 | def train_val_split(dataset_dir, split_mode, val_rate, val_filelist): 55 | if split_mode == "random": 56 | filelist = make_pair( 57 | os.path.join(dataset_dir, "mixtures"), 58 | os.path.join(dataset_dir, "instruments"), 59 | ) 60 | 61 | random.shuffle(filelist) 62 | 63 | if len(val_filelist) == 0: 64 | val_size = int(len(filelist) * val_rate) 65 | train_filelist = filelist[:-val_size] 66 | val_filelist = filelist[-val_size:] 67 | else: 68 | train_filelist = [ 69 | pair for pair in filelist if list(pair) not in val_filelist 70 | ] 71 | elif split_mode == "subdirs": 72 | if len(val_filelist) != 0: 73 | raise ValueError( 74 | "The `val_filelist` option is not available in `subdirs` mode" 75 | ) 76 | 77 | train_filelist = make_pair( 78 | os.path.join(dataset_dir, "training/mixtures"), 79 | os.path.join(dataset_dir, "training/instruments"), 80 | ) 81 | 82 | val_filelist = make_pair( 83 | os.path.join(dataset_dir, "validation/mixtures"), 84 | os.path.join(dataset_dir, "validation/instruments"), 85 | ) 86 | 87 | return train_filelist, val_filelist 88 | 89 | 90 | def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha): 91 | perm = np.random.permutation(len(X)) 92 | for i, idx in enumerate(tqdm(perm)): 93 | if np.random.uniform() < reduction_rate: 94 | y[idx] = spec_utils.reduce_vocal_aggressively( 95 | X[idx], y[idx], reduction_mask 96 | ) 97 | 98 | if np.random.uniform() < 0.5: 99 | # swap channel 100 | X[idx] = X[idx, ::-1] 101 | y[idx] = y[idx, ::-1] 102 | if np.random.uniform() < 0.02: 103 | # mono 104 | X[idx] = X[idx].mean(axis=0, keepdims=True) 105 | y[idx] = y[idx].mean(axis=0, keepdims=True) 106 | if np.random.uniform() < 0.02: 107 | # inst 108 
| X[idx] = y[idx] 109 | 110 | if np.random.uniform() < mixup_rate and i < len(perm) - 1: 111 | lam = np.random.beta(mixup_alpha, mixup_alpha) 112 | X[idx] = lam * X[idx] + (1 - lam) * X[perm[i + 1]] 113 | y[idx] = lam * y[idx] + (1 - lam) * y[perm[i + 1]] 114 | 115 | return X, y 116 | 117 | 118 | def make_padding(width, cropsize, offset): 119 | left = offset 120 | roi_size = cropsize - left * 2 121 | if roi_size == 0: 122 | roi_size = cropsize 123 | right = roi_size - (width % roi_size) + left 124 | 125 | return left, right, roi_size 126 | 127 | 128 | def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset): 129 | len_dataset = patches * len(filelist) 130 | 131 | X_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64) 132 | y_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64) 133 | 134 | for i, (X_path, y_path) in enumerate(tqdm(filelist)): 135 | X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft) 136 | coef = np.max([np.abs(X).max(), np.abs(y).max()]) 137 | X, y = X / coef, y / coef 138 | 139 | l, r, roi_size = make_padding(X.shape[2], cropsize, offset) 140 | X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant") 141 | y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant") 142 | 143 | starts = np.random.randint(0, X_pad.shape[2] - cropsize, patches) 144 | ends = starts + cropsize 145 | for j in range(patches): 146 | idx = i * patches + j 147 | X_dataset[idx] = X_pad[:, :, starts[j] : ends[j]] 148 | y_dataset[idx] = y_pad[:, :, starts[j] : ends[j]] 149 | 150 | return X_dataset, y_dataset 151 | 152 | 153 | def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset): 154 | patch_list = [] 155 | patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format( 156 | cropsize, sr, hop_length, n_fft, offset 157 | ) 158 | os.makedirs(patch_dir, exist_ok=True) 159 | 160 | for i, (X_path, y_path) in enumerate(tqdm(filelist)): 161 | basename = os.path.splitext(os.path.basename(X_path))[0] 162 | 163 | X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft) 164 | coef = np.max([np.abs(X).max(), np.abs(y).max()]) 165 | X, y = X / coef, y / coef 166 | 167 | l, r, roi_size = make_padding(X.shape[2], cropsize, offset) 168 | X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant") 169 | y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant") 170 | 171 | len_dataset = int(np.ceil(X.shape[2] / roi_size)) 172 | for j in range(len_dataset): 173 | outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j)) 174 | start = j * roi_size 175 | if not os.path.exists(outpath): 176 | np.savez( 177 | outpath, 178 | X=X_pad[:, :, start : start + cropsize], 179 | y=y_pad[:, :, start : start + cropsize], 180 | ) 181 | patch_list.append(outpath) 182 | 183 | return VocalRemoverValidationSet(patch_list) 184 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . 
import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py: -------------------------------------------------------------------------------- 1 | import torch 2 | import 
torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | 
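The `layers*.py` variants above all define the same encoder/decoder/ASPP building blocks used by the UVR5 vocal-remover `nets*.py` models; the variants differ mainly in their ASPP dilation setup. As a minimal, purely illustrative shape walk-through (channel counts and input size are arbitrary, not the values the bundled nets use, and the import prefix assumes the repo's parent directory is on `sys.path`):

```python
# Minimal shape check for the blocks defined in layers.py; illustrative only --
# the channel counts below are arbitrary, not those used by nets.py.
import torch

from rvc.infer.lib.uvr5_pack.lib_v5 import layers  # import prefix is an assumption

enc = layers.Encoder(2, 16, ksize=3, stride=2, pad=1)   # halves freq/time resolution
aspp = layers.ASPPModule(16, 16)                        # multi-dilation context, same size
dec = layers.Decoder(16 + 16, 2, ksize=3, pad=1)        # upsample 2x, concat skip

x = torch.randn(1, 2, 64, 128)   # (batch, channels, freq bins, frames)
h, skip = enc(x)                 # h: (1, 16, 32, 64); skip keeps the input resolution
h = aspp(h)                      # (1, 16, 32, 64)
y = dec(h, skip)                 # (1, 2, 64, 128) after upsampling and skip concat
print(h.shape, y.shape)
```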
-------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.conv6 = SeperableConv2DBNActiv( 104 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 105 | ) 106 | self.conv7 = SeperableConv2DBNActiv( 107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 108 | ) 109 | self.bottleneck = nn.Sequential( 110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 111 | ) 112 | 113 | def forward(self, x): 114 | _, _, h, w = x.size() 115 | feat1 = F.interpolate( 116 | self.conv1(x), 
size=(h, w), mode="bilinear", align_corners=True 117 | ) 118 | feat2 = self.conv2(x) 119 | feat3 = self.conv3(x) 120 | feat4 = self.conv4(x) 121 | feat5 = self.conv5(x) 122 | feat6 = self.conv6(x) 123 | feat7 = self.conv7(x) 124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) 125 | bottle = self.bottleneck(out) 126 | return bottle 127 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.conv6 = SeperableConv2DBNActiv( 104 | nin, nin, 3, 1, dilations[2], dilations[2], 
activ=activ 105 | ) 106 | self.conv7 = SeperableConv2DBNActiv( 107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 108 | ) 109 | self.bottleneck = nn.Sequential( 110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 111 | ) 112 | 113 | def forward(self, x): 114 | _, _, h, w = x.size() 115 | feat1 = F.interpolate( 116 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 117 | ) 118 | feat2 = self.conv2(x) 119 | feat3 = self.conv3(x) 120 | feat4 = self.conv4(x) 121 | feat5 = self.conv5(x) 122 | feat6 = self.conv6(x) 123 | feat7 = self.conv7(x) 124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) 125 | bottle = self.bottleneck(out) 126 | return bottle 127 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 
3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.conv6 = SeperableConv2DBNActiv( 104 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 105 | ) 106 | self.conv7 = SeperableConv2DBNActiv( 107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 108 | ) 109 | self.bottleneck = nn.Sequential( 110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 111 | ) 112 | 113 | def forward(self, x): 114 | _, _, h, w = x.size() 115 | feat1 = F.interpolate( 116 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 117 | ) 118 | feat2 = self.conv2(x) 119 | feat3 = self.conv3(x) 120 | feat4 = self.conv4(x) 121 | feat5 = self.conv5(x) 122 | feat6 = self.conv6(x) 123 | feat7 = self.conv7(x) 124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) 125 | bottle = self.bottleneck(out) 126 | return bottle 127 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/layers_new.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class Encoder(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 31 | super(Encoder, self).__init__() 32 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ) 33 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) 34 | 35 | def __call__(self, x): 36 | h = self.conv1(x) 37 | h = self.conv2(h) 38 | 39 | return h 40 | 41 | 42 | class Decoder(nn.Module): 43 | def __init__( 44 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 45 | ): 46 | super(Decoder, self).__init__() 47 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 48 | # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) 49 | self.dropout = nn.Dropout2d(0.1) if dropout else None 50 | 51 | def __call__(self, x, skip=None): 52 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 53 | 54 | if skip is not None: 55 | skip = spec_utils.crop_center(skip, x) 56 | x = torch.cat([x, skip], dim=1) 57 | 58 | h = self.conv1(x) 59 | # h = self.conv2(h) 60 | 61 | if self.dropout is not None: 62 | h = self.dropout(h) 63 | 64 | return h 65 | 66 | 67 | class ASPPModule(nn.Module): 68 | def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False): 69 | super(ASPPModule, self).__init__() 70 | self.conv1 = nn.Sequential( 71 | nn.AdaptiveAvgPool2d((1, None)), 72 | Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ), 73 | ) 74 | self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ) 75 | self.conv3 = Conv2DBNActiv( 76 | nin, nout, 3, 1, dilations[0], dilations[0], activ=activ 77 | ) 78 | self.conv4 = Conv2DBNActiv( 79 
| nin, nout, 3, 1, dilations[1], dilations[1], activ=activ 80 | ) 81 | self.conv5 = Conv2DBNActiv( 82 | nin, nout, 3, 1, dilations[2], dilations[2], activ=activ 83 | ) 84 | self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ) 85 | self.dropout = nn.Dropout2d(0.1) if dropout else None 86 | 87 | def forward(self, x): 88 | _, _, h, w = x.size() 89 | feat1 = F.interpolate( 90 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 91 | ) 92 | feat2 = self.conv2(x) 93 | feat3 = self.conv3(x) 94 | feat4 = self.conv4(x) 95 | feat5 = self.conv5(x) 96 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 97 | out = self.bottleneck(out) 98 | 99 | if self.dropout is not None: 100 | out = self.dropout(out) 101 | 102 | return out 103 | 104 | 105 | class LSTMModule(nn.Module): 106 | def __init__(self, nin_conv, nin_lstm, nout_lstm): 107 | super(LSTMModule, self).__init__() 108 | self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0) 109 | self.lstm = nn.LSTM( 110 | input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True 111 | ) 112 | self.dense = nn.Sequential( 113 | nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU() 114 | ) 115 | 116 | def forward(self, x): 117 | N, _, nbins, nframes = x.size() 118 | h = self.conv(x)[:, 0] # N, nbins, nframes 119 | h = h.permute(2, 0, 1) # nframes, N, nbins 120 | h, _ = self.lstm(h) 121 | h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins 122 | h = h.reshape(nframes, N, 1, nbins) 123 | h = h.permute(1, 2, 3, 0) 124 | 125 | return h 126 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/model_param_init.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pathlib 4 | 5 | default_param = {} 6 | default_param["bins"] = 768 7 | default_param["unstable_bins"] = 9 # training only 8 | default_param["reduction_bins"] = 762 # training only 9 | default_param["sr"] = 44100 10 | default_param["pre_filter_start"] = 757 11 | default_param["pre_filter_stop"] = 768 12 | default_param["band"] = {} 13 | 14 | 15 | default_param["band"][1] = { 16 | "sr": 11025, 17 | "hl": 128, 18 | "n_fft": 960, 19 | "crop_start": 0, 20 | "crop_stop": 245, 21 | "lpf_start": 61, # inference only 22 | "res_type": "polyphase", 23 | } 24 | 25 | default_param["band"][2] = { 26 | "sr": 44100, 27 | "hl": 512, 28 | "n_fft": 1536, 29 | "crop_start": 24, 30 | "crop_stop": 547, 31 | "hpf_start": 81, # inference only 32 | "res_type": "sinc_best", 33 | } 34 | 35 | 36 | def int_keys(d): 37 | r = {} 38 | for k, v in d: 39 | if k.isdigit(): 40 | k = int(k) 41 | r[k] = v 42 | return r 43 | 44 | 45 | class ModelParameters(object): 46 | def __init__(self, config_path=""): 47 | if ".pth" == pathlib.Path(config_path).suffix: 48 | import zipfile 49 | 50 | with zipfile.ZipFile(config_path, "r") as zip: 51 | self.param = json.loads( 52 | zip.read("param.json"), object_pairs_hook=int_keys 53 | ) 54 | elif ".json" == pathlib.Path(config_path).suffix: 55 | with open(config_path, "r") as f: 56 | self.param = json.loads(f.read(), object_pairs_hook=int_keys) 57 | else: 58 | self.param = default_param 59 | 60 | for k in [ 61 | "mid_side", 62 | "mid_side_b", 63 | "mid_side_b2", 64 | "stereo_w", 65 | "stereo_n", 66 | "reverse", 67 | ]: 68 | if not k in self.param: 69 | self.param[k] = False 70 | -------------------------------------------------------------------------------- 
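Each JSON under `modelparams/` below describes the band-split STFT layout one UVR5 checkpoint expects, and the `ModelParameters` class above is their loader (band keys become integers via `int_keys`, and the mid/side/reverse flags default to `False` when absent). A small, purely illustrative read of one of the configs listed below, assuming the path is resolved from the repository root:

```python
# Illustrative only: the path assumes you run from the repository root and that
# the rvc package resolves with this prefix; adjust both for your setup.
from rvc.infer.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters

mp = ModelParameters("rvc/infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json")

print(mp.param["sr"])      # 32000: overall sample rate of this band layout
print(mp.param["bins"])    # 768: frequency bins of the combined spectrogram
for band, cfg in sorted(mp.param["band"].items()):  # keys are ints thanks to int_keys()
    print(band, cfg["sr"], cfg["hl"], cfg["n_fft"], cfg["res_type"])
```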
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 16000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 16000, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 32000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "kaiser_fast" 14 | } 15 | }, 16 | "sr": 32000, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 33075, 8 | "hl": 384, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 33075, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 1024, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 256, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 256, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 256, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 256, 18 | "pre_filter_stop": 256 19 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 700, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 700 19 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 118, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 32000, 18 | "hl": 352, 19 | "n_fft": 1024, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 44, 23 | "hpf_stop": 23, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 32000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } 31 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 512, 3 | "unstable_bins": 7, 4 | "reduction_bins": 510, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 160, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 192, 12 | "lpf_start": 41, 13 | "lpf_stop": 139, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 44100, 18 | "hl": 640, 19 | "n_fft": 1024, 20 | "crop_start": 10, 21 | "crop_stop": 320, 22 | "hpf_start": 47, 23 | "hpf_stop": 15, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 44100, 28 | "pre_filter_start": 510, 29 | "pre_filter_stop": 512 30 | } 31 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 240, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 48000, 18 | "hl": 528, 19 | "n_fft": 1536, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 82, 23 | "hpf_stop": 22, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 48000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 5, 4 | "reduction_bins": 733, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 278, 12 | "lpf_start": 28, 13 | "lpf_stop": 140, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 22050, 18 | "hl": 256, 19 | "n_fft": 768, 20 | "crop_start": 14, 21 | "crop_stop": 322, 22 | "hpf_start": 70, 23 | "hpf_stop": 14, 24 | "lpf_start": 283, 25 | "lpf_stop": 314, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 
44100, 30 | "hl": 512, 31 | "n_fft": 768, 32 | "crop_start": 131, 33 | "crop_stop": 313, 34 | "hpf_start": 154, 35 | "hpf_stop": 141, 36 | "res_type": "sinc_medium" 37 | } 38 | }, 39 | "sr": 44100, 40 | "pre_filter_start": 757, 41 | "pre_filter_stop": 768 42 | } 43 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side": true, 3 | "bins": 768, 4 | "unstable_bins": 5, 5 | "reduction_bins": 733, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 768, 11 | "crop_start": 0, 12 | "crop_stop": 278, 13 | "lpf_start": 28, 14 | "lpf_stop": 140, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 256, 20 | "n_fft": 768, 21 | "crop_start": 14, 22 | "crop_stop": 322, 23 | "hpf_start": 70, 24 | "hpf_stop": 14, 25 | "lpf_start": 283, 26 | "lpf_stop": 314, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 512, 32 | "n_fft": 768, 33 | "crop_start": 131, 34 | "crop_stop": 313, 35 | "hpf_start": 154, 36 | "hpf_stop": 141, 37 | "res_type": "sinc_medium" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 757, 42 | "pre_filter_stop": 768 43 | } 44 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 640, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 187, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 768, 21 | "crop_start": 0, 22 | "crop_stop": 212, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 174, 26 | "lpf_stop": 209, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 640, 33 | "crop_start": 66, 34 | "crop_stop": 307, 35 | "hpf_start": 86, 36 | "hpf_stop": 72, 37 | "res_type": "kaiser_fast" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 639, 42 | "pre_filter_stop": 640 43 | } 44 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 668, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 1024, 10 | "crop_start": 0, 11 | "crop_stop": 186, 12 | "lpf_start": 37, 13 | "lpf_stop": 73, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 11025, 18 | "hl": 128, 19 | "n_fft": 512, 20 | "crop_start": 4, 21 | "crop_stop": 185, 22 | "hpf_start": 36, 23 | "hpf_stop": 18, 24 | "lpf_start": 93, 25 | "lpf_stop": 185, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 22050, 30 | "hl": 256, 31 | "n_fft": 512, 32 | "crop_start": 46, 33 | "crop_stop": 186, 34 | "hpf_start": 93, 35 | "hpf_stop": 46, 36 | "lpf_start": 164, 37 | "lpf_stop": 186, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 512, 43 | "n_fft": 768, 44 | "crop_start": 121, 45 | "crop_stop": 382, 46 | "hpf_start": 138, 47 | "hpf_stop": 123, 48 | "res_type": "sinc_medium" 49 | } 50 | }, 51 | 
"sr": 44100, 52 | "pre_filter_start": 740, 53 | "pre_filter_stop": 768 54 | } 55 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "mid_side": true, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } 56 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | 
"hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json: -------------------------------------------------------------------------------- 1 | { 2 | "reverse": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json: -------------------------------------------------------------------------------- 1 | { 2 | "stereo_w": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | 
"hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "convert_channels": "stereo_n", 49 | "res_type": "kaiser_fast" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 668, 54 | "pre_filter_stop": 672 55 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 530, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 1280, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 
2048, 11 | "crop_start": 0, 12 | "crop_stop": 374, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 1536, 21 | "crop_start": 0, 22 | "crop_stop": 424, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 348, 26 | "lpf_stop": 418, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 1280, 33 | "crop_start": 132, 34 | "crop_stop": 614, 35 | "hpf_start": 172, 36 | "hpf_stop": 144, 37 | "res_type": "polyphase" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 1280, 42 | "pre_filter_stop": 1280 43 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/nets.py: -------------------------------------------------------------------------------- 1 | import layers 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | from . import spec_utils 7 | 8 | 9 | class BaseASPPNet(nn.Module): 10 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 11 | super(BaseASPPNet, self).__init__() 12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 16 | 17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 18 | 19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 23 | 24 | def __call__(self, x): 25 | h, e1 = self.enc1(x) 26 | h, e2 = self.enc2(h) 27 | h, e3 = self.enc3(h) 28 | h, e4 = self.enc4(h) 29 | 30 | h = self.aspp(h) 31 | 32 | h = self.dec4(h, e4) 33 | h = self.dec3(h, e3) 34 | h = self.dec2(h, e2) 35 | h = self.dec1(h, e1) 36 | 37 | return h 38 | 39 | 40 | class CascadedASPPNet(nn.Module): 41 | def __init__(self, n_fft): 42 | super(CascadedASPPNet, self).__init__() 43 | self.stg1_low_band_net = BaseASPPNet(2, 16) 44 | self.stg1_high_band_net = BaseASPPNet(2, 16) 45 | 46 | self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) 47 | self.stg2_full_band_net = BaseASPPNet(8, 16) 48 | 49 | self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 50 | self.stg3_full_band_net = BaseASPPNet(16, 32) 51 | 52 | self.out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) 54 | self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) 55 | 56 | self.max_bin = n_fft // 2 57 | self.output_bin = n_fft // 2 + 1 58 | 59 | self.offset = 128 60 | 61 | def forward(self, x, aggressiveness=None): 62 | mix = x.detach() 63 | x = x.clone() 64 | 65 | x = x[:, :, : self.max_bin] 66 | 67 | bandw = x.size()[2] // 2 68 | aux1 = torch.cat( 69 | [ 70 | self.stg1_low_band_net(x[:, :, :bandw]), 71 | self.stg1_high_band_net(x[:, :, bandw:]), 72 | ], 73 | dim=2, 74 | ) 75 | 76 | h = torch.cat([x, aux1], dim=1) 77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 78 | 79 | h = torch.cat([x, aux1, aux2], dim=1) 80 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 81 | 82 | mask = torch.sigmoid(self.out(h)) 83 | mask = F.pad( 84 | input=mask, 85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 86 | mode="replicate", 87 | ) 88 | 89 | if self.training: 90 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 91 | aux1 = F.pad( 92 | input=aux1, 93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 94 | mode="replicate", 95 | ) 
96 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 97 | aux2 = F.pad( 98 | input=aux2, 99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 100 | mode="replicate", 101 | ) 102 | return mask * mix, aux1 * mix, aux2 * mix 103 | else: 104 | if aggressiveness: 105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 106 | mask[:, :, : aggressiveness["split_bin"]], 107 | 1 + aggressiveness["value"] / 3, 108 | ) 109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 110 | mask[:, :, aggressiveness["split_bin"] :], 111 | 1 + aggressiveness["value"], 112 | ) 113 | 114 | return mask * mix 115 | 116 | def predict(self, x_mag, aggressiveness=None): 117 | h = self.forward(x_mag, aggressiveness) 118 | 119 | if self.offset > 0: 120 | h = h[:, :, :, self.offset : -self.offset] 121 | assert h.size()[3] > 0 122 | 123 | return h 124 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_123821KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 32) 43 | self.stg1_high_band_net = BaseASPPNet(2, 32) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(16, 32) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(32, 64) 50 | 51 | self.out = nn.Conv2d(64, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 
| if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_123821KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 32) 43 | self.stg1_high_band_net = BaseASPPNet(2, 32) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(16, 32) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(32, 64) 50 | 51 | self.out = nn.Conv2d(64, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = 
self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_33966KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16, 32)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 16) 43 | self.stg1_high_band_net = BaseASPPNet(2, 16) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(8, 16) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(16, 32) 50 | 51 | self.out = nn.Conv2d(32, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, 
bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | from . import layers_537238KB as layers 7 | 8 | 9 | class BaseASPPNet(nn.Module): 10 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 11 | super(BaseASPPNet, self).__init__() 12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 16 | 17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 18 | 19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 23 | 24 | def __call__(self, x): 25 | h, e1 = self.enc1(x) 26 | h, e2 = self.enc2(h) 27 | h, e3 = self.enc3(h) 28 | h, e4 = self.enc4(h) 29 | 30 | h = self.aspp(h) 31 | 32 | h = self.dec4(h, e4) 33 | h = self.dec3(h, e3) 34 | h = self.dec2(h, e2) 35 | h = self.dec1(h, e1) 36 | 37 | return h 38 | 39 | 40 | class CascadedASPPNet(nn.Module): 41 | def __init__(self, n_fft): 42 | super(CascadedASPPNet, self).__init__() 43 | self.stg1_low_band_net = BaseASPPNet(2, 64) 44 | self.stg1_high_band_net = BaseASPPNet(2, 64) 45 | 46 | self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 47 | self.stg2_full_band_net = BaseASPPNet(32, 64) 48 | 49 | self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) 50 | self.stg3_full_band_net = BaseASPPNet(64, 128) 51 | 52 | self.out = nn.Conv2d(128, 2, 1, bias=False) 53 | self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) 54 | self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) 55 | 56 | self.max_bin = n_fft // 2 57 | self.output_bin = n_fft // 2 + 1 58 | 59 | self.offset = 128 60 | 61 | def forward(self, x, aggressiveness=None): 62 | mix = 
x.detach() 63 | x = x.clone() 64 | 65 | x = x[:, :, : self.max_bin] 66 | 67 | bandw = x.size()[2] // 2 68 | aux1 = torch.cat( 69 | [ 70 | self.stg1_low_band_net(x[:, :, :bandw]), 71 | self.stg1_high_band_net(x[:, :, bandw:]), 72 | ], 73 | dim=2, 74 | ) 75 | 76 | h = torch.cat([x, aux1], dim=1) 77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 78 | 79 | h = torch.cat([x, aux1, aux2], dim=1) 80 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 81 | 82 | mask = torch.sigmoid(self.out(h)) 83 | mask = F.pad( 84 | input=mask, 85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 86 | mode="replicate", 87 | ) 88 | 89 | if self.training: 90 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 91 | aux1 = F.pad( 92 | input=aux1, 93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 94 | mode="replicate", 95 | ) 96 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 97 | aux2 = F.pad( 98 | input=aux2, 99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 100 | mode="replicate", 101 | ) 102 | return mask * mix, aux1 * mix, aux2 * mix 103 | else: 104 | if aggressiveness: 105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 106 | mask[:, :, : aggressiveness["split_bin"]], 107 | 1 + aggressiveness["value"] / 3, 108 | ) 109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 110 | mask[:, :, aggressiveness["split_bin"] :], 111 | 1 + aggressiveness["value"], 112 | ) 113 | 114 | return mask * mix 115 | 116 | def predict(self, x_mag, aggressiveness=None): 117 | h = self.forward(x_mag, aggressiveness) 118 | 119 | if self.offset > 0: 120 | h = h[:, :, :, self.offset : -self.offset] 121 | assert h.size()[3] > 0 122 | 123 | return h 124 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | from . 
import layers_537238KB as layers 7 | 8 | 9 | class BaseASPPNet(nn.Module): 10 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 11 | super(BaseASPPNet, self).__init__() 12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 16 | 17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 18 | 19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 23 | 24 | def __call__(self, x): 25 | h, e1 = self.enc1(x) 26 | h, e2 = self.enc2(h) 27 | h, e3 = self.enc3(h) 28 | h, e4 = self.enc4(h) 29 | 30 | h = self.aspp(h) 31 | 32 | h = self.dec4(h, e4) 33 | h = self.dec3(h, e3) 34 | h = self.dec2(h, e2) 35 | h = self.dec1(h, e1) 36 | 37 | return h 38 | 39 | 40 | class CascadedASPPNet(nn.Module): 41 | def __init__(self, n_fft): 42 | super(CascadedASPPNet, self).__init__() 43 | self.stg1_low_band_net = BaseASPPNet(2, 64) 44 | self.stg1_high_band_net = BaseASPPNet(2, 64) 45 | 46 | self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 47 | self.stg2_full_band_net = BaseASPPNet(32, 64) 48 | 49 | self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) 50 | self.stg3_full_band_net = BaseASPPNet(64, 128) 51 | 52 | self.out = nn.Conv2d(128, 2, 1, bias=False) 53 | self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) 54 | self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) 55 | 56 | self.max_bin = n_fft // 2 57 | self.output_bin = n_fft // 2 + 1 58 | 59 | self.offset = 128 60 | 61 | def forward(self, x, aggressiveness=None): 62 | mix = x.detach() 63 | x = x.clone() 64 | 65 | x = x[:, :, : self.max_bin] 66 | 67 | bandw = x.size()[2] // 2 68 | aux1 = torch.cat( 69 | [ 70 | self.stg1_low_band_net(x[:, :, :bandw]), 71 | self.stg1_high_band_net(x[:, :, bandw:]), 72 | ], 73 | dim=2, 74 | ) 75 | 76 | h = torch.cat([x, aux1], dim=1) 77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 78 | 79 | h = torch.cat([x, aux1, aux2], dim=1) 80 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 81 | 82 | mask = torch.sigmoid(self.out(h)) 83 | mask = F.pad( 84 | input=mask, 85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 86 | mode="replicate", 87 | ) 88 | 89 | if self.training: 90 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 91 | aux1 = F.pad( 92 | input=aux1, 93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 94 | mode="replicate", 95 | ) 96 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 97 | aux2 = F.pad( 98 | input=aux2, 99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 100 | mode="replicate", 101 | ) 102 | return mask * mix, aux1 * mix, aux2 * mix 103 | else: 104 | if aggressiveness: 105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 106 | mask[:, :, : aggressiveness["split_bin"]], 107 | 1 + aggressiveness["value"] / 3, 108 | ) 109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 110 | mask[:, :, aggressiveness["split_bin"] :], 111 | 1 + aggressiveness["value"], 112 | ) 113 | 114 | return mask * mix 115 | 116 | def predict(self, x_mag, aggressiveness=None): 117 | h = self.forward(x_mag, aggressiveness) 118 | 119 | if self.offset > 0: 120 | h = h[:, :, :, self.offset : -self.offset] 121 | assert h.size()[3] > 0 122 | 123 | return h 124 | -------------------------------------------------------------------------------- 
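Editor's note: the `nets*.py` cascaded models dumped above all share the same inference-time post-processing driven by the optional `aggressiveness` dict (`{"split_bin": ..., "value": ...}`) inside `forward()`, followed by the time-axis `offset` crop in `predict()`. The sketch below is an illustrative, self-contained restatement of that masking step only; it is not part of the repository, and the tensor shape and the example `split_bin`/`value` numbers are assumptions chosen purely for demonstration.

```python
# Illustrative sketch only -- not a file from this repository.
# It mirrors the aggressiveness post-processing that CascadedASPPNet.forward()
# applies to the sigmoid mask at inference time (see the nets_*.py dumps above).
import torch


def apply_aggressiveness(mask: torch.Tensor, split_bin: int, value: float) -> torch.Tensor:
    """Sharpen a (batch, channels, freq_bins, frames) mask the way the cascaded
    nets do: a gentler exponent below split_bin, a stronger one above it."""
    mask = mask.clone()
    mask[:, :, :split_bin] = torch.pow(mask[:, :, :split_bin], 1 + value / 3)
    mask[:, :, split_bin:] = torch.pow(mask[:, :, split_bin:], 1 + value)
    return mask


if __name__ == "__main__":
    dummy_mask = torch.rand(1, 2, 1025, 512)   # assumed spectrogram-mask shape
    sharpened = apply_aggressiveness(dummy_mask, split_bin=256, value=0.3)
    print(sharpened.shape)                     # torch.Size([1, 2, 1025, 512])
```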
/rvc/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_123821KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 32) 43 | self.stg1_high_band_net = BaseASPPNet(2, 32) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(16, 32) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(32, 64) 50 | 51 | self.out = nn.Conv2d(64, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 
119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/nets_new.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_new 6 | 7 | 8 | class BaseNet(nn.Module): 9 | def __init__( 10 | self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6)) 11 | ): 12 | super(BaseNet, self).__init__() 13 | self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1) 14 | self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1) 15 | self.enc3 = layers_new.Encoder(nout * 2, nout * 4, 3, 2, 1) 16 | self.enc4 = layers_new.Encoder(nout * 4, nout * 6, 3, 2, 1) 17 | self.enc5 = layers_new.Encoder(nout * 6, nout * 8, 3, 2, 1) 18 | 19 | self.aspp = layers_new.ASPPModule(nout * 8, nout * 8, dilations, dropout=True) 20 | 21 | self.dec4 = layers_new.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1) 22 | self.dec3 = layers_new.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1) 23 | self.dec2 = layers_new.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1) 24 | self.lstm_dec2 = layers_new.LSTMModule(nout * 2, nin_lstm, nout_lstm) 25 | self.dec1 = layers_new.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1) 26 | 27 | def __call__(self, x): 28 | e1 = self.enc1(x) 29 | e2 = self.enc2(e1) 30 | e3 = self.enc3(e2) 31 | e4 = self.enc4(e3) 32 | e5 = self.enc5(e4) 33 | 34 | h = self.aspp(e5) 35 | 36 | h = self.dec4(h, e4) 37 | h = self.dec3(h, e3) 38 | h = self.dec2(h, e2) 39 | h = torch.cat([h, self.lstm_dec2(h)], dim=1) 40 | h = self.dec1(h, e1) 41 | 42 | return h 43 | 44 | 45 | class CascadedNet(nn.Module): 46 | def __init__(self, n_fft, nout=32, nout_lstm=128): 47 | super(CascadedNet, self).__init__() 48 | 49 | self.max_bin = n_fft // 2 50 | self.output_bin = n_fft // 2 + 1 51 | self.nin_lstm = self.max_bin // 2 52 | self.offset = 64 53 | 54 | self.stg1_low_band_net = nn.Sequential( 55 | BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm), 56 | layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0), 57 | ) 58 | 59 | self.stg1_high_band_net = BaseNet( 60 | 2, nout // 4, self.nin_lstm // 2, nout_lstm // 2 61 | ) 62 | 63 | self.stg2_low_band_net = nn.Sequential( 64 | BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm), 65 | layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0), 66 | ) 67 | self.stg2_high_band_net = BaseNet( 68 | nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2 69 | ) 70 | 71 | self.stg3_full_band_net = BaseNet( 72 | 3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm 73 | ) 74 | 75 | self.out = nn.Conv2d(nout, 2, 1, bias=False) 76 | self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False) 77 | 78 | def forward(self, x): 79 | x = x[:, :, : self.max_bin] 80 | 81 | bandw = x.size()[2] // 2 82 | l1_in = x[:, :, :bandw] 83 | h1_in = x[:, :, bandw:] 84 | l1 = self.stg1_low_band_net(l1_in) 85 | h1 = self.stg1_high_band_net(h1_in) 86 | aux1 = torch.cat([l1, h1], dim=2) 87 | 88 | l2_in = torch.cat([l1_in, l1], dim=1) 89 | h2_in = torch.cat([h1_in, h1], dim=1) 90 | l2 = self.stg2_low_band_net(l2_in) 91 | h2 = self.stg2_high_band_net(h2_in) 92 | aux2 = torch.cat([l2, h2], dim=2) 93 | 94 | f3_in = torch.cat([x, aux1, aux2], dim=1) 95 | f3 = self.stg3_full_band_net(f3_in) 96 | 97 | mask = torch.sigmoid(self.out(f3)) 98 | mask = F.pad( 99 | input=mask, 100 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 101 | 
mode="replicate", 102 | ) 103 | 104 | if self.training: 105 | aux = torch.cat([aux1, aux2], dim=1) 106 | aux = torch.sigmoid(self.aux_out(aux)) 107 | aux = F.pad( 108 | input=aux, 109 | pad=(0, 0, 0, self.output_bin - aux.size()[2]), 110 | mode="replicate", 111 | ) 112 | return mask, aux 113 | else: 114 | return mask 115 | 116 | def predict_mask(self, x): 117 | mask = self.forward(x) 118 | 119 | if self.offset > 0: 120 | mask = mask[:, :, :, self.offset : -self.offset] 121 | assert mask.size()[3] > 0 122 | 123 | return mask 124 | 125 | def predict(self, x, aggressiveness=None): 126 | mask = self.forward(x) 127 | pred_mag = x * mask 128 | 129 | if self.offset > 0: 130 | pred_mag = pred_mag[:, :, :, self.offset : -self.offset] 131 | assert pred_mag.size()[3] > 0 132 | 133 | return pred_mag 134 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import torch 5 | from tqdm import tqdm 6 | 7 | 8 | def load_data(file_name: str = "./infer/lib/uvr5_pack/name_params.json") -> dict: 9 | with open(file_name, "r") as f: 10 | data = json.load(f) 11 | 12 | return data 13 | 14 | 15 | def make_padding(width, cropsize, offset): 16 | left = offset 17 | roi_size = cropsize - left * 2 18 | if roi_size == 0: 19 | roi_size = cropsize 20 | right = roi_size - (width % roi_size) + left 21 | 22 | return left, right, roi_size 23 | 24 | 25 | def inference(X_spec, device, model, aggressiveness, data): 26 | """ 27 | data : dic configs 28 | """ 29 | 30 | def _execute( 31 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True 32 | ): 33 | model.eval() 34 | with torch.no_grad(): 35 | preds = [] 36 | 37 | iterations = [n_window] 38 | 39 | total_iterations = sum(iterations) 40 | for i in tqdm(range(n_window)): 41 | start = i * roi_size 42 | X_mag_window = X_mag_pad[ 43 | None, :, :, start : start + data["window_size"] 44 | ] 45 | X_mag_window = torch.from_numpy(X_mag_window) 46 | if is_half: 47 | X_mag_window = X_mag_window.half() 48 | X_mag_window = X_mag_window.to(device) 49 | 50 | pred = model.predict(X_mag_window, aggressiveness) 51 | 52 | pred = pred.detach().cpu().numpy() 53 | preds.append(pred[0]) 54 | 55 | pred = np.concatenate(preds, axis=2) 56 | return pred 57 | 58 | def preprocess(X_spec): 59 | X_mag = np.abs(X_spec) 60 | X_phase = np.angle(X_spec) 61 | 62 | return X_mag, X_phase 63 | 64 | X_mag, X_phase = preprocess(X_spec) 65 | 66 | coef = X_mag.max() 67 | X_mag_pre = X_mag / coef 68 | 69 | n_frame = X_mag_pre.shape[2] 70 | pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset) 71 | n_window = int(np.ceil(n_frame / roi_size)) 72 | 73 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 74 | 75 | if list(model.state_dict().values())[0].dtype == torch.float16: 76 | is_half = True 77 | else: 78 | is_half = False 79 | pred = _execute( 80 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half 81 | ) 82 | pred = pred[:, :, :n_frame] 83 | 84 | if data["tta"]: 85 | pad_l += roi_size // 2 86 | pad_r += roi_size // 2 87 | n_window += 1 88 | 89 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 90 | 91 | pred_tta = _execute( 92 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half 93 | ) 94 | pred_tta = pred_tta[:, :, roi_size // 2 :] 95 | pred_tta = pred_tta[:, :, :n_frame] 96 | 97 | 
return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase) 98 | else: 99 | return pred * coef, X_mag, np.exp(1.0j * X_phase) 100 | 101 | 102 | def _get_name_params(model_path, model_hash): 103 | data = load_data() 104 | flag = False 105 | ModelName = model_path 106 | for type in list(data): 107 | for model in list(data[type][0]): 108 | for i in range(len(data[type][0][model])): 109 | if str(data[type][0][model][i]["hash_name"]) == model_hash: 110 | flag = True 111 | elif str(data[type][0][model][i]["hash_name"]) in ModelName: 112 | flag = True 113 | 114 | if flag: 115 | model_params_auto = data[type][0][model][i]["model_params"] 116 | param_name_auto = data[type][0][model][i]["param_name"] 117 | if type == "equivalent": 118 | return param_name_auto, model_params_auto 119 | else: 120 | flag = False 121 | return param_name_auto, model_params_auto 122 | -------------------------------------------------------------------------------- /rvc/infer/modules/gui/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | TorchGating is a PyTorch-based implementation of Spectral Gating 3 | ================================================ 4 | Author: Asaf Zorea 5 | 6 | Contents 7 | -------- 8 | torchgate imports all the functions from PyTorch, and in addition provides: 9 | TorchGating --- A PyTorch module that applies a spectral gate to an input signal 10 | 11 | """ 12 | 13 | from .torchgate import TorchGate 14 | -------------------------------------------------------------------------------- /rvc/infer/modules/gui/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.types import Number 3 | 4 | 5 | @torch.no_grad() 6 | def amp_to_db( 7 | x: torch.Tensor, eps=torch.finfo(torch.float64).eps, top_db=40 8 | ) -> torch.Tensor: 9 | """ 10 | Convert the input tensor from amplitude to decibel scale. 11 | 12 | Arguments: 13 | x {[torch.Tensor]} -- [Input tensor.] 14 | 15 | Keyword Arguments: 16 | eps {[float]} -- [Small value to avoid numerical instability.] 17 | (default: {torch.finfo(torch.float64).eps}) 18 | top_db {[float]} -- [threshold the output at ``top_db`` below the peak] 19 | ` (default: {40}) 20 | 21 | Returns: 22 | [torch.Tensor] -- [Output tensor in decibel scale.] 23 | """ 24 | x_db = 20 * torch.log10(x.abs() + eps) 25 | return torch.max(x_db, (x_db.max(-1).values - top_db).unsqueeze(-1)) 26 | 27 | 28 | @torch.no_grad() 29 | def temperature_sigmoid(x: torch.Tensor, x0: float, temp_coeff: float) -> torch.Tensor: 30 | """ 31 | Apply a sigmoid function with temperature scaling. 32 | 33 | Arguments: 34 | x {[torch.Tensor]} -- [Input tensor.] 35 | x0 {[float]} -- [Parameter that controls the threshold of the sigmoid.] 36 | temp_coeff {[float]} -- [Parameter that controls the slope of the sigmoid.] 37 | 38 | Returns: 39 | [torch.Tensor] -- [Output tensor after applying the sigmoid with temperature scaling.] 40 | """ 41 | return torch.sigmoid((x - x0) / temp_coeff) 42 | 43 | 44 | @torch.no_grad() 45 | def linspace( 46 | start: Number, stop: Number, num: int = 50, endpoint: bool = True, **kwargs 47 | ) -> torch.Tensor: 48 | """ 49 | Generate a linearly spaced 1-D tensor. 50 | 51 | Arguments: 52 | start {[Number]} -- [The starting value of the sequence.] 53 | stop {[Number]} -- [The end value of the sequence, unless `endpoint` is set to False. 54 | In that case, the sequence consists of all but the last of ``num + 1`` 55 | evenly spaced samples, so that `stop` is excluded. 
Note that the step 56 | size changes when `endpoint` is False.] 57 | 58 | Keyword Arguments: 59 | num {[int]} -- [Number of samples to generate. Default is 50. Must be non-negative.] 60 | endpoint {[bool]} -- [If True, `stop` is the last sample. Otherwise, it is not included. 61 | Default is True.] 62 | **kwargs -- [Additional arguments to be passed to the underlying PyTorch `linspace` function.] 63 | 64 | Returns: 65 | [torch.Tensor] -- [1-D tensor of `num` equally spaced samples from `start` to `stop`.] 66 | """ 67 | if endpoint: 68 | return torch.linspace(start, stop, num, **kwargs) 69 | else: 70 | return torch.linspace(start, stop, num + 1, **kwargs)[:-1] 71 | -------------------------------------------------------------------------------- /rvc/infer/modules/onnx/export.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM 4 | 5 | 6 | def export_onnx(ModelPath, ExportedPath): 7 | cpt = torch.load(ModelPath, map_location="cpu") 8 | cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] 9 | vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768 10 | 11 | test_phone = torch.rand(1, 200, vec_channels) # hidden unit 12 | test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用) 13 | test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹) 14 | test_pitchf = torch.rand(1, 200) # nsf基频 15 | test_ds = torch.LongTensor([0]) # 说话人ID 16 | test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子) 17 | 18 | device = "cpu" # 导出时设备(不影响使用模型) 19 | 20 | net_g = SynthesizerTrnMsNSFsidM( 21 | *cpt["config"], is_half=False, encoder_dim=vec_channels 22 | ) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16) 23 | net_g.load_state_dict(cpt["weight"], strict=False) 24 | input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"] 25 | output_names = [ 26 | "audio", 27 | ] 28 | # net_g.construct_spkmixmap() #多角色混合轨道导出 29 | torch.onnx.export( 30 | net_g, 31 | ( 32 | test_phone.to(device), 33 | test_phone_lengths.to(device), 34 | test_pitch.to(device), 35 | test_pitchf.to(device), 36 | test_ds.to(device), 37 | test_rnd.to(device), 38 | ), 39 | ExportedPath, 40 | dynamic_axes={ 41 | "phone": [1], 42 | "pitch": [1], 43 | "pitchf": [1], 44 | "rnd": [2], 45 | }, 46 | do_constant_folding=False, 47 | opset_version=17, 48 | verbose=False, 49 | input_names=input_names, 50 | output_names=output_names, 51 | ) 52 | return "Finished" 53 | -------------------------------------------------------------------------------- /rvc/infer/modules/train/extract/extract_f0_print.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | 5 | import parselmouth 6 | 7 | now_dir = os.getcwd() 8 | sys.path.append(now_dir) 9 | import logging 10 | 11 | import numpy as np 12 | import pyworld 13 | 14 | from infer.lib.audio import load_audio 15 | 16 | logging.getLogger("numba").setLevel(logging.WARNING) 17 | from multiprocessing import Process 18 | 19 | exp_dir = sys.argv[1] 20 | f = open("%s/extract_f0_feature.log" % exp_dir, "a+") 21 | 22 | 23 | def printt(strr): 24 | print(strr) 25 | f.write("%s\n" % strr) 26 | f.flush() 27 | 28 | 29 | n_p = int(sys.argv[2]) 30 | f0method = sys.argv[3] 31 | 32 | 33 | class FeatureInput(object): 34 | def __init__(self, samplerate=16000, hop_size=160): 35 | self.fs = samplerate 36 | self.hop = hop_size 37 | 38 | self.f0_bin = 256 39 | self.f0_max = 1100.0 40 | self.f0_min = 50.0 
41 | self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) 42 | self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) 43 | 44 | def compute_f0(self, path, f0_method): 45 | x = load_audio(path, self.fs) 46 | p_len = x.shape[0] // self.hop 47 | if f0_method == "pm": 48 | time_step = 160 / 16000 * 1000 49 | f0_min = 50 50 | f0_max = 1100 51 | f0 = ( 52 | parselmouth.Sound(x, self.fs) 53 | .to_pitch_ac( 54 | time_step=time_step / 1000, 55 | voicing_threshold=0.6, 56 | pitch_floor=f0_min, 57 | pitch_ceiling=f0_max, 58 | ) 59 | .selected_array["frequency"] 60 | ) 61 | pad_size = (p_len - len(f0) + 1) // 2 62 | if pad_size > 0 or p_len - len(f0) - pad_size > 0: 63 | f0 = np.pad( 64 | f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" 65 | ) 66 | elif f0_method == "harvest": 67 | f0, t = pyworld.harvest( 68 | x.astype(np.double), 69 | fs=self.fs, 70 | f0_ceil=self.f0_max, 71 | f0_floor=self.f0_min, 72 | frame_period=1000 * self.hop / self.fs, 73 | ) 74 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs) 75 | elif f0_method == "dio": 76 | f0, t = pyworld.dio( 77 | x.astype(np.double), 78 | fs=self.fs, 79 | f0_ceil=self.f0_max, 80 | f0_floor=self.f0_min, 81 | frame_period=1000 * self.hop / self.fs, 82 | ) 83 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs) 84 | elif f0_method == "rmvpe": 85 | if hasattr(self, "model_rmvpe") == False: 86 | from infer.lib.rmvpe import RMVPE 87 | 88 | print("Loading rmvpe model") 89 | self.model_rmvpe = RMVPE( 90 | "assets/rmvpe/rmvpe.pt", is_half=False, device="cpu" 91 | ) 92 | f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) 93 | return f0 94 | 95 | def coarse_f0(self, f0): 96 | f0_mel = 1127 * np.log(1 + f0 / 700) 97 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * ( 98 | self.f0_bin - 2 99 | ) / (self.f0_mel_max - self.f0_mel_min) + 1 100 | 101 | # use 0 or 1 102 | f0_mel[f0_mel <= 1] = 1 103 | f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1 104 | f0_coarse = np.rint(f0_mel).astype(int) 105 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( 106 | f0_coarse.max(), 107 | f0_coarse.min(), 108 | ) 109 | return f0_coarse 110 | 111 | def go(self, paths, f0_method): 112 | if len(paths) == 0: 113 | printt("no-f0-todo") 114 | else: 115 | printt("todo-f0-%s" % len(paths)) 116 | n = max(len(paths) // 5, 1) # 每个进程最多打印5条 117 | for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths): 118 | try: 119 | if idx % n == 0: 120 | printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path)) 121 | if ( 122 | os.path.exists(opt_path1 + ".npy") == True 123 | and os.path.exists(opt_path2 + ".npy") == True 124 | ): 125 | continue 126 | featur_pit = self.compute_f0(inp_path, f0_method) 127 | np.save( 128 | opt_path2, 129 | featur_pit, 130 | allow_pickle=False, 131 | ) # nsf 132 | coarse_pit = self.coarse_f0(featur_pit) 133 | np.save( 134 | opt_path1, 135 | coarse_pit, 136 | allow_pickle=False, 137 | ) # ori 138 | except: 139 | printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc())) 140 | 141 | 142 | if __name__ == "__main__": 143 | # exp_dir=r"E:\codes\py39\dataset\mi-test" 144 | # n_p=16 145 | # f = open("%s/log_extract_f0.log"%exp_dir, "w") 146 | printt(" ".join(sys.argv)) 147 | featureInput = FeatureInput() 148 | paths = [] 149 | inp_root = "%s/1_16k_wavs" % (exp_dir) 150 | opt_root1 = "%s/2a_f0" % (exp_dir) 151 | opt_root2 = "%s/2b-f0nsf" % (exp_dir) 152 | 153 | os.makedirs(opt_root1, exist_ok=True) 154 | os.makedirs(opt_root2, exist_ok=True) 155 | for name in sorted(list(os.listdir(inp_root))): 156 | 
inp_path = "%s/%s" % (inp_root, name) 157 | if "spec" in inp_path: 158 | continue 159 | opt_path1 = "%s/%s" % (opt_root1, name) 160 | opt_path2 = "%s/%s" % (opt_root2, name) 161 | paths.append([inp_path, opt_path1, opt_path2]) 162 | 163 | ps = [] 164 | for i in range(n_p): 165 | p = Process( 166 | target=featureInput.go, 167 | args=( 168 | paths[i::n_p], 169 | f0method, 170 | ), 171 | ) 172 | ps.append(p) 173 | p.start() 174 | for i in range(n_p): 175 | ps[i].join() 176 | -------------------------------------------------------------------------------- /rvc/infer/modules/train/extract/extract_f0_rmvpe.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | 5 | import parselmouth 6 | 7 | now_dir = os.getcwd() 8 | sys.path.append(now_dir) 9 | import logging 10 | 11 | import numpy as np 12 | import pyworld 13 | 14 | from infer.lib.audio import load_audio 15 | 16 | logging.getLogger("numba").setLevel(logging.WARNING) 17 | 18 | n_part = int(sys.argv[1]) 19 | i_part = int(sys.argv[2]) 20 | i_gpu = sys.argv[3] 21 | os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu) 22 | exp_dir = sys.argv[4] 23 | is_half = sys.argv[5].lower() == "true" 24 | f = open("%s/extract_f0_feature.log" % exp_dir, "a+") 25 | 26 | 27 | def printt(strr): 28 | print(strr) 29 | f.write("%s\n" % strr) 30 | f.flush() 31 | 32 | 33 | class FeatureInput(object): 34 | def __init__(self, samplerate=16000, hop_size=160): 35 | self.fs = samplerate 36 | self.hop = hop_size 37 | 38 | self.f0_bin = 256 39 | self.f0_max = 1100.0 40 | self.f0_min = 50.0 41 | self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) 42 | self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) 43 | 44 | def compute_f0(self, path, f0_method): 45 | x = load_audio(path, self.fs) 46 | # p_len = x.shape[0] // self.hop 47 | if f0_method == "rmvpe": 48 | if not hasattr(self, "model_rmvpe"): 49 | from infer.lib.rmvpe import RMVPE 50 | 51 | print("Loading rmvpe model") 52 | self.model_rmvpe = RMVPE( 53 | "assets/rmvpe/rmvpe.pt", is_half=is_half, device="cuda" 54 | ) 55 | f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) 56 | return f0 57 | 58 | def coarse_f0(self, f0): 59 | f0_mel = 1127 * np.log(1 + f0 / 700) 60 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * ( 61 | self.f0_bin - 2 62 | ) / (self.f0_mel_max - self.f0_mel_min) + 1 63 | 64 | # use 0 or 1 65 | f0_mel[f0_mel <= 1] = 1 66 | f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1 67 | f0_coarse = np.rint(f0_mel).astype(int) 68 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( 69 | f0_coarse.max(), 70 | f0_coarse.min(), 71 | ) 72 | return f0_coarse 73 | 74 | def go(self, paths, f0_method): 75 | if len(paths) == 0: 76 | printt("no-f0-todo") 77 | else: 78 | printt("todo-f0-%s" % len(paths)) 79 | n = max(len(paths) // 5, 1) # print at most 5 progress logs per process 80 | for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths): 81 | try: 82 | if idx % n == 0: 83 | printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path)) 84 | if ( 85 | os.path.exists(opt_path1 + ".npy") 86 | and os.path.exists(opt_path2 + ".npy") 87 | ): 88 | continue 89 | featur_pit = self.compute_f0(inp_path, f0_method) 90 | np.save( 91 | opt_path2, 92 | featur_pit, 93 | allow_pickle=False, 94 | ) # nsf 95 | coarse_pit = self.coarse_f0(featur_pit) 96 | np.save( 97 | opt_path1, 98 | coarse_pit, 99 | allow_pickle=False, 100 | ) # ori 101 | except Exception: 102 | printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc())) 103 | 104 | 105 | if __name__ ==
"__main__": 106 | # exp_dir=r"E:\codes\py39\dataset\mi-test" 107 | # n_p=16 108 | # f = open("%s/log_extract_f0.log"%exp_dir, "w") 109 | printt(" ".join(sys.argv)) 110 | featureInput = FeatureInput() 111 | paths = [] 112 | inp_root = "%s/1_16k_wavs" % (exp_dir) 113 | opt_root1 = "%s/2a_f0" % (exp_dir) 114 | opt_root2 = "%s/2b-f0nsf" % (exp_dir) 115 | 116 | os.makedirs(opt_root1, exist_ok=True) 117 | os.makedirs(opt_root2, exist_ok=True) 118 | for name in sorted(list(os.listdir(inp_root))): 119 | inp_path = "%s/%s" % (inp_root, name) 120 | if "spec" in inp_path: 121 | continue 122 | opt_path1 = "%s/%s" % (opt_root1, name) 123 | opt_path2 = "%s/%s" % (opt_root2, name) 124 | paths.append([inp_path, opt_path1, opt_path2]) 125 | try: 126 | featureInput.go(paths[i_part::n_part], "rmvpe") 127 | except Exception: 128 | printt("f0_all_fail-%s" % (traceback.format_exc())) 129 | # ps = [] 130 | # for i in range(n_p): 131 | # p = Process( 132 | # target=featureInput.go, 133 | # args=( 134 | # paths[i::n_p], 135 | # f0method, 136 | # ), 137 | # ) 138 | # ps.append(p) 139 | # p.start() 140 | # for i in range(n_p): 141 | # ps[i].join() 142 | -------------------------------------------------------------------------------- /rvc/infer/modules/train/extract/extract_f0_rmvpe_dml.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | 5 | import parselmouth 6 | 7 | now_dir = os.getcwd() 8 | sys.path.append(now_dir) 9 | import logging 10 | 11 | import numpy as np 12 | import pyworld 13 | 14 | from infer.lib.audio import load_audio 15 | 16 | logging.getLogger("numba").setLevel(logging.WARNING) 17 | 18 | exp_dir = sys.argv[1] 19 | import torch_directml 20 | 21 | device = torch_directml.device(torch_directml.default_device()) 22 | f = open("%s/extract_f0_feature.log" % exp_dir, "a+") 23 | 24 | 25 | def printt(strr): 26 | print(strr) 27 | f.write("%s\n" % strr) 28 | f.flush() 29 | 30 | 31 | class FeatureInput(object): 32 | def __init__(self, samplerate=16000, hop_size=160): 33 | self.fs = samplerate 34 | self.hop = hop_size 35 | 36 | self.f0_bin = 256 37 | self.f0_max = 1100.0 38 | self.f0_min = 50.0 39 | self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) 40 | self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) 41 | 42 | def compute_f0(self, path, f0_method): 43 | x = load_audio(path, self.fs) 44 | # p_len = x.shape[0] // self.hop 45 | if f0_method == "rmvpe": 46 | if not hasattr(self, "model_rmvpe"): 47 | from infer.lib.rmvpe import RMVPE 48 | 49 | print("Loading rmvpe model") 50 | self.model_rmvpe = RMVPE( 51 | "assets/rmvpe/rmvpe.pt", is_half=False, device=device 52 | ) 53 | f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) 54 | return f0 55 | 56 | def coarse_f0(self, f0): 57 | f0_mel = 1127 * np.log(1 + f0 / 700) 58 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * ( 59 | self.f0_bin - 2 60 | ) / (self.f0_mel_max - self.f0_mel_min) + 1 61 | 62 | # use 0 or 1 63 | f0_mel[f0_mel <= 1] = 1 64 | f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1 65 | f0_coarse = np.rint(f0_mel).astype(int) 66 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( 67 | f0_coarse.max(), 68 | f0_coarse.min(), 69 | ) 70 | return f0_coarse 71 | 72 | def go(self, paths, f0_method): 73 | if len(paths) == 0: 74 | printt("no-f0-todo") 75 | else: 76 | printt("todo-f0-%s" % len(paths)) 77 | n = max(len(paths) // 5, 1) # print at most 5 progress logs per process 78 | for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths): 79 | try: 80 | if idx % n == 0: 81 | 
printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path)) 82 | if ( 83 | os.path.exists(opt_path1 + ".npy") 84 | and os.path.exists(opt_path2 + ".npy") 85 | ): 86 | continue 87 | featur_pit = self.compute_f0(inp_path, f0_method) 88 | np.save( 89 | opt_path2, 90 | featur_pit, 91 | allow_pickle=False, 92 | ) # nsf 93 | coarse_pit = self.coarse_f0(featur_pit) 94 | np.save( 95 | opt_path1, 96 | coarse_pit, 97 | allow_pickle=False, 98 | ) # ori 99 | except Exception: 100 | printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc())) 101 | 102 | 103 | if __name__ == "__main__": 104 | # exp_dir=r"E:\codes\py39\dataset\mi-test" 105 | # n_p=16 106 | # f = open("%s/log_extract_f0.log"%exp_dir, "w") 107 | printt(" ".join(sys.argv)) 108 | featureInput = FeatureInput() 109 | paths = [] 110 | inp_root = "%s/1_16k_wavs" % (exp_dir) 111 | opt_root1 = "%s/2a_f0" % (exp_dir) 112 | opt_root2 = "%s/2b-f0nsf" % (exp_dir) 113 | 114 | os.makedirs(opt_root1, exist_ok=True) 115 | os.makedirs(opt_root2, exist_ok=True) 116 | for name in sorted(list(os.listdir(inp_root))): 117 | inp_path = "%s/%s" % (inp_root, name) 118 | if "spec" in inp_path: 119 | continue 120 | opt_path1 = "%s/%s" % (opt_root1, name) 121 | opt_path2 = "%s/%s" % (opt_root2, name) 122 | paths.append([inp_path, opt_path1, opt_path2]) 123 | try: 124 | featureInput.go(paths, "rmvpe") 125 | except Exception: 126 | printt("f0_all_fail-%s" % (traceback.format_exc())) 127 | # ps = [] 128 | # for i in range(n_p): 129 | # p = Process( 130 | # target=featureInput.go, 131 | # args=( 132 | # paths[i::n_p], 133 | # f0method, 134 | # ), 135 | # ) 136 | # ps.append(p) 137 | # p.start() 138 | # for i in range(n_p): 139 | # ps[i].join() 140 | -------------------------------------------------------------------------------- /rvc/infer/modules/train/extract_feature_print.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | 5 | os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" 6 | os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0" 7 | 8 | device = sys.argv[1] 9 | n_part = int(sys.argv[2]) 10 | i_part = int(sys.argv[3]) 11 | if len(sys.argv) == 7: 12 | exp_dir = sys.argv[4] 13 | version = sys.argv[5] 14 | is_half = sys.argv[6].lower() == "true" 15 | else: 16 | i_gpu = sys.argv[4] 17 | exp_dir = sys.argv[5] 18 | os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu) 19 | version = sys.argv[6] 20 | is_half = sys.argv[7].lower() == "true" 21 | import fairseq 22 | import numpy as np 23 | import soundfile as sf 24 | import torch 25 | import torch.nn.functional as F 26 | 27 | if "privateuseone" not in device: 28 | device = "cpu" 29 | if torch.cuda.is_available(): 30 | device = "cuda" 31 | elif torch.backends.mps.is_available(): 32 | device = "mps" 33 | else: 34 | import torch_directml 35 | 36 | device = torch_directml.device(torch_directml.default_device()) 37 | 38 | def forward_dml(ctx, x, scale): 39 | ctx.scale = scale 40 | res = x.clone().detach() 41 | return res 42 | 43 | fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml 44 | 45 | f = open("%s/extract_f0_feature.log" % exp_dir, "a+") 46 | 47 | 48 | def printt(strr): 49 | print(strr) 50 | f.write("%s\n" % strr) 51 | f.flush() 52 | 53 | 54 | printt(" ".join(sys.argv)) 55 | model_path = "assets/hubert/hubert_base.pt" 56 | 57 | printt("exp_dir: " + exp_dir) 58 | wavPath = "%s/1_16k_wavs" % exp_dir 59 | outPath = ( 60 | "%s/3_feature256" % exp_dir if version == "v1" else "%s/3_feature768" % exp_dir 61 | ) 62 | 
os.makedirs(outPath, exist_ok=True) 63 | 64 | 65 | # wave must be 16k, hop_size=320 66 | def readwave(wav_path, normalize=False): 67 | wav, sr = sf.read(wav_path) 68 | assert sr == 16000 69 | feats = torch.from_numpy(wav).float() 70 | if feats.dim() == 2: # double channels 71 | feats = feats.mean(-1) 72 | assert feats.dim() == 1, feats.dim() 73 | if normalize: 74 | with torch.no_grad(): 75 | feats = F.layer_norm(feats, feats.shape) 76 | feats = feats.view(1, -1) 77 | return feats 78 | 79 | 80 | # HuBERT model 81 | printt("load model(s) from {}".format(model_path)) 82 | # check that the HuBERT model exists 83 | if not os.access(model_path, os.F_OK): 84 | printt( 85 | "Error: extraction aborted because %s does not exist; you may download it from https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main" 86 | % model_path 87 | ) 88 | exit(0) 89 | models, saved_cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task( 90 | [model_path], 91 | suffix="", 92 | ) 93 | model = models[0] 94 | model = model.to(device) 95 | printt("move model to %s" % device) 96 | if is_half: 97 | if device not in ["mps", "cpu"]: 98 | model = model.half() 99 | model.eval() 100 | 101 | todo = sorted(list(os.listdir(wavPath)))[i_part::n_part] 102 | n = max(1, len(todo) // 10) # print at most ten progress logs 103 | if len(todo) == 0: 104 | printt("no-feature-todo") 105 | else: 106 | printt("all-feature-%s" % len(todo)) 107 | for idx, file in enumerate(todo): 108 | try: 109 | if file.endswith(".wav"): 110 | wav_path = "%s/%s" % (wavPath, file) 111 | out_path = "%s/%s" % (outPath, file.replace("wav", "npy")) 112 | 113 | if os.path.exists(out_path): 114 | continue 115 | 116 | feats = readwave(wav_path, normalize=saved_cfg.task.normalize) 117 | padding_mask = torch.BoolTensor(feats.shape).fill_(False) 118 | inputs = { 119 | "source": ( 120 | feats.half().to(device) 121 | if is_half and device not in ["mps", "cpu"] 122 | else feats.to(device) 123 | ), 124 | "padding_mask": padding_mask.to(device), 125 | "output_layer": 9 if version == "v1" else 12, # layer 9 for v1, layer 12 for v2 126 | } 127 | with torch.no_grad(): 128 | logits = model.extract_features(**inputs) 129 | feats = ( 130 | model.final_proj(logits[0]) if version == "v1" else logits[0] 131 | ) 132 | 133 | feats = feats.squeeze(0).float().cpu().numpy() 134 | if np.isnan(feats).sum() == 0: 135 | np.save(out_path, feats, allow_pickle=False) 136 | else: 137 | printt("%s-contains nan" % file) 138 | if idx % n == 0: 139 | printt("now-%s,all-%s,%s,%s" % (idx, len(todo), file, feats.shape)) 140 | except Exception: 141 | printt(traceback.format_exc()) 142 | printt("all-feature-done") 143 | -------------------------------------------------------------------------------- /rvc/infer/modules/train/preprocess.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import sys 4 | 5 | from scipy import signal 6 | 7 | now_dir = os.getcwd() 8 | sys.path.append(now_dir) 9 | print(*sys.argv[1:]) 10 | inp_root = sys.argv[1] 11 | sr = int(sys.argv[2]) 12 | n_p = int(sys.argv[3]) 13 | exp_dir = sys.argv[4] 14 | noparallel = sys.argv[5] == "True" 15 | per = float(sys.argv[6]) 16 | import os 17 | import traceback 18 | 19 | import librosa 20 | import numpy as np 21 | from scipy.io import wavfile 22 | 23 | from infer.lib.audio import load_audio 24 | from infer.lib.slicer2 import Slicer 25 | 26 | f = open("%s/preprocess.log" % exp_dir, "a+") 27 | 28 | 29 | def println(strr): 30 | print(strr) 31 | f.write("%s\n" % strr) 32 | f.flush() 33 | 34 | 35 | class
PreProcess: 36 | def __init__(self, sr, exp_dir, per=3.7): 37 | self.slicer = Slicer( 38 | sr=sr, 39 | threshold=-42, 40 | min_length=1500, 41 | min_interval=400, 42 | hop_size=15, 43 | max_sil_kept=500, 44 | ) 45 | self.sr = sr 46 | self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr) 47 | self.per = per 48 | self.overlap = 0.3 49 | self.tail = self.per + self.overlap 50 | self.max = 0.9 51 | self.alpha = 0.75 52 | self.exp_dir = exp_dir 53 | self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir 54 | self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir 55 | os.makedirs(self.exp_dir, exist_ok=True) 56 | os.makedirs(self.gt_wavs_dir, exist_ok=True) 57 | os.makedirs(self.wavs16k_dir, exist_ok=True) 58 | 59 | def norm_write(self, tmp_audio, idx0, idx1): 60 | tmp_max = np.abs(tmp_audio).max() 61 | if tmp_max > 2.5: 62 | print("%s-%s-%s-filtered" % (idx0, idx1, tmp_max)) 63 | return 64 | tmp_audio = (tmp_audio / tmp_max * (self.max * self.alpha)) + ( 65 | 1 - self.alpha 66 | ) * tmp_audio 67 | wavfile.write( 68 | "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), 69 | self.sr, 70 | tmp_audio.astype(np.float32), 71 | ) 72 | tmp_audio = librosa.resample( 73 | tmp_audio, orig_sr=self.sr, target_sr=16000 74 | ) # , res_type="soxr_vhq" 75 | wavfile.write( 76 | "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), 77 | 16000, 78 | tmp_audio.astype(np.float32), 79 | ) 80 | 81 | def pipeline(self, path, idx0): 82 | try: 83 | audio = load_audio(path, self.sr) 84 | # zero phased digital filter cause pre-ringing noise... 85 | # audio = signal.filtfilt(self.bh, self.ah, audio) 86 | audio = signal.lfilter(self.bh, self.ah, audio) 87 | 88 | idx1 = 0 89 | for audio in self.slicer.slice(audio): 90 | i = 0 91 | while 1: 92 | start = int(self.sr * (self.per - self.overlap) * i) 93 | i += 1 94 | if len(audio[start:]) > self.tail * self.sr: 95 | tmp_audio = audio[start : start + int(self.per * self.sr)] 96 | self.norm_write(tmp_audio, idx0, idx1) 97 | idx1 += 1 98 | else: 99 | tmp_audio = audio[start:] 100 | idx1 += 1 101 | break 102 | self.norm_write(tmp_audio, idx0, idx1) 103 | println("%s\t-> Success" % path) 104 | except: 105 | println("%s\t-> %s" % (path, traceback.format_exc())) 106 | 107 | def pipeline_mp(self, infos): 108 | for path, idx0 in infos: 109 | self.pipeline(path, idx0) 110 | 111 | def pipeline_mp_inp_dir(self, inp_root, n_p): 112 | try: 113 | infos = [ 114 | ("%s/%s" % (inp_root, name), idx) 115 | for idx, name in enumerate(sorted(list(os.listdir(inp_root)))) 116 | ] 117 | if noparallel: 118 | for i in range(n_p): 119 | self.pipeline_mp(infos[i::n_p]) 120 | else: 121 | ps = [] 122 | for i in range(n_p): 123 | p = multiprocessing.Process( 124 | target=self.pipeline_mp, args=(infos[i::n_p],) 125 | ) 126 | ps.append(p) 127 | p.start() 128 | for i in range(n_p): 129 | ps[i].join() 130 | except: 131 | println("Fail. 
%s" % traceback.format_exc()) 132 | 133 | 134 | def preprocess_trainset(inp_root, sr, n_p, exp_dir, per): 135 | pp = PreProcess(sr, exp_dir, per) 136 | println("start preprocess") 137 | pp.pipeline_mp_inp_dir(inp_root, n_p) 138 | println("end preprocess") 139 | 140 | 141 | if __name__ == "__main__": 142 | preprocess_trainset(inp_root, sr, n_p, exp_dir, per) 143 | -------------------------------------------------------------------------------- /rvc/infer/modules/uvr5/modules.py: -------------------------------------------------------------------------------- 1 | import os 2 | import traceback 3 | import logging 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | import ffmpeg 8 | import torch 9 | 10 | from configs.config import Config 11 | from infer.modules.uvr5.mdxnet import MDXNetDereverb 12 | from infer.modules.uvr5.vr import AudioPre, AudioPreDeEcho 13 | 14 | config = Config() 15 | 16 | 17 | def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0): 18 | infos = [] 19 | try: 20 | inp_root = inp_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 21 | save_root_vocal = ( 22 | save_root_vocal.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 23 | ) 24 | save_root_ins = ( 25 | save_root_ins.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 26 | ) 27 | if model_name == "onnx_dereverb_By_FoxJoy": 28 | pre_fun = MDXNetDereverb(15, config.device) 29 | else: 30 | func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho 31 | pre_fun = func( 32 | agg=int(agg), 33 | model_path=os.path.join( 34 | os.getenv("weight_uvr5_root"), model_name + ".pth" 35 | ), 36 | device=config.device, 37 | is_half=config.is_half, 38 | ) 39 | is_hp3 = "HP3" in model_name 40 | if inp_root != "": 41 | paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)] 42 | else: 43 | paths = [path.name for path in paths] 44 | for path in paths: 45 | inp_path = os.path.join(inp_root, path) 46 | need_reformat = 1 47 | done = 0 48 | try: 49 | info = ffmpeg.probe(inp_path, cmd="ffprobe") 50 | if ( 51 | info["streams"][0]["channels"] == 2 52 | and info["streams"][0]["sample_rate"] == "44100" 53 | ): 54 | need_reformat = 0 55 | pre_fun._path_audio_( 56 | inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3 57 | ) 58 | done = 1 59 | except: 60 | need_reformat = 1 61 | traceback.print_exc() 62 | if need_reformat == 1: 63 | tmp_path = "%s/%s.reformatted.wav" % ( 64 | os.path.join(os.environ["TEMP"]), 65 | os.path.basename(inp_path), 66 | ) 67 | os.system( 68 | "ffmpeg -i %s -vn -acodec pcm_s16le -ac 2 -ar 44100 %s -y" 69 | % (inp_path, tmp_path) 70 | ) 71 | inp_path = tmp_path 72 | try: 73 | if done == 0: 74 | pre_fun._path_audio_( 75 | inp_path, save_root_ins, save_root_vocal, format0 76 | ) 77 | infos.append("%s->Success" % (os.path.basename(inp_path))) 78 | yield "\n".join(infos) 79 | except: 80 | try: 81 | if done == 0: 82 | pre_fun._path_audio_( 83 | inp_path, save_root_ins, save_root_vocal, format0 84 | ) 85 | infos.append("%s->Success" % (os.path.basename(inp_path))) 86 | yield "\n".join(infos) 87 | except: 88 | infos.append( 89 | "%s->%s" % (os.path.basename(inp_path), traceback.format_exc()) 90 | ) 91 | yield "\n".join(infos) 92 | except: 93 | infos.append(traceback.format_exc()) 94 | yield "\n".join(infos) 95 | finally: 96 | try: 97 | if model_name == "onnx_dereverb_By_FoxJoy": 98 | del pre_fun.pred.model 99 | del pre_fun.pred.model_ 100 | else: 101 | del pre_fun.model 102 | del pre_fun 103 | except: 104 | traceback.print_exc() 105 | if 
torch.cuda.is_available(): 106 | torch.cuda.empty_cache() 107 | logger.info("Executed torch.cuda.empty_cache()") 108 | yield "\n".join(infos) 109 | -------------------------------------------------------------------------------- /rvc/infer/modules/vc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/modules/vc/__init__.py -------------------------------------------------------------------------------- /rvc/infer/modules/vc/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/modules/vc/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/modules/vc/__pycache__/modules.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/modules/vc/__pycache__/modules.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/modules/vc/__pycache__/pipeline.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/modules/vc/__pycache__/pipeline.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/modules/vc/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/modules/vc/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/modules/vc/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from fairseq import checkpoint_utils 4 | 5 | 6 | def get_index_path_from_model(sid): 7 | return next( 8 | ( 9 | f 10 | for f in [ 11 | os.path.join(root, name) 12 | for root, _, files in os.walk(os.getenv("index_root"), topdown=False) 13 | for name in files 14 | if name.endswith(".index") and "trained" not in name 15 | ] 16 | if sid.split(".")[0] in f 17 | ), 18 | "", 19 | ) 20 | 21 | 22 | def load_hubert(config): 23 | models, _, _ = checkpoint_utils.load_model_ensemble_and_task( 24 | [os.getenv('hubert_base')], 25 | suffix="", 26 | ) 27 | hubert_model = models[0] 28 | hubert_model = hubert_model.to(config.device) 29 | if config.is_half: 30 | hubert_model = hubert_model.half() 31 | else: 32 | hubert_model = hubert_model.float() 33 | return hubert_model.eval() 34 | -------------------------------------------------------------------------------- /rvc/logs/mute/0_gt_wavs/mute32k.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/0_gt_wavs/mute32k.wav -------------------------------------------------------------------------------- /rvc/logs/mute/0_gt_wavs/mute40k.spec.pt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/0_gt_wavs/mute40k.spec.pt -------------------------------------------------------------------------------- /rvc/logs/mute/0_gt_wavs/mute40k.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/0_gt_wavs/mute40k.wav -------------------------------------------------------------------------------- /rvc/logs/mute/0_gt_wavs/mute48k.spec.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/0_gt_wavs/mute48k.spec.pt -------------------------------------------------------------------------------- /rvc/logs/mute/0_gt_wavs/mute48k.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/0_gt_wavs/mute48k.wav -------------------------------------------------------------------------------- /rvc/logs/mute/1_16k_wavs/mute.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/1_16k_wavs/mute.wav -------------------------------------------------------------------------------- /rvc/logs/mute/2a_f0/mute.wav.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/2a_f0/mute.wav.npy -------------------------------------------------------------------------------- /rvc/logs/mute/2b-f0nsf/mute.wav.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/2b-f0nsf/mute.wav.npy -------------------------------------------------------------------------------- /rvc/logs/mute/3_feature256/mute.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/3_feature256/mute.npy -------------------------------------------------------------------------------- /rvc/logs/mute/3_feature768/mute.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/3_feature768/mute.npy -------------------------------------------------------------------------------- /web.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/web.png -------------------------------------------------------------------------------- /web/js/alertMSG.js: -------------------------------------------------------------------------------- 1 | import { app } from "../../../scripts/app.js"; 2 | import { api } from "../../../scripts/api.js"; 3 | app.registerExtension({ 4 | name: "RVC.alertMSG", 5 | async beforeRegisterNodeDef(nodeType, nodeData, app) { 6 | if (nodeData?.name == "RVC_Train") { 7 | nodeType.prototype.onExecuted = function (data) { 8 | // alert("Success!you can find weights in:\n" + data.finetune[0] +
"\n" + data.finetune[1] + "\n Now you can tts or inference"); 9 | let msg = "Success! you can find weights in:\n" + data.train[0] + "\n you'd like to reboot the server to inference?" 10 | if (confirm(msg)) { 11 | try { 12 | api.fetchApi("/rvc/reboot"); 13 | } 14 | catch(exception) { 15 | console.log(exception); 16 | } 17 | } 18 | } 19 | } 20 | }, 21 | }); -------------------------------------------------------------------------------- /web/js/previewAudio.js: -------------------------------------------------------------------------------- 1 | import { app } from "../../../scripts/app.js"; 2 | import { api } from '../../../scripts/api.js' 3 | 4 | function fitHeight(node) { 5 | node.setSize([node.size[0], node.computeSize([node.size[0], node.size[1]])[1]]) 6 | node?.graph?.setDirtyCanvas(true); 7 | } 8 | function chainCallback(object, property, callback) { 9 | if (object == undefined) { 10 | //This should not happen. 11 | console.error("Tried to add callback to non-existant object") 12 | return; 13 | } 14 | if (property in object) { 15 | const callback_orig = object[property] 16 | object[property] = function () { 17 | const r = callback_orig.apply(this, arguments); 18 | callback.apply(this, arguments); 19 | return r 20 | }; 21 | } else { 22 | object[property] = callback; 23 | } 24 | } 25 | 26 | function addPreviewOptions(nodeType) { 27 | chainCallback(nodeType.prototype, "getExtraMenuOptions", function(_, options) { 28 | // The intended way of appending options is returning a list of extra options, 29 | // but this isn't used in widgetInputs.js and would require 30 | // less generalization of chainCallback 31 | let optNew = [] 32 | try { 33 | const previewWidget = this.widgets.find((w) => w.name === "audiopreview"); 34 | 35 | let url = null 36 | if (previewWidget.audioEl?.hidden == false && previewWidget.audioEl.src) { 37 | //Use full quality audio 38 | //url = api.apiURL('/view?' 
+ new URLSearchParams(previewWidget.value.params)); 39 | url = previewWidget.audioEl.src 40 | } 41 | if (url) { 42 | optNew.push( 43 | { 44 | content: "Open preview", 45 | callback: () => { 46 | window.open(url, "_blank") 47 | }, 48 | }, 49 | { 50 | content: "Save preview", 51 | callback: () => { 52 | const a = document.createElement("a"); 53 | a.href = url; 54 | a.setAttribute("download", new URLSearchParams(previewWidget.value.params).get("filename")); 55 | document.body.append(a); 56 | a.click(); 57 | requestAnimationFrame(() => a.remove()); 58 | }, 59 | } 60 | ); 61 | } 62 | if(options.length > 0 && options[0] != null && optNew.length > 0) { 63 | optNew.push(null); 64 | } 65 | options.unshift(...optNew); 66 | 67 | } catch (error) { 68 | console.log(error); 69 | } 70 | 71 | }); 72 | } 73 | function previewAudio(node,file,type){ 74 | var element = document.createElement("div"); 75 | const previewNode = node; 76 | var previewWidget = node.addDOMWidget("audiopreview", "preview", element, { 77 | serialize: false, 78 | hideOnZoom: false, 79 | getValue() { 80 | return element.value; 81 | }, 82 | setValue(v) { 83 | element.value = v; 84 | }, 85 | }); 86 | previewWidget.computeSize = function(width) { 87 | if (this.aspectRatio && !this.parentEl.hidden) { 88 | let height = (previewNode.size[0]-20)/ this.aspectRatio + 10; 89 | if (!(height > 0)) { 90 | height = 0; 91 | } 92 | this.computedHeight = height + 10; 93 | return [width, height]; 94 | } 95 | return [width, -4];//no loaded src, widget should not display 96 | } 97 | // element.style['pointer-events'] = "none" 98 | previewWidget.value = {hidden: false, paused: false, params: {}} 99 | previewWidget.parentEl = document.createElement("div"); 100 | previewWidget.parentEl.className = "audio_preview"; 101 | previewWidget.parentEl.style['width'] = "100%" 102 | element.appendChild(previewWidget.parentEl); 103 | previewWidget.audioEl = document.createElement("audio"); 104 | previewWidget.audioEl.controls = true; 105 | previewWidget.audioEl.loop = false; 106 | previewWidget.audioEl.muted = false; 107 | previewWidget.audioEl.style['width'] = "100%" 108 | previewWidget.audioEl.addEventListener("loadedmetadata", () => { 109 | 110 | previewWidget.aspectRatio = previewWidget.audioEl.audioWidth / previewWidget.audioEl.audioHeight; 111 | fitHeight(this); 112 | }); 113 | previewWidget.audioEl.addEventListener("error", () => { 114 | //TODO: consider a way to properly notify the user why a preview isn't shown. 115 | previewWidget.parentEl.hidden = true; 116 | fitHeight(this); 117 | }); 118 | 119 | let params = { 120 | "filename": file, 121 | "type": type, 122 | } 123 | 124 | previewWidget.parentEl.hidden = previewWidget.value.hidden; 125 | previewWidget.audioEl.autoplay = !previewWidget.value.paused && !previewWidget.value.hidden; 126 | let target_width = 256 127 | if (element.style?.width) { 128 | //overscale to allow scrolling. Endpoint won't return higher than native 129 | target_width = element.style.width.slice(0,-2)*2; 130 | } 131 | if (!params.force_size || params.force_size.includes("?") || params.force_size == "Disabled") { 132 | params.force_size = target_width+"x?" 133 | } else { 134 | let size = params.force_size.split("x") 135 | let ar = parseInt(size[0])/parseInt(size[1]) 136 | params.force_size = target_width+"x"+(target_width/ar) 137 | } 138 | 139 | previewWidget.audioEl.src = api.apiURL('/view?' 
+ new URLSearchParams(params)); 140 | 141 | previewWidget.audioEl.hidden = false; 142 | previewWidget.parentEl.appendChild(previewWidget.audioEl) 143 | } 144 | 145 | app.registerExtension({ 146 | name: "RVC.AudioPreviewer", 147 | async beforeRegisterNodeDef(nodeType, nodeData, app) { 148 | if (nodeData?.name == "PreViewAudio") { 149 | nodeType.prototype.onExecuted = function (data) { 150 | previewAudio(this, data.audio[0], data.audio[1]); 151 | } 152 | addPreviewOptions(nodeType) 153 | } 154 | } 155 | }); -------------------------------------------------------------------------------- /web/js/refreshPath.js: -------------------------------------------------------------------------------- 1 | import { app } from "../../../scripts/app.js"; 2 | import { api } from '../../../scripts/api.js' 3 | import { ComfyWidgets } from "../../../scripts/widgets.js" 4 | function rebootAPI() { 5 | if (confirm("Are you sure you'd like to reboot the server to refresh weights path?")) { 6 | try { 7 | api.fetchApi("/rvc/reboot"); 8 | } 9 | catch(exception) { 10 | 11 | } 12 | return true; 13 | } 14 | 15 | return false; 16 | } 17 | function pathRefresh(node, inputName, inputData, app) { 18 | // Create the button widget for selecting the files 19 | let refreshWidget = node.addWidget("button", "REBOOT TO REFRESH SID LIST", "refresh", () => { 20 | rebootAPI() 21 | }); 22 | 23 | refreshWidget.serialize = false; 24 | 25 | return { widget: refreshWidget }; 26 | } 27 | ComfyWidgets.PATHREFRESH = pathRefresh; 28 | 29 | app.registerExtension({ 30 | name: "RVC.RefreshPath", 31 | async beforeRegisterNodeDef(nodeType, nodeData, app) { 32 | if (nodeData?.name == "RVC_Infer") { 33 | nodeData.input.required.upload = ["PATHREFRESH"]; 34 | } 35 | }, 36 | }); -------------------------------------------------------------------------------- /wechat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/wechat.jpg --------------------------------------------------------------------------------