├── LICENSE ├── README.md ├── __init__.py ├── donate.jpg ├── download_models.py ├── nodes.py ├── requirements.txt ├── rvc ├── __init__.py ├── configs │ ├── __pycache__ │ │ └── config.cpython-310.pyc │ ├── config.json │ ├── config.py │ ├── inuse │ │ ├── .gitignore │ │ ├── v1 │ │ │ └── .gitignore │ │ └── v2 │ │ │ └── .gitignore │ ├── v1 │ │ ├── 32k.json │ │ ├── 40k.json │ │ └── 48k.json │ └── v2 │ │ ├── 32k.json │ │ └── 48k.json ├── i18n │ ├── __pycache__ │ │ └── i18n.cpython-310.pyc │ ├── i18n.py │ ├── locale │ │ ├── en_US.json │ │ ├── es_ES.json │ │ ├── fr_FR.json │ │ ├── it_IT.json │ │ ├── ja_JP.json │ │ ├── ko_KR.json │ │ ├── pt_BR.json │ │ ├── ru_RU.json │ │ ├── tr_TR.json │ │ ├── zh_CN.json │ │ ├── zh_HK.json │ │ ├── zh_SG.json │ │ └── zh_TW.json │ ├── locale_diff.py │ └── scan_i18n.py ├── infer │ ├── lib │ │ ├── __pycache__ │ │ │ ├── audio.cpython-310.pyc │ │ │ ├── rmvpe.cpython-310.pyc │ │ │ ├── rvcmd.cpython-310.pyc │ │ │ └── slicer2.cpython-310.pyc │ │ ├── audio.py │ │ ├── infer_pack │ │ │ ├── __pycache__ │ │ │ │ ├── attentions.cpython-310.pyc │ │ │ │ ├── commons.cpython-310.pyc │ │ │ │ ├── models.cpython-310.pyc │ │ │ │ ├── modules.cpython-310.pyc │ │ │ │ └── transforms.cpython-310.pyc │ │ │ ├── attentions.py │ │ │ ├── attentions_onnx.py │ │ │ ├── commons.py │ │ │ ├── models.py │ │ │ ├── models_onnx.py │ │ │ ├── modules.py │ │ │ ├── modules │ │ │ │ └── F0Predictor │ │ │ │ │ ├── DioF0Predictor.py │ │ │ │ │ ├── F0Predictor.py │ │ │ │ │ ├── HarvestF0Predictor.py │ │ │ │ │ ├── PMF0Predictor.py │ │ │ │ │ └── __init__.py │ │ │ ├── onnx_inference.py │ │ │ └── transforms.py │ │ ├── jit │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ └── __init__.cpython-310.pyc │ │ │ ├── get_hubert.py │ │ │ ├── get_rmvpe.py │ │ │ └── get_synthesizer.py │ │ ├── rmvpe.py │ │ ├── rtrvc.py │ │ ├── rvcmd.py │ │ ├── slicer2.py │ │ ├── train │ │ │ ├── __pycache__ │ │ │ │ ├── data_utils.cpython-310.pyc │ │ │ │ ├── losses.cpython-310.pyc │ │ │ │ ├── mel_processing.cpython-310.pyc │ │ │ │ ├── process_ckpt.cpython-310.pyc │ │ │ │ └── utils.cpython-310.pyc │ │ │ ├── data_utils.py │ │ │ ├── losses.py │ │ │ ├── mel_processing.py │ │ │ ├── process_ckpt.py │ │ │ └── utils.py │ │ └── uvr5_pack │ │ │ ├── lib_v5 │ │ │ ├── dataset.py │ │ │ ├── layers.py │ │ │ ├── layers_123812KB .py │ │ │ ├── layers_123821KB.py │ │ │ ├── layers_33966KB.py │ │ │ ├── layers_537227KB.py │ │ │ ├── layers_537238KB.py │ │ │ ├── layers_new.py │ │ │ ├── model_param_init.py │ │ │ ├── modelparams │ │ │ │ ├── 1band_sr16000_hl512.json │ │ │ │ ├── 1band_sr32000_hl512.json │ │ │ │ ├── 1band_sr33075_hl384.json │ │ │ │ ├── 1band_sr44100_hl1024.json │ │ │ │ ├── 1band_sr44100_hl256.json │ │ │ │ ├── 1band_sr44100_hl512.json │ │ │ │ ├── 1band_sr44100_hl512_cut.json │ │ │ │ ├── 2band_32000.json │ │ │ │ ├── 2band_44100_lofi.json │ │ │ │ ├── 2band_48000.json │ │ │ │ ├── 3band_44100.json │ │ │ │ ├── 3band_44100_mid.json │ │ │ │ ├── 3band_44100_msb2.json │ │ │ │ ├── 4band_44100.json │ │ │ │ ├── 4band_44100_mid.json │ │ │ │ ├── 4band_44100_msb.json │ │ │ │ ├── 4band_44100_msb2.json │ │ │ │ ├── 4band_44100_reverse.json │ │ │ │ ├── 4band_44100_sw.json │ │ │ │ ├── 4band_v2.json │ │ │ │ ├── 4band_v2_sn.json │ │ │ │ ├── 4band_v3.json │ │ │ │ └── ensemble.json │ │ │ ├── nets.py │ │ │ ├── nets_123812KB.py │ │ │ ├── nets_123821KB.py │ │ │ ├── nets_33966KB.py │ │ │ ├── nets_537227KB.py │ │ │ ├── nets_537238KB.py │ │ │ ├── nets_61968KB.py │ │ │ ├── nets_new.py │ │ │ └── spec_utils.py │ │ │ ├── name_params.json │ │ │ └── utils.py │ └── modules │ │ ├── gui │ │ ├── 
__init__.py │ │ ├── torchgate.py │ │ └── utils.py │ │ ├── ipex │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── gradscaler.py │ │ └── hijacks.py │ │ ├── onnx │ │ └── export.py │ │ ├── train │ │ ├── extract │ │ │ ├── extract_f0_print.py │ │ │ ├── extract_f0_rmvpe.py │ │ │ └── extract_f0_rmvpe_dml.py │ │ ├── extract_feature_print.py │ │ ├── preprocess.py │ │ └── train.py │ │ ├── uvr5 │ │ ├── mdxnet.py │ │ ├── modules.py │ │ └── vr.py │ │ └── vc │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── modules.cpython-310.pyc │ │ ├── pipeline.cpython-310.pyc │ │ └── utils.cpython-310.pyc │ │ ├── modules.py │ │ ├── pipeline.py │ │ └── utils.py ├── logs │ └── mute │ │ ├── 0_gt_wavs │ │ ├── mute32k.wav │ │ ├── mute40k.spec.pt │ │ ├── mute40k.wav │ │ ├── mute48k.spec.pt │ │ └── mute48k.wav │ │ ├── 1_16k_wavs │ │ └── mute.wav │ │ ├── 2a_f0 │ │ └── mute.wav.npy │ │ ├── 2b-f0nsf │ │ └── mute.wav.npy │ │ ├── 3_feature256 │ │ └── mute.npy │ │ └── 3_feature768 │ │ └── mute.npy └── train.py ├── web.png ├── web └── js │ ├── alertMSG.js │ ├── previewAudio.js │ ├── refreshPath.js │ └── uploadAudio.js └── wechat.jpg /LICENSE: -------------------------------------------------------------------------------- 1 | 本软件及其相关代码以MIT协议开源,作者不对软件具备任何控制力,使用软件者、传播软件导出的声音者自负全责。 2 | 如不认可该条款,则不能使用或引用软件包内任何代码和文件。 3 | 4 | 特此授予任何获得本软件和相关文档文件(以下简称“软件”)副本的人免费使用、复制、修改、合并、出版、分发、再授权和/或销售本软件的权利,以及授予本软件所提供的人使用本软件的权利,但须符合以下条件: 5 | 上述版权声明和本许可声明应包含在软件的所有副本或实质部分中。 6 | 软件是“按原样”提供的,没有任何明示或暗示的保证,包括但不限于适销性、适用于特定目的和不侵权的保证。在任何情况下,作者或版权持有人均不承担因软件或软件的使用或其他交易而产生、产生或与之相关的任何索赔、损害赔偿或其他责任,无论是在合同诉讼、侵权诉讼还是其他诉讼中。 7 | 8 | MIT License 9 | 10 | Copyright (c) 2024 AIFSH 11 | 12 | Permission is hereby granted, free of charge, to any person obtaining a copy 13 | of this software and associated documentation files (the "Software"), to deal 14 | in the Software without restriction, including without limitation the rights 15 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 16 | copies of the Software, and to permit persons to whom the Software is 17 | furnished to do so, subject to the following conditions: 18 | 19 | The above copyright notice and this permission notice shall be included in all 20 | copies or substantial portions of the Software. 21 | 22 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 23 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 24 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 25 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 26 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 27 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 28 | SOFTWARE. 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ComfyUI-RVC 2 | a comfyui custom node for [Retrieval-based-Voice-Conversion-WebUI](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI.git),you can Voice-Conversion in comfyui now! 3 | 4 | ## How to use 5 | make sure `ffmpeg` is worked in your commandline 6 | for Linux 7 | ``` 8 | apt update 9 | apt install ffmpeg 10 | ``` 11 | for Windows,you can install `ffmpeg` by [WingetUI](https://github.com/marticliment/WingetUI) automatically 12 | 13 | then! 
14 | ``` 15 | git clone https://github.com/AIFSH/ComfyUI-RVC.git 16 | cd ComfyUI-RVC 17 | pip install -r requirements.txt 18 | ``` 19 | `weights` will be downloaded from Hugging Face automatically! If you are in China, make sure your network can reach Hugging Face, 20 | or, if you still struggle with Hugging Face, you may follow [hf-mirror](https://hf-mirror.com/) to configure your environment. 21 | 22 | Alternatively, download [rvc_assets.zip](https://pan.quark.cn/s/039c8d2d59ac), extract it, and place the contents in the `ComfyUI-RVC/rvc` directory. 23 |
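If you prefer to fetch the assets up front, the bundled download script can also be run by hand (a minimal sketch, assuming `huggingface_hub` is available in your ComfyUI Python environment; `__init__.py` runs this same script automatically on first load when `rvc/assets/pretrained_v2` is missing):
```
cd ComfyUI-RVC
python download_models.py
```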
24 | ## Tutorial 25 | [Demo](https://www.bilibili.com/video/BV1bH4y1P7n9/) 26 | 27 | ## WeChat Group && Donate 28 | 29 | 30 | ![Wechat](wechat.jpg) 31 | ![donate](donate.jpg) 32 | 33 |
34 | 35 | ## Thanks 36 | - [Retrieval-based-Voice-Conversion-WebUI](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI.git) 37 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys,site 3 | from subprocess import Popen 4 | from server import PromptServer 5 | now_dir = os.path.dirname(os.path.abspath(__file__)) 6 | 7 | site_packages_roots = [] 8 | for path in site.getsitepackages(): 9 | if "packages" in path: 10 | site_packages_roots.append(path) 11 | if(site_packages_roots==[]):site_packages_roots=["%s/runtime/Lib/site-packages" % now_dir] 12 | #os.environ["OPENBLAS_NUM_THREADS"] = "4" 13 | for site_packages_root in site_packages_roots: 14 | if os.path.exists(site_packages_root): 15 | try: 16 | with open("%s/users.pth" % (site_packages_root), "a") as f: 17 | f.write( 18 | "%s\n%s/rvc\n%s/rvc/infer" 19 | % (now_dir,now_dir,now_dir) 20 | ) 21 | break 22 | except PermissionError: 23 | raise PermissionError 24 | 25 | if os.path.isfile("%s/users.pth" % (site_packages_root)): 26 | print("!!!RVC path was added to " + "%s/users.pth" % (site_packages_root) 27 | + "\n if meet `No module` error,try `python main.py` again") 28 | 29 | model_path = os.path.join(now_dir,"rvc", "assets") 30 | 31 | if not os.path.exists(os.path.join(model_path, "pretrained_v2")): 32 | cmd = "python %s/download_models.py" % (now_dir) 33 | p = Popen(cmd, shell=True, cwd=now_dir) 34 | p.wait() 35 | else: 36 | print("!!!RVC use cache models,make sure your 'assets' complete") 37 | 38 | 39 | WEB_DIRECTORY = "./web" 40 | from .nodes import LoadAudio, PreViewAudio,RVC_Train,RVC_Infer,CombineAudio 41 | 42 | # Set the web directory, any .js file in that directory will be loaded by the frontend as a frontend extension 43 | # WEB_DIRECTORY = "./somejs" 44 | 45 | # A dictionary that contains all nodes you want to export with their names 46 | # NOTE: names should be globally unique 47 | NODE_CLASS_MAPPINGS = { 48 | "LoadAudio": LoadAudio, 49 | "PreViewAudio": PreViewAudio, 50 | "RVC_Train": RVC_Train, 51 | "RVC_Infer": RVC_Infer, 52 | "CombineAudio": CombineAudio 53 | } 54 | 55 | # A dictionary that contains the friendly/humanly readable titles for the nodes 56 | NODE_DISPLAY_NAME_MAPPINGS = { 57 | "LoadAudio": "AudioLoader", 58 | "PreViewAudio": "PreView Audio", 59 | "RVC_Train": "RVC Train", 60 | "RVC_Infer": "RVC Inference", 61 | "CombineAudio": "CombineAudio" 62 | } 63 | 64 | @PromptServer.instance.routes.get("/rvc/reboot") 65 | def restart(self): 66 | try: 67 | sys.stdout.close_log() 68 | except Exception as e: 69 | pass 70 | 71 | return os.execv(sys.executable, [sys.executable] + sys.argv) -------------------------------------------------------------------------------- /donate.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/donate.jpg -------------------------------------------------------------------------------- /download_models.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from huggingface_hub import hf_hub_download 4 | 5 | now_dir = os.path.dirname(os.path.abspath(__file__)) 6 | BASE_DIR = os.path.join(now_dir, "rvc") 7 | 8 | 9 | if __name__ == "__main__": 10 | os.makedirs(os.path.join(BASE_DIR ,"assets","weights"), exist_ok=True) 11 | 
weights_path = os.path.join(BASE_DIR ,"assets") 12 | print("Downloading hubert_base.pt...") 13 | hf_hub_download(repo_id="lj1995/VoiceConversionWebUI", 14 | filename="hubert_base.pt", 15 | subfolder= "", 16 | local_dir= os.path.join(weights_path, "hubert")) 17 | print("Downloading rmvpe.pt...") 18 | hf_hub_download(repo_id="lj1995/VoiceConversionWebUI", 19 | filename="rmvpe.pt", 20 | subfolder= "", 21 | local_dir= os.path.join(weights_path, "rmvpe")) 22 | 23 | 24 | print("Downloading pretrained models:") 25 | 26 | model_names = [ 27 | "D40k.pth", 28 | "D48k.pth", 29 | "G32k.pth", 30 | "G40k.pth", 31 | "G48k.pth", 32 | "f0D32k.pth", 33 | "f0D40k.pth", 34 | "f0D48k.pth", 35 | "f0G32k.pth", 36 | "f0G40k.pth", 37 | "f0G48k.pth", 38 | ] 39 | for model in model_names: 40 | print(f"Downloading {model}...") 41 | hf_hub_download(repo_id="lj1995/VoiceConversionWebUI", 42 | filename=model, 43 | subfolder= "pretrained", 44 | local_dir= weights_path) 45 | 46 | 47 | print("Downloading pretrained models v2:") 48 | 49 | for model in model_names: 50 | print(f"Downloading {model}...") 51 | hf_hub_download(repo_id="lj1995/VoiceConversionWebUI", 52 | filename=model, 53 | subfolder= "pretrained_v2", 54 | local_dir= weights_path) 55 | 56 | print("All models downloaded!") 57 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | joblib>=1.1.0 2 | numba 3 | numpy==1.23.5 4 | scipy 5 | librosa==0.9.1 6 | llvmlite 7 | fairseq 8 | faiss-cpu 9 | Cython 10 | pydub>=0.25.1 11 | soundfile>=0.12.1 12 | ffmpeg-python>=0.2.0 13 | tensorboardX 14 | Jinja2>=3.1.2 15 | json5 16 | Markdown 17 | matplotlib>=3.7.0 18 | matplotlib-inline>=0.1.3 19 | praat-parselmouth>=0.4.2 20 | Pillow>=9.1.1 21 | resampy>=0.4.2 22 | scikit-learn 23 | tensorboard 24 | tqdm>=4.63.1 25 | tornado>=6.1 26 | Werkzeug>=2.2.3 27 | uc-micro-py>=1.0.1 28 | sympy>=1.11.1 29 | tabulate>=0.8.10 30 | PyYAML>=6.0 31 | pyasn1>=0.4.8 32 | pyasn1-modules>=0.2.8 33 | fsspec>=2022.11.0 34 | absl-py>=1.2.0 35 | audioread 36 | uvicorn>=0.21.1 37 | colorama>=0.4.5 38 | pyworld==0.3.2 39 | httpx 40 | onnxruntime; sys_platform == 'darwin' 41 | onnxruntime-gpu; sys_platform != 'darwin' 42 | torchcrepe==0.0.20 43 | fastapi 44 | torchfcpe 45 | ffmpy==0.3.1 46 | python-dotenv>=1.0.0 47 | av 48 | -------------------------------------------------------------------------------- /rvc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/__init__.py -------------------------------------------------------------------------------- /rvc/configs/__pycache__/config.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/configs/__pycache__/config.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/configs/config.json: -------------------------------------------------------------------------------- 1 | {"pth_path": "assets/weights/kikiV1.pth", "index_path": "logs/kikiV1.index", "sg_hostapi": "MME", "sg_wasapi_exclusive": false, "sg_input_device": "VoiceMeeter Output (VB-Audio Vo", "sg_output_device": "VoiceMeeter Input (VB-Audio Voi", "sr_type": "sr_device", "threhold": -60.0, "pitch": 12.0, "formant": 0.0, "rms_mix_rate": 0.5, 
"index_rate": 0.0, "block_time": 0.15, "crossfade_length": 0.08, "extra_time": 2.0, "n_cpu": 4.0, "use_jit": false, "use_pv": false, "f0method": "fcpe"} -------------------------------------------------------------------------------- /rvc/configs/inuse/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | !v1 4 | !v2 5 | -------------------------------------------------------------------------------- /rvc/configs/inuse/v1/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /rvc/configs/inuse/v2/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /rvc/configs/v1/32k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": true, 11 | "lr_decay": 0.999875, 12 | "segment_size": 12800, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 32000, 21 | "filter_length": 1024, 22 | "hop_length": 320, 23 | "win_length": 1024, 24 | "n_mel_channels": 80, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,4,2,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16,16,4,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /rvc/configs/v1/40k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": true, 11 | "lr_decay": 0.999875, 12 | "segment_size": 12800, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 40000, 21 | "filter_length": 2048, 22 | "hop_length": 400, 23 | "win_length": 2048, 24 | "n_mel_channels": 125, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,10,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16,16,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /rvc/configs/v1/48k.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": true, 11 | "lr_decay": 0.999875, 12 | "segment_size": 11520, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 48000, 21 | "filter_length": 2048, 22 | "hop_length": 480, 23 | "win_length": 2048, 24 | "n_mel_channels": 128, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,6,2,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16,16,4,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /rvc/configs/v2/32k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": true, 11 | "lr_decay": 0.999875, 12 | "segment_size": 12800, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 32000, 21 | "filter_length": 1024, 22 | "hop_length": 320, 23 | "win_length": 1024, 24 | "n_mel_channels": 80, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,8,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [20,16,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /rvc/configs/v2/48k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": true, 11 | "lr_decay": 0.999875, 12 | "segment_size": 17280, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 48000, 21 | "filter_length": 2048, 22 | "hop_length": 480, 23 | "win_length": 2048, 24 | "n_mel_channels": 128, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], 
[1,3,5]], 39 | "upsample_rates": [12,10,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [24,20,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /rvc/i18n/__pycache__/i18n.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/i18n/__pycache__/i18n.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/i18n/i18n.py: -------------------------------------------------------------------------------- 1 | import json 2 | import locale 3 | import os 4 | 5 | now_dir = os.path.dirname(os.path.abspath(__file__)) 6 | def load_language_list(language): 7 | with open(f"{now_dir}/locale/{language}.json", "r", encoding="utf-8") as f: 8 | language_list = json.load(f) 9 | return language_list 10 | 11 | 12 | class I18nAuto: 13 | def __init__(self, language=None): 14 | if language in ["Auto", None]: 15 | language = locale.getdefaultlocale()[ 16 | 0 17 | ] # getlocale can't identify the system's language ((None, None)) 18 | if not os.path.exists(f"{now_dir}/locale/{language}.json"): 19 | language = "en_US" 20 | self.language = language 21 | self.language_map = load_language_list(language) 22 | 23 | def __call__(self, key): 24 | return self.language_map.get(key, key) 25 | 26 | def __repr__(self): 27 | return "Use Language: " + self.language 28 | -------------------------------------------------------------------------------- /rvc/i18n/locale/zh_CN.json: -------------------------------------------------------------------------------- 1 | { 2 | ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音", 3 | "A模型权重": "A模型权重", 4 | "A模型路径": "A模型路径", 5 | "B模型路径": "B模型路径", 6 | "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", 7 | "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调", 8 | "Index Rate": "Index Rate", 9 | "Onnx导出": "Onnx导出", 10 | "Onnx输出路径": "Onnx输出路径", 11 | "RVC模型路径": "RVC模型路径", 12 | "ckpt处理": "ckpt处理", 13 | "harvest进程数": "harvest进程数", 14 | "index文件路径不可包含中文": "index文件路径不可包含中文", 15 | "pth文件路径不可包含中文": "pth文件路径不可包含中文", 16 | "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程", 17 | "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ", 18 | "step1:正在处理数据": "step1:正在处理数据", 19 | "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", 20 | "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ", 21 | "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)", 22 | "step3: 填写训练设置, 开始训练模型和索引": "step3: 填写训练设置, 开始训练模型和索引", 23 | "step3a:正在训练模型": "step3a:正在训练模型", 24 | "一键训练": "一键训练", 25 | "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", 26 | "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。", 27 | "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2", 28 | "伴奏人声分离&去混响&去回声": "伴奏人声分离&去混响&去回声", 29 | "使用模型采样率": "使用模型采样率", 30 | "使用设备采样率": "使用设备采样率", 31 | "保存名": "保存名", 32 | "保存的文件名, 默认空为和源文件同名": "保存的文件名, 默认空为和源文件同名", 33 | "保存的模型名不带后缀": "保存的模型名不带后缀", 34 | "保存频率save_every_epoch": "保存频率save_every_epoch", 35 | "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果", 36 | "修改": "修改", 37 | "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型信息(仅支持weights文件夹下提取的小模型文件)", 38 | "停止音频转换": "停止音频转换", 39 | "全流程结束!": "全流程结束!", 40 | "共振偏移": "共振偏移", 41 | "刷新音色列表和索引路径": "刷新音色列表和索引路径", 42 | "加载模型": "加载模型", 43 | "加载预训练底模D路径": "加载预训练底模D路径", 44 | "加载预训练底模G路径": "加载预训练底模G路径", 45 | "单次推理": "单次推理", 46 | "卸载音色省显存": "卸载音色省显存", 47 | "变调(整数, 半音数量, 升八度12降八度-12)": "变调(整数, 半音数量, 升八度12降八度-12)", 48 | "后处理重采样至最终采样率,0为不进行重采样": "后处理重采样至最终采样率,0为不进行重采样", 49 | "否": "否", 50 | "启用相位声码器": "启用相位声码器", 51 | "响应阈值": "响应阈值", 52 | "响度因子": "响度因子", 53 | "处理数据": "处理数据", 54 | "导出Onnx模型": "导出Onnx模型", 55 | "导出文件格式": "导出文件格式", 56 | "常见问题解答": "常见问题解答", 57 | "常规设置": "常规设置", 58 | "开始音频转换": "开始音频转换", 59 | "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", 60 | "性能设置": "性能设置", 61 | "总训练轮数total_epoch": "总训练轮数total_epoch", 62 | "批量推理": "批量推理", 63 | "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ", 64 | "指定输出主人声文件夹": "指定输出主人声文件夹", 65 | "指定输出文件夹": "指定输出文件夹", 66 | "指定输出非主人声文件夹": "指定输出非主人声文件夹", 67 | "推理时间(ms):": "推理时间(ms):", 68 | "推理音色": "推理音色", 69 | "提取": "提取", 70 | "提取音高和处理数据使用的CPU进程数": "提取音高和处理数据使用的CPU进程数", 71 | "是": "是", 72 | "是否仅保存最新的ckpt文件以节省硬盘空间": "是否仅保存最新的ckpt文件以节省硬盘空间", 73 | "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存时间点将最终小模型保存至weights文件夹", 74 | "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速", 75 | "显卡信息": "显卡信息", 76 | "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.", 77 | "查看": "查看", 78 | "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型信息(仅支持weights文件夹下提取的小模型文件)", 79 | "检索特征占比": "检索特征占比", 80 | "模型": "模型", 81 | "模型推理": "模型推理", 82 | "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况", 83 | "模型是否带音高指导": "模型是否带音高指导", 84 | "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否带音高指导(唱歌一定要, 语音可以不要)", 85 | "模型是否带音高指导,1是0否": "模型是否带音高指导,1是0否", 86 | "模型版本型号": "模型版本型号", 87 | "模型融合, 可用于测试音色融合": "模型融合, 可用于测试音色融合", 88 | "模型路径": "模型路径", 89 | "每张显卡的batch_size": "每张显卡的batch_size", 90 | "淡入淡出长度": "淡入淡出长度", 91 | "版本": "版本", 92 | "特征提取": "特征提取", 93 | "特征检索库文件路径,为空则使用下拉的选择结果": "特征检索库文件路径,为空则使用下拉的选择结果", 94 | "独占 WASAPI 设备": "独占 WASAPI 设备", 95 | "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ", 96 | "目标采样率": "目标采样率", 97 | "算法延迟(ms):": "算法延迟(ms):", 98 | "自动检测index路径,下拉式选择(dropdown)": "自动检测index路径,下拉式选择(dropdown)", 99 | "融合": "融合", 100 | "要改的模型信息": "要改的模型信息", 101 | "要置入的模型信息": "要置入的模型信息", 102 | "训练": "训练", 103 | "训练模型": "训练模型", 104 | "训练特征索引": "训练特征索引", 105 | "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", 106 | "设备类型": "设备类型", 107 | "请指定说话人id": "请指定说话人id", 108 | "请选择index文件": "请选择index文件", 109 | "请选择pth文件": "请选择pth文件", 110 | "请选择说话人id": "请选择说话人id", 111 | "转换": "转换", 112 | "输入实验名": "输入实验名", 113 | "输入待处理音频文件夹路径": "输入待处理音频文件夹路径", 114 | "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)", 115 | "输入待处理音频文件路径(默认是正确格式示例)": "输入待处理音频文件路径(默认是正确格式示例)", 116 | "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络", 117 | "输入监听": "输入监听", 118 | "输入训练文件夹路径": "输入训练文件夹路径", 119 | "输入设备": "输入设备", 120 | "输入降噪": "输入降噪", 121 | "输出信息": "输出信息", 122 | "输出变声": "输出变声", 123 | "输出设备": "输出设备", 124 | "输出降噪": "输出降噪", 125 | "输出音频(右下角三个点,点了可以下载)": "输出音频(右下角三个点,点了可以下载)", 126 | "选择.index文件": "选择.index文件", 127 | "选择.pth文件": "选择.pth文件", 128 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", 129 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", 130 | "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", 131 | "采样率:": "采样率:", 132 | "采样长度": "采样长度", 133 | "重载设备列表": "重载设备列表", 134 | "音调设置": "音调设置", 135 | "音频设备": "音频设备", 136 | "音高算法": "音高算法", 137 | "额外推理时长": "额外推理时长" 138 | } 139 | -------------------------------------------------------------------------------- /rvc/i18n/locale_diff.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import OrderedDict 4 | 5 | # Define the standard file name 6 | standard_file = "locale/zh_CN.json" 7 | 8 | # Find all JSON files in the directory 9 | dir_path = "locale/" 10 | languages = [ 11 | os.path.join(dir_path, f) 12 | for f in os.listdir(dir_path) 13 | if f.endswith(".json") and f != standard_file 14 | ] 15 | 16 | # Load the standard file 17 | with open(standard_file, "r", encoding="utf-8") as f: 18 | standard_data = json.load(f, object_pairs_hook=OrderedDict) 19 | 20 | # Loop through each language file 21 | for lang_file in languages: 22 | # Load the language file 23 | with open(lang_file, "r", encoding="utf-8") as f: 24 | lang_data = json.load(f, object_pairs_hook=OrderedDict) 25 | 26 | # Find the difference between the language file and the standard file 27 
| diff = set(standard_data.keys()) - set(lang_data.keys()) 28 | 29 | miss = set(lang_data.keys()) - set(standard_data.keys()) 30 | 31 | # Add any missing keys to the language file 32 | for key in diff: 33 | lang_data[key] = key 34 | 35 | # Del any extra keys to the language file 36 | for key in miss: 37 | del lang_data[key] 38 | 39 | # Sort the keys of the language file to match the order of the standard file 40 | lang_data = OrderedDict( 41 | sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0])) 42 | ) 43 | 44 | # Save the updated language file 45 | with open(lang_file, "w", encoding="utf-8") as f: 46 | json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True) 47 | f.write("\n") 48 | -------------------------------------------------------------------------------- /rvc/i18n/scan_i18n.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import glob 3 | import json 4 | from collections import OrderedDict 5 | 6 | 7 | def extract_i18n_strings(node): 8 | i18n_strings = [] 9 | 10 | if ( 11 | isinstance(node, ast.Call) 12 | and isinstance(node.func, ast.Name) 13 | and node.func.id == "i18n" 14 | ): 15 | for arg in node.args: 16 | if isinstance(arg, ast.Str): 17 | i18n_strings.append(arg.s) 18 | 19 | for child_node in ast.iter_child_nodes(node): 20 | i18n_strings.extend(extract_i18n_strings(child_node)) 21 | 22 | return i18n_strings 23 | 24 | 25 | # scan the directory for all .py files (recursively) 26 | # for each file, parse the code into an AST 27 | # for each AST, extract the i18n strings 28 | 29 | strings = [] 30 | for filename in glob.iglob("**/*.py", recursive=True): 31 | with open(filename, "r") as f: 32 | code = f.read() 33 | if "I18nAuto" in code: 34 | tree = ast.parse(code) 35 | i18n_strings = extract_i18n_strings(tree) 36 | print(filename, len(i18n_strings)) 37 | strings.extend(i18n_strings) 38 | code_keys = set(strings) 39 | """ 40 | n_i18n.py 41 | gui_v1.py 26 42 | app.py 16 43 | infer-web.py 147 44 | scan_i18n.py 0 45 | i18n.py 0 46 | lib/train/process_ckpt.py 1 47 | """ 48 | print() 49 | print("Total unique:", len(code_keys)) 50 | 51 | 52 | standard_file = "i18n/locale/zh_CN.json" 53 | with open(standard_file, "r", encoding="utf-8") as f: 54 | standard_data = json.load(f, object_pairs_hook=OrderedDict) 55 | standard_keys = set(standard_data.keys()) 56 | 57 | # Define the standard file name 58 | unused_keys = standard_keys - code_keys 59 | print("Unused keys:", len(unused_keys)) 60 | for unused_key in unused_keys: 61 | print("\t", unused_key) 62 | 63 | missing_keys = code_keys - standard_keys 64 | print("Missing keys:", len(missing_keys)) 65 | for missing_key in missing_keys: 66 | print("\t", missing_key) 67 | 68 | code_keys_dict = OrderedDict() 69 | for s in strings: 70 | code_keys_dict[s] = s 71 | 72 | # write back 73 | with open(standard_file, "w", encoding="utf-8") as f: 74 | json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True) 75 | f.write("\n") 76 | -------------------------------------------------------------------------------- /rvc/infer/lib/__pycache__/audio.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/__pycache__/audio.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/__pycache__/rmvpe.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/__pycache__/rmvpe.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/__pycache__/rvcmd.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/__pycache__/rvcmd.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/__pycache__/slicer2.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/__pycache__/slicer2.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/audio.py: -------------------------------------------------------------------------------- 1 | import platform 2 | import ffmpeg 3 | import numpy as np 4 | import av 5 | 6 | 7 | def wav2(i, o, format): 8 | inp = av.open(i, "r") 9 | if format == "m4a": 10 | format = "mp4" 11 | out = av.open(o, "w", format=format) 12 | if format == "ogg": 13 | format = "libvorbis" 14 | if format == "mp4": 15 | format = "aac" 16 | 17 | ostream = out.add_stream(format) 18 | 19 | for frame in inp.decode(audio=0): 20 | for p in ostream.encode(frame): 21 | out.mux(p) 22 | 23 | for p in ostream.encode(None): 24 | out.mux(p) 25 | 26 | out.close() 27 | inp.close() 28 | 29 | 30 | def load_audio(file, sr): 31 | try: 32 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 33 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary. 34 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 
35 | file = clean_path(file) # 防止小白拷路径头尾带了空格和"和回车 36 | out, _ = ( 37 | ffmpeg.input(file, threads=0) 38 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) 39 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) 40 | ) 41 | except Exception as e: 42 | raise RuntimeError(f"Failed to load audio: {e}") 43 | 44 | return np.frombuffer(out, np.float32).flatten() 45 | 46 | 47 | def clean_path(path_str): 48 | if platform.system() == "Windows": 49 | path_str = path_str.replace("/", "\\") 50 | return path_str.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 51 | -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/__pycache__/attentions.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/infer_pack/__pycache__/attentions.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/__pycache__/commons.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/infer_pack/__pycache__/commons.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/__pycache__/models.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/infer_pack/__pycache__/models.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/__pycache__/modules.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/infer_pack/__pycache__/modules.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/__pycache__/transforms.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/infer_pack/__pycache__/transforms.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/commons.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | import math 3 | 4 | import numpy as np 5 | import torch 6 | from torch import nn 7 | from torch.nn import functional as F 8 | 9 | 10 | def init_weights(m, mean=0.0, std=0.01): 11 | classname = m.__class__.__name__ 12 | if classname.find("Conv") != -1: 13 | m.weight.data.normal_(mean, std) 14 | 15 | 16 | def get_padding(kernel_size, dilation=1): 17 | return int((kernel_size * dilation - dilation) / 2) 18 | 19 | 20 | # def convert_pad_shape(pad_shape): 21 | # l = pad_shape[::-1] 22 | # pad_shape = [item for sublist in l for item in sublist] 23 | # return pad_shape 24 | 25 | 26 | def kl_divergence(m_p, logs_p, m_q, logs_q): 27 | """KL(P||Q)""" 28 | kl = (logs_q - logs_p) - 0.5 29 | kl += ( 30 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) 31 | ) 32 | return kl 33 
| 34 | 35 | def rand_gumbel(shape): 36 | """Sample from the Gumbel distribution, protect from overflows.""" 37 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 38 | return -torch.log(-torch.log(uniform_samples)) 39 | 40 | 41 | def rand_gumbel_like(x): 42 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 43 | return g 44 | 45 | 46 | def slice_segments(x, ids_str, segment_size=4): 47 | ret = torch.zeros_like(x[:, :, :segment_size]) 48 | for i in range(x.size(0)): 49 | idx_str = ids_str[i] 50 | idx_end = idx_str + segment_size 51 | ret[i] = x[i, :, idx_str:idx_end] 52 | return ret 53 | 54 | 55 | def slice_segments2(x, ids_str, segment_size=4): 56 | ret = torch.zeros_like(x[:, :segment_size]) 57 | for i in range(x.size(0)): 58 | idx_str = ids_str[i] 59 | idx_end = idx_str + segment_size 60 | ret[i] = x[i, idx_str:idx_end] 61 | return ret 62 | 63 | 64 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 65 | b, d, t = x.size() 66 | if x_lengths is None: 67 | x_lengths = t 68 | ids_str_max = x_lengths - segment_size + 1 69 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 70 | ret = slice_segments(x, ids_str, segment_size) 71 | return ret, ids_str 72 | 73 | 74 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): 75 | position = torch.arange(length, dtype=torch.float) 76 | num_timescales = channels // 2 77 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( 78 | num_timescales - 1 79 | ) 80 | inv_timescales = min_timescale * torch.exp( 81 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment 82 | ) 83 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 84 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) 85 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 86 | signal = signal.view(1, channels, length) 87 | return signal 88 | 89 | 90 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 91 | b, channels, length = x.size() 92 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 93 | return x + signal.to(dtype=x.dtype, device=x.device) 94 | 95 | 96 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 97 | b, channels, length = x.size() 98 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 99 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 100 | 101 | 102 | def subsequent_mask(length): 103 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 104 | return mask 105 | 106 | 107 | @torch.jit.script 108 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 109 | n_channels_int = n_channels[0] 110 | in_act = input_a + input_b 111 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 112 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 113 | acts = t_act * s_act 114 | return acts 115 | 116 | 117 | # def convert_pad_shape(pad_shape): 118 | # l = pad_shape[::-1] 119 | # pad_shape = [item for sublist in l for item in sublist] 120 | # return pad_shape 121 | 122 | 123 | def convert_pad_shape(pad_shape: List[List[int]]) -> List[int]: 124 | return torch.tensor(pad_shape).flip(0).reshape(-1).int().tolist() 125 | 126 | 127 | def shift_1d(x): 128 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 129 | return x 130 | 131 | 132 | def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None): 133 | if max_length is None: 134 | 
max_length = length.max() 135 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 136 | return x.unsqueeze(0) < length.unsqueeze(1) 137 | 138 | 139 | def generate_path(duration, mask): 140 | """ 141 | duration: [b, 1, t_x] 142 | mask: [b, 1, t_y, t_x] 143 | """ 144 | device = duration.device 145 | 146 | b, _, t_y, t_x = mask.shape 147 | cum_duration = torch.cumsum(duration, -1) 148 | 149 | cum_duration_flat = cum_duration.view(b * t_x) 150 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 151 | path = path.view(b, t_x, t_y) 152 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 153 | path = path.unsqueeze(1).transpose(2, 3) * mask 154 | return path 155 | 156 | 157 | def clip_grad_value_(parameters, clip_value, norm_type=2): 158 | if isinstance(parameters, torch.Tensor): 159 | parameters = [parameters] 160 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 161 | norm_type = float(norm_type) 162 | if clip_value is not None: 163 | clip_value = float(clip_value) 164 | 165 | total_norm = 0 166 | for p in parameters: 167 | param_norm = p.grad.data.norm(norm_type) 168 | total_norm += param_norm.item() ** norm_type 169 | if clip_value is not None: 170 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 171 | total_norm = total_norm ** (1.0 / norm_type) 172 | return total_norm 173 | -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pyworld 3 | 4 | from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor 5 | 6 | 7 | class DioF0Predictor(F0Predictor): 8 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): 9 | self.hop_length = hop_length 10 | self.f0_min = f0_min 11 | self.f0_max = f0_max 12 | self.sampling_rate = sampling_rate 13 | 14 | def interpolate_f0(self, f0): 15 | """ 16 | 对F0进行插值处理 17 | """ 18 | 19 | data = np.reshape(f0, (f0.size, 1)) 20 | 21 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32) 22 | vuv_vector[data > 0.0] = 1.0 23 | vuv_vector[data <= 0.0] = 0.0 24 | 25 | ip_data = data 26 | 27 | frame_number = data.size 28 | last_value = 0.0 29 | for i in range(frame_number): 30 | if data[i] <= 0.0: 31 | j = i + 1 32 | for j in range(i + 1, frame_number): 33 | if data[j] > 0.0: 34 | break 35 | if j < frame_number - 1: 36 | if last_value > 0.0: 37 | step = (data[j] - data[i - 1]) / float(j - i) 38 | for k in range(i, j): 39 | ip_data[k] = data[i - 1] + step * (k - i + 1) 40 | else: 41 | for k in range(i, j): 42 | ip_data[k] = data[j] 43 | else: 44 | for k in range(i, frame_number): 45 | ip_data[k] = last_value 46 | else: 47 | ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝 48 | last_value = data[i] 49 | 50 | return ip_data[:, 0], vuv_vector[:, 0] 51 | 52 | def resize_f0(self, x, target_len): 53 | source = np.array(x) 54 | source[source < 0.001] = np.nan 55 | target = np.interp( 56 | np.arange(0, len(source) * target_len, len(source)) / target_len, 57 | np.arange(0, len(source)), 58 | source, 59 | ) 60 | res = np.nan_to_num(target) 61 | return res 62 | 63 | def compute_f0(self, wav, p_len=None): 64 | if p_len is None: 65 | p_len = wav.shape[0] // self.hop_length 66 | f0, t = pyworld.dio( 67 | wav.astype(np.double), 68 | fs=self.sampling_rate, 69 | f0_floor=self.f0_min, 70 | f0_ceil=self.f0_max, 71 | frame_period=1000 * self.hop_length / self.sampling_rate, 
72 | ) 73 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 74 | for index, pitch in enumerate(f0): 75 | f0[index] = round(pitch, 1) 76 | return self.interpolate_f0(self.resize_f0(f0, p_len))[0] 77 | 78 | def compute_f0_uv(self, wav, p_len=None): 79 | if p_len is None: 80 | p_len = wav.shape[0] // self.hop_length 81 | f0, t = pyworld.dio( 82 | wav.astype(np.double), 83 | fs=self.sampling_rate, 84 | f0_floor=self.f0_min, 85 | f0_ceil=self.f0_max, 86 | frame_period=1000 * self.hop_length / self.sampling_rate, 87 | ) 88 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 89 | for index, pitch in enumerate(f0): 90 | f0[index] = round(pitch, 1) 91 | return self.interpolate_f0(self.resize_f0(f0, p_len)) 92 | -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/modules/F0Predictor/F0Predictor.py: -------------------------------------------------------------------------------- 1 | class F0Predictor(object): 2 | def compute_f0(self, wav, p_len): 3 | """ 4 | input: wav:[signal_length] 5 | p_len:int 6 | output: f0:[signal_length//hop_length] 7 | """ 8 | pass 9 | 10 | def compute_f0_uv(self, wav, p_len): 11 | """ 12 | input: wav:[signal_length] 13 | p_len:int 14 | output: f0:[signal_length//hop_length],uv:[signal_length//hop_length] 15 | """ 16 | pass 17 | -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pyworld 3 | 4 | from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor 5 | 6 | 7 | class HarvestF0Predictor(F0Predictor): 8 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): 9 | self.hop_length = hop_length 10 | self.f0_min = f0_min 11 | self.f0_max = f0_max 12 | self.sampling_rate = sampling_rate 13 | 14 | def interpolate_f0(self, f0): 15 | """ 16 | 对F0进行插值处理 17 | """ 18 | 19 | data = np.reshape(f0, (f0.size, 1)) 20 | 21 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32) 22 | vuv_vector[data > 0.0] = 1.0 23 | vuv_vector[data <= 0.0] = 0.0 24 | 25 | ip_data = data 26 | 27 | frame_number = data.size 28 | last_value = 0.0 29 | for i in range(frame_number): 30 | if data[i] <= 0.0: 31 | j = i + 1 32 | for j in range(i + 1, frame_number): 33 | if data[j] > 0.0: 34 | break 35 | if j < frame_number - 1: 36 | if last_value > 0.0: 37 | step = (data[j] - data[i - 1]) / float(j - i) 38 | for k in range(i, j): 39 | ip_data[k] = data[i - 1] + step * (k - i + 1) 40 | else: 41 | for k in range(i, j): 42 | ip_data[k] = data[j] 43 | else: 44 | for k in range(i, frame_number): 45 | ip_data[k] = last_value 46 | else: 47 | ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝 48 | last_value = data[i] 49 | 50 | return ip_data[:, 0], vuv_vector[:, 0] 51 | 52 | def resize_f0(self, x, target_len): 53 | source = np.array(x) 54 | source[source < 0.001] = np.nan 55 | target = np.interp( 56 | np.arange(0, len(source) * target_len, len(source)) / target_len, 57 | np.arange(0, len(source)), 58 | source, 59 | ) 60 | res = np.nan_to_num(target) 61 | return res 62 | 63 | def compute_f0(self, wav, p_len=None): 64 | if p_len is None: 65 | p_len = wav.shape[0] // self.hop_length 66 | f0, t = pyworld.harvest( 67 | wav.astype(np.double), 68 | fs=self.sampling_rate, 69 | f0_ceil=self.f0_max, 70 | f0_floor=self.f0_min, 71 | frame_period=1000 * self.hop_length / self.sampling_rate, 
72 | ) 73 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 74 | return self.interpolate_f0(self.resize_f0(f0, p_len))[0] 75 | 76 | def compute_f0_uv(self, wav, p_len=None): 77 | if p_len is None: 78 | p_len = wav.shape[0] // self.hop_length 79 | f0, t = pyworld.harvest( 80 | wav.astype(np.double), 81 | fs=self.sampling_rate, 82 | f0_floor=self.f0_min, 83 | f0_ceil=self.f0_max, 84 | frame_period=1000 * self.hop_length / self.sampling_rate, 85 | ) 86 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 87 | return self.interpolate_f0(self.resize_f0(f0, p_len)) 88 | -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import parselmouth 3 | 4 | from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor 5 | 6 | 7 | class PMF0Predictor(F0Predictor): 8 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): 9 | self.hop_length = hop_length 10 | self.f0_min = f0_min 11 | self.f0_max = f0_max 12 | self.sampling_rate = sampling_rate 13 | 14 | def interpolate_f0(self, f0): 15 | """ 16 | Interpolate F0 over unvoiced frames. 17 | """ 18 | 19 | data = np.reshape(f0, (f0.size, 1)) 20 | 21 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32) 22 | vuv_vector[data > 0.0] = 1.0 23 | vuv_vector[data <= 0.0] = 0.0 24 | 25 | ip_data = data 26 | 27 | frame_number = data.size 28 | last_value = 0.0 29 | for i in range(frame_number): 30 | if data[i] <= 0.0: 31 | j = i + 1 32 | for j in range(i + 1, frame_number): 33 | if data[j] > 0.0: 34 | break 35 | if j < frame_number - 1: 36 | if last_value > 0.0: 37 | step = (data[j] - data[i - 1]) / float(j - i) 38 | for k in range(i, j): 39 | ip_data[k] = data[i - 1] + step * (k - i + 1) 40 | else: 41 | for k in range(i, j): 42 | ip_data[k] = data[j] 43 | else: 44 | for k in range(i, frame_number): 45 | ip_data[k] = last_value 46 | else: 47 | ip_data[i] = data[i] # this copy may be unnecessary 48 | last_value = data[i] 49 | 50 | return ip_data[:, 0], vuv_vector[:, 0] 51 | 52 | def compute_f0(self, wav, p_len=None): 53 | x = wav 54 | if p_len is None: 55 | p_len = x.shape[0] // self.hop_length 56 | else: 57 | assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" 58 | time_step = self.hop_length / self.sampling_rate * 1000 59 | f0 = ( 60 | parselmouth.Sound(x, self.sampling_rate) 61 | .to_pitch_ac( 62 | time_step=time_step / 1000, 63 | voicing_threshold=0.6, 64 | pitch_floor=self.f0_min, 65 | pitch_ceiling=self.f0_max, 66 | ) 67 | .selected_array["frequency"] 68 | ) 69 | 70 | pad_size = (p_len - len(f0) + 1) // 2 71 | if pad_size > 0 or p_len - len(f0) - pad_size > 0: 72 | f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") 73 | f0, uv = self.interpolate_f0(f0) 74 | return f0 75 | 76 | def compute_f0_uv(self, wav, p_len=None): 77 | x = wav 78 | if p_len is None: 79 | p_len = x.shape[0] // self.hop_length 80 | else: 81 | assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" 82 | time_step = self.hop_length / self.sampling_rate * 1000 83 | f0 = ( 84 | parselmouth.Sound(x, self.sampling_rate) 85 | .to_pitch_ac( 86 | time_step=time_step / 1000, 87 | voicing_threshold=0.6, 88 | pitch_floor=self.f0_min, 89 | pitch_ceiling=self.f0_max, 90 | ) 91 | .selected_array["frequency"] 92 | ) 93 | 94 | pad_size = (p_len - len(f0) + 1) // 2 95 | if pad_size > 0 or p_len - len(f0) -
pad_size > 0: 96 | f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") 97 | f0, uv = self.interpolate_f0(f0) 98 | return f0, uv 99 | -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/modules/F0Predictor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/infer_pack/modules/F0Predictor/__init__.py -------------------------------------------------------------------------------- /rvc/infer/lib/infer_pack/onnx_inference.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | import onnxruntime 4 | import soundfile 5 | 6 | import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class ContentVec: 12 | def __init__(self, vec_path="pretrained/vec-768-layer-12.onnx", device=None): 13 | logger.info("Load model(s) from {}".format(vec_path)) 14 | if device == "cpu" or device is None: 15 | providers = ["CPUExecutionProvider"] 16 | elif device == "cuda": 17 | providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] 18 | elif device == "dml": 19 | providers = ["DmlExecutionProvider"] 20 | else: 21 | raise RuntimeError("Unsportted Device") 22 | self.model = onnxruntime.InferenceSession(vec_path, providers=providers) 23 | 24 | def __call__(self, wav): 25 | return self.forward(wav) 26 | 27 | def forward(self, wav): 28 | feats = wav 29 | if feats.ndim == 2: # double channels 30 | feats = feats.mean(-1) 31 | assert feats.ndim == 1, feats.ndim 32 | feats = np.expand_dims(np.expand_dims(feats, 0), 0) 33 | onnx_input = {self.model.get_inputs()[0].name: feats} 34 | logits = self.model.run(None, onnx_input)[0] 35 | return logits.transpose(0, 2, 1) 36 | 37 | 38 | def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs): 39 | if f0_predictor == "pm": 40 | from lib.infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor 41 | 42 | f0_predictor_object = PMF0Predictor( 43 | hop_length=hop_length, sampling_rate=sampling_rate 44 | ) 45 | elif f0_predictor == "harvest": 46 | from lib.infer_pack.modules.F0Predictor.HarvestF0Predictor import ( 47 | HarvestF0Predictor, 48 | ) 49 | 50 | f0_predictor_object = HarvestF0Predictor( 51 | hop_length=hop_length, sampling_rate=sampling_rate 52 | ) 53 | elif f0_predictor == "dio": 54 | from lib.infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor 55 | 56 | f0_predictor_object = DioF0Predictor( 57 | hop_length=hop_length, sampling_rate=sampling_rate 58 | ) 59 | else: 60 | raise Exception("Unknown f0 predictor") 61 | return f0_predictor_object 62 | 63 | 64 | class OnnxRVC: 65 | def __init__( 66 | self, 67 | model_path, 68 | sr=40000, 69 | hop_size=512, 70 | vec_path="vec-768-layer-12", 71 | device="cpu", 72 | ): 73 | vec_path = f"pretrained/{vec_path}.onnx" 74 | self.vec_model = ContentVec(vec_path, device) 75 | if device == "cpu" or device is None: 76 | providers = ["CPUExecutionProvider"] 77 | elif device == "cuda": 78 | providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] 79 | elif device == "dml": 80 | providers = ["DmlExecutionProvider"] 81 | else: 82 | raise RuntimeError("Unsportted Device") 83 | self.model = onnxruntime.InferenceSession(model_path, providers=providers) 84 | self.sampling_rate = sr 85 | self.hop_size = hop_size 86 | 87 | def forward(self, hubert, hubert_length, pitch, pitchf, ds, rnd): 88 | 
onnx_input = { 89 | self.model.get_inputs()[0].name: hubert, 90 | self.model.get_inputs()[1].name: hubert_length, 91 | self.model.get_inputs()[2].name: pitch, 92 | self.model.get_inputs()[3].name: pitchf, 93 | self.model.get_inputs()[4].name: ds, 94 | self.model.get_inputs()[5].name: rnd, 95 | } 96 | return (self.model.run(None, onnx_input)[0] * 32767).astype(np.int16) 97 | 98 | def inference( 99 | self, 100 | raw_path, 101 | sid, 102 | f0_method="dio", 103 | f0_up_key=0, 104 | pad_time=0.5, 105 | cr_threshold=0.02, 106 | ): 107 | f0_min = 50 108 | f0_max = 1100 109 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 110 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 111 | f0_predictor = get_f0_predictor( 112 | f0_method, 113 | hop_length=self.hop_size, 114 | sampling_rate=self.sampling_rate, 115 | threshold=cr_threshold, 116 | ) 117 | wav, sr = librosa.load(raw_path, sr=self.sampling_rate) 118 | org_length = len(wav) 119 | if org_length / sr > 50.0: 120 | raise RuntimeError("Reached Max Length") 121 | 122 | wav16k = librosa.resample(wav, orig_sr=self.sampling_rate, target_sr=16000) 123 | wav16k = wav16k 124 | 125 | hubert = self.vec_model(wav16k) 126 | hubert = np.repeat(hubert, 2, axis=2).transpose(0, 2, 1).astype(np.float32) 127 | hubert_length = hubert.shape[1] 128 | 129 | pitchf = f0_predictor.compute_f0(wav, hubert_length) 130 | pitchf = pitchf * 2 ** (f0_up_key / 12) 131 | pitch = pitchf.copy() 132 | f0_mel = 1127 * np.log(1 + pitch / 700) 133 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( 134 | f0_mel_max - f0_mel_min 135 | ) + 1 136 | f0_mel[f0_mel <= 1] = 1 137 | f0_mel[f0_mel > 255] = 255 138 | pitch = np.rint(f0_mel).astype(np.int64) 139 | 140 | pitchf = pitchf.reshape(1, len(pitchf)).astype(np.float32) 141 | pitch = pitch.reshape(1, len(pitch)) 142 | ds = np.array([sid]).astype(np.int64) 143 | 144 | rnd = np.random.randn(1, 192, hubert_length).astype(np.float32) 145 | hubert_length = np.array([hubert_length]).astype(np.int64) 146 | 147 | out_wav = self.forward(hubert, hubert_length, pitch, pitchf, ds, rnd).squeeze() 148 | out_wav = np.pad(out_wav, (0, 2 * self.hop_size), "constant") 149 | return out_wav[0:org_length] 150 | -------------------------------------------------------------------------------- /rvc/infer/lib/jit/__init__.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | import pickle 3 | import time 4 | import torch 5 | from tqdm import tqdm 6 | from collections import OrderedDict 7 | 8 | 9 | def load_inputs(path, device, is_half=False): 10 | parm = torch.load(path, map_location=torch.device("cpu")) 11 | for key in parm.keys(): 12 | parm[key] = parm[key].to(device) 13 | if is_half and parm[key].dtype == torch.float32: 14 | parm[key] = parm[key].half() 15 | elif not is_half and parm[key].dtype == torch.float16: 16 | parm[key] = parm[key].float() 17 | return parm 18 | 19 | 20 | def benchmark( 21 | model, inputs_path, device=torch.device("cpu"), epoch=1000, is_half=False 22 | ): 23 | parm = load_inputs(inputs_path, device, is_half) 24 | total_ts = 0.0 25 | bar = tqdm(range(epoch)) 26 | for i in bar: 27 | start_time = time.perf_counter() 28 | o = model(**parm) 29 | total_ts += time.perf_counter() - start_time 30 | print(f"num_epoch: {epoch} | avg time(ms): {(total_ts*1000)/epoch}") 31 | 32 | 33 | def jit_warm_up(model, inputs_path, device=torch.device("cpu"), epoch=5, is_half=False): 34 | benchmark(model, inputs_path, device, epoch=epoch, is_half=is_half) 35 | 36 | 37 | def to_jit_model( 38 | 
model_path, 39 | model_type: str, 40 | mode: str = "trace", 41 | inputs_path: str = None, 42 | device=torch.device("cpu"), 43 | is_half=False, 44 | ): 45 | model = None 46 | if model_type.lower() == "synthesizer": 47 | from .get_synthesizer import get_synthesizer 48 | 49 | model, _ = get_synthesizer(model_path, device) 50 | model.forward = model.infer 51 | elif model_type.lower() == "rmvpe": 52 | from .get_rmvpe import get_rmvpe 53 | 54 | model = get_rmvpe(model_path, device) 55 | elif model_type.lower() == "hubert": 56 | from .get_hubert import get_hubert_model 57 | 58 | model = get_hubert_model(model_path, device) 59 | model.forward = model.infer 60 | else: 61 | raise ValueError(f"No model type named {model_type}") 62 | model = model.eval() 63 | model = model.half() if is_half else model.float() 64 | if mode == "trace": 65 | assert not inputs_path 66 | inputs = load_inputs(inputs_path, device, is_half) 67 | model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs) 68 | elif mode == "script": 69 | model_jit = torch.jit.script(model) 70 | model_jit.to(device) 71 | model_jit = model_jit.half() if is_half else model_jit.float() 72 | # model = model.half() if is_half else model.float() 73 | return (model, model_jit) 74 | 75 | 76 | def export( 77 | model: torch.nn.Module, 78 | mode: str = "trace", 79 | inputs: dict = None, 80 | device=torch.device("cpu"), 81 | is_half: bool = False, 82 | ) -> dict: 83 | model = model.half() if is_half else model.float() 84 | model.eval() 85 | if mode == "trace": 86 | assert inputs is not None 87 | model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs) 88 | elif mode == "script": 89 | model_jit = torch.jit.script(model) 90 | model_jit.to(device) 91 | model_jit = model_jit.half() if is_half else model_jit.float() 92 | buffer = BytesIO() 93 | # model_jit=model_jit.cpu() 94 | torch.jit.save(model_jit, buffer) 95 | del model_jit 96 | cpt = OrderedDict() 97 | cpt["model"] = buffer.getvalue() 98 | cpt["is_half"] = is_half 99 | return cpt 100 | 101 | 102 | def load(path: str): 103 | with open(path, "rb") as f: 104 | return pickle.load(f) 105 | 106 | 107 | def save(ckpt: dict, save_path: str): 108 | with open(save_path, "wb") as f: 109 | pickle.dump(ckpt, f) 110 | 111 | 112 | def rmvpe_jit_export( 113 | model_path: str, 114 | mode: str = "script", 115 | inputs_path: str = None, 116 | save_path: str = None, 117 | device=torch.device("cpu"), 118 | is_half=False, 119 | ): 120 | if not save_path: 121 | save_path = model_path.rstrip(".pth") 122 | save_path += ".half.jit" if is_half else ".jit" 123 | if "cuda" in str(device) and ":" not in str(device): 124 | device = torch.device("cuda:0") 125 | from .get_rmvpe import get_rmvpe 126 | 127 | model = get_rmvpe(model_path, device) 128 | inputs = None 129 | if mode == "trace": 130 | inputs = load_inputs(inputs_path, device, is_half) 131 | ckpt = export(model, mode, inputs, device, is_half) 132 | ckpt["device"] = str(device) 133 | save(ckpt, save_path) 134 | return ckpt 135 | 136 | 137 | def synthesizer_jit_export( 138 | model_path: str, 139 | mode: str = "script", 140 | inputs_path: str = None, 141 | save_path: str = None, 142 | device=torch.device("cpu"), 143 | is_half=False, 144 | ): 145 | if not save_path: 146 | save_path = model_path.rstrip(".pth") 147 | save_path += ".half.jit" if is_half else ".jit" 148 | if "cuda" in str(device) and ":" not in str(device): 149 | device = torch.device("cuda:0") 150 | from .get_synthesizer import get_synthesizer 151 | 152 | model, cpt = get_synthesizer(model_path, device) 153 
| assert isinstance(cpt, dict) 154 | model.forward = model.infer 155 | inputs = None 156 | if mode == "trace": 157 | inputs = load_inputs(inputs_path, device, is_half) 158 | ckpt = export(model, mode, inputs, device, is_half) 159 | cpt.pop("weight") 160 | cpt["model"] = ckpt["model"] 161 | cpt["device"] = device 162 | save(cpt, save_path) 163 | return cpt 164 | -------------------------------------------------------------------------------- /rvc/infer/lib/jit/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/jit/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/jit/get_rmvpe.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_rmvpe(model_path="assets/rmvpe/rmvpe.pt", device=torch.device("cpu")): 5 | from infer.lib.rmvpe import E2E 6 | 7 | model = E2E(4, 1, (2, 2)) 8 | ckpt = torch.load(model_path, map_location=device) 9 | model.load_state_dict(ckpt) 10 | model.eval() 11 | model = model.to(device) 12 | return model 13 | -------------------------------------------------------------------------------- /rvc/infer/lib/jit/get_synthesizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_synthesizer(pth_path, device=torch.device("cpu")): 5 | from infer.lib.infer_pack.models import ( 6 | SynthesizerTrnMs256NSFsid, 7 | SynthesizerTrnMs256NSFsid_nono, 8 | SynthesizerTrnMs768NSFsid, 9 | SynthesizerTrnMs768NSFsid_nono, 10 | ) 11 | 12 | cpt = torch.load(pth_path, map_location=torch.device("cpu")) 13 | # tgt_sr = cpt["config"][-1] 14 | cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] 15 | if_f0 = cpt.get("f0", 1) 16 | version = cpt.get("version", "v1") 17 | if version == "v1": 18 | if if_f0 == 1: 19 | net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=False) 20 | else: 21 | net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) 22 | elif version == "v2": 23 | if if_f0 == 1: 24 | net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=False) 25 | else: 26 | net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) 27 | del net_g.enc_q 28 | # net_g.forward = net_g.infer 29 | # ckpt = {} 30 | # ckpt["config"] = cpt["config"] 31 | # ckpt["f0"] = if_f0 32 | # ckpt["version"] = version 33 | # ckpt["info"] = cpt.get("info", "0epoch") 34 | net_g.load_state_dict(cpt["weight"], strict=False) 35 | net_g = net_g.float() 36 | net_g.eval().to(device) 37 | net_g.remove_weight_norm() 38 | return net_g, cpt 39 | -------------------------------------------------------------------------------- /rvc/infer/lib/train/__pycache__/data_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/train/__pycache__/data_utils.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/train/__pycache__/losses.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/train/__pycache__/losses.cpython-310.pyc 
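As a quick orientation to the TorchScript helpers in `rvc/infer/lib/jit/` above (`export`, `load`, `save`, `synthesizer_jit_export`, together with `get_synthesizer`): they wrap a checkpoint into a pickled dict whose `"model"` entry holds the serialized TorchScript bytes. The sketch below is illustrative only — the checkpoint path and the `rvc.` import prefix are assumptions (it presumes the directory containing `rvc` is on `sys.path`), `"script"` mode may or may not script a given synthesizer cleanly, and `"trace"` mode would additionally need an `inputs_path`.

```python
# Hedged usage sketch for the jit export helpers above; the paths and the
# import prefix are assumptions, not something this repo ships or documents.
from io import BytesIO

import torch

from rvc.infer.lib.jit import load, synthesizer_jit_export  # assumes repo parent dir on sys.path

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Export a synthesizer .pth to TorchScript; "script" mode needs no example inputs,
# "trace" mode would also require inputs_path pointing at saved example tensors.
cpt = synthesizer_jit_export(
    model_path="assets/weights/my_voice.pth",  # hypothetical checkpoint path
    mode="script",
    device=device,
    is_half=False,
)

# The helper pickles a dict whose "model" field is the TorchScript buffer;
# load() unpickles it and torch.jit.load() rebuilds the scripted module.
ckpt = load("assets/weights/my_voice.jit")
scripted = torch.jit.load(BytesIO(ckpt["model"]), map_location=device)
scripted.eval()
```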
-------------------------------------------------------------------------------- /rvc/infer/lib/train/__pycache__/mel_processing.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/train/__pycache__/mel_processing.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/train/__pycache__/process_ckpt.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/train/__pycache__/process_ckpt.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/train/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/lib/train/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/lib/train/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def feature_loss(fmap_r, fmap_g): 5 | loss = 0 6 | for dr, dg in zip(fmap_r, fmap_g): 7 | for rl, gl in zip(dr, dg): 8 | rl = rl.float().detach() 9 | gl = gl.float() 10 | loss += torch.mean(torch.abs(rl - gl)) 11 | 12 | return loss * 2 13 | 14 | 15 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 16 | loss = 0 17 | r_losses = [] 18 | g_losses = [] 19 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 20 | dr = dr.float() 21 | dg = dg.float() 22 | r_loss = torch.mean((1 - dr) ** 2) 23 | g_loss = torch.mean(dg**2) 24 | loss += r_loss + g_loss 25 | r_losses.append(r_loss.item()) 26 | g_losses.append(g_loss.item()) 27 | 28 | return loss, r_losses, g_losses 29 | 30 | 31 | def generator_loss(disc_outputs): 32 | loss = 0 33 | gen_losses = [] 34 | for dg in disc_outputs: 35 | dg = dg.float() 36 | l = torch.mean((1 - dg) ** 2) 37 | gen_losses.append(l) 38 | loss += l 39 | 40 | return loss, gen_losses 41 | 42 | 43 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 44 | """ 45 | z_p, logs_q: [b, h, t_t] 46 | m_p, logs_p: [b, h, t_t] 47 | """ 48 | z_p = z_p.float() 49 | logs_q = logs_q.float() 50 | m_p = m_p.float() 51 | logs_p = logs_p.float() 52 | z_mask = z_mask.float() 53 | 54 | kl = logs_p - logs_q - 0.5 55 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 56 | kl = torch.sum(kl * z_mask) 57 | l = kl / torch.sum(z_mask) 58 | return l 59 | -------------------------------------------------------------------------------- /rvc/infer/lib/train/mel_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data 3 | from librosa.filters import mel as librosa_mel_fn 4 | import logging 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | MAX_WAV_VALUE = 32768.0 9 | 10 | 11 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 12 | """ 13 | PARAMS 14 | ------ 15 | C: compression factor 16 | """ 17 | return torch.log(torch.clamp(x, min=clip_val) * C) 18 | 19 | 20 | def dynamic_range_decompression_torch(x, C=1): 21 | """ 22 | PARAMS 23 | ------ 24 | C: compression factor used to compress 25 | """ 26 | return torch.exp(x) / C 27 | 28 | 29 | def 
spectral_normalize_torch(magnitudes): 30 | return dynamic_range_compression_torch(magnitudes) 31 | 32 | 33 | def spectral_de_normalize_torch(magnitudes): 34 | return dynamic_range_decompression_torch(magnitudes) 35 | 36 | 37 | # Reusable banks 38 | mel_basis = {} 39 | hann_window = {} 40 | 41 | 42 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 43 | """Convert waveform into Linear-frequency Linear-amplitude spectrogram. 44 | 45 | Args: 46 | y :: (B, T) - Audio waveforms 47 | n_fft 48 | sampling_rate 49 | hop_size 50 | win_size 51 | center 52 | Returns: 53 | :: (B, Freq, Frame) - Linear-frequency Linear-amplitude spectrogram 54 | """ 55 | 56 | # Window - Cache if needed 57 | global hann_window 58 | dtype_device = str(y.dtype) + "_" + str(y.device) 59 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 60 | if wnsize_dtype_device not in hann_window: 61 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 62 | dtype=y.dtype, device=y.device 63 | ) 64 | 65 | # Padding 66 | y = torch.nn.functional.pad( 67 | y.unsqueeze(1), 68 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 69 | mode="reflect", 70 | ) 71 | y = y.squeeze(1) 72 | 73 | # Complex Spectrogram :: (B, T) -> (B, Freq, Frame, RealComplex=2) 74 | spec = torch.stft( 75 | y, 76 | n_fft, 77 | hop_length=hop_size, 78 | win_length=win_size, 79 | window=hann_window[wnsize_dtype_device], 80 | center=center, 81 | pad_mode="reflect", 82 | normalized=False, 83 | onesided=True, 84 | return_complex=True, 85 | ) 86 | 87 | # Linear-frequency Linear-amplitude spectrogram :: (B, Freq, Frame, RealComplex=2) -> (B, Freq, Frame) 88 | spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6) 89 | return spec 90 | 91 | 92 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 93 | # MelBasis - Cache if needed 94 | global mel_basis 95 | dtype_device = str(spec.dtype) + "_" + str(spec.device) 96 | fmax_dtype_device = str(fmax) + "_" + dtype_device 97 | if fmax_dtype_device not in mel_basis: 98 | mel = librosa_mel_fn( 99 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax 100 | ) 101 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 102 | dtype=spec.dtype, device=spec.device 103 | ) 104 | 105 | # Mel-frequency Log-amplitude spectrogram :: (B, Freq=num_mels, Frame) 106 | melspec = torch.matmul(mel_basis[fmax_dtype_device], spec) 107 | melspec = spectral_normalize_torch(melspec) 108 | return melspec 109 | 110 | 111 | def mel_spectrogram_torch( 112 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False 113 | ): 114 | """Convert waveform into Mel-frequency Log-amplitude spectrogram. 
115 | 116 | Args: 117 | y :: (B, T) - Waveforms 118 | Returns: 119 | melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram 120 | """ 121 | # Linear-frequency Linear-amplitude spectrogram :: (B, T) -> (B, Freq, Frame) 122 | spec = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center) 123 | 124 | # Mel-frequency Log-amplitude spectrogram :: (B, Freq, Frame) -> (B, Freq=num_mels, Frame) 125 | melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax) 126 | 127 | return melspec 128 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import numpy as np 5 | import torch 6 | import torch.utils.data 7 | from tqdm import tqdm 8 | 9 | from . import spec_utils 10 | 11 | 12 | class VocalRemoverValidationSet(torch.utils.data.Dataset): 13 | def __init__(self, patch_list): 14 | self.patch_list = patch_list 15 | 16 | def __len__(self): 17 | return len(self.patch_list) 18 | 19 | def __getitem__(self, idx): 20 | path = self.patch_list[idx] 21 | data = np.load(path) 22 | 23 | X, y = data["X"], data["y"] 24 | 25 | X_mag = np.abs(X) 26 | y_mag = np.abs(y) 27 | 28 | return X_mag, y_mag 29 | 30 | 31 | def make_pair(mix_dir, inst_dir): 32 | input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"] 33 | 34 | X_list = sorted( 35 | [ 36 | os.path.join(mix_dir, fname) 37 | for fname in os.listdir(mix_dir) 38 | if os.path.splitext(fname)[1] in input_exts 39 | ] 40 | ) 41 | y_list = sorted( 42 | [ 43 | os.path.join(inst_dir, fname) 44 | for fname in os.listdir(inst_dir) 45 | if os.path.splitext(fname)[1] in input_exts 46 | ] 47 | ) 48 | 49 | filelist = list(zip(X_list, y_list)) 50 | 51 | return filelist 52 | 53 | 54 | def train_val_split(dataset_dir, split_mode, val_rate, val_filelist): 55 | if split_mode == "random": 56 | filelist = make_pair( 57 | os.path.join(dataset_dir, "mixtures"), 58 | os.path.join(dataset_dir, "instruments"), 59 | ) 60 | 61 | random.shuffle(filelist) 62 | 63 | if len(val_filelist) == 0: 64 | val_size = int(len(filelist) * val_rate) 65 | train_filelist = filelist[:-val_size] 66 | val_filelist = filelist[-val_size:] 67 | else: 68 | train_filelist = [ 69 | pair for pair in filelist if list(pair) not in val_filelist 70 | ] 71 | elif split_mode == "subdirs": 72 | if len(val_filelist) != 0: 73 | raise ValueError( 74 | "The `val_filelist` option is not available in `subdirs` mode" 75 | ) 76 | 77 | train_filelist = make_pair( 78 | os.path.join(dataset_dir, "training/mixtures"), 79 | os.path.join(dataset_dir, "training/instruments"), 80 | ) 81 | 82 | val_filelist = make_pair( 83 | os.path.join(dataset_dir, "validation/mixtures"), 84 | os.path.join(dataset_dir, "validation/instruments"), 85 | ) 86 | 87 | return train_filelist, val_filelist 88 | 89 | 90 | def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha): 91 | perm = np.random.permutation(len(X)) 92 | for i, idx in enumerate(tqdm(perm)): 93 | if np.random.uniform() < reduction_rate: 94 | y[idx] = spec_utils.reduce_vocal_aggressively( 95 | X[idx], y[idx], reduction_mask 96 | ) 97 | 98 | if np.random.uniform() < 0.5: 99 | # swap channel 100 | X[idx] = X[idx, ::-1] 101 | y[idx] = y[idx, ::-1] 102 | if np.random.uniform() < 0.02: 103 | # mono 104 | X[idx] = X[idx].mean(axis=0, keepdims=True) 105 | y[idx] = y[idx].mean(axis=0, keepdims=True) 106 | if np.random.uniform() < 0.02: 107 | # inst 108 
| X[idx] = y[idx] 109 | 110 | if np.random.uniform() < mixup_rate and i < len(perm) - 1: 111 | lam = np.random.beta(mixup_alpha, mixup_alpha) 112 | X[idx] = lam * X[idx] + (1 - lam) * X[perm[i + 1]] 113 | y[idx] = lam * y[idx] + (1 - lam) * y[perm[i + 1]] 114 | 115 | return X, y 116 | 117 | 118 | def make_padding(width, cropsize, offset): 119 | left = offset 120 | roi_size = cropsize - left * 2 121 | if roi_size == 0: 122 | roi_size = cropsize 123 | right = roi_size - (width % roi_size) + left 124 | 125 | return left, right, roi_size 126 | 127 | 128 | def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset): 129 | len_dataset = patches * len(filelist) 130 | 131 | X_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64) 132 | y_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64) 133 | 134 | for i, (X_path, y_path) in enumerate(tqdm(filelist)): 135 | X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft) 136 | coef = np.max([np.abs(X).max(), np.abs(y).max()]) 137 | X, y = X / coef, y / coef 138 | 139 | l, r, roi_size = make_padding(X.shape[2], cropsize, offset) 140 | X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant") 141 | y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant") 142 | 143 | starts = np.random.randint(0, X_pad.shape[2] - cropsize, patches) 144 | ends = starts + cropsize 145 | for j in range(patches): 146 | idx = i * patches + j 147 | X_dataset[idx] = X_pad[:, :, starts[j] : ends[j]] 148 | y_dataset[idx] = y_pad[:, :, starts[j] : ends[j]] 149 | 150 | return X_dataset, y_dataset 151 | 152 | 153 | def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset): 154 | patch_list = [] 155 | patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format( 156 | cropsize, sr, hop_length, n_fft, offset 157 | ) 158 | os.makedirs(patch_dir, exist_ok=True) 159 | 160 | for i, (X_path, y_path) in enumerate(tqdm(filelist)): 161 | basename = os.path.splitext(os.path.basename(X_path))[0] 162 | 163 | X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft) 164 | coef = np.max([np.abs(X).max(), np.abs(y).max()]) 165 | X, y = X / coef, y / coef 166 | 167 | l, r, roi_size = make_padding(X.shape[2], cropsize, offset) 168 | X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant") 169 | y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant") 170 | 171 | len_dataset = int(np.ceil(X.shape[2] / roi_size)) 172 | for j in range(len_dataset): 173 | outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j)) 174 | start = j * roi_size 175 | if not os.path.exists(outpath): 176 | np.savez( 177 | outpath, 178 | X=X_pad[:, :, start : start + cropsize], 179 | y=y_pad[:, :, start : start + cropsize], 180 | ) 181 | patch_list.append(outpath) 182 | 183 | return VocalRemoverValidationSet(patch_list) 184 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . 
import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py: -------------------------------------------------------------------------------- 1 | import torch 2 | import 
torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | 
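The `layers*.py` variants above all define the same encoder/decoder/ASPP building blocks used by the UVR5 vocal-remover `nets*.py` models; the variants differ mainly in their ASPP dilation setup. As a minimal, purely illustrative shape walk-through (channel counts and input size are arbitrary, not the values the bundled nets use, and the import prefix assumes the repo's parent directory is on `sys.path`):

```python
# Minimal shape check for the blocks defined in layers.py; illustrative only --
# the channel counts below are arbitrary, not those used by nets.py.
import torch

from rvc.infer.lib.uvr5_pack.lib_v5 import layers  # import prefix is an assumption

enc = layers.Encoder(2, 16, ksize=3, stride=2, pad=1)   # halves freq/time resolution
aspp = layers.ASPPModule(16, 16)                        # multi-dilation context, same size
dec = layers.Decoder(16 + 16, 2, ksize=3, pad=1)        # upsample 2x, concat skip

x = torch.randn(1, 2, 64, 128)   # (batch, channels, freq bins, frames)
h, skip = enc(x)                 # h: (1, 16, 32, 64); skip keeps the input resolution
h = aspp(h)                      # (1, 16, 32, 64)
y = dec(h, skip)                 # (1, 2, 64, 128) after upsampling and skip concat
print(h.shape, y.shape)
```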
-------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.conv6 = SeperableConv2DBNActiv( 104 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 105 | ) 106 | self.conv7 = SeperableConv2DBNActiv( 107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 108 | ) 109 | self.bottleneck = nn.Sequential( 110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 111 | ) 112 | 113 | def forward(self, x): 114 | _, _, h, w = x.size() 115 | feat1 = F.interpolate( 116 | self.conv1(x), 
size=(h, w), mode="bilinear", align_corners=True 117 | ) 118 | feat2 = self.conv2(x) 119 | feat3 = self.conv3(x) 120 | feat4 = self.conv4(x) 121 | feat5 = self.conv5(x) 122 | feat6 = self.conv6(x) 123 | feat7 = self.conv7(x) 124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) 125 | bottle = self.bottleneck(out) 126 | return bottle 127 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.conv6 = SeperableConv2DBNActiv( 104 | nin, nin, 3, 1, dilations[2], dilations[2], 
activ=activ 105 | ) 106 | self.conv7 = SeperableConv2DBNActiv( 107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 108 | ) 109 | self.bottleneck = nn.Sequential( 110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 111 | ) 112 | 113 | def forward(self, x): 114 | _, _, h, w = x.size() 115 | feat1 = F.interpolate( 116 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 117 | ) 118 | feat2 = self.conv2(x) 119 | feat3 = self.conv3(x) 120 | feat4 = self.conv4(x) 121 | feat5 = self.conv5(x) 122 | feat6 = self.conv6(x) 123 | feat7 = self.conv7(x) 124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) 125 | bottle = self.bottleneck(out) 126 | return bottle 127 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 
3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.conv6 = SeperableConv2DBNActiv( 104 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 105 | ) 106 | self.conv7 = SeperableConv2DBNActiv( 107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 108 | ) 109 | self.bottleneck = nn.Sequential( 110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 111 | ) 112 | 113 | def forward(self, x): 114 | _, _, h, w = x.size() 115 | feat1 = F.interpolate( 116 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 117 | ) 118 | feat2 = self.conv2(x) 119 | feat3 = self.conv3(x) 120 | feat4 = self.conv4(x) 121 | feat5 = self.conv5(x) 122 | feat6 = self.conv6(x) 123 | feat7 = self.conv7(x) 124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) 125 | bottle = self.bottleneck(out) 126 | return bottle 127 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/layers_new.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class Encoder(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 31 | super(Encoder, self).__init__() 32 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ) 33 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) 34 | 35 | def __call__(self, x): 36 | h = self.conv1(x) 37 | h = self.conv2(h) 38 | 39 | return h 40 | 41 | 42 | class Decoder(nn.Module): 43 | def __init__( 44 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 45 | ): 46 | super(Decoder, self).__init__() 47 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 48 | # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) 49 | self.dropout = nn.Dropout2d(0.1) if dropout else None 50 | 51 | def __call__(self, x, skip=None): 52 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 53 | 54 | if skip is not None: 55 | skip = spec_utils.crop_center(skip, x) 56 | x = torch.cat([x, skip], dim=1) 57 | 58 | h = self.conv1(x) 59 | # h = self.conv2(h) 60 | 61 | if self.dropout is not None: 62 | h = self.dropout(h) 63 | 64 | return h 65 | 66 | 67 | class ASPPModule(nn.Module): 68 | def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False): 69 | super(ASPPModule, self).__init__() 70 | self.conv1 = nn.Sequential( 71 | nn.AdaptiveAvgPool2d((1, None)), 72 | Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ), 73 | ) 74 | self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ) 75 | self.conv3 = Conv2DBNActiv( 76 | nin, nout, 3, 1, dilations[0], dilations[0], activ=activ 77 | ) 78 | self.conv4 = Conv2DBNActiv( 79 
| nin, nout, 3, 1, dilations[1], dilations[1], activ=activ 80 | ) 81 | self.conv5 = Conv2DBNActiv( 82 | nin, nout, 3, 1, dilations[2], dilations[2], activ=activ 83 | ) 84 | self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ) 85 | self.dropout = nn.Dropout2d(0.1) if dropout else None 86 | 87 | def forward(self, x): 88 | _, _, h, w = x.size() 89 | feat1 = F.interpolate( 90 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 91 | ) 92 | feat2 = self.conv2(x) 93 | feat3 = self.conv3(x) 94 | feat4 = self.conv4(x) 95 | feat5 = self.conv5(x) 96 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 97 | out = self.bottleneck(out) 98 | 99 | if self.dropout is not None: 100 | out = self.dropout(out) 101 | 102 | return out 103 | 104 | 105 | class LSTMModule(nn.Module): 106 | def __init__(self, nin_conv, nin_lstm, nout_lstm): 107 | super(LSTMModule, self).__init__() 108 | self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0) 109 | self.lstm = nn.LSTM( 110 | input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True 111 | ) 112 | self.dense = nn.Sequential( 113 | nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU() 114 | ) 115 | 116 | def forward(self, x): 117 | N, _, nbins, nframes = x.size() 118 | h = self.conv(x)[:, 0] # N, nbins, nframes 119 | h = h.permute(2, 0, 1) # nframes, N, nbins 120 | h, _ = self.lstm(h) 121 | h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins 122 | h = h.reshape(nframes, N, 1, nbins) 123 | h = h.permute(1, 2, 3, 0) 124 | 125 | return h 126 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/model_param_init.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pathlib 4 | 5 | default_param = {} 6 | default_param["bins"] = 768 7 | default_param["unstable_bins"] = 9 # training only 8 | default_param["reduction_bins"] = 762 # training only 9 | default_param["sr"] = 44100 10 | default_param["pre_filter_start"] = 757 11 | default_param["pre_filter_stop"] = 768 12 | default_param["band"] = {} 13 | 14 | 15 | default_param["band"][1] = { 16 | "sr": 11025, 17 | "hl": 128, 18 | "n_fft": 960, 19 | "crop_start": 0, 20 | "crop_stop": 245, 21 | "lpf_start": 61, # inference only 22 | "res_type": "polyphase", 23 | } 24 | 25 | default_param["band"][2] = { 26 | "sr": 44100, 27 | "hl": 512, 28 | "n_fft": 1536, 29 | "crop_start": 24, 30 | "crop_stop": 547, 31 | "hpf_start": 81, # inference only 32 | "res_type": "sinc_best", 33 | } 34 | 35 | 36 | def int_keys(d): 37 | r = {} 38 | for k, v in d: 39 | if k.isdigit(): 40 | k = int(k) 41 | r[k] = v 42 | return r 43 | 44 | 45 | class ModelParameters(object): 46 | def __init__(self, config_path=""): 47 | if ".pth" == pathlib.Path(config_path).suffix: 48 | import zipfile 49 | 50 | with zipfile.ZipFile(config_path, "r") as zip: 51 | self.param = json.loads( 52 | zip.read("param.json"), object_pairs_hook=int_keys 53 | ) 54 | elif ".json" == pathlib.Path(config_path).suffix: 55 | with open(config_path, "r") as f: 56 | self.param = json.loads(f.read(), object_pairs_hook=int_keys) 57 | else: 58 | self.param = default_param 59 | 60 | for k in [ 61 | "mid_side", 62 | "mid_side_b", 63 | "mid_side_b2", 64 | "stereo_w", 65 | "stereo_n", 66 | "reverse", 67 | ]: 68 | if not k in self.param: 69 | self.param[k] = False 70 | -------------------------------------------------------------------------------- 
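Each JSON under `modelparams/` below describes the band-split STFT layout one UVR5 checkpoint expects, and the `ModelParameters` class above is their loader (band keys become integers via `int_keys`, and the mid/side/reverse flags default to `False` when absent). A small, purely illustrative read of one of the configs listed below, assuming the path is resolved from the repository root:

```python
# Illustrative only: the path assumes you run from the repository root and that
# the rvc package resolves with this prefix; adjust both for your setup.
from rvc.infer.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters

mp = ModelParameters("rvc/infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json")

print(mp.param["sr"])      # 32000: overall sample rate of this band layout
print(mp.param["bins"])    # 768: frequency bins of the combined spectrogram
for band, cfg in sorted(mp.param["band"].items()):  # keys are ints thanks to int_keys()
    print(band, cfg["sr"], cfg["hl"], cfg["n_fft"], cfg["res_type"])
```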
/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 16000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 16000, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 32000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "kaiser_fast" 14 | } 15 | }, 16 | "sr": 32000, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 33075, 8 | "hl": 384, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 33075, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 1024, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 256, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 256, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 256, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 256, 18 | "pre_filter_stop": 256 19 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 700, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 700 19 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 118, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 32000, 18 | "hl": 352, 19 | "n_fft": 1024, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 44, 23 | "hpf_stop": 23, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 32000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } 31 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 512, 3 | "unstable_bins": 7, 4 | "reduction_bins": 510, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 160, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 192, 12 | "lpf_start": 41, 13 | "lpf_stop": 139, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 44100, 18 | "hl": 640, 19 | "n_fft": 1024, 20 | "crop_start": 10, 21 | "crop_stop": 320, 22 | "hpf_start": 47, 23 | "hpf_stop": 15, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 44100, 28 | "pre_filter_start": 510, 29 | "pre_filter_stop": 512 30 | } 31 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 240, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 48000, 18 | "hl": 528, 19 | "n_fft": 1536, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 82, 23 | "hpf_stop": 22, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 48000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 5, 4 | "reduction_bins": 733, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 278, 12 | "lpf_start": 28, 13 | "lpf_stop": 140, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 22050, 18 | "hl": 256, 19 | "n_fft": 768, 20 | "crop_start": 14, 21 | "crop_stop": 322, 22 | "hpf_start": 70, 23 | "hpf_stop": 14, 24 | "lpf_start": 283, 25 | "lpf_stop": 314, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 
44100, 30 | "hl": 512, 31 | "n_fft": 768, 32 | "crop_start": 131, 33 | "crop_stop": 313, 34 | "hpf_start": 154, 35 | "hpf_stop": 141, 36 | "res_type": "sinc_medium" 37 | } 38 | }, 39 | "sr": 44100, 40 | "pre_filter_start": 757, 41 | "pre_filter_stop": 768 42 | } 43 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side": true, 3 | "bins": 768, 4 | "unstable_bins": 5, 5 | "reduction_bins": 733, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 768, 11 | "crop_start": 0, 12 | "crop_stop": 278, 13 | "lpf_start": 28, 14 | "lpf_stop": 140, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 256, 20 | "n_fft": 768, 21 | "crop_start": 14, 22 | "crop_stop": 322, 23 | "hpf_start": 70, 24 | "hpf_stop": 14, 25 | "lpf_start": 283, 26 | "lpf_stop": 314, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 512, 32 | "n_fft": 768, 33 | "crop_start": 131, 34 | "crop_stop": 313, 35 | "hpf_start": 154, 36 | "hpf_stop": 141, 37 | "res_type": "sinc_medium" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 757, 42 | "pre_filter_stop": 768 43 | } 44 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 640, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 187, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 768, 21 | "crop_start": 0, 22 | "crop_stop": 212, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 174, 26 | "lpf_stop": 209, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 640, 33 | "crop_start": 66, 34 | "crop_stop": 307, 35 | "hpf_start": 86, 36 | "hpf_stop": 72, 37 | "res_type": "kaiser_fast" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 639, 42 | "pre_filter_stop": 640 43 | } 44 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 668, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 1024, 10 | "crop_start": 0, 11 | "crop_stop": 186, 12 | "lpf_start": 37, 13 | "lpf_stop": 73, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 11025, 18 | "hl": 128, 19 | "n_fft": 512, 20 | "crop_start": 4, 21 | "crop_stop": 185, 22 | "hpf_start": 36, 23 | "hpf_stop": 18, 24 | "lpf_start": 93, 25 | "lpf_stop": 185, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 22050, 30 | "hl": 256, 31 | "n_fft": 512, 32 | "crop_start": 46, 33 | "crop_stop": 186, 34 | "hpf_start": 93, 35 | "hpf_stop": 46, 36 | "lpf_start": 164, 37 | "lpf_stop": 186, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 512, 43 | "n_fft": 768, 44 | "crop_start": 121, 45 | "crop_stop": 382, 46 | "hpf_start": 138, 47 | "hpf_stop": 123, 48 | "res_type": "sinc_medium" 49 | } 50 | }, 51 | 
"sr": 44100, 52 | "pre_filter_start": 740, 53 | "pre_filter_stop": 768 54 | } 55 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "mid_side": true, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } 56 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | 
"hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json: -------------------------------------------------------------------------------- 1 | { 2 | "reverse": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json: -------------------------------------------------------------------------------- 1 | { 2 | "stereo_w": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | 
"hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "convert_channels": "stereo_n", 49 | "res_type": "kaiser_fast" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 668, 54 | "pre_filter_stop": 672 55 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 530, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 1280, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 
2048, 11 | "crop_start": 0, 12 | "crop_stop": 374, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 1536, 21 | "crop_start": 0, 22 | "crop_stop": 424, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 348, 26 | "lpf_stop": 418, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 1280, 33 | "crop_start": 132, 34 | "crop_stop": 614, 35 | "hpf_start": 172, 36 | "hpf_stop": 144, 37 | "res_type": "polyphase" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 1280, 42 | "pre_filter_stop": 1280 43 | } -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/nets.py: -------------------------------------------------------------------------------- 1 | import layers 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | from . import spec_utils 7 | 8 | 9 | class BaseASPPNet(nn.Module): 10 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 11 | super(BaseASPPNet, self).__init__() 12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 16 | 17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 18 | 19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 23 | 24 | def __call__(self, x): 25 | h, e1 = self.enc1(x) 26 | h, e2 = self.enc2(h) 27 | h, e3 = self.enc3(h) 28 | h, e4 = self.enc4(h) 29 | 30 | h = self.aspp(h) 31 | 32 | h = self.dec4(h, e4) 33 | h = self.dec3(h, e3) 34 | h = self.dec2(h, e2) 35 | h = self.dec1(h, e1) 36 | 37 | return h 38 | 39 | 40 | class CascadedASPPNet(nn.Module): 41 | def __init__(self, n_fft): 42 | super(CascadedASPPNet, self).__init__() 43 | self.stg1_low_band_net = BaseASPPNet(2, 16) 44 | self.stg1_high_band_net = BaseASPPNet(2, 16) 45 | 46 | self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) 47 | self.stg2_full_band_net = BaseASPPNet(8, 16) 48 | 49 | self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 50 | self.stg3_full_band_net = BaseASPPNet(16, 32) 51 | 52 | self.out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) 54 | self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) 55 | 56 | self.max_bin = n_fft // 2 57 | self.output_bin = n_fft // 2 + 1 58 | 59 | self.offset = 128 60 | 61 | def forward(self, x, aggressiveness=None): 62 | mix = x.detach() 63 | x = x.clone() 64 | 65 | x = x[:, :, : self.max_bin] 66 | 67 | bandw = x.size()[2] // 2 68 | aux1 = torch.cat( 69 | [ 70 | self.stg1_low_band_net(x[:, :, :bandw]), 71 | self.stg1_high_band_net(x[:, :, bandw:]), 72 | ], 73 | dim=2, 74 | ) 75 | 76 | h = torch.cat([x, aux1], dim=1) 77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 78 | 79 | h = torch.cat([x, aux1, aux2], dim=1) 80 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 81 | 82 | mask = torch.sigmoid(self.out(h)) 83 | mask = F.pad( 84 | input=mask, 85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 86 | mode="replicate", 87 | ) 88 | 89 | if self.training: 90 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 91 | aux1 = F.pad( 92 | input=aux1, 93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 94 | mode="replicate", 95 | ) 
96 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 97 | aux2 = F.pad( 98 | input=aux2, 99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 100 | mode="replicate", 101 | ) 102 | return mask * mix, aux1 * mix, aux2 * mix 103 | else: 104 | if aggressiveness: 105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 106 | mask[:, :, : aggressiveness["split_bin"]], 107 | 1 + aggressiveness["value"] / 3, 108 | ) 109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 110 | mask[:, :, aggressiveness["split_bin"] :], 111 | 1 + aggressiveness["value"], 112 | ) 113 | 114 | return mask * mix 115 | 116 | def predict(self, x_mag, aggressiveness=None): 117 | h = self.forward(x_mag, aggressiveness) 118 | 119 | if self.offset > 0: 120 | h = h[:, :, :, self.offset : -self.offset] 121 | assert h.size()[3] > 0 122 | 123 | return h 124 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_123821KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 32) 43 | self.stg1_high_band_net = BaseASPPNet(2, 32) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(16, 32) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(32, 64) 50 | 51 | self.out = nn.Conv2d(64, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 
| if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_123821KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 32) 43 | self.stg1_high_band_net = BaseASPPNet(2, 32) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(16, 32) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(32, 64) 50 | 51 | self.out = nn.Conv2d(64, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = 
self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_33966KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16, 32)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 16) 43 | self.stg1_high_band_net = BaseASPPNet(2, 16) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(8, 16) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(16, 32) 50 | 51 | self.out = nn.Conv2d(32, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, 
bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | from . import layers_537238KB as layers 7 | 8 | 9 | class BaseASPPNet(nn.Module): 10 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 11 | super(BaseASPPNet, self).__init__() 12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 16 | 17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 18 | 19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 23 | 24 | def __call__(self, x): 25 | h, e1 = self.enc1(x) 26 | h, e2 = self.enc2(h) 27 | h, e3 = self.enc3(h) 28 | h, e4 = self.enc4(h) 29 | 30 | h = self.aspp(h) 31 | 32 | h = self.dec4(h, e4) 33 | h = self.dec3(h, e3) 34 | h = self.dec2(h, e2) 35 | h = self.dec1(h, e1) 36 | 37 | return h 38 | 39 | 40 | class CascadedASPPNet(nn.Module): 41 | def __init__(self, n_fft): 42 | super(CascadedASPPNet, self).__init__() 43 | self.stg1_low_band_net = BaseASPPNet(2, 64) 44 | self.stg1_high_band_net = BaseASPPNet(2, 64) 45 | 46 | self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 47 | self.stg2_full_band_net = BaseASPPNet(32, 64) 48 | 49 | self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) 50 | self.stg3_full_band_net = BaseASPPNet(64, 128) 51 | 52 | self.out = nn.Conv2d(128, 2, 1, bias=False) 53 | self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) 54 | self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) 55 | 56 | self.max_bin = n_fft // 2 57 | self.output_bin = n_fft // 2 + 1 58 | 59 | self.offset = 128 60 | 61 | def forward(self, x, aggressiveness=None): 62 | mix = 
x.detach() 63 | x = x.clone() 64 | 65 | x = x[:, :, : self.max_bin] 66 | 67 | bandw = x.size()[2] // 2 68 | aux1 = torch.cat( 69 | [ 70 | self.stg1_low_band_net(x[:, :, :bandw]), 71 | self.stg1_high_band_net(x[:, :, bandw:]), 72 | ], 73 | dim=2, 74 | ) 75 | 76 | h = torch.cat([x, aux1], dim=1) 77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 78 | 79 | h = torch.cat([x, aux1, aux2], dim=1) 80 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 81 | 82 | mask = torch.sigmoid(self.out(h)) 83 | mask = F.pad( 84 | input=mask, 85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 86 | mode="replicate", 87 | ) 88 | 89 | if self.training: 90 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 91 | aux1 = F.pad( 92 | input=aux1, 93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 94 | mode="replicate", 95 | ) 96 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 97 | aux2 = F.pad( 98 | input=aux2, 99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 100 | mode="replicate", 101 | ) 102 | return mask * mix, aux1 * mix, aux2 * mix 103 | else: 104 | if aggressiveness: 105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 106 | mask[:, :, : aggressiveness["split_bin"]], 107 | 1 + aggressiveness["value"] / 3, 108 | ) 109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 110 | mask[:, :, aggressiveness["split_bin"] :], 111 | 1 + aggressiveness["value"], 112 | ) 113 | 114 | return mask * mix 115 | 116 | def predict(self, x_mag, aggressiveness=None): 117 | h = self.forward(x_mag, aggressiveness) 118 | 119 | if self.offset > 0: 120 | h = h[:, :, :, self.offset : -self.offset] 121 | assert h.size()[3] > 0 122 | 123 | return h 124 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | from . 
import layers_537238KB as layers 7 | 8 | 9 | class BaseASPPNet(nn.Module): 10 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 11 | super(BaseASPPNet, self).__init__() 12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 16 | 17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 18 | 19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 23 | 24 | def __call__(self, x): 25 | h, e1 = self.enc1(x) 26 | h, e2 = self.enc2(h) 27 | h, e3 = self.enc3(h) 28 | h, e4 = self.enc4(h) 29 | 30 | h = self.aspp(h) 31 | 32 | h = self.dec4(h, e4) 33 | h = self.dec3(h, e3) 34 | h = self.dec2(h, e2) 35 | h = self.dec1(h, e1) 36 | 37 | return h 38 | 39 | 40 | class CascadedASPPNet(nn.Module): 41 | def __init__(self, n_fft): 42 | super(CascadedASPPNet, self).__init__() 43 | self.stg1_low_band_net = BaseASPPNet(2, 64) 44 | self.stg1_high_band_net = BaseASPPNet(2, 64) 45 | 46 | self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 47 | self.stg2_full_band_net = BaseASPPNet(32, 64) 48 | 49 | self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) 50 | self.stg3_full_band_net = BaseASPPNet(64, 128) 51 | 52 | self.out = nn.Conv2d(128, 2, 1, bias=False) 53 | self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) 54 | self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) 55 | 56 | self.max_bin = n_fft // 2 57 | self.output_bin = n_fft // 2 + 1 58 | 59 | self.offset = 128 60 | 61 | def forward(self, x, aggressiveness=None): 62 | mix = x.detach() 63 | x = x.clone() 64 | 65 | x = x[:, :, : self.max_bin] 66 | 67 | bandw = x.size()[2] // 2 68 | aux1 = torch.cat( 69 | [ 70 | self.stg1_low_band_net(x[:, :, :bandw]), 71 | self.stg1_high_band_net(x[:, :, bandw:]), 72 | ], 73 | dim=2, 74 | ) 75 | 76 | h = torch.cat([x, aux1], dim=1) 77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 78 | 79 | h = torch.cat([x, aux1, aux2], dim=1) 80 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 81 | 82 | mask = torch.sigmoid(self.out(h)) 83 | mask = F.pad( 84 | input=mask, 85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 86 | mode="replicate", 87 | ) 88 | 89 | if self.training: 90 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 91 | aux1 = F.pad( 92 | input=aux1, 93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 94 | mode="replicate", 95 | ) 96 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 97 | aux2 = F.pad( 98 | input=aux2, 99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 100 | mode="replicate", 101 | ) 102 | return mask * mix, aux1 * mix, aux2 * mix 103 | else: 104 | if aggressiveness: 105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 106 | mask[:, :, : aggressiveness["split_bin"]], 107 | 1 + aggressiveness["value"] / 3, 108 | ) 109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 110 | mask[:, :, aggressiveness["split_bin"] :], 111 | 1 + aggressiveness["value"], 112 | ) 113 | 114 | return mask * mix 115 | 116 | def predict(self, x_mag, aggressiveness=None): 117 | h = self.forward(x_mag, aggressiveness) 118 | 119 | if self.offset > 0: 120 | h = h[:, :, :, self.offset : -self.offset] 121 | assert h.size()[3] > 0 122 | 123 | return h 124 | -------------------------------------------------------------------------------- 
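Editor's note: the `nets*.py` cascaded models dumped above all share the same inference-time post-processing driven by the optional `aggressiveness` dict (`{"split_bin": ..., "value": ...}`) inside `forward()`, followed by the time-axis `offset` crop in `predict()`. The sketch below is an illustrative, self-contained restatement of that masking step only; it is not part of the repository, and the tensor shape and the example `split_bin`/`value` numbers are assumptions chosen purely for demonstration.

```python
# Illustrative sketch only -- not a file from this repository.
# It mirrors the aggressiveness post-processing that CascadedASPPNet.forward()
# applies to the sigmoid mask at inference time (see the nets_*.py dumps above).
import torch


def apply_aggressiveness(mask: torch.Tensor, split_bin: int, value: float) -> torch.Tensor:
    """Sharpen a (batch, channels, freq_bins, frames) mask the way the cascaded
    nets do: a gentler exponent below split_bin, a stronger one above it."""
    mask = mask.clone()
    mask[:, :, :split_bin] = torch.pow(mask[:, :, :split_bin], 1 + value / 3)
    mask[:, :, split_bin:] = torch.pow(mask[:, :, split_bin:], 1 + value)
    return mask


if __name__ == "__main__":
    dummy_mask = torch.rand(1, 2, 1025, 512)   # assumed spectrogram-mask shape
    sharpened = apply_aggressiveness(dummy_mask, split_bin=256, value=0.3)
    print(sharpened.shape)                     # torch.Size([1, 2, 1025, 512])
```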
/rvc/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_123821KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 32) 43 | self.stg1_high_band_net = BaseASPPNet(2, 32) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(16, 32) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(32, 64) 50 | 51 | self.out = nn.Conv2d(64, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 
119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/lib_v5/nets_new.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_new 6 | 7 | 8 | class BaseNet(nn.Module): 9 | def __init__( 10 | self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6)) 11 | ): 12 | super(BaseNet, self).__init__() 13 | self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1) 14 | self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1) 15 | self.enc3 = layers_new.Encoder(nout * 2, nout * 4, 3, 2, 1) 16 | self.enc4 = layers_new.Encoder(nout * 4, nout * 6, 3, 2, 1) 17 | self.enc5 = layers_new.Encoder(nout * 6, nout * 8, 3, 2, 1) 18 | 19 | self.aspp = layers_new.ASPPModule(nout * 8, nout * 8, dilations, dropout=True) 20 | 21 | self.dec4 = layers_new.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1) 22 | self.dec3 = layers_new.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1) 23 | self.dec2 = layers_new.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1) 24 | self.lstm_dec2 = layers_new.LSTMModule(nout * 2, nin_lstm, nout_lstm) 25 | self.dec1 = layers_new.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1) 26 | 27 | def __call__(self, x): 28 | e1 = self.enc1(x) 29 | e2 = self.enc2(e1) 30 | e3 = self.enc3(e2) 31 | e4 = self.enc4(e3) 32 | e5 = self.enc5(e4) 33 | 34 | h = self.aspp(e5) 35 | 36 | h = self.dec4(h, e4) 37 | h = self.dec3(h, e3) 38 | h = self.dec2(h, e2) 39 | h = torch.cat([h, self.lstm_dec2(h)], dim=1) 40 | h = self.dec1(h, e1) 41 | 42 | return h 43 | 44 | 45 | class CascadedNet(nn.Module): 46 | def __init__(self, n_fft, nout=32, nout_lstm=128): 47 | super(CascadedNet, self).__init__() 48 | 49 | self.max_bin = n_fft // 2 50 | self.output_bin = n_fft // 2 + 1 51 | self.nin_lstm = self.max_bin // 2 52 | self.offset = 64 53 | 54 | self.stg1_low_band_net = nn.Sequential( 55 | BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm), 56 | layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0), 57 | ) 58 | 59 | self.stg1_high_band_net = BaseNet( 60 | 2, nout // 4, self.nin_lstm // 2, nout_lstm // 2 61 | ) 62 | 63 | self.stg2_low_band_net = nn.Sequential( 64 | BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm), 65 | layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0), 66 | ) 67 | self.stg2_high_band_net = BaseNet( 68 | nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2 69 | ) 70 | 71 | self.stg3_full_band_net = BaseNet( 72 | 3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm 73 | ) 74 | 75 | self.out = nn.Conv2d(nout, 2, 1, bias=False) 76 | self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False) 77 | 78 | def forward(self, x): 79 | x = x[:, :, : self.max_bin] 80 | 81 | bandw = x.size()[2] // 2 82 | l1_in = x[:, :, :bandw] 83 | h1_in = x[:, :, bandw:] 84 | l1 = self.stg1_low_band_net(l1_in) 85 | h1 = self.stg1_high_band_net(h1_in) 86 | aux1 = torch.cat([l1, h1], dim=2) 87 | 88 | l2_in = torch.cat([l1_in, l1], dim=1) 89 | h2_in = torch.cat([h1_in, h1], dim=1) 90 | l2 = self.stg2_low_band_net(l2_in) 91 | h2 = self.stg2_high_band_net(h2_in) 92 | aux2 = torch.cat([l2, h2], dim=2) 93 | 94 | f3_in = torch.cat([x, aux1, aux2], dim=1) 95 | f3 = self.stg3_full_band_net(f3_in) 96 | 97 | mask = torch.sigmoid(self.out(f3)) 98 | mask = F.pad( 99 | input=mask, 100 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 101 | 
mode="replicate", 102 | ) 103 | 104 | if self.training: 105 | aux = torch.cat([aux1, aux2], dim=1) 106 | aux = torch.sigmoid(self.aux_out(aux)) 107 | aux = F.pad( 108 | input=aux, 109 | pad=(0, 0, 0, self.output_bin - aux.size()[2]), 110 | mode="replicate", 111 | ) 112 | return mask, aux 113 | else: 114 | return mask 115 | 116 | def predict_mask(self, x): 117 | mask = self.forward(x) 118 | 119 | if self.offset > 0: 120 | mask = mask[:, :, :, self.offset : -self.offset] 121 | assert mask.size()[3] > 0 122 | 123 | return mask 124 | 125 | def predict(self, x, aggressiveness=None): 126 | mask = self.forward(x) 127 | pred_mag = x * mask 128 | 129 | if self.offset > 0: 130 | pred_mag = pred_mag[:, :, :, self.offset : -self.offset] 131 | assert pred_mag.size()[3] > 0 132 | 133 | return pred_mag 134 | -------------------------------------------------------------------------------- /rvc/infer/lib/uvr5_pack/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import torch 5 | from tqdm import tqdm 6 | 7 | 8 | def load_data(file_name: str = "./infer/lib/uvr5_pack/name_params.json") -> dict: 9 | with open(file_name, "r") as f: 10 | data = json.load(f) 11 | 12 | return data 13 | 14 | 15 | def make_padding(width, cropsize, offset): 16 | left = offset 17 | roi_size = cropsize - left * 2 18 | if roi_size == 0: 19 | roi_size = cropsize 20 | right = roi_size - (width % roi_size) + left 21 | 22 | return left, right, roi_size 23 | 24 | 25 | def inference(X_spec, device, model, aggressiveness, data): 26 | """ 27 | data : dic configs 28 | """ 29 | 30 | def _execute( 31 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True 32 | ): 33 | model.eval() 34 | with torch.no_grad(): 35 | preds = [] 36 | 37 | iterations = [n_window] 38 | 39 | total_iterations = sum(iterations) 40 | for i in tqdm(range(n_window)): 41 | start = i * roi_size 42 | X_mag_window = X_mag_pad[ 43 | None, :, :, start : start + data["window_size"] 44 | ] 45 | X_mag_window = torch.from_numpy(X_mag_window) 46 | if is_half: 47 | X_mag_window = X_mag_window.half() 48 | X_mag_window = X_mag_window.to(device) 49 | 50 | pred = model.predict(X_mag_window, aggressiveness) 51 | 52 | pred = pred.detach().cpu().numpy() 53 | preds.append(pred[0]) 54 | 55 | pred = np.concatenate(preds, axis=2) 56 | return pred 57 | 58 | def preprocess(X_spec): 59 | X_mag = np.abs(X_spec) 60 | X_phase = np.angle(X_spec) 61 | 62 | return X_mag, X_phase 63 | 64 | X_mag, X_phase = preprocess(X_spec) 65 | 66 | coef = X_mag.max() 67 | X_mag_pre = X_mag / coef 68 | 69 | n_frame = X_mag_pre.shape[2] 70 | pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset) 71 | n_window = int(np.ceil(n_frame / roi_size)) 72 | 73 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 74 | 75 | if list(model.state_dict().values())[0].dtype == torch.float16: 76 | is_half = True 77 | else: 78 | is_half = False 79 | pred = _execute( 80 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half 81 | ) 82 | pred = pred[:, :, :n_frame] 83 | 84 | if data["tta"]: 85 | pad_l += roi_size // 2 86 | pad_r += roi_size // 2 87 | n_window += 1 88 | 89 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 90 | 91 | pred_tta = _execute( 92 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half 93 | ) 94 | pred_tta = pred_tta[:, :, roi_size // 2 :] 95 | pred_tta = pred_tta[:, :, :n_frame] 96 | 97 | 
return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase) 98 | else: 99 | return pred * coef, X_mag, np.exp(1.0j * X_phase) 100 | 101 | 102 | def _get_name_params(model_path, model_hash): 103 | data = load_data() 104 | flag = False 105 | ModelName = model_path 106 | for type in list(data): 107 | for model in list(data[type][0]): 108 | for i in range(len(data[type][0][model])): 109 | if str(data[type][0][model][i]["hash_name"]) == model_hash: 110 | flag = True 111 | elif str(data[type][0][model][i]["hash_name"]) in ModelName: 112 | flag = True 113 | 114 | if flag: 115 | model_params_auto = data[type][0][model][i]["model_params"] 116 | param_name_auto = data[type][0][model][i]["param_name"] 117 | if type == "equivalent": 118 | return param_name_auto, model_params_auto 119 | else: 120 | flag = False 121 | return param_name_auto, model_params_auto 122 | -------------------------------------------------------------------------------- /rvc/infer/modules/gui/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | TorchGating is a PyTorch-based implementation of Spectral Gating 3 | ================================================ 4 | Author: Asaf Zorea 5 | 6 | Contents 7 | -------- 8 | torchgate imports all the functions from PyTorch, and in addition provides: 9 | TorchGating --- A PyTorch module that applies a spectral gate to an input signal 10 | 11 | """ 12 | 13 | from .torchgate import TorchGate 14 | -------------------------------------------------------------------------------- /rvc/infer/modules/gui/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.types import Number 3 | 4 | 5 | @torch.no_grad() 6 | def amp_to_db( 7 | x: torch.Tensor, eps=torch.finfo(torch.float64).eps, top_db=40 8 | ) -> torch.Tensor: 9 | """ 10 | Convert the input tensor from amplitude to decibel scale. 11 | 12 | Arguments: 13 | x {[torch.Tensor]} -- [Input tensor.] 14 | 15 | Keyword Arguments: 16 | eps {[float]} -- [Small value to avoid numerical instability.] 17 | (default: {torch.finfo(torch.float64).eps}) 18 | top_db {[float]} -- [threshold the output at ``top_db`` below the peak] 19 | ` (default: {40}) 20 | 21 | Returns: 22 | [torch.Tensor] -- [Output tensor in decibel scale.] 23 | """ 24 | x_db = 20 * torch.log10(x.abs() + eps) 25 | return torch.max(x_db, (x_db.max(-1).values - top_db).unsqueeze(-1)) 26 | 27 | 28 | @torch.no_grad() 29 | def temperature_sigmoid(x: torch.Tensor, x0: float, temp_coeff: float) -> torch.Tensor: 30 | """ 31 | Apply a sigmoid function with temperature scaling. 32 | 33 | Arguments: 34 | x {[torch.Tensor]} -- [Input tensor.] 35 | x0 {[float]} -- [Parameter that controls the threshold of the sigmoid.] 36 | temp_coeff {[float]} -- [Parameter that controls the slope of the sigmoid.] 37 | 38 | Returns: 39 | [torch.Tensor] -- [Output tensor after applying the sigmoid with temperature scaling.] 40 | """ 41 | return torch.sigmoid((x - x0) / temp_coeff) 42 | 43 | 44 | @torch.no_grad() 45 | def linspace( 46 | start: Number, stop: Number, num: int = 50, endpoint: bool = True, **kwargs 47 | ) -> torch.Tensor: 48 | """ 49 | Generate a linearly spaced 1-D tensor. 50 | 51 | Arguments: 52 | start {[Number]} -- [The starting value of the sequence.] 53 | stop {[Number]} -- [The end value of the sequence, unless `endpoint` is set to False. 54 | In that case, the sequence consists of all but the last of ``num + 1`` 55 | evenly spaced samples, so that `stop` is excluded. 
Note that the step 56 | size changes when `endpoint` is False.] 57 | 58 | Keyword Arguments: 59 | num {[int]} -- [Number of samples to generate. Default is 50. Must be non-negative.] 60 | endpoint {[bool]} -- [If True, `stop` is the last sample. Otherwise, it is not included. 61 | Default is True.] 62 | **kwargs -- [Additional arguments to be passed to the underlying PyTorch `linspace` function.] 63 | 64 | Returns: 65 | [torch.Tensor] -- [1-D tensor of `num` equally spaced samples from `start` to `stop`.] 66 | """ 67 | if endpoint: 68 | return torch.linspace(start, stop, num, **kwargs) 69 | else: 70 | return torch.linspace(start, stop, num + 1, **kwargs)[:-1] 71 | -------------------------------------------------------------------------------- /rvc/infer/modules/onnx/export.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM 4 | 5 | 6 | def export_onnx(ModelPath, ExportedPath): 7 | cpt = torch.load(ModelPath, map_location="cpu") 8 | cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] 9 | vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768 10 | 11 | test_phone = torch.rand(1, 200, vec_channels) # hidden unit 12 | test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用) 13 | test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹) 14 | test_pitchf = torch.rand(1, 200) # nsf基频 15 | test_ds = torch.LongTensor([0]) # 说话人ID 16 | test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子) 17 | 18 | device = "cpu" # 导出时设备(不影响使用模型) 19 | 20 | net_g = SynthesizerTrnMsNSFsidM( 21 | *cpt["config"], is_half=False, encoder_dim=vec_channels 22 | ) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16) 23 | net_g.load_state_dict(cpt["weight"], strict=False) 24 | input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"] 25 | output_names = [ 26 | "audio", 27 | ] 28 | # net_g.construct_spkmixmap() #多角色混合轨道导出 29 | torch.onnx.export( 30 | net_g, 31 | ( 32 | test_phone.to(device), 33 | test_phone_lengths.to(device), 34 | test_pitch.to(device), 35 | test_pitchf.to(device), 36 | test_ds.to(device), 37 | test_rnd.to(device), 38 | ), 39 | ExportedPath, 40 | dynamic_axes={ 41 | "phone": [1], 42 | "pitch": [1], 43 | "pitchf": [1], 44 | "rnd": [2], 45 | }, 46 | do_constant_folding=False, 47 | opset_version=17, 48 | verbose=False, 49 | input_names=input_names, 50 | output_names=output_names, 51 | ) 52 | return "Finished" 53 | -------------------------------------------------------------------------------- /rvc/infer/modules/train/extract/extract_f0_print.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | 5 | import parselmouth 6 | 7 | now_dir = os.getcwd() 8 | sys.path.append(now_dir) 9 | import logging 10 | 11 | import numpy as np 12 | import pyworld 13 | 14 | from infer.lib.audio import load_audio 15 | 16 | logging.getLogger("numba").setLevel(logging.WARNING) 17 | from multiprocessing import Process 18 | 19 | exp_dir = sys.argv[1] 20 | f = open("%s/extract_f0_feature.log" % exp_dir, "a+") 21 | 22 | 23 | def printt(strr): 24 | print(strr) 25 | f.write("%s\n" % strr) 26 | f.flush() 27 | 28 | 29 | n_p = int(sys.argv[2]) 30 | f0method = sys.argv[3] 31 | 32 | 33 | class FeatureInput(object): 34 | def __init__(self, samplerate=16000, hop_size=160): 35 | self.fs = samplerate 36 | self.hop = hop_size 37 | 38 | self.f0_bin = 256 39 | self.f0_max = 1100.0 40 | self.f0_min = 50.0 
41 | self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) 42 | self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) 43 | 44 | def compute_f0(self, path, f0_method): 45 | x = load_audio(path, self.fs) 46 | p_len = x.shape[0] // self.hop 47 | if f0_method == "pm": 48 | time_step = 160 / 16000 * 1000 49 | f0_min = 50 50 | f0_max = 1100 51 | f0 = ( 52 | parselmouth.Sound(x, self.fs) 53 | .to_pitch_ac( 54 | time_step=time_step / 1000, 55 | voicing_threshold=0.6, 56 | pitch_floor=f0_min, 57 | pitch_ceiling=f0_max, 58 | ) 59 | .selected_array["frequency"] 60 | ) 61 | pad_size = (p_len - len(f0) + 1) // 2 62 | if pad_size > 0 or p_len - len(f0) - pad_size > 0: 63 | f0 = np.pad( 64 | f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" 65 | ) 66 | elif f0_method == "harvest": 67 | f0, t = pyworld.harvest( 68 | x.astype(np.double), 69 | fs=self.fs, 70 | f0_ceil=self.f0_max, 71 | f0_floor=self.f0_min, 72 | frame_period=1000 * self.hop / self.fs, 73 | ) 74 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs) 75 | elif f0_method == "dio": 76 | f0, t = pyworld.dio( 77 | x.astype(np.double), 78 | fs=self.fs, 79 | f0_ceil=self.f0_max, 80 | f0_floor=self.f0_min, 81 | frame_period=1000 * self.hop / self.fs, 82 | ) 83 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs) 84 | elif f0_method == "rmvpe": 85 | if hasattr(self, "model_rmvpe") == False: 86 | from infer.lib.rmvpe import RMVPE 87 | 88 | print("Loading rmvpe model") 89 | self.model_rmvpe = RMVPE( 90 | "assets/rmvpe/rmvpe.pt", is_half=False, device="cpu" 91 | ) 92 | f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) 93 | return f0 94 | 95 | def coarse_f0(self, f0): 96 | f0_mel = 1127 * np.log(1 + f0 / 700) 97 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * ( 98 | self.f0_bin - 2 99 | ) / (self.f0_mel_max - self.f0_mel_min) + 1 100 | 101 | # use 0 or 1 102 | f0_mel[f0_mel <= 1] = 1 103 | f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1 104 | f0_coarse = np.rint(f0_mel).astype(int) 105 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( 106 | f0_coarse.max(), 107 | f0_coarse.min(), 108 | ) 109 | return f0_coarse 110 | 111 | def go(self, paths, f0_method): 112 | if len(paths) == 0: 113 | printt("no-f0-todo") 114 | else: 115 | printt("todo-f0-%s" % len(paths)) 116 | n = max(len(paths) // 5, 1) # 每个进程最多打印5条 117 | for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths): 118 | try: 119 | if idx % n == 0: 120 | printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path)) 121 | if ( 122 | os.path.exists(opt_path1 + ".npy") == True 123 | and os.path.exists(opt_path2 + ".npy") == True 124 | ): 125 | continue 126 | featur_pit = self.compute_f0(inp_path, f0_method) 127 | np.save( 128 | opt_path2, 129 | featur_pit, 130 | allow_pickle=False, 131 | ) # nsf 132 | coarse_pit = self.coarse_f0(featur_pit) 133 | np.save( 134 | opt_path1, 135 | coarse_pit, 136 | allow_pickle=False, 137 | ) # ori 138 | except: 139 | printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc())) 140 | 141 | 142 | if __name__ == "__main__": 143 | # exp_dir=r"E:\codes\py39\dataset\mi-test" 144 | # n_p=16 145 | # f = open("%s/log_extract_f0.log"%exp_dir, "w") 146 | printt(" ".join(sys.argv)) 147 | featureInput = FeatureInput() 148 | paths = [] 149 | inp_root = "%s/1_16k_wavs" % (exp_dir) 150 | opt_root1 = "%s/2a_f0" % (exp_dir) 151 | opt_root2 = "%s/2b-f0nsf" % (exp_dir) 152 | 153 | os.makedirs(opt_root1, exist_ok=True) 154 | os.makedirs(opt_root2, exist_ok=True) 155 | for name in sorted(list(os.listdir(inp_root))): 156 | 
inp_path = "%s/%s" % (inp_root, name) 157 | if "spec" in inp_path: 158 | continue 159 | opt_path1 = "%s/%s" % (opt_root1, name) 160 | opt_path2 = "%s/%s" % (opt_root2, name) 161 | paths.append([inp_path, opt_path1, opt_path2]) 162 | 163 | ps = [] 164 | for i in range(n_p): 165 | p = Process( 166 | target=featureInput.go, 167 | args=( 168 | paths[i::n_p], 169 | f0method, 170 | ), 171 | ) 172 | ps.append(p) 173 | p.start() 174 | for i in range(n_p): 175 | ps[i].join() 176 | -------------------------------------------------------------------------------- /rvc/infer/modules/train/extract/extract_f0_rmvpe.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | 5 | import parselmouth 6 | 7 | now_dir = os.getcwd() 8 | sys.path.append(now_dir) 9 | import logging 10 | 11 | import numpy as np 12 | import pyworld 13 | 14 | from infer.lib.audio import load_audio 15 | 16 | logging.getLogger("numba").setLevel(logging.WARNING) 17 | 18 | n_part = int(sys.argv[1]) 19 | i_part = int(sys.argv[2]) 20 | i_gpu = sys.argv[3] 21 | os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu) 22 | exp_dir = sys.argv[4] 23 | is_half = sys.argv[5].lower() == "true" 24 | f = open("%s/extract_f0_feature.log" % exp_dir, "a+") 25 | 26 | 27 | def printt(strr): 28 | print(strr) 29 | f.write("%s\n" % strr) 30 | f.flush() 31 | 32 | 33 | class FeatureInput(object): 34 | def __init__(self, samplerate=16000, hop_size=160): 35 | self.fs = samplerate 36 | self.hop = hop_size 37 | 38 | self.f0_bin = 256 39 | self.f0_max = 1100.0 40 | self.f0_min = 50.0 41 | self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) 42 | self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) 43 | 44 | def compute_f0(self, path, f0_method): 45 | x = load_audio(path, self.fs) 46 | # p_len = x.shape[0] // self.hop 47 | if f0_method == "rmvpe": 48 | if not hasattr(self, "model_rmvpe"): 49 | from infer.lib.rmvpe import RMVPE 50 | 51 | print("Loading rmvpe model") 52 | self.model_rmvpe = RMVPE( 53 | "assets/rmvpe/rmvpe.pt", is_half=is_half, device="cuda" 54 | ) 55 | f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) 56 | return f0 57 | 58 | def coarse_f0(self, f0): 59 | f0_mel = 1127 * np.log(1 + f0 / 700) 60 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * ( 61 | self.f0_bin - 2 62 | ) / (self.f0_mel_max - self.f0_mel_min) + 1 63 | 64 | # use 0 or 1 65 | f0_mel[f0_mel <= 1] = 1 66 | f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1 67 | f0_coarse = np.rint(f0_mel).astype(int) 68 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( 69 | f0_coarse.max(), 70 | f0_coarse.min(), 71 | ) 72 | return f0_coarse 73 | 74 | def go(self, paths, f0_method): 75 | if len(paths) == 0: 76 | printt("no-f0-todo") 77 | else: 78 | printt("todo-f0-%s" % len(paths)) 79 | n = max(len(paths) // 5, 1) # print at most 5 progress logs per process 80 | for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths): 81 | try: 82 | if idx % n == 0: 83 | printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path)) 84 | if ( 85 | os.path.exists(opt_path1 + ".npy") 86 | and os.path.exists(opt_path2 + ".npy") 87 | ): 88 | continue 89 | featur_pit = self.compute_f0(inp_path, f0_method) 90 | np.save( 91 | opt_path2, 92 | featur_pit, 93 | allow_pickle=False, 94 | ) # nsf 95 | coarse_pit = self.coarse_f0(featur_pit) 96 | np.save( 97 | opt_path1, 98 | coarse_pit, 99 | allow_pickle=False, 100 | ) # ori 101 | except Exception: 102 | printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc())) 103 | 104 | 105 | if __name__ ==
"__main__": 106 | # exp_dir=r"E:\codes\py39\dataset\mi-test" 107 | # n_p=16 108 | # f = open("%s/log_extract_f0.log"%exp_dir, "w") 109 | printt(" ".join(sys.argv)) 110 | featureInput = FeatureInput() 111 | paths = [] 112 | inp_root = "%s/1_16k_wavs" % (exp_dir) 113 | opt_root1 = "%s/2a_f0" % (exp_dir) 114 | opt_root2 = "%s/2b-f0nsf" % (exp_dir) 115 | 116 | os.makedirs(opt_root1, exist_ok=True) 117 | os.makedirs(opt_root2, exist_ok=True) 118 | for name in sorted(list(os.listdir(inp_root))): 119 | inp_path = "%s/%s" % (inp_root, name) 120 | if "spec" in inp_path: 121 | continue 122 | opt_path1 = "%s/%s" % (opt_root1, name) 123 | opt_path2 = "%s/%s" % (opt_root2, name) 124 | paths.append([inp_path, opt_path1, opt_path2]) 125 | try: 126 | featureInput.go(paths[i_part::n_part], "rmvpe") 127 | except Exception: 128 | printt("f0_all_fail-%s" % (traceback.format_exc())) 129 | # ps = [] 130 | # for i in range(n_p): 131 | # p = Process( 132 | # target=featureInput.go, 133 | # args=( 134 | # paths[i::n_p], 135 | # f0method, 136 | # ), 137 | # ) 138 | # ps.append(p) 139 | # p.start() 140 | # for i in range(n_p): 141 | # ps[i].join() 142 | -------------------------------------------------------------------------------- /rvc/infer/modules/train/extract/extract_f0_rmvpe_dml.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | 5 | import parselmouth 6 | 7 | now_dir = os.getcwd() 8 | sys.path.append(now_dir) 9 | import logging 10 | 11 | import numpy as np 12 | import pyworld 13 | 14 | from infer.lib.audio import load_audio 15 | 16 | logging.getLogger("numba").setLevel(logging.WARNING) 17 | 18 | exp_dir = sys.argv[1] 19 | import torch_directml 20 | 21 | device = torch_directml.device(torch_directml.default_device()) 22 | f = open("%s/extract_f0_feature.log" % exp_dir, "a+") 23 | 24 | 25 | def printt(strr): 26 | print(strr) 27 | f.write("%s\n" % strr) 28 | f.flush() 29 | 30 | 31 | class FeatureInput(object): 32 | def __init__(self, samplerate=16000, hop_size=160): 33 | self.fs = samplerate 34 | self.hop = hop_size 35 | 36 | self.f0_bin = 256 37 | self.f0_max = 1100.0 38 | self.f0_min = 50.0 39 | self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) 40 | self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) 41 | 42 | def compute_f0(self, path, f0_method): 43 | x = load_audio(path, self.fs) 44 | # p_len = x.shape[0] // self.hop 45 | if f0_method == "rmvpe": 46 | if not hasattr(self, "model_rmvpe"): 47 | from infer.lib.rmvpe import RMVPE 48 | 49 | print("Loading rmvpe model") 50 | self.model_rmvpe = RMVPE( 51 | "assets/rmvpe/rmvpe.pt", is_half=False, device=device 52 | ) 53 | f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) 54 | return f0 55 | 56 | def coarse_f0(self, f0): 57 | f0_mel = 1127 * np.log(1 + f0 / 700) 58 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * ( 59 | self.f0_bin - 2 60 | ) / (self.f0_mel_max - self.f0_mel_min) + 1 61 | 62 | # use 0 or 1 63 | f0_mel[f0_mel <= 1] = 1 64 | f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1 65 | f0_coarse = np.rint(f0_mel).astype(int) 66 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( 67 | f0_coarse.max(), 68 | f0_coarse.min(), 69 | ) 70 | return f0_coarse 71 | 72 | def go(self, paths, f0_method): 73 | if len(paths) == 0: 74 | printt("no-f0-todo") 75 | else: 76 | printt("todo-f0-%s" % len(paths)) 77 | n = max(len(paths) // 5, 1) # print at most 5 progress logs per process 78 | for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths): 79 | try: 80 | if idx % n == 0: 81 | 
printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path)) 82 | if ( 83 | os.path.exists(opt_path1 + ".npy") 84 | and os.path.exists(opt_path2 + ".npy") 85 | ): 86 | continue 87 | featur_pit = self.compute_f0(inp_path, f0_method) 88 | np.save( 89 | opt_path2, 90 | featur_pit, 91 | allow_pickle=False, 92 | ) # nsf 93 | coarse_pit = self.coarse_f0(featur_pit) 94 | np.save( 95 | opt_path1, 96 | coarse_pit, 97 | allow_pickle=False, 98 | ) # ori 99 | except Exception: 100 | printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc())) 101 | 102 | 103 | if __name__ == "__main__": 104 | # exp_dir=r"E:\codes\py39\dataset\mi-test" 105 | # n_p=16 106 | # f = open("%s/log_extract_f0.log"%exp_dir, "w") 107 | printt(" ".join(sys.argv)) 108 | featureInput = FeatureInput() 109 | paths = [] 110 | inp_root = "%s/1_16k_wavs" % (exp_dir) 111 | opt_root1 = "%s/2a_f0" % (exp_dir) 112 | opt_root2 = "%s/2b-f0nsf" % (exp_dir) 113 | 114 | os.makedirs(opt_root1, exist_ok=True) 115 | os.makedirs(opt_root2, exist_ok=True) 116 | for name in sorted(list(os.listdir(inp_root))): 117 | inp_path = "%s/%s" % (inp_root, name) 118 | if "spec" in inp_path: 119 | continue 120 | opt_path1 = "%s/%s" % (opt_root1, name) 121 | opt_path2 = "%s/%s" % (opt_root2, name) 122 | paths.append([inp_path, opt_path1, opt_path2]) 123 | try: 124 | featureInput.go(paths, "rmvpe") 125 | except Exception: 126 | printt("f0_all_fail-%s" % (traceback.format_exc())) 127 | # ps = [] 128 | # for i in range(n_p): 129 | # p = Process( 130 | # target=featureInput.go, 131 | # args=( 132 | # paths[i::n_p], 133 | # f0method, 134 | # ), 135 | # ) 136 | # ps.append(p) 137 | # p.start() 138 | # for i in range(n_p): 139 | # ps[i].join() 140 | -------------------------------------------------------------------------------- /rvc/infer/modules/train/extract_feature_print.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | 5 | os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" 6 | os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0" 7 | 8 | device = sys.argv[1] 9 | n_part = int(sys.argv[2]) 10 | i_part = int(sys.argv[3]) 11 | if len(sys.argv) == 7: 12 | exp_dir = sys.argv[4] 13 | version = sys.argv[5] 14 | is_half = sys.argv[6].lower() == "true" 15 | else: 16 | i_gpu = sys.argv[4] 17 | exp_dir = sys.argv[5] 18 | os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu) 19 | version = sys.argv[6] 20 | is_half = sys.argv[7].lower() == "true" 21 | import fairseq 22 | import numpy as np 23 | import soundfile as sf 24 | import torch 25 | import torch.nn.functional as F 26 | 27 | if "privateuseone" not in device: 28 | device = "cpu" 29 | if torch.cuda.is_available(): 30 | device = "cuda" 31 | elif torch.backends.mps.is_available(): 32 | device = "mps" 33 | else: 34 | import torch_directml 35 | 36 | device = torch_directml.device(torch_directml.default_device()) 37 | 38 | def forward_dml(ctx, x, scale): 39 | ctx.scale = scale 40 | res = x.clone().detach() 41 | return res 42 | 43 | fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml 44 | 45 | f = open("%s/extract_f0_feature.log" % exp_dir, "a+") 46 | 47 | 48 | def printt(strr): 49 | print(strr) 50 | f.write("%s\n" % strr) 51 | f.flush() 52 | 53 | 54 | printt(" ".join(sys.argv)) 55 | model_path = "assets/hubert/hubert_base.pt" 56 | 57 | printt("exp_dir: " + exp_dir) 58 | wavPath = "%s/1_16k_wavs" % exp_dir 59 | outPath = ( 60 | "%s/3_feature256" % exp_dir if version == "v1" else "%s/3_feature768" % exp_dir 61 | ) 62 | 
os.makedirs(outPath, exist_ok=True) 63 | 64 | 65 | # wave must be 16k, hop_size=320 66 | def readwave(wav_path, normalize=False): 67 | wav, sr = sf.read(wav_path) 68 | assert sr == 16000 69 | feats = torch.from_numpy(wav).float() 70 | if feats.dim() == 2: # double channels 71 | feats = feats.mean(-1) 72 | assert feats.dim() == 1, feats.dim() 73 | if normalize: 74 | with torch.no_grad(): 75 | feats = F.layer_norm(feats, feats.shape) 76 | feats = feats.view(1, -1) 77 | return feats 78 | 79 | 80 | # HuBERT model 81 | printt("load model(s) from {}".format(model_path)) 82 | # check that the HuBERT model exists 83 | if not os.access(model_path, os.F_OK): 84 | printt( 85 | "Error: extraction aborted because %s does not exist; you may download it from https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main" 86 | % model_path 87 | ) 88 | exit(0) 89 | models, saved_cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task( 90 | [model_path], 91 | suffix="", 92 | ) 93 | model = models[0] 94 | model = model.to(device) 95 | printt("move model to %s" % device) 96 | if is_half: 97 | if device not in ["mps", "cpu"]: 98 | model = model.half() 99 | model.eval() 100 | 101 | todo = sorted(list(os.listdir(wavPath)))[i_part::n_part] 102 | n = max(1, len(todo) // 10) # print at most ten progress logs 103 | if len(todo) == 0: 104 | printt("no-feature-todo") 105 | else: 106 | printt("all-feature-%s" % len(todo)) 107 | for idx, file in enumerate(todo): 108 | try: 109 | if file.endswith(".wav"): 110 | wav_path = "%s/%s" % (wavPath, file) 111 | out_path = "%s/%s" % (outPath, file.replace("wav", "npy")) 112 | 113 | if os.path.exists(out_path): 114 | continue 115 | 116 | feats = readwave(wav_path, normalize=saved_cfg.task.normalize) 117 | padding_mask = torch.BoolTensor(feats.shape).fill_(False) 118 | inputs = { 119 | "source": ( 120 | feats.half().to(device) 121 | if is_half and device not in ["mps", "cpu"] 122 | else feats.to(device) 123 | ), 124 | "padding_mask": padding_mask.to(device), 125 | "output_layer": 9 if version == "v1" else 12, # layer 9 for v1, layer 12 for v2 126 | } 127 | with torch.no_grad(): 128 | logits = model.extract_features(**inputs) 129 | feats = ( 130 | model.final_proj(logits[0]) if version == "v1" else logits[0] 131 | ) 132 | 133 | feats = feats.squeeze(0).float().cpu().numpy() 134 | if np.isnan(feats).sum() == 0: 135 | np.save(out_path, feats, allow_pickle=False) 136 | else: 137 | printt("%s-contains nan" % file) 138 | if idx % n == 0: 139 | printt("now-%s,all-%s,%s,%s" % (idx, len(todo), file, feats.shape)) 140 | except Exception: 141 | printt(traceback.format_exc()) 142 | printt("all-feature-done") 143 | -------------------------------------------------------------------------------- /rvc/infer/modules/train/preprocess.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import sys 4 | 5 | from scipy import signal 6 | 7 | now_dir = os.getcwd() 8 | sys.path.append(now_dir) 9 | print(*sys.argv[1:]) 10 | inp_root = sys.argv[1] 11 | sr = int(sys.argv[2]) 12 | n_p = int(sys.argv[3]) 13 | exp_dir = sys.argv[4] 14 | noparallel = sys.argv[5] == "True" 15 | per = float(sys.argv[6]) 16 | import os 17 | import traceback 18 | 19 | import librosa 20 | import numpy as np 21 | from scipy.io import wavfile 22 | 23 | from infer.lib.audio import load_audio 24 | from infer.lib.slicer2 import Slicer 25 | 26 | f = open("%s/preprocess.log" % exp_dir, "a+") 27 | 28 | 29 | def println(strr): 30 | print(strr) 31 | f.write("%s\n" % strr) 32 | f.flush() 33 | 34 | 35 | class
PreProcess: 36 | def __init__(self, sr, exp_dir, per=3.7): 37 | self.slicer = Slicer( 38 | sr=sr, 39 | threshold=-42, 40 | min_length=1500, 41 | min_interval=400, 42 | hop_size=15, 43 | max_sil_kept=500, 44 | ) 45 | self.sr = sr 46 | self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr) 47 | self.per = per 48 | self.overlap = 0.3 49 | self.tail = self.per + self.overlap 50 | self.max = 0.9 51 | self.alpha = 0.75 52 | self.exp_dir = exp_dir 53 | self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir 54 | self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir 55 | os.makedirs(self.exp_dir, exist_ok=True) 56 | os.makedirs(self.gt_wavs_dir, exist_ok=True) 57 | os.makedirs(self.wavs16k_dir, exist_ok=True) 58 | 59 | def norm_write(self, tmp_audio, idx0, idx1): 60 | tmp_max = np.abs(tmp_audio).max() 61 | if tmp_max > 2.5: 62 | print("%s-%s-%s-filtered" % (idx0, idx1, tmp_max)) 63 | return 64 | tmp_audio = (tmp_audio / tmp_max * (self.max * self.alpha)) + ( 65 | 1 - self.alpha 66 | ) * tmp_audio 67 | wavfile.write( 68 | "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), 69 | self.sr, 70 | tmp_audio.astype(np.float32), 71 | ) 72 | tmp_audio = librosa.resample( 73 | tmp_audio, orig_sr=self.sr, target_sr=16000 74 | ) # , res_type="soxr_vhq" 75 | wavfile.write( 76 | "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), 77 | 16000, 78 | tmp_audio.astype(np.float32), 79 | ) 80 | 81 | def pipeline(self, path, idx0): 82 | try: 83 | audio = load_audio(path, self.sr) 84 | # zero phased digital filter cause pre-ringing noise... 85 | # audio = signal.filtfilt(self.bh, self.ah, audio) 86 | audio = signal.lfilter(self.bh, self.ah, audio) 87 | 88 | idx1 = 0 89 | for audio in self.slicer.slice(audio): 90 | i = 0 91 | while 1: 92 | start = int(self.sr * (self.per - self.overlap) * i) 93 | i += 1 94 | if len(audio[start:]) > self.tail * self.sr: 95 | tmp_audio = audio[start : start + int(self.per * self.sr)] 96 | self.norm_write(tmp_audio, idx0, idx1) 97 | idx1 += 1 98 | else: 99 | tmp_audio = audio[start:] 100 | idx1 += 1 101 | break 102 | self.norm_write(tmp_audio, idx0, idx1) 103 | println("%s\t-> Success" % path) 104 | except: 105 | println("%s\t-> %s" % (path, traceback.format_exc())) 106 | 107 | def pipeline_mp(self, infos): 108 | for path, idx0 in infos: 109 | self.pipeline(path, idx0) 110 | 111 | def pipeline_mp_inp_dir(self, inp_root, n_p): 112 | try: 113 | infos = [ 114 | ("%s/%s" % (inp_root, name), idx) 115 | for idx, name in enumerate(sorted(list(os.listdir(inp_root)))) 116 | ] 117 | if noparallel: 118 | for i in range(n_p): 119 | self.pipeline_mp(infos[i::n_p]) 120 | else: 121 | ps = [] 122 | for i in range(n_p): 123 | p = multiprocessing.Process( 124 | target=self.pipeline_mp, args=(infos[i::n_p],) 125 | ) 126 | ps.append(p) 127 | p.start() 128 | for i in range(n_p): 129 | ps[i].join() 130 | except: 131 | println("Fail. 
%s" % traceback.format_exc()) 132 | 133 | 134 | def preprocess_trainset(inp_root, sr, n_p, exp_dir, per): 135 | pp = PreProcess(sr, exp_dir, per) 136 | println("start preprocess") 137 | pp.pipeline_mp_inp_dir(inp_root, n_p) 138 | println("end preprocess") 139 | 140 | 141 | if __name__ == "__main__": 142 | preprocess_trainset(inp_root, sr, n_p, exp_dir, per) 143 | -------------------------------------------------------------------------------- /rvc/infer/modules/uvr5/modules.py: -------------------------------------------------------------------------------- 1 | import os 2 | import traceback 3 | import logging 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | import ffmpeg 8 | import torch 9 | 10 | from configs.config import Config 11 | from infer.modules.uvr5.mdxnet import MDXNetDereverb 12 | from infer.modules.uvr5.vr import AudioPre, AudioPreDeEcho 13 | 14 | config = Config() 15 | 16 | 17 | def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0): 18 | infos = [] 19 | try: 20 | inp_root = inp_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 21 | save_root_vocal = ( 22 | save_root_vocal.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 23 | ) 24 | save_root_ins = ( 25 | save_root_ins.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 26 | ) 27 | if model_name == "onnx_dereverb_By_FoxJoy": 28 | pre_fun = MDXNetDereverb(15, config.device) 29 | else: 30 | func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho 31 | pre_fun = func( 32 | agg=int(agg), 33 | model_path=os.path.join( 34 | os.getenv("weight_uvr5_root"), model_name + ".pth" 35 | ), 36 | device=config.device, 37 | is_half=config.is_half, 38 | ) 39 | is_hp3 = "HP3" in model_name 40 | if inp_root != "": 41 | paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)] 42 | else: 43 | paths = [path.name for path in paths] 44 | for path in paths: 45 | inp_path = os.path.join(inp_root, path) 46 | need_reformat = 1 47 | done = 0 48 | try: 49 | info = ffmpeg.probe(inp_path, cmd="ffprobe") 50 | if ( 51 | info["streams"][0]["channels"] == 2 52 | and info["streams"][0]["sample_rate"] == "44100" 53 | ): 54 | need_reformat = 0 55 | pre_fun._path_audio_( 56 | inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3 57 | ) 58 | done = 1 59 | except: 60 | need_reformat = 1 61 | traceback.print_exc() 62 | if need_reformat == 1: 63 | tmp_path = "%s/%s.reformatted.wav" % ( 64 | os.path.join(os.environ["TEMP"]), 65 | os.path.basename(inp_path), 66 | ) 67 | os.system( 68 | "ffmpeg -i %s -vn -acodec pcm_s16le -ac 2 -ar 44100 %s -y" 69 | % (inp_path, tmp_path) 70 | ) 71 | inp_path = tmp_path 72 | try: 73 | if done == 0: 74 | pre_fun._path_audio_( 75 | inp_path, save_root_ins, save_root_vocal, format0 76 | ) 77 | infos.append("%s->Success" % (os.path.basename(inp_path))) 78 | yield "\n".join(infos) 79 | except: 80 | try: 81 | if done == 0: 82 | pre_fun._path_audio_( 83 | inp_path, save_root_ins, save_root_vocal, format0 84 | ) 85 | infos.append("%s->Success" % (os.path.basename(inp_path))) 86 | yield "\n".join(infos) 87 | except: 88 | infos.append( 89 | "%s->%s" % (os.path.basename(inp_path), traceback.format_exc()) 90 | ) 91 | yield "\n".join(infos) 92 | except: 93 | infos.append(traceback.format_exc()) 94 | yield "\n".join(infos) 95 | finally: 96 | try: 97 | if model_name == "onnx_dereverb_By_FoxJoy": 98 | del pre_fun.pred.model 99 | del pre_fun.pred.model_ 100 | else: 101 | del pre_fun.model 102 | del pre_fun 103 | except: 104 | traceback.print_exc() 105 | if 
torch.cuda.is_available(): 106 | torch.cuda.empty_cache() 107 | logger.info("Executed torch.cuda.empty_cache()") 108 | yield "\n".join(infos) 109 | -------------------------------------------------------------------------------- /rvc/infer/modules/vc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/modules/vc/__init__.py -------------------------------------------------------------------------------- /rvc/infer/modules/vc/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/modules/vc/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/modules/vc/__pycache__/modules.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/modules/vc/__pycache__/modules.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/modules/vc/__pycache__/pipeline.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/modules/vc/__pycache__/pipeline.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/modules/vc/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/infer/modules/vc/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /rvc/infer/modules/vc/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from fairseq import checkpoint_utils 4 | 5 | 6 | def get_index_path_from_model(sid): 7 | return next( 8 | ( 9 | f 10 | for f in [ 11 | os.path.join(root, name) 12 | for root, _, files in os.walk(os.getenv("index_root"), topdown=False) 13 | for name in files 14 | if name.endswith(".index") and "trained" not in name 15 | ] 16 | if sid.split(".")[0] in f 17 | ), 18 | "", 19 | ) 20 | 21 | 22 | def load_hubert(config): 23 | models, _, _ = checkpoint_utils.load_model_ensemble_and_task( 24 | [os.getenv('hubert_base')], 25 | suffix="", 26 | ) 27 | hubert_model = models[0] 28 | hubert_model = hubert_model.to(config.device) 29 | if config.is_half: 30 | hubert_model = hubert_model.half() 31 | else: 32 | hubert_model = hubert_model.float() 33 | return hubert_model.eval() 34 | -------------------------------------------------------------------------------- /rvc/logs/mute/0_gt_wavs/mute32k.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/0_gt_wavs/mute32k.wav -------------------------------------------------------------------------------- /rvc/logs/mute/0_gt_wavs/mute40k.spec.pt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/0_gt_wavs/mute40k.spec.pt -------------------------------------------------------------------------------- /rvc/logs/mute/0_gt_wavs/mute40k.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/0_gt_wavs/mute40k.wav -------------------------------------------------------------------------------- /rvc/logs/mute/0_gt_wavs/mute48k.spec.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/0_gt_wavs/mute48k.spec.pt -------------------------------------------------------------------------------- /rvc/logs/mute/0_gt_wavs/mute48k.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/0_gt_wavs/mute48k.wav -------------------------------------------------------------------------------- /rvc/logs/mute/1_16k_wavs/mute.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/1_16k_wavs/mute.wav -------------------------------------------------------------------------------- /rvc/logs/mute/2a_f0/mute.wav.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/2a_f0/mute.wav.npy -------------------------------------------------------------------------------- /rvc/logs/mute/2b-f0nsf/mute.wav.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/2b-f0nsf/mute.wav.npy -------------------------------------------------------------------------------- /rvc/logs/mute/3_feature256/mute.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/3_feature256/mute.npy -------------------------------------------------------------------------------- /rvc/logs/mute/3_feature768/mute.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/rvc/logs/mute/3_feature768/mute.npy -------------------------------------------------------------------------------- /web.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/web.png -------------------------------------------------------------------------------- /web/js/alertMSG.js: -------------------------------------------------------------------------------- 1 | import { app } from "../../../scripts/app.js"; 2 | import { api } from "../../../scripts/api.js"; 3 | app.registerExtension({ 4 | name: "RVC.alertMSG", 5 | async beforeRegisterNodeDef(nodeType, nodeData, app) { 6 | if (nodeData?.name == "RVC_Train") { 7 | nodeType.prototype.onExecuted = function (data) { 8 | // alert("Success!you can find weights in:\n" + data.finetune[0] +
"\n" + data.finetune[1] + "\n Now you can tts or inference"); 9 | let msg = "Success! you can find weights in:\n" + data.train[0] + "\n you'd like to reboot the server to inference?" 10 | if (confirm(msg)) { 11 | try { 12 | api.fetchApi("/rvc/reboot"); 13 | } 14 | catch(exception) { 15 | console.log(exception); 16 | } 17 | } 18 | } 19 | } 20 | }, 21 | }); -------------------------------------------------------------------------------- /web/js/previewAudio.js: -------------------------------------------------------------------------------- 1 | import { app } from "../../../scripts/app.js"; 2 | import { api } from '../../../scripts/api.js' 3 | 4 | function fitHeight(node) { 5 | node.setSize([node.size[0], node.computeSize([node.size[0], node.size[1]])[1]]) 6 | node?.graph?.setDirtyCanvas(true); 7 | } 8 | function chainCallback(object, property, callback) { 9 | if (object == undefined) { 10 | //This should not happen. 11 | console.error("Tried to add callback to non-existant object") 12 | return; 13 | } 14 | if (property in object) { 15 | const callback_orig = object[property] 16 | object[property] = function () { 17 | const r = callback_orig.apply(this, arguments); 18 | callback.apply(this, arguments); 19 | return r 20 | }; 21 | } else { 22 | object[property] = callback; 23 | } 24 | } 25 | 26 | function addPreviewOptions(nodeType) { 27 | chainCallback(nodeType.prototype, "getExtraMenuOptions", function(_, options) { 28 | // The intended way of appending options is returning a list of extra options, 29 | // but this isn't used in widgetInputs.js and would require 30 | // less generalization of chainCallback 31 | let optNew = [] 32 | try { 33 | const previewWidget = this.widgets.find((w) => w.name === "audiopreview"); 34 | 35 | let url = null 36 | if (previewWidget.audioEl?.hidden == false && previewWidget.audioEl.src) { 37 | //Use full quality audio 38 | //url = api.apiURL('/view?' 
+ new URLSearchParams(previewWidget.value.params)); 39 | url = previewWidget.audioEl.src 40 | } 41 | if (url) { 42 | optNew.push( 43 | { 44 | content: "Open preview", 45 | callback: () => { 46 | window.open(url, "_blank") 47 | }, 48 | }, 49 | { 50 | content: "Save preview", 51 | callback: () => { 52 | const a = document.createElement("a"); 53 | a.href = url; 54 | a.setAttribute("download", new URLSearchParams(previewWidget.value.params).get("filename")); 55 | document.body.append(a); 56 | a.click(); 57 | requestAnimationFrame(() => a.remove()); 58 | }, 59 | } 60 | ); 61 | } 62 | if(options.length > 0 && options[0] != null && optNew.length > 0) { 63 | optNew.push(null); 64 | } 65 | options.unshift(...optNew); 66 | 67 | } catch (error) { 68 | console.log(error); 69 | } 70 | 71 | }); 72 | } 73 | function previewAudio(node,file,type){ 74 | var element = document.createElement("div"); 75 | const previewNode = node; 76 | var previewWidget = node.addDOMWidget("audiopreview", "preview", element, { 77 | serialize: false, 78 | hideOnZoom: false, 79 | getValue() { 80 | return element.value; 81 | }, 82 | setValue(v) { 83 | element.value = v; 84 | }, 85 | }); 86 | previewWidget.computeSize = function(width) { 87 | if (this.aspectRatio && !this.parentEl.hidden) { 88 | let height = (previewNode.size[0]-20)/ this.aspectRatio + 10; 89 | if (!(height > 0)) { 90 | height = 0; 91 | } 92 | this.computedHeight = height + 10; 93 | return [width, height]; 94 | } 95 | return [width, -4];//no loaded src, widget should not display 96 | } 97 | // element.style['pointer-events'] = "none" 98 | previewWidget.value = {hidden: false, paused: false, params: {}} 99 | previewWidget.parentEl = document.createElement("div"); 100 | previewWidget.parentEl.className = "audio_preview"; 101 | previewWidget.parentEl.style['width'] = "100%" 102 | element.appendChild(previewWidget.parentEl); 103 | previewWidget.audioEl = document.createElement("audio"); 104 | previewWidget.audioEl.controls = true; 105 | previewWidget.audioEl.loop = false; 106 | previewWidget.audioEl.muted = false; 107 | previewWidget.audioEl.style['width'] = "100%" 108 | previewWidget.audioEl.addEventListener("loadedmetadata", () => { 109 | 110 | previewWidget.aspectRatio = previewWidget.audioEl.audioWidth / previewWidget.audioEl.audioHeight; 111 | fitHeight(this); 112 | }); 113 | previewWidget.audioEl.addEventListener("error", () => { 114 | //TODO: consider a way to properly notify the user why a preview isn't shown. 115 | previewWidget.parentEl.hidden = true; 116 | fitHeight(this); 117 | }); 118 | 119 | let params = { 120 | "filename": file, 121 | "type": type, 122 | } 123 | 124 | previewWidget.parentEl.hidden = previewWidget.value.hidden; 125 | previewWidget.audioEl.autoplay = !previewWidget.value.paused && !previewWidget.value.hidden; 126 | let target_width = 256 127 | if (element.style?.width) { 128 | //overscale to allow scrolling. Endpoint won't return higher than native 129 | target_width = element.style.width.slice(0,-2)*2; 130 | } 131 | if (!params.force_size || params.force_size.includes("?") || params.force_size == "Disabled") { 132 | params.force_size = target_width+"x?" 133 | } else { 134 | let size = params.force_size.split("x") 135 | let ar = parseInt(size[0])/parseInt(size[1]) 136 | params.force_size = target_width+"x"+(target_width/ar) 137 | } 138 | 139 | previewWidget.audioEl.src = api.apiURL('/view?' 
+ new URLSearchParams(params)); 140 | 141 | previewWidget.audioEl.hidden = false; 142 | previewWidget.parentEl.appendChild(previewWidget.audioEl) 143 | } 144 | 145 | app.registerExtension({ 146 | name: "RVC.AudioPreviewer", 147 | async beforeRegisterNodeDef(nodeType, nodeData, app) { 148 | if (nodeData?.name == "PreViewAudio") { 149 | nodeType.prototype.onExecuted = function (data) { 150 | previewAudio(this, data.audio[0], data.audio[1]); 151 | } 152 | addPreviewOptions(nodeType) 153 | } 154 | } 155 | }); -------------------------------------------------------------------------------- /web/js/refreshPath.js: -------------------------------------------------------------------------------- 1 | import { app } from "../../../scripts/app.js"; 2 | import { api } from '../../../scripts/api.js' 3 | import { ComfyWidgets } from "../../../scripts/widgets.js" 4 | function rebootAPI() { 5 | if (confirm("Are you sure you'd like to reboot the server to refresh weights path?")) { 6 | try { 7 | api.fetchApi("/rvc/reboot"); 8 | } 9 | catch(exception) { 10 | 11 | } 12 | return true; 13 | } 14 | 15 | return false; 16 | } 17 | function pathRefresh(node, inputName, inputData, app) { 18 | // Create the button widget for selecting the files 19 | let refreshWidget = node.addWidget("button", "REBOOT TO REFRESH SID LIST", "refresh", () => { 20 | rebootAPI() 21 | }); 22 | 23 | refreshWidget.serialize = false; 24 | 25 | return { widget: refreshWidget }; 26 | } 27 | ComfyWidgets.PATHREFRESH = pathRefresh; 28 | 29 | app.registerExtension({ 30 | name: "RVC.RefreshPath", 31 | async beforeRegisterNodeDef(nodeType, nodeData, app) { 32 | if (nodeData?.name == "RVC_Infer") { 33 | nodeData.input.required.upload = ["PATHREFRESH"]; 34 | } 35 | }, 36 | }); -------------------------------------------------------------------------------- /wechat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-RVC/5829db68395487d5ecc922f36aca7e7e1c7575d3/wechat.jpg --------------------------------------------------------------------------------