├── advanced_rvc_inference ├── __init__.py ├── assets │ ├── __init__.py │ ├── f0 │ │ ├── .gitattributes │ │ └── __init__.py │ ├── audios │ │ ├── .gitattributes │ │ ├── others │ │ │ ├── .gitattributes │ │ │ └── __init__.py │ │ ├── rvc │ │ │ ├── .gitattributes │ │ │ └── __init__.py │ │ ├── tts │ │ │ ├── ..gitattributes │ │ │ └── __init__.py │ │ ├── uvr │ │ │ ├── .gitattributes │ │ │ └── __init__.py │ │ └── __init__.py │ ├── binary │ │ ├── __init__.py │ │ ├── world.bin │ │ ├── decrypt.bin │ │ └── vr_params.bin │ ├── dataset │ │ ├── .gitattributes │ │ └── __init__.py │ ├── presets │ │ ├── .gitattributes │ │ └── __init__.py │ ├── weights │ │ ├── .gitattributes │ │ └── __init__.py │ ├── languages │ │ └── __init__.py │ ├── models │ │ ├── uvr5 │ │ │ ├── .gitattributes │ │ │ └── __init__.py │ │ ├── embedders │ │ │ ├── .gitattributes │ │ │ └── __init__.py │ │ ├── predictors │ │ │ ├── .gitattributes │ │ │ └── __init__.py │ │ ├── pretrained_v1 │ │ │ ├── .gitattributes │ │ │ └── __init__.py │ │ ├── pretrained_v2 │ │ │ ├── .gitattributes │ │ │ └── __init__.py │ │ ├── pretrained_custom │ │ │ ├── .gitattributes │ │ │ └── __init__.py │ │ ├── speaker_diarization │ │ │ ├── models │ │ │ │ ├── .gitattributes │ │ │ │ └── __init__.py │ │ │ ├── __init__.py │ │ │ └── assets │ │ │ │ ├── __init__.py │ │ │ │ └── mel_filters.npz │ │ └── __init__.py │ ├── logs │ │ ├── reference │ │ │ └── .gitattributes │ │ └── mute │ │ │ ├── f0 │ │ │ └── mute.wav.npy │ │ │ ├── energy │ │ │ └── mute.wav.npy │ │ │ ├── f0_voiced │ │ │ └── mute.wav.npy │ │ │ ├── v1_extracted │ │ │ ├── mute.npy │ │ │ ├── mute_chinese.npy │ │ │ ├── mute_japanese.npy │ │ │ ├── mute_korean.npy │ │ │ ├── mute_spin-v1.npy │ │ │ ├── mute_spin-v2.npy │ │ │ ├── mute_portuguese.npy │ │ │ └── mute_vietnamese.npy │ │ │ ├── v2_extracted │ │ │ ├── mute.npy │ │ │ ├── mute_chinese.npy │ │ │ ├── mute_japanese.npy │ │ │ ├── mute_korean.npy │ │ │ ├── mute_spin-v1.npy │ │ │ ├── mute_spin-v2.npy │ │ │ ├── mute_portuguese.npy │ │ │ └── mute_vietnamese.npy │ │ │ ├── sliced_audios │ │ │ ├── mute32000.wav │ │ │ ├── mute40000.wav │ │ │ └── mute48000.wav │ │ │ └── sliced_audios_16k │ │ │ └── mute.wav │ ├── zluda │ │ ├── __init__.py │ │ ├── run_app.bat │ │ ├── path-zluda-hip57.bat │ │ ├── path-zluda-hip61.bat │ │ └── path-zluda-hip62.bat │ └── config.txt ├── configs │ ├── __init__.py │ ├── v1 │ │ ├── __init__.py │ │ ├── 32000.json │ │ ├── 40000.json │ │ └── 48000.json │ └── v2 │ │ ├── __init__.py │ │ ├── 32000.json │ │ ├── 40000.json │ │ └── 48000.json ├── core │ ├── __init__.py │ ├── restart.py │ ├── f0_extract.py │ ├── csrt.py │ └── separate.py ├── library │ ├── __init__.py │ ├── onnx │ │ ├── __init__.py │ │ └── wrapper.py │ ├── backends │ │ ├── __init__.py │ │ ├── directml.py │ │ ├── zluda.py │ │ └── opencl.py │ ├── embedders │ │ ├── __init__.py │ │ ├── transformers.py │ │ ├── onnx.py │ │ └── ppg.py │ ├── generators │ │ ├── __init__.py │ │ └── hifigan.py │ ├── predictors │ │ ├── __init__.py │ │ ├── DJCM │ │ │ ├── __init__.py │ │ │ ├── model.py │ │ │ ├── spec.py │ │ │ ├── encoder.py │ │ │ ├── utils.py │ │ │ └── decoder.py │ │ ├── FCPE │ │ │ ├── __init__.py │ │ │ ├── wav2mel.py │ │ │ ├── utils.py │ │ │ ├── stft.py │ │ │ └── encoder.py │ │ ├── PENN │ │ │ ├── __init__.py │ │ │ ├── fcn.py │ │ │ └── core.py │ │ ├── CREPE │ │ │ ├── __init__.py │ │ │ ├── filter.py │ │ │ └── model.py │ │ ├── PESTO │ │ │ ├── __init__.py │ │ │ └── PESTO.py │ │ ├── RMVPE │ │ │ ├── __init__.py │ │ │ ├── e2e.py │ │ │ ├── mel.py │ │ │ └── RMVPE.py │ │ ├── SWIFT │ │ │ ├── __init__.py │ │ │ └── SWIFT.py │ │ └── 
WORLD │ │ │ └── __init__.py │ ├── uvr5_lib │ │ ├── __init__.py │ │ ├── demucs │ │ │ ├── __init__.py │ │ │ ├── utils.py │ │ │ └── states.py │ │ └── vr_network │ │ │ ├── __init__.py │ │ │ ├── model_param_init.py │ │ │ └── layers_new.py │ ├── architectures │ │ └── __init__.py │ ├── speaker_diarization │ │ ├── __init__.py │ │ ├── embedding.py │ │ └── parameter_transfer.py │ └── algorithm │ │ ├── __init__.py │ │ ├── normalization.py │ │ ├── commons.py │ │ ├── modules.py │ │ └── discriminators.py ├── tabs │ ├── __init__.py │ ├── extra │ │ ├── __init__.py │ │ ├── child │ │ │ ├── __init__.py │ │ │ ├── read_model.py │ │ │ ├── convert_model.py │ │ │ ├── fushion.py │ │ │ ├── create_srt.py │ │ │ └── f0_extract.py │ │ └── extra.py │ ├── realtime │ │ └── __init__.py │ ├── training │ │ ├── __init__.py │ │ ├── child │ │ │ └── __init__.py │ │ └── training.py │ ├── downloads │ │ └── __init__.py │ └── inference │ │ ├── __init__.py │ │ ├── child │ │ └── __init__.py │ │ └── inference.py ├── tools │ ├── __init__.py │ ├── pixeldrain.py │ ├── huggingface.py │ └── mediafire.py ├── infer │ ├── __init__.py │ ├── rvc │ │ └── __init__.py │ ├── train │ │ ├── __init__.py │ │ ├── training │ │ │ ├── __init__.py │ │ │ ├── losses.py │ │ │ ├── extract_model.py │ │ │ └── anyprecision_optimizer.py │ │ ├── extracting │ │ │ ├── __init__.py │ │ │ ├── setup_path.py │ │ │ ├── rms.py │ │ │ ├── embedding.py │ │ │ ├── feature.py │ │ │ └── preparing_files.py │ │ ├── preprocess │ │ │ └── __init__.py │ │ └── create_index.py │ ├── extracting │ │ ├── __init__.py │ │ ├── setup_path.py │ │ ├── rms.py │ │ ├── embedding.py │ │ ├── feature.py │ │ └── preparing_files.py │ └── realtime │ │ ├── __init__.py │ │ └── vad_utils.py ├── run_tensorboard.py └── app.py ├── LICENSE ├── requirements.txt ├── installer.bat ├── CONTRIBUTING.md ├── pyproject.toml └── Advanced-RVC.ipynb /advanced_rvc_inference/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/f0/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/configs/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/core/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/tools/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
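The directory tree above maps one-to-one onto Python import paths, with advanced_rvc_inference as the top-level package and app.py / run_tensorboard.py as the runnable entry points. A minimal, hedged sketch of importing two of the modules listed in full later in this dump, assuming the repository root is on sys.path (run_tensorboard.py and the tab modules arrange this themselves via sys.path.append(os.getcwd())):

import os
import sys

sys.path.append(os.getcwd())  # mirror what the repository's own scripts do

# Both modules appear in full later in this listing.
from advanced_rvc_inference.tools.huggingface import HF_download_file
from advanced_rvc_inference.infer.extracting.setup_path import setup_paths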
/advanced_rvc_inference/assets/audios/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/binary/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/dataset/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/f0/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/presets/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/weights/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/configs/v1/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/configs/v2/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/onnx/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/extra/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/realtime/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/training/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/languages/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/uvr5/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/backends/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
/advanced_rvc_inference/library/embedders/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/generators/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/uvr5_lib/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/downloads/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/extra/child/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/inference/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/audios/others/.gitattributes: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/audios/rvc/.gitattributes: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/audios/tts/..gitattributes: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/audios/uvr/.gitattributes: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/reference/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/embedders/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/predictors/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/architectures/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/uvr5_lib/demucs/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 
-------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/inference/child/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/training/child/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/pretrained_v1/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/pretrained_v2/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/speaker_diarization/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/uvr5_lib/vr_network/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/pretrained_custom/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/speaker_diarization/models/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for infer module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/rvc/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for infer.rvc module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/train/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for infer.train module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/audios/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.audios module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.models module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/zluda/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.zluda module.""" 
-------------------------------------------------------------------------------- /advanced_rvc_inference/assets/presets/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.presets module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/weights/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.weights module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/extracting/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for infer.extracting module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/realtime/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for infer.realtime module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/audios/rvc/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.audios.rvc module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/audios/tts/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.audios.tts module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/audios/uvr/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.audios.uvr module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/uvr5/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.models.uvr5 module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/library/algorithm/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for library.algorithm module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/audios/others/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.audios.others module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/train/training/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for infer.train.training module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/embedders/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.models.embedders module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/train/extracting/__init__.py: 
-------------------------------------------------------------------------------- 1 | """Package initialization for infer.train.extracting module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/train/preprocess/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for infer.train.preprocess module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/DJCM/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for library.predictors.DJCM module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/FCPE/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for library.predictors.FCPE module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/PENN/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for library.predictors.PENN module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/predictors/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.models.predictors module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/CREPE/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for library.predictors.CREPE module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/PESTO/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for library.predictors.PESTO module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/RMVPE/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for library.predictors.RMVPE module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/SWIFT/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for library.predictors.SWIFT module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/WORLD/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for library.predictors.WORLD module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/pretrained_v1/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.models.pretrained_v1 module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/pretrained_v2/__init__.py: 
-------------------------------------------------------------------------------- 1 | """Package initialization for assets.models.pretrained_v2 module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/pretrained_custom/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.models.pretrained_custom module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/speaker_diarization/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.models.speaker_diarization module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/speaker_diarization/assets/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.models.speaker_diarization.assets module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/speaker_diarization/models/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.models.speaker_diarization.models module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/binary/world.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/binary/world.bin -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/binary/decrypt.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/binary/decrypt.bin -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/binary/vr_params.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/binary/vr_params.bin -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/f0/mute.wav.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/f0/mute.wav.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/energy/mute.wav.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/energy/mute.wav.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/f0_voiced/mute.wav.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/f0_voiced/mute.wav.npy 
-------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v1_extracted/mute.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v1_extracted/mute.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v2_extracted/mute.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v2_extracted/mute.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/sliced_audios/mute32000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/sliced_audios/mute32000.wav -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/sliced_audios/mute40000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/sliced_audios/mute40000.wav -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/sliced_audios/mute48000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/sliced_audios/mute48000.wav -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/sliced_audios_16k/mute.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/sliced_audios_16k/mute.wav -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_chinese.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_chinese.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_japanese.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_japanese.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_korean.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_korean.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_spin-v1.npy: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_spin-v1.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_spin-v2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_spin-v2.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_chinese.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_chinese.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_japanese.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_japanese.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_korean.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_korean.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_spin-v1.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_spin-v1.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_spin-v2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_spin-v2.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_portuguese.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_portuguese.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_vietnamese.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_vietnamese.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_portuguese.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_portuguese.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_vietnamese.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_vietnamese.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/speaker_diarization/assets/mel_filters.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/models/speaker_diarization/assets/mel_filters.npz -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/zluda/run_app.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | setlocal 3 | title Vietnamese RVC By Anh [ZLUDA] 4 | 5 | set HIP_VISIBLE_DEVICES="0" 6 | set ZLUDA_COMGR_LOG_LEVEL=1 7 | SET DISABLE_ADDMM_CUDA_LT=1 8 | 9 | zluda\zluda.exe -- env\\Scripts\\python.exe main\\app\\app.py --open --allow_all_disk 10 | echo. 11 | pause -------------------------------------------------------------------------------- /advanced_rvc_inference/library/uvr5_lib/demucs/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def center_trim(tensor, reference): 4 | ref_size = reference.size(-1) if isinstance(reference, torch.Tensor) else reference 5 | delta = tensor.size(-1) - ref_size 6 | 7 | if delta < 0: raise ValueError(f"tensor > parameter: {delta}.") 8 | if delta: tensor = tensor[..., delta // 2 : -(delta - delta // 2)] 9 | 10 | return tensor -------------------------------------------------------------------------------- /advanced_rvc_inference/library/embedders/transformers.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import HubertModel 3 | 4 | class HubertModelWithFinalProj(HubertModel): 5 | def __init__(self, config): 6 | super().__init__(config) 7 | self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size) 8 | 9 | def extract_features(self, source, padding_mask = None, output_layer = None): 10 | return self.forward(source) -------------------------------------------------------------------------------- /advanced_rvc_inference/tools/pixeldrain.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | 4 | def pixeldrain(url, output_dir): 5 | try: 6 | response = requests.get(f"https://pixeldrain.com/api/file/{url.split('pixeldrain.com/u/')[1]}") 7 | 8 | if response.status_code == 200: 9 | file_path = os.path.join(output_dir, (response.headers.get("Content-Disposition").split("filename=")[-1].strip('";'))) 10 | 11 | with open(file_path, "wb") as newfile: 12 | newfile.write(response.content) 13 | return file_path 14 | else: return None 15 | except Exception as e: 16 | raise RuntimeError(e) -------------------------------------------------------------------------------- /advanced_rvc_inference/library/algorithm/normalization.py: -------------------------------------------------------------------------------- 
1 | import torch 2 | 3 | import torch.nn.functional as F 4 | 5 | class LayerNorm(torch.nn.Module): 6 | def __init__(self, channels, eps=1e-5, onnx=False): 7 | super().__init__() 8 | self.channels = channels 9 | self.eps = eps 10 | self.onnx = onnx 11 | self.gamma = torch.nn.Parameter(torch.ones(channels)) 12 | self.beta = torch.nn.Parameter(torch.zeros(channels)) 13 | 14 | def forward(self, x): 15 | x = x.transpose(1, -1) 16 | return (F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) if self.onnx else F.layer_norm(x, (x.size(-1),), self.gamma, self.beta, self.eps)).transpose(1, -1) -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/zluda/path-zluda-hip57.bat: -------------------------------------------------------------------------------- 1 | rmdir /S /q zluda 2 | curl -s -L https://github.com/lshqqytiger/ZLUDA/releases/download/rel.c0804ca624963aab420cb418412b1c7fbae3454b/ZLUDA-windows-rocm5-amd64.zip > zluda.zip 3 | tar -xf zluda.zip 4 | del zluda.zip 5 | 6 | if exist "runtime" ( 7 | copy zluda\cublas.dll runtime\Lib\site-packages\torch\lib\cublas64_11.dll /y 8 | copy zluda\cusparse.dll runtime\Lib\site-packages\torch\lib\cusparse64_11.dll /y 9 | copy zluda\nvrtc.dll runtime\Lib\site-packages\torch\lib\nvrtc64_112_0.dll /y 10 | ) else ( 11 | copy zluda\cublas.dll env\Lib\site-packages\torch\lib\cublas64_11.dll /y 12 | copy zluda\cusparse.dll env\Lib\site-packages\torch\lib\cusparse64_11.dll /y 13 | copy zluda\nvrtc.dll env\Lib\site-packages\torch\lib\nvrtc64_112_0.dll /y 14 | ) -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/zluda/path-zluda-hip61.bat: -------------------------------------------------------------------------------- 1 | rmdir /S /q zluda 2 | curl -s -L https://github.com/lshqqytiger/ZLUDA/releases/download/rel.c0804ca624963aab420cb418412b1c7fbae3454b/ZLUDA-windows-rocm6-amd64.zip > zluda.zip 3 | tar -xf zluda.zip 4 | del zluda.zip 5 | 6 | if exist "runtime" ( 7 | copy zluda\cublas.dll runtime\Lib\site-packages\torch\lib\cublas64_11.dll /y 8 | copy zluda\cusparse.dll runtime\Lib\site-packages\torch\lib\cusparse64_11.dll /y 9 | copy zluda\nvrtc.dll runtime\Lib\site-packages\torch\lib\nvrtc64_112_0.dll /y 10 | ) else ( 11 | copy zluda\cublas.dll env\Lib\site-packages\torch\lib\cublas64_11.dll /y 12 | copy zluda\cusparse.dll env\Lib\site-packages\torch\lib\cusparse64_11.dll /y 13 | copy zluda\nvrtc.dll env\Lib\site-packages\torch\lib\nvrtc64_112_0.dll /y 14 | ) -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/extracting/setup_path.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def setup_paths(exp_dir, version = None, rms_extract = False): 4 | wav_path = os.path.join(exp_dir, "sliced_audios_16k") 5 | 6 | if rms_extract: 7 | out_path = os.path.join(exp_dir, "energy") 8 | os.makedirs(out_path, exist_ok=True) 9 | 10 | return wav_path, out_path 11 | 12 | if version: 13 | out_path = os.path.join(exp_dir, f"{version}_extracted") 14 | os.makedirs(out_path, exist_ok=True) 15 | 16 | return wav_path, out_path 17 | else: 18 | output_root1, output_root2 = os.path.join(exp_dir, "f0"), os.path.join(exp_dir, "f0_voiced") 19 | os.makedirs(output_root1, exist_ok=True); os.makedirs(output_root2, exist_ok=True) 20 | 21 | return wav_path, output_root1, output_root2 
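# Illustrative usage sketch (hypothetical experiment directory), showing the three
# return shapes produced by the branches above: energy extraction, versioned
# feature extraction, and f0 extraction.
if __name__ == "__main__":
    example_exp_dir = os.path.join("assets", "logs", "example_experiment")
    sliced_16k, energy_dir = setup_paths(example_exp_dir, rms_extract=True)   # -> ("sliced_audios_16k", "energy")
    sliced_16k, feature_dir = setup_paths(example_exp_dir, version="v2")      # -> ("sliced_audios_16k", "v2_extracted")
    sliced_16k, f0_dir, f0_voiced_dir = setup_paths(example_exp_dir)          # -> ("sliced_audios_16k", "f0", "f0_voiced")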
--------------------------------------------------------------------------------
/advanced_rvc_inference/infer/train/extracting/setup_path.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | def setup_paths(exp_dir, version = None, rms_extract = False):
4 |     wav_path = os.path.join(exp_dir, "sliced_audios_16k")
5 | 
6 |     if rms_extract:
7 |         out_path = os.path.join(exp_dir, "energy")
8 |         os.makedirs(out_path, exist_ok=True)
9 | 
10 |         return wav_path, out_path
11 | 
12 |     if version:
13 |         out_path = os.path.join(exp_dir, f"{version}_extracted")
14 |         os.makedirs(out_path, exist_ok=True)
15 | 
16 |         return wav_path, out_path
17 |     else:
18 |         output_root1, output_root2 = os.path.join(exp_dir, "f0"), os.path.join(exp_dir, "f0_voiced")
19 |         os.makedirs(output_root1, exist_ok=True); os.makedirs(output_root2, exist_ok=True)
20 | 
21 |         return wav_path, output_root1, output_root2
--------------------------------------------------------------------------------
/advanced_rvc_inference/library/embedders/onnx.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import onnxruntime
3 | 
4 | class HubertModelONNX:
5 |     def __init__(self, embedder_model_path, providers, device):
6 |         sess_options = onnxruntime.SessionOptions()
7 |         sess_options.log_severity_level = 3
8 |         self.model = onnxruntime.InferenceSession(embedder_model_path, sess_options=sess_options, providers=providers)
9 |         self.final_proj = self._final_proj
10 |         self.device = device
11 | 
12 |     def _final_proj(self, source):
13 |         return source
14 | 
15 |     def extract_features(self, source, padding_mask = None, output_layer = None):
16 |         logits = self.model.run([self.model.get_outputs()[0].name, self.model.get_outputs()[1].name], {"feats": source.detach().cpu().numpy()})
17 |         return [torch.as_tensor(logits[int(output_layer != 9)], dtype=torch.float32, device=self.device)]
--------------------------------------------------------------------------------
/advanced_rvc_inference/run_tensorboard.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import time
4 | import logging
5 | import warnings
6 | import webbrowser
7 | 
8 | from tensorboard import program
9 | 
10 | sys.path.append(os.getcwd())
11 | 
12 | from advanced_rvc_inference.variables import config, translations, logger
13 | 
14 | def launch_tensorboard():
15 |     warnings.filterwarnings("ignore")
16 |     for l in ["root", "tensorboard"]:
17 |         logging.getLogger(l).setLevel(logging.ERROR)
18 | 
19 |     tb = program.TensorBoard()
20 |     tb.configure(argv=[None, "--logdir", config.configs["logs_path"], f"--port={config.configs['tensorboard_port']}"])
21 |     url = tb.launch()
22 | 
23 |     logger.info(f"{translations['tensorboard_url']}: {url}")
24 |     if "--open" in sys.argv: webbrowser.open(url)
25 | 
26 |     return f"{translations['tensorboard_url']}: {url}"
27 | 
28 | if __name__ == "__main__":
29 |     launch_tensorboard()
30 | 
31 |     while 1:
32 |         time.sleep(5)
--------------------------------------------------------------------------------
/advanced_rvc_inference/library/predictors/PENN/fcn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | 
3 | class FCN(torch.nn.Sequential):
4 |     def __init__(self, channels = 256, pitch_bins = 1440, pooling = (2, 2)):
5 |         super().__init__(*(Block(1, channels, 481, pooling), Block(channels, channels // 8, 225, pooling), Block(channels // 8, channels // 8, 97, 
pooling), Block(channels // 8, channels // 2, 66), Block(channels // 2, channels, 35), Block(channels, channels * 2, 4), torch.nn.Conv1d(channels * 2, pitch_bins, 4))) 6 | 7 | def forward(self, frames): 8 | return super().forward(frames[:, :, 16:-15]) 9 | 10 | class Block(torch.nn.Sequential): 11 | def __init__(self, in_channels, out_channels, length=1, pooling=None, kernel_size=32): 12 | layers = (torch.nn.Conv1d(in_channels, out_channels, kernel_size), torch.nn.ReLU()) 13 | if pooling is not None: layers += (torch.nn.MaxPool1d(*pooling),) 14 | layers += (torch.nn.LayerNorm((out_channels, length)),) 15 | super().__init__(*layers) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 ArkanDash 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /advanced_rvc_inference/tools/huggingface.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tqdm 3 | import requests 4 | 5 | try: 6 | import wget 7 | except: 8 | wget = None 9 | 10 | def HF_download_file(url, output_path=None): 11 | url = url.replace("/blob/", "/resolve/").replace("?download=true", "").strip() 12 | output_path = os.path.basename(url) if output_path is None else (os.path.join(output_path, os.path.basename(url)) if os.path.isdir(output_path) else output_path) 13 | 14 | if wget != None: wget.download(url, out=output_path) 15 | else: 16 | response = requests.get(url, stream=True, timeout=300) 17 | 18 | if response.status_code == 200: 19 | progress_bar = tqdm.tqdm(total=int(response.headers.get("content-length", 0)), desc=os.path.basename(url), ncols=100, unit="byte", leave=False) 20 | 21 | with open(output_path, "wb") as f: 22 | for chunk in response.iter_content(chunk_size=10 * 1024 * 1024): 23 | progress_bar.update(len(chunk)) 24 | f.write(chunk) 25 | 26 | progress_bar.close() 27 | else: raise ValueError(response.status_code) 28 | 29 | return output_path -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/training/training.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import gradio as gr 5 | 6 | sys.path.append(os.getcwd()) 7 | 8 | from advanced_rvc_inference.variables import translations, configs 9 | from advanced_rvc_inference.tabs.training.child.training import training_model_tab 10 | from advanced_rvc_inference.tabs.training.child.create_dataset import create_dataset_tab 11 | from advanced_rvc_inference.tabs.training.child.create_reference import create_reference_tab 12 | 13 | def training_tab(): 14 | with gr.TabItem(translations["training_model"], visible=configs.get("create_and_training_tab", True)): 15 | with gr.TabItem(translations["createdataset"], visible=configs.get("create_dataset_tab", True)): 16 | gr.Markdown(translations["create_dataset_markdown"]) 17 | create_dataset_tab() 18 | 19 | with gr.TabItem(translations["create_reference"], visible=configs.get("create_reference_tab", True)): 20 | gr.Markdown(translations["create_reference_markdown"]) 21 | create_reference_tab() 22 | 23 | with gr.TabItem(translations["training_model"], visible=configs.get("training_tab", True)): 24 | gr.Markdown(f"## {translations['training_model']}") 25 | training_model_tab() -------------------------------------------------------------------------------- /advanced_rvc_inference/library/backends/directml.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gc 3 | import sys 4 | import torch 5 | import subprocess 6 | 7 | sys.path.append(os.getcwd()) 8 | 9 | from advanced_rvc_inference.library.embedders import fairseq 10 | from advanced_rvc_inference.library.backends.utils import GRU 11 | 12 | try: 13 | import torch_directml 14 | except: 15 | torch_directml = None 16 | 17 | torch_available = torch_directml != None 18 | 19 | def device_count(): 20 | return torch_directml.device_count() if torch_available else 0 21 | 22 | def device_name(device_id = 0): 23 | return torch_directml.device_name(device_id) if torch_available else "" 24 | 25 | def is_available(): 26 | return torch_directml.is_available() if torch_available else False 27 | 28 | def empty_cache(): 
29 | empty_cache_path = os.path.join("main", "library", "backends", "dml_empty_cache", "empty_cache.exe") 30 | 31 | if torch_available and os.path.exists(empty_cache_path): 32 | subprocess.run([empty_cache_path], capture_output=True, text=True) 33 | gc.collect() 34 | 35 | def forward_dml(ctx, x, scale): 36 | ctx.scale = scale 37 | res = x.clone().detach() 38 | return res 39 | 40 | if torch_available: 41 | torch.nn.GRU = GRU 42 | fairseq.GradMultiply.forward = forward_dml -------------------------------------------------------------------------------- /advanced_rvc_inference/configs/v2/32000.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "learning_rate": 0.0001, 6 | "betas": [0.8, 0.99], 7 | "eps": 1e-09, 8 | "lr_decay": 0.999875, 9 | "segment_size": 12800, 10 | "c_mel": 45, 11 | "c_kl": 1.0 12 | }, 13 | "data": { 14 | "max_wav_value": 32768.0, 15 | "sample_rate": 32000, 16 | "filter_length": 1024, 17 | "hop_length": 320, 18 | "win_length": 1024, 19 | "n_mel_channels": 80, 20 | "mel_fmin": 0.0, 21 | "mel_fmax": null 22 | }, 23 | "model": { 24 | "inter_channels": 192, 25 | "hidden_channels": 192, 26 | "filter_channels": 768, 27 | "text_enc_hidden_dim": 768, 28 | "n_heads": 2, 29 | "n_layers": 6, 30 | "kernel_size": 3, 31 | "p_dropout": 0, 32 | "resblock": "1", 33 | "resblock_kernel_sizes": [3, 7, 11], 34 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 35 | "upsample_rates": [10, 8, 2, 2], 36 | "upsample_initial_channel": 512, 37 | "upsample_kernel_sizes": [20, 16, 4, 4], 38 | "use_spectral_norm": false, 39 | "gin_channels": 256, 40 | "spk_embed_dim": 109 41 | } 42 | } -------------------------------------------------------------------------------- /advanced_rvc_inference/configs/v2/40000.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "learning_rate": 0.0001, 6 | "betas": [0.8, 0.99], 7 | "eps": 1e-09, 8 | "lr_decay": 0.999875, 9 | "segment_size": 12800, 10 | "c_mel": 45, 11 | "c_kl": 1.0 12 | }, 13 | "data": { 14 | "max_wav_value": 32768.0, 15 | "sample_rate": 40000, 16 | "filter_length": 2048, 17 | "hop_length": 400, 18 | "win_length": 2048, 19 | "n_mel_channels": 125, 20 | "mel_fmin": 0.0, 21 | "mel_fmax": null 22 | }, 23 | "model": { 24 | "inter_channels": 192, 25 | "hidden_channels": 192, 26 | "filter_channels": 768, 27 | "text_enc_hidden_dim": 768, 28 | "n_heads": 2, 29 | "n_layers": 6, 30 | "kernel_size": 3, 31 | "p_dropout": 0, 32 | "resblock": "1", 33 | "resblock_kernel_sizes": [3, 7, 11], 34 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 35 | "upsample_rates": [10, 10, 2, 2], 36 | "upsample_initial_channel": 512, 37 | "upsample_kernel_sizes": [16, 16, 4, 4], 38 | "use_spectral_norm": false, 39 | "gin_channels": 256, 40 | "spk_embed_dim": 109 41 | } 42 | } -------------------------------------------------------------------------------- /advanced_rvc_inference/configs/v2/48000.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "learning_rate": 0.0001, 6 | "betas": [0.8, 0.99], 7 | "eps": 1e-09, 8 | "lr_decay": 0.999875, 9 | "segment_size": 17280, 10 | "c_mel": 45, 11 | "c_kl": 1.0 12 | }, 13 | "data": { 14 | "max_wav_value": 32768.0, 15 | "sample_rate": 48000, 16 | "filter_length": 2048, 17 | "hop_length": 480, 18 | "win_length": 2048, 19 
| "n_mel_channels": 128, 20 | "mel_fmin": 0.0, 21 | "mel_fmax": null 22 | }, 23 | "model": { 24 | "inter_channels": 192, 25 | "hidden_channels": 192, 26 | "filter_channels": 768, 27 | "text_enc_hidden_dim": 768, 28 | "n_heads": 2, 29 | "n_layers": 6, 30 | "kernel_size": 3, 31 | "p_dropout": 0, 32 | "resblock": "1", 33 | "resblock_kernel_sizes": [3, 7, 11], 34 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 35 | "upsample_rates": [12, 10, 2, 2], 36 | "upsample_initial_channel": 512, 37 | "upsample_kernel_sizes": [24, 20, 4, 4], 38 | "use_spectral_norm": false, 39 | "gin_channels": 256, 40 | "spk_embed_dim": 109 41 | } 42 | } -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/extra/child/read_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import gradio as gr 5 | 6 | sys.path.append(os.getcwd()) 7 | 8 | from advanced_rvc_inference.core.ui import shutil_move 9 | from advanced_rvc_inference.core.model_utils import model_info 10 | from advanced_rvc_inference.variables import translations, configs 11 | 12 | def read_model_tab(): 13 | with gr.Row(): 14 | gr.Markdown(translations["read_model_markdown_2"]) 15 | with gr.Row(): 16 | model = gr.File(label=translations["drop_model"], file_types=[".pth", ".onnx"]) 17 | with gr.Row(): 18 | read_button = gr.Button(translations["readmodel"], variant="primary", scale=2) 19 | with gr.Column(): 20 | model_path = gr.Textbox(label=translations["model_path"], value="", placeholder="assets/weights/Model.pth", info=translations["model_path_info"], interactive=True) 21 | output_info = gr.Textbox(label=translations["modelinfo"], value="", interactive=False, scale=6) 22 | with gr.Row(): 23 | model.upload(fn=lambda model: shutil_move(model.name, configs["weights_path"]), inputs=[model], outputs=[model_path]) 24 | read_button.click( 25 | fn=model_info, 26 | inputs=[model_path], 27 | outputs=[output_info], 28 | api_name="read_model" 29 | ) -------------------------------------------------------------------------------- /advanced_rvc_inference/library/embedders/ppg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | sys.path.append(os.getcwd()) 6 | 7 | from advanced_rvc_inference.library.speaker_diarization.whisper import Whisper, ModelDimensions, log_mel_spectrogram, pad_or_trim 8 | 9 | class WhisperModel(torch.nn.Module): 10 | def __init__(self, model_path, device): 11 | super().__init__() 12 | checkpoint = torch.load(model_path, map_location="cpu") 13 | dims = ModelDimensions(**checkpoint["dims"]) 14 | self.final_proj = torch.nn.Linear(dims.n_text_state, 768) 15 | self.model = Whisper(dims) 16 | self.model.load_state_dict(checkpoint["model_state_dict"]) 17 | self.model = self.model.to(device) 18 | del self.model.decoder 19 | 20 | def forward(self, audio): 21 | ppgln = audio.shape[1] // 320 22 | mel = log_mel_spectrogram(pad_or_trim(audio[0])).to(audio.device) 23 | 24 | with torch.no_grad(): 25 | ppg_raw = self.model.encoder(mel.unsqueeze(0)) 26 | ppg_projected = self.final_proj(ppg_raw) 27 | ppg = ppg_projected.data.float() 28 | ppg = ppg[:, :ppgln, :] 29 | 30 | return [ppg] 31 | 32 | def extract_features(self, source, padding_mask = None, output_layer = None): 33 | return self.forward(source) -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/zluda/path-zluda-hip62.bat: 
-------------------------------------------------------------------------------- 1 | rmdir /S /q zluda 2 | curl -s -L https://github.com/lshqqytiger/ZLUDA/releases/download/rel.5e717459179dc272b7d7d23391f0fad66c7459cf/ZLUDA-windows-rocm6-amd64.zip > zluda.zip 3 | tar -xf zluda.zip 4 | del zluda.zip 5 | 6 | if exist "runtime" ( 7 | copy runtime\Lib\site-packages\torch\lib\nvrtc64_112_0.dll runtime\Lib\site-packages\torch\lib\nvrtc_cuda.dll /y 8 | copy zluda\cublas.dll runtime\Lib\site-packages\torch\lib\cublas64_11.dll /y 9 | copy zluda\cusparse.dll runtime\Lib\site-packages\torch\lib\cusparse64_11.dll /y 10 | copy zluda\nvrtc.dll runtime\Lib\site-packages\torch\lib\nvrtc64_112_0.dll /y 11 | copy zluda\cufft.dll runtime\Lib\site-packages\torch\lib\cufft64_10.dll /y 12 | copy zluda\cufftw.dll runtime\Lib\site-packages\torch\lib\cufftw64_10.dll /y 13 | ) else ( 14 | copy env\Lib\site-packages\torch\lib\nvrtc64_112_0.dll env\Lib\site-packages\torch\lib\nvrtc_cuda.dll /y 15 | copy zluda\cublas.dll env\Lib\site-packages\torch\lib\cublas64_11.dll /y 16 | copy zluda\cusparse.dll env\Lib\site-packages\torch\lib\cusparse64_11.dll /y 17 | copy zluda\nvrtc.dll env\Lib\site-packages\torch\lib\nvrtc64_112_0.dll /y 18 | copy zluda\cufft.dll env\Lib\site-packages\torch\lib\cufft64_10.dll /y 19 | copy zluda\cufftw.dll env\Lib\site-packages\torch\lib\cufftw64_10.dll /y 20 | ) 21 | pause -------------------------------------------------------------------------------- /advanced_rvc_inference/library/uvr5_lib/vr_network/model_param_init.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pickle 3 | 4 | default_param = {} 5 | default_param["bins"] = -1 6 | default_param["unstable_bins"] = -1 7 | default_param["stable_bins"] = -1 8 | default_param["sr"] = 44100 9 | default_param["pre_filter_start"] = -1 10 | default_param["pre_filter_stop"] = -1 11 | default_param["band"] = {} 12 | 13 | N_BINS = "n_bins" 14 | 15 | def int_keys(pairs): 16 | result_dict = {} 17 | 18 | for key, value in pairs: 19 | if isinstance(key, str) and key.isdigit(): key = int(key) 20 | result_dict[key] = value 21 | 22 | return result_dict 23 | 24 | class ModelParameters(object): 25 | def __init__(self, config_path="", key_in_bin=None): 26 | if config_path.endswith(".bin"): 27 | with open(config_path, "rb") as f: 28 | data = pickle.load(f) 29 | self.param = data[key_in_bin] 30 | else: 31 | with open(config_path, "r", encoding="utf-8") as f: 32 | self.param = json.loads(f.read(), object_pairs_hook=int_keys) 33 | 34 | for k in ["mid_side", "mid_side_b", "mid_side_b2", "stereo_w", "stereo_n", "reverse"]: 35 | if k not in self.param: 36 | self.param[k] = False 37 | 38 | if N_BINS in self.param: 39 | self.param["bins"] = self.param[N_BINS] -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/train/training/losses.py: -------------------------------------------------------------------------------- 1 | def feature_loss(fmap_r, fmap_g): 2 | loss = 0 3 | for dr, dg in zip(fmap_r, fmap_g): 4 | for rl, gl in zip(dr, dg): 5 | loss += (rl.float().detach() - gl.float()).abs().mean() 6 | 7 | return loss * 2 8 | 9 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 10 | loss = 0 11 | r_losses, g_losses = [], [] 12 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 13 | dr = dr.float() 14 | dg = dg.float() 15 | r_loss = ((1 - dr) ** 2).mean() 16 | g_loss = (dg**2).mean() 17 | loss += r_loss + g_loss 18 | 
r_losses.append(r_loss.item()) 19 | g_losses.append(g_loss.item()) 20 | 21 | return loss, r_losses, g_losses 22 | 23 | def generator_loss(disc_outputs): 24 | loss = 0 25 | gen_losses = [] 26 | for dg in disc_outputs: 27 | l = ((1 - dg.float()) ** 2).mean() 28 | gen_losses.append(l) 29 | loss += l 30 | 31 | return loss, gen_losses 32 | 33 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 34 | z_p = z_p.float() 35 | logs_q = logs_q.float() 36 | m_p = m_p.float() 37 | logs_p = logs_p.float() 38 | z_mask = z_mask.float() 39 | 40 | kl = logs_p - logs_q - 0.5 41 | kl += 0.5 * ((z_p - m_p) ** 2) * (-2.0 * logs_p).exp() 42 | 43 | return (kl * z_mask).sum() / z_mask.sum() -------------------------------------------------------------------------------- /advanced_rvc_inference/tools/mediafire.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import requests 4 | 5 | from bs4 import BeautifulSoup 6 | 7 | def Mediafire_Download(url, output=None, filename=None): 8 | if not filename: filename = url.split('/')[-2] 9 | if not output: output = os.path.dirname(os.path.realpath(__file__)) 10 | output_file = os.path.join(output, filename) 11 | 12 | sess = requests.session() 13 | sess.headers.update({"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6)"}) 14 | 15 | try: 16 | with requests.get(BeautifulSoup(sess.get(url).content, "html.parser").find(id="downloadButton").get("href"), stream=True) as r: 17 | r.raise_for_status() 18 | 19 | with open(output_file, "wb") as f: 20 | total_length = int(r.headers.get('content-length')) 21 | download_progress = 0 22 | 23 | for chunk in r.iter_content(chunk_size=1024): 24 | download_progress += len(chunk) 25 | f.write(chunk) 26 | 27 | sys.stdout.write(f"\r[{filename}]: {int(100 * download_progress / total_length)}% ({round(download_progress / 1024 / 1024, 2)}mb/{round(total_length / 1024 / 1024, 2)}mb)") 28 | sys.stdout.flush() 29 | 30 | sys.stdout.write("\n") 31 | return output_file 32 | except Exception as e: 33 | raise RuntimeError(e) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Platform-specific requirements section 2 | pip>=23.3 3 | wheel 4 | omegaconf>=2.0.6 5 | onnxruntime; sys_platform == 'darwin' 6 | onnxruntime-gpu; sys_platform != 'darwin' 7 | 8 | # Core dependencies 9 | PyYAML>=6.0 10 | tiktoken 11 | hyperpyyaml 12 | torch>=2.3.1 13 | tqdm>=4.63.1 14 | sortedcontainers 15 | torchvision>=0.18.1 16 | torchaudio>=2.3.1 17 | torchcodec>=0.8.1 18 | 19 | faiss-cpu==1.7.3; python_version < "3.12" 20 | faiss-cpu>=1.7.3; python_version >= "3.12" 21 | 22 | # Machine learning, NLP and deep learning 23 | transformers>=4.49.0 24 | scikit-learn 25 | einops>=0.8.0 26 | 27 | # Pitch and sound processing 28 | librosa>=0.10.2 29 | pydub>=0.25.1 30 | praat-parselmouth 31 | soundfile>=0.13.0 32 | pedalboard 33 | 34 | # Data processing and calculation 35 | numpy>=1.25.2,<2.0.0 36 | numba>=0.57.0 37 | scipy>=1.15.0 38 | matplotlib>=3.7.2 39 | 40 | # Implementation and web framework 41 | gradio>=5.23.3,<6.0.0 42 | requests>=2.32.3 43 | aiohttp 44 | pysrt 45 | 46 | # Utility section 47 | yt-dlp 48 | edge-tts>=7.2.0 49 | ffmpy==0.3.1 50 | ffmpeg-python>=0.2.0 51 | beautifulsoup4 52 | 53 | # Tensorboard and ONNX 54 | tensorboard 55 | onnx>=1.14 56 | onnxslim 57 | onnx2torch>=1.5.15 58 | 59 | # Cryptography section 60 | pycryptodome>=3.9.6,<4.0.0 61 | 62 | # Realtime and 
VAD 63 | sounddevice>=0.5.2 64 | webrtcvad-wheels>=2.0.14 65 | -------------------------------------------------------------------------------- /advanced_rvc_inference/configs/v1/32000.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 0.0001, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-09, 9 | "batch_size": 4, 10 | "lr_decay": 0.999875, 11 | "segment_size": 12800, 12 | "init_lr_ratio": 1, 13 | "warmup_epochs": 0, 14 | "c_mel": 45, 15 | "c_kl": 1.0 16 | }, 17 | "data": { 18 | "max_wav_value": 32768.0, 19 | "sample_rate": 32000, 20 | "filter_length": 1024, 21 | "hop_length": 320, 22 | "win_length": 1024, 23 | "n_mel_channels": 80, 24 | "mel_fmin": 0.0, 25 | "mel_fmax": null 26 | }, 27 | "model": { 28 | "inter_channels": 192, 29 | "hidden_channels": 192, 30 | "filter_channels": 768, 31 | "text_enc_hidden_dim": 256, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3, 7, 11], 38 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 39 | "upsample_rates": [10, 4, 2, 2, 2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16, 16, 4, 4, 4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } -------------------------------------------------------------------------------- /advanced_rvc_inference/configs/v1/40000.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 0.0001, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-09, 9 | "batch_size": 4, 10 | "lr_decay": 0.999875, 11 | "segment_size": 12800, 12 | "init_lr_ratio": 1, 13 | "warmup_epochs": 0, 14 | "c_mel": 45, 15 | "c_kl": 1.0 16 | }, 17 | "data": { 18 | "max_wav_value": 32768.0, 19 | "sample_rate": 40000, 20 | "filter_length": 2048, 21 | "hop_length": 400, 22 | "win_length": 2048, 23 | "n_mel_channels": 125, 24 | "mel_fmin": 0.0, 25 | "mel_fmax": null 26 | }, 27 | "model": { 28 | "inter_channels": 192, 29 | "hidden_channels": 192, 30 | "filter_channels": 768, 31 | "text_enc_hidden_dim": 256, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3, 7, 11], 38 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 39 | "upsample_rates": [10, 10, 2, 2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16, 16, 4, 4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } -------------------------------------------------------------------------------- /advanced_rvc_inference/configs/v1/48000.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 0.0001, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-09, 9 | "batch_size": 4, 10 | "lr_decay": 0.999875, 11 | "segment_size": 11520, 12 | "init_lr_ratio": 1, 13 | "warmup_epochs": 0, 14 | "c_mel": 45, 15 | "c_kl": 1.0 16 | }, 17 | "data": { 18 | "max_wav_value": 32768.0, 19 | "sample_rate": 48000, 20 | "filter_length": 2048, 21 | "hop_length": 480, 22 | "win_length": 2048, 23 | "n_mel_channels": 128, 24 | "mel_fmin": 0.0, 25 | "mel_fmax": null 26 | }, 27 | "model": { 28 | "inter_channels": 192, 29 
| "hidden_channels": 192, 30 | "filter_channels": 768, 31 | "text_enc_hidden_dim": 256, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3, 7, 11], 38 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 39 | "upsample_rates": [10, 6, 2, 2, 2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16, 16, 4, 4, 4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/RMVPE/e2e.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | import torch.nn as nn 6 | 7 | sys.path.append(os.getcwd()) 8 | 9 | from advanced_rvc_inference.library.predictors.RMVPE.deepunet import DeepUnet 10 | 11 | N_MELS, N_CLASS = 128, 360 12 | 13 | class BiGRU(nn.Module): 14 | def __init__(self, input_features, hidden_features, num_layers): 15 | super(BiGRU, self).__init__() 16 | self.gru = nn.GRU(input_features, hidden_features, num_layers=num_layers, batch_first=True, bidirectional=True) 17 | 18 | def forward(self, x): 19 | try: 20 | return self.gru(x)[0] 21 | except: 22 | torch.backends.cudnn.enabled = False 23 | return self.gru(x)[0] 24 | 25 | class E2E(nn.Module): 26 | def __init__(self, n_blocks, n_gru, kernel_size, en_de_layers=5, inter_layers=4, in_channels=1, en_out_channels=16): 27 | super(E2E, self).__init__() 28 | self.unet = DeepUnet(kernel_size, n_blocks, en_de_layers, inter_layers, in_channels, en_out_channels) 29 | self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) 30 | self.fc = nn.Sequential(BiGRU(3 * 128, 256, n_gru), nn.Linear(512, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()) if n_gru else nn.Sequential(nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()) 31 | 32 | def forward(self, mel): 33 | return self.fc(self.cnn(self.unet(mel.transpose(-1, -2).unsqueeze(1))).transpose(1, 2).flatten(-2)) -------------------------------------------------------------------------------- /advanced_rvc_inference/core/restart.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import platform 5 | import subprocess 6 | 7 | sys.path.append(os.getcwd()) 8 | 9 | from advanced_rvc_inference.core.ui import gr_info 10 | from advanced_rvc_inference.variables import python, translations, configs_json 11 | 12 | def restart_app(app): 13 | gr_info(translations["30s"]) 14 | os.system("cls" if platform.system() == "Windows" else "clear") 15 | 16 | app.close() 17 | subprocess.run([python, os.path.join("advanced_rvc_inference", "app.py")] + [arg for arg in sys.argv[1:] if arg != "--open"]) 18 | 19 | def change_language(lang, app): 20 | configs = json.load(open(configs_json, "r")) 21 | 22 | if lang != configs["language"]: 23 | configs["language"] = lang 24 | 25 | with open(configs_json, "w") as f: 26 | json.dump(configs, f, indent=4) 27 | 28 | restart_app(app) 29 | 30 | def change_theme(theme, app): 31 | configs = json.load(open(configs_json, "r")) 32 | 33 | if theme != configs["theme"]: 34 | configs["theme"] = theme 35 | with open(configs_json, "w") as f: 36 | json.dump(configs, f, indent=4) 37 | 38 | restart_app(app) 39 | 40 | def change_font(font, app): 41 | configs = json.load(open(configs_json, "r")) 42 | 43 | if font != configs["font"]: 44 | configs["font"] = font 45 
| with open(configs_json, "w") as f: 46 | json.dump(configs, f, indent=4) 47 | 48 | restart_app(app) -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/inference/inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import gradio as gr 5 | 6 | sys.path.append(os.getcwd()) 7 | 8 | from advanced_rvc_inference.variables import translations, configs 9 | from advanced_rvc_inference.tabs.inference.child.convert import convert_tab 10 | from advanced_rvc_inference.tabs.inference.child.separate import separate_tab 11 | from advanced_rvc_inference.tabs.inference.child.convert_tts import convert_tts_tab 12 | from advanced_rvc_inference.tabs.inference.child.convert_with_whisper import convert_with_whisper_tab 13 | 14 | def inference_tab(): 15 | with gr.TabItem(translations["inference"], visible=configs.get("inference_tab", True)): 16 | with gr.TabItem(translations["convert_audio"], visible=configs.get("convert_tab", True)): 17 | gr.Markdown(f"## {translations['convert_audio']}") 18 | convert_tab() 19 | with gr.TabItem(translations["separator_tab"], visible=configs.get("separator_tab", True)): 20 | gr.Markdown(f"## {translations['separator_tab']}") 21 | separate_tab() 22 | 23 | with gr.TabItem(translations["convert_with_whisper"], visible=configs.get("convert_with_whisper", True)): 24 | gr.Markdown(f"## {translations['convert_with_whisper']}") 25 | convert_with_whisper_tab() 26 | 27 | with gr.TabItem(translations["convert_text"], visible=configs.get("tts_tab", True)): 28 | gr.Markdown(translations["convert_text_markdown"]) 29 | convert_tts_tab() 30 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/DJCM/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import torch.nn as nn 5 | 6 | sys.path.append(os.getcwd()) 7 | 8 | from advanced_rvc_inference.library.predictors.DJCM.decoder import PE_Decoder 9 | from advanced_rvc_inference.library.predictors.DJCM.utils import init_bn, WINDOW_LENGTH 10 | from advanced_rvc_inference.library.predictors.DJCM.encoder import ResEncoderBlock, Encoder 11 | 12 | class LatentBlocks(nn.Module): 13 | def __init__(self, n_blocks, latent_layers): 14 | super(LatentBlocks, self).__init__() 15 | self.latent_blocks = nn.ModuleList([ 16 | ResEncoderBlock(384, 384, n_blocks, None) 17 | for _ in range(latent_layers) 18 | ]) 19 | 20 | def forward(self, x): 21 | for layer in self.latent_blocks: 22 | x = layer(x) 23 | 24 | return x 25 | 26 | class DJCMM(nn.Module): 27 | def __init__(self, in_channels, n_blocks, latent_layers): 28 | super(DJCMM, self).__init__() 29 | self.bn = nn.BatchNorm2d(WINDOW_LENGTH // 2 + 1, momentum=0.01) 30 | self.pe_encoder = Encoder(in_channels, n_blocks) 31 | self.pe_latent = LatentBlocks(n_blocks, latent_layers) 32 | self.pe_decoder = PE_Decoder(n_blocks) 33 | init_bn(self.bn) 34 | 35 | def forward(self, spec): 36 | x = self.bn(spec.transpose(1, 3)).transpose(1, 3)[..., :-1] 37 | x, concat_tensors = self.pe_encoder(x) 38 | pe_out = self.pe_decoder(self.pe_latent(x), concat_tensors) 39 | 40 | return pe_out -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/extra/child/convert_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import gradio as 
gr 5 | 6 | sys.path.append(os.getcwd()) 7 | 8 | from advanced_rvc_inference.core.ui import visible, shutil_move 9 | from advanced_rvc_inference.core.model_utils import onnx_export 10 | from advanced_rvc_inference.variables import translations, configs 11 | 12 | def convert_model_tab(): 13 | with gr.Row(): 14 | gr.Markdown(translations["pytorch2onnx_markdown"]) 15 | with gr.Row(): 16 | model_pth_upload = gr.File(label=translations["drop_model"], file_types=[".pth"]) 17 | with gr.Row(): 18 | convert_onnx = gr.Button(translations["convert_model"], variant="primary", scale=2) 19 | with gr.Row(): 20 | model_pth_path = gr.Textbox(label=translations["model_path"], value="", placeholder="assets/weights/Model.pth", info=translations["model_path_info"], interactive=True) 21 | with gr.Row(): 22 | output_model2 = gr.File(label=translations["output_model_path"], file_types=[".pth", ".onnx"], interactive=False, visible=False) 23 | with gr.Row(): 24 | model_pth_upload.upload(fn=lambda model_pth_upload: shutil_move(model_pth_upload.name, configs["weights_path"]), inputs=[model_pth_upload], outputs=[model_pth_path]) 25 | convert_onnx.click( 26 | fn=onnx_export, 27 | inputs=[model_pth_path], 28 | outputs=[output_model2], 29 | api_name="model_onnx_export" 30 | ) 31 | convert_onnx.click(fn=lambda: visible(True), inputs=[], outputs=[output_model2]) -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/DJCM/spec.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | import numpy as np 6 | import torch.nn as nn 7 | 8 | sys.path.append(os.getcwd()) 9 | 10 | class Spectrogram(nn.Module): 11 | def __init__(self, hop_length, win_length, n_fft=None, clamp=1e-10): 12 | super(Spectrogram, self).__init__() 13 | self.n_fft = win_length if n_fft is None else n_fft 14 | self.hop_length = hop_length 15 | self.win_length = win_length 16 | self.clamp = clamp 17 | self.register_buffer("window", torch.hann_window(win_length), persistent=False) 18 | 19 | def forward(self, audio, center=True): 20 | bs, c, segment_samples = audio.shape 21 | audio = audio.reshape(bs * c, segment_samples) 22 | 23 | if str(audio.device).startswith(("ocl", "privateuseone")): 24 | if not hasattr(self, "stft"): 25 | from main.library.backends.utils import STFT 26 | self.stft = STFT(filter_length=self.n_fft, hop_length=self.hop_length, win_length=self.win_length).to(audio.device) 27 | magnitude = self.stft.transform(audio, 1e-9) 28 | else: 29 | fft = torch.stft(audio, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, window=self.window, center=center, pad_mode="reflect", return_complex=True) 30 | magnitude = (fft.real.pow(2) + fft.imag.pow(2)).sqrt() 31 | 32 | mag = magnitude.transpose(1, 2).clamp(self.clamp, np.inf) 33 | mag = mag.reshape(bs, c, mag.shape[1], mag.shape[2]) 34 | 35 | return mag -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/realtime/vad_utils.py: -------------------------------------------------------------------------------- 1 | import webrtcvad 2 | 3 | import numpy as np 4 | 5 | class VADProcessor: 6 | def __init__(self, sensitivity_mode=3, sample_rate=16000, frame_duration_ms=30): 7 | if sample_rate not in [8000, 16000]: raise ValueError 8 | if frame_duration_ms not in [10, 20, 30]: raise ValueError 9 | 10 | self.vad = webrtcvad.Vad(sensitivity_mode) 11 | self.sample_rate = sample_rate 12 | 
self.frame_length = int(sample_rate * (frame_duration_ms / 1000.0)) 13 | 14 | def is_speech(self, audio_chunk): 15 | if audio_chunk.ndim > 1 and audio_chunk.shape[1] == 1: audio_chunk = audio_chunk.flatten() 16 | elif audio_chunk.ndim > 1: audio_chunk = np.mean(audio_chunk, axis=1) 17 | 18 | if np.max(np.abs(audio_chunk)) > 1.0: audio_chunk = np.clip(audio_chunk, -1.0, 1.0) 19 | 20 | audio_chunk = (audio_chunk * 32767).astype(np.int16) 21 | num_frames = len(audio_chunk) // self.frame_length 22 | 23 | if num_frames == 0 and len(audio_chunk) > 0: 24 | audio_chunk = np.concatenate((audio_chunk, np.zeros(self.frame_length - len(audio_chunk), dtype=np.int16))) 25 | num_frames = 1 26 | elif num_frames == 0 and len(audio_chunk) == 0: return False 27 | 28 | try: 29 | for i in range(num_frames): 30 | start = i * self.frame_length 31 | if self.vad.is_speech(audio_chunk[start:start + self.frame_length].tobytes(), self.sample_rate): return True 32 | 33 | return False 34 | except Exception: 35 | return False -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/CREPE/filter.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def mean(signals, win_length=9): 4 | assert signals.dim() == 2 5 | 6 | signals = signals.unsqueeze(1) 7 | mask = ~torch.isnan(signals) 8 | padding = win_length // 2 9 | 10 | ones_kernel = torch.ones(signals.size(1), 1, win_length, device=signals.device) 11 | avg_pooled = torch.nn.functional.conv1d(torch.where(mask, signals, torch.zeros_like(signals)), ones_kernel, stride=1, padding=padding) / torch.nn.functional.conv1d(mask.float(), ones_kernel, stride=1, padding=padding).clamp(min=1) 12 | avg_pooled[avg_pooled == 0] = float("nan") 13 | 14 | return avg_pooled.squeeze(1) 15 | 16 | def median(signals, win_length): 17 | assert signals.dim() == 2 18 | 19 | signals = signals.unsqueeze(1) 20 | mask = ~torch.isnan(signals) 21 | padding = win_length // 2 22 | 23 | x = torch.nn.functional.pad(torch.where(mask, signals, torch.zeros_like(signals)), (padding, padding), mode="reflect") 24 | mask = torch.nn.functional.pad(mask.float(), (padding, padding), mode="constant", value=0) 25 | 26 | x = x.unfold(2, win_length, 1) 27 | mask = mask.unfold(2, win_length, 1) 28 | 29 | x = x.contiguous().view(x.size()[:3] + (-1,)) 30 | mask = mask.contiguous().view(mask.size()[:3] + (-1,)) 31 | 32 | x_sorted, _ = torch.where(mask.bool(), x.float(), float("inf")).to(x).sort(dim=-1) 33 | 34 | median_pooled = x_sorted.gather(-1, ((mask.sum(dim=-1) - 1) // 2).clamp(min=0).unsqueeze(-1).long()).squeeze(-1) 35 | median_pooled[torch.isinf(median_pooled)] = float("nan") 36 | 37 | return median_pooled.squeeze(1) -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/DJCM/encoder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import torch.nn as nn 5 | 6 | sys.path.append(os.getcwd()) 7 | 8 | from advanced_rvc_inference.library.predictors.DJCM.utils import ResConvBlock 9 | 10 | class ResEncoderBlock(nn.Module): 11 | def __init__(self, in_channels, out_channels, n_blocks, kernel_size): 12 | super(ResEncoderBlock, self).__init__() 13 | self.conv = nn.ModuleList([ResConvBlock(in_channels, out_channels)]) 14 | for _ in range(n_blocks - 1): 15 | self.conv.append(ResConvBlock(out_channels, out_channels)) 16 | 17 | self.pool = nn.MaxPool2d(kernel_size) if 
kernel_size is not None else None 18 | 19 | def forward(self, x): 20 | for each_layer in self.conv: 21 | x = each_layer(x) 22 | 23 | if self.pool is not None: return x, self.pool(x) 24 | return x 25 | 26 | class Encoder(nn.Module): 27 | def __init__(self, in_channels, n_blocks): 28 | super(Encoder, self).__init__() 29 | self.en_blocks = nn.ModuleList([ 30 | ResEncoderBlock(in_channels, 32, n_blocks, (1, 2)), 31 | ResEncoderBlock(32, 64, n_blocks, (1, 2)), 32 | ResEncoderBlock(64, 128, n_blocks, (1, 2)), 33 | ResEncoderBlock(128, 256, n_blocks, (1, 2)), 34 | ResEncoderBlock(256, 384, n_blocks, (1, 2)), 35 | ResEncoderBlock(384, 384, n_blocks, (1, 2)) 36 | ]) 37 | 38 | def forward(self, x): 39 | concat_tensors = [] 40 | 41 | for layer in self.en_blocks: 42 | _, x = layer(x) 43 | concat_tensors.append(_) 44 | 45 | return x, concat_tensors -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/FCPE/wav2mel.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | from torchaudio.transforms import Resample 6 | 7 | sys.path.append(os.getcwd()) 8 | 9 | from advanced_rvc_inference.library.predictors.FCPE.stft import STFT 10 | 11 | class Wav2Mel: 12 | def __init__(self, device=None, dtype=torch.float32): 13 | self.sample_rate = 16000 14 | self.hop_size = 160 15 | if device is None: device = "cuda" if torch.cuda.is_available() else "cpu" 16 | self.device = device 17 | self.dtype = dtype 18 | self.stft = STFT(16000, 128, 1024, 1024, 160, 0, 8000) 19 | self.resample_kernel = {} 20 | 21 | def extract_nvstft(self, audio, keyshift=0, train=False): 22 | return self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(1, 2) 23 | 24 | def extract_mel(self, audio, sample_rate, keyshift=0, train=False): 25 | audio = audio.to(self.dtype).to(self.device) 26 | if sample_rate == self.sample_rate: audio_res = audio 27 | else: 28 | key_str = str(sample_rate) 29 | if key_str not in self.resample_kernel: self.resample_kernel[key_str] = Resample(sample_rate, self.sample_rate, lowpass_filter_width=128) 30 | self.resample_kernel[key_str] = (self.resample_kernel[key_str].to(self.dtype).to(self.device)) 31 | audio_res = self.resample_kernel[key_str](audio) 32 | 33 | mel = self.extract_nvstft(audio_res, keyshift=keyshift, train=train) 34 | n_frames = int(audio.shape[1] // self.hop_size) + 1 35 | mel = (torch.cat((mel, mel[:, -1:, :]), 1) if n_frames > int(mel.shape[1]) else mel) 36 | return mel[:, :n_frames, :] if n_frames < int(mel.shape[1]) else mel 37 | 38 | def __call__(self, audio, sample_rate, keyshift=0, train=False): 39 | return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train) -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/extra/extra.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import gradio as gr 5 | 6 | sys.path.append(os.getcwd()) 7 | 8 | from advanced_rvc_inference.variables import translations, configs 9 | from advanced_rvc_inference.tabs.extra.child.fushion import fushion_tab 10 | from advanced_rvc_inference.tabs.extra.child.settings import settings_tab 11 | from advanced_rvc_inference.tabs.extra.child.read_model import read_model_tab 12 | from advanced_rvc_inference.tabs.extra.child.f0_extract import f0_extract_tab 13 | from advanced_rvc_inference.tabs.extra.child.create_srt import 
create_srt_tab 14 | from advanced_rvc_inference.tabs.extra.child.convert_model import convert_model_tab 15 | 16 | def extra_tab(app): 17 | with gr.TabItem(translations["extra"], visible=configs.get("extra_tab", True)): 18 | with gr.TabItem(translations["fushion"], visible=configs.get("fushion_tab", True)): 19 | gr.Markdown(translations["fushion_markdown"]) 20 | fushion_tab() 21 | 22 | with gr.TabItem(translations["read_model"], visible=configs.get("read_tab", True)): 23 | gr.Markdown(translations["read_model_markdown"]) 24 | read_model_tab() 25 | 26 | with gr.TabItem(translations["convert_model"], visible=configs.get("onnx_tab", True)): 27 | gr.Markdown(translations["pytorch2onnx"]) 28 | convert_model_tab() 29 | 30 | with gr.TabItem(translations["f0_extractor_tab"], visible=configs.get("f0_extractor_tab", True)): 31 | gr.Markdown(translations["f0_extractor_markdown"]) 32 | f0_extract_tab() 33 | 34 | with gr.TabItem(translations["create_srt_tab"], visible=configs.get("create_srt_tab", True)): 35 | gr.Markdown(translations["create_srt_markdown"]) 36 | create_srt_tab() 37 | 38 | with gr.TabItem(translations["settings"], visible=configs.get("settings_tab", True)): 39 | gr.Markdown(translations["settings_markdown"]) 40 | settings_tab(app) -------------------------------------------------------------------------------- /advanced_rvc_inference/core/f0_extract.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.append(os.getcwd()) 5 | 6 | from advanced_rvc_inference.core.ui import gr_info, gr_warning 7 | from advanced_rvc_inference.variables import config, translations, configs 8 | 9 | def f0_extract(audio, f0_method, f0_onnx): 10 | if not audio or not os.path.exists(audio) or os.path.isdir(audio): 11 | gr_warning(translations["input_not_valid"]) 12 | return [None]*2 13 | 14 | import librosa 15 | import numpy as np 16 | import matplotlib.pyplot as plt 17 | 18 | from advanced_rvc_inference.library.utils import check_assets, load_audio 19 | from advanced_rvc_inference.library.predictors.Generator import Generator 20 | 21 | check_assets(f0_method, "", f0_onnx, "") 22 | 23 | f0_path = os.path.join(configs["f0_path"], os.path.splitext(os.path.basename(audio))[0]) 24 | image_path = os.path.join(f0_path, "f0.png") 25 | txt_path = os.path.join(f0_path, "f0.txt") 26 | 27 | gr_info(translations["start_extract"]) 28 | 29 | if not os.path.exists(f0_path): os.makedirs(f0_path, exist_ok=True) 30 | 31 | y = load_audio(audio, sample_rate=16000) 32 | f0_generator = Generator(16000, 160, 50, 1100, 0.5, is_half=config.is_half, device=config.device, f0_onnx_mode=f0_onnx, del_onnx_model=f0_onnx) 33 | _, pitchf = f0_generator.calculator(config.x_pad, f0_method, y, 0, None, 3, False, 0, None, False) 34 | 35 | F_temp = np.array(pitchf, dtype=np.float32) 36 | F_temp[F_temp == 0] = np.nan 37 | 38 | f0 = 1200 * np.log2(F_temp / librosa.midi_to_hz(0)) 39 | 40 | plt.figure(figsize=(10, 4)) 41 | plt.plot(f0) 42 | plt.title(f0_method) 43 | plt.xlabel(translations["time_frames"]) 44 | plt.ylabel(translations["Frequency"]) 45 | plt.savefig(image_path) 46 | plt.close() 47 | 48 | with open(txt_path, "w") as f: 49 | for i, f0_value in enumerate(f0): 50 | f.write(f"{i * 100.0},{f0_value}\n") 51 | 52 | gr_info(translations["extract_done"]) 53 | 54 | return [txt_path, image_path] -------------------------------------------------------------------------------- /advanced_rvc_inference/library/uvr5_lib/demucs/states.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import inspect 5 | import warnings 6 | import functools 7 | 8 | sys.path.append(os.getcwd()) 9 | 10 | from advanced_rvc_inference.variables import translations 11 | 12 | def load_model(path_or_package, strict=False): 13 | if isinstance(path_or_package, dict): package = path_or_package 14 | elif isinstance(path_or_package, (str, os.PathLike)): 15 | with warnings.catch_warnings(): 16 | warnings.simplefilter("ignore") 17 | 18 | package = torch.load(path_or_package, map_location="cpu", weights_only=False) 19 | else: raise ValueError(f"{translations['type_not_valid']} {path_or_package}.") 20 | 21 | klass = package["klass"] 22 | args = package["args"] 23 | kwargs = package["kwargs"] 24 | 25 | if strict: model = klass(*args, **kwargs) 26 | else: 27 | sig = inspect.signature(klass) 28 | 29 | for key in list(kwargs): 30 | if key not in sig.parameters: 31 | warnings.warn(translations["del_parameter"] + key) 32 | 33 | del kwargs[key] 34 | 35 | model = klass(*args, **kwargs) 36 | 37 | state = package["state"] 38 | 39 | set_state(model, state) 40 | 41 | return model 42 | 43 | def restore_quantized_state(model, state): 44 | assert "meta" in state 45 | 46 | quantizer = state["meta"]["klass"](model, **state["meta"]["init_kwargs"]) 47 | 48 | quantizer.restore_quantized_state(state) 49 | 50 | quantizer.detach() 51 | 52 | def set_state(model, state, quantizer=None): 53 | if state.get("__quantized"): 54 | if quantizer is not None: quantizer.restore_quantized_state(model, state["quantized"]) 55 | else: restore_quantized_state(model, state) 56 | else: model.load_state_dict(state) 57 | 58 | return state 59 | 60 | def capture_init(init): 61 | @functools.wraps(init) 62 | def __init__(self, *args, **kwargs): 63 | self._init_args_kwargs = (args, kwargs) 64 | 65 | init(self, *args, **kwargs) 66 | 67 | return __init__ -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/PENN/core.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | 4 | import torch.nn.functional as F 5 | 6 | PITCH_BINS, CENTS_PER_BIN, OCTAVE = 1440, 5, 1200 7 | 8 | def frequency_to_bins(frequency, quantize_fn=torch.floor): 9 | return cents_to_bins(frequency_to_cents(frequency), quantize_fn) 10 | 11 | def cents_to_bins(cents, quantize_fn=torch.floor): 12 | bins = quantize_fn(cents / CENTS_PER_BIN).long() 13 | bins[bins < 0] = 0 14 | bins[bins >= PITCH_BINS] = PITCH_BINS - 1 15 | return bins 16 | 17 | def cents_to_frequency(cents): 18 | return 31 * 2 ** (cents / OCTAVE) 19 | 20 | def bins_to_cents(bins): 21 | return CENTS_PER_BIN * bins 22 | 23 | def frequency_to_cents(frequency): 24 | return OCTAVE * (frequency / 31).log2() 25 | 26 | def seconds_to_samples(seconds, sample_rate=8000): 27 | return seconds * sample_rate 28 | 29 | def interpolate(pitch, periodicity, value): 30 | voiced = periodicity > value 31 | if not voiced.any(): return pitch 32 | 33 | pitch = pitch.log2() 34 | pitch[..., 0] = pitch[voiced][..., 0] 35 | pitch[..., -1] = pitch[voiced][..., -1] 36 | voiced[..., 0] = True 37 | voiced[..., -1] = True 38 | pitch[~voiced] = _interpolate(torch.where(~voiced[0])[0][None], torch.where(voiced[0])[0][None], pitch[voiced][None]) 39 | 40 | return 2 ** pitch 41 | 42 | def _interpolate(x, xp, fp): 43 | if xp.shape[-1] == 0: return x 44 | if xp.shape[-1] == 1: return torch.full(x.shape, 
fp.squeeze(), device=fp.device, dtype=fp.dtype) 45 | 46 | m = (fp[:, 1:] - fp[:, :-1]) / (xp[:, 1:] - xp[:, :-1]) 47 | b = fp[:, :-1] - (m.mul(xp[:, :-1])) 48 | 49 | indicies = x[:, :, None].ge(xp[:, None, :]).sum(-1) - 1 50 | indicies = indicies.clamp(0, m.shape[-1] - 1) 51 | line_idx = torch.linspace(0, indicies.shape[0], 1, device=indicies.device).to(torch.long).expand(indicies.shape) 52 | 53 | return m[line_idx, indicies].mul(x) + b[line_idx, indicies] 54 | 55 | def entropy(logits): 56 | distribution = F.softmax(logits, dim=1) 57 | return (1 + 1 / math.log(PITCH_BINS) * (distribution * (distribution + 1e-7).log()).sum(dim=1)) -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/SWIFT/SWIFT.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import onnxruntime 3 | 4 | import numpy as np 5 | 6 | SAMPLE_RATE, HOP_LENGTH, FRAME_LENGTH = 16000, 256, 1024 7 | 8 | class SWIFT: 9 | def __init__(self, model_path, fmin = 50, fmax = 1100, confidence_threshold = 0.9, providers = ["CPUExecutionProvider"]): 10 | self.fmin = fmin 11 | self.fmax = fmax 12 | self.confidence_threshold = confidence_threshold 13 | session_options = onnxruntime.SessionOptions() 14 | session_options.inter_op_num_threads = 1 15 | session_options.intra_op_num_threads = 1 16 | self.pitch_session = onnxruntime.InferenceSession(model_path, session_options, providers=providers) 17 | self.pitch_input_name = self.pitch_session.get_inputs()[0].name 18 | 19 | def _extract_pitch_and_confidence(self, audio_16k): 20 | if audio_16k.ndim != 1 or len(audio_16k) == 0: raise ValueError 21 | if len(audio_16k) < 256: audio_16k = np.pad(audio_16k, (0, max(0, 256 - len(audio_16k))), mode="constant") 22 | 23 | outputs = self.pitch_session.run(None, {self.pitch_input_name: audio_16k[None, :].astype(np.float32)}) 24 | if len(outputs) < 2: raise RuntimeError 25 | 26 | return outputs[0][0], outputs[1][0] 27 | 28 | def _compute_voicing(self, pitch_hz, confidence): 29 | return (confidence > self.confidence_threshold) & (pitch_hz >= self.fmin) & (pitch_hz <= self.fmax) 30 | 31 | def _calculate_timestamps(self, n_frames): 32 | frame_centers = np.arange(n_frames) * HOP_LENGTH + ((FRAME_LENGTH - 1) / 2 - ((FRAME_LENGTH - HOP_LENGTH) // 2)) 33 | return frame_centers / SAMPLE_RATE 34 | 35 | def detect_from_array(self, audio_array, sample_rate=SAMPLE_RATE): 36 | if audio_array.ndim > 1: audio_array = np.mean(audio_array, axis=-1) 37 | 38 | audio_16k = librosa.resample(audio_array.astype(np.float32), orig_sr=sample_rate, target_sr=SAMPLE_RATE) if sample_rate != SAMPLE_RATE else audio_array 39 | pitch_hz, confidence = self._extract_pitch_and_confidence(audio_16k) 40 | 41 | return pitch_hz, self._compute_voicing(pitch_hz, confidence), self._calculate_timestamps(len(pitch_hz)) -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/train/training/extract_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import hashlib 5 | import datetime 6 | 7 | from collections import OrderedDict 8 | 9 | sys.path.append(os.getcwd()) 10 | 11 | from advanced_rvc_inference.variables import logger, translations, config 12 | from advanced_rvc_inference.infer.training.utils import replace_keys_in_dict 13 | 14 | def extract_model(ckpt, sr, pitch_guidance, name, model_path, epoch, step, version, hps, model_author, 
vocoder, energy_use): 15 | try: 16 | logger.info(translations["savemodel"].format(model_dir=model_path, epoch=epoch, step=step)) 17 | os.makedirs(os.path.dirname(model_path), exist_ok=True) 18 | 19 | opt = OrderedDict(weight={key: (value if not config.device.startswith("privateuseone") else value.detach().cpu()).to(torch.float16 if config.is_half else torch.float32) for key, value in ckpt.items() if "enc_q" not in key}) 20 | opt["config"] = [hps.data.filter_length // 2 + 1, 32, hps.model.inter_channels, hps.model.hidden_channels, hps.model.filter_channels, hps.model.n_heads, hps.model.n_layers, hps.model.kernel_size, hps.model.p_dropout, hps.model.resblock, hps.model.resblock_kernel_sizes, hps.model.resblock_dilation_sizes, hps.model.upsample_rates, hps.model.upsample_initial_channel, hps.model.upsample_kernel_sizes, hps.model.spk_embed_dim, hps.model.gin_channels, hps.data.sample_rate] 21 | opt["epoch"] = f"{epoch}epoch" 22 | opt["step"] = step 23 | opt["sr"] = sr 24 | opt["f0"] = int(pitch_guidance) 25 | opt["version"] = version 26 | opt["creation_date"] = datetime.datetime.now().isoformat() 27 | opt["model_hash"] = hashlib.sha256(f"{str(ckpt)} {epoch} {step} {datetime.datetime.now().isoformat()}".encode()).hexdigest() 28 | opt["model_name"] = name 29 | opt["author"] = model_author 30 | opt["vocoder"] = vocoder 31 | opt["energy"] = energy_use 32 | 33 | torch.save(replace_keys_in_dict(replace_keys_in_dict(opt, ".parametrizations.weight.original1", ".weight_v"), ".parametrizations.weight.original0", ".weight_g"), model_path) 34 | except Exception as e: 35 | logger.error(f"{translations['extract_model_error']}: {e}") -------------------------------------------------------------------------------- /advanced_rvc_inference/library/algorithm/commons.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def init_weights(m, mean=0.0, std=0.01): 4 | if m.__class__.__name__.find("Conv") != -1: m.weight.data.normal_(mean, std) 5 | 6 | def get_padding(kernel_size, dilation=1): 7 | return int((kernel_size * dilation - dilation) / 2) 8 | 9 | def convert_pad_shape(pad_shape): 10 | return [item for sublist in pad_shape[::-1] for item in sublist] 11 | 12 | def slice_segments(x, ids_str, segment_size = 4, dim = 2): 13 | if dim == 2: ret = torch.zeros_like(x[:, :segment_size]) 14 | elif dim == 3: ret = torch.zeros_like(x[:, :, :segment_size]) 15 | 16 | for i in range(x.size(0)): 17 | idx_str = ids_str[i].item() 18 | idx_end = idx_str + segment_size 19 | 20 | if dim == 2: ret[i] = x[i, idx_str:idx_end] 21 | else: ret[i] = x[i, :, idx_str:idx_end] 22 | 23 | return ret 24 | 25 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 26 | b, _, t = x.size() 27 | if x_lengths is None: x_lengths = t 28 | 29 | ids_str = (torch.rand([b]).to(device=x.device) * (x_lengths - segment_size + 1)).to(dtype=torch.long) 30 | 31 | return slice_segments(x, ids_str, segment_size, dim=3), ids_str 32 | 33 | @torch.jit.script 34 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 35 | n_channels_int = n_channels[0] 36 | in_act = input_a + input_b 37 | 38 | return in_act[:, :n_channels_int, :].tanh() * in_act[:, n_channels_int:, :].sigmoid() 39 | 40 | def sequence_mask(length, max_length = None): 41 | if max_length is None: max_length = length.max() 42 | return torch.arange(max_length, dtype=length.dtype, device=length.device).unsqueeze(0) < length.unsqueeze(1) 43 | 44 | def clip_grad_value(parameters, clip_value, norm_type=2): 45 | if 
isinstance(parameters, torch.Tensor): parameters = [parameters] 46 | norm_type = float(norm_type) 47 | 48 | if clip_value is not None: clip_value = float(clip_value) 49 | total_norm = 0 50 | 51 | for p in list(filter(lambda p: p.grad is not None, parameters)): 52 | total_norm += (p.grad.data.norm(norm_type)).item() ** norm_type 53 | 54 | if clip_value is not None: p.grad.data.clamp_(min=-clip_value, max=clip_value) 55 | 56 | return total_norm ** (1.0 / norm_type) -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/extra/child/fushion.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import gradio as gr 5 | 6 | sys.path.append(os.getcwd()) 7 | 8 | from advanced_rvc_inference.core.ui import visible, shutil_move 9 | from advanced_rvc_inference.core.model_utils import fushion_model 10 | from advanced_rvc_inference.variables import translations, configs 11 | 12 | def fushion_tab(): 13 | with gr.Row(): 14 | gr.Markdown(translations["fushion_markdown_2"]) 15 | with gr.Row(): 16 | name_to_save = gr.Textbox(label=translations["modelname"], placeholder="Model.pth", value="", max_lines=1, interactive=True) 17 | with gr.Row(): 18 | fushion_button = gr.Button(translations["fushion"], variant="primary", scale=4) 19 | with gr.Column(): 20 | with gr.Row(): 21 | model_a = gr.File(label=f"{translations['model_name']} 1", file_types=[".pth", ".onnx"]) 22 | model_b = gr.File(label=f"{translations['model_name']} 2", file_types=[".pth", ".onnx"]) 23 | with gr.Row(): 24 | model_path_a = gr.Textbox(label=f"{translations['model_path']} 1", value="", placeholder="assets/weights/Model_1.pth") 25 | model_path_b = gr.Textbox(label=f"{translations['model_path']} 2", value="", placeholder="assets/weights/Model_2.pth") 26 | with gr.Row(): 27 | ratio = gr.Slider(minimum=0, maximum=1, label=translations["model_ratio"], info=translations["model_ratio_info"], value=0.5, interactive=True) 28 | with gr.Row(): 29 | output_model = gr.File(label=translations["output_model_path"], file_types=[".pth", ".onnx"], interactive=False, visible=False) 30 | with gr.Row(): 31 | model_a.upload(fn=lambda model: shutil_move(model.name, configs["weights_path"]), inputs=[model_a], outputs=[model_path_a]) 32 | model_b.upload(fn=lambda model: shutil_move(model.name, configs["weights_path"]), inputs=[model_b], outputs=[model_path_b]) 33 | with gr.Row(): 34 | fushion_button.click( 35 | fn=fushion_model, 36 | inputs=[ 37 | name_to_save, 38 | model_path_a, 39 | model_path_b, 40 | ratio 41 | ], 42 | outputs=[name_to_save, output_model], 43 | api_name="fushion_model" 44 | ) 45 | fushion_button.click(fn=lambda: visible(True), inputs=[], outputs=[output_model]) -------------------------------------------------------------------------------- /installer.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | REM Advanced RVC Inference - Windows Installation Script 3 | REM This script installs all necessary dependencies for Advanced RVC Inference 4 | 5 | echo =========================================== 6 | echo Advanced RVC Inference Installation Script 7 | echo =========================================== 8 | 9 | REM Set up environment variables 10 | set PIP_PREFER_BINARY=1 11 | set PYTHONPATH=%CD%;%PYTHONPATH% 12 | 13 | echo Setting up Python environment... 14 | 15 | REM Install uv for fast package management 16 | echo Installing uv... 
17 | powershell -Command "Invoke-RestMethod -Uri https://astral.sh/uv/install.ps1 | Invoke-Expression" 18 | 19 | REM Add uv to PATH for current session 20 | set PATH=%LOCALAPPDATA%\uv;%PATH% 21 | 22 | REM Create virtual environment using uv 23 | echo Creating virtual environment... 24 | uv venv 25 | 26 | REM Activate the virtual environment 27 | call .venv\Scripts\activate.bat 28 | 29 | echo Virtual environment activated. 30 | 31 | REM Install torch with CUDA support 32 | echo Installing PyTorch with CUDA support... 33 | uv pip install --upgrade "torch>=2.0.0" "torchvision>=0.15.0" "torchaudio>=2.0.0" --index-url https://download.pytorch.org/whl/cu121 34 | 35 | REM Install dependencies from requirements.txt 36 | echo Installing requirements... 37 | uv pip install -r requirements.txt --index-strategy unsafe-best-match 38 | 39 | REM Install this package in development mode - only install if dependencies are available 40 | echo Installing Advanced RVC Inference package... 41 | uv pip install -e . || echo Warning: Development install failed, continuing with basic setup... 42 | 43 | REM Install prerequisites for RVC 44 | echo Installing RVC prerequisites... 45 | python -c "from advanced_rvc_inference.core import run_prerequisites_script; run_prerequisites_script(pretraineds_hifigan=True, models=True, exe=True)" 46 | 47 | echo =========================================== 48 | echo Installation completed successfully! 49 | echo =========================================== 50 | 51 | echo To run the application, use one of the following commands: 52 | echo python -m advanced_rvc_inference.app // Run with default settings 53 | echo python -m advanced_rvc_inference.app --share // Run with public sharing 54 | echo python -m advanced_rvc_inference.app --listen // Run with external access 55 | 56 | echo =========================================== 57 | 58 | pause -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/DJCM/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from torch import nn 4 | from einops.layers.torch import Rearrange 5 | 6 | SAMPLE_RATE, WINDOW_LENGTH, N_CLASS = 16000, 1024, 360 7 | 8 | def init_layer(layer): 9 | nn.init.xavier_uniform_(layer.weight) 10 | if hasattr(layer, "bias") and layer.bias is not None: layer.bias.data.fill_(0.0) 11 | 12 | def init_bn(bn): 13 | bn.bias.data.fill_(0.0) 14 | bn.weight.data.fill_(1.0) 15 | bn.running_mean.data.fill_(0.0) 16 | bn.running_var.data.fill_(1.0) 17 | 18 | class BiGRU(nn.Module): 19 | def __init__(self, patch_size, channels, depth): 20 | super(BiGRU, self).__init__() 21 | patch_width, patch_height = patch_size 22 | patch_dim = channels * patch_height * patch_width 23 | self.to_patch_embedding = nn.Sequential(Rearrange('b c (w p1) (h p2) -> b (w h) (p1 p2 c)', p1=patch_width, p2=patch_height)) 24 | self.gru = nn.GRU(patch_dim, patch_dim // 2, num_layers=depth, batch_first=True, bidirectional=True) 25 | 26 | def forward(self, x): 27 | x = self.to_patch_embedding(x) 28 | try: 29 | return self.gru(x)[0] 30 | except: 31 | torch.backends.cudnn.enabled = False 32 | return self.gru(x)[0] 33 | 34 | class ResConvBlock(nn.Module): 35 | def __init__(self, in_planes, out_planes): 36 | super(ResConvBlock, self).__init__() 37 | self.bn1 = nn.BatchNorm2d(in_planes, momentum=0.01) 38 | self.bn2 = nn.BatchNorm2d(out_planes, momentum=0.01) 39 | self.act1 = nn.PReLU() 40 | self.act2 = nn.PReLU() 41 | self.conv1 = 
nn.Conv2d(in_planes, out_planes, (3, 3), padding=(1, 1), bias=False) 42 | self.conv2 = nn.Conv2d(out_planes, out_planes, (3, 3), padding=(1, 1), bias=False) 43 | self.is_shortcut = False 44 | 45 | if in_planes != out_planes: 46 | self.shortcut = nn.Conv2d(in_planes, out_planes, (1, 1)) 47 | self.is_shortcut = True 48 | 49 | self.init_weights() 50 | 51 | def init_weights(self): 52 | init_bn(self.bn1) 53 | init_bn(self.bn2) 54 | init_layer(self.conv1) 55 | init_layer(self.conv2) 56 | if self.is_shortcut: init_layer(self.shortcut) 57 | 58 | def forward(self, x): 59 | out = self.conv2(self.act2(self.bn2(self.conv1(self.act1(self.bn1(x)))))) 60 | 61 | if self.is_shortcut: return self.shortcut(x) + out 62 | else: return out + x -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Advanced-RVC 2 | 3 | Thank you for your interest in contributing to Advanced-RVC! We’re excited to have you here, and we can’t wait to see what you’ll bring to our community-driven organization. This guide will walk you through how you can participate and contribute to our open-source projects. 4 | 5 | ## How to Contribute 6 | 7 | There are several ways you can get involved: 8 | 9 | ### 1. Reporting Issues 10 | If you encounter bugs or have feature requests, you can help by [reporting an issue](https://github.com/ArkanDash/Advanced-RVC-Inference/issues). Please use clear, detailed information to help us understand the problem. Include: 11 | - A brief description of the issue. 12 | - Steps to reproduce the bug (if applicable). 13 | - Any suggestions you have for solving the problem. 14 | 15 | ### 2. Suggesting Enhancements 16 | AI is a rapidly evolving field, and we welcome ideas for improvement! If you have an idea for an enhancement, please: 17 | - Open a [new issue](https://github.com/ArkanDash/Advanced-RVC-Inference/issues/new) and describe your suggestion. 18 | - Include why you think this enhancement would be useful. 19 | - Mention any alternatives you’ve considered. 20 | 21 | ### 3. Submitting Code 22 | We are always looking for new code contributions, including new features, bug fixes, and documentation improvements. Here’s how you can submit code: 23 | 1. **Fork the repository** you want to contribute to. 24 | 2. **Create a branch** for your feature or fix: 25 | ``` 26 | git checkout -b your-branch-name 27 | ``` 28 | 3. **Make your changes** and commit them to your branch. Write a clear commit message. 29 | 4. **Push to your fork** and submit a [Pull Request (PR)](https://github.com/ArkanDash/Advanced-RVC-Inference/pulls). 30 | 31 | #### Code Style Guidelines 32 | - Follow existing styles in the project you’re contributing to. 33 | - Make sure your code is clean, readable, and well-documented. 34 | 35 | ### 4. Improving Documentation 36 | If you find areas of our documentation that need improvement, we’d love your help! Whether it’s fixing a typo or writing new tutorials, all contributions are appreciated. You can: 37 | - Edit markdown files directly in GitHub or locally. 38 | - Submit your changes via a [Pull Request](https://github.com/ArkanDash/Advanced-RVC-Inference/pulls). 39 | 40 | ## Community & Discussions 41 | 42 | Want to discuss your ideas, ask questions, or connect with other community members? 43 | Join our [Discord](https://discord.gg/hvmsukmBHE) for real-time conversations. 
44 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/backends/zluda.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | if torch.cuda.is_available() and torch.cuda.get_device_name().endswith("[ZLUDA]"): 4 | class STFT: 5 | def __init__(self): 6 | self.device = "cuda" 7 | self.fourier_bases = {} 8 | 9 | def _get_fourier_basis(self, n_fft): 10 | if n_fft in self.fourier_bases: 11 | return self.fourier_bases[n_fft] 12 | 13 | fourier_basis = torch.fft.fft(torch.eye(n_fft, device="cpu")).to( 14 | self.device 15 | ) 16 | 17 | cutoff = n_fft // 2 + 1 18 | fourier_basis = torch.cat( 19 | [fourier_basis.real[:cutoff], fourier_basis.imag[:cutoff]], dim=0 20 | ) 21 | 22 | self.fourier_bases[n_fft] = fourier_basis 23 | return fourier_basis 24 | 25 | def transform(self, input, n_fft, hop_length, window): 26 | fourier_basis = self._get_fourier_basis(n_fft) 27 | fourier_basis = fourier_basis * window 28 | 29 | pad_amount = n_fft // 2 30 | input = torch.nn.functional.pad( 31 | input, (pad_amount, pad_amount), mode="reflect" 32 | ) 33 | 34 | input_frames = input.unfold(1, n_fft, hop_length).permute(0, 2, 1) 35 | fourier_transform = fourier_basis @ input_frames 36 | cutoff = n_fft // 2 + 1 37 | 38 | return torch.complex( 39 | fourier_transform[:, :cutoff, :], fourier_transform[:, cutoff:, :] 40 | ) 41 | 42 | stft = STFT() 43 | _torch_stft = torch.stft 44 | 45 | def z_stft(input, window, *args, **kwargs): 46 | if ( 47 | kwargs.get("win_length") == None 48 | and kwargs.get("center") == None 49 | and kwargs.get("return_complex") == True 50 | ): 51 | return stft.transform( 52 | input, kwargs.get("n_fft"), kwargs.get("hop_length"), window 53 | ) 54 | else: 55 | return _torch_stft( 56 | input=input.cpu(), window=window.cpu(), *args, **kwargs 57 | ).to(input.device) 58 | 59 | def z_jit(f, *_, **__): 60 | f.graph = torch._C.Graph() 61 | return f 62 | 63 | torch.stft = z_stft 64 | torch.jit.script = z_jit 65 | torch.backends.cudnn.enabled = False 66 | torch.backends.cuda.enable_flash_sdp(False) 67 | torch.backends.cuda.enable_math_sdp(True) 68 | torch.backends.cuda.enable_mem_efficient_sdp(False) -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/RMVPE/mel.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | import numpy as np 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | from librosa.filters import mel 10 | 11 | sys.path.append(os.getcwd()) 12 | 13 | class MelSpectrogram(nn.Module): 14 | def __init__(self, n_mel_channels, sample_rate, win_length, hop_length, n_fft=None, mel_fmin=0, mel_fmax=None, clamp=1e-5): 15 | super().__init__() 16 | n_fft = win_length if n_fft is None else n_fft 17 | self.hann_window = {} 18 | mel_basis = mel(sr=sample_rate, n_fft=n_fft, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax, htk=True) 19 | mel_basis = torch.from_numpy(mel_basis).float() 20 | self.register_buffer("mel_basis", mel_basis) 21 | self.n_fft = win_length if n_fft is None else n_fft 22 | self.hop_length = hop_length 23 | self.win_length = win_length 24 | self.sample_rate = sample_rate 25 | self.n_mel_channels = n_mel_channels 26 | self.clamp = clamp 27 | 28 | def forward(self, audio, keyshift=0, speed=1, center=True): 29 | factor = 2 ** (keyshift / 12) 30 | win_length_new = int(np.round(self.win_length * factor)) 31 | 
keyshift_key = str(keyshift) + "_" + str(audio.device) 32 | if keyshift_key not in self.hann_window: self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(audio.device) 33 | 34 | n_fft = int(np.round(self.n_fft * factor)) 35 | hop_length = int(np.round(self.hop_length * speed)) 36 | 37 | if str(audio.device).startswith(("ocl", "privateuseone")): 38 | if not hasattr(self, "stft"): 39 | from main.library.backends.utils import STFT 40 | self.stft = STFT(filter_length=n_fft, hop_length=hop_length, win_length=win_length_new).to(audio.device) 41 | magnitude = self.stft.transform(audio, 1e-9) 42 | else: 43 | fft = torch.stft(audio, n_fft=n_fft, hop_length=hop_length, win_length=win_length_new, window=self.hann_window[keyshift_key], center=center, return_complex=True) 44 | magnitude = (fft.real.pow(2) + fft.imag.pow(2)).sqrt() 45 | 46 | if keyshift != 0: 47 | size = self.n_fft // 2 + 1 48 | resize = magnitude.size(1) 49 | if resize < size: magnitude = F.pad(magnitude, (0, 0, 0, size - resize)) 50 | magnitude = magnitude[:, :size, :] * self.win_length / win_length_new 51 | 52 | mel_output = self.mel_basis @ magnitude 53 | return mel_output.clamp(min=self.clamp).log() -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/FCPE/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | from torch import nn 5 | from io import BytesIO 6 | from Crypto.Cipher import AES 7 | from Crypto.Util.Padding import unpad 8 | 9 | def decrypt_model(configs, input_path): 10 | with open(input_path, "rb") as f: 11 | data = f.read() 12 | 13 | with open(os.path.join(configs["binary_path"], "decrypt.bin"), "rb") as f: 14 | key = f.read() 15 | 16 | return BytesIO(unpad(AES.new(key, AES.MODE_CBC, data[:16]).decrypt(data[16:]), AES.block_size)).read() 17 | 18 | def calc_same_padding(kernel_size): 19 | pad = kernel_size // 2 20 | return (pad, pad - (kernel_size + 1) % 2) 21 | 22 | def l2_regularization(model, l2_alpha): 23 | l2_loss = [] 24 | for module in model.modules(): 25 | if type(module) is nn.Conv2d: l2_loss.append((module.weight**2).sum() / 2.0) 26 | 27 | return l2_alpha * sum(l2_loss) 28 | 29 | def torch_interp(x, xp, fp): 30 | sort_idx = xp.argsort() 31 | xp = xp[sort_idx] 32 | fp = fp[sort_idx] 33 | 34 | right_idxs = torch.searchsorted(xp, x).clamp(max=len(xp) - 1) 35 | left_idxs = (right_idxs - 1).clamp(min=0) 36 | x_left = xp[left_idxs] 37 | y_left = fp[left_idxs] 38 | 39 | interp_vals = y_left + ((x - x_left) * (fp[right_idxs] - y_left) / (xp[right_idxs] - x_left)) 40 | interp_vals[x < xp[0]] = fp[0] 41 | interp_vals[x > xp[-1]] = fp[-1] 42 | 43 | return interp_vals 44 | 45 | def batch_interp_with_replacement_detach(uv, f0): 46 | result = f0.clone() 47 | for i in range(uv.shape[0]): 48 | interp_vals = torch_interp(torch.where(uv[i])[-1], torch.where(~uv[i])[-1], f0[i][~uv[i]]).detach() 49 | result[i][uv[i]] = interp_vals 50 | 51 | return result 52 | 53 | class DotDict(dict): 54 | def __getattr__(*args): 55 | val = dict.get(*args) 56 | return DotDict(val) if type(val) is dict else val 57 | 58 | __setattr__ = dict.__setitem__ 59 | __delattr__ = dict.__delitem__ 60 | 61 | class Swish(nn.Module): 62 | def forward(self, x): 63 | return x * x.sigmoid() 64 | 65 | class Transpose(nn.Module): 66 | def __init__(self, dims): 67 | super().__init__() 68 | assert len(dims) == 2, "dims == 2" 69 | self.dims = dims 70 | 71 | def forward(self, x): 72 | return 
x.transpose(*self.dims) 73 | 74 | class GLU(nn.Module): 75 | def __init__(self, dim): 76 | super().__init__() 77 | self.dim = dim 78 | 79 | def forward(self, x): 80 | out, gate = x.chunk(2, dim=self.dim) 81 | return out * gate.sigmoid() -------------------------------------------------------------------------------- /advanced_rvc_inference/core/csrt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.append(os.getcwd()) 5 | 6 | from advanced_rvc_inference.core.inference import whisper_process 7 | from advanced_rvc_inference.library.utils import check_spk_diarization 8 | from advanced_rvc_inference.core.ui import gr_info, gr_warning, process_output 9 | from advanced_rvc_inference.variables import config, translations, configs, logger 10 | 11 | def create_srt(model_size, input_audio, output_file, word_timestamps): 12 | import multiprocessing as mp 13 | 14 | if not input_audio or not os.path.exists(input_audio) or os.path.isdir(input_audio): 15 | gr_warning(translations["input_not_valid"]) 16 | return [None]*2 17 | 18 | if not output_file: 19 | gr_warning(translations["output_not_valid"]) 20 | return [None]*2 21 | 22 | if not output_file.endswith(".srt"): output_file += ".srt" 23 | 24 | output_dir = os.path.dirname(output_file) 25 | if output_dir and not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True) 26 | 27 | info = "" 28 | output_file = process_output(output_file) 29 | 30 | check_spk_diarization(model_size, speechbrain=False) 31 | gr_info(translations["csrt"]) 32 | 33 | try: 34 | mp.set_start_method("spawn") 35 | except RuntimeError: 36 | pass 37 | 38 | whisper_queue = mp.Queue() 39 | whisperprocess = mp.Process(target=whisper_process, args=(model_size, input_audio, configs, config.device, whisper_queue, word_timestamps)) 40 | whisperprocess.start() 41 | 42 | segments = whisper_queue.get() 43 | 44 | with open(output_file, "w", encoding="utf-8") as f: 45 | for i, segment in enumerate(segments): 46 | start = segment["start"] 47 | end = segment["end"] 48 | text = segment["text"].strip() 49 | 50 | index = f"{i+1}\n" 51 | timestamp = f"{format_timestamp(start)} --> {format_timestamp(end)}\n" 52 | text1 = f"{text}\n\n" 53 | 54 | f.write(index) 55 | f.write(timestamp) 56 | f.write(text1) 57 | 58 | info = info + index + timestamp + text1 59 | logger.info(info) 60 | 61 | gr_info(translations["success"]) 62 | 63 | return [{"value": output_file, "visible": True, "__type__": "update"}, info] 64 | 65 | def format_timestamp(seconds): 66 | hours = int(seconds // 3600) 67 | minutes = int((seconds % 3600) // 60) 68 | 69 | milliseconds = int((seconds - int(seconds)) * 1000) 70 | seconds = int(seconds % 60) 71 | 72 | return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}" -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/DJCM/decoder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | sys.path.append(os.getcwd()) 9 | 10 | from advanced_rvc_inference.library.predictors.DJCM.encoder import ResEncoderBlock 11 | from advanced_rvc_inference.library.predictors.DJCM.utils import ResConvBlock, BiGRU, init_bn, init_layer, N_CLASS, WINDOW_LENGTH 12 | 13 | class ResDecoderBlock(nn.Module): 14 | def __init__(self, in_channels, out_channels, n_blocks, stride): 15 | super(ResDecoderBlock, self).__init__() 16 | self.conv1 = 
nn.ConvTranspose2d(in_channels, out_channels, stride, stride, (0, 0), bias=False) 17 | self.bn1 = nn.BatchNorm2d(in_channels, momentum=0.01) 18 | self.conv = nn.ModuleList([ResConvBlock(out_channels * 2, out_channels)]) 19 | 20 | for _ in range(n_blocks - 1): 21 | self.conv.append(ResConvBlock(out_channels, out_channels)) 22 | 23 | self.init_weights() 24 | 25 | def init_weights(self): 26 | init_bn(self.bn1) 27 | init_layer(self.conv1) 28 | 29 | def forward(self, x, concat): 30 | x = self.conv1(F.relu_(self.bn1(x))) 31 | x = torch.cat((x, concat), dim=1) 32 | 33 | for each_layer in self.conv: 34 | x = each_layer(x) 35 | 36 | return x 37 | 38 | class Decoder(nn.Module): 39 | def __init__(self, n_blocks): 40 | super(Decoder, self).__init__() 41 | self.de_blocks = nn.ModuleList([ 42 | ResDecoderBlock(384, 384, n_blocks, (1, 2)), 43 | ResDecoderBlock(384, 384, n_blocks, (1, 2)), 44 | ResDecoderBlock(384, 256, n_blocks, (1, 2)), 45 | ResDecoderBlock(256, 128, n_blocks, (1, 2)), 46 | ResDecoderBlock(128, 64, n_blocks, (1, 2)), 47 | ResDecoderBlock(64, 32, n_blocks, (1, 2)) 48 | ]) 49 | 50 | def forward(self, x, concat_tensors): 51 | for i, layer in enumerate(self.de_blocks): 52 | x = layer(x, concat_tensors[-1 - i]) 53 | 54 | return x 55 | 56 | class PE_Decoder(nn.Module): 57 | def __init__(self, n_blocks, seq_layers=1): 58 | super(PE_Decoder, self).__init__() 59 | self.de_blocks = Decoder(n_blocks) 60 | self.after_conv1 = ResEncoderBlock(32, 32, n_blocks, None) 61 | self.after_conv2 = nn.Conv2d(32, 1, (1, 1)) 62 | self.fc = nn.Sequential(BiGRU((1, WINDOW_LENGTH // 2), 1, seq_layers), nn.Linear(WINDOW_LENGTH // 2, N_CLASS), nn.Sigmoid()) 63 | init_layer(self.after_conv2) 64 | 65 | def forward(self, x, concat_tensors): 66 | return self.fc(self.after_conv2(self.after_conv1(self.de_blocks(x, concat_tensors)))).squeeze(1) -------------------------------------------------------------------------------- /advanced_rvc_inference/library/generators/hifigan.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | import torch.nn.utils.parametrize as parametrize 6 | 7 | from torch.nn.utils import remove_weight_norm 8 | from torch.nn.utils.parametrizations import weight_norm 9 | 10 | sys.path.append(os.getcwd()) 11 | 12 | from advanced_rvc_inference.library.algorithm.commons import init_weights 13 | from advanced_rvc_inference.library.algorithm.residuals import ResBlock, LRELU_SLOPE 14 | 15 | class HiFiGANGenerator(torch.nn.Module): 16 | def __init__(self, initial_channel, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): 17 | super(HiFiGANGenerator, self).__init__() 18 | self.num_kernels = len(resblock_kernel_sizes) 19 | self.num_upsamples = len(upsample_rates) 20 | self.conv_pre = torch.nn.Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) 21 | self.ups = torch.nn.ModuleList() 22 | self.resblocks = torch.nn.ModuleList() 23 | 24 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 25 | self.ups.append(weight_norm(torch.nn.ConvTranspose1d(upsample_initial_channel // (2**i), upsample_initial_channel // (2 ** (i + 1)), k, u, padding=(k - u) // 2))) 26 | ch = upsample_initial_channel // (2 ** (i + 1)) 27 | 28 | for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes): 29 | self.resblocks.append(ResBlock(ch, k, d)) 30 | 31 | self.conv_post = torch.nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False) 32 | 
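# conv_post projects the final upsampled feature map down to a single waveform channel; init_weights (applied
# just below) re-initialises the transposed convolutions in self.ups, and self.cond injects the optional
# speaker embedding (gin_channels) into the generator input.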
self.ups.apply(init_weights) 33 | if gin_channels != 0: self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1) 34 | 35 | def forward(self, x, g = None): 36 | x = self.conv_pre(x) 37 | if g is not None: x += self.cond(g) 38 | 39 | for i in range(self.num_upsamples): 40 | x = self.ups[i](torch.nn.functional.leaky_relu(x, LRELU_SLOPE)) 41 | xs = None 42 | 43 | for j in range(self.num_kernels): 44 | if xs is None: xs = self.resblocks[i * self.num_kernels + j](x) 45 | else: xs += self.resblocks[i * self.num_kernels + j](x) 46 | x = xs / self.num_kernels 47 | 48 | return self.conv_post(torch.nn.functional.leaky_relu(x)).tanh() 49 | 50 | def remove_weight_norm(self): 51 | for l in self.ups: 52 | if hasattr(l, "parametrizations") and "weight" in l.parametrizations: parametrize.remove_parametrizations(l, "weight", leave_parametrized=True) 53 | else: remove_weight_norm(l) 54 | 55 | for l in self.resblocks: 56 | l.remove_weight_norm() -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/config.txt: -------------------------------------------------------------------------------- 1 | # Netscape HTTP Cookie File 2 | # http://curl.haxx.se/rfc/cookie_spec.html 3 | # This is a generated file! Do not edit. 4 | 5 | .youtube.com TRUE / TRUE 1775861587 PREF f4=4000000&tz=Europe.Copenhagen 6 | .youtube.com TRUE / TRUE 1775429540 SOCS CAESEwgDEgk3MzM4ODU4MTgaAmVuIAEaBgiAy6O-Bg 7 | .youtube.com TRUE / TRUE 1772837586 __Secure-1PSIDTS sidts-CjIBEJ3XV6pCalNg0CjCWF_v2t6TsOrMxJZS0U4Syxsj1Ar5Wt0j_I0mV1HvK8pjgzceWRAA 8 | .youtube.com TRUE / TRUE 1772837586 __Secure-3PSIDTS sidts-CjIBEJ3XV6pCalNg0CjCWF_v2t6TsOrMxJZS0U4Syxsj1Ar5Wt0j_I0mV1HvK8pjgzceWRAA 9 | .youtube.com TRUE / FALSE 1775861586 HSID A0YpCKpriRsFf-eaY 10 | .youtube.com TRUE / TRUE 1775861586 SSID A81Z41FkWZ4pTWTeH 11 | .youtube.com TRUE / FALSE 1775861586 APISID Sz068zUXho-XjSR6/AQiI5USjyJu7Gohv0 12 | .youtube.com TRUE / TRUE 1775861586 SAPISID jhCFMxYOhwdvfusq/AN48BaLVbLYevdtWs 13 | .youtube.com TRUE / TRUE 1775861586 __Secure-1PAPISID jhCFMxYOhwdvfusq/AN48BaLVbLYevdtWs 14 | .youtube.com TRUE / TRUE 1775861586 __Secure-3PAPISID jhCFMxYOhwdvfusq/AN48BaLVbLYevdtWs 15 | .youtube.com TRUE / FALSE 1775861586 SID g.a000ugiSAkUceGjFKLyl_8d6QCYNYGiTK5OslV6Yo4XcCJ12HlsifzqU3EwANmP9XbixAPxd9AACgYKASYSARQSFQHGX2Mia271rvQxsQP9duW_omjJFxoVAUF8yKpRiIybd7GrZtUXXe-mWs9h0076 16 | .youtube.com TRUE / TRUE 1775861586 __Secure-1PSID g.a000ugiSAkUceGjFKLyl_8d6QCYNYGiTK5OslV6Yo4XcCJ12Hlsiyqo-MXwLpsPxxYN7v1RgEAACgYKASoSARQSFQHGX2MiT4w8Mfz-Uacjva743UUmKRoVAUF8yKqBquoRXGGeFoLcSlEkzLr80076 17 | .youtube.com TRUE / TRUE 1775861586 __Secure-3PSID g.a000ugiSAkUceGjFKLyl_8d6QCYNYGiTK5OslV6Yo4XcCJ12Hlsic_IpbHF-rVcATgFnBx5-XQACgYKAV8SARQSFQHGX2MiHfIVTQ75805ff5G_9ErOohoVAUF8yKo3by8D34K0XL5d0nQs3pcU0076 18 | .youtube.com TRUE / TRUE 1775861586 LOGIN_INFO AFmmF2swRgIhAJ-I_I_kAC3fwjXFk8Ii8hS6J01HlWxT1hxOIZ_hC_b1AiEA7Dl6QKY61fi7podAChQOvsxRJLfhJp90urdIYHkvnJc:QUQ3MjNmeXJ5UTB6V2h1ZUdLTXBuS0p2QnJyZnk5M2RFWVNQNTQtaHdJSjhseFpIX2xJbC1DOGp0NUxzcFVkOVZyZkRVVEtQelktUGk2c2VjVnQtTmlwS2tBNUpOTUFfNTJ5SGVCMTY5STFSTjZjaXFNOHYtQm5BZTdEOWQxWkNPY0laZ3FibnZSYy1nNER2eVZYN0p1Ukl4bk1lMElIejRR 19 | .youtube.com TRUE / TRUE 1775429537 __Secure-YEC CgtnMXZOdi1WVXNpayjSzqi-BjIiCgJOTBIcEhgSFhMLFBUWFwwYGRobHB0eHw4PIBAREiEgUw%3D%3D 20 | .youtube.com TRUE / FALSE 1772837590 SIDCC AKEyXzX7AgkezzZjTXTDuEQejiJwX0Qa9krKmOjMc8i6VuxONJDa_91O2xgFKbiGRZh3F3kpIQ 21 | .youtube.com TRUE / TRUE 1772837590 __Secure-1PSIDCC 
AKEyXzW_Q4mUyomFQUP0p9Mv0o0rdzS5PBN-V7_XS2bloLP1wv5_9En8qmUAsarHn7wkU4KY 22 | .youtube.com TRUE / TRUE 1772837590 __Secure-3PSIDCC AKEyXzXTgT_gT0WiuG-2VSWgMoOm0vQSvFwlCYnqtQI8cPpFYOxpwOZfByhS1WNI1ZYvhZqK_Q 23 | .youtube.com TRUE / TRUE 0 YSC 78pipAlj27I 24 | .youtube.com TRUE / TRUE 1775429588 VISITOR_PRIVACY_METADATA CgJOTBIcEhgSFhMLFBUWFwwYGRobHB0eHw4PIBAREiEgUw%3D%3D 25 | .youtube.com TRUE / TRUE 1756853586 __Secure-ROLLOUT_TOKEN CNCwjIqOv-6f8QEQjO3F88X2iwMYgL3K88X2iwM%3D 26 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/PESTO/PESTO.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | sys.path.append(os.getcwd()) 6 | 7 | class PESTO: 8 | def __init__(self, model_path, step_size=10, reduction="alwa", num_chunks=1, sample_rate=16000, device=None, providers=None, onnx=False): 9 | self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") 10 | self.step_size = step_size 11 | self.reduction = reduction 12 | self.num_chunks = num_chunks 13 | self.sample_rate = sample_rate 14 | self.onnx = onnx 15 | 16 | if self.onnx: 17 | import onnxruntime as ort 18 | 19 | sess_options = ort.SessionOptions() 20 | sess_options.log_severity_level = 3 21 | self.model = ort.InferenceSession(model_path, sess_options=sess_options, providers=providers) 22 | else: 23 | from main.library.predictors.PESTO.model import PPESTO, Resnet1d 24 | from main.library.predictors.PESTO.preprocessor import Preprocessor 25 | 26 | ckpt = torch.load(model_path, map_location="cpu", weights_only=False) 27 | model = PPESTO(Resnet1d(**ckpt["hparams"]["encoder"]), preprocessor=Preprocessor(hop_size=step_size, sampling_rate=sample_rate, **ckpt["hcqt_params"]), crop_kwargs=ckpt["hparams"]["pitch_shift"], reduction=ckpt["hparams"]["reduction"]) 28 | model.load_state_dict(ckpt["state_dict"], strict=False) 29 | 30 | self.model = model.to(self.device).eval() 31 | self.model.reduction = self.reduction 32 | 33 | def compute_f0(self, x): 34 | assert x.ndim <= 2 35 | 36 | with torch.inference_mode(): 37 | with torch.no_grad(): 38 | preds, confidence = [], [] 39 | 40 | for chunk in x.chunk(chunks=self.num_chunks): 41 | if self.onnx: 42 | model = self.model.run( 43 | [self.model.get_outputs()[0].name, self.model.get_outputs()[1].name], 44 | { 45 | self.model.get_inputs()[0].name: chunk.cpu().numpy() 46 | } 47 | ) 48 | pred, conf = torch.tensor(model[0], device=self.device), torch.tensor(model[1], device=self.device) 49 | else: 50 | pred, conf = self.model( 51 | chunk, 52 | sr=self.sample_rate, 53 | convert_to_freq=True, 54 | return_activations=False 55 | ) 56 | 57 | preds.append(pred) 58 | confidence.append(conf) 59 | 60 | return torch.cat(preds, dim=0), torch.cat(confidence, dim=0) -------------------------------------------------------------------------------- /advanced_rvc_inference/library/speaker_diarization/embedding.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | import numpy as np 6 | import torch.nn.functional as F 7 | 8 | from functools import cached_property 9 | from torch.nn.utils.rnn import pad_sequence 10 | 11 | sys.path.append(os.getcwd()) 12 | 13 | from advanced_rvc_inference.library.speaker_diarization.speechbrain import EncoderClassifier 14 | 15 | class SpeechBrainPretrainedSpeakerEmbedding: 16 | def __init__(self, embedding, device = None): 17 | super().__init__() 18 
| 19 | self.embedding = embedding 20 | self.device = device or torch.device("cpu") 21 | self.classifier_ = EncoderClassifier.from_hparams(source=self.embedding, run_opts={"device": self.device}) 22 | 23 | @cached_property 24 | def dimension(self): 25 | *_, dimension = self.classifier_.encode_batch(torch.rand(1, 16000).to(self.device)).shape 26 | return dimension 27 | 28 | @cached_property 29 | def min_num_samples(self): 30 | with torch.inference_mode(): 31 | lower, upper = 2, round(0.5 * self.classifier_.audio_normalizer.sample_rate) 32 | middle = (lower + upper) // 2 33 | 34 | while lower + 1 < upper: 35 | try: 36 | _ = self.classifier_.encode_batch(torch.randn(1, middle).to(self.device)) 37 | upper = middle 38 | except RuntimeError: 39 | lower = middle 40 | 41 | middle = (lower + upper) // 2 42 | 43 | return upper 44 | 45 | def __call__(self, waveforms, masks = None): 46 | batch_size, num_channels, num_samples = waveforms.shape 47 | assert num_channels == 1 48 | 49 | waveforms = waveforms.squeeze(dim=1) 50 | 51 | if masks is None: 52 | signals = waveforms.squeeze(dim=1) 53 | wav_lens = signals.shape[1] * torch.ones(batch_size) 54 | else: 55 | batch_size_masks, _ = masks.shape 56 | assert batch_size == batch_size_masks 57 | 58 | imasks = F.interpolate(masks.unsqueeze(dim=1), size=num_samples, mode="nearest").squeeze(dim=1) > 0.5 59 | signals = pad_sequence([waveform[imask].contiguous() for waveform, imask in zip(waveforms, imasks)], batch_first=True) 60 | wav_lens = imasks.sum(dim=1) 61 | 62 | max_len = wav_lens.max() 63 | if max_len < self.min_num_samples: return np.nan * np.zeros((batch_size, self.dimension)) 64 | 65 | too_short = wav_lens < self.min_num_samples 66 | wav_lens = wav_lens / max_len 67 | wav_lens[too_short] = 1.0 68 | 69 | embeddings = (self.classifier_.encode_batch(signals, wav_lens=wav_lens).squeeze(dim=1).cpu().numpy()) 70 | embeddings[too_short.cpu().numpy()] = np.nan 71 | 72 | return embeddings -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/FCPE/stft.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | import numpy as np 6 | import torch.nn.functional as F 7 | 8 | from librosa.filters import mel 9 | 10 | sys.path.append(os.getcwd()) 11 | 12 | class STFT: 13 | def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5): 14 | self.target_sr = sr 15 | self.n_mels = n_mels 16 | self.n_fft = n_fft 17 | self.win_size = win_size 18 | self.hop_length = hop_length 19 | self.fmin = fmin 20 | self.fmax = fmax 21 | self.clip_val = clip_val 22 | self.mel_basis = {} 23 | self.hann_window = {} 24 | 25 | def get_mel(self, y, keyshift=0, speed=1, center=False, train=False): 26 | n_fft = self.n_fft 27 | win_size = self.win_size 28 | hop_length = self.hop_length 29 | fmax = self.fmax 30 | factor = 2 ** (keyshift / 12) 31 | win_size_new = int(np.round(win_size * factor)) 32 | hop_length_new = int(np.round(hop_length * speed)) 33 | mel_basis = self.mel_basis if not train else {} 34 | hann_window = self.hann_window if not train else {} 35 | mel_basis_key = str(fmax) + "_" + str(y.device) 36 | 37 | if mel_basis_key not in mel_basis: mel_basis[mel_basis_key] = torch.from_numpy(mel(sr=self.target_sr, n_fft=n_fft, n_mels=self.n_mels, fmin=self.fmin, fmax=fmax)).float().to(y.device) 38 | keyshift_key = str(keyshift) + "_" + str(y.device) 39 | if keyshift_key not in 
hann_window: hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device) 40 | 41 | pad_left = (win_size_new - hop_length_new) // 2 42 | pad_right = max((win_size_new - hop_length_new + 1) // 2, win_size_new - y.size(-1) - pad_left) 43 | 44 | pad = F.pad(y.unsqueeze(1), (pad_left, pad_right), mode="reflect" if pad_right < y.size(-1) else "constant").squeeze(1) 45 | n_fft = int(np.round(n_fft * factor)) 46 | 47 | if str(y.device).startswith(("ocl", "privateuseone")): 48 | if not hasattr(self, "stft"): 49 | from advanced_rvc_inference.library.backends.utils import STFT as _STFT 50 | self.stft = _STFT(filter_length=n_fft, hop_length=hop_length_new, win_length=win_size_new).to(y.device) 51 | spec = self.stft.transform(pad, 1e-9) 52 | else: 53 | spec = torch.stft(pad, n_fft, hop_length=hop_length_new, win_length=win_size_new, window=hann_window[keyshift_key], center=center, pad_mode="reflect", normalized=False, onesided=True, return_complex=True) 54 | spec = (spec.real.pow(2) + spec.imag.pow(2) + 1e-9).sqrt() 55 | 56 | if keyshift != 0: 57 | size = n_fft // 2 + 1 58 | resize = spec.size(1) 59 | spec = (F.pad(spec, (0, 0, 0, size - resize)) if resize < size else spec[:, :size, :]) * win_size / win_size_new 60 | 61 | return ((mel_basis[mel_basis_key] @ spec).clamp(min=self.clip_val) * 1).log() -------------------------------------------------------------------------------- /advanced_rvc_inference/library/backends/opencl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import platform 5 | import subprocess 6 | 7 | try: 8 | import pytorch_ocl 9 | except: 10 | pytorch_ocl = None 11 | 12 | sys.path.append(os.getcwd()) 13 | 14 | from advanced_rvc_inference.library.backends.utils import GRU 15 | 16 | torch_available = pytorch_ocl is not None 17 | if torch_available: adaptive_orig = torch.nn.AdaptiveAvgPool2d 18 | 19 | def check_amd_gpu(gpu): 20 | # match any known AMD identifier instead of returning after checking only the first one 21 | return any(i in gpu for i in ["RX", "AMD", "Vega", "Radeon", "FirePro"]) 22 | 23 | def get_amd_gpu_windows(): 24 | gpus = "" 25 | 26 | try: 27 | gpus = subprocess.check_output("wmic path win32_VideoController get name", shell=True, stderr=subprocess.DEVNULL) 28 | except subprocess.CalledProcessError: 29 | gpus = subprocess.check_output('powershell "Get-CimInstance Win32_VideoController | Select-Object -ExpandProperty Name"', shell=True, stderr=subprocess.DEVNULL) 30 | 31 | return [gpu.strip() for gpu in gpus.decode().split('\n')[1:] if check_amd_gpu(gpu)] 32 | 33 | def get_amd_gpu_linux(): 34 | try: 35 | return [gpu for gpu in subprocess.check_output("lspci | grep VGA", shell=True).decode().split('\n') if check_amd_gpu(gpu)] 36 | except: 37 | return [] 38 | 39 | def get_gpu_list(): 40 | return (get_amd_gpu_windows() if platform.system() == "Windows" else get_amd_gpu_linux()) if torch_available else [] 41 | 42 | def device_count(): 43 | return len(get_gpu_list()) if torch_available else 0 44 | 45 | def device_name(device_id = 0): 46 | return (get_gpu_list()[device_id] if device_id >= 0 and device_id < device_count() else "") if torch_available else "" 47 | 48 | def is_available(): 49 | return (device_count() > 0) if torch_available else False 50 | 51 | def group_norm(x, num_groups, weight=None, bias=None, eps=1e-5): 52 | N, C = x.shape[:2] 53 | assert C % num_groups == 0 54 | 55 | shape = (N, num_groups, C // num_groups) + x.shape[2:] 56 | x_reshaped = x.view(shape) 57 | 58 | dims = (2,) + tuple(range(3, x_reshaped.dim())) 59 | mean = x_reshaped.mean(dim=dims, keepdim=True) 60 | var = 
x_reshaped.var(dim=dims, keepdim=True, unbiased=False) 61 | 62 | x_norm = (x_reshaped - mean) / (var + eps).sqrt() 63 | x_norm = x_norm.view_as(x) 64 | 65 | if weight is not None: 66 | weight = weight.view(1, C, *([1] * (x.dim() - 2))) 67 | x_norm = x_norm * weight 68 | 69 | if bias is not None: 70 | bias = bias.view(1, C, *([1] * (x.dim() - 2))) 71 | x_norm = x_norm + bias 72 | 73 | return x_norm 74 | 75 | def script(f, *_, **__): 76 | f.graph = pytorch_ocl.torch._C.Graph() 77 | return f 78 | 79 | def AdaptiveAvgPool2d(input): 80 | input = input[0] if isinstance(input, tuple) else input 81 | return adaptive_orig(input) 82 | 83 | if torch_available: 84 | torch.nn.GRU = GRU 85 | torch.nn.AdaptiveAvgPool2d = AdaptiveAvgPool2d 86 | torch.nn.functional.group_norm = group_norm 87 | torch.jit.script = script -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/extra/child/create_srt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import gradio as gr 5 | 6 | sys.path.append(os.getcwd()) 7 | 8 | from advanced_rvc_inference.core.csrt import create_srt 9 | from advanced_rvc_inference.core.ui import shutil_move, change_audios_choices 10 | from advanced_rvc_inference.variables import translations, file_types, configs, paths_for_files 11 | 12 | def create_srt_tab(): 13 | with gr.Row(): 14 | gr.Markdown(translations["create_srt_markdown_2"]) 15 | with gr.Row(): 16 | with gr.Column(): 17 | srt_content = gr.Textbox(label=translations["srt_content"], value="", lines=9, max_lines=9, interactive=False) 18 | with gr.Column(): 19 | word_timestamps = gr.Checkbox(label=translations["word_timestamps"], info=translations["word_timestamps_info"], value=False, interactive=True) 20 | model_size = gr.Radio(label=translations["model_size"], info=translations["model_size_info"], choices=["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large-v1", "large-v2", "large-v3", "large-v3-turbo"], value="medium", interactive=True) 21 | with gr.Row(): 22 | convert_button = gr.Button(translations["convert_audio"], variant="primary") 23 | with gr.Row(): 24 | with gr.Accordion(translations["input_output"], open=False): 25 | with gr.Column(): 26 | input_audio = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True) 27 | output_file = gr.Textbox(label=translations["srt_output_file"], value="srt/output.srt", placeholder="srt/output.srt", interactive=True) 28 | with gr.Column(): 29 | refresh = gr.Button(translations["refresh"]) 30 | with gr.Row(): 31 | input_file = gr.Files(label=translations["drop_audio"], file_types=file_types) 32 | with gr.Row(): 33 | play_audio = gr.Audio(interactive=False, label=translations["input_audio"]) 34 | with gr.Row(): 35 | output_srt = gr.File(label=translations["srt_output_file"], file_types=[".srt"], interactive=False, visible=False) 36 | with gr.Row(): 37 | input_file.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[input_file], outputs=[input_audio]) 38 | input_audio.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio], outputs=[play_audio]) 39 | refresh.click(fn=change_audios_choices, inputs=[input_audio], outputs=[input_audio]) 40 | with gr.Row(): 41 | convert_button.click( 42 | fn=create_srt, 43 | inputs=[ 44 | model_size, 45 | input_audio, 
46 | output_file, 47 | word_timestamps 48 | ], 49 | outputs=[ 50 | output_srt, 51 | srt_content 52 | ], 53 | api_name="create_srt" 54 | ) 55 | -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/train/training/anyprecision_optimizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from torch.optim.optimizer import Optimizer 4 | 5 | class AnyPrecisionAdamW(Optimizer): 6 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.0, use_kahan_summation=True, momentum_dtype=torch.bfloat16, variance_dtype=torch.bfloat16, compensation_buffer_dtype=torch.bfloat16): 7 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, use_kahan_summation=use_kahan_summation, momentum_dtype=momentum_dtype, variance_dtype=variance_dtype, compensation_buffer_dtype=compensation_buffer_dtype) 8 | super().__init__(params, defaults) 9 | 10 | @torch.no_grad() 11 | def step(self, closure=None): 12 | if closure is not None: 13 | with torch.enable_grad(): 14 | closure() 15 | 16 | for group in self.param_groups: 17 | beta1, beta2 = group["betas"] 18 | lr = group["lr"] 19 | weight_decay = group["weight_decay"] 20 | eps = group["eps"] 21 | use_kahan_summation = group["use_kahan_summation"] 22 | momentum_dtype = group["momentum_dtype"] 23 | variance_dtype = group["variance_dtype"] 24 | compensation_buffer_dtype = group["compensation_buffer_dtype"] 25 | 26 | for p in group["params"]: 27 | if p.grad is None: continue 28 | if p.grad.is_sparse: raise RuntimeError 29 | 30 | state = self.state[p] 31 | if len(state) == 0: 32 | state["step"] = torch.tensor(0.0) 33 | state["exp_avg"] = torch.zeros_like(p, dtype=momentum_dtype) 34 | state["exp_avg_sq"] = torch.zeros_like(p, dtype=variance_dtype) 35 | if use_kahan_summation: state["compensation"] = torch.zeros_like(p, dtype=compensation_buffer_dtype) 36 | 37 | state["step"] += 1 38 | step = state["step"] 39 | exp_avg = state["exp_avg"] 40 | exp_avg_sq = state["exp_avg_sq"] 41 | 42 | grad = p.grad 43 | if weight_decay: p.data.mul_(1 - lr * weight_decay) 44 | 45 | exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) 46 | exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) 47 | 48 | bias_correction1 = 1 - beta1 ** step 49 | step_size = lr / bias_correction1 50 | 51 | denom_correction = (1 - beta2**step) ** 0.5 52 | centered_variance = (exp_avg_sq.sqrt() / denom_correction).add_(eps, alpha=1) 53 | 54 | if use_kahan_summation: 55 | compensation = state["compensation"] 56 | compensation.addcdiv_(exp_avg, centered_variance, value=-step_size) 57 | 58 | temp_buffer = p.detach().clone() 59 | p.data.add_(compensation) 60 | compensation.add_(temp_buffer.sub_(p.data)) 61 | else: p.data.addcdiv_(exp_avg, centered_variance, value=-step_size) -------------------------------------------------------------------------------- /advanced_rvc_inference/library/onnx/wrapper.py: -------------------------------------------------------------------------------- 1 | import json 2 | import onnx 3 | import torch 4 | import onnxruntime 5 | 6 | import numpy as np 7 | 8 | class ONNXRVC: 9 | def __init__(self, model_path, providers, log_severity_level = 3): 10 | sess_options = onnxruntime.SessionOptions() 11 | sess_options.log_severity_level = log_severity_level 12 | 13 | metadata_dict = None 14 | for prop in onnx.load(model_path).metadata_props: 15 | if prop.key == "model_info": 16 | metadata_dict = json.loads(prop.value) 17 | break 18 | 19 | self.cpt = 
{} 20 | self.cpt["tgt_sr"] = metadata_dict.get("sr", 32000) 21 | self.cpt["use_f0"] = metadata_dict.get("f0", 1) 22 | self.cpt["version"] = metadata_dict.get("version", "v1") 23 | self.cpt["energy"] = metadata_dict.get("energy", False) 24 | self.net_g = onnxruntime.InferenceSession( 25 | model_path, 26 | sess_options=sess_options, 27 | providers=providers 28 | ) 29 | 30 | def get_onnx_argument(self, feats, p_len, sid, pitch, pitchf, energy): 31 | inputs = { 32 | self.net_g.get_inputs()[0].name: feats.cpu().numpy().astype(np.float32), 33 | self.net_g.get_inputs()[1].name: p_len.cpu().numpy(), 34 | self.net_g.get_inputs()[2].name: np.array([sid.cpu().item()], dtype=np.int64), 35 | self.net_g.get_inputs()[3].name: np.random.randn(1, 192, p_len).astype(np.float32) 36 | } 37 | 38 | if self.cpt["energy"]: 39 | if self.cpt["use_f0"]: 40 | inputs.update({ 41 | self.net_g.get_inputs()[4].name: pitch.cpu().numpy().astype(np.int64), 42 | self.net_g.get_inputs()[5].name: pitchf.cpu().numpy().astype(np.float32), 43 | self.net_g.get_inputs()[6].name: energy.cpu().numpy().astype(np.float32) 44 | }) 45 | else: 46 | inputs.update({ 47 | self.net_g.get_inputs()[4].name: energy.cpu().numpy().astype(np.float32) 48 | }) 49 | else: 50 | if self.cpt["use_f0"]: 51 | inputs.update({ 52 | self.net_g.get_inputs()[4].name: pitch.cpu().numpy().astype(np.int64), 53 | self.net_g.get_inputs()[5].name: pitchf.cpu().numpy().astype(np.float32) 54 | }) 55 | 56 | return inputs 57 | 58 | def to(self, device = "cpu"): 59 | self.device = device 60 | return self 61 | 62 | def infer(self, feats = None, p_len = None, pitch = None, pitchf = None, sid = None, energy = None): 63 | output = self.net_g.run( 64 | [self.net_g.get_outputs()[0].name], ( 65 | self.get_onnx_argument( 66 | feats, 67 | p_len, 68 | sid, 69 | pitch, 70 | pitchf, 71 | energy, 72 | ) 73 | ) 74 | ) 75 | 76 | return torch.as_tensor(output, device=self.device) 77 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/CREPE/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import functools 3 | 4 | PITCH_BINS = 360 5 | 6 | class MODEL(torch.nn.Module): 7 | def __init__(self, model='full'): 8 | super().__init__() 9 | in_channels = {"full": [1, 1024, 128, 128, 128, 256], "large": [1, 768, 96, 96, 96, 192], "medium": [1, 512, 64, 64, 64, 128], "small": [1, 256, 32, 32, 32, 64], "tiny": [1, 128, 16, 16, 16, 32]}[model] 10 | out_channels = {"full": [1024, 128, 128, 128, 256, 512], "large": [768, 96, 96, 96, 192, 384], "medium": [512, 64, 64, 64, 128, 256], "small": [256, 32, 32, 32, 64, 128], "tiny": [128, 16, 16, 16, 32, 64]}[model] 11 | self.in_features = {"full": 2048, "large": 1536, "medium": 1024, "small": 512, "tiny": 256}[model] 12 | 13 | kernel_sizes = [(512, 1)] + 5 * [(64, 1)] 14 | strides = [(4, 1)] + 5 * [(1, 1)] 15 | batch_norm_fn = functools.partial(torch.nn.BatchNorm2d, eps=0.0010000000474974513, momentum=0.0) 16 | 17 | self.conv1 = torch.nn.Conv2d(in_channels=in_channels[0], out_channels=out_channels[0], kernel_size=kernel_sizes[0], stride=strides[0]) 18 | self.conv1_BN = batch_norm_fn(num_features=out_channels[0]) 19 | 20 | self.conv2 = torch.nn.Conv2d(in_channels=in_channels[1], out_channels=out_channels[1], kernel_size=kernel_sizes[1], stride=strides[1]) 21 | self.conv2_BN = batch_norm_fn(num_features=out_channels[1]) 22 | 23 | self.conv3 = torch.nn.Conv2d(in_channels=in_channels[2], out_channels=out_channels[2], 
kernel_size=kernel_sizes[2], stride=strides[2]) 24 | self.conv3_BN = batch_norm_fn(num_features=out_channels[2]) 25 | 26 | self.conv4 = torch.nn.Conv2d(in_channels=in_channels[3], out_channels=out_channels[3], kernel_size=kernel_sizes[3], stride=strides[3]) 27 | self.conv4_BN = batch_norm_fn(num_features=out_channels[3]) 28 | 29 | self.conv5 = torch.nn.Conv2d(in_channels=in_channels[4], out_channels=out_channels[4], kernel_size=kernel_sizes[4], stride=strides[4]) 30 | self.conv5_BN = batch_norm_fn(num_features=out_channels[4]) 31 | 32 | self.conv6 = torch.nn.Conv2d(in_channels=in_channels[5], out_channels=out_channels[5], kernel_size=kernel_sizes[5], stride=strides[5]) 33 | self.conv6_BN = batch_norm_fn(num_features=out_channels[5]) 34 | 35 | self.classifier = torch.nn.Linear(in_features=self.in_features, out_features=PITCH_BINS) 36 | 37 | def forward(self, x, embed=False): 38 | x = self.embed(x) 39 | if embed: return x 40 | return self.classifier(self.layer(x, self.conv6, self.conv6_BN).permute(0, 2, 1, 3).reshape(-1, self.in_features)).sigmoid() 41 | 42 | def embed(self, x): 43 | x = x[:, None, :, None] 44 | return self.layer(self.layer(self.layer(self.layer(self.layer(x, self.conv1, self.conv1_BN, (0, 0, 254, 254)), self.conv2, self.conv2_BN), self.conv3, self.conv3_BN), self.conv4, self.conv4_BN), self.conv5, self.conv5_BN) 45 | 46 | def layer(self, x, conv, batch_norm, padding=(0, 0, 31, 32)): 47 | return torch.nn.functional.max_pool2d(batch_norm(torch.nn.functional.relu(conv(torch.nn.functional.pad(x, padding)))), (2, 1), (2, 1)) -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/extra/child/f0_extract.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import gradio as gr 5 | 6 | sys.path.append(os.getcwd()) 7 | 8 | from advanced_rvc_inference.core.f0_extract import f0_extract 9 | from advanced_rvc_inference.core.ui import change_audios_choices, unlock_f0, shutil_move 10 | from advanced_rvc_inference.variables import translations, paths_for_files, method_f0, configs, file_types 11 | 12 | def f0_extract_tab(): 13 | with gr.Row(): 14 | gr.Markdown(translations["f0_extractor_markdown_2"]) 15 | with gr.Row(): 16 | extractor_button = gr.Button(translations["extract_button"].replace("2. 
", ""), variant="primary") 17 | with gr.Row(): 18 | with gr.Column(): 19 | upload_audio_file = gr.Files(label=translations["drop_audio"], file_types=file_types) 20 | audioplay = gr.Audio(interactive=False, label=translations["input_audio"]) 21 | with gr.Column(): 22 | with gr.Accordion(translations["f0_method"], open=False): 23 | with gr.Group(): 24 | with gr.Row(): 25 | onnx_f0_mode3 = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True) 26 | unlock_full_method = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True) 27 | f0_method_extract = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=[m for m in method_f0 if m != "hybrid"], value="rmvpe", interactive=True) 28 | with gr.Accordion(translations["audio_path"], open=True): 29 | input_audio_path = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, allow_custom_value=True, interactive=True) 30 | refresh_audio_button = gr.Button(translations["refresh"]) 31 | with gr.Row(): 32 | gr.Markdown("___") 33 | with gr.Row(): 34 | file_output = gr.File(label="", file_types=[".txt"], interactive=False) 35 | image_output = gr.Image(label="", interactive=False) 36 | with gr.Row(): 37 | upload_audio_file.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[upload_audio_file], outputs=[input_audio_path]) 38 | input_audio_path.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio_path], outputs=[audioplay]) 39 | refresh_audio_button.click(fn=change_audios_choices, inputs=[input_audio_path], outputs=[input_audio_path]) 40 | with gr.Row(): 41 | unlock_full_method.change(fn=lambda method: {"choices": [m for m in unlock_f0(method)["choices"] if m != "hybrid"], "value": "rmvpe", "__type__": "update"}, inputs=[unlock_full_method], outputs=[f0_method_extract]) 42 | extractor_button.click( 43 | fn=f0_extract, 44 | inputs=[ 45 | input_audio_path, 46 | f0_method_extract, 47 | onnx_f0_mode3 48 | ], 49 | outputs=[file_output, image_output], 50 | api_name="f0_extract" 51 | ) 52 | -------------------------------------------------------------------------------- /advanced_rvc_inference/core/separate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | 5 | sys.path.append(os.getcwd()) 6 | 7 | from advanced_rvc_inference.core.ui import gr_info, gr_warning 8 | from advanced_rvc_inference.variables import python, translations, configs 9 | 10 | def separate_music( 11 | input_path, 12 | output_dirs, 13 | export_format, 14 | model_name, 15 | karaoke_model, 16 | reverb_model, 17 | denoise_model, 18 | sample_rate, 19 | shifts, 20 | batch_size, 21 | overlap, 22 | aggression, 23 | hop_length, 24 | window_size, 25 | segments_size, 26 | post_process_threshold, 27 | enable_tta, 28 | enable_denoise, 29 | high_end_process, 30 | enable_post_process, 31 | separate_backing, 32 | separate_reverb 33 | ): 34 | output_dirs = os.path.dirname(output_dirs) or output_dirs 35 | 36 | if not input_path or not os.path.exists(input_path) or os.path.isdir(input_path): 37 | gr_warning(translations["input_not_valid"]) 38 | return [None]*4 39 | 40 | if not os.path.exists(output_dirs): 41 | gr_warning(translations["output_not_valid"]) 42 | return [None]*4 43 | 44 | if not os.path.exists(output_dirs): os.makedirs(output_dirs) 45 | 
gr_info(translations["start"].format(start=translations["separator_music"])) 46 | 47 | subprocess.run([ 48 | python, configs["separate_path"], 49 | "--input_path", input_path, 50 | "--output_dirs", output_dirs, 51 | "--export_format", export_format, 52 | "--model_name", model_name, 53 | "--karaoke_model", karaoke_model, 54 | "--reverb_model", reverb_model, 55 | "--denoise_model", denoise_model, 56 | "--sample_rate", str(sample_rate), 57 | "--shifts", str(shifts), 58 | "--batch_size", str(batch_size), 59 | "--overlap", str(overlap), 60 | "--aggression", str(aggression), 61 | "--hop_length", str(hop_length), 62 | "--window_size", str(window_size), 63 | "--segments_size", str(segments_size), 64 | "--post_process_threshold", str(post_process_threshold), 65 | "--enable_tta", str(enable_tta), 66 | "--enable_denoise", str(enable_denoise), 67 | "--high_end_process", str(high_end_process), 68 | "--enable_post_process", str(enable_post_process), 69 | "--separate_backing", str(separate_backing), 70 | "--separate_reverb", str(separate_reverb), 71 | ]) 72 | 73 | gr_info(translations["success"]) 74 | 75 | filename, _ = os.path.splitext(os.path.basename(input_path)) 76 | output_dirs = os.path.join(output_dirs, filename) 77 | 78 | return [ 79 | os.path.join( 80 | output_dirs, 81 | f"Original_Vocals_No_Reverb.{export_format}" if separate_reverb else f"Original_Vocals.{export_format}" 82 | ), 83 | os.path.join( 84 | output_dirs, 85 | f"Instruments.{export_format}" 86 | ), 87 | os.path.join( 88 | output_dirs, 89 | f"Main_Vocals_No_Reverb.{export_format}" if separate_reverb else f"Main_Vocals.{export_format}" 90 | ) if separate_backing else None, 91 | os.path.join( 92 | output_dirs, 93 | f"Backing_Vocals.{export_format}" 94 | ) if separate_backing else None 95 | ] if os.path.isfile(input_path) else [None]*4 -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "wheel", "setuptools-scm[toml]>=6.2"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "advanced-rvc-inference" 7 | version = "0.1.0" 8 | description = "Advanced RVC Inference - A state-of-the-art web UI for rapid and effortless inference." 
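# Supported interpreters are CPython 3.9 through 3.13 (see requires-python below); onnxruntime is selected
# per platform via environment markers in the dependency list.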
9 | readme = "README.md" 10 | requires-python = ">=3.9,<3.14" 11 | license = {file = "LICENSE"} 12 | authors = [ 13 | {name = "ArkanDash"} 14 | ] 15 | classifiers = [ 16 | "Development Status :: 3 - Alpha", 17 | "Intended Audience :: Developers", 18 | "Topic :: Software Development :: Build Tools", 19 | "License :: OSI Approved :: MIT License", 20 | "Programming Language :: Python :: 3", 21 | "Programming Language :: Python :: 3.9", 22 | "Programming Language :: Python :: 3.10", 23 | "Programming Language :: Python :: 3.11", 24 | "Programming Language :: Python :: 3.12", 25 | "Programming Language :: Python :: 3.13", 26 | ] 27 | dependencies = [ 28 | # Platform-specific requirements 29 | "pip>=23.3", 30 | "wheel", 31 | "omegaconf>=2.0.6", 32 | "onnxruntime; sys_platform == 'darwin'", 33 | "onnxruntime-gpu; sys_platform != 'darwin'", 34 | 35 | # Core dependencies 36 | "PyYAML>=6.0", 37 | "tiktoken", 38 | "hyperpyyaml", 39 | "torch>=2.3.1", 40 | "tqdm>=4.63.1", 41 | "sortedcontainers", 42 | "torchvision>=0.18.1", 43 | "torchaudio>=2.3.1", 44 | "torchcodec>=0.8.1", 45 | 46 | # FAISS with version constraints 47 | "faiss-cpu==1.7.3; python_version < '3.12'", 48 | "faiss-cpu>=1.7.3; python_version >= '3.12'", 49 | 50 | # Machine learning, NLP and deep learning 51 | "transformers>=4.49.0", 52 | "scikit-learn", 53 | "einops>=0.8.0", 54 | 55 | # Pitch and sound processing 56 | "librosa>=0.10.2", 57 | "pydub>=0.25.1", 58 | "praat-parselmouth", 59 | "soundfile>=0.13.0", 60 | "pedalboard", 61 | 62 | # Data processing and calculation 63 | "numpy>=1.25.2,<2.0.0", 64 | "numba>=0.57.0", 65 | "scipy>=1.15.0", 66 | "matplotlib>=3.7.2", 67 | 68 | # Implementation and web framework 69 | "gradio>=5.23.3,<6.0.0", 70 | "requests>=2.32.3", 71 | "aiohttp", 72 | "pysrt", 73 | 74 | # Utility section 75 | "yt-dlp", 76 | "edge-tts>=7.2.0", 77 | "ffmpy==0.3.1", 78 | "ffmpeg-python>=0.2.0", 79 | "beautifulsoup4", 80 | 81 | # Tensorboard and ONNX 82 | "tensorboard", 83 | "onnx>=1.14", 84 | "onnxslim", 85 | "onnx2torch>=1.5.15", 86 | 87 | # Cryptography section 88 | "pycryptodome>=3.9.6,<4.0.0", 89 | 90 | # Realtime and VAD 91 | "sounddevice>=0.5.2", 92 | "webrtcvad-wheels>=2.0.14", 93 | ] 94 | 95 | [project.optional-dependencies] 96 | dev = [ 97 | "pytest", 98 | "pytest-cov", 99 | "black", 100 | "flake8", 101 | "mypy", 102 | ] 103 | 104 | [project.urls] 105 | Homepage = "https://github.com/ArkanDash/Advanced-RVC-Inference" 106 | Repository = "https://github.com/ArkanDash/Advanced-RVC-Inference" 107 | Issues = "https://github.com/ArkanDash/Advanced-RVC-Inference/issues" 108 | 109 | [project.scripts] 110 | advanced-rvc-inference = "advanced_rvc_inference.app:app" 111 | -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/train/create_index.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import faiss 4 | import argparse 5 | 6 | import numpy as np 7 | 8 | from multiprocessing import cpu_count 9 | from sklearn.cluster import MiniBatchKMeans 10 | 11 | sys.path.append(os.getcwd()) 12 | 13 | from advanced_rvc_inference.variables import logger, translations, configs 14 | 15 | def parse_arguments(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("--create_index", action='store_true') 18 | parser.add_argument("--model_name", type=str, required=True) 19 | parser.add_argument("--rvc_version", type=str, default="v2") 20 | parser.add_argument("--index_algorithm", type=str, default="Auto") 21 
| 22 | return parser.parse_args() 23 | 24 | def main(): 25 | args = parse_arguments() 26 | exp_dir = os.path.join(configs["logs_path"], args.model_name) 27 | version, index_algorithm = args.rvc_version, args.index_algorithm 28 | 29 | log_data = {translations['modelname']: args.model_name, translations['model_path']: exp_dir, translations['training_version']: version, translations['index_algorithm_info']: index_algorithm} 30 | for key, value in log_data.items(): 31 | logger.debug(f"{key}: {value}") 32 | 33 | try: 34 | npys = [] 35 | feature_dir = os.path.join(exp_dir, f"{version}_extracted") 36 | model_name = os.path.basename(exp_dir) 37 | 38 | for name in sorted(os.listdir(feature_dir)): 39 | npys.append(np.load(os.path.join(feature_dir, name))) 40 | 41 | big_npy = np.concatenate(npys, axis=0) 42 | big_npy_idx = np.arange(big_npy.shape[0]) 43 | np.random.shuffle(big_npy_idx) 44 | big_npy = big_npy[big_npy_idx] 45 | 46 | if big_npy.shape[0] > 2e5 and (index_algorithm == "Auto" or index_algorithm == "KMeans"): big_npy = (MiniBatchKMeans(n_clusters=10000, verbose=True, batch_size=256 * cpu_count(), compute_labels=False, init="random").fit(big_npy).cluster_centers_) 47 | np.save(os.path.join(exp_dir, "total_fea.npy"), big_npy) 48 | 49 | n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39) 50 | index_trained = faiss.index_factory(256 if version == "v1" else 768, f"IVF{n_ivf},Flat") 51 | index_ivf_trained = faiss.extract_index_ivf(index_trained) 52 | index_ivf_trained.nprobe = 1 53 | index_trained.train(big_npy) 54 | faiss.write_index(index_trained, os.path.join(exp_dir, f"trained_IVF{n_ivf}_Flat_nprobe_{index_ivf_trained.nprobe}_{model_name}_{version}.index")) 55 | 56 | index_added = faiss.index_factory(256 if version == "v1" else 768, f"IVF{n_ivf},Flat") 57 | index_ivf_added = faiss.extract_index_ivf(index_added) 58 | index_ivf_added.nprobe = 1 59 | index_added.train(big_npy) 60 | batch_size_add = 8192 61 | 62 | for i in range(0, big_npy.shape[0], batch_size_add): 63 | index_added.add(big_npy[i : i + batch_size_add]) 64 | 65 | index_filepath_added = os.path.join(exp_dir, f"added_IVF{n_ivf}_Flat_nprobe_{index_ivf_added.nprobe}_{model_name}_{version}.index") 66 | faiss.write_index(index_added, index_filepath_added) 67 | logger.info(f"{translations['save_index']} '{index_filepath_added}'") 68 | except Exception as e: 69 | logger.error(f"{translations['create_index_error']}: {e}") 70 | import traceback 71 | logger.debug(traceback.format_exc()) 72 | 73 | if __name__ == "__main__": main() -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/extracting/rms.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import tqdm 5 | import torch 6 | import librosa 7 | import traceback 8 | import concurrent.futures 9 | 10 | import numpy as np 11 | import torch.nn as nn 12 | 13 | sys.path.append(os.getcwd()) 14 | 15 | from advanced_rvc_inference.library.utils import load_audio 16 | from advanced_rvc_inference.variables import logger, translations 17 | from advanced_rvc_inference.infer.extracting.setup_path import setup_paths 18 | 19 | class RMSEnergyExtractor(nn.Module): 20 | def __init__(self, frame_length=2048, hop_length=512, center=True, pad_mode = "reflect"): 21 | super().__init__() 22 | self.frame_length = frame_length 23 | self.hop_length = hop_length 24 | self.center = center 25 | self.pad_mode = pad_mode 26 | 27 | def forward(self, x): 28 | 
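# Expects a mono batch shaped (1, num_samples); the asserts below enforce that before the RMS energy is
# computed frame-by-frame with librosa on the CPU and moved back to the input device.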
assert x.ndim == 2 29 | assert x.shape[0] == 1 30 | 31 | if str(x.device).startswith(("ocl", "privateuseone")): x = x.contiguous() 32 | 33 | rms = torch.from_numpy( 34 | librosa.feature.rms( 35 | y=x.squeeze(0).cpu().numpy(), 36 | frame_length=self.frame_length, 37 | hop_length=self.hop_length, 38 | center=self.center, 39 | pad_mode=self.pad_mode 40 | ) 41 | ) 42 | 43 | if str(x.device).startswith(("ocl", "privateuseone")): rms = rms.contiguous() 44 | return rms.squeeze(-2).to(x.device) 45 | 46 | def process_file_rms(files, device, threads): 47 | threads = max(1, threads) 48 | 49 | module = RMSEnergyExtractor( 50 | frame_length=2048, hop_length=160, center=True, pad_mode = "reflect" 51 | ).to(device).eval().float() 52 | 53 | def worker(file_info): 54 | try: 55 | file, out_path = file_info 56 | out_file_path = os.path.join(out_path, os.path.basename(file)) 57 | 58 | if os.path.exists(out_file_path + ".npy"): return 59 | feats = torch.from_numpy(load_audio(file, 16000)).unsqueeze(0) 60 | 61 | with torch.no_grad(): 62 | feats = module(feats if device.startswith(("ocl", "privateuseone")) else feats.to(device)) 63 | 64 | np.save(out_file_path, feats.float().cpu().numpy(), allow_pickle=False) 65 | except: 66 | logger.debug(traceback.format_exc()) 67 | 68 | with tqdm.tqdm(total=len(files), ncols=100, unit="p", leave=True) as pbar: 69 | with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor: 70 | for _ in concurrent.futures.as_completed([executor.submit(worker, f) for f in files]): 71 | pbar.update(1) 72 | 73 | def run_rms_extraction(exp_dir, num_processes, devices, rms_extract): 74 | if rms_extract: 75 | wav_path, out_path = setup_paths(exp_dir, rms_extract=rms_extract) 76 | paths = sorted([(os.path.join(wav_path, file), out_path) for file in os.listdir(wav_path) if file.endswith(".wav")]) 77 | 78 | start_time = time.time() 79 | logger.info(translations["rms_start_extract"].format(num_processes=num_processes)) 80 | 81 | with concurrent.futures.ProcessPoolExecutor(max_workers=len(devices)) as executor: 82 | concurrent.futures.wait([executor.submit(process_file_rms, paths[i::len(devices)], devices[i], num_processes // len(devices)) for i in range(len(devices))]) 83 | 84 | logger.info(translations["rms_success_extract"].format(elapsed_time=f"{(time.time() - start_time):.2f}")) -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/train/extracting/rms.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import tqdm 5 | import torch 6 | import librosa 7 | import traceback 8 | import concurrent.futures 9 | 10 | import numpy as np 11 | import torch.nn as nn 12 | 13 | sys.path.append(os.getcwd()) 14 | 15 | from advanced_rvc_inference.library.utils import load_audio 16 | from advanced_rvc_inference.variables import logger, translations 17 | from advanced_rvc_inference.infer.extracting.setup_path import setup_paths 18 | 19 | class RMSEnergyExtractor(nn.Module): 20 | def __init__(self, frame_length=2048, hop_length=512, center=True, pad_mode = "reflect"): 21 | super().__init__() 22 | self.frame_length = frame_length 23 | self.hop_length = hop_length 24 | self.center = center 25 | self.pad_mode = pad_mode 26 | 27 | def forward(self, x): 28 | assert x.ndim == 2 29 | assert x.shape[0] == 1 30 | 31 | if str(x.device).startswith(("ocl", "privateuseone")): x = x.contiguous() 32 | 33 | rms = torch.from_numpy( 34 | librosa.feature.rms( 35 | 
y=x.squeeze(0).cpu().numpy(), 36 | frame_length=self.frame_length, 37 | hop_length=self.hop_length, 38 | center=self.center, 39 | pad_mode=self.pad_mode 40 | ) 41 | ) 42 | 43 | if str(x.device).startswith(("ocl", "privateuseone")): rms = rms.contiguous() 44 | return rms.squeeze(-2).to(x.device) 45 | 46 | def process_file_rms(files, device, threads): 47 | threads = max(1, threads) 48 | 49 | module = RMSEnergyExtractor( 50 | frame_length=2048, hop_length=160, center=True, pad_mode = "reflect" 51 | ).to(device).eval().float() 52 | 53 | def worker(file_info): 54 | try: 55 | file, out_path = file_info 56 | out_file_path = os.path.join(out_path, os.path.basename(file)) 57 | 58 | if os.path.exists(out_file_path + ".npy"): return 59 | feats = torch.from_numpy(load_audio(file, 16000)).unsqueeze(0) 60 | 61 | with torch.no_grad(): 62 | feats = module(feats if device.startswith(("ocl", "privateuseone")) else feats.to(device)) 63 | 64 | np.save(out_file_path, feats.float().cpu().numpy(), allow_pickle=False) 65 | except: 66 | logger.debug(traceback.format_exc()) 67 | 68 | with tqdm.tqdm(total=len(files), ncols=100, unit="p", leave=True) as pbar: 69 | with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor: 70 | for _ in concurrent.futures.as_completed([executor.submit(worker, f) for f in files]): 71 | pbar.update(1) 72 | 73 | def run_rms_extraction(exp_dir, num_processes, devices, rms_extract): 74 | if rms_extract: 75 | wav_path, out_path = setup_paths(exp_dir, rms_extract=rms_extract) 76 | paths = sorted([(os.path.join(wav_path, file), out_path) for file in os.listdir(wav_path) if file.endswith(".wav")]) 77 | 78 | start_time = time.time() 79 | logger.info(translations["rms_start_extract"].format(num_processes=num_processes)) 80 | 81 | with concurrent.futures.ProcessPoolExecutor(max_workers=len(devices)) as executor: 82 | concurrent.futures.wait([executor.submit(process_file_rms, paths[i::len(devices)], devices[i], num_processes // len(devices)) for i in range(len(devices))]) 83 | 84 | logger.info(translations["rms_success_extract"].format(elapsed_time=f"{(time.time() - start_time):.2f}")) -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/extracting/embedding.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gc 3 | import sys 4 | import time 5 | import tqdm 6 | import torch 7 | import traceback 8 | import concurrent.futures 9 | 10 | import numpy as np 11 | 12 | sys.path.append(os.getcwd()) 13 | 14 | from advanced_rvc_inference.variables import logger, translations, config 15 | from advanced_rvc_inference.infer.extracting.setup_path import setup_paths 16 | from advanced_rvc_inference.library.utils import load_audio, load_embedders_model, extract_features 17 | 18 | def process_file_embedding(files, embedder_model, embedders_mode, device, version, is_half, threads): 19 | model = load_embedders_model(embedder_model, embedders_mode) 20 | if isinstance(model, torch.nn.Module): model = model.to(device).to(torch.float16 if is_half else torch.float32).eval() 21 | 22 | def worker(file_info): 23 | try: 24 | file, out_path = file_info 25 | out_file_path = os.path.join(out_path, os.path.basename(file.replace("wav", "npy"))) if os.path.isdir(out_path) else out_path 26 | 27 | if os.path.exists(out_file_path): return 28 | feats = torch.from_numpy(load_audio(file, 16000)).to(device).to(torch.float16 if is_half else torch.float32) 29 | 30 | with torch.no_grad(): 31 | feats = 
extract_features(model, feats.view(1, -1), version, device) 32 | 33 | feats = feats.squeeze(0).float().cpu().numpy() 34 | if not np.isnan(feats).any(): np.save(out_file_path, feats, allow_pickle=False) 35 | else: logger.warning(f"{file} {translations['NaN']}") 36 | except: 37 | logger.debug(traceback.format_exc()) 38 | 39 | with tqdm.tqdm(total=len(files), ncols=100, unit="p", leave=True) as pbar: 40 | with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor: 41 | for _ in concurrent.futures.as_completed([executor.submit(worker, f) for f in files]): 42 | pbar.update(1) 43 | 44 | def run_embedding_extraction(exp_dir, version, num_processes, devices, embedder_model, embedders_mode, is_half): 45 | wav_path, out_path = setup_paths(exp_dir, version) 46 | 47 | logger.info(translations["start_extract_hubert"]) 48 | num_processes = 1 if (config.device.startswith("ocl") and embedders_mode == "onnx") or config.device.startswith("privateuseone") else num_processes 49 | paths = sorted([(os.path.join(wav_path, file), out_path) for file in os.listdir(wav_path) if file.endswith(".wav")]) 50 | 51 | start_time = time.time() 52 | with concurrent.futures.ProcessPoolExecutor(max_workers=len(devices)) as executor: 53 | concurrent.futures.wait([executor.submit(process_file_embedding, paths[i::len(devices)], embedder_model, embedders_mode, devices[i], version, is_half, num_processes // len(devices)) for i in range(len(devices))]) 54 | 55 | gc.collect() 56 | logger.info(translations["extract_hubert_success"].format(elapsed_time=f"{(time.time() - start_time):.2f}")) 57 | 58 | def create_mute_file(version, embedder_model, embedders_mode, is_half): 59 | start_time = time.time() 60 | logger.info(translations["start_extract_hubert"]) 61 | 62 | process_file_embedding([(os.path.join("assets", "logs", "mute", "sliced_audios_16k", "mute.wav"), os.path.join("assets", "logs", "mute", f"{version}_extracted", f"mute_{embedder_model}.npy"))], embedder_model, embedders_mode, config.device, version, is_half, 1) 63 | 64 | gc.collect() 65 | logger.info(translations["extract_hubert_success"].format(elapsed_time=f"{(time.time() - start_time):.2f}")) -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/train/extracting/embedding.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gc 3 | import sys 4 | import time 5 | import tqdm 6 | import torch 7 | import traceback 8 | import concurrent.futures 9 | 10 | import numpy as np 11 | 12 | sys.path.append(os.getcwd()) 13 | 14 | from advanced_rvc_inference.variables import logger, translations, config 15 | from advanced_rvc_inference.infer.extracting.setup_path import setup_paths 16 | from advanced_rvc_inference.library.utils import load_audio, load_embedders_model, extract_features 17 | 18 | def process_file_embedding(files, embedder_model, embedders_mode, device, version, is_half, threads): 19 | model = load_embedders_model(embedder_model, embedders_mode) 20 | if isinstance(model, torch.nn.Module): model = model.to(device).to(torch.float16 if is_half else torch.float32).eval() 21 | 22 | def worker(file_info): 23 | try: 24 | file, out_path = file_info 25 | out_file_path = os.path.join(out_path, os.path.basename(file.replace("wav", "npy"))) if os.path.isdir(out_path) else out_path 26 | 27 | if os.path.exists(out_file_path): return 28 | feats = torch.from_numpy(load_audio(file, 16000)).to(device).to(torch.float16 if is_half else torch.float32) 29 | 30 | with 
torch.no_grad(): 31 | feats = extract_features(model, feats.view(1, -1), version, device) 32 | 33 | feats = feats.squeeze(0).float().cpu().numpy() 34 | if not np.isnan(feats).any(): np.save(out_file_path, feats, allow_pickle=False) 35 | else: logger.warning(f"{file} {translations['NaN']}") 36 | except: 37 | logger.debug(traceback.format_exc()) 38 | 39 | with tqdm.tqdm(total=len(files), ncols=100, unit="p", leave=True) as pbar: 40 | with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor: 41 | for _ in concurrent.futures.as_completed([executor.submit(worker, f) for f in files]): 42 | pbar.update(1) 43 | 44 | def run_embedding_extraction(exp_dir, version, num_processes, devices, embedder_model, embedders_mode, is_half): 45 | wav_path, out_path = setup_paths(exp_dir, version) 46 | 47 | logger.info(translations["start_extract_hubert"]) 48 | num_processes = 1 if (config.device.startswith("ocl") and embedders_mode == "onnx") or config.device.startswith("privateuseone") else num_processes 49 | paths = sorted([(os.path.join(wav_path, file), out_path) for file in os.listdir(wav_path) if file.endswith(".wav")]) 50 | 51 | start_time = time.time() 52 | with concurrent.futures.ProcessPoolExecutor(max_workers=len(devices)) as executor: 53 | concurrent.futures.wait([executor.submit(process_file_embedding, paths[i::len(devices)], embedder_model, embedders_mode, devices[i], version, is_half, num_processes // len(devices)) for i in range(len(devices))]) 54 | 55 | gc.collect() 56 | logger.info(translations["extract_hubert_success"].format(elapsed_time=f"{(time.time() - start_time):.2f}")) 57 | 58 | def create_mute_file(version, embedder_model, embedders_mode, is_half): 59 | start_time = time.time() 60 | logger.info(translations["start_extract_hubert"]) 61 | 62 | process_file_embedding([(os.path.join("assets", "logs", "mute", "sliced_audios_16k", "mute.wav"), os.path.join("assets", "logs", "mute", f"{version}_extracted", f"mute_{embedder_model}.npy"))], embedder_model, embedders_mode, config.device, version, is_half, 1) 63 | 64 | gc.collect() 65 | logger.info(translations["extract_hubert_success"].format(elapsed_time=f"{(time.time() - start_time):.2f}")) -------------------------------------------------------------------------------- /advanced_rvc_inference/library/algorithm/modules.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | import torch.nn.utils.parametrize as parametrize 6 | 7 | sys.path.append(os.getcwd()) 8 | 9 | from .commons import fused_add_tanh_sigmoid_multiply 10 | 11 | class WaveNet(torch.nn.Module): 12 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): 13 | super(WaveNet, self).__init__() 14 | assert kernel_size % 2 == 1 15 | self.hidden_channels = hidden_channels 16 | self.kernel_size = (kernel_size,) 17 | self.dilation_rate = dilation_rate 18 | self.n_layers = n_layers 19 | self.gin_channels = gin_channels 20 | self.p_dropout = p_dropout 21 | self.in_layers = torch.nn.ModuleList() 22 | self.res_skip_layers = torch.nn.ModuleList() 23 | self.drop = torch.nn.Dropout(p_dropout) 24 | if gin_channels != 0: self.cond_layer = torch.nn.utils.parametrizations.weight_norm(torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1), name="weight") 25 | dilations = [dilation_rate ** i for i in range(n_layers)] 26 | paddings = [(kernel_size * d - d) // 2 for d in dilations] 27 | 28 | for i in range(n_layers): 29 | in_layer = 
torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilations[i], padding=paddings[i]) 30 | in_layer = torch.nn.utils.parametrizations.weight_norm(in_layer, name="weight") 31 | self.in_layers.append(in_layer) 32 | res_skip_channels = (hidden_channels if i == n_layers - 1 else 2 * hidden_channels) 33 | res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 34 | res_skip_layer = torch.nn.utils.parametrizations.weight_norm(res_skip_layer, name="weight") 35 | self.res_skip_layers.append(res_skip_layer) 36 | 37 | def forward(self, x, x_mask, g=None): 38 | output = x.clone().zero_() 39 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 40 | 41 | if g is not None: g = self.cond_layer(g) 42 | 43 | for i in range(self.n_layers): 44 | x_in = self.in_layers[i](x) 45 | g_l = (g[:, i * 2 * self.hidden_channels : (i + 1) * 2 * self.hidden_channels, :] if g is not None else 0) 46 | res_skip_acts = self.res_skip_layers[i](self.drop(fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor))) 47 | 48 | if i < self.n_layers - 1: 49 | x = (x + (res_skip_acts[:, : self.hidden_channels, :])) * x_mask 50 | output = output + res_skip_acts[:, self.hidden_channels :, :] 51 | else: output = output + res_skip_acts 52 | 53 | return output * x_mask 54 | 55 | def remove_weight_norm(self): 56 | if self.gin_channels != 0: 57 | if hasattr(self.cond_layer, "parametrizations") and "weight" in self.cond_layer.parametrizations: parametrize.remove_parametrizations(self.cond_layer, "weight", leave_parametrized=True) 58 | else: torch.nn.utils.remove_weight_norm(self.cond_layer) 59 | 60 | for l in self.in_layers: 61 | if hasattr(l, "parametrizations") and "weight" in l.parametrizations: parametrize.remove_parametrizations(l, "weight", leave_parametrized=True) 62 | else: torch.nn.utils.remove_weight_norm(l) 63 | 64 | for l in self.res_skip_layers: 65 | if hasattr(l, "parametrizations") and "weight" in l.parametrizations: parametrize.remove_parametrizations(l, "weight", leave_parametrized=True) 66 | else: torch.nn.utils.remove_weight_norm(l) -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/RMVPE/RMVPE.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | import numpy as np 6 | import torch.nn.functional as F 7 | 8 | sys.path.append(os.getcwd()) 9 | 10 | from advanced_rvc_inference.library.predictors.RMVPE.mel import MelSpectrogram 11 | 12 | N_MELS, N_CLASS = 128, 360 13 | 14 | class RMVPE: 15 | def __init__(self, model_path, is_half, device=None, providers=None, onnx=False): 16 | self.onnx = onnx 17 | 18 | if self.onnx: 19 | import onnxruntime as ort 20 | 21 | sess_options = ort.SessionOptions() 22 | sess_options.log_severity_level = 3 23 | self.model = ort.InferenceSession(model_path, sess_options=sess_options, providers=providers) 24 | else: 25 | from advanced_rvc_inference.library.predictors.RMVPE.e2e import E2E 26 | 27 | model = E2E(4, 1, (2, 2)) 28 | ckpt = torch.load(model_path, map_location="cpu", weights_only=True) 29 | model.load_state_dict(ckpt) 30 | model.eval() 31 | if is_half: model = model.half() 32 | self.model = model.to(device) 33 | 34 | self.is_half = is_half 35 | self.device = device 36 | self.mel_extractor = MelSpectrogram(N_MELS, 16000, 1024, 160, None, 30, 8000).to(device) 37 | cents_mapping = 20 * np.arange(N_CLASS) + 1997.3794084376191 38 | self.cents_mapping = np.pad(cents_mapping, (4, 4)) 39 | 
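    # Rough picture of the pitch decoding implemented below (a sketch inferred from the
    # constants set in __init__, not from an external spec):
    #   - the model outputs a salience map over N_CLASS = 360 pitch bins spaced 20 cents
    #     apart; cents_mapping assigns each bin its pitch in cents relative to 10 Hz,
    #     so bin 0 corresponds to about 31.7 Hz.
    #   - mel2hidden pads the mel spectrogram up to a multiple of 32 frames (presumably
    #     so the model's internal downsampling divides evenly), runs inference, then
    #     trims the result back to the original frame count.
    #   - to_local_average_cents takes, per frame, a salience-weighted average of the
    #     cents values in a 9-bin window around the argmax bin; decode then converts
    #     cents to Hz via f0 = 10 * 2 ** (cents / 1200), so frames whose salience never
    #     exceeds the threshold come out as exactly 10 Hz and are zeroed as unvoiced.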
40 | def mel2hidden(self, mel): 41 | with torch.no_grad(): 42 | n_frames = mel.shape[-1] 43 | n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames 44 | if n_pad > 0: mel = F.pad(mel, (0, n_pad), mode="constant") 45 | 46 | if self.onnx: 47 | hidden = self.model.run( 48 | [self.model.get_outputs()[0].name], 49 | { 50 | self.model.get_inputs()[0].name: mel.cpu().numpy().astype(np.float32) 51 | } 52 | )[0] 53 | else: 54 | hidden = self.model( 55 | mel.half() if self.is_half else mel.float() 56 | ) 57 | 58 | return hidden[:, :n_frames] 59 | 60 | def decode(self, hidden, thred=0.03): 61 | f0 = 10 * (2 ** (self.to_local_average_cents(hidden, thred=thred) / 1200)) 62 | f0[f0 == 10] = 0 63 | 64 | return f0 65 | 66 | def infer_from_audio(self, audio, thred=0.03): 67 | hidden = self.mel2hidden(self.mel_extractor(torch.from_numpy(audio).float().to(self.device).unsqueeze(0), center=True)) 68 | 69 | return self.decode(hidden.squeeze(0).cpu().numpy().astype(np.float32) if not self.onnx else hidden[0], thred=thred) 70 | 71 | def infer_from_audio_with_pitch(self, audio, thred=0.03, f0_min=50, f0_max=1100): 72 | f0 = self.infer_from_audio(audio, thred) 73 | f0[(f0 < f0_min) | (f0 > f0_max)] = 0 74 | 75 | return f0 76 | 77 | def to_local_average_cents(self, salience, thred=0.05): 78 | center = np.argmax(salience, axis=1) 79 | salience = np.pad(salience, ((0, 0), (4, 4))) 80 | center += 4 81 | todo_salience, todo_cents_mapping = [], [] 82 | starts = center - 4 83 | ends = center + 5 84 | 85 | for idx in range(salience.shape[0]): 86 | todo_salience.append(salience[:, starts[idx] : ends[idx]][idx]) 87 | todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]]) 88 | 89 | todo_salience = np.array(todo_salience) 90 | devided = np.sum(todo_salience * np.array(todo_cents_mapping), 1) / np.sum(todo_salience, 1) 91 | devided[np.max(salience, axis=1) <= thred] = 0 92 | 93 | return devided -------------------------------------------------------------------------------- /advanced_rvc_inference/app.py: -------------------------------------------------------------------------------- 1 | import os, io 2 | #import ssl 3 | import sys 4 | import time 5 | import codecs 6 | import logging 7 | import warnings 8 | 9 | import gradio as gr 10 | 11 | sys.path.append(os.getcwd()) 12 | start_time = time.time() 13 | 14 | from advanced_rvc_inference.tabs.extra.extra import extra_tab 15 | from advanced_rvc_inference.tabs.training.training import training_tab 16 | from advanced_rvc_inference.tabs.downloads.downloads import download_tab 17 | from advanced_rvc_inference.tabs.inference.inference import inference_tab 18 | from advanced_rvc_inference.variables import logger, config, translations, theme, font, configs, language, allow_disk 19 | from advanced_rvc_inference.mainjs import js_code 20 | #ssl._create_default_https_context = ssl._create_unverified_context 21 | 22 | warnings.filterwarnings("ignore") 23 | for l in ["httpx", "uvicorn", "httpcore", "urllib3"]: 24 | logging.getLogger(l).setLevel(logging.ERROR) 25 | 26 | 27 | client_mode = "--client" in sys.argv 28 | 29 | with gr.Blocks( 30 | title="📱 Advanced RVC Inference", 31 | js=js_code if client_mode else None, 32 | theme=theme, 33 | ) as app: 34 | gr.HTML("