├── advanced_rvc_inference ├── __init__.py ├── assets │ ├── __init__.py │ ├── f0 │ │ ├── .gitattributes │ │ └── __init__.py │ ├── audios │ │ ├── .gitattributes │ │ ├── others │ │ │ ├── .gitattributes │ │ │ └── __init__.py │ │ ├── rvc │ │ │ ├── .gitattributes │ │ │ └── __init__.py │ │ ├── tts │ │ │ ├── ..gitattributes │ │ │ └── __init__.py │ │ ├── uvr │ │ │ ├── .gitattributes │ │ │ └── __init__.py │ │ └── __init__.py │ ├── binary │ │ ├── __init__.py │ │ ├── world.bin │ │ ├── decrypt.bin │ │ └── vr_params.bin │ ├── dataset │ │ ├── .gitattributes │ │ └── __init__.py │ ├── presets │ │ ├── .gitattributes │ │ └── __init__.py │ ├── weights │ │ ├── .gitattributes │ │ └── __init__.py │ ├── languages │ │ └── __init__.py │ ├── models │ │ ├── uvr5 │ │ │ ├── .gitattributes │ │ │ └── __init__.py │ │ ├── embedders │ │ │ ├── .gitattributes │ │ │ └── __init__.py │ │ ├── predictors │ │ │ ├── .gitattributes │ │ │ └── __init__.py │ │ ├── pretrained_v1 │ │ │ ├── .gitattributes │ │ │ └── __init__.py │ │ ├── pretrained_v2 │ │ │ ├── .gitattributes │ │ │ └── __init__.py │ │ ├── pretrained_custom │ │ │ ├── .gitattributes │ │ │ └── __init__.py │ │ ├── speaker_diarization │ │ │ ├── models │ │ │ │ ├── .gitattributes │ │ │ │ └── __init__.py │ │ │ ├── __init__.py │ │ │ └── assets │ │ │ │ ├── __init__.py │ │ │ │ └── mel_filters.npz │ │ └── __init__.py │ ├── logs │ │ ├── reference │ │ │ └── .gitattributes │ │ └── mute │ │ │ ├── f0 │ │ │ └── mute.wav.npy │ │ │ ├── energy │ │ │ └── mute.wav.npy │ │ │ ├── f0_voiced │ │ │ └── mute.wav.npy │ │ │ ├── v1_extracted │ │ │ ├── mute.npy │ │ │ ├── mute_chinese.npy │ │ │ ├── mute_japanese.npy │ │ │ ├── mute_korean.npy │ │ │ ├── mute_spin-v1.npy │ │ │ ├── mute_spin-v2.npy │ │ │ ├── mute_portuguese.npy │ │ │ └── mute_vietnamese.npy │ │ │ ├── v2_extracted │ │ │ ├── mute.npy │ │ │ ├── mute_chinese.npy │ │ │ ├── mute_japanese.npy │ │ │ ├── mute_korean.npy │ │ │ ├── mute_spin-v1.npy │ │ │ ├── mute_spin-v2.npy │ │ │ ├── mute_portuguese.npy │ │ │ └── mute_vietnamese.npy │ │ │ ├── sliced_audios │ │ │ ├── mute32000.wav │ │ │ ├── mute40000.wav │ │ │ └── mute48000.wav │ │ │ └── sliced_audios_16k │ │ │ └── mute.wav │ ├── zluda │ │ ├── __init__.py │ │ ├── run_app.bat │ │ ├── path-zluda-hip57.bat │ │ ├── path-zluda-hip61.bat │ │ └── path-zluda-hip62.bat │ └── config.txt ├── configs │ ├── __init__.py │ ├── v1 │ │ ├── __init__.py │ │ ├── 32000.json │ │ ├── 40000.json │ │ └── 48000.json │ └── v2 │ │ ├── __init__.py │ │ ├── 32000.json │ │ ├── 40000.json │ │ └── 48000.json ├── core │ ├── __init__.py │ ├── restart.py │ ├── f0_extract.py │ ├── csrt.py │ └── separate.py ├── library │ ├── __init__.py │ ├── onnx │ │ ├── __init__.py │ │ └── wrapper.py │ ├── backends │ │ ├── __init__.py │ │ ├── directml.py │ │ ├── zluda.py │ │ └── opencl.py │ ├── embedders │ │ ├── __init__.py │ │ ├── transformers.py │ │ ├── onnx.py │ │ └── ppg.py │ ├── generators │ │ ├── __init__.py │ │ └── hifigan.py │ ├── predictors │ │ ├── __init__.py │ │ ├── DJCM │ │ │ ├── __init__.py │ │ │ ├── model.py │ │ │ ├── spec.py │ │ │ ├── encoder.py │ │ │ ├── utils.py │ │ │ └── decoder.py │ │ ├── FCPE │ │ │ ├── __init__.py │ │ │ ├── wav2mel.py │ │ │ ├── utils.py │ │ │ ├── stft.py │ │ │ └── encoder.py │ │ ├── PENN │ │ │ ├── __init__.py │ │ │ ├── fcn.py │ │ │ └── core.py │ │ ├── CREPE │ │ │ ├── __init__.py │ │ │ ├── filter.py │ │ │ └── model.py │ │ ├── PESTO │ │ │ ├── __init__.py │ │ │ └── PESTO.py │ │ ├── RMVPE │ │ │ ├── __init__.py │ │ │ ├── e2e.py │ │ │ ├── mel.py │ │ │ └── RMVPE.py │ │ ├── SWIFT │ │ │ ├── __init__.py │ │ │ └── SWIFT.py │ │ └── 
WORLD │ │ │ └── __init__.py │ ├── uvr5_lib │ │ ├── __init__.py │ │ ├── demucs │ │ │ ├── __init__.py │ │ │ ├── utils.py │ │ │ └── states.py │ │ └── vr_network │ │ │ ├── __init__.py │ │ │ ├── model_param_init.py │ │ │ └── layers_new.py │ ├── architectures │ │ └── __init__.py │ ├── speaker_diarization │ │ ├── __init__.py │ │ ├── embedding.py │ │ └── parameter_transfer.py │ └── algorithm │ │ ├── __init__.py │ │ ├── normalization.py │ │ ├── commons.py │ │ ├── modules.py │ │ └── discriminators.py ├── tabs │ ├── __init__.py │ ├── extra │ │ ├── __init__.py │ │ ├── child │ │ │ ├── __init__.py │ │ │ ├── read_model.py │ │ │ ├── convert_model.py │ │ │ ├── fushion.py │ │ │ ├── create_srt.py │ │ │ └── f0_extract.py │ │ └── extra.py │ ├── realtime │ │ └── __init__.py │ ├── training │ │ ├── __init__.py │ │ ├── child │ │ │ └── __init__.py │ │ └── training.py │ ├── downloads │ │ └── __init__.py │ └── inference │ │ ├── __init__.py │ │ ├── child │ │ └── __init__.py │ │ └── inference.py ├── tools │ ├── __init__.py │ ├── pixeldrain.py │ ├── huggingface.py │ └── mediafire.py ├── infer │ ├── __init__.py │ ├── rvc │ │ └── __init__.py │ ├── train │ │ ├── __init__.py │ │ ├── training │ │ │ ├── __init__.py │ │ │ ├── losses.py │ │ │ ├── extract_model.py │ │ │ └── anyprecision_optimizer.py │ │ ├── extracting │ │ │ ├── __init__.py │ │ │ ├── setup_path.py │ │ │ ├── rms.py │ │ │ ├── embedding.py │ │ │ ├── feature.py │ │ │ └── preparing_files.py │ │ ├── preprocess │ │ │ └── __init__.py │ │ └── create_index.py │ ├── extracting │ │ ├── __init__.py │ │ ├── setup_path.py │ │ ├── rms.py │ │ ├── embedding.py │ │ ├── feature.py │ │ └── preparing_files.py │ └── realtime │ │ ├── __init__.py │ │ └── vad_utils.py ├── run_tensorboard.py └── app.py ├── LICENSE ├── requirements.txt ├── installer.bat ├── CONTRIBUTING.md ├── pyproject.toml └── Advanced-RVC.ipynb /advanced_rvc_inference/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/f0/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/configs/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/core/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/tools/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
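The directory tree above maps one-to-one onto Python import paths, with advanced_rvc_inference as the top-level package and app.py / run_tensorboard.py as the runnable entry points. A minimal, hedged sketch of importing two of the modules listed in full later in this dump, assuming the repository root is on sys.path (run_tensorboard.py and the tab modules arrange this themselves via sys.path.append(os.getcwd())):

import os
import sys

sys.path.append(os.getcwd())  # mirror what the repository's own scripts do

# Both modules appear in full later in this listing.
from advanced_rvc_inference.tools.huggingface import HF_download_file
from advanced_rvc_inference.infer.extracting.setup_path import setup_paths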
/advanced_rvc_inference/assets/audios/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/binary/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/dataset/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/f0/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/presets/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/weights/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/configs/v1/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/configs/v2/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/onnx/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/extra/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/realtime/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/training/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/languages/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/uvr5/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/backends/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
/advanced_rvc_inference/library/embedders/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/generators/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/uvr5_lib/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/downloads/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/extra/child/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/inference/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/audios/others/.gitattributes: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/audios/rvc/.gitattributes: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/audios/tts/..gitattributes: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/audios/uvr/.gitattributes: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/reference/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/embedders/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/predictors/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/architectures/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/uvr5_lib/demucs/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 
-------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/inference/child/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/training/child/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/pretrained_v1/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/pretrained_v2/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/speaker_diarization/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/uvr5_lib/vr_network/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/pretrained_custom/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/speaker_diarization/models/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for infer module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/rvc/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for infer.rvc module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/train/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for infer.train module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/audios/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.audios module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.models module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/zluda/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.zluda module.""" 
-------------------------------------------------------------------------------- /advanced_rvc_inference/assets/presets/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.presets module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/weights/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.weights module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/extracting/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for infer.extracting module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/realtime/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for infer.realtime module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/audios/rvc/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.audios.rvc module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/audios/tts/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.audios.tts module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/audios/uvr/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.audios.uvr module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/uvr5/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.models.uvr5 module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/library/algorithm/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for library.algorithm module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/audios/others/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.audios.others module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/train/training/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for infer.train.training module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/embedders/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.models.embedders module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/train/extracting/__init__.py: 
-------------------------------------------------------------------------------- 1 | """Package initialization for infer.train.extracting module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/train/preprocess/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for infer.train.preprocess module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/DJCM/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for library.predictors.DJCM module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/FCPE/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for library.predictors.FCPE module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/PENN/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for library.predictors.PENN module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/predictors/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.models.predictors module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/CREPE/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for library.predictors.CREPE module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/PESTO/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for library.predictors.PESTO module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/RMVPE/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for library.predictors.RMVPE module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/SWIFT/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for library.predictors.SWIFT module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/WORLD/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for library.predictors.WORLD module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/pretrained_v1/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.models.pretrained_v1 module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/pretrained_v2/__init__.py: 
-------------------------------------------------------------------------------- 1 | """Package initialization for assets.models.pretrained_v2 module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/pretrained_custom/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.models.pretrained_custom module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/speaker_diarization/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.models.speaker_diarization module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/speaker_diarization/assets/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.models.speaker_diarization.assets module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/speaker_diarization/models/__init__.py: -------------------------------------------------------------------------------- 1 | """Package initialization for assets.models.speaker_diarization.models module.""" -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/binary/world.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/binary/world.bin -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/binary/decrypt.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/binary/decrypt.bin -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/binary/vr_params.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/binary/vr_params.bin -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/f0/mute.wav.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/f0/mute.wav.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/energy/mute.wav.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/energy/mute.wav.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/f0_voiced/mute.wav.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/f0_voiced/mute.wav.npy 
-------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v1_extracted/mute.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v1_extracted/mute.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v2_extracted/mute.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v2_extracted/mute.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/sliced_audios/mute32000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/sliced_audios/mute32000.wav -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/sliced_audios/mute40000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/sliced_audios/mute40000.wav -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/sliced_audios/mute48000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/sliced_audios/mute48000.wav -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/sliced_audios_16k/mute.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/sliced_audios_16k/mute.wav -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_chinese.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_chinese.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_japanese.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_japanese.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_korean.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_korean.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_spin-v1.npy: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_spin-v1.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_spin-v2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_spin-v2.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_chinese.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_chinese.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_japanese.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_japanese.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_korean.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_korean.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_spin-v1.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_spin-v1.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_spin-v2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_spin-v2.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_portuguese.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_portuguese.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_vietnamese.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v1_extracted/mute_vietnamese.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_portuguese.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_portuguese.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_vietnamese.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/logs/mute/v2_extracted/mute_vietnamese.npy -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/models/speaker_diarization/assets/mel_filters.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Advanced-RVC-Inference/HEAD/advanced_rvc_inference/assets/models/speaker_diarization/assets/mel_filters.npz -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/zluda/run_app.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | setlocal 3 | title Vietnamese RVC By Anh [ZLUDA] 4 | 5 | set HIP_VISIBLE_DEVICES="0" 6 | set ZLUDA_COMGR_LOG_LEVEL=1 7 | SET DISABLE_ADDMM_CUDA_LT=1 8 | 9 | zluda\zluda.exe -- env\\Scripts\\python.exe main\\app\\app.py --open --allow_all_disk 10 | echo. 11 | pause -------------------------------------------------------------------------------- /advanced_rvc_inference/library/uvr5_lib/demucs/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def center_trim(tensor, reference): 4 | ref_size = reference.size(-1) if isinstance(reference, torch.Tensor) else reference 5 | delta = tensor.size(-1) - ref_size 6 | 7 | if delta < 0: raise ValueError(f"tensor > parameter: {delta}.") 8 | if delta: tensor = tensor[..., delta // 2 : -(delta - delta // 2)] 9 | 10 | return tensor -------------------------------------------------------------------------------- /advanced_rvc_inference/library/embedders/transformers.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import HubertModel 3 | 4 | class HubertModelWithFinalProj(HubertModel): 5 | def __init__(self, config): 6 | super().__init__(config) 7 | self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size) 8 | 9 | def extract_features(self, source, padding_mask = None, output_layer = None): 10 | return self.forward(source) -------------------------------------------------------------------------------- /advanced_rvc_inference/tools/pixeldrain.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | 4 | def pixeldrain(url, output_dir): 5 | try: 6 | response = requests.get(f"https://pixeldrain.com/api/file/{url.split('pixeldrain.com/u/')[1]}") 7 | 8 | if response.status_code == 200: 9 | file_path = os.path.join(output_dir, (response.headers.get("Content-Disposition").split("filename=")[-1].strip('";'))) 10 | 11 | with open(file_path, "wb") as newfile: 12 | newfile.write(response.content) 13 | return file_path 14 | else: return None 15 | except Exception as e: 16 | raise RuntimeError(e) -------------------------------------------------------------------------------- /advanced_rvc_inference/library/algorithm/normalization.py: -------------------------------------------------------------------------------- 
1 | import torch 2 | 3 | import torch.nn.functional as F 4 | 5 | class LayerNorm(torch.nn.Module): 6 | def __init__(self, channels, eps=1e-5, onnx=False): 7 | super().__init__() 8 | self.channels = channels 9 | self.eps = eps 10 | self.onnx = onnx 11 | self.gamma = torch.nn.Parameter(torch.ones(channels)) 12 | self.beta = torch.nn.Parameter(torch.zeros(channels)) 13 | 14 | def forward(self, x): 15 | x = x.transpose(1, -1) 16 | return (F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) if self.onnx else F.layer_norm(x, (x.size(-1),), self.gamma, self.beta, self.eps)).transpose(1, -1) -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/zluda/path-zluda-hip57.bat: -------------------------------------------------------------------------------- 1 | rmdir /S /q zluda 2 | curl -s -L https://github.com/lshqqytiger/ZLUDA/releases/download/rel.c0804ca624963aab420cb418412b1c7fbae3454b/ZLUDA-windows-rocm5-amd64.zip > zluda.zip 3 | tar -xf zluda.zip 4 | del zluda.zip 5 | 6 | if exist "runtime" ( 7 | copy zluda\cublas.dll runtime\Lib\site-packages\torch\lib\cublas64_11.dll /y 8 | copy zluda\cusparse.dll runtime\Lib\site-packages\torch\lib\cusparse64_11.dll /y 9 | copy zluda\nvrtc.dll runtime\Lib\site-packages\torch\lib\nvrtc64_112_0.dll /y 10 | ) else ( 11 | copy zluda\cublas.dll env\Lib\site-packages\torch\lib\cublas64_11.dll /y 12 | copy zluda\cusparse.dll env\Lib\site-packages\torch\lib\cusparse64_11.dll /y 13 | copy zluda\nvrtc.dll env\Lib\site-packages\torch\lib\nvrtc64_112_0.dll /y 14 | ) -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/zluda/path-zluda-hip61.bat: -------------------------------------------------------------------------------- 1 | rmdir /S /q zluda 2 | curl -s -L https://github.com/lshqqytiger/ZLUDA/releases/download/rel.c0804ca624963aab420cb418412b1c7fbae3454b/ZLUDA-windows-rocm6-amd64.zip > zluda.zip 3 | tar -xf zluda.zip 4 | del zluda.zip 5 | 6 | if exist "runtime" ( 7 | copy zluda\cublas.dll runtime\Lib\site-packages\torch\lib\cublas64_11.dll /y 8 | copy zluda\cusparse.dll runtime\Lib\site-packages\torch\lib\cusparse64_11.dll /y 9 | copy zluda\nvrtc.dll runtime\Lib\site-packages\torch\lib\nvrtc64_112_0.dll /y 10 | ) else ( 11 | copy zluda\cublas.dll env\Lib\site-packages\torch\lib\cublas64_11.dll /y 12 | copy zluda\cusparse.dll env\Lib\site-packages\torch\lib\cusparse64_11.dll /y 13 | copy zluda\nvrtc.dll env\Lib\site-packages\torch\lib\nvrtc64_112_0.dll /y 14 | ) -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/extracting/setup_path.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def setup_paths(exp_dir, version = None, rms_extract = False): 4 | wav_path = os.path.join(exp_dir, "sliced_audios_16k") 5 | 6 | if rms_extract: 7 | out_path = os.path.join(exp_dir, "energy") 8 | os.makedirs(out_path, exist_ok=True) 9 | 10 | return wav_path, out_path 11 | 12 | if version: 13 | out_path = os.path.join(exp_dir, f"{version}_extracted") 14 | os.makedirs(out_path, exist_ok=True) 15 | 16 | return wav_path, out_path 17 | else: 18 | output_root1, output_root2 = os.path.join(exp_dir, "f0"), os.path.join(exp_dir, "f0_voiced") 19 | os.makedirs(output_root1, exist_ok=True); os.makedirs(output_root2, exist_ok=True) 20 | 21 | return wav_path, output_root1, output_root2 
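# Illustrative usage sketch (hypothetical experiment directory), showing the three
# return shapes produced by the branches above: energy extraction, versioned
# feature extraction, and f0 extraction.
if __name__ == "__main__":
    example_exp_dir = os.path.join("assets", "logs", "example_experiment")
    sliced_16k, energy_dir = setup_paths(example_exp_dir, rms_extract=True)   # -> ("sliced_audios_16k", "energy")
    sliced_16k, feature_dir = setup_paths(example_exp_dir, version="v2")      # -> ("sliced_audios_16k", "v2_extracted")
    sliced_16k, f0_dir, f0_voiced_dir = setup_paths(example_exp_dir)          # -> ("sliced_audios_16k", "f0", "f0_voiced")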
--------------------------------------------------------------------------------
/advanced_rvc_inference/infer/train/extracting/setup_path.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | def setup_paths(exp_dir, version = None, rms_extract = False):
4 |     wav_path = os.path.join(exp_dir, "sliced_audios_16k")
5 | 
6 |     if rms_extract:
7 |         out_path = os.path.join(exp_dir, "energy")
8 |         os.makedirs(out_path, exist_ok=True)
9 | 
10 |         return wav_path, out_path
11 | 
12 |     if version:
13 |         out_path = os.path.join(exp_dir, f"{version}_extracted")
14 |         os.makedirs(out_path, exist_ok=True)
15 | 
16 |         return wav_path, out_path
17 |     else:
18 |         output_root1, output_root2 = os.path.join(exp_dir, "f0"), os.path.join(exp_dir, "f0_voiced")
19 |         os.makedirs(output_root1, exist_ok=True); os.makedirs(output_root2, exist_ok=True)
20 | 
21 |         return wav_path, output_root1, output_root2
--------------------------------------------------------------------------------
/advanced_rvc_inference/library/embedders/onnx.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import onnxruntime
3 | 
4 | class HubertModelONNX:
5 |     def __init__(self, embedder_model_path, providers, device):
6 |         sess_options = onnxruntime.SessionOptions()
7 |         sess_options.log_severity_level = 3
8 |         self.model = onnxruntime.InferenceSession(embedder_model_path, sess_options=sess_options, providers=providers)
9 |         self.final_proj = self._final_proj
10 |         self.device = device
11 | 
12 |     def _final_proj(self, source):
13 |         return source
14 | 
15 |     def extract_features(self, source, padding_mask = None, output_layer = None):
16 |         logits = self.model.run([self.model.get_outputs()[0].name, self.model.get_outputs()[1].name], {"feats": source.detach().cpu().numpy()})
17 |         return [torch.as_tensor(logits[int(output_layer != 9)], dtype=torch.float32, device=self.device)]
--------------------------------------------------------------------------------
/advanced_rvc_inference/run_tensorboard.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import time
4 | import logging
5 | import warnings
6 | import webbrowser
7 | 
8 | from tensorboard import program
9 | 
10 | sys.path.append(os.getcwd())
11 | 
12 | from advanced_rvc_inference.variables import config, translations, logger
13 | 
14 | def launch_tensorboard():
15 |     warnings.filterwarnings("ignore")
16 |     for l in ["root", "tensorboard"]:
17 |         logging.getLogger(l).setLevel(logging.ERROR)
18 | 
19 |     tb = program.TensorBoard()
20 |     tb.configure(argv=[None, "--logdir", config.configs["logs_path"], f"--port={config.configs['tensorboard_port']}"])
21 |     url = tb.launch()
22 | 
23 |     logger.info(f"{translations['tensorboard_url']}: {url}")
24 |     if "--open" in sys.argv: webbrowser.open(url)
25 | 
26 |     return f"{translations['tensorboard_url']}: {url}"
27 | 
28 | if __name__ == "__main__":
29 |     launch_tensorboard()
30 | 
31 |     while 1:
32 |         time.sleep(5)
--------------------------------------------------------------------------------
/advanced_rvc_inference/library/predictors/PENN/fcn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | 
3 | class FCN(torch.nn.Sequential):
4 |     def __init__(self, channels = 256, pitch_bins = 1440, pooling = (2, 2)):
5 |         super().__init__(*(Block(1, channels, 481, pooling), Block(channels, channels // 8, 225, pooling), Block(channels // 8, channels // 8, 97, 
pooling), Block(channels // 8, channels // 2, 66), Block(channels // 2, channels, 35), Block(channels, channels * 2, 4), torch.nn.Conv1d(channels * 2, pitch_bins, 4))) 6 | 7 | def forward(self, frames): 8 | return super().forward(frames[:, :, 16:-15]) 9 | 10 | class Block(torch.nn.Sequential): 11 | def __init__(self, in_channels, out_channels, length=1, pooling=None, kernel_size=32): 12 | layers = (torch.nn.Conv1d(in_channels, out_channels, kernel_size), torch.nn.ReLU()) 13 | if pooling is not None: layers += (torch.nn.MaxPool1d(*pooling),) 14 | layers += (torch.nn.LayerNorm((out_channels, length)),) 15 | super().__init__(*layers) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 ArkanDash 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /advanced_rvc_inference/tools/huggingface.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tqdm 3 | import requests 4 | 5 | try: 6 | import wget 7 | except: 8 | wget = None 9 | 10 | def HF_download_file(url, output_path=None): 11 | url = url.replace("/blob/", "/resolve/").replace("?download=true", "").strip() 12 | output_path = os.path.basename(url) if output_path is None else (os.path.join(output_path, os.path.basename(url)) if os.path.isdir(output_path) else output_path) 13 | 14 | if wget != None: wget.download(url, out=output_path) 15 | else: 16 | response = requests.get(url, stream=True, timeout=300) 17 | 18 | if response.status_code == 200: 19 | progress_bar = tqdm.tqdm(total=int(response.headers.get("content-length", 0)), desc=os.path.basename(url), ncols=100, unit="byte", leave=False) 20 | 21 | with open(output_path, "wb") as f: 22 | for chunk in response.iter_content(chunk_size=10 * 1024 * 1024): 23 | progress_bar.update(len(chunk)) 24 | f.write(chunk) 25 | 26 | progress_bar.close() 27 | else: raise ValueError(response.status_code) 28 | 29 | return output_path -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/training/training.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import gradio as gr 5 | 6 | sys.path.append(os.getcwd()) 7 | 8 | from advanced_rvc_inference.variables import translations, configs 9 | from advanced_rvc_inference.tabs.training.child.training import training_model_tab 10 | from advanced_rvc_inference.tabs.training.child.create_dataset import create_dataset_tab 11 | from advanced_rvc_inference.tabs.training.child.create_reference import create_reference_tab 12 | 13 | def training_tab(): 14 | with gr.TabItem(translations["training_model"], visible=configs.get("create_and_training_tab", True)): 15 | with gr.TabItem(translations["createdataset"], visible=configs.get("create_dataset_tab", True)): 16 | gr.Markdown(translations["create_dataset_markdown"]) 17 | create_dataset_tab() 18 | 19 | with gr.TabItem(translations["create_reference"], visible=configs.get("create_reference_tab", True)): 20 | gr.Markdown(translations["create_reference_markdown"]) 21 | create_reference_tab() 22 | 23 | with gr.TabItem(translations["training_model"], visible=configs.get("training_tab", True)): 24 | gr.Markdown(f"## {translations['training_model']}") 25 | training_model_tab() -------------------------------------------------------------------------------- /advanced_rvc_inference/library/backends/directml.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gc 3 | import sys 4 | import torch 5 | import subprocess 6 | 7 | sys.path.append(os.getcwd()) 8 | 9 | from advanced_rvc_inference.library.embedders import fairseq 10 | from advanced_rvc_inference.library.backends.utils import GRU 11 | 12 | try: 13 | import torch_directml 14 | except: 15 | torch_directml = None 16 | 17 | torch_available = torch_directml != None 18 | 19 | def device_count(): 20 | return torch_directml.device_count() if torch_available else 0 21 | 22 | def device_name(device_id = 0): 23 | return torch_directml.device_name(device_id) if torch_available else "" 24 | 25 | def is_available(): 26 | return torch_directml.is_available() if torch_available else False 27 | 28 | def empty_cache(): 
29 | empty_cache_path = os.path.join("main", "library", "backends", "dml_empty_cache", "empty_cache.exe") 30 | 31 | if torch_available and os.path.exists(empty_cache_path): 32 | subprocess.run([empty_cache_path], capture_output=True, text=True) 33 | gc.collect() 34 | 35 | def forward_dml(ctx, x, scale): 36 | ctx.scale = scale 37 | res = x.clone().detach() 38 | return res 39 | 40 | if torch_available: 41 | torch.nn.GRU = GRU 42 | fairseq.GradMultiply.forward = forward_dml -------------------------------------------------------------------------------- /advanced_rvc_inference/configs/v2/32000.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "learning_rate": 0.0001, 6 | "betas": [0.8, 0.99], 7 | "eps": 1e-09, 8 | "lr_decay": 0.999875, 9 | "segment_size": 12800, 10 | "c_mel": 45, 11 | "c_kl": 1.0 12 | }, 13 | "data": { 14 | "max_wav_value": 32768.0, 15 | "sample_rate": 32000, 16 | "filter_length": 1024, 17 | "hop_length": 320, 18 | "win_length": 1024, 19 | "n_mel_channels": 80, 20 | "mel_fmin": 0.0, 21 | "mel_fmax": null 22 | }, 23 | "model": { 24 | "inter_channels": 192, 25 | "hidden_channels": 192, 26 | "filter_channels": 768, 27 | "text_enc_hidden_dim": 768, 28 | "n_heads": 2, 29 | "n_layers": 6, 30 | "kernel_size": 3, 31 | "p_dropout": 0, 32 | "resblock": "1", 33 | "resblock_kernel_sizes": [3, 7, 11], 34 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 35 | "upsample_rates": [10, 8, 2, 2], 36 | "upsample_initial_channel": 512, 37 | "upsample_kernel_sizes": [20, 16, 4, 4], 38 | "use_spectral_norm": false, 39 | "gin_channels": 256, 40 | "spk_embed_dim": 109 41 | } 42 | } -------------------------------------------------------------------------------- /advanced_rvc_inference/configs/v2/40000.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "learning_rate": 0.0001, 6 | "betas": [0.8, 0.99], 7 | "eps": 1e-09, 8 | "lr_decay": 0.999875, 9 | "segment_size": 12800, 10 | "c_mel": 45, 11 | "c_kl": 1.0 12 | }, 13 | "data": { 14 | "max_wav_value": 32768.0, 15 | "sample_rate": 40000, 16 | "filter_length": 2048, 17 | "hop_length": 400, 18 | "win_length": 2048, 19 | "n_mel_channels": 125, 20 | "mel_fmin": 0.0, 21 | "mel_fmax": null 22 | }, 23 | "model": { 24 | "inter_channels": 192, 25 | "hidden_channels": 192, 26 | "filter_channels": 768, 27 | "text_enc_hidden_dim": 768, 28 | "n_heads": 2, 29 | "n_layers": 6, 30 | "kernel_size": 3, 31 | "p_dropout": 0, 32 | "resblock": "1", 33 | "resblock_kernel_sizes": [3, 7, 11], 34 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 35 | "upsample_rates": [10, 10, 2, 2], 36 | "upsample_initial_channel": 512, 37 | "upsample_kernel_sizes": [16, 16, 4, 4], 38 | "use_spectral_norm": false, 39 | "gin_channels": 256, 40 | "spk_embed_dim": 109 41 | } 42 | } -------------------------------------------------------------------------------- /advanced_rvc_inference/configs/v2/48000.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "learning_rate": 0.0001, 6 | "betas": [0.8, 0.99], 7 | "eps": 1e-09, 8 | "lr_decay": 0.999875, 9 | "segment_size": 17280, 10 | "c_mel": 45, 11 | "c_kl": 1.0 12 | }, 13 | "data": { 14 | "max_wav_value": 32768.0, 15 | "sample_rate": 48000, 16 | "filter_length": 2048, 17 | "hop_length": 480, 18 | "win_length": 2048, 19 
| "n_mel_channels": 128, 20 | "mel_fmin": 0.0, 21 | "mel_fmax": null 22 | }, 23 | "model": { 24 | "inter_channels": 192, 25 | "hidden_channels": 192, 26 | "filter_channels": 768, 27 | "text_enc_hidden_dim": 768, 28 | "n_heads": 2, 29 | "n_layers": 6, 30 | "kernel_size": 3, 31 | "p_dropout": 0, 32 | "resblock": "1", 33 | "resblock_kernel_sizes": [3, 7, 11], 34 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 35 | "upsample_rates": [12, 10, 2, 2], 36 | "upsample_initial_channel": 512, 37 | "upsample_kernel_sizes": [24, 20, 4, 4], 38 | "use_spectral_norm": false, 39 | "gin_channels": 256, 40 | "spk_embed_dim": 109 41 | } 42 | } -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/extra/child/read_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import gradio as gr 5 | 6 | sys.path.append(os.getcwd()) 7 | 8 | from advanced_rvc_inference.core.ui import shutil_move 9 | from advanced_rvc_inference.core.model_utils import model_info 10 | from advanced_rvc_inference.variables import translations, configs 11 | 12 | def read_model_tab(): 13 | with gr.Row(): 14 | gr.Markdown(translations["read_model_markdown_2"]) 15 | with gr.Row(): 16 | model = gr.File(label=translations["drop_model"], file_types=[".pth", ".onnx"]) 17 | with gr.Row(): 18 | read_button = gr.Button(translations["readmodel"], variant="primary", scale=2) 19 | with gr.Column(): 20 | model_path = gr.Textbox(label=translations["model_path"], value="", placeholder="assets/weights/Model.pth", info=translations["model_path_info"], interactive=True) 21 | output_info = gr.Textbox(label=translations["modelinfo"], value="", interactive=False, scale=6) 22 | with gr.Row(): 23 | model.upload(fn=lambda model: shutil_move(model.name, configs["weights_path"]), inputs=[model], outputs=[model_path]) 24 | read_button.click( 25 | fn=model_info, 26 | inputs=[model_path], 27 | outputs=[output_info], 28 | api_name="read_model" 29 | ) -------------------------------------------------------------------------------- /advanced_rvc_inference/library/embedders/ppg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | sys.path.append(os.getcwd()) 6 | 7 | from advanced_rvc_inference.library.speaker_diarization.whisper import Whisper, ModelDimensions, log_mel_spectrogram, pad_or_trim 8 | 9 | class WhisperModel(torch.nn.Module): 10 | def __init__(self, model_path, device): 11 | super().__init__() 12 | checkpoint = torch.load(model_path, map_location="cpu") 13 | dims = ModelDimensions(**checkpoint["dims"]) 14 | self.final_proj = torch.nn.Linear(dims.n_text_state, 768) 15 | self.model = Whisper(dims) 16 | self.model.load_state_dict(checkpoint["model_state_dict"]) 17 | self.model = self.model.to(device) 18 | del self.model.decoder 19 | 20 | def forward(self, audio): 21 | ppgln = audio.shape[1] // 320 22 | mel = log_mel_spectrogram(pad_or_trim(audio[0])).to(audio.device) 23 | 24 | with torch.no_grad(): 25 | ppg_raw = self.model.encoder(mel.unsqueeze(0)) 26 | ppg_projected = self.final_proj(ppg_raw) 27 | ppg = ppg_projected.data.float() 28 | ppg = ppg[:, :ppgln, :] 29 | 30 | return [ppg] 31 | 32 | def extract_features(self, source, padding_mask = None, output_layer = None): 33 | return self.forward(source) -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/zluda/path-zluda-hip62.bat: 
-------------------------------------------------------------------------------- 1 | rmdir /S /q zluda 2 | curl -s -L https://github.com/lshqqytiger/ZLUDA/releases/download/rel.5e717459179dc272b7d7d23391f0fad66c7459cf/ZLUDA-windows-rocm6-amd64.zip > zluda.zip 3 | tar -xf zluda.zip 4 | del zluda.zip 5 | 6 | if exist "runtime" ( 7 | copy runtime\Lib\site-packages\torch\lib\nvrtc64_112_0.dll runtime\Lib\site-packages\torch\lib\nvrtc_cuda.dll /y 8 | copy zluda\cublas.dll runtime\Lib\site-packages\torch\lib\cublas64_11.dll /y 9 | copy zluda\cusparse.dll runtime\Lib\site-packages\torch\lib\cusparse64_11.dll /y 10 | copy zluda\nvrtc.dll runtime\Lib\site-packages\torch\lib\nvrtc64_112_0.dll /y 11 | copy zluda\cufft.dll runtime\Lib\site-packages\torch\lib\cufft64_10.dll /y 12 | copy zluda\cufftw.dll runtime\Lib\site-packages\torch\lib\cufftw64_10.dll /y 13 | ) else ( 14 | copy env\Lib\site-packages\torch\lib\nvrtc64_112_0.dll env\Lib\site-packages\torch\lib\nvrtc_cuda.dll /y 15 | copy zluda\cublas.dll env\Lib\site-packages\torch\lib\cublas64_11.dll /y 16 | copy zluda\cusparse.dll env\Lib\site-packages\torch\lib\cusparse64_11.dll /y 17 | copy zluda\nvrtc.dll env\Lib\site-packages\torch\lib\nvrtc64_112_0.dll /y 18 | copy zluda\cufft.dll env\Lib\site-packages\torch\lib\cufft64_10.dll /y 19 | copy zluda\cufftw.dll env\Lib\site-packages\torch\lib\cufftw64_10.dll /y 20 | ) 21 | pause -------------------------------------------------------------------------------- /advanced_rvc_inference/library/uvr5_lib/vr_network/model_param_init.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pickle 3 | 4 | default_param = {} 5 | default_param["bins"] = -1 6 | default_param["unstable_bins"] = -1 7 | default_param["stable_bins"] = -1 8 | default_param["sr"] = 44100 9 | default_param["pre_filter_start"] = -1 10 | default_param["pre_filter_stop"] = -1 11 | default_param["band"] = {} 12 | 13 | N_BINS = "n_bins" 14 | 15 | def int_keys(pairs): 16 | result_dict = {} 17 | 18 | for key, value in pairs: 19 | if isinstance(key, str) and key.isdigit(): key = int(key) 20 | result_dict[key] = value 21 | 22 | return result_dict 23 | 24 | class ModelParameters(object): 25 | def __init__(self, config_path="", key_in_bin=None): 26 | if config_path.endswith(".bin"): 27 | with open(config_path, "rb") as f: 28 | data = pickle.load(f) 29 | self.param = data[key_in_bin] 30 | else: 31 | with open(config_path, "r", encoding="utf-8") as f: 32 | self.param = json.loads(f.read(), object_pairs_hook=int_keys) 33 | 34 | for k in ["mid_side", "mid_side_b", "mid_side_b2", "stereo_w", "stereo_n", "reverse"]: 35 | if k not in self.param: 36 | self.param[k] = False 37 | 38 | if N_BINS in self.param: 39 | self.param["bins"] = self.param[N_BINS] -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/train/training/losses.py: -------------------------------------------------------------------------------- 1 | def feature_loss(fmap_r, fmap_g): 2 | loss = 0 3 | for dr, dg in zip(fmap_r, fmap_g): 4 | for rl, gl in zip(dr, dg): 5 | loss += (rl.float().detach() - gl.float()).abs().mean() 6 | 7 | return loss * 2 8 | 9 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 10 | loss = 0 11 | r_losses, g_losses = [], [] 12 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 13 | dr = dr.float() 14 | dg = dg.float() 15 | r_loss = ((1 - dr) ** 2).mean() 16 | g_loss = (dg**2).mean() 17 | loss += r_loss + g_loss 18 | 
r_losses.append(r_loss.item()) 19 | g_losses.append(g_loss.item()) 20 | 21 | return loss, r_losses, g_losses 22 | 23 | def generator_loss(disc_outputs): 24 | loss = 0 25 | gen_losses = [] 26 | for dg in disc_outputs: 27 | l = ((1 - dg.float()) ** 2).mean() 28 | gen_losses.append(l) 29 | loss += l 30 | 31 | return loss, gen_losses 32 | 33 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 34 | z_p = z_p.float() 35 | logs_q = logs_q.float() 36 | m_p = m_p.float() 37 | logs_p = logs_p.float() 38 | z_mask = z_mask.float() 39 | 40 | kl = logs_p - logs_q - 0.5 41 | kl += 0.5 * ((z_p - m_p) ** 2) * (-2.0 * logs_p).exp() 42 | 43 | return (kl * z_mask).sum() / z_mask.sum() -------------------------------------------------------------------------------- /advanced_rvc_inference/tools/mediafire.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import requests 4 | 5 | from bs4 import BeautifulSoup 6 | 7 | def Mediafire_Download(url, output=None, filename=None): 8 | if not filename: filename = url.split('/')[-2] 9 | if not output: output = os.path.dirname(os.path.realpath(__file__)) 10 | output_file = os.path.join(output, filename) 11 | 12 | sess = requests.session() 13 | sess.headers.update({"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6)"}) 14 | 15 | try: 16 | with requests.get(BeautifulSoup(sess.get(url).content, "html.parser").find(id="downloadButton").get("href"), stream=True) as r: 17 | r.raise_for_status() 18 | 19 | with open(output_file, "wb") as f: 20 | total_length = int(r.headers.get('content-length')) 21 | download_progress = 0 22 | 23 | for chunk in r.iter_content(chunk_size=1024): 24 | download_progress += len(chunk) 25 | f.write(chunk) 26 | 27 | sys.stdout.write(f"\r[{filename}]: {int(100 * download_progress / total_length)}% ({round(download_progress / 1024 / 1024, 2)}mb/{round(total_length / 1024 / 1024, 2)}mb)") 28 | sys.stdout.flush() 29 | 30 | sys.stdout.write("\n") 31 | return output_file 32 | except Exception as e: 33 | raise RuntimeError(e) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Platform-specific requirements section 2 | pip>=23.3 3 | wheel 4 | omegaconf>=2.0.6 5 | onnxruntime; sys_platform == 'darwin' 6 | onnxruntime-gpu; sys_platform != 'darwin' 7 | 8 | # Core dependencies 9 | PyYAML>=6.0 10 | tiktoken 11 | hyperpyyaml 12 | torch>=2.3.1 13 | tqdm>=4.63.1 14 | sortedcontainers 15 | torchvision>=0.18.1 16 | torchaudio>=2.3.1 17 | torchcodec>=0.8.1 18 | 19 | faiss-cpu==1.7.3; python_version < "3.12" 20 | faiss-cpu>=1.7.3; python_version >= "3.12" 21 | 22 | # Machine learning, NLP and deep learning 23 | transformers>=4.49.0 24 | scikit-learn 25 | einops>=0.8.0 26 | 27 | # Pitch and sound processing 28 | librosa>=0.10.2 29 | pydub>=0.25.1 30 | praat-parselmouth 31 | soundfile>=0.13.0 32 | pedalboard 33 | 34 | # Data processing and calculation 35 | numpy>=1.25.2,<2.0.0 36 | numba>=0.57.0 37 | scipy>=1.15.0 38 | matplotlib>=3.7.2 39 | 40 | # Implementation and web framework 41 | gradio>=5.23.3,<6.0.0 42 | requests>=2.32.3 43 | aiohttp 44 | pysrt 45 | 46 | # Utility section 47 | yt-dlp 48 | edge-tts>=7.2.0 49 | ffmpy==0.3.1 50 | ffmpeg-python>=0.2.0 51 | beautifulsoup4 52 | 53 | # Tensorboard and ONNX 54 | tensorboard 55 | onnx>=1.14 56 | onnxslim 57 | onnx2torch>=1.5.15 58 | 59 | # Cryptography section 60 | pycryptodome>=3.9.6,<4.0.0 61 | 62 | # Realtime and 
VAD 63 | sounddevice>=0.5.2 64 | webrtcvad-wheels>=2.0.14 65 | -------------------------------------------------------------------------------- /advanced_rvc_inference/configs/v1/32000.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 0.0001, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-09, 9 | "batch_size": 4, 10 | "lr_decay": 0.999875, 11 | "segment_size": 12800, 12 | "init_lr_ratio": 1, 13 | "warmup_epochs": 0, 14 | "c_mel": 45, 15 | "c_kl": 1.0 16 | }, 17 | "data": { 18 | "max_wav_value": 32768.0, 19 | "sample_rate": 32000, 20 | "filter_length": 1024, 21 | "hop_length": 320, 22 | "win_length": 1024, 23 | "n_mel_channels": 80, 24 | "mel_fmin": 0.0, 25 | "mel_fmax": null 26 | }, 27 | "model": { 28 | "inter_channels": 192, 29 | "hidden_channels": 192, 30 | "filter_channels": 768, 31 | "text_enc_hidden_dim": 256, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3, 7, 11], 38 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 39 | "upsample_rates": [10, 4, 2, 2, 2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16, 16, 4, 4, 4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } -------------------------------------------------------------------------------- /advanced_rvc_inference/configs/v1/40000.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 0.0001, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-09, 9 | "batch_size": 4, 10 | "lr_decay": 0.999875, 11 | "segment_size": 12800, 12 | "init_lr_ratio": 1, 13 | "warmup_epochs": 0, 14 | "c_mel": 45, 15 | "c_kl": 1.0 16 | }, 17 | "data": { 18 | "max_wav_value": 32768.0, 19 | "sample_rate": 40000, 20 | "filter_length": 2048, 21 | "hop_length": 400, 22 | "win_length": 2048, 23 | "n_mel_channels": 125, 24 | "mel_fmin": 0.0, 25 | "mel_fmax": null 26 | }, 27 | "model": { 28 | "inter_channels": 192, 29 | "hidden_channels": 192, 30 | "filter_channels": 768, 31 | "text_enc_hidden_dim": 256, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3, 7, 11], 38 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 39 | "upsample_rates": [10, 10, 2, 2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16, 16, 4, 4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } -------------------------------------------------------------------------------- /advanced_rvc_inference/configs/v1/48000.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 0.0001, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-09, 9 | "batch_size": 4, 10 | "lr_decay": 0.999875, 11 | "segment_size": 11520, 12 | "init_lr_ratio": 1, 13 | "warmup_epochs": 0, 14 | "c_mel": 45, 15 | "c_kl": 1.0 16 | }, 17 | "data": { 18 | "max_wav_value": 32768.0, 19 | "sample_rate": 48000, 20 | "filter_length": 2048, 21 | "hop_length": 480, 22 | "win_length": 2048, 23 | "n_mel_channels": 128, 24 | "mel_fmin": 0.0, 25 | "mel_fmax": null 26 | }, 27 | "model": { 28 | "inter_channels": 192, 29 
| "hidden_channels": 192, 30 | "filter_channels": 768, 31 | "text_enc_hidden_dim": 256, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3, 7, 11], 38 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 39 | "upsample_rates": [10, 6, 2, 2, 2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16, 16, 4, 4, 4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/RMVPE/e2e.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | import torch.nn as nn 6 | 7 | sys.path.append(os.getcwd()) 8 | 9 | from advanced_rvc_inference.library.predictors.RMVPE.deepunet import DeepUnet 10 | 11 | N_MELS, N_CLASS = 128, 360 12 | 13 | class BiGRU(nn.Module): 14 | def __init__(self, input_features, hidden_features, num_layers): 15 | super(BiGRU, self).__init__() 16 | self.gru = nn.GRU(input_features, hidden_features, num_layers=num_layers, batch_first=True, bidirectional=True) 17 | 18 | def forward(self, x): 19 | try: 20 | return self.gru(x)[0] 21 | except: 22 | torch.backends.cudnn.enabled = False 23 | return self.gru(x)[0] 24 | 25 | class E2E(nn.Module): 26 | def __init__(self, n_blocks, n_gru, kernel_size, en_de_layers=5, inter_layers=4, in_channels=1, en_out_channels=16): 27 | super(E2E, self).__init__() 28 | self.unet = DeepUnet(kernel_size, n_blocks, en_de_layers, inter_layers, in_channels, en_out_channels) 29 | self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) 30 | self.fc = nn.Sequential(BiGRU(3 * 128, 256, n_gru), nn.Linear(512, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()) if n_gru else nn.Sequential(nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()) 31 | 32 | def forward(self, mel): 33 | return self.fc(self.cnn(self.unet(mel.transpose(-1, -2).unsqueeze(1))).transpose(1, 2).flatten(-2)) -------------------------------------------------------------------------------- /advanced_rvc_inference/core/restart.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import platform 5 | import subprocess 6 | 7 | sys.path.append(os.getcwd()) 8 | 9 | from advanced_rvc_inference.core.ui import gr_info 10 | from advanced_rvc_inference.variables import python, translations, configs_json 11 | 12 | def restart_app(app): 13 | gr_info(translations["30s"]) 14 | os.system("cls" if platform.system() == "Windows" else "clear") 15 | 16 | app.close() 17 | subprocess.run([python, os.path.join("advanced_rvc_inference", "app.py")] + [arg for arg in sys.argv[1:] if arg != "--open"]) 18 | 19 | def change_language(lang, app): 20 | configs = json.load(open(configs_json, "r")) 21 | 22 | if lang != configs["language"]: 23 | configs["language"] = lang 24 | 25 | with open(configs_json, "w") as f: 26 | json.dump(configs, f, indent=4) 27 | 28 | restart_app(app) 29 | 30 | def change_theme(theme, app): 31 | configs = json.load(open(configs_json, "r")) 32 | 33 | if theme != configs["theme"]: 34 | configs["theme"] = theme 35 | with open(configs_json, "w") as f: 36 | json.dump(configs, f, indent=4) 37 | 38 | restart_app(app) 39 | 40 | def change_font(font, app): 41 | configs = json.load(open(configs_json, "r")) 42 | 43 | if font != configs["font"]: 44 | configs["font"] = font 45 
| with open(configs_json, "w") as f: 46 | json.dump(configs, f, indent=4) 47 | 48 | restart_app(app) -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/inference/inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import gradio as gr 5 | 6 | sys.path.append(os.getcwd()) 7 | 8 | from advanced_rvc_inference.variables import translations, configs 9 | from advanced_rvc_inference.tabs.inference.child.convert import convert_tab 10 | from advanced_rvc_inference.tabs.inference.child.separate import separate_tab 11 | from advanced_rvc_inference.tabs.inference.child.convert_tts import convert_tts_tab 12 | from advanced_rvc_inference.tabs.inference.child.convert_with_whisper import convert_with_whisper_tab 13 | 14 | def inference_tab(): 15 | with gr.TabItem(translations["inference"], visible=configs.get("inference_tab", True)): 16 | with gr.TabItem(translations["convert_audio"], visible=configs.get("convert_tab", True)): 17 | gr.Markdown(f"## {translations['convert_audio']}") 18 | convert_tab() 19 | with gr.TabItem(translations["separator_tab"], visible=configs.get("separator_tab", True)): 20 | gr.Markdown(f"## {translations['separator_tab']}") 21 | separate_tab() 22 | 23 | with gr.TabItem(translations["convert_with_whisper"], visible=configs.get("convert_with_whisper", True)): 24 | gr.Markdown(f"## {translations['convert_with_whisper']}") 25 | convert_with_whisper_tab() 26 | 27 | with gr.TabItem(translations["convert_text"], visible=configs.get("tts_tab", True)): 28 | gr.Markdown(translations["convert_text_markdown"]) 29 | convert_tts_tab() 30 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/DJCM/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import torch.nn as nn 5 | 6 | sys.path.append(os.getcwd()) 7 | 8 | from advanced_rvc_inference.library.predictors.DJCM.decoder import PE_Decoder 9 | from advanced_rvc_inference.library.predictors.DJCM.utils import init_bn, WINDOW_LENGTH 10 | from advanced_rvc_inference.library.predictors.DJCM.encoder import ResEncoderBlock, Encoder 11 | 12 | class LatentBlocks(nn.Module): 13 | def __init__(self, n_blocks, latent_layers): 14 | super(LatentBlocks, self).__init__() 15 | self.latent_blocks = nn.ModuleList([ 16 | ResEncoderBlock(384, 384, n_blocks, None) 17 | for _ in range(latent_layers) 18 | ]) 19 | 20 | def forward(self, x): 21 | for layer in self.latent_blocks: 22 | x = layer(x) 23 | 24 | return x 25 | 26 | class DJCMM(nn.Module): 27 | def __init__(self, in_channels, n_blocks, latent_layers): 28 | super(DJCMM, self).__init__() 29 | self.bn = nn.BatchNorm2d(WINDOW_LENGTH // 2 + 1, momentum=0.01) 30 | self.pe_encoder = Encoder(in_channels, n_blocks) 31 | self.pe_latent = LatentBlocks(n_blocks, latent_layers) 32 | self.pe_decoder = PE_Decoder(n_blocks) 33 | init_bn(self.bn) 34 | 35 | def forward(self, spec): 36 | x = self.bn(spec.transpose(1, 3)).transpose(1, 3)[..., :-1] 37 | x, concat_tensors = self.pe_encoder(x) 38 | pe_out = self.pe_decoder(self.pe_latent(x), concat_tensors) 39 | 40 | return pe_out -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/extra/child/convert_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import gradio as 
gr 5 | 6 | sys.path.append(os.getcwd()) 7 | 8 | from advanced_rvc_inference.core.ui import visible, shutil_move 9 | from advanced_rvc_inference.core.model_utils import onnx_export 10 | from advanced_rvc_inference.variables import translations, configs 11 | 12 | def convert_model_tab(): 13 | with gr.Row(): 14 | gr.Markdown(translations["pytorch2onnx_markdown"]) 15 | with gr.Row(): 16 | model_pth_upload = gr.File(label=translations["drop_model"], file_types=[".pth"]) 17 | with gr.Row(): 18 | convert_onnx = gr.Button(translations["convert_model"], variant="primary", scale=2) 19 | with gr.Row(): 20 | model_pth_path = gr.Textbox(label=translations["model_path"], value="", placeholder="assets/weights/Model.pth", info=translations["model_path_info"], interactive=True) 21 | with gr.Row(): 22 | output_model2 = gr.File(label=translations["output_model_path"], file_types=[".pth", ".onnx"], interactive=False, visible=False) 23 | with gr.Row(): 24 | model_pth_upload.upload(fn=lambda model_pth_upload: shutil_move(model_pth_upload.name, configs["weights_path"]), inputs=[model_pth_upload], outputs=[model_pth_path]) 25 | convert_onnx.click( 26 | fn=onnx_export, 27 | inputs=[model_pth_path], 28 | outputs=[output_model2], 29 | api_name="model_onnx_export" 30 | ) 31 | convert_onnx.click(fn=lambda: visible(True), inputs=[], outputs=[output_model2]) -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/DJCM/spec.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | import numpy as np 6 | import torch.nn as nn 7 | 8 | sys.path.append(os.getcwd()) 9 | 10 | class Spectrogram(nn.Module): 11 | def __init__(self, hop_length, win_length, n_fft=None, clamp=1e-10): 12 | super(Spectrogram, self).__init__() 13 | self.n_fft = win_length if n_fft is None else n_fft 14 | self.hop_length = hop_length 15 | self.win_length = win_length 16 | self.clamp = clamp 17 | self.register_buffer("window", torch.hann_window(win_length), persistent=False) 18 | 19 | def forward(self, audio, center=True): 20 | bs, c, segment_samples = audio.shape 21 | audio = audio.reshape(bs * c, segment_samples) 22 | 23 | if str(audio.device).startswith(("ocl", "privateuseone")): 24 | if not hasattr(self, "stft"): 25 | from main.library.backends.utils import STFT 26 | self.stft = STFT(filter_length=self.n_fft, hop_length=self.hop_length, win_length=self.win_length).to(audio.device) 27 | magnitude = self.stft.transform(audio, 1e-9) 28 | else: 29 | fft = torch.stft(audio, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, window=self.window, center=center, pad_mode="reflect", return_complex=True) 30 | magnitude = (fft.real.pow(2) + fft.imag.pow(2)).sqrt() 31 | 32 | mag = magnitude.transpose(1, 2).clamp(self.clamp, np.inf) 33 | mag = mag.reshape(bs, c, mag.shape[1], mag.shape[2]) 34 | 35 | return mag -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/realtime/vad_utils.py: -------------------------------------------------------------------------------- 1 | import webrtcvad 2 | 3 | import numpy as np 4 | 5 | class VADProcessor: 6 | def __init__(self, sensitivity_mode=3, sample_rate=16000, frame_duration_ms=30): 7 | if sample_rate not in [8000, 16000]: raise ValueError 8 | if frame_duration_ms not in [10, 20, 30]: raise ValueError 9 | 10 | self.vad = webrtcvad.Vad(sensitivity_mode) 11 | self.sample_rate = sample_rate 12 | 
self.frame_length = int(sample_rate * (frame_duration_ms / 1000.0)) 13 | 14 | def is_speech(self, audio_chunk): 15 | if audio_chunk.ndim > 1 and audio_chunk.shape[1] == 1: audio_chunk = audio_chunk.flatten() 16 | elif audio_chunk.ndim > 1: audio_chunk = np.mean(audio_chunk, axis=1) 17 | 18 | if np.max(np.abs(audio_chunk)) > 1.0: audio_chunk = np.clip(audio_chunk, -1.0, 1.0) 19 | 20 | audio_chunk = (audio_chunk * 32767).astype(np.int16) 21 | num_frames = len(audio_chunk) // self.frame_length 22 | 23 | if num_frames == 0 and len(audio_chunk) > 0: 24 | audio_chunk = np.concatenate((audio_chunk, np.zeros(self.frame_length - len(audio_chunk), dtype=np.int16))) 25 | num_frames = 1 26 | elif num_frames == 0 and len(audio_chunk) == 0: return False 27 | 28 | try: 29 | for i in range(num_frames): 30 | start = i * self.frame_length 31 | if self.vad.is_speech(audio_chunk[start:start + self.frame_length].tobytes(), self.sample_rate): return True 32 | 33 | return False 34 | except Exception: 35 | return False -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/CREPE/filter.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def mean(signals, win_length=9): 4 | assert signals.dim() == 2 5 | 6 | signals = signals.unsqueeze(1) 7 | mask = ~torch.isnan(signals) 8 | padding = win_length // 2 9 | 10 | ones_kernel = torch.ones(signals.size(1), 1, win_length, device=signals.device) 11 | avg_pooled = torch.nn.functional.conv1d(torch.where(mask, signals, torch.zeros_like(signals)), ones_kernel, stride=1, padding=padding) / torch.nn.functional.conv1d(mask.float(), ones_kernel, stride=1, padding=padding).clamp(min=1) 12 | avg_pooled[avg_pooled == 0] = float("nan") 13 | 14 | return avg_pooled.squeeze(1) 15 | 16 | def median(signals, win_length): 17 | assert signals.dim() == 2 18 | 19 | signals = signals.unsqueeze(1) 20 | mask = ~torch.isnan(signals) 21 | padding = win_length // 2 22 | 23 | x = torch.nn.functional.pad(torch.where(mask, signals, torch.zeros_like(signals)), (padding, padding), mode="reflect") 24 | mask = torch.nn.functional.pad(mask.float(), (padding, padding), mode="constant", value=0) 25 | 26 | x = x.unfold(2, win_length, 1) 27 | mask = mask.unfold(2, win_length, 1) 28 | 29 | x = x.contiguous().view(x.size()[:3] + (-1,)) 30 | mask = mask.contiguous().view(mask.size()[:3] + (-1,)) 31 | 32 | x_sorted, _ = torch.where(mask.bool(), x.float(), float("inf")).to(x).sort(dim=-1) 33 | 34 | median_pooled = x_sorted.gather(-1, ((mask.sum(dim=-1) - 1) // 2).clamp(min=0).unsqueeze(-1).long()).squeeze(-1) 35 | median_pooled[torch.isinf(median_pooled)] = float("nan") 36 | 37 | return median_pooled.squeeze(1) -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/DJCM/encoder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import torch.nn as nn 5 | 6 | sys.path.append(os.getcwd()) 7 | 8 | from advanced_rvc_inference.library.predictors.DJCM.utils import ResConvBlock 9 | 10 | class ResEncoderBlock(nn.Module): 11 | def __init__(self, in_channels, out_channels, n_blocks, kernel_size): 12 | super(ResEncoderBlock, self).__init__() 13 | self.conv = nn.ModuleList([ResConvBlock(in_channels, out_channels)]) 14 | for _ in range(n_blocks - 1): 15 | self.conv.append(ResConvBlock(out_channels, out_channels)) 16 | 17 | self.pool = nn.MaxPool2d(kernel_size) if 
kernel_size is not None else None 18 | 19 | def forward(self, x): 20 | for each_layer in self.conv: 21 | x = each_layer(x) 22 | 23 | if self.pool is not None: return x, self.pool(x) 24 | return x 25 | 26 | class Encoder(nn.Module): 27 | def __init__(self, in_channels, n_blocks): 28 | super(Encoder, self).__init__() 29 | self.en_blocks = nn.ModuleList([ 30 | ResEncoderBlock(in_channels, 32, n_blocks, (1, 2)), 31 | ResEncoderBlock(32, 64, n_blocks, (1, 2)), 32 | ResEncoderBlock(64, 128, n_blocks, (1, 2)), 33 | ResEncoderBlock(128, 256, n_blocks, (1, 2)), 34 | ResEncoderBlock(256, 384, n_blocks, (1, 2)), 35 | ResEncoderBlock(384, 384, n_blocks, (1, 2)) 36 | ]) 37 | 38 | def forward(self, x): 39 | concat_tensors = [] 40 | 41 | for layer in self.en_blocks: 42 | _, x = layer(x) 43 | concat_tensors.append(_) 44 | 45 | return x, concat_tensors -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/FCPE/wav2mel.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | from torchaudio.transforms import Resample 6 | 7 | sys.path.append(os.getcwd()) 8 | 9 | from advanced_rvc_inference.library.predictors.FCPE.stft import STFT 10 | 11 | class Wav2Mel: 12 | def __init__(self, device=None, dtype=torch.float32): 13 | self.sample_rate = 16000 14 | self.hop_size = 160 15 | if device is None: device = "cuda" if torch.cuda.is_available() else "cpu" 16 | self.device = device 17 | self.dtype = dtype 18 | self.stft = STFT(16000, 128, 1024, 1024, 160, 0, 8000) 19 | self.resample_kernel = {} 20 | 21 | def extract_nvstft(self, audio, keyshift=0, train=False): 22 | return self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(1, 2) 23 | 24 | def extract_mel(self, audio, sample_rate, keyshift=0, train=False): 25 | audio = audio.to(self.dtype).to(self.device) 26 | if sample_rate == self.sample_rate: audio_res = audio 27 | else: 28 | key_str = str(sample_rate) 29 | if key_str not in self.resample_kernel: self.resample_kernel[key_str] = Resample(sample_rate, self.sample_rate, lowpass_filter_width=128) 30 | self.resample_kernel[key_str] = (self.resample_kernel[key_str].to(self.dtype).to(self.device)) 31 | audio_res = self.resample_kernel[key_str](audio) 32 | 33 | mel = self.extract_nvstft(audio_res, keyshift=keyshift, train=train) 34 | n_frames = int(audio.shape[1] // self.hop_size) + 1 35 | mel = (torch.cat((mel, mel[:, -1:, :]), 1) if n_frames > int(mel.shape[1]) else mel) 36 | return mel[:, :n_frames, :] if n_frames < int(mel.shape[1]) else mel 37 | 38 | def __call__(self, audio, sample_rate, keyshift=0, train=False): 39 | return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train) -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/extra/extra.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import gradio as gr 5 | 6 | sys.path.append(os.getcwd()) 7 | 8 | from advanced_rvc_inference.variables import translations, configs 9 | from advanced_rvc_inference.tabs.extra.child.fushion import fushion_tab 10 | from advanced_rvc_inference.tabs.extra.child.settings import settings_tab 11 | from advanced_rvc_inference.tabs.extra.child.read_model import read_model_tab 12 | from advanced_rvc_inference.tabs.extra.child.f0_extract import f0_extract_tab 13 | from advanced_rvc_inference.tabs.extra.child.create_srt import 
create_srt_tab 14 | from advanced_rvc_inference.tabs.extra.child.convert_model import convert_model_tab 15 | 16 | def extra_tab(app): 17 | with gr.TabItem(translations["extra"], visible=configs.get("extra_tab", True)): 18 | with gr.TabItem(translations["fushion"], visible=configs.get("fushion_tab", True)): 19 | gr.Markdown(translations["fushion_markdown"]) 20 | fushion_tab() 21 | 22 | with gr.TabItem(translations["read_model"], visible=configs.get("read_tab", True)): 23 | gr.Markdown(translations["read_model_markdown"]) 24 | read_model_tab() 25 | 26 | with gr.TabItem(translations["convert_model"], visible=configs.get("onnx_tab", True)): 27 | gr.Markdown(translations["pytorch2onnx"]) 28 | convert_model_tab() 29 | 30 | with gr.TabItem(translations["f0_extractor_tab"], visible=configs.get("f0_extractor_tab", True)): 31 | gr.Markdown(translations["f0_extractor_markdown"]) 32 | f0_extract_tab() 33 | 34 | with gr.TabItem(translations["create_srt_tab"], visible=configs.get("create_srt_tab", True)): 35 | gr.Markdown(translations["create_srt_markdown"]) 36 | create_srt_tab() 37 | 38 | with gr.TabItem(translations["settings"], visible=configs.get("settings_tab", True)): 39 | gr.Markdown(translations["settings_markdown"]) 40 | settings_tab(app) -------------------------------------------------------------------------------- /advanced_rvc_inference/core/f0_extract.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.append(os.getcwd()) 5 | 6 | from advanced_rvc_inference.core.ui import gr_info, gr_warning 7 | from advanced_rvc_inference.variables import config, translations, configs 8 | 9 | def f0_extract(audio, f0_method, f0_onnx): 10 | if not audio or not os.path.exists(audio) or os.path.isdir(audio): 11 | gr_warning(translations["input_not_valid"]) 12 | return [None]*2 13 | 14 | import librosa 15 | import numpy as np 16 | import matplotlib.pyplot as plt 17 | 18 | from advanced_rvc_inference.library.utils import check_assets, load_audio 19 | from advanced_rvc_inference.library.predictors.Generator import Generator 20 | 21 | check_assets(f0_method, "", f0_onnx, "") 22 | 23 | f0_path = os.path.join(configs["f0_path"], os.path.splitext(os.path.basename(audio))[0]) 24 | image_path = os.path.join(f0_path, "f0.png") 25 | txt_path = os.path.join(f0_path, "f0.txt") 26 | 27 | gr_info(translations["start_extract"]) 28 | 29 | if not os.path.exists(f0_path): os.makedirs(f0_path, exist_ok=True) 30 | 31 | y = load_audio(audio, sample_rate=16000) 32 | f0_generator = Generator(16000, 160, 50, 1100, 0.5, is_half=config.is_half, device=config.device, f0_onnx_mode=f0_onnx, del_onnx_model=f0_onnx) 33 | _, pitchf = f0_generator.calculator(config.x_pad, f0_method, y, 0, None, 3, False, 0, None, False) 34 | 35 | F_temp = np.array(pitchf, dtype=np.float32) 36 | F_temp[F_temp == 0] = np.nan 37 | 38 | f0 = 1200 * np.log2(F_temp / librosa.midi_to_hz(0)) 39 | 40 | plt.figure(figsize=(10, 4)) 41 | plt.plot(f0) 42 | plt.title(f0_method) 43 | plt.xlabel(translations["time_frames"]) 44 | plt.ylabel(translations["Frequency"]) 45 | plt.savefig(image_path) 46 | plt.close() 47 | 48 | with open(txt_path, "w") as f: 49 | for i, f0_value in enumerate(f0): 50 | f.write(f"{i * 100.0},{f0_value}\n") 51 | 52 | gr_info(translations["extract_done"]) 53 | 54 | return [txt_path, image_path] -------------------------------------------------------------------------------- /advanced_rvc_inference/library/uvr5_lib/demucs/states.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import inspect 5 | import warnings 6 | import functools 7 | 8 | sys.path.append(os.getcwd()) 9 | 10 | from advanced_rvc_inference.variables import translations 11 | 12 | def load_model(path_or_package, strict=False): 13 | if isinstance(path_or_package, dict): package = path_or_package 14 | elif isinstance(path_or_package, (str, os.PathLike)): 15 | with warnings.catch_warnings(): 16 | warnings.simplefilter("ignore") 17 | 18 | package = torch.load(path_or_package, map_location="cpu", weights_only=False) 19 | else: raise ValueError(f"{translations['type_not_valid']} {path_or_package}.") 20 | 21 | klass = package["klass"] 22 | args = package["args"] 23 | kwargs = package["kwargs"] 24 | 25 | if strict: model = klass(*args, **kwargs) 26 | else: 27 | sig = inspect.signature(klass) 28 | 29 | for key in list(kwargs): 30 | if key not in sig.parameters: 31 | warnings.warn(translations["del_parameter"] + key) 32 | 33 | del kwargs[key] 34 | 35 | model = klass(*args, **kwargs) 36 | 37 | state = package["state"] 38 | 39 | set_state(model, state) 40 | 41 | return model 42 | 43 | def restore_quantized_state(model, state): 44 | assert "meta" in state 45 | 46 | quantizer = state["meta"]["klass"](model, **state["meta"]["init_kwargs"]) 47 | 48 | quantizer.restore_quantized_state(state) 49 | 50 | quantizer.detach() 51 | 52 | def set_state(model, state, quantizer=None): 53 | if state.get("__quantized"): 54 | if quantizer is not None: quantizer.restore_quantized_state(model, state["quantized"]) 55 | else: restore_quantized_state(model, state) 56 | else: model.load_state_dict(state) 57 | 58 | return state 59 | 60 | def capture_init(init): 61 | @functools.wraps(init) 62 | def __init__(self, *args, **kwargs): 63 | self._init_args_kwargs = (args, kwargs) 64 | 65 | init(self, *args, **kwargs) 66 | 67 | return __init__ -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/PENN/core.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | 4 | import torch.nn.functional as F 5 | 6 | PITCH_BINS, CENTS_PER_BIN, OCTAVE = 1440, 5, 1200 7 | 8 | def frequency_to_bins(frequency, quantize_fn=torch.floor): 9 | return cents_to_bins(frequency_to_cents(frequency), quantize_fn) 10 | 11 | def cents_to_bins(cents, quantize_fn=torch.floor): 12 | bins = quantize_fn(cents / CENTS_PER_BIN).long() 13 | bins[bins < 0] = 0 14 | bins[bins >= PITCH_BINS] = PITCH_BINS - 1 15 | return bins 16 | 17 | def cents_to_frequency(cents): 18 | return 31 * 2 ** (cents / OCTAVE) 19 | 20 | def bins_to_cents(bins): 21 | return CENTS_PER_BIN * bins 22 | 23 | def frequency_to_cents(frequency): 24 | return OCTAVE * (frequency / 31).log2() 25 | 26 | def seconds_to_samples(seconds, sample_rate=8000): 27 | return seconds * sample_rate 28 | 29 | def interpolate(pitch, periodicity, value): 30 | voiced = periodicity > value 31 | if not voiced.any(): return pitch 32 | 33 | pitch = pitch.log2() 34 | pitch[..., 0] = pitch[voiced][..., 0] 35 | pitch[..., -1] = pitch[voiced][..., -1] 36 | voiced[..., 0] = True 37 | voiced[..., -1] = True 38 | pitch[~voiced] = _interpolate(torch.where(~voiced[0])[0][None], torch.where(voiced[0])[0][None], pitch[voiced][None]) 39 | 40 | return 2 ** pitch 41 | 42 | def _interpolate(x, xp, fp): 43 | if xp.shape[-1] == 0: return x 44 | if xp.shape[-1] == 1: return torch.full(x.shape, 
fp.squeeze(), device=fp.device, dtype=fp.dtype) 45 | 46 | m = (fp[:, 1:] - fp[:, :-1]) / (xp[:, 1:] - xp[:, :-1]) 47 | b = fp[:, :-1] - (m.mul(xp[:, :-1])) 48 | 49 | indicies = x[:, :, None].ge(xp[:, None, :]).sum(-1) - 1 50 | indicies = indicies.clamp(0, m.shape[-1] - 1) 51 | line_idx = torch.linspace(0, indicies.shape[0], 1, device=indicies.device).to(torch.long).expand(indicies.shape) 52 | 53 | return m[line_idx, indicies].mul(x) + b[line_idx, indicies] 54 | 55 | def entropy(logits): 56 | distribution = F.softmax(logits, dim=1) 57 | return (1 + 1 / math.log(PITCH_BINS) * (distribution * (distribution + 1e-7).log()).sum(dim=1)) -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/SWIFT/SWIFT.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import onnxruntime 3 | 4 | import numpy as np 5 | 6 | SAMPLE_RATE, HOP_LENGTH, FRAME_LENGTH = 16000, 256, 1024 7 | 8 | class SWIFT: 9 | def __init__(self, model_path, fmin = 50, fmax = 1100, confidence_threshold = 0.9, providers = ["CPUExecutionProvider"]): 10 | self.fmin = fmin 11 | self.fmax = fmax 12 | self.confidence_threshold = confidence_threshold 13 | session_options = onnxruntime.SessionOptions() 14 | session_options.inter_op_num_threads = 1 15 | session_options.intra_op_num_threads = 1 16 | self.pitch_session = onnxruntime.InferenceSession(model_path, session_options, providers=providers) 17 | self.pitch_input_name = self.pitch_session.get_inputs()[0].name 18 | 19 | def _extract_pitch_and_confidence(self, audio_16k): 20 | if audio_16k.ndim != 1 or len(audio_16k) == 0: raise ValueError 21 | if len(audio_16k) < 256: audio_16k = np.pad(audio_16k, (0, max(0, 256 - len(audio_16k))), mode="constant") 22 | 23 | outputs = self.pitch_session.run(None, {self.pitch_input_name: audio_16k[None, :].astype(np.float32)}) 24 | if len(outputs) < 2: raise RuntimeError 25 | 26 | return outputs[0][0], outputs[1][0] 27 | 28 | def _compute_voicing(self, pitch_hz, confidence): 29 | return (confidence > self.confidence_threshold) & (pitch_hz >= self.fmin) & (pitch_hz <= self.fmax) 30 | 31 | def _calculate_timestamps(self, n_frames): 32 | frame_centers = np.arange(n_frames) * HOP_LENGTH + ((FRAME_LENGTH - 1) / 2 - ((FRAME_LENGTH - HOP_LENGTH) // 2)) 33 | return frame_centers / SAMPLE_RATE 34 | 35 | def detect_from_array(self, audio_array, sample_rate=SAMPLE_RATE): 36 | if audio_array.ndim > 1: audio_array = np.mean(audio_array, axis=-1) 37 | 38 | audio_16k = librosa.resample(audio_array.astype(np.float32), orig_sr=sample_rate, target_sr=SAMPLE_RATE) if sample_rate != SAMPLE_RATE else audio_array 39 | pitch_hz, confidence = self._extract_pitch_and_confidence(audio_16k) 40 | 41 | return pitch_hz, self._compute_voicing(pitch_hz, confidence), self._calculate_timestamps(len(pitch_hz)) -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/train/training/extract_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import hashlib 5 | import datetime 6 | 7 | from collections import OrderedDict 8 | 9 | sys.path.append(os.getcwd()) 10 | 11 | from advanced_rvc_inference.variables import logger, translations, config 12 | from advanced_rvc_inference.infer.training.utils import replace_keys_in_dict 13 | 14 | def extract_model(ckpt, sr, pitch_guidance, name, model_path, epoch, step, version, hps, model_author, 
vocoder, energy_use): 15 | try: 16 | logger.info(translations["savemodel"].format(model_dir=model_path, epoch=epoch, step=step)) 17 | os.makedirs(os.path.dirname(model_path), exist_ok=True) 18 | 19 | opt = OrderedDict(weight={key: (value if not config.device.startswith("privateuseone") else value.detach().cpu()).to(torch.float16 if config.is_half else torch.float32) for key, value in ckpt.items() if "enc_q" not in key}) 20 | opt["config"] = [hps.data.filter_length // 2 + 1, 32, hps.model.inter_channels, hps.model.hidden_channels, hps.model.filter_channels, hps.model.n_heads, hps.model.n_layers, hps.model.kernel_size, hps.model.p_dropout, hps.model.resblock, hps.model.resblock_kernel_sizes, hps.model.resblock_dilation_sizes, hps.model.upsample_rates, hps.model.upsample_initial_channel, hps.model.upsample_kernel_sizes, hps.model.spk_embed_dim, hps.model.gin_channels, hps.data.sample_rate] 21 | opt["epoch"] = f"{epoch}epoch" 22 | opt["step"] = step 23 | opt["sr"] = sr 24 | opt["f0"] = int(pitch_guidance) 25 | opt["version"] = version 26 | opt["creation_date"] = datetime.datetime.now().isoformat() 27 | opt["model_hash"] = hashlib.sha256(f"{str(ckpt)} {epoch} {step} {datetime.datetime.now().isoformat()}".encode()).hexdigest() 28 | opt["model_name"] = name 29 | opt["author"] = model_author 30 | opt["vocoder"] = vocoder 31 | opt["energy"] = energy_use 32 | 33 | torch.save(replace_keys_in_dict(replace_keys_in_dict(opt, ".parametrizations.weight.original1", ".weight_v"), ".parametrizations.weight.original0", ".weight_g"), model_path) 34 | except Exception as e: 35 | logger.error(f"{translations['extract_model_error']}: {e}") -------------------------------------------------------------------------------- /advanced_rvc_inference/library/algorithm/commons.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def init_weights(m, mean=0.0, std=0.01): 4 | if m.__class__.__name__.find("Conv") != -1: m.weight.data.normal_(mean, std) 5 | 6 | def get_padding(kernel_size, dilation=1): 7 | return int((kernel_size * dilation - dilation) / 2) 8 | 9 | def convert_pad_shape(pad_shape): 10 | return [item for sublist in pad_shape[::-1] for item in sublist] 11 | 12 | def slice_segments(x, ids_str, segment_size = 4, dim = 2): 13 | if dim == 2: ret = torch.zeros_like(x[:, :segment_size]) 14 | elif dim == 3: ret = torch.zeros_like(x[:, :, :segment_size]) 15 | 16 | for i in range(x.size(0)): 17 | idx_str = ids_str[i].item() 18 | idx_end = idx_str + segment_size 19 | 20 | if dim == 2: ret[i] = x[i, idx_str:idx_end] 21 | else: ret[i] = x[i, :, idx_str:idx_end] 22 | 23 | return ret 24 | 25 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 26 | b, _, t = x.size() 27 | if x_lengths is None: x_lengths = t 28 | 29 | ids_str = (torch.rand([b]).to(device=x.device) * (x_lengths - segment_size + 1)).to(dtype=torch.long) 30 | 31 | return slice_segments(x, ids_str, segment_size, dim=3), ids_str 32 | 33 | @torch.jit.script 34 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 35 | n_channels_int = n_channels[0] 36 | in_act = input_a + input_b 37 | 38 | return in_act[:, :n_channels_int, :].tanh() * in_act[:, n_channels_int:, :].sigmoid() 39 | 40 | def sequence_mask(length, max_length = None): 41 | if max_length is None: max_length = length.max() 42 | return torch.arange(max_length, dtype=length.dtype, device=length.device).unsqueeze(0) < length.unsqueeze(1) 43 | 44 | def clip_grad_value(parameters, clip_value, norm_type=2): 45 | if 
isinstance(parameters, torch.Tensor): parameters = [parameters] 46 | norm_type = float(norm_type) 47 | 48 | if clip_value is not None: clip_value = float(clip_value) 49 | total_norm = 0 50 | 51 | for p in list(filter(lambda p: p.grad is not None, parameters)): 52 | total_norm += (p.grad.data.norm(norm_type)).item() ** norm_type 53 | 54 | if clip_value is not None: p.grad.data.clamp_(min=-clip_value, max=clip_value) 55 | 56 | return total_norm ** (1.0 / norm_type) -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/extra/child/fushion.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import gradio as gr 5 | 6 | sys.path.append(os.getcwd()) 7 | 8 | from advanced_rvc_inference.core.ui import visible, shutil_move 9 | from advanced_rvc_inference.core.model_utils import fushion_model 10 | from advanced_rvc_inference.variables import translations, configs 11 | 12 | def fushion_tab(): 13 | with gr.Row(): 14 | gr.Markdown(translations["fushion_markdown_2"]) 15 | with gr.Row(): 16 | name_to_save = gr.Textbox(label=translations["modelname"], placeholder="Model.pth", value="", max_lines=1, interactive=True) 17 | with gr.Row(): 18 | fushion_button = gr.Button(translations["fushion"], variant="primary", scale=4) 19 | with gr.Column(): 20 | with gr.Row(): 21 | model_a = gr.File(label=f"{translations['model_name']} 1", file_types=[".pth", ".onnx"]) 22 | model_b = gr.File(label=f"{translations['model_name']} 2", file_types=[".pth", ".onnx"]) 23 | with gr.Row(): 24 | model_path_a = gr.Textbox(label=f"{translations['model_path']} 1", value="", placeholder="assets/weights/Model_1.pth") 25 | model_path_b = gr.Textbox(label=f"{translations['model_path']} 2", value="", placeholder="assets/weights/Model_2.pth") 26 | with gr.Row(): 27 | ratio = gr.Slider(minimum=0, maximum=1, label=translations["model_ratio"], info=translations["model_ratio_info"], value=0.5, interactive=True) 28 | with gr.Row(): 29 | output_model = gr.File(label=translations["output_model_path"], file_types=[".pth", ".onnx"], interactive=False, visible=False) 30 | with gr.Row(): 31 | model_a.upload(fn=lambda model: shutil_move(model.name, configs["weights_path"]), inputs=[model_a], outputs=[model_path_a]) 32 | model_b.upload(fn=lambda model: shutil_move(model.name, configs["weights_path"]), inputs=[model_b], outputs=[model_path_b]) 33 | with gr.Row(): 34 | fushion_button.click( 35 | fn=fushion_model, 36 | inputs=[ 37 | name_to_save, 38 | model_path_a, 39 | model_path_b, 40 | ratio 41 | ], 42 | outputs=[name_to_save, output_model], 43 | api_name="fushion_model" 44 | ) 45 | fushion_button.click(fn=lambda: visible(True), inputs=[], outputs=[output_model]) -------------------------------------------------------------------------------- /installer.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | REM Advanced RVC Inference - Windows Installation Script 3 | REM This script installs all necessary dependencies for Advanced RVC Inference 4 | 5 | echo =========================================== 6 | echo Advanced RVC Inference Installation Script 7 | echo =========================================== 8 | 9 | REM Set up environment variables 10 | set PIP_PREFER_BINARY=1 11 | set PYTHONPATH=%CD%;%PYTHONPATH% 12 | 13 | echo Setting up Python environment... 14 | 15 | REM Install uv for fast package management 16 | echo Installing uv... 
17 | powershell -Command "Invoke-RestMethod -Uri https://astral.sh/uv/install.ps1 | Invoke-Expression" 18 | 19 | REM Add uv to PATH for current session 20 | set PATH=%LOCALAPPDATA%\uv;%PATH% 21 | 22 | REM Create virtual environment using uv 23 | echo Creating virtual environment... 24 | uv venv 25 | 26 | REM Activate the virtual environment 27 | call .venv\Scripts\activate.bat 28 | 29 | echo Virtual environment activated. 30 | 31 | REM Install torch with CUDA support 32 | echo Installing PyTorch with CUDA support... 33 | uv pip install --upgrade "torch>=2.0.0" "torchvision>=0.15.0" "torchaudio>=2.0.0" --index-url https://download.pytorch.org/whl/cu121 34 | 35 | REM Install dependencies from requirements.txt 36 | echo Installing requirements... 37 | uv pip install -r requirements.txt --index-strategy unsafe-best-match 38 | 39 | REM Install this package in development mode - only install if dependencies are available 40 | echo Installing Advanced RVC Inference package... 41 | uv pip install -e . || echo Warning: Development install failed, continuing with basic setup... 42 | 43 | REM Install prerequisites for RVC 44 | echo Installing RVC prerequisites... 45 | python -c "from advanced_rvc_inference.core import run_prerequisites_script; run_prerequisites_script(pretraineds_hifigan=True, models=True, exe=True)" 46 | 47 | echo =========================================== 48 | echo Installation completed successfully! 49 | echo =========================================== 50 | 51 | echo To run the application, use one of the following commands: 52 | echo python -m advanced_rvc_inference.app // Run with default settings 53 | echo python -m advanced_rvc_inference.app --share // Run with public sharing 54 | echo python -m advanced_rvc_inference.app --listen // Run with external access 55 | 56 | echo =========================================== 57 | 58 | pause -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/DJCM/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from torch import nn 4 | from einops.layers.torch import Rearrange 5 | 6 | SAMPLE_RATE, WINDOW_LENGTH, N_CLASS = 16000, 1024, 360 7 | 8 | def init_layer(layer): 9 | nn.init.xavier_uniform_(layer.weight) 10 | if hasattr(layer, "bias") and layer.bias is not None: layer.bias.data.fill_(0.0) 11 | 12 | def init_bn(bn): 13 | bn.bias.data.fill_(0.0) 14 | bn.weight.data.fill_(1.0) 15 | bn.running_mean.data.fill_(0.0) 16 | bn.running_var.data.fill_(1.0) 17 | 18 | class BiGRU(nn.Module): 19 | def __init__(self, patch_size, channels, depth): 20 | super(BiGRU, self).__init__() 21 | patch_width, patch_height = patch_size 22 | patch_dim = channels * patch_height * patch_width 23 | self.to_patch_embedding = nn.Sequential(Rearrange('b c (w p1) (h p2) -> b (w h) (p1 p2 c)', p1=patch_width, p2=patch_height)) 24 | self.gru = nn.GRU(patch_dim, patch_dim // 2, num_layers=depth, batch_first=True, bidirectional=True) 25 | 26 | def forward(self, x): 27 | x = self.to_patch_embedding(x) 28 | try: 29 | return self.gru(x)[0] 30 | except: 31 | torch.backends.cudnn.enabled = False 32 | return self.gru(x)[0] 33 | 34 | class ResConvBlock(nn.Module): 35 | def __init__(self, in_planes, out_planes): 36 | super(ResConvBlock, self).__init__() 37 | self.bn1 = nn.BatchNorm2d(in_planes, momentum=0.01) 38 | self.bn2 = nn.BatchNorm2d(out_planes, momentum=0.01) 39 | self.act1 = nn.PReLU() 40 | self.act2 = nn.PReLU() 41 | self.conv1 = 
nn.Conv2d(in_planes, out_planes, (3, 3), padding=(1, 1), bias=False) 42 | self.conv2 = nn.Conv2d(out_planes, out_planes, (3, 3), padding=(1, 1), bias=False) 43 | self.is_shortcut = False 44 | 45 | if in_planes != out_planes: 46 | self.shortcut = nn.Conv2d(in_planes, out_planes, (1, 1)) 47 | self.is_shortcut = True 48 | 49 | self.init_weights() 50 | 51 | def init_weights(self): 52 | init_bn(self.bn1) 53 | init_bn(self.bn2) 54 | init_layer(self.conv1) 55 | init_layer(self.conv2) 56 | if self.is_shortcut: init_layer(self.shortcut) 57 | 58 | def forward(self, x): 59 | out = self.conv2(self.act2(self.bn2(self.conv1(self.act1(self.bn1(x)))))) 60 | 61 | if self.is_shortcut: return self.shortcut(x) + out 62 | else: return out + x -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Advanced-RVC 2 | 3 | Thank you for your interest in contributing to Advanced-RVC! We’re excited to have you here, and we can’t wait to see what you’ll bring to our community-driven organization. This guide will walk you through how you can participate and contribute to our open-source projects. 4 | 5 | ## How to Contribute 6 | 7 | There are several ways you can get involved: 8 | 9 | ### 1. Reporting Issues 10 | If you encounter bugs or have feature requests, you can help by [reporting an issue](https://github.com/ArkanDash/Advanced-RVC-Inference/issues). Please use clear, detailed information to help us understand the problem. Include: 11 | - A brief description of the issue. 12 | - Steps to reproduce the bug (if applicable). 13 | - Any suggestions you have for solving the problem. 14 | 15 | ### 2. Suggesting Enhancements 16 | AI is a rapidly evolving field, and we welcome ideas for improvement! If you have an idea for an enhancement, please: 17 | - Open a [new issue](https://github.com/ArkanDash/Advanced-RVC-Inference/issues/new) and describe your suggestion. 18 | - Include why you think this enhancement would be useful. 19 | - Mention any alternatives you’ve considered. 20 | 21 | ### 3. Submitting Code 22 | We are always looking for new code contributions, including new features, bug fixes, and documentation improvements. Here’s how you can submit code: 23 | 1. **Fork the repository** you want to contribute to. 24 | 2. **Create a branch** for your feature or fix: 25 | ``` 26 | git checkout -b your-branch-name 27 | ``` 28 | 3. **Make your changes** and commit them to your branch. Write a clear commit message. 29 | 4. **Push to your fork** and submit a [Pull Request (PR)](https://github.com/ArkanDash/Advanced-RVC-Inference/pulls). 30 | 31 | #### Code Style Guidelines 32 | - Follow existing styles in the project you’re contributing to. 33 | - Make sure your code is clean, readable, and well-documented. 34 | 35 | ### 4. Improving Documentation 36 | If you find areas of our documentation that need improvement, we’d love your help! Whether it’s fixing a typo or writing new tutorials, all contributions are appreciated. You can: 37 | - Edit markdown files directly in GitHub or locally. 38 | - Submit your changes via a [Pull Request](https://github.com/ArkanDash/Advanced-RVC-Inference/pulls). 39 | 40 | ## Community & Discussions 41 | 42 | Want to discuss your ideas, ask questions, or connect with other community members? 43 | Join our [Discord](https://discord.gg/hvmsukmBHE) for real-time conversations. 
44 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/backends/zluda.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | if torch.cuda.is_available() and torch.cuda.get_device_name().endswith("[ZLUDA]"): 4 | class STFT: 5 | def __init__(self): 6 | self.device = "cuda" 7 | self.fourier_bases = {} 8 | 9 | def _get_fourier_basis(self, n_fft): 10 | if n_fft in self.fourier_bases: 11 | return self.fourier_bases[n_fft] 12 | 13 | fourier_basis = torch.fft.fft(torch.eye(n_fft, device="cpu")).to( 14 | self.device 15 | ) 16 | 17 | cutoff = n_fft // 2 + 1 18 | fourier_basis = torch.cat( 19 | [fourier_basis.real[:cutoff], fourier_basis.imag[:cutoff]], dim=0 20 | ) 21 | 22 | self.fourier_bases[n_fft] = fourier_basis 23 | return fourier_basis 24 | 25 | def transform(self, input, n_fft, hop_length, window): 26 | fourier_basis = self._get_fourier_basis(n_fft) 27 | fourier_basis = fourier_basis * window 28 | 29 | pad_amount = n_fft // 2 30 | input = torch.nn.functional.pad( 31 | input, (pad_amount, pad_amount), mode="reflect" 32 | ) 33 | 34 | input_frames = input.unfold(1, n_fft, hop_length).permute(0, 2, 1) 35 | fourier_transform = fourier_basis @ input_frames 36 | cutoff = n_fft // 2 + 1 37 | 38 | return torch.complex( 39 | fourier_transform[:, :cutoff, :], fourier_transform[:, cutoff:, :] 40 | ) 41 | 42 | stft = STFT() 43 | _torch_stft = torch.stft 44 | 45 | def z_stft(input, window, *args, **kwargs): 46 | if ( 47 | kwargs.get("win_length") == None 48 | and kwargs.get("center") == None 49 | and kwargs.get("return_complex") == True 50 | ): 51 | return stft.transform( 52 | input, kwargs.get("n_fft"), kwargs.get("hop_length"), window 53 | ) 54 | else: 55 | return _torch_stft( 56 | input=input.cpu(), window=window.cpu(), *args, **kwargs 57 | ).to(input.device) 58 | 59 | def z_jit(f, *_, **__): 60 | f.graph = torch._C.Graph() 61 | return f 62 | 63 | torch.stft = z_stft 64 | torch.jit.script = z_jit 65 | torch.backends.cudnn.enabled = False 66 | torch.backends.cuda.enable_flash_sdp(False) 67 | torch.backends.cuda.enable_math_sdp(True) 68 | torch.backends.cuda.enable_mem_efficient_sdp(False) -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/RMVPE/mel.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | import numpy as np 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | from librosa.filters import mel 10 | 11 | sys.path.append(os.getcwd()) 12 | 13 | class MelSpectrogram(nn.Module): 14 | def __init__(self, n_mel_channels, sample_rate, win_length, hop_length, n_fft=None, mel_fmin=0, mel_fmax=None, clamp=1e-5): 15 | super().__init__() 16 | n_fft = win_length if n_fft is None else n_fft 17 | self.hann_window = {} 18 | mel_basis = mel(sr=sample_rate, n_fft=n_fft, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax, htk=True) 19 | mel_basis = torch.from_numpy(mel_basis).float() 20 | self.register_buffer("mel_basis", mel_basis) 21 | self.n_fft = win_length if n_fft is None else n_fft 22 | self.hop_length = hop_length 23 | self.win_length = win_length 24 | self.sample_rate = sample_rate 25 | self.n_mel_channels = n_mel_channels 26 | self.clamp = clamp 27 | 28 | def forward(self, audio, keyshift=0, speed=1, center=True): 29 | factor = 2 ** (keyshift / 12) 30 | win_length_new = int(np.round(self.win_length * factor)) 31 | 
keyshift_key = str(keyshift) + "_" + str(audio.device) 32 | if keyshift_key not in self.hann_window: self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(audio.device) 33 | 34 | n_fft = int(np.round(self.n_fft * factor)) 35 | hop_length = int(np.round(self.hop_length * speed)) 36 | 37 | if str(audio.device).startswith(("ocl", "privateuseone")): 38 | if not hasattr(self, "stft"): 39 | from main.library.backends.utils import STFT 40 | self.stft = STFT(filter_length=n_fft, hop_length=hop_length, win_length=win_length_new).to(audio.device) 41 | magnitude = self.stft.transform(audio, 1e-9) 42 | else: 43 | fft = torch.stft(audio, n_fft=n_fft, hop_length=hop_length, win_length=win_length_new, window=self.hann_window[keyshift_key], center=center, return_complex=True) 44 | magnitude = (fft.real.pow(2) + fft.imag.pow(2)).sqrt() 45 | 46 | if keyshift != 0: 47 | size = self.n_fft // 2 + 1 48 | resize = magnitude.size(1) 49 | if resize < size: magnitude = F.pad(magnitude, (0, 0, 0, size - resize)) 50 | magnitude = magnitude[:, :size, :] * self.win_length / win_length_new 51 | 52 | mel_output = self.mel_basis @ magnitude 53 | return mel_output.clamp(min=self.clamp).log() -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/FCPE/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | from torch import nn 5 | from io import BytesIO 6 | from Crypto.Cipher import AES 7 | from Crypto.Util.Padding import unpad 8 | 9 | def decrypt_model(configs, input_path): 10 | with open(input_path, "rb") as f: 11 | data = f.read() 12 | 13 | with open(os.path.join(configs["binary_path"], "decrypt.bin"), "rb") as f: 14 | key = f.read() 15 | 16 | return BytesIO(unpad(AES.new(key, AES.MODE_CBC, data[:16]).decrypt(data[16:]), AES.block_size)).read() 17 | 18 | def calc_same_padding(kernel_size): 19 | pad = kernel_size // 2 20 | return (pad, pad - (kernel_size + 1) % 2) 21 | 22 | def l2_regularization(model, l2_alpha): 23 | l2_loss = [] 24 | for module in model.modules(): 25 | if type(module) is nn.Conv2d: l2_loss.append((module.weight**2).sum() / 2.0) 26 | 27 | return l2_alpha * sum(l2_loss) 28 | 29 | def torch_interp(x, xp, fp): 30 | sort_idx = xp.argsort() 31 | xp = xp[sort_idx] 32 | fp = fp[sort_idx] 33 | 34 | right_idxs = torch.searchsorted(xp, x).clamp(max=len(xp) - 1) 35 | left_idxs = (right_idxs - 1).clamp(min=0) 36 | x_left = xp[left_idxs] 37 | y_left = fp[left_idxs] 38 | 39 | interp_vals = y_left + ((x - x_left) * (fp[right_idxs] - y_left) / (xp[right_idxs] - x_left)) 40 | interp_vals[x < xp[0]] = fp[0] 41 | interp_vals[x > xp[-1]] = fp[-1] 42 | 43 | return interp_vals 44 | 45 | def batch_interp_with_replacement_detach(uv, f0): 46 | result = f0.clone() 47 | for i in range(uv.shape[0]): 48 | interp_vals = torch_interp(torch.where(uv[i])[-1], torch.where(~uv[i])[-1], f0[i][~uv[i]]).detach() 49 | result[i][uv[i]] = interp_vals 50 | 51 | return result 52 | 53 | class DotDict(dict): 54 | def __getattr__(*args): 55 | val = dict.get(*args) 56 | return DotDict(val) if type(val) is dict else val 57 | 58 | __setattr__ = dict.__setitem__ 59 | __delattr__ = dict.__delitem__ 60 | 61 | class Swish(nn.Module): 62 | def forward(self, x): 63 | return x * x.sigmoid() 64 | 65 | class Transpose(nn.Module): 66 | def __init__(self, dims): 67 | super().__init__() 68 | assert len(dims) == 2, "dims == 2" 69 | self.dims = dims 70 | 71 | def forward(self, x): 72 | return 
x.transpose(*self.dims) 73 | 74 | class GLU(nn.Module): 75 | def __init__(self, dim): 76 | super().__init__() 77 | self.dim = dim 78 | 79 | def forward(self, x): 80 | out, gate = x.chunk(2, dim=self.dim) 81 | return out * gate.sigmoid() -------------------------------------------------------------------------------- /advanced_rvc_inference/core/csrt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.append(os.getcwd()) 5 | 6 | from advanced_rvc_inference.core.inference import whisper_process 7 | from advanced_rvc_inference.library.utils import check_spk_diarization 8 | from advanced_rvc_inference.core.ui import gr_info, gr_warning, process_output 9 | from advanced_rvc_inference.variables import config, translations, configs, logger 10 | 11 | def create_srt(model_size, input_audio, output_file, word_timestamps): 12 | import multiprocessing as mp 13 | 14 | if not input_audio or not os.path.exists(input_audio) or os.path.isdir(input_audio): 15 | gr_warning(translations["input_not_valid"]) 16 | return [None]*2 17 | 18 | if not output_file: 19 | gr_warning(translations["output_not_valid"]) 20 | return [None]*2 21 | 22 | if not output_file.endswith(".srt"): output_file += ".srt" 23 | 24 | output_dir = os.path.dirname(output_file) 25 | if output_dir and not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True) 26 | 27 | info = "" 28 | output_file = process_output(output_file) 29 | 30 | check_spk_diarization(model_size, speechbrain=False) 31 | gr_info(translations["csrt"]) 32 | 33 | try: 34 | mp.set_start_method("spawn") 35 | except RuntimeError: 36 | pass 37 | 38 | whisper_queue = mp.Queue() 39 | whisperprocess = mp.Process(target=whisper_process, args=(model_size, input_audio, configs, config.device, whisper_queue, word_timestamps)) 40 | whisperprocess.start() 41 | 42 | segments = whisper_queue.get() 43 | 44 | with open(output_file, "w", encoding="utf-8") as f: 45 | for i, segment in enumerate(segments): 46 | start = segment["start"] 47 | end = segment["end"] 48 | text = segment["text"].strip() 49 | 50 | index = f"{i+1}\n" 51 | timestamp = f"{format_timestamp(start)} --> {format_timestamp(end)}\n" 52 | text1 = f"{text}\n\n" 53 | 54 | f.write(index) 55 | f.write(timestamp) 56 | f.write(text1) 57 | 58 | info = info + index + timestamp + text1 59 | logger.info(info) 60 | 61 | gr_info(translations["success"]) 62 | 63 | return [{"value": output_file, "visible": True, "__type__": "update"}, info] 64 | 65 | def format_timestamp(seconds): 66 | hours = int(seconds // 3600) 67 | minutes = int((seconds % 3600) // 60) 68 | 69 | milliseconds = int((seconds - int(seconds)) * 1000) 70 | seconds = int(seconds % 60) 71 | 72 | return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}" -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/DJCM/decoder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | sys.path.append(os.getcwd()) 9 | 10 | from advanced_rvc_inference.library.predictors.DJCM.encoder import ResEncoderBlock 11 | from advanced_rvc_inference.library.predictors.DJCM.utils import ResConvBlock, BiGRU, init_bn, init_layer, N_CLASS, WINDOW_LENGTH 12 | 13 | class ResDecoderBlock(nn.Module): 14 | def __init__(self, in_channels, out_channels, n_blocks, stride): 15 | super(ResDecoderBlock, self).__init__() 16 | self.conv1 = 
nn.ConvTranspose2d(in_channels, out_channels, stride, stride, (0, 0), bias=False) 17 | self.bn1 = nn.BatchNorm2d(in_channels, momentum=0.01) 18 | self.conv = nn.ModuleList([ResConvBlock(out_channels * 2, out_channels)]) 19 | 20 | for _ in range(n_blocks - 1): 21 | self.conv.append(ResConvBlock(out_channels, out_channels)) 22 | 23 | self.init_weights() 24 | 25 | def init_weights(self): 26 | init_bn(self.bn1) 27 | init_layer(self.conv1) 28 | 29 | def forward(self, x, concat): 30 | x = self.conv1(F.relu_(self.bn1(x))) 31 | x = torch.cat((x, concat), dim=1) 32 | 33 | for each_layer in self.conv: 34 | x = each_layer(x) 35 | 36 | return x 37 | 38 | class Decoder(nn.Module): 39 | def __init__(self, n_blocks): 40 | super(Decoder, self).__init__() 41 | self.de_blocks = nn.ModuleList([ 42 | ResDecoderBlock(384, 384, n_blocks, (1, 2)), 43 | ResDecoderBlock(384, 384, n_blocks, (1, 2)), 44 | ResDecoderBlock(384, 256, n_blocks, (1, 2)), 45 | ResDecoderBlock(256, 128, n_blocks, (1, 2)), 46 | ResDecoderBlock(128, 64, n_blocks, (1, 2)), 47 | ResDecoderBlock(64, 32, n_blocks, (1, 2)) 48 | ]) 49 | 50 | def forward(self, x, concat_tensors): 51 | for i, layer in enumerate(self.de_blocks): 52 | x = layer(x, concat_tensors[-1 - i]) 53 | 54 | return x 55 | 56 | class PE_Decoder(nn.Module): 57 | def __init__(self, n_blocks, seq_layers=1): 58 | super(PE_Decoder, self).__init__() 59 | self.de_blocks = Decoder(n_blocks) 60 | self.after_conv1 = ResEncoderBlock(32, 32, n_blocks, None) 61 | self.after_conv2 = nn.Conv2d(32, 1, (1, 1)) 62 | self.fc = nn.Sequential(BiGRU((1, WINDOW_LENGTH // 2), 1, seq_layers), nn.Linear(WINDOW_LENGTH // 2, N_CLASS), nn.Sigmoid()) 63 | init_layer(self.after_conv2) 64 | 65 | def forward(self, x, concat_tensors): 66 | return self.fc(self.after_conv2(self.after_conv1(self.de_blocks(x, concat_tensors)))).squeeze(1) -------------------------------------------------------------------------------- /advanced_rvc_inference/library/generators/hifigan.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | import torch.nn.utils.parametrize as parametrize 6 | 7 | from torch.nn.utils import remove_weight_norm 8 | from torch.nn.utils.parametrizations import weight_norm 9 | 10 | sys.path.append(os.getcwd()) 11 | 12 | from advanced_rvc_inference.library.algorithm.commons import init_weights 13 | from advanced_rvc_inference.library.algorithm.residuals import ResBlock, LRELU_SLOPE 14 | 15 | class HiFiGANGenerator(torch.nn.Module): 16 | def __init__(self, initial_channel, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): 17 | super(HiFiGANGenerator, self).__init__() 18 | self.num_kernels = len(resblock_kernel_sizes) 19 | self.num_upsamples = len(upsample_rates) 20 | self.conv_pre = torch.nn.Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) 21 | self.ups = torch.nn.ModuleList() 22 | self.resblocks = torch.nn.ModuleList() 23 | 24 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 25 | self.ups.append(weight_norm(torch.nn.ConvTranspose1d(upsample_initial_channel // (2**i), upsample_initial_channel // (2 ** (i + 1)), k, u, padding=(k - u) // 2))) 26 | ch = upsample_initial_channel // (2 ** (i + 1)) 27 | 28 | for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes): 29 | self.resblocks.append(ResBlock(ch, k, d)) 30 | 31 | self.conv_post = torch.nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False) 32 | 
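# conv_post projects the final upsampled feature map down to a single waveform channel; init_weights (applied
# just below) re-initialises the transposed convolutions in self.ups, and self.cond injects the optional
# speaker embedding (gin_channels) into the generator input.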
self.ups.apply(init_weights) 33 | if gin_channels != 0: self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1) 34 | 35 | def forward(self, x, g = None): 36 | x = self.conv_pre(x) 37 | if g is not None: x += self.cond(g) 38 | 39 | for i in range(self.num_upsamples): 40 | x = self.ups[i](torch.nn.functional.leaky_relu(x, LRELU_SLOPE)) 41 | xs = None 42 | 43 | for j in range(self.num_kernels): 44 | if xs is None: xs = self.resblocks[i * self.num_kernels + j](x) 45 | else: xs += self.resblocks[i * self.num_kernels + j](x) 46 | x = xs / self.num_kernels 47 | 48 | return self.conv_post(torch.nn.functional.leaky_relu(x)).tanh() 49 | 50 | def remove_weight_norm(self): 51 | for l in self.ups: 52 | if hasattr(l, "parametrizations") and "weight" in l.parametrizations: parametrize.remove_parametrizations(l, "weight", leave_parametrized=True) 53 | else: remove_weight_norm(l) 54 | 55 | for l in self.resblocks: 56 | l.remove_weight_norm() -------------------------------------------------------------------------------- /advanced_rvc_inference/assets/config.txt: -------------------------------------------------------------------------------- 1 | # Netscape HTTP Cookie File 2 | # http://curl.haxx.se/rfc/cookie_spec.html 3 | # This is a generated file! Do not edit. 4 | 5 | .youtube.com TRUE / TRUE 1775861587 PREF f4=4000000&tz=Europe.Copenhagen 6 | .youtube.com TRUE / TRUE 1775429540 SOCS CAESEwgDEgk3MzM4ODU4MTgaAmVuIAEaBgiAy6O-Bg 7 | .youtube.com TRUE / TRUE 1772837586 __Secure-1PSIDTS sidts-CjIBEJ3XV6pCalNg0CjCWF_v2t6TsOrMxJZS0U4Syxsj1Ar5Wt0j_I0mV1HvK8pjgzceWRAA 8 | .youtube.com TRUE / TRUE 1772837586 __Secure-3PSIDTS sidts-CjIBEJ3XV6pCalNg0CjCWF_v2t6TsOrMxJZS0U4Syxsj1Ar5Wt0j_I0mV1HvK8pjgzceWRAA 9 | .youtube.com TRUE / FALSE 1775861586 HSID A0YpCKpriRsFf-eaY 10 | .youtube.com TRUE / TRUE 1775861586 SSID A81Z41FkWZ4pTWTeH 11 | .youtube.com TRUE / FALSE 1775861586 APISID Sz068zUXho-XjSR6/AQiI5USjyJu7Gohv0 12 | .youtube.com TRUE / TRUE 1775861586 SAPISID jhCFMxYOhwdvfusq/AN48BaLVbLYevdtWs 13 | .youtube.com TRUE / TRUE 1775861586 __Secure-1PAPISID jhCFMxYOhwdvfusq/AN48BaLVbLYevdtWs 14 | .youtube.com TRUE / TRUE 1775861586 __Secure-3PAPISID jhCFMxYOhwdvfusq/AN48BaLVbLYevdtWs 15 | .youtube.com TRUE / FALSE 1775861586 SID g.a000ugiSAkUceGjFKLyl_8d6QCYNYGiTK5OslV6Yo4XcCJ12HlsifzqU3EwANmP9XbixAPxd9AACgYKASYSARQSFQHGX2Mia271rvQxsQP9duW_omjJFxoVAUF8yKpRiIybd7GrZtUXXe-mWs9h0076 16 | .youtube.com TRUE / TRUE 1775861586 __Secure-1PSID g.a000ugiSAkUceGjFKLyl_8d6QCYNYGiTK5OslV6Yo4XcCJ12Hlsiyqo-MXwLpsPxxYN7v1RgEAACgYKASoSARQSFQHGX2MiT4w8Mfz-Uacjva743UUmKRoVAUF8yKqBquoRXGGeFoLcSlEkzLr80076 17 | .youtube.com TRUE / TRUE 1775861586 __Secure-3PSID g.a000ugiSAkUceGjFKLyl_8d6QCYNYGiTK5OslV6Yo4XcCJ12Hlsic_IpbHF-rVcATgFnBx5-XQACgYKAV8SARQSFQHGX2MiHfIVTQ75805ff5G_9ErOohoVAUF8yKo3by8D34K0XL5d0nQs3pcU0076 18 | .youtube.com TRUE / TRUE 1775861586 LOGIN_INFO AFmmF2swRgIhAJ-I_I_kAC3fwjXFk8Ii8hS6J01HlWxT1hxOIZ_hC_b1AiEA7Dl6QKY61fi7podAChQOvsxRJLfhJp90urdIYHkvnJc:QUQ3MjNmeXJ5UTB6V2h1ZUdLTXBuS0p2QnJyZnk5M2RFWVNQNTQtaHdJSjhseFpIX2xJbC1DOGp0NUxzcFVkOVZyZkRVVEtQelktUGk2c2VjVnQtTmlwS2tBNUpOTUFfNTJ5SGVCMTY5STFSTjZjaXFNOHYtQm5BZTdEOWQxWkNPY0laZ3FibnZSYy1nNER2eVZYN0p1Ukl4bk1lMElIejRR 19 | .youtube.com TRUE / TRUE 1775429537 __Secure-YEC CgtnMXZOdi1WVXNpayjSzqi-BjIiCgJOTBIcEhgSFhMLFBUWFwwYGRobHB0eHw4PIBAREiEgUw%3D%3D 20 | .youtube.com TRUE / FALSE 1772837590 SIDCC AKEyXzX7AgkezzZjTXTDuEQejiJwX0Qa9krKmOjMc8i6VuxONJDa_91O2xgFKbiGRZh3F3kpIQ 21 | .youtube.com TRUE / TRUE 1772837590 __Secure-1PSIDCC 
AKEyXzW_Q4mUyomFQUP0p9Mv0o0rdzS5PBN-V7_XS2bloLP1wv5_9En8qmUAsarHn7wkU4KY 22 | .youtube.com TRUE / TRUE 1772837590 __Secure-3PSIDCC AKEyXzXTgT_gT0WiuG-2VSWgMoOm0vQSvFwlCYnqtQI8cPpFYOxpwOZfByhS1WNI1ZYvhZqK_Q 23 | .youtube.com TRUE / TRUE 0 YSC 78pipAlj27I 24 | .youtube.com TRUE / TRUE 1775429588 VISITOR_PRIVACY_METADATA CgJOTBIcEhgSFhMLFBUWFwwYGRobHB0eHw4PIBAREiEgUw%3D%3D 25 | .youtube.com TRUE / TRUE 1756853586 __Secure-ROLLOUT_TOKEN CNCwjIqOv-6f8QEQjO3F88X2iwMYgL3K88X2iwM%3D 26 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/PESTO/PESTO.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | sys.path.append(os.getcwd()) 6 | 7 | class PESTO: 8 | def __init__(self, model_path, step_size=10, reduction="alwa", num_chunks=1, sample_rate=16000, device=None, providers=None, onnx=False): 9 | self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") 10 | self.step_size = step_size 11 | self.reduction = reduction 12 | self.num_chunks = num_chunks 13 | self.sample_rate = sample_rate 14 | self.onnx = onnx 15 | 16 | if self.onnx: 17 | import onnxruntime as ort 18 | 19 | sess_options = ort.SessionOptions() 20 | sess_options.log_severity_level = 3 21 | self.model = ort.InferenceSession(model_path, sess_options=sess_options, providers=providers) 22 | else: 23 | from main.library.predictors.PESTO.model import PPESTO, Resnet1d 24 | from main.library.predictors.PESTO.preprocessor import Preprocessor 25 | 26 | ckpt = torch.load(model_path, map_location="cpu", weights_only=False) 27 | model = PPESTO(Resnet1d(**ckpt["hparams"]["encoder"]), preprocessor=Preprocessor(hop_size=step_size, sampling_rate=sample_rate, **ckpt["hcqt_params"]), crop_kwargs=ckpt["hparams"]["pitch_shift"], reduction=ckpt["hparams"]["reduction"]) 28 | model.load_state_dict(ckpt["state_dict"], strict=False) 29 | 30 | self.model = model.to(self.device).eval() 31 | self.model.reduction = self.reduction 32 | 33 | def compute_f0(self, x): 34 | assert x.ndim <= 2 35 | 36 | with torch.inference_mode(): 37 | with torch.no_grad(): 38 | preds, confidence = [], [] 39 | 40 | for chunk in x.chunk(chunks=self.num_chunks): 41 | if self.onnx: 42 | model = self.model.run( 43 | [self.model.get_outputs()[0].name, self.model.get_outputs()[1].name], 44 | { 45 | self.model.get_inputs()[0].name: chunk.cpu().numpy() 46 | } 47 | ) 48 | pred, conf = torch.tensor(model[0], device=self.device), torch.tensor(model[1], device=self.device) 49 | else: 50 | pred, conf = self.model( 51 | chunk, 52 | sr=self.sample_rate, 53 | convert_to_freq=True, 54 | return_activations=False 55 | ) 56 | 57 | preds.append(pred) 58 | confidence.append(conf) 59 | 60 | return torch.cat(preds, dim=0), torch.cat(confidence, dim=0) -------------------------------------------------------------------------------- /advanced_rvc_inference/library/speaker_diarization/embedding.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | import numpy as np 6 | import torch.nn.functional as F 7 | 8 | from functools import cached_property 9 | from torch.nn.utils.rnn import pad_sequence 10 | 11 | sys.path.append(os.getcwd()) 12 | 13 | from advanced_rvc_inference.library.speaker_diarization.speechbrain import EncoderClassifier 14 | 15 | class SpeechBrainPretrainedSpeakerEmbedding: 16 | def __init__(self, embedding, device = None): 17 | super().__init__() 18 
| 19 | self.embedding = embedding 20 | self.device = device or torch.device("cpu") 21 | self.classifier_ = EncoderClassifier.from_hparams(source=self.embedding, run_opts={"device": self.device}) 22 | 23 | @cached_property 24 | def dimension(self): 25 | *_, dimension = self.classifier_.encode_batch(torch.rand(1, 16000).to(self.device)).shape 26 | return dimension 27 | 28 | @cached_property 29 | def min_num_samples(self): 30 | with torch.inference_mode(): 31 | lower, upper = 2, round(0.5 * self.classifier_.audio_normalizer.sample_rate) 32 | middle = (lower + upper) // 2 33 | 34 | while lower + 1 < upper: 35 | try: 36 | _ = self.classifier_.encode_batch(torch.randn(1, middle).to(self.device)) 37 | upper = middle 38 | except RuntimeError: 39 | lower = middle 40 | 41 | middle = (lower + upper) // 2 42 | 43 | return upper 44 | 45 | def __call__(self, waveforms, masks = None): 46 | batch_size, num_channels, num_samples = waveforms.shape 47 | assert num_channels == 1 48 | 49 | waveforms = waveforms.squeeze(dim=1) 50 | 51 | if masks is None: 52 | signals = waveforms.squeeze(dim=1) 53 | wav_lens = signals.shape[1] * torch.ones(batch_size) 54 | else: 55 | batch_size_masks, _ = masks.shape 56 | assert batch_size == batch_size_masks 57 | 58 | imasks = F.interpolate(masks.unsqueeze(dim=1), size=num_samples, mode="nearest").squeeze(dim=1) > 0.5 59 | signals = pad_sequence([waveform[imask].contiguous() for waveform, imask in zip(waveforms, imasks)], batch_first=True) 60 | wav_lens = imasks.sum(dim=1) 61 | 62 | max_len = wav_lens.max() 63 | if max_len < self.min_num_samples: return np.nan * np.zeros((batch_size, self.dimension)) 64 | 65 | too_short = wav_lens < self.min_num_samples 66 | wav_lens = wav_lens / max_len 67 | wav_lens[too_short] = 1.0 68 | 69 | embeddings = (self.classifier_.encode_batch(signals, wav_lens=wav_lens).squeeze(dim=1).cpu().numpy()) 70 | embeddings[too_short.cpu().numpy()] = np.nan 71 | 72 | return embeddings -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/FCPE/stft.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | import numpy as np 6 | import torch.nn.functional as F 7 | 8 | from librosa.filters import mel 9 | 10 | sys.path.append(os.getcwd()) 11 | 12 | class STFT: 13 | def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5): 14 | self.target_sr = sr 15 | self.n_mels = n_mels 16 | self.n_fft = n_fft 17 | self.win_size = win_size 18 | self.hop_length = hop_length 19 | self.fmin = fmin 20 | self.fmax = fmax 21 | self.clip_val = clip_val 22 | self.mel_basis = {} 23 | self.hann_window = {} 24 | 25 | def get_mel(self, y, keyshift=0, speed=1, center=False, train=False): 26 | n_fft = self.n_fft 27 | win_size = self.win_size 28 | hop_length = self.hop_length 29 | fmax = self.fmax 30 | factor = 2 ** (keyshift / 12) 31 | win_size_new = int(np.round(win_size * factor)) 32 | hop_length_new = int(np.round(hop_length * speed)) 33 | mel_basis = self.mel_basis if not train else {} 34 | hann_window = self.hann_window if not train else {} 35 | mel_basis_key = str(fmax) + "_" + str(y.device) 36 | 37 | if mel_basis_key not in mel_basis: mel_basis[mel_basis_key] = torch.from_numpy(mel(sr=self.target_sr, n_fft=n_fft, n_mels=self.n_mels, fmin=self.fmin, fmax=fmax)).float().to(y.device) 38 | keyshift_key = str(keyshift) + "_" + str(y.device) 39 | if keyshift_key not in 
hann_window: hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device) 40 | 41 | pad_left = (win_size_new - hop_length_new) // 2 42 | pad_right = max((win_size_new - hop_length_new + 1) // 2, win_size_new - y.size(-1) - pad_left) 43 | 44 | pad = F.pad(y.unsqueeze(1), (pad_left, pad_right), mode="reflect" if pad_right < y.size(-1) else "constant").squeeze(1) 45 | n_fft = int(np.round(n_fft * factor)) 46 | 47 | if str(y.device).startswith(("ocl", "privateuseone")): 48 | if not hasattr(self, "stft"): 49 | from advanced_rvc_inference.library.backends.utils import STFT as _STFT 50 | self.stft = _STFT(filter_length=n_fft, hop_length=hop_length_new, win_length=win_size_new).to(y.device) 51 | spec = self.stft.transform(pad, 1e-9) 52 | else: 53 | spec = torch.stft(pad, n_fft, hop_length=hop_length_new, win_length=win_size_new, window=hann_window[keyshift_key], center=center, pad_mode="reflect", normalized=False, onesided=True, return_complex=True) 54 | spec = (spec.real.pow(2) + spec.imag.pow(2) + 1e-9).sqrt() 55 | 56 | if keyshift != 0: 57 | size = n_fft // 2 + 1 58 | resize = spec.size(1) 59 | spec = (F.pad(spec, (0, 0, 0, size - resize)) if resize < size else spec[:, :size, :]) * win_size / win_size_new 60 | 61 | return ((mel_basis[mel_basis_key] @ spec).clamp(min=self.clip_val) * 1).log() -------------------------------------------------------------------------------- /advanced_rvc_inference/library/backends/opencl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import platform 5 | import subprocess 6 | 7 | try: 8 | import pytorch_ocl 9 | except: 10 | pytorch_ocl = None 11 | 12 | sys.path.append(os.getcwd()) 13 | 14 | from advanced_rvc_inference.library.backends.utils import GRU 15 | 16 | torch_available = pytorch_ocl is not None 17 | if torch_available: adaptive_orig = torch.nn.AdaptiveAvgPool2d 18 | 19 | def check_amd_gpu(gpu): 20 | # match any known AMD identifier instead of returning after checking only the first one 21 | return any(i in gpu for i in ["RX", "AMD", "Vega", "Radeon", "FirePro"]) 22 | 23 | def get_amd_gpu_windows(): 24 | gpus = "" 25 | 26 | try: 27 | gpus = subprocess.check_output("wmic path win32_VideoController get name", shell=True, stderr=subprocess.DEVNULL) 28 | except subprocess.CalledProcessError: 29 | gpus = subprocess.check_output('powershell "Get-CimInstance Win32_VideoController | Select-Object -ExpandProperty Name"', shell=True, stderr=subprocess.DEVNULL) 30 | 31 | return [gpu.strip() for gpu in gpus.decode().split('\n')[1:] if check_amd_gpu(gpu)] 32 | 33 | def get_amd_gpu_linux(): 34 | try: 35 | return [gpu for gpu in subprocess.check_output("lspci | grep VGA", shell=True).decode().split('\n') if check_amd_gpu(gpu)] 36 | except: 37 | return [] 38 | 39 | def get_gpu_list(): 40 | return (get_amd_gpu_windows() if platform.system() == "Windows" else get_amd_gpu_linux()) if torch_available else [] 41 | 42 | def device_count(): 43 | return len(get_gpu_list()) if torch_available else 0 44 | 45 | def device_name(device_id = 0): 46 | return (get_gpu_list()[device_id] if device_id >= 0 and device_id < device_count() else "") if torch_available else "" 47 | 48 | def is_available(): 49 | return (device_count() > 0) if torch_available else False 50 | 51 | def group_norm(x, num_groups, weight=None, bias=None, eps=1e-5): 52 | N, C = x.shape[:2] 53 | assert C % num_groups == 0 54 | 55 | shape = (N, num_groups, C // num_groups) + x.shape[2:] 56 | x_reshaped = x.view(shape) 57 | 58 | dims = (2,) + tuple(range(3, x_reshaped.dim())) 59 | mean = x_reshaped.mean(dim=dims, keepdim=True) 60 | var = 
x_reshaped.var(dim=dims, keepdim=True, unbiased=False) 61 | 62 | x_norm = (x_reshaped - mean) / (var + eps).sqrt() 63 | x_norm = x_norm.view_as(x) 64 | 65 | if weight is not None: 66 | weight = weight.view(1, C, *([1] * (x.dim() - 2))) 67 | x_norm = x_norm * weight 68 | 69 | if bias is not None: 70 | bias = bias.view(1, C, *([1] * (x.dim() - 2))) 71 | x_norm = x_norm + bias 72 | 73 | return x_norm 74 | 75 | def script(f, *_, **__): 76 | f.graph = pytorch_ocl.torch._C.Graph() 77 | return f 78 | 79 | def AdaptiveAvgPool2d(input): 80 | input = input[0] if isinstance(input, tuple) else input 81 | return adaptive_orig(input) 82 | 83 | if torch_available: 84 | torch.nn.GRU = GRU 85 | torch.nn.AdaptiveAvgPool2d = AdaptiveAvgPool2d 86 | torch.nn.functional.group_norm = group_norm 87 | torch.jit.script = script -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/extra/child/create_srt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import gradio as gr 5 | 6 | sys.path.append(os.getcwd()) 7 | 8 | from advanced_rvc_inference.core.csrt import create_srt 9 | from advanced_rvc_inference.core.ui import shutil_move, change_audios_choices 10 | from advanced_rvc_inference.variables import translations, file_types, configs, paths_for_files 11 | 12 | def create_srt_tab(): 13 | with gr.Row(): 14 | gr.Markdown(translations["create_srt_markdown_2"]) 15 | with gr.Row(): 16 | with gr.Column(): 17 | srt_content = gr.Textbox(label=translations["srt_content"], value="", lines=9, max_lines=9, interactive=False) 18 | with gr.Column(): 19 | word_timestamps = gr.Checkbox(label=translations["word_timestamps"], info=translations["word_timestamps_info"], value=False, interactive=True) 20 | model_size = gr.Radio(label=translations["model_size"], info=translations["model_size_info"], choices=["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large-v1", "large-v2", "large-v3", "large-v3-turbo"], value="medium", interactive=True) 21 | with gr.Row(): 22 | convert_button = gr.Button(translations["convert_audio"], variant="primary") 23 | with gr.Row(): 24 | with gr.Accordion(translations["input_output"], open=False): 25 | with gr.Column(): 26 | input_audio = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True) 27 | output_file = gr.Textbox(label=translations["srt_output_file"], value="srt/output.srt", placeholder="srt/output.srt", interactive=True) 28 | with gr.Column(): 29 | refresh = gr.Button(translations["refresh"]) 30 | with gr.Row(): 31 | input_file = gr.Files(label=translations["drop_audio"], file_types=file_types) 32 | with gr.Row(): 33 | play_audio = gr.Audio(interactive=False, label=translations["input_audio"]) 34 | with gr.Row(): 35 | output_srt = gr.File(label=translations["srt_output_file"], file_types=[".srt"], interactive=False, visible=False) 36 | with gr.Row(): 37 | input_file.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[input_file], outputs=[input_audio]) 38 | input_audio.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio], outputs=[play_audio]) 39 | refresh.click(fn=change_audios_choices, inputs=[input_audio], outputs=[input_audio]) 40 | with gr.Row(): 41 | convert_button.click( 42 | fn=create_srt, 43 | inputs=[ 44 | model_size, 45 | input_audio, 
46 | output_file, 47 | word_timestamps 48 | ], 49 | outputs=[ 50 | output_srt, 51 | srt_content 52 | ], 53 | api_name="create_srt" 54 | ) 55 | -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/train/training/anyprecision_optimizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from torch.optim.optimizer import Optimizer 4 | 5 | class AnyPrecisionAdamW(Optimizer): 6 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.0, use_kahan_summation=True, momentum_dtype=torch.bfloat16, variance_dtype=torch.bfloat16, compensation_buffer_dtype=torch.bfloat16): 7 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, use_kahan_summation=use_kahan_summation, momentum_dtype=momentum_dtype, variance_dtype=variance_dtype, compensation_buffer_dtype=compensation_buffer_dtype) 8 | super().__init__(params, defaults) 9 | 10 | @torch.no_grad() 11 | def step(self, closure=None): 12 | if closure is not None: 13 | with torch.enable_grad(): 14 | closure() 15 | 16 | for group in self.param_groups: 17 | beta1, beta2 = group["betas"] 18 | lr = group["lr"] 19 | weight_decay = group["weight_decay"] 20 | eps = group["eps"] 21 | use_kahan_summation = group["use_kahan_summation"] 22 | momentum_dtype = group["momentum_dtype"] 23 | variance_dtype = group["variance_dtype"] 24 | compensation_buffer_dtype = group["compensation_buffer_dtype"] 25 | 26 | for p in group["params"]: 27 | if p.grad is None: continue 28 | if p.grad.is_sparse: raise RuntimeError 29 | 30 | state = self.state[p] 31 | if len(state) == 0: 32 | state["step"] = torch.tensor(0.0) 33 | state["exp_avg"] = torch.zeros_like(p, dtype=momentum_dtype) 34 | state["exp_avg_sq"] = torch.zeros_like(p, dtype=variance_dtype) 35 | if use_kahan_summation: state["compensation"] = torch.zeros_like(p, dtype=compensation_buffer_dtype) 36 | 37 | state["step"] += 1 38 | step = state["step"] 39 | exp_avg = state["exp_avg"] 40 | exp_avg_sq = state["exp_avg_sq"] 41 | 42 | grad = p.grad 43 | if weight_decay: p.data.mul_(1 - lr * weight_decay) 44 | 45 | exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) 46 | exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) 47 | 48 | bias_correction1 = 1 - beta1 ** step 49 | step_size = lr / bias_correction1 50 | 51 | denom_correction = (1 - beta2**step) ** 0.5 52 | centered_variance = (exp_avg_sq.sqrt() / denom_correction).add_(eps, alpha=1) 53 | 54 | if use_kahan_summation: 55 | compensation = state["compensation"] 56 | compensation.addcdiv_(exp_avg, centered_variance, value=-step_size) 57 | 58 | temp_buffer = p.detach().clone() 59 | p.data.add_(compensation) 60 | compensation.add_(temp_buffer.sub_(p.data)) 61 | else: p.data.addcdiv_(exp_avg, centered_variance, value=-step_size) -------------------------------------------------------------------------------- /advanced_rvc_inference/library/onnx/wrapper.py: -------------------------------------------------------------------------------- 1 | import json 2 | import onnx 3 | import torch 4 | import onnxruntime 5 | 6 | import numpy as np 7 | 8 | class ONNXRVC: 9 | def __init__(self, model_path, providers, log_severity_level = 3): 10 | sess_options = onnxruntime.SessionOptions() 11 | sess_options.log_severity_level = log_severity_level 12 | 13 | metadata_dict = None 14 | for prop in onnx.load(model_path).metadata_props: 15 | if prop.key == "model_info": 16 | metadata_dict = json.loads(prop.value) 17 | break 18 | 19 | self.cpt = 
{} 20 | self.cpt["tgt_sr"] = metadata_dict.get("sr", 32000) 21 | self.cpt["use_f0"] = metadata_dict.get("f0", 1) 22 | self.cpt["version"] = metadata_dict.get("version", "v1") 23 | self.cpt["energy"] = metadata_dict.get("energy", False) 24 | self.net_g = onnxruntime.InferenceSession( 25 | model_path, 26 | sess_options=sess_options, 27 | providers=providers 28 | ) 29 | 30 | def get_onnx_argument(self, feats, p_len, sid, pitch, pitchf, energy): 31 | inputs = { 32 | self.net_g.get_inputs()[0].name: feats.cpu().numpy().astype(np.float32), 33 | self.net_g.get_inputs()[1].name: p_len.cpu().numpy(), 34 | self.net_g.get_inputs()[2].name: np.array([sid.cpu().item()], dtype=np.int64), 35 | self.net_g.get_inputs()[3].name: np.random.randn(1, 192, p_len).astype(np.float32) 36 | } 37 | 38 | if self.cpt["energy"]: 39 | if self.cpt["use_f0"]: 40 | inputs.update({ 41 | self.net_g.get_inputs()[4].name: pitch.cpu().numpy().astype(np.int64), 42 | self.net_g.get_inputs()[5].name: pitchf.cpu().numpy().astype(np.float32), 43 | self.net_g.get_inputs()[6].name: energy.cpu().numpy().astype(np.float32) 44 | }) 45 | else: 46 | inputs.update({ 47 | self.net_g.get_inputs()[4].name: energy.cpu().numpy().astype(np.float32) 48 | }) 49 | else: 50 | if self.cpt["use_f0"]: 51 | inputs.update({ 52 | self.net_g.get_inputs()[4].name: pitch.cpu().numpy().astype(np.int64), 53 | self.net_g.get_inputs()[5].name: pitchf.cpu().numpy().astype(np.float32) 54 | }) 55 | 56 | return inputs 57 | 58 | def to(self, device = "cpu"): 59 | self.device = device 60 | return self 61 | 62 | def infer(self, feats = None, p_len = None, pitch = None, pitchf = None, sid = None, energy = None): 63 | output = self.net_g.run( 64 | [self.net_g.get_outputs()[0].name], ( 65 | self.get_onnx_argument( 66 | feats, 67 | p_len, 68 | sid, 69 | pitch, 70 | pitchf, 71 | energy, 72 | ) 73 | ) 74 | ) 75 | 76 | return torch.as_tensor(output, device=self.device) 77 | -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/CREPE/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import functools 3 | 4 | PITCH_BINS = 360 5 | 6 | class MODEL(torch.nn.Module): 7 | def __init__(self, model='full'): 8 | super().__init__() 9 | in_channels = {"full": [1, 1024, 128, 128, 128, 256], "large": [1, 768, 96, 96, 96, 192], "medium": [1, 512, 64, 64, 64, 128], "small": [1, 256, 32, 32, 32, 64], "tiny": [1, 128, 16, 16, 16, 32]}[model] 10 | out_channels = {"full": [1024, 128, 128, 128, 256, 512], "large": [768, 96, 96, 96, 192, 384], "medium": [512, 64, 64, 64, 128, 256], "small": [256, 32, 32, 32, 64, 128], "tiny": [128, 16, 16, 16, 32, 64]}[model] 11 | self.in_features = {"full": 2048, "large": 1536, "medium": 1024, "small": 512, "tiny": 256}[model] 12 | 13 | kernel_sizes = [(512, 1)] + 5 * [(64, 1)] 14 | strides = [(4, 1)] + 5 * [(1, 1)] 15 | batch_norm_fn = functools.partial(torch.nn.BatchNorm2d, eps=0.0010000000474974513, momentum=0.0) 16 | 17 | self.conv1 = torch.nn.Conv2d(in_channels=in_channels[0], out_channels=out_channels[0], kernel_size=kernel_sizes[0], stride=strides[0]) 18 | self.conv1_BN = batch_norm_fn(num_features=out_channels[0]) 19 | 20 | self.conv2 = torch.nn.Conv2d(in_channels=in_channels[1], out_channels=out_channels[1], kernel_size=kernel_sizes[1], stride=strides[1]) 21 | self.conv2_BN = batch_norm_fn(num_features=out_channels[1]) 22 | 23 | self.conv3 = torch.nn.Conv2d(in_channels=in_channels[2], out_channels=out_channels[2], 
kernel_size=kernel_sizes[2], stride=strides[2]) 24 | self.conv3_BN = batch_norm_fn(num_features=out_channels[2]) 25 | 26 | self.conv4 = torch.nn.Conv2d(in_channels=in_channels[3], out_channels=out_channels[3], kernel_size=kernel_sizes[3], stride=strides[3]) 27 | self.conv4_BN = batch_norm_fn(num_features=out_channels[3]) 28 | 29 | self.conv5 = torch.nn.Conv2d(in_channels=in_channels[4], out_channels=out_channels[4], kernel_size=kernel_sizes[4], stride=strides[4]) 30 | self.conv5_BN = batch_norm_fn(num_features=out_channels[4]) 31 | 32 | self.conv6 = torch.nn.Conv2d(in_channels=in_channels[5], out_channels=out_channels[5], kernel_size=kernel_sizes[5], stride=strides[5]) 33 | self.conv6_BN = batch_norm_fn(num_features=out_channels[5]) 34 | 35 | self.classifier = torch.nn.Linear(in_features=self.in_features, out_features=PITCH_BINS) 36 | 37 | def forward(self, x, embed=False): 38 | x = self.embed(x) 39 | if embed: return x 40 | return self.classifier(self.layer(x, self.conv6, self.conv6_BN).permute(0, 2, 1, 3).reshape(-1, self.in_features)).sigmoid() 41 | 42 | def embed(self, x): 43 | x = x[:, None, :, None] 44 | return self.layer(self.layer(self.layer(self.layer(self.layer(x, self.conv1, self.conv1_BN, (0, 0, 254, 254)), self.conv2, self.conv2_BN), self.conv3, self.conv3_BN), self.conv4, self.conv4_BN), self.conv5, self.conv5_BN) 45 | 46 | def layer(self, x, conv, batch_norm, padding=(0, 0, 31, 32)): 47 | return torch.nn.functional.max_pool2d(batch_norm(torch.nn.functional.relu(conv(torch.nn.functional.pad(x, padding)))), (2, 1), (2, 1)) -------------------------------------------------------------------------------- /advanced_rvc_inference/tabs/extra/child/f0_extract.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import gradio as gr 5 | 6 | sys.path.append(os.getcwd()) 7 | 8 | from advanced_rvc_inference.core.f0_extract import f0_extract 9 | from advanced_rvc_inference.core.ui import change_audios_choices, unlock_f0, shutil_move 10 | from advanced_rvc_inference.variables import translations, paths_for_files, method_f0, configs, file_types 11 | 12 | def f0_extract_tab(): 13 | with gr.Row(): 14 | gr.Markdown(translations["f0_extractor_markdown_2"]) 15 | with gr.Row(): 16 | extractor_button = gr.Button(translations["extract_button"].replace("2. 
", ""), variant="primary") 17 | with gr.Row(): 18 | with gr.Column(): 19 | upload_audio_file = gr.Files(label=translations["drop_audio"], file_types=file_types) 20 | audioplay = gr.Audio(interactive=False, label=translations["input_audio"]) 21 | with gr.Column(): 22 | with gr.Accordion(translations["f0_method"], open=False): 23 | with gr.Group(): 24 | with gr.Row(): 25 | onnx_f0_mode3 = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True) 26 | unlock_full_method = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True) 27 | f0_method_extract = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=[m for m in method_f0 if m != "hybrid"], value="rmvpe", interactive=True) 28 | with gr.Accordion(translations["audio_path"], open=True): 29 | input_audio_path = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, allow_custom_value=True, interactive=True) 30 | refresh_audio_button = gr.Button(translations["refresh"]) 31 | with gr.Row(): 32 | gr.Markdown("___") 33 | with gr.Row(): 34 | file_output = gr.File(label="", file_types=[".txt"], interactive=False) 35 | image_output = gr.Image(label="", interactive=False) 36 | with gr.Row(): 37 | upload_audio_file.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[upload_audio_file], outputs=[input_audio_path]) 38 | input_audio_path.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio_path], outputs=[audioplay]) 39 | refresh_audio_button.click(fn=change_audios_choices, inputs=[input_audio_path], outputs=[input_audio_path]) 40 | with gr.Row(): 41 | unlock_full_method.change(fn=lambda method: {"choices": [m for m in unlock_f0(method)["choices"] if m != "hybrid"], "value": "rmvpe", "__type__": "update"}, inputs=[unlock_full_method], outputs=[f0_method_extract]) 42 | extractor_button.click( 43 | fn=f0_extract, 44 | inputs=[ 45 | input_audio_path, 46 | f0_method_extract, 47 | onnx_f0_mode3 48 | ], 49 | outputs=[file_output, image_output], 50 | api_name="f0_extract" 51 | ) 52 | -------------------------------------------------------------------------------- /advanced_rvc_inference/core/separate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | 5 | sys.path.append(os.getcwd()) 6 | 7 | from advanced_rvc_inference.core.ui import gr_info, gr_warning 8 | from advanced_rvc_inference.variables import python, translations, configs 9 | 10 | def separate_music( 11 | input_path, 12 | output_dirs, 13 | export_format, 14 | model_name, 15 | karaoke_model, 16 | reverb_model, 17 | denoise_model, 18 | sample_rate, 19 | shifts, 20 | batch_size, 21 | overlap, 22 | aggression, 23 | hop_length, 24 | window_size, 25 | segments_size, 26 | post_process_threshold, 27 | enable_tta, 28 | enable_denoise, 29 | high_end_process, 30 | enable_post_process, 31 | separate_backing, 32 | separate_reverb 33 | ): 34 | output_dirs = os.path.dirname(output_dirs) or output_dirs 35 | 36 | if not input_path or not os.path.exists(input_path) or os.path.isdir(input_path): 37 | gr_warning(translations["input_not_valid"]) 38 | return [None]*4 39 | 40 | if not os.path.exists(output_dirs): 41 | gr_warning(translations["output_not_valid"]) 42 | return [None]*4 43 | 44 | if not os.path.exists(output_dirs): os.makedirs(output_dirs) 45 | 
gr_info(translations["start"].format(start=translations["separator_music"])) 46 | 47 | subprocess.run([ 48 | python, configs["separate_path"], 49 | "--input_path", input_path, 50 | "--output_dirs", output_dirs, 51 | "--export_format", export_format, 52 | "--model_name", model_name, 53 | "--karaoke_model", karaoke_model, 54 | "--reverb_model", reverb_model, 55 | "--denoise_model", denoise_model, 56 | "--sample_rate", str(sample_rate), 57 | "--shifts", str(shifts), 58 | "--batch_size", str(batch_size), 59 | "--overlap", str(overlap), 60 | "--aggression", str(aggression), 61 | "--hop_length", str(hop_length), 62 | "--window_size", str(window_size), 63 | "--segments_size", str(segments_size), 64 | "--post_process_threshold", str(post_process_threshold), 65 | "--enable_tta", str(enable_tta), 66 | "--enable_denoise", str(enable_denoise), 67 | "--high_end_process", str(high_end_process), 68 | "--enable_post_process", str(enable_post_process), 69 | "--separate_backing", str(separate_backing), 70 | "--separate_reverb", str(separate_reverb), 71 | ]) 72 | 73 | gr_info(translations["success"]) 74 | 75 | filename, _ = os.path.splitext(os.path.basename(input_path)) 76 | output_dirs = os.path.join(output_dirs, filename) 77 | 78 | return [ 79 | os.path.join( 80 | output_dirs, 81 | f"Original_Vocals_No_Reverb.{export_format}" if separate_reverb else f"Original_Vocals.{export_format}" 82 | ), 83 | os.path.join( 84 | output_dirs, 85 | f"Instruments.{export_format}" 86 | ), 87 | os.path.join( 88 | output_dirs, 89 | f"Main_Vocals_No_Reverb.{export_format}" if separate_reverb else f"Main_Vocals.{export_format}" 90 | ) if separate_backing else None, 91 | os.path.join( 92 | output_dirs, 93 | f"Backing_Vocals.{export_format}" 94 | ) if separate_backing else None 95 | ] if os.path.isfile(input_path) else [None]*4 -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "wheel", "setuptools-scm[toml]>=6.2"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "advanced-rvc-inference" 7 | version = "0.1.0" 8 | description = "Advanced RVC Inference - A state-of-the-art web UI for rapid and effortless inference." 
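# Supported interpreters are CPython 3.9 through 3.13 (see requires-python below); onnxruntime is selected
# per platform via environment markers in the dependency list.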
9 | readme = "README.md" 10 | requires-python = ">=3.9,<3.14" 11 | license = {file = "LICENSE"} 12 | authors = [ 13 | {name = "ArkanDash"} 14 | ] 15 | classifiers = [ 16 | "Development Status :: 3 - Alpha", 17 | "Intended Audience :: Developers", 18 | "Topic :: Software Development :: Build Tools", 19 | "License :: OSI Approved :: MIT License", 20 | "Programming Language :: Python :: 3", 21 | "Programming Language :: Python :: 3.9", 22 | "Programming Language :: Python :: 3.10", 23 | "Programming Language :: Python :: 3.11", 24 | "Programming Language :: Python :: 3.12", 25 | "Programming Language :: Python :: 3.13", 26 | ] 27 | dependencies = [ 28 | # Platform-specific requirements 29 | "pip>=23.3", 30 | "wheel", 31 | "omegaconf>=2.0.6", 32 | "onnxruntime; sys_platform == 'darwin'", 33 | "onnxruntime-gpu; sys_platform != 'darwin'", 34 | 35 | # Core dependencies 36 | "PyYAML>=6.0", 37 | "tiktoken", 38 | "hyperpyyaml", 39 | "torch>=2.3.1", 40 | "tqdm>=4.63.1", 41 | "sortedcontainers", 42 | "torchvision>=0.18.1", 43 | "torchaudio>=2.3.1", 44 | "torchcodec>=0.8.1", 45 | 46 | # FAISS with version constraints 47 | "faiss-cpu==1.7.3; python_version < '3.12'", 48 | "faiss-cpu>=1.7.3; python_version >= '3.12'", 49 | 50 | # Machine learning, NLP and deep learning 51 | "transformers>=4.49.0", 52 | "scikit-learn", 53 | "einops>=0.8.0", 54 | 55 | # Pitch and sound processing 56 | "librosa>=0.10.2", 57 | "pydub>=0.25.1", 58 | "praat-parselmouth", 59 | "soundfile>=0.13.0", 60 | "pedalboard", 61 | 62 | # Data processing and calculation 63 | "numpy>=1.25.2,<2.0.0", 64 | "numba>=0.57.0", 65 | "scipy>=1.15.0", 66 | "matplotlib>=3.7.2", 67 | 68 | # Implementation and web framework 69 | "gradio>=5.23.3,<6.0.0", 70 | "requests>=2.32.3", 71 | "aiohttp", 72 | "pysrt", 73 | 74 | # Utility section 75 | "yt-dlp", 76 | "edge-tts>=7.2.0", 77 | "ffmpy==0.3.1", 78 | "ffmpeg-python>=0.2.0", 79 | "beautifulsoup4", 80 | 81 | # Tensorboard and ONNX 82 | "tensorboard", 83 | "onnx>=1.14", 84 | "onnxslim", 85 | "onnx2torch>=1.5.15", 86 | 87 | # Cryptography section 88 | "pycryptodome>=3.9.6,<4.0.0", 89 | 90 | # Realtime and VAD 91 | "sounddevice>=0.5.2", 92 | "webrtcvad-wheels>=2.0.14", 93 | ] 94 | 95 | [project.optional-dependencies] 96 | dev = [ 97 | "pytest", 98 | "pytest-cov", 99 | "black", 100 | "flake8", 101 | "mypy", 102 | ] 103 | 104 | [project.urls] 105 | Homepage = "https://github.com/ArkanDash/Advanced-RVC-Inference" 106 | Repository = "https://github.com/ArkanDash/Advanced-RVC-Inference" 107 | Issues = "https://github.com/ArkanDash/Advanced-RVC-Inference/issues" 108 | 109 | [project.scripts] 110 | advanced-rvc-inference = "advanced_rvc_inference.app:app" 111 | -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/train/create_index.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import faiss 4 | import argparse 5 | 6 | import numpy as np 7 | 8 | from multiprocessing import cpu_count 9 | from sklearn.cluster import MiniBatchKMeans 10 | 11 | sys.path.append(os.getcwd()) 12 | 13 | from advanced_rvc_inference.variables import logger, translations, configs 14 | 15 | def parse_arguments(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("--create_index", action='store_true') 18 | parser.add_argument("--model_name", type=str, required=True) 19 | parser.add_argument("--rvc_version", type=str, default="v2") 20 | parser.add_argument("--index_algorithm", type=str, default="Auto") 21 
| 22 | return parser.parse_args() 23 | 24 | def main(): 25 | args = parse_arguments() 26 | exp_dir = os.path.join(configs["logs_path"], args.model_name) 27 | version, index_algorithm = args.rvc_version, args.index_algorithm 28 | 29 | log_data = {translations['modelname']: args.model_name, translations['model_path']: exp_dir, translations['training_version']: version, translations['index_algorithm_info']: index_algorithm} 30 | for key, value in log_data.items(): 31 | logger.debug(f"{key}: {value}") 32 | 33 | try: 34 | npys = [] 35 | feature_dir = os.path.join(exp_dir, f"{version}_extracted") 36 | model_name = os.path.basename(exp_dir) 37 | 38 | for name in sorted(os.listdir(feature_dir)): 39 | npys.append(np.load(os.path.join(feature_dir, name))) 40 | 41 | big_npy = np.concatenate(npys, axis=0) 42 | big_npy_idx = np.arange(big_npy.shape[0]) 43 | np.random.shuffle(big_npy_idx) 44 | big_npy = big_npy[big_npy_idx] 45 | 46 | if big_npy.shape[0] > 2e5 and (index_algorithm == "Auto" or index_algorithm == "KMeans"): big_npy = (MiniBatchKMeans(n_clusters=10000, verbose=True, batch_size=256 * cpu_count(), compute_labels=False, init="random").fit(big_npy).cluster_centers_) 47 | np.save(os.path.join(exp_dir, "total_fea.npy"), big_npy) 48 | 49 | n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39) 50 | index_trained = faiss.index_factory(256 if version == "v1" else 768, f"IVF{n_ivf},Flat") 51 | index_ivf_trained = faiss.extract_index_ivf(index_trained) 52 | index_ivf_trained.nprobe = 1 53 | index_trained.train(big_npy) 54 | faiss.write_index(index_trained, os.path.join(exp_dir, f"trained_IVF{n_ivf}_Flat_nprobe_{index_ivf_trained.nprobe}_{model_name}_{version}.index")) 55 | 56 | index_added = faiss.index_factory(256 if version == "v1" else 768, f"IVF{n_ivf},Flat") 57 | index_ivf_added = faiss.extract_index_ivf(index_added) 58 | index_ivf_added.nprobe = 1 59 | index_added.train(big_npy) 60 | batch_size_add = 8192 61 | 62 | for i in range(0, big_npy.shape[0], batch_size_add): 63 | index_added.add(big_npy[i : i + batch_size_add]) 64 | 65 | index_filepath_added = os.path.join(exp_dir, f"added_IVF{n_ivf}_Flat_nprobe_{index_ivf_added.nprobe}_{model_name}_{version}.index") 66 | faiss.write_index(index_added, index_filepath_added) 67 | logger.info(f"{translations['save_index']} '{index_filepath_added}'") 68 | except Exception as e: 69 | logger.error(f"{translations['create_index_error']}: {e}") 70 | import traceback 71 | logger.debug(traceback.format_exc()) 72 | 73 | if __name__ == "__main__": main() -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/extracting/rms.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import tqdm 5 | import torch 6 | import librosa 7 | import traceback 8 | import concurrent.futures 9 | 10 | import numpy as np 11 | import torch.nn as nn 12 | 13 | sys.path.append(os.getcwd()) 14 | 15 | from advanced_rvc_inference.library.utils import load_audio 16 | from advanced_rvc_inference.variables import logger, translations 17 | from advanced_rvc_inference.infer.extracting.setup_path import setup_paths 18 | 19 | class RMSEnergyExtractor(nn.Module): 20 | def __init__(self, frame_length=2048, hop_length=512, center=True, pad_mode = "reflect"): 21 | super().__init__() 22 | self.frame_length = frame_length 23 | self.hop_length = hop_length 24 | self.center = center 25 | self.pad_mode = pad_mode 26 | 27 | def forward(self, x): 28 | 
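# Expects a mono batch shaped (1, num_samples); the asserts below enforce that before the RMS energy is
# computed frame-by-frame with librosa on the CPU and moved back to the input device.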
assert x.ndim == 2 29 | assert x.shape[0] == 1 30 | 31 | if str(x.device).startswith(("ocl", "privateuseone")): x = x.contiguous() 32 | 33 | rms = torch.from_numpy( 34 | librosa.feature.rms( 35 | y=x.squeeze(0).cpu().numpy(), 36 | frame_length=self.frame_length, 37 | hop_length=self.hop_length, 38 | center=self.center, 39 | pad_mode=self.pad_mode 40 | ) 41 | ) 42 | 43 | if str(x.device).startswith(("ocl", "privateuseone")): rms = rms.contiguous() 44 | return rms.squeeze(-2).to(x.device) 45 | 46 | def process_file_rms(files, device, threads): 47 | threads = max(1, threads) 48 | 49 | module = RMSEnergyExtractor( 50 | frame_length=2048, hop_length=160, center=True, pad_mode = "reflect" 51 | ).to(device).eval().float() 52 | 53 | def worker(file_info): 54 | try: 55 | file, out_path = file_info 56 | out_file_path = os.path.join(out_path, os.path.basename(file)) 57 | 58 | if os.path.exists(out_file_path + ".npy"): return 59 | feats = torch.from_numpy(load_audio(file, 16000)).unsqueeze(0) 60 | 61 | with torch.no_grad(): 62 | feats = module(feats if device.startswith(("ocl", "privateuseone")) else feats.to(device)) 63 | 64 | np.save(out_file_path, feats.float().cpu().numpy(), allow_pickle=False) 65 | except: 66 | logger.debug(traceback.format_exc()) 67 | 68 | with tqdm.tqdm(total=len(files), ncols=100, unit="p", leave=True) as pbar: 69 | with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor: 70 | for _ in concurrent.futures.as_completed([executor.submit(worker, f) for f in files]): 71 | pbar.update(1) 72 | 73 | def run_rms_extraction(exp_dir, num_processes, devices, rms_extract): 74 | if rms_extract: 75 | wav_path, out_path = setup_paths(exp_dir, rms_extract=rms_extract) 76 | paths = sorted([(os.path.join(wav_path, file), out_path) for file in os.listdir(wav_path) if file.endswith(".wav")]) 77 | 78 | start_time = time.time() 79 | logger.info(translations["rms_start_extract"].format(num_processes=num_processes)) 80 | 81 | with concurrent.futures.ProcessPoolExecutor(max_workers=len(devices)) as executor: 82 | concurrent.futures.wait([executor.submit(process_file_rms, paths[i::len(devices)], devices[i], num_processes // len(devices)) for i in range(len(devices))]) 83 | 84 | logger.info(translations["rms_success_extract"].format(elapsed_time=f"{(time.time() - start_time):.2f}")) -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/train/extracting/rms.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import tqdm 5 | import torch 6 | import librosa 7 | import traceback 8 | import concurrent.futures 9 | 10 | import numpy as np 11 | import torch.nn as nn 12 | 13 | sys.path.append(os.getcwd()) 14 | 15 | from advanced_rvc_inference.library.utils import load_audio 16 | from advanced_rvc_inference.variables import logger, translations 17 | from advanced_rvc_inference.infer.extracting.setup_path import setup_paths 18 | 19 | class RMSEnergyExtractor(nn.Module): 20 | def __init__(self, frame_length=2048, hop_length=512, center=True, pad_mode = "reflect"): 21 | super().__init__() 22 | self.frame_length = frame_length 23 | self.hop_length = hop_length 24 | self.center = center 25 | self.pad_mode = pad_mode 26 | 27 | def forward(self, x): 28 | assert x.ndim == 2 29 | assert x.shape[0] == 1 30 | 31 | if str(x.device).startswith(("ocl", "privateuseone")): x = x.contiguous() 32 | 33 | rms = torch.from_numpy( 34 | librosa.feature.rms( 35 | 
y=x.squeeze(0).cpu().numpy(), 36 | frame_length=self.frame_length, 37 | hop_length=self.hop_length, 38 | center=self.center, 39 | pad_mode=self.pad_mode 40 | ) 41 | ) 42 | 43 | if str(x.device).startswith(("ocl", "privateuseone")): rms = rms.contiguous() 44 | return rms.squeeze(-2).to(x.device) 45 | 46 | def process_file_rms(files, device, threads): 47 | threads = max(1, threads) 48 | 49 | module = RMSEnergyExtractor( 50 | frame_length=2048, hop_length=160, center=True, pad_mode = "reflect" 51 | ).to(device).eval().float() 52 | 53 | def worker(file_info): 54 | try: 55 | file, out_path = file_info 56 | out_file_path = os.path.join(out_path, os.path.basename(file)) 57 | 58 | if os.path.exists(out_file_path + ".npy"): return 59 | feats = torch.from_numpy(load_audio(file, 16000)).unsqueeze(0) 60 | 61 | with torch.no_grad(): 62 | feats = module(feats if device.startswith(("ocl", "privateuseone")) else feats.to(device)) 63 | 64 | np.save(out_file_path, feats.float().cpu().numpy(), allow_pickle=False) 65 | except: 66 | logger.debug(traceback.format_exc()) 67 | 68 | with tqdm.tqdm(total=len(files), ncols=100, unit="p", leave=True) as pbar: 69 | with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor: 70 | for _ in concurrent.futures.as_completed([executor.submit(worker, f) for f in files]): 71 | pbar.update(1) 72 | 73 | def run_rms_extraction(exp_dir, num_processes, devices, rms_extract): 74 | if rms_extract: 75 | wav_path, out_path = setup_paths(exp_dir, rms_extract=rms_extract) 76 | paths = sorted([(os.path.join(wav_path, file), out_path) for file in os.listdir(wav_path) if file.endswith(".wav")]) 77 | 78 | start_time = time.time() 79 | logger.info(translations["rms_start_extract"].format(num_processes=num_processes)) 80 | 81 | with concurrent.futures.ProcessPoolExecutor(max_workers=len(devices)) as executor: 82 | concurrent.futures.wait([executor.submit(process_file_rms, paths[i::len(devices)], devices[i], num_processes // len(devices)) for i in range(len(devices))]) 83 | 84 | logger.info(translations["rms_success_extract"].format(elapsed_time=f"{(time.time() - start_time):.2f}")) -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/extracting/embedding.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gc 3 | import sys 4 | import time 5 | import tqdm 6 | import torch 7 | import traceback 8 | import concurrent.futures 9 | 10 | import numpy as np 11 | 12 | sys.path.append(os.getcwd()) 13 | 14 | from advanced_rvc_inference.variables import logger, translations, config 15 | from advanced_rvc_inference.infer.extracting.setup_path import setup_paths 16 | from advanced_rvc_inference.library.utils import load_audio, load_embedders_model, extract_features 17 | 18 | def process_file_embedding(files, embedder_model, embedders_mode, device, version, is_half, threads): 19 | model = load_embedders_model(embedder_model, embedders_mode) 20 | if isinstance(model, torch.nn.Module): model = model.to(device).to(torch.float16 if is_half else torch.float32).eval() 21 | 22 | def worker(file_info): 23 | try: 24 | file, out_path = file_info 25 | out_file_path = os.path.join(out_path, os.path.basename(file.replace("wav", "npy"))) if os.path.isdir(out_path) else out_path 26 | 27 | if os.path.exists(out_file_path): return 28 | feats = torch.from_numpy(load_audio(file, 16000)).to(device).to(torch.float16 if is_half else torch.float32) 29 | 30 | with torch.no_grad(): 31 | feats = 
extract_features(model, feats.view(1, -1), version, device) 32 | 33 | feats = feats.squeeze(0).float().cpu().numpy() 34 | if not np.isnan(feats).any(): np.save(out_file_path, feats, allow_pickle=False) 35 | else: logger.warning(f"{file} {translations['NaN']}") 36 | except: 37 | logger.debug(traceback.format_exc()) 38 | 39 | with tqdm.tqdm(total=len(files), ncols=100, unit="p", leave=True) as pbar: 40 | with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor: 41 | for _ in concurrent.futures.as_completed([executor.submit(worker, f) for f in files]): 42 | pbar.update(1) 43 | 44 | def run_embedding_extraction(exp_dir, version, num_processes, devices, embedder_model, embedders_mode, is_half): 45 | wav_path, out_path = setup_paths(exp_dir, version) 46 | 47 | logger.info(translations["start_extract_hubert"]) 48 | num_processes = 1 if (config.device.startswith("ocl") and embedders_mode == "onnx") or config.device.startswith("privateuseone") else num_processes 49 | paths = sorted([(os.path.join(wav_path, file), out_path) for file in os.listdir(wav_path) if file.endswith(".wav")]) 50 | 51 | start_time = time.time() 52 | with concurrent.futures.ProcessPoolExecutor(max_workers=len(devices)) as executor: 53 | concurrent.futures.wait([executor.submit(process_file_embedding, paths[i::len(devices)], embedder_model, embedders_mode, devices[i], version, is_half, num_processes // len(devices)) for i in range(len(devices))]) 54 | 55 | gc.collect() 56 | logger.info(translations["extract_hubert_success"].format(elapsed_time=f"{(time.time() - start_time):.2f}")) 57 | 58 | def create_mute_file(version, embedder_model, embedders_mode, is_half): 59 | start_time = time.time() 60 | logger.info(translations["start_extract_hubert"]) 61 | 62 | process_file_embedding([(os.path.join("assets", "logs", "mute", "sliced_audios_16k", "mute.wav"), os.path.join("assets", "logs", "mute", f"{version}_extracted", f"mute_{embedder_model}.npy"))], embedder_model, embedders_mode, config.device, version, is_half, 1) 63 | 64 | gc.collect() 65 | logger.info(translations["extract_hubert_success"].format(elapsed_time=f"{(time.time() - start_time):.2f}")) -------------------------------------------------------------------------------- /advanced_rvc_inference/infer/train/extracting/embedding.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gc 3 | import sys 4 | import time 5 | import tqdm 6 | import torch 7 | import traceback 8 | import concurrent.futures 9 | 10 | import numpy as np 11 | 12 | sys.path.append(os.getcwd()) 13 | 14 | from advanced_rvc_inference.variables import logger, translations, config 15 | from advanced_rvc_inference.infer.extracting.setup_path import setup_paths 16 | from advanced_rvc_inference.library.utils import load_audio, load_embedders_model, extract_features 17 | 18 | def process_file_embedding(files, embedder_model, embedders_mode, device, version, is_half, threads): 19 | model = load_embedders_model(embedder_model, embedders_mode) 20 | if isinstance(model, torch.nn.Module): model = model.to(device).to(torch.float16 if is_half else torch.float32).eval() 21 | 22 | def worker(file_info): 23 | try: 24 | file, out_path = file_info 25 | out_file_path = os.path.join(out_path, os.path.basename(file.replace("wav", "npy"))) if os.path.isdir(out_path) else out_path 26 | 27 | if os.path.exists(out_file_path): return 28 | feats = torch.from_numpy(load_audio(file, 16000)).to(device).to(torch.float16 if is_half else torch.float32) 29 | 30 | with 
torch.no_grad(): 31 | feats = extract_features(model, feats.view(1, -1), version, device) 32 | 33 | feats = feats.squeeze(0).float().cpu().numpy() 34 | if not np.isnan(feats).any(): np.save(out_file_path, feats, allow_pickle=False) 35 | else: logger.warning(f"{file} {translations['NaN']}") 36 | except: 37 | logger.debug(traceback.format_exc()) 38 | 39 | with tqdm.tqdm(total=len(files), ncols=100, unit="p", leave=True) as pbar: 40 | with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor: 41 | for _ in concurrent.futures.as_completed([executor.submit(worker, f) for f in files]): 42 | pbar.update(1) 43 | 44 | def run_embedding_extraction(exp_dir, version, num_processes, devices, embedder_model, embedders_mode, is_half): 45 | wav_path, out_path = setup_paths(exp_dir, version) 46 | 47 | logger.info(translations["start_extract_hubert"]) 48 | num_processes = 1 if (config.device.startswith("ocl") and embedders_mode == "onnx") or config.device.startswith("privateuseone") else num_processes 49 | paths = sorted([(os.path.join(wav_path, file), out_path) for file in os.listdir(wav_path) if file.endswith(".wav")]) 50 | 51 | start_time = time.time() 52 | with concurrent.futures.ProcessPoolExecutor(max_workers=len(devices)) as executor: 53 | concurrent.futures.wait([executor.submit(process_file_embedding, paths[i::len(devices)], embedder_model, embedders_mode, devices[i], version, is_half, num_processes // len(devices)) for i in range(len(devices))]) 54 | 55 | gc.collect() 56 | logger.info(translations["extract_hubert_success"].format(elapsed_time=f"{(time.time() - start_time):.2f}")) 57 | 58 | def create_mute_file(version, embedder_model, embedders_mode, is_half): 59 | start_time = time.time() 60 | logger.info(translations["start_extract_hubert"]) 61 | 62 | process_file_embedding([(os.path.join("assets", "logs", "mute", "sliced_audios_16k", "mute.wav"), os.path.join("assets", "logs", "mute", f"{version}_extracted", f"mute_{embedder_model}.npy"))], embedder_model, embedders_mode, config.device, version, is_half, 1) 63 | 64 | gc.collect() 65 | logger.info(translations["extract_hubert_success"].format(elapsed_time=f"{(time.time() - start_time):.2f}")) -------------------------------------------------------------------------------- /advanced_rvc_inference/library/algorithm/modules.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | import torch.nn.utils.parametrize as parametrize 6 | 7 | sys.path.append(os.getcwd()) 8 | 9 | from .commons import fused_add_tanh_sigmoid_multiply 10 | 11 | class WaveNet(torch.nn.Module): 12 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): 13 | super(WaveNet, self).__init__() 14 | assert kernel_size % 2 == 1 15 | self.hidden_channels = hidden_channels 16 | self.kernel_size = (kernel_size,) 17 | self.dilation_rate = dilation_rate 18 | self.n_layers = n_layers 19 | self.gin_channels = gin_channels 20 | self.p_dropout = p_dropout 21 | self.in_layers = torch.nn.ModuleList() 22 | self.res_skip_layers = torch.nn.ModuleList() 23 | self.drop = torch.nn.Dropout(p_dropout) 24 | if gin_channels != 0: self.cond_layer = torch.nn.utils.parametrizations.weight_norm(torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1), name="weight") 25 | dilations = [dilation_rate ** i for i in range(n_layers)] 26 | paddings = [(kernel_size * d - d) // 2 for d in dilations] 27 | 28 | for i in range(n_layers): 29 | in_layer = 
torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilations[i], padding=paddings[i]) 30 | in_layer = torch.nn.utils.parametrizations.weight_norm(in_layer, name="weight") 31 | self.in_layers.append(in_layer) 32 | res_skip_channels = (hidden_channels if i == n_layers - 1 else 2 * hidden_channels) 33 | res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 34 | res_skip_layer = torch.nn.utils.parametrizations.weight_norm(res_skip_layer, name="weight") 35 | self.res_skip_layers.append(res_skip_layer) 36 | 37 | def forward(self, x, x_mask, g=None): 38 | output = x.clone().zero_() 39 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 40 | 41 | if g is not None: g = self.cond_layer(g) 42 | 43 | for i in range(self.n_layers): 44 | x_in = self.in_layers[i](x) 45 | g_l = (g[:, i * 2 * self.hidden_channels : (i + 1) * 2 * self.hidden_channels, :] if g is not None else 0) 46 | res_skip_acts = self.res_skip_layers[i](self.drop(fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor))) 47 | 48 | if i < self.n_layers - 1: 49 | x = (x + (res_skip_acts[:, : self.hidden_channels, :])) * x_mask 50 | output = output + res_skip_acts[:, self.hidden_channels :, :] 51 | else: output = output + res_skip_acts 52 | 53 | return output * x_mask 54 | 55 | def remove_weight_norm(self): 56 | if self.gin_channels != 0: 57 | if hasattr(self.cond_layer, "parametrizations") and "weight" in self.cond_layer.parametrizations: parametrize.remove_parametrizations(self.cond_layer, "weight", leave_parametrized=True) 58 | else: torch.nn.utils.remove_weight_norm(self.cond_layer) 59 | 60 | for l in self.in_layers: 61 | if hasattr(l, "parametrizations") and "weight" in l.parametrizations: parametrize.remove_parametrizations(l, "weight", leave_parametrized=True) 62 | else: torch.nn.utils.remove_weight_norm(l) 63 | 64 | for l in self.res_skip_layers: 65 | if hasattr(l, "parametrizations") and "weight" in l.parametrizations: parametrize.remove_parametrizations(l, "weight", leave_parametrized=True) 66 | else: torch.nn.utils.remove_weight_norm(l) -------------------------------------------------------------------------------- /advanced_rvc_inference/library/predictors/RMVPE/RMVPE.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | 5 | import numpy as np 6 | import torch.nn.functional as F 7 | 8 | sys.path.append(os.getcwd()) 9 | 10 | from advanced_rvc_inference.library.predictors.RMVPE.mel import MelSpectrogram 11 | 12 | N_MELS, N_CLASS = 128, 360 13 | 14 | class RMVPE: 15 | def __init__(self, model_path, is_half, device=None, providers=None, onnx=False): 16 | self.onnx = onnx 17 | 18 | if self.onnx: 19 | import onnxruntime as ort 20 | 21 | sess_options = ort.SessionOptions() 22 | sess_options.log_severity_level = 3 23 | self.model = ort.InferenceSession(model_path, sess_options=sess_options, providers=providers) 24 | else: 25 | from advanced_rvc_inference.library.predictors.RMVPE.e2e import E2E 26 | 27 | model = E2E(4, 1, (2, 2)) 28 | ckpt = torch.load(model_path, map_location="cpu", weights_only=True) 29 | model.load_state_dict(ckpt) 30 | model.eval() 31 | if is_half: model = model.half() 32 | self.model = model.to(device) 33 | 34 | self.is_half = is_half 35 | self.device = device 36 | self.mel_extractor = MelSpectrogram(N_MELS, 16000, 1024, 160, None, 30, 8000).to(device) 37 | cents_mapping = 20 * np.arange(N_CLASS) + 1997.3794084376191 38 | self.cents_mapping = np.pad(cents_mapping, (4, 4)) 39 | 
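    # Rough picture of the pitch decoding implemented below (a sketch inferred from the
    # constants set in __init__, not from an external spec):
    #   - the model outputs a salience map over N_CLASS = 360 pitch bins spaced 20 cents
    #     apart; cents_mapping assigns each bin its pitch in cents relative to 10 Hz,
    #     so bin 0 corresponds to about 31.7 Hz.
    #   - mel2hidden pads the mel spectrogram up to a multiple of 32 frames (presumably
    #     so the model's internal downsampling divides evenly), runs inference, then
    #     trims the result back to the original frame count.
    #   - to_local_average_cents takes, per frame, a salience-weighted average of the
    #     cents values in a 9-bin window around the argmax bin; decode then converts
    #     cents to Hz via f0 = 10 * 2 ** (cents / 1200), so frames whose salience never
    #     exceeds the threshold come out as exactly 10 Hz and are zeroed as unvoiced.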
40 | def mel2hidden(self, mel): 41 | with torch.no_grad(): 42 | n_frames = mel.shape[-1] 43 | n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames 44 | if n_pad > 0: mel = F.pad(mel, (0, n_pad), mode="constant") 45 | 46 | if self.onnx: 47 | hidden = self.model.run( 48 | [self.model.get_outputs()[0].name], 49 | { 50 | self.model.get_inputs()[0].name: mel.cpu().numpy().astype(np.float32) 51 | } 52 | )[0] 53 | else: 54 | hidden = self.model( 55 | mel.half() if self.is_half else mel.float() 56 | ) 57 | 58 | return hidden[:, :n_frames] 59 | 60 | def decode(self, hidden, thred=0.03): 61 | f0 = 10 * (2 ** (self.to_local_average_cents(hidden, thred=thred) / 1200)) 62 | f0[f0 == 10] = 0 63 | 64 | return f0 65 | 66 | def infer_from_audio(self, audio, thred=0.03): 67 | hidden = self.mel2hidden(self.mel_extractor(torch.from_numpy(audio).float().to(self.device).unsqueeze(0), center=True)) 68 | 69 | return self.decode(hidden.squeeze(0).cpu().numpy().astype(np.float32) if not self.onnx else hidden[0], thred=thred) 70 | 71 | def infer_from_audio_with_pitch(self, audio, thred=0.03, f0_min=50, f0_max=1100): 72 | f0 = self.infer_from_audio(audio, thred) 73 | f0[(f0 < f0_min) | (f0 > f0_max)] = 0 74 | 75 | return f0 76 | 77 | def to_local_average_cents(self, salience, thred=0.05): 78 | center = np.argmax(salience, axis=1) 79 | salience = np.pad(salience, ((0, 0), (4, 4))) 80 | center += 4 81 | todo_salience, todo_cents_mapping = [], [] 82 | starts = center - 4 83 | ends = center + 5 84 | 85 | for idx in range(salience.shape[0]): 86 | todo_salience.append(salience[:, starts[idx] : ends[idx]][idx]) 87 | todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]]) 88 | 89 | todo_salience = np.array(todo_salience) 90 | devided = np.sum(todo_salience * np.array(todo_cents_mapping), 1) / np.sum(todo_salience, 1) 91 | devided[np.max(salience, axis=1) <= thred] = 0 92 | 93 | return devided -------------------------------------------------------------------------------- /advanced_rvc_inference/app.py: -------------------------------------------------------------------------------- 1 | import os, io 2 | #import ssl 3 | import sys 4 | import time 5 | import codecs 6 | import logging 7 | import warnings 8 | 9 | import gradio as gr 10 | 11 | sys.path.append(os.getcwd()) 12 | start_time = time.time() 13 | 14 | from advanced_rvc_inference.tabs.extra.extra import extra_tab 15 | from advanced_rvc_inference.tabs.training.training import training_tab 16 | from advanced_rvc_inference.tabs.downloads.downloads import download_tab 17 | from advanced_rvc_inference.tabs.inference.inference import inference_tab 18 | from advanced_rvc_inference.variables import logger, config, translations, theme, font, configs, language, allow_disk 19 | from advanced_rvc_inference.mainjs import js_code 20 | #ssl._create_default_https_context = ssl._create_unverified_context 21 | 22 | warnings.filterwarnings("ignore") 23 | for l in ["httpx", "uvicorn", "httpcore", "urllib3"]: 24 | logging.getLogger(l).setLevel(logging.ERROR) 25 | 26 | 27 | client_mode = "--client" in sys.argv 28 | 29 | with gr.Blocks( 30 | title="📱 Advanced RVC Inference", 31 | js=js_code if client_mode else None, 32 | theme=theme, 33 | ) as app: 34 | gr.HTML("