├── Readme.MD ├── _create_venv.bat ├── _install_cli_commands.bat ├── _install_dependencies.bat ├── _install_models.bat ├── _start_cli.bat ├── _start_llm_server.bat ├── _start_stt_server.bat ├── _start_tts_server.bat ├── _start_venv.bat ├── llm-cli ├── llm_client.py ├── llm_server.py ├── server.py ├── setup.py └── start_llm_server.py ├── requirements.txt ├── requirements_client.txt ├── stt-cli ├── server.py ├── setup.py ├── start_stt_server.py ├── stt_client.py └── stt_server.py └── tts-cli ├── bufferstream.py ├── download_models.py ├── requirements.txt ├── rvc ├── __pycache__ │ └── realtimervc.cpython-310.pyc ├── configs │ ├── __pycache__ │ │ └── config.cpython-310.pyc │ ├── config.json │ ├── config.py │ ├── v1 │ │ ├── 32k.json │ │ ├── 40k.json │ │ └── 48k.json │ └── v2 │ │ ├── 32k.json │ │ └── 48k.json ├── i18n │ ├── i18n.py │ ├── locale │ │ ├── en_US.json │ │ ├── es_ES.json │ │ ├── fr_FR.json │ │ ├── it_IT.json │ │ ├── ja_JP.json │ │ ├── ru_RU.json │ │ ├── tr_TR.json │ │ ├── zh_CN.json │ │ ├── zh_HK.json │ │ ├── zh_SG.json │ │ └── zh_TW.json │ ├── locale_diff.py │ └── scan_i18n.py ├── infer │ ├── lib │ │ ├── __pycache__ │ │ │ └── rmvpe.cpython-310.pyc │ │ ├── audio.py │ │ ├── infer_pack │ │ │ ├── __pycache__ │ │ │ │ ├── attentions.cpython-310.pyc │ │ │ │ ├── commons.cpython-310.pyc │ │ │ │ ├── models.cpython-310.pyc │ │ │ │ ├── modules.cpython-310.pyc │ │ │ │ └── transforms.cpython-310.pyc │ │ │ ├── attentions.py │ │ │ ├── commons.py │ │ │ ├── models.py │ │ │ ├── models_onnx.py │ │ │ ├── modules.py │ │ │ ├── modules │ │ │ │ └── F0Predictor │ │ │ │ │ ├── DioF0Predictor.py │ │ │ │ │ ├── F0Predictor.py │ │ │ │ │ ├── HarvestF0Predictor.py │ │ │ │ │ ├── PMF0Predictor.py │ │ │ │ │ └── __init__.py │ │ │ ├── onnx_inference.py │ │ │ └── transforms.py │ │ ├── jit │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-310.pyc │ │ │ │ └── get_synthesizer.cpython-310.pyc │ │ │ ├── get_hubert.py │ │ │ ├── get_rmvpe.py │ │ │ └── get_synthesizer.py │ │ ├── rmvpe.py │ │ ├── slicer2.py │ │ ├── train │ │ │ ├── data_utils.py │ │ │ ├── losses.py │ │ │ ├── mel_processing.py │ │ │ ├── process_ckpt.py │ │ │ └── utils.py │ │ └── uvr5_pack │ │ │ ├── lib_v5 │ │ │ ├── dataset.py │ │ │ ├── layers.py │ │ │ ├── layers_123812KB .py │ │ │ ├── layers_123821KB.py │ │ │ ├── layers_33966KB.py │ │ │ ├── layers_537227KB.py │ │ │ ├── layers_537238KB.py │ │ │ ├── layers_new.py │ │ │ ├── model_param_init.py │ │ │ ├── modelparams │ │ │ │ ├── 1band_sr16000_hl512.json │ │ │ │ ├── 1band_sr32000_hl512.json │ │ │ │ ├── 1band_sr33075_hl384.json │ │ │ │ ├── 1band_sr44100_hl1024.json │ │ │ │ ├── 1band_sr44100_hl256.json │ │ │ │ ├── 1band_sr44100_hl512.json │ │ │ │ ├── 1band_sr44100_hl512_cut.json │ │ │ │ ├── 2band_32000.json │ │ │ │ ├── 2band_44100_lofi.json │ │ │ │ ├── 2band_48000.json │ │ │ │ ├── 3band_44100.json │ │ │ │ ├── 3band_44100_mid.json │ │ │ │ ├── 3band_44100_msb2.json │ │ │ │ ├── 4band_44100.json │ │ │ │ ├── 4band_44100_mid.json │ │ │ │ ├── 4band_44100_msb.json │ │ │ │ ├── 4band_44100_msb2.json │ │ │ │ ├── 4band_44100_reverse.json │ │ │ │ ├── 4band_44100_sw.json │ │ │ │ ├── 4band_v2.json │ │ │ │ ├── 4band_v2_sn.json │ │ │ │ ├── 4band_v3.json │ │ │ │ └── ensemble.json │ │ │ ├── nets.py │ │ │ ├── nets_123812KB.py │ │ │ ├── nets_123821KB.py │ │ │ ├── nets_33966KB.py │ │ │ ├── nets_537227KB.py │ │ │ ├── nets_537238KB.py │ │ │ ├── nets_61968KB.py │ │ │ ├── nets_new.py │ │ │ └── spec_utils.py │ │ │ ├── name_params.json │ │ │ └── utils.py │ └── modules │ │ ├── ipex │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── 
gradscaler.py │ │ └── hijacks.py │ │ ├── onnx │ │ └── export.py │ │ ├── train │ │ ├── extract │ │ │ ├── extract_f0_print.py │ │ │ ├── extract_f0_rmvpe.py │ │ │ └── extract_f0_rmvpe_dml.py │ │ ├── extract_feature_print.py │ │ ├── preprocess.py │ │ └── train.py │ │ ├── uvr5 │ │ ├── mdxnet.py │ │ ├── modules.py │ │ └── vr.py │ │ └── vc │ │ ├── __init__.py │ │ ├── modules.py │ │ ├── pipeline.py │ │ └── utils.py ├── realtimervc.py └── tools │ ├── __pycache__ │ └── rvc_for_realtime.cpython-310.pyc │ ├── app.py │ ├── calc_rvc_model_similarity.py │ ├── dlmodels.bat │ ├── dlmodels.sh │ ├── download_models - Kopie.py │ ├── download_models.py │ ├── export_onnx.py │ ├── infer │ ├── infer-pm-index256.py │ ├── train-index-v2.py │ ├── train-index.py │ └── trans_weights.py │ ├── infer_batch_rvc.py │ ├── infer_cli.py │ ├── onnx_inference_demo.py │ ├── rvc_for_realtime.py │ └── torchgate │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── torchgate.cpython-310.pyc │ └── utils.cpython-310.pyc │ ├── torchgate.py │ └── utils.py ├── server.py ├── setup.py ├── start_tts_server.py ├── tts_client.py ├── tts_server.py ├── vanessa.json └── xtts_rvc_synthesizer.py /_create_venv.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | :: Set Python path (adjust this if needed) 4 | set PYTHON_EXE=python.exe 5 | 6 | 7 | echo Installing AI CLI Tools... 8 | setlocal enabledelayedexpansion 9 | 10 | :: Set current directory 11 | cd /d %~dp0 12 | 13 | echo Starting installation process... 14 | 15 | :: Create and activate virtual environment 16 | echo Creating and activating virtual environment... 17 | %PYTHON_EXE% -m venv venv 18 | call venv\Scripts\activate.bat 19 | 20 | :: Upgrade pip 21 | echo Upgrading pip... 22 | python -m pip install pip==23.3.1 23 | 24 | -------------------------------------------------------------------------------- /_install_cli_commands.bat: -------------------------------------------------------------------------------- 1 | echo Installing CLI commands 2 | cd llm-cli 3 | pip uninstall -y llm-cli 4 | pip install -e . 5 | cd .. 6 | cd stt-cli 7 | pip uninstall -y stt-cli 8 | pip install -e . 9 | cd .. 10 | cd tts-cli 11 | pip uninstall -y tts-cli 12 | pip install -e . 13 | cd .. 
14 | 15 | echo Installation of CLI commands finished -------------------------------------------------------------------------------- /_install_dependencies.bat: -------------------------------------------------------------------------------- 1 | REM call _start_venv.bat 2 | echo Installing basic dependencies 3 | pip install -r requirements.txt 4 | 5 | echo Upgrading torch to use GPU 6 | pip install torch==2.3.1+cu121 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121 7 | 8 | echo Installation of dependencies finished -------------------------------------------------------------------------------- /_install_models.bat: -------------------------------------------------------------------------------- 1 | REM call _start_venv.bat 2 | echo Downloading models 3 | cd tts-cli 4 | python download_models.py 5 | 6 | echo Download of models finished -------------------------------------------------------------------------------- /_start_cli.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | cmd 3 | -------------------------------------------------------------------------------- /_start_llm_server.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | llm-server 3 | cmd 4 | -------------------------------------------------------------------------------- /_start_stt_server.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | stt-server 3 | cmd -------------------------------------------------------------------------------- /_start_tts_server.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | tts-server 3 | cmd 4 | -------------------------------------------------------------------------------- /_start_venv.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | call venv\Scripts\activate.bat 3 | cmd 4 | -------------------------------------------------------------------------------- /llm-cli/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="llm-cli", 5 | version="0.1", 6 | packages=find_packages(), 7 | entry_points={ 8 | 'console_scripts': [ 9 | 'llm=llm_client:main', 10 | 'llm-server=start_llm_server:main', 11 | ], 12 | }, 13 | ) -------------------------------------------------------------------------------- /llm-cli/start_llm_server.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | 5 | def main(): 6 | # Get the current script's directory (should be llm-cli) 7 | script_dir = os.path.dirname(os.path.abspath(__file__)) 8 | 9 | # Move one directory up to access the venv 10 | root_dir = os.path.dirname(script_dir) 11 | os.chdir(root_dir) 12 | 13 | # Path to the virtual environment 14 | venv_path = os.path.join(root_dir, 'venv') 15 | 16 | # Path to the Python interpreter in the virtual environment 17 | if sys.platform == "win32": 18 | python_path = os.path.join(venv_path, 'Scripts', 'python.exe') 19 | else: 20 | python_path = os.path.join(venv_path, 'bin', 'python') 21 | 22 | # Change back to the llm-cli directory 23 | os.chdir(script_dir) 24 | 25 | # Prepare the command to run llm_server.py with all provided arguments 26 | command = [python_path, 'llm_server.py'] + sys.argv[1:] 27 | 28 | # Start the LLM server 29 | print("Starting LLM 
server...") 30 | print(f"Command: {command}") 31 | try: 32 | subprocess.run(command, check=True) 33 | except subprocess.CalledProcessError as e: 34 | print(f"Error starting LLM server: {e}") 35 | sys.exit(1) 36 | except FileNotFoundError: 37 | print(f"Error: Could not find Python interpreter at {python_path}") 38 | print("Make sure the virtual environment is set up correctly.") 39 | sys.exit(1) 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # stt server depencendies 2 | realtimestt 3 | 4 | # tts server depencendies 5 | realtimetts[all] 6 | 7 | # rvc (realtime voice change) depencendies 8 | fairseq 9 | faiss-cpu 10 | praat-parselmouth 11 | torchcrepe 12 | torchfcpe 13 | pyworld -------------------------------------------------------------------------------- /requirements_client.txt: -------------------------------------------------------------------------------- 1 | websockets 2 | pyaudio 3 | websocket-client 4 | colorama 5 | tqdm -------------------------------------------------------------------------------- /stt-cli/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="stt-cli", 5 | version="0.1", 6 | packages=find_packages(), 7 | entry_points={ 8 | 'console_scripts': [ 9 | 'stt=stt_client:main', 10 | 'stt-server=start_stt_server:main', 11 | ], 12 | }, 13 | ) -------------------------------------------------------------------------------- /stt-cli/start_stt_server.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | 5 | def main(): 6 | # Get the current script's directory (should be stt-cli) 7 | script_dir = os.path.dirname(os.path.abspath(__file__)) 8 | 9 | # Move one directory up to access the venv 10 | root_dir = os.path.dirname(script_dir) 11 | os.chdir(root_dir) 12 | 13 | # Path to the virtual environment 14 | venv_path = os.path.join(root_dir, 'venv') 15 | 16 | # Path to the Python interpreter in the virtual environment 17 | if sys.platform == "win32": 18 | python_path = os.path.join(venv_path, 'Scripts', 'python.exe') 19 | else: 20 | python_path = os.path.join(venv_path, 'bin', 'python') 21 | 22 | # Change back to the stt-cli directory 23 | os.chdir(script_dir) 24 | 25 | # Prepare the command to run stt_server.py with all provided arguments 26 | command = [python_path, 'stt_server.py'] + sys.argv[1:] 27 | 28 | # Start the STT server 29 | print("Starting STT server...") 30 | print(f"Command: {command}") 31 | try: 32 | subprocess.run(command, check=True) 33 | except subprocess.CalledProcessError as e: 34 | print(f"Error starting STT server: {e}") 35 | sys.exit(1) 36 | except FileNotFoundError: 37 | print(f"Error: Could not find Python interpreter at {python_path}") 38 | print("Make sure the virtual environment is set up correctly.") 39 | sys.exit(1) 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /stt-cli/stt_server.py: -------------------------------------------------------------------------------- 1 | if __name__ == '__main__': 2 | print("Starting server, please wait...") 3 | 4 | from RealtimeSTT import AudioToTextRecorder 5 | import asyncio 6 | import websockets 7 | import threading 8 | import numpy as np 9 | 
from scipy.signal import resample 10 | import json 11 | 12 | recorder = None 13 | recorder_ready = threading.Event() 14 | client_websocket = None 15 | 16 | async def send_to_client(message): 17 | if client_websocket: 18 | await client_websocket.send(message) 19 | 20 | def text_detected(text): 21 | asyncio.new_event_loop().run_until_complete( 22 | send_to_client( 23 | json.dumps({ 24 | 'type': 'realtime', 25 | 'text': text 26 | }) 27 | ) 28 | ) 29 | print(f"\r{text}", flush=True, end='') 30 | 31 | recorder_config = { 32 | 'spinner': False, 33 | 'use_microphone': False, 34 | 'model': 'large-v2', 35 | 'silero_sensitivity': 0.4, 36 | 'silero_deactivity_detection': True, 37 | 'webrtc_sensitivity': 3, 38 | 'post_speech_silence_duration': 0.25, 39 | 'min_length_of_recording': 0, 40 | 'min_gap_between_recordings': 0, 41 | 'enable_realtime_transcription': True, 42 | 'realtime_processing_pause': 0, 43 | 'realtime_model_type': 'medium', 44 | 'on_realtime_transcription_stabilized': text_detected, 45 | } 46 | 47 | def _recorder_thread(): 48 | global recorder 49 | print("Initializing RealtimeSTT...") 50 | recorder = AudioToTextRecorder(**recorder_config) 51 | print("RealtimeSTT initialized") 52 | recorder_ready.set() 53 | while True: 54 | full_sentence = recorder.text() 55 | asyncio.new_event_loop().run_until_complete( 56 | send_to_client( 57 | json.dumps({ 58 | 'type': 'fullSentence', 59 | 'text': full_sentence 60 | }) 61 | ) 62 | ) 63 | print(f"\rSentence: {full_sentence}") 64 | 65 | def decode_and_resample( 66 | audio_data, 67 | original_sample_rate, 68 | target_sample_rate): 69 | 70 | # Decode 16-bit PCM data to numpy array 71 | audio_np = np.frombuffer(audio_data, dtype=np.int16) 72 | 73 | # Calculate the number of samples after resampling 74 | num_original_samples = len(audio_np) 75 | num_target_samples = int(num_original_samples * target_sample_rate / 76 | original_sample_rate) 77 | 78 | # Resample the audio 79 | resampled_audio = resample(audio_np, num_target_samples) 80 | 81 | return resampled_audio.astype(np.int16).tobytes() 82 | 83 | async def echo(websocket, path): 84 | print("Client connected") 85 | global client_websocket 86 | client_websocket = websocket 87 | async for message in websocket: 88 | 89 | if not recorder_ready.is_set(): 90 | print("Recorder not ready") 91 | continue 92 | 93 | metadata_length = int.from_bytes(message[:4], byteorder='little') 94 | metadata_json = message[4:4+metadata_length].decode('utf-8') 95 | metadata = json.loads(metadata_json) 96 | sample_rate = metadata['sampleRate'] 97 | chunk = message[4+metadata_length:] 98 | resampled_chunk = decode_and_resample(chunk, sample_rate, 16000) 99 | recorder.feed_audio(resampled_chunk) 100 | 101 | 102 | def main(): 103 | # start_server = websockets.serve(echo, "0.0.0.0", 9001) 104 | start_server = websockets.serve(echo, "localhost", 8011) 105 | 106 | recorder_thread = threading.Thread(target=_recorder_thread) 107 | recorder_thread.start() 108 | recorder_ready.wait() 109 | 110 | print("Server started. 
Press Ctrl+C to stop the server.") 111 | asyncio.get_event_loop().run_until_complete(start_server) 112 | asyncio.get_event_loop().run_forever() 113 | 114 | main() 115 | 116 | -------------------------------------------------------------------------------- /tts-cli/bufferstream.py: -------------------------------------------------------------------------------- 1 | import queue 2 | import threading 3 | import uuid 4 | from typing import Generator, List, Any 5 | 6 | class BufferStream: 7 | def __init__(self): 8 | self.items: queue.Queue = queue.Queue() 9 | self._stop_event: threading.Event = threading.Event() 10 | self.stopped: bool = False 11 | self.stream_id: str = str(uuid.uuid4()) 12 | 13 | def add(self, item: Any) -> None: 14 | """Add an item to the buffer.""" 15 | self.items.put(item) 16 | 17 | def stop(self) -> None: 18 | """Signal to stop the buffer stream.""" 19 | self._stop_event.set() 20 | 21 | def snapshot(self) -> List[Any]: 22 | """Take a snapshot of all items in the buffer without exhausting it.""" 23 | with self.items.mutex: 24 | return list(self.items.queue) 25 | 26 | def gen(self) -> Generator[Any, None, None]: 27 | """Generate items from the buffer, yielding them one at a time.""" 28 | while not self._stop_event.is_set() or not self.items.empty(): 29 | try: 30 | yield self.items.get(timeout=0.1) 31 | except queue.Empty: 32 | continue 33 | self.stopped = True 34 | -------------------------------------------------------------------------------- /tts-cli/download_models.py: -------------------------------------------------------------------------------- 1 | """ 2 | Python script to download the ai models 3 | needed by linguflex from Huggingface's model hub. 4 | """ 5 | 6 | from huggingface_hub import hf_hub_download 7 | import os 8 | 9 | 10 | def create_directory(path): 11 | if not os.path.exists(path): 12 | os.makedirs(path) 13 | 14 | 15 | def create_directories(): 16 | create_directory("assets") 17 | create_directory("assets/hubert") 18 | create_directory("models") 19 | create_directory("models/rvc") 20 | create_directory("models/xtts") 21 | create_directory("models/xtts/Lasinya") 22 | 23 | 24 | def download_file( 25 | url, 26 | filename_server, 27 | path_local 28 | ): 29 | 30 | local_file = os.path.join(path_local, filename_server) 31 | if os.path.exists(local_file): 32 | print(f"File {filename_server} already exists in {path_local}.") 33 | return 34 | 35 | print(f"Downloading {filename_server} from repo {url} to {path_local}") 36 | hf_hub_download( 37 | repo_id=url, 38 | filename=filename_server, 39 | local_dir=path_local) 40 | 41 | 42 | create_directories() 43 | 44 | # download rvc base model (hubert) files 45 | print("Downloading hubert base model files") 46 | download_file( 47 | "KoljaB/RVC_Assets", "hubert_base.pt", "assets/hubert") 48 | download_file( 49 | "KoljaB/RVC_Assets", "hubert_inputs.pth", "assets/hubert") 50 | 51 | # download rvc trained model files 52 | print("Downloading rvc trained model files") 53 | download_file( 54 | "KoljaB/RVC_Models", "Lasinya.pth", "models/rvc") 55 | download_file( 56 | "KoljaB/RVC_Models", "Lasinya.index", "models/rvc") 57 | 58 | # download xtts trained model files 59 | from huggingface_hub import hf_hub_download 60 | import os 61 | 62 | def create_directory(path): 63 | if not os.path.exists(path): 64 | os.makedirs(path) 65 | 66 | def create_directories(): 67 | create_directory("assets") 68 | create_directory("assets/hubert") 69 | create_directory("models") 70 | create_directory("models/rvc") 71 | create_directory("models/xtts") 
72 | create_directory("models/xtts/v2.0.2") 73 | 74 | def download_file(url, filename_server, path_local): 75 | local_file = os.path.join(path_local, filename_server) 76 | if os.path.exists(local_file): 77 | print(f"File {filename_server} already exists in {path_local}.") 78 | return 79 | 80 | print(f"Downloading {filename_server} from repo {url} to {path_local}") 81 | hf_hub_download( 82 | repo_id=url, 83 | filename=filename_server, 84 | local_dir=path_local) 85 | 86 | create_directories() 87 | 88 | # download rvc base model (hubert) files 89 | print("Downloading hubert base model files") 90 | download_file( 91 | "KoljaB/RVC_Assets", "hubert_base.pt", "assets/hubert") 92 | download_file( 93 | "KoljaB/RVC_Assets", "hubert_inputs.pth", "assets/hubert") 94 | 95 | # download rvc trained model files 96 | print("Downloading rvc trained model files") 97 | download_file( 98 | "KoljaB/RVC_Models", "Lasinya.pth", "models/rvc") 99 | download_file( 100 | "KoljaB/RVC_Models", "Lasinya.index", "models/rvc") 101 | 102 | # download xtts v2 base model files 103 | print("Downloading XTTS v2 base model files") 104 | download_file( 105 | "coqui/XTTS-v2", "config.json", "models/xtts/v2.0.2") 106 | download_file( 107 | "coqui/XTTS-v2", "model.pth", "models/xtts/v2.0.2") 108 | download_file( 109 | "coqui/XTTS-v2", "vocab.json", "models/xtts/v2.0.2") 110 | download_file( 111 | "coqui/XTTS-v2", "speakers_xtts.pth", "models/xtts/v2.0.2") 112 | 113 | 114 | # print("Downloading xtts trained model files (Lasinya)") 115 | # download_file( 116 | # "KoljaB/XTTS_Lasinya", "config.json", "models/xtts/Lasinya") 117 | # download_file( 118 | # "KoljaB/XTTS_Lasinya", "vocab.json", "models/xtts/Lasinya") 119 | # download_file( 120 | # "KoljaB/XTTS_Lasinya", "speakers_xtts.pth", "models/xtts/Lasinya") 121 | # download_file( 122 | # "KoljaB/XTTS_Lasinya", "model.pth", "models/xtts/Lasinya") 123 | -------------------------------------------------------------------------------- /tts-cli/requirements.txt: -------------------------------------------------------------------------------- 1 | fairseq==0.12.2 2 | faiss-cpu==1.7.3 3 | tensorboardX==2.6.2.2 4 | torchcrepe==0.0.20 5 | torchfcpe==0.0.4 6 | praat-parselmouth==0.4.3 7 | pyworld==0.3.2 8 | huggingface_hub==0.24.5 9 | 10 | -------------------------------------------------------------------------------- /tts-cli/rvc/__pycache__/realtimervc.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/__pycache__/realtimervc.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/configs/__pycache__/config.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/configs/__pycache__/config.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/configs/config.json: -------------------------------------------------------------------------------- 1 | {"pth_path": "D:/lasinya1.pth", "index_path": "D:/added_IVF3778_Flat_nprobe_1_aerith_v2.index", "sg_input_device": "Mikrofon (2- ArctisX PnP Microp (MME)", "sg_output_device": "Lautsprecher (Realtek(R) Audio) (MME)", "sr_type": "sr_model", "threhold": -60.0, "pitch": 12.0, "rms_mix_rate": 0.5, "index_rate": 0.0, "block_time": 0.2, 
"crossfade_length": 0.08, "extra_time": 2.0, "n_cpu": 4.0, "use_jit": false, "use_pv": false, "f0method": "fcpe"} -------------------------------------------------------------------------------- /tts-cli/rvc/configs/v1/32k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": false, 11 | "lr_decay": 0.999875, 12 | "segment_size": 12800, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 32000, 21 | "filter_length": 1024, 22 | "hop_length": 320, 23 | "win_length": 1024, 24 | "n_mel_channels": 80, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,4,2,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16,16,4,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /tts-cli/rvc/configs/v1/40k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": false, 11 | "lr_decay": 0.999875, 12 | "segment_size": 12800, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 40000, 21 | "filter_length": 2048, 22 | "hop_length": 400, 23 | "win_length": 2048, 24 | "n_mel_channels": 125, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,10,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16,16,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /tts-cli/rvc/configs/v1/48k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": false, 11 | "lr_decay": 0.999875, 12 | "segment_size": 11520, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 48000, 21 | "filter_length": 2048, 22 | "hop_length": 480, 23 | "win_length": 2048, 24 | "n_mel_channels": 128, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | 
"hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,6,2,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16,16,4,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /tts-cli/rvc/configs/v2/32k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": false, 11 | "lr_decay": 0.999875, 12 | "segment_size": 12800, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 32000, 21 | "filter_length": 1024, 22 | "hop_length": 320, 23 | "win_length": 1024, 24 | "n_mel_channels": 80, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,8,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [20,16,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /tts-cli/rvc/configs/v2/48k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": false, 11 | "lr_decay": 0.999875, 12 | "segment_size": 17280, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 48000, 21 | "filter_length": 2048, 22 | "hop_length": 480, 23 | "win_length": 2048, 24 | "n_mel_channels": 128, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [12,10,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [24,20,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /tts-cli/rvc/i18n/i18n.py: -------------------------------------------------------------------------------- 1 | import json 2 | import locale 3 | import os 4 | 5 | 6 | def load_language_list(language): 7 | with open(f"./i18n/locale/{language}.json", "r", encoding="utf-8") as f: 8 | language_list = json.load(f) 9 | return language_list 10 | 11 | 12 | class I18nAuto: 13 | def 
__init__(self, language=None): 14 | if language in ["Auto", None]: 15 | language = locale.getdefaultlocale()[ 16 | 0 17 | ] # getlocale can't identify the system's language ((None, None)) 18 | if not os.path.exists(f"./i18n/locale/{language}.json"): 19 | language = "en_US" 20 | self.language = language 21 | self.language_map = load_language_list(language) 22 | 23 | def __call__(self, key): 24 | return self.language_map.get(key, key) 25 | 26 | def __repr__(self): 27 | return "Use Language: " + self.language 28 | -------------------------------------------------------------------------------- /tts-cli/rvc/i18n/locale_diff.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import OrderedDict 4 | 5 | # Define the standard file name 6 | standard_file = "locale/zh_CN.json" 7 | 8 | # Find all JSON files in the directory 9 | dir_path = "locale/" 10 | languages = [ 11 | os.path.join(dir_path, f) 12 | for f in os.listdir(dir_path) 13 | if f.endswith(".json") and f != standard_file 14 | ] 15 | 16 | # Load the standard file 17 | with open(standard_file, "r", encoding="utf-8") as f: 18 | standard_data = json.load(f, object_pairs_hook=OrderedDict) 19 | 20 | # Loop through each language file 21 | for lang_file in languages: 22 | # Load the language file 23 | with open(lang_file, "r", encoding="utf-8") as f: 24 | lang_data = json.load(f, object_pairs_hook=OrderedDict) 25 | 26 | # Find the difference between the language file and the standard file 27 | diff = set(standard_data.keys()) - set(lang_data.keys()) 28 | 29 | miss = set(lang_data.keys()) - set(standard_data.keys()) 30 | 31 | # Add any missing keys to the language file 32 | for key in diff: 33 | lang_data[key] = key 34 | 35 | # Del any extra keys to the language file 36 | for key in miss: 37 | del lang_data[key] 38 | 39 | # Sort the keys of the language file to match the order of the standard file 40 | lang_data = OrderedDict( 41 | sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0])) 42 | ) 43 | 44 | # Save the updated language file 45 | with open(lang_file, "w", encoding="utf-8") as f: 46 | json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True) 47 | f.write("\n") 48 | -------------------------------------------------------------------------------- /tts-cli/rvc/i18n/scan_i18n.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import glob 3 | import json 4 | from collections import OrderedDict 5 | 6 | 7 | def extract_i18n_strings(node): 8 | i18n_strings = [] 9 | 10 | if ( 11 | isinstance(node, ast.Call) 12 | and isinstance(node.func, ast.Name) 13 | and node.func.id == "i18n" 14 | ): 15 | for arg in node.args: 16 | if isinstance(arg, ast.Str): 17 | i18n_strings.append(arg.s) 18 | 19 | for child_node in ast.iter_child_nodes(node): 20 | i18n_strings.extend(extract_i18n_strings(child_node)) 21 | 22 | return i18n_strings 23 | 24 | 25 | # scan the directory for all .py files (recursively) 26 | # for each file, parse the code into an AST 27 | # for each AST, extract the i18n strings 28 | 29 | strings = [] 30 | for filename in glob.iglob("**/*.py", recursive=True): 31 | with open(filename, "r") as f: 32 | code = f.read() 33 | if "I18nAuto" in code: 34 | tree = ast.parse(code) 35 | i18n_strings = extract_i18n_strings(tree) 36 | print(filename, len(i18n_strings)) 37 | strings.extend(i18n_strings) 38 | code_keys = set(strings) 39 | """ 40 | n_i18n.py 41 | gui_v1.py 26 42 | app.py 16 
43 | infer-web.py 147 44 | scan_i18n.py 0 45 | i18n.py 0 46 | lib/train/process_ckpt.py 1 47 | """ 48 | print() 49 | print("Total unique:", len(code_keys)) 50 | 51 | 52 | standard_file = "i18n/locale/zh_CN.json" 53 | with open(standard_file, "r", encoding="utf-8") as f: 54 | standard_data = json.load(f, object_pairs_hook=OrderedDict) 55 | standard_keys = set(standard_data.keys()) 56 | 57 | # Define the standard file name 58 | unused_keys = standard_keys - code_keys 59 | print("Unused keys:", len(unused_keys)) 60 | for unused_key in unused_keys: 61 | print("\t", unused_key) 62 | 63 | missing_keys = code_keys - standard_keys 64 | print("Missing keys:", len(missing_keys)) 65 | for missing_key in missing_keys: 66 | print("\t", missing_key) 67 | 68 | code_keys_dict = OrderedDict() 69 | for s in strings: 70 | code_keys_dict[s] = s 71 | 72 | # write back 73 | with open(standard_file, "w", encoding="utf-8") as f: 74 | json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True) 75 | f.write("\n") 76 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/__pycache__/rmvpe.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/infer/lib/__pycache__/rmvpe.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/audio.py: -------------------------------------------------------------------------------- 1 | import os 2 | import traceback 3 | 4 | import librosa 5 | import numpy as np 6 | import av 7 | from io import BytesIO 8 | 9 | 10 | def wav2(i, o, format): 11 | inp = av.open(i, "rb") 12 | if format == "m4a": 13 | format = "mp4" 14 | out = av.open(o, "wb", format=format) 15 | if format == "ogg": 16 | format = "libvorbis" 17 | if format == "mp4": 18 | format = "aac" 19 | 20 | ostream = out.add_stream(format) 21 | 22 | for frame in inp.decode(audio=0): 23 | for p in ostream.encode(frame): 24 | out.mux(p) 25 | 26 | for p in ostream.encode(None): 27 | out.mux(p) 28 | 29 | out.close() 30 | inp.close() 31 | 32 | 33 | def audio2(i, o, format, sr): 34 | inp = av.open(i, "rb") 35 | out = av.open(o, "wb", format=format) 36 | if format == "ogg": 37 | format = "libvorbis" 38 | if format == "f32le": 39 | format = "pcm_f32le" 40 | 41 | ostream = out.add_stream(format, channels=1) 42 | ostream.sample_rate = sr 43 | 44 | for frame in inp.decode(audio=0): 45 | for p in ostream.encode(frame): 46 | out.mux(p) 47 | 48 | out.close() 49 | inp.close() 50 | 51 | 52 | def load_audio(file, sr): 53 | file = ( 54 | file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 55 | ) # 防止小白拷路径头尾带了空格和"和回车 56 | if os.path.exists(file) == False: 57 | raise RuntimeError( 58 | "You input a wrong audio path that does not exists, please fix it!" 
59 | ) 60 | try: 61 | with open(file, "rb") as f: 62 | with BytesIO() as out: 63 | audio2(f, out, "f32le", sr) 64 | return np.frombuffer(out.getvalue(), np.float32).flatten() 65 | 66 | except AttributeError: 67 | audio = file[1] / 32768.0 68 | if len(audio.shape) == 2: 69 | audio = np.mean(audio, -1) 70 | return librosa.resample(audio, orig_sr=file[0], target_sr=16000) 71 | 72 | except: 73 | raise RuntimeError(traceback.format_exc()) 74 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/infer_pack/__pycache__/attentions.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/infer/lib/infer_pack/__pycache__/attentions.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/infer_pack/__pycache__/commons.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/infer/lib/infer_pack/__pycache__/commons.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/infer_pack/__pycache__/models.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/infer/lib/infer_pack/__pycache__/models.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/infer_pack/__pycache__/modules.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/infer/lib/infer_pack/__pycache__/modules.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/infer_pack/__pycache__/transforms.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/infer/lib/infer_pack/__pycache__/transforms.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pyworld 3 | 4 | from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor 5 | 6 | 7 | class DioF0Predictor(F0Predictor): 8 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): 9 | self.hop_length = hop_length 10 | self.f0_min = f0_min 11 | self.f0_max = f0_max 12 | self.sampling_rate = sampling_rate 13 | 14 | def interpolate_f0(self, f0): 15 | """ 16 | 对F0进行插值处理 17 | """ 18 | 19 | data = np.reshape(f0, (f0.size, 1)) 20 | 21 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32) 22 | vuv_vector[data > 0.0] = 1.0 23 | vuv_vector[data <= 0.0] = 0.0 24 | 25 | ip_data = data 26 | 27 | frame_number = data.size 28 | last_value = 0.0 29 | for i in range(frame_number): 30 | if data[i] <= 0.0: 31 | j = i + 1 32 | for j in range(i + 1, frame_number): 33 | if data[j] > 0.0: 34 | break 35 | if j < 
frame_number - 1: 36 | if last_value > 0.0: 37 | step = (data[j] - data[i - 1]) / float(j - i) 38 | for k in range(i, j): 39 | ip_data[k] = data[i - 1] + step * (k - i + 1) 40 | else: 41 | for k in range(i, j): 42 | ip_data[k] = data[j] 43 | else: 44 | for k in range(i, frame_number): 45 | ip_data[k] = last_value 46 | else: 47 | ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝 48 | last_value = data[i] 49 | 50 | return ip_data[:, 0], vuv_vector[:, 0] 51 | 52 | def resize_f0(self, x, target_len): 53 | source = np.array(x) 54 | source[source < 0.001] = np.nan 55 | target = np.interp( 56 | np.arange(0, len(source) * target_len, len(source)) / target_len, 57 | np.arange(0, len(source)), 58 | source, 59 | ) 60 | res = np.nan_to_num(target) 61 | return res 62 | 63 | def compute_f0(self, wav, p_len=None): 64 | if p_len is None: 65 | p_len = wav.shape[0] // self.hop_length 66 | f0, t = pyworld.dio( 67 | wav.astype(np.double), 68 | fs=self.sampling_rate, 69 | f0_floor=self.f0_min, 70 | f0_ceil=self.f0_max, 71 | frame_period=1000 * self.hop_length / self.sampling_rate, 72 | ) 73 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 74 | for index, pitch in enumerate(f0): 75 | f0[index] = round(pitch, 1) 76 | return self.interpolate_f0(self.resize_f0(f0, p_len))[0] 77 | 78 | def compute_f0_uv(self, wav, p_len=None): 79 | if p_len is None: 80 | p_len = wav.shape[0] // self.hop_length 81 | f0, t = pyworld.dio( 82 | wav.astype(np.double), 83 | fs=self.sampling_rate, 84 | f0_floor=self.f0_min, 85 | f0_ceil=self.f0_max, 86 | frame_period=1000 * self.hop_length / self.sampling_rate, 87 | ) 88 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 89 | for index, pitch in enumerate(f0): 90 | f0[index] = round(pitch, 1) 91 | return self.interpolate_f0(self.resize_f0(f0, p_len)) 92 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/infer_pack/modules/F0Predictor/F0Predictor.py: -------------------------------------------------------------------------------- 1 | class F0Predictor(object): 2 | def compute_f0(self, wav, p_len): 3 | """ 4 | input: wav:[signal_length] 5 | p_len:int 6 | output: f0:[signal_length//hop_length] 7 | """ 8 | pass 9 | 10 | def compute_f0_uv(self, wav, p_len): 11 | """ 12 | input: wav:[signal_length] 13 | p_len:int 14 | output: f0:[signal_length//hop_length],uv:[signal_length//hop_length] 15 | """ 16 | pass 17 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pyworld 3 | 4 | from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor 5 | 6 | 7 | class HarvestF0Predictor(F0Predictor): 8 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): 9 | self.hop_length = hop_length 10 | self.f0_min = f0_min 11 | self.f0_max = f0_max 12 | self.sampling_rate = sampling_rate 13 | 14 | def interpolate_f0(self, f0): 15 | """ 16 | 对F0进行插值处理 17 | """ 18 | 19 | data = np.reshape(f0, (f0.size, 1)) 20 | 21 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32) 22 | vuv_vector[data > 0.0] = 1.0 23 | vuv_vector[data <= 0.0] = 0.0 24 | 25 | ip_data = data 26 | 27 | frame_number = data.size 28 | last_value = 0.0 29 | for i in range(frame_number): 30 | if data[i] <= 0.0: 31 | j = i + 1 32 | for j in range(i + 1, frame_number): 33 | if data[j] > 0.0: 34 | break 
35 | if j < frame_number - 1: 36 | if last_value > 0.0: 37 | step = (data[j] - data[i - 1]) / float(j - i) 38 | for k in range(i, j): 39 | ip_data[k] = data[i - 1] + step * (k - i + 1) 40 | else: 41 | for k in range(i, j): 42 | ip_data[k] = data[j] 43 | else: 44 | for k in range(i, frame_number): 45 | ip_data[k] = last_value 46 | else: 47 | ip_data[i] = data[i] # this copy may be unnecessary 48 | last_value = data[i] 49 | 50 | return ip_data[:, 0], vuv_vector[:, 0] 51 | 52 | def resize_f0(self, x, target_len): 53 | source = np.array(x) 54 | source[source < 0.001] = np.nan 55 | target = np.interp( 56 | np.arange(0, len(source) * target_len, len(source)) / target_len, 57 | np.arange(0, len(source)), 58 | source, 59 | ) 60 | res = np.nan_to_num(target) 61 | return res 62 | 63 | def compute_f0(self, wav, p_len=None): 64 | if p_len is None: 65 | p_len = wav.shape[0] // self.hop_length 66 | f0, t = pyworld.harvest( 67 | wav.astype(np.double), 68 | fs=self.sampling_rate, 69 | f0_ceil=self.f0_max, 70 | f0_floor=self.f0_min, 71 | frame_period=1000 * self.hop_length / self.sampling_rate, 72 | ) 73 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 74 | return self.interpolate_f0(self.resize_f0(f0, p_len))[0] 75 | 76 | def compute_f0_uv(self, wav, p_len=None): 77 | if p_len is None: 78 | p_len = wav.shape[0] // self.hop_length 79 | f0, t = pyworld.harvest( 80 | wav.astype(np.double), 81 | fs=self.sampling_rate, 82 | f0_floor=self.f0_min, 83 | f0_ceil=self.f0_max, 84 | frame_period=1000 * self.hop_length / self.sampling_rate, 85 | ) 86 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 87 | return self.interpolate_f0(self.resize_f0(f0, p_len)) 88 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import parselmouth 3 | 4 | from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor 5 | 6 | 7 | class PMF0Predictor(F0Predictor): 8 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): 9 | self.hop_length = hop_length 10 | self.f0_min = f0_min 11 | self.f0_max = f0_max 12 | self.sampling_rate = sampling_rate 13 | 14 | def interpolate_f0(self, f0): 15 | """ 16 | Interpolate the F0 over unvoiced frames 17 | """ 18 | 19 | data = np.reshape(f0, (f0.size, 1)) 20 | 21 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32) 22 | vuv_vector[data > 0.0] = 1.0 23 | vuv_vector[data <= 0.0] = 0.0 24 | 25 | ip_data = data 26 | 27 | frame_number = data.size 28 | last_value = 0.0 29 | for i in range(frame_number): 30 | if data[i] <= 0.0: 31 | j = i + 1 32 | for j in range(i + 1, frame_number): 33 | if data[j] > 0.0: 34 | break 35 | if j < frame_number - 1: 36 | if last_value > 0.0: 37 | step = (data[j] - data[i - 1]) / float(j - i) 38 | for k in range(i, j): 39 | ip_data[k] = data[i - 1] + step * (k - i + 1) 40 | else: 41 | for k in range(i, j): 42 | ip_data[k] = data[j] 43 | else: 44 | for k in range(i, frame_number): 45 | ip_data[k] = last_value 46 | else: 47 | ip_data[i] = data[i] # this copy may be unnecessary 48 | last_value = data[i] 49 | 50 | return ip_data[:, 0], vuv_vector[:, 0] 51 | 52 | def compute_f0(self, wav, p_len=None): 53 | x = wav 54 | if p_len is None: 55 | p_len = x.shape[0] // self.hop_length 56 | else: 57 | assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" 58 | time_step = self.hop_length / self.sampling_rate * 1000 59 | f0 = ( 60 | 
parselmouth.Sound(x, self.sampling_rate) 61 | .to_pitch_ac( 62 | time_step=time_step / 1000, 63 | voicing_threshold=0.6, 64 | pitch_floor=self.f0_min, 65 | pitch_ceiling=self.f0_max, 66 | ) 67 | .selected_array["frequency"] 68 | ) 69 | 70 | pad_size = (p_len - len(f0) + 1) // 2 71 | if pad_size > 0 or p_len - len(f0) - pad_size > 0: 72 | f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") 73 | f0, uv = self.interpolate_f0(f0) 74 | return f0 75 | 76 | def compute_f0_uv(self, wav, p_len=None): 77 | x = wav 78 | if p_len is None: 79 | p_len = x.shape[0] // self.hop_length 80 | else: 81 | assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" 82 | time_step = self.hop_length / self.sampling_rate * 1000 83 | f0 = ( 84 | parselmouth.Sound(x, self.sampling_rate) 85 | .to_pitch_ac( 86 | time_step=time_step / 1000, 87 | voicing_threshold=0.6, 88 | pitch_floor=self.f0_min, 89 | pitch_ceiling=self.f0_max, 90 | ) 91 | .selected_array["frequency"] 92 | ) 93 | 94 | pad_size = (p_len - len(f0) + 1) // 2 95 | if pad_size > 0 or p_len - len(f0) - pad_size > 0: 96 | f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") 97 | f0, uv = self.interpolate_f0(f0) 98 | return f0, uv 99 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/infer_pack/modules/F0Predictor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/infer/lib/infer_pack/modules/F0Predictor/__init__.py -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/jit/__init__.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | import pickle 3 | import time 4 | import torch 5 | from tqdm import tqdm 6 | from collections import OrderedDict 7 | 8 | 9 | def load_inputs(path, device, is_half=False): 10 | parm = torch.load(path, map_location=torch.device("cpu")) 11 | for key in parm.keys(): 12 | parm[key] = parm[key].to(device) 13 | if is_half and parm[key].dtype == torch.float32: 14 | parm[key] = parm[key].half() 15 | elif not is_half and parm[key].dtype == torch.float16: 16 | parm[key] = parm[key].float() 17 | return parm 18 | 19 | 20 | def benchmark( 21 | model, inputs_path, device=torch.device("cpu"), epoch=1000, is_half=False 22 | ): 23 | parm = load_inputs(inputs_path, device, is_half) 24 | total_ts = 0.0 25 | bar = tqdm(range(epoch)) 26 | for i in bar: 27 | start_time = time.perf_counter() 28 | o = model(**parm) 29 | total_ts += time.perf_counter() - start_time 30 | print(f"num_epoch: {epoch} | avg time(ms): {(total_ts*1000)/epoch}") 31 | 32 | 33 | def jit_warm_up(model, inputs_path, device=torch.device("cpu"), epoch=5, is_half=False): 34 | benchmark(model, inputs_path, device, epoch=epoch, is_half=is_half) 35 | 36 | 37 | def to_jit_model( 38 | model_path, 39 | model_type: str, 40 | mode: str = "trace", 41 | inputs_path: str = None, 42 | device=torch.device("cpu"), 43 | is_half=False, 44 | ): 45 | model = None 46 | if model_type.lower() == "synthesizer": 47 | from .get_synthesizer import get_synthesizer 48 | 49 | model, _ = get_synthesizer(model_path, device) 50 | model.forward = model.infer 51 | elif model_type.lower() == "rmvpe": 52 | from .get_rmvpe import get_rmvpe 53 | 54 | model = get_rmvpe(model_path, device) 55 | elif model_type.lower() == "hubert": 56 | from .get_hubert 
import get_hubert_model 57 | 58 | model = get_hubert_model(model_path, device) 59 | model.forward = model.infer 60 | else: 61 | raise ValueError(f"No model type named {model_type}") 62 | model = model.eval() 63 | model = model.half() if is_half else model.float() 64 | if mode == "trace": 65 | assert inputs_path is not None 66 | inputs = load_inputs(inputs_path, device, is_half) 67 | model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs) 68 | elif mode == "script": 69 | model_jit = torch.jit.script(model) 70 | model_jit.to(device) 71 | model_jit = model_jit.half() if is_half else model_jit.float() 72 | # model = model.half() if is_half else model.float() 73 | return (model, model_jit) 74 | 75 | 76 | def export( 77 | model: torch.nn.Module, 78 | mode: str = "trace", 79 | inputs: dict = None, 80 | device=torch.device("cpu"), 81 | is_half: bool = False, 82 | ) -> dict: 83 | model = model.half() if is_half else model.float() 84 | model.eval() 85 | if mode == "trace": 86 | assert inputs is not None 87 | model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs) 88 | elif mode == "script": 89 | model_jit = torch.jit.script(model) 90 | model_jit.to(device) 91 | model_jit = model_jit.half() if is_half else model_jit.float() 92 | buffer = BytesIO() 93 | # model_jit=model_jit.cpu() 94 | torch.jit.save(model_jit, buffer) 95 | del model_jit 96 | cpt = OrderedDict() 97 | cpt["model"] = buffer.getvalue() 98 | cpt["is_half"] = is_half 99 | return cpt 100 | 101 | 102 | def load(path: str): 103 | with open(path, "rb") as f: 104 | return pickle.load(f) 105 | 106 | 107 | def save(ckpt: dict, save_path: str): 108 | with open(save_path, "wb") as f: 109 | pickle.dump(ckpt, f) 110 | 111 | 112 | def rmvpe_jit_export( 113 | model_path: str, 114 | mode: str = "script", 115 | inputs_path: str = None, 116 | save_path: str = None, 117 | device=torch.device("cpu"), 118 | is_half=False, 119 | ): 120 | if not save_path: 121 | save_path = model_path.rstrip(".pth") 122 | save_path += ".half.jit" if is_half else ".jit" 123 | if "cuda" in str(device) and ":" not in str(device): 124 | device = torch.device("cuda:0") 125 | from .get_rmvpe import get_rmvpe 126 | 127 | model = get_rmvpe(model_path, device) 128 | inputs = None 129 | if mode == "trace": 130 | inputs = load_inputs(inputs_path, device, is_half) 131 | ckpt = export(model, mode, inputs, device, is_half) 132 | ckpt["device"] = str(device) 133 | save(ckpt, save_path) 134 | return ckpt 135 | 136 | 137 | def synthesizer_jit_export( 138 | model_path: str, 139 | mode: str = "script", 140 | inputs_path: str = None, 141 | save_path: str = None, 142 | device=torch.device("cpu"), 143 | is_half=False, 144 | ): 145 | if not save_path: 146 | save_path = model_path.rstrip(".pth") 147 | save_path += ".half.jit" if is_half else ".jit" 148 | if "cuda" in str(device) and ":" not in str(device): 149 | device = torch.device("cuda:0") 150 | from .get_synthesizer import get_synthesizer 151 | 152 | model, cpt = get_synthesizer(model_path, device) 153 | assert isinstance(cpt, dict) 154 | model.forward = model.infer 155 | inputs = None 156 | if mode == "trace": 157 | inputs = load_inputs(inputs_path, device, is_half) 158 | ckpt = export(model, mode, inputs, device, is_half) 159 | cpt.pop("weight") 160 | cpt["model"] = ckpt["model"] 161 | cpt["device"] = device 162 | save(cpt, save_path) 163 | return cpt 164 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/jit/__pycache__/__init__.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/infer/lib/jit/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/jit/__pycache__/get_synthesizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/infer/lib/jit/__pycache__/get_synthesizer.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/jit/get_rmvpe.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_rmvpe(model_path="rvc/assets/rmvpe/rmvpe.pt", device=torch.device("cpu")): 5 | from infer.lib.rmvpe import E2E 6 | 7 | model = E2E(4, 1, (2, 2)) 8 | ckpt = torch.load(model_path, map_location=device) 9 | model.load_state_dict(ckpt) 10 | model.eval() 11 | model = model.to(device) 12 | return model 13 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/jit/get_synthesizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_synthesizer(pth_path, device=torch.device("cpu")): 5 | from rvc.infer.lib.infer_pack.models import ( 6 | SynthesizerTrnMs256NSFsid, 7 | SynthesizerTrnMs256NSFsid_nono, 8 | SynthesizerTrnMs768NSFsid, 9 | SynthesizerTrnMs768NSFsid_nono, 10 | ) 11 | 12 | cpt = torch.load(pth_path, map_location=torch.device("cpu")) 13 | # tgt_sr = cpt["config"][-1] 14 | cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] 15 | if_f0 = cpt.get("f0", 1) 16 | version = cpt.get("version", "v1") 17 | if version == "v1": 18 | if if_f0 == 1: 19 | net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=False) 20 | else: 21 | net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) 22 | elif version == "v2": 23 | if if_f0 == 1: 24 | net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=False) 25 | else: 26 | net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) 27 | del net_g.enc_q 28 | # net_g.forward = net_g.infer 29 | # ckpt = {} 30 | # ckpt["config"] = cpt["config"] 31 | # ckpt["f0"] = if_f0 32 | # ckpt["version"] = version 33 | # ckpt["info"] = cpt.get("info", "0epoch") 34 | net_g.load_state_dict(cpt["weight"], strict=False) 35 | net_g = net_g.float() 36 | net_g.eval().to(device) 37 | net_g.remove_weight_norm() 38 | return net_g, cpt 39 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/train/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def feature_loss(fmap_r, fmap_g): 5 | loss = 0 6 | for dr, dg in zip(fmap_r, fmap_g): 7 | for rl, gl in zip(dr, dg): 8 | rl = rl.float().detach() 9 | gl = gl.float() 10 | loss += torch.mean(torch.abs(rl - gl)) 11 | 12 | return loss * 2 13 | 14 | 15 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 16 | loss = 0 17 | r_losses = [] 18 | g_losses = [] 19 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 20 | dr = dr.float() 21 | dg = dg.float() 22 | r_loss = torch.mean((1 - dr) ** 2) 23 | g_loss = torch.mean(dg**2) 24 | loss += r_loss + g_loss 25 | r_losses.append(r_loss.item()) 26 | g_losses.append(g_loss.item()) 
27 | 28 | return loss, r_losses, g_losses 29 | 30 | 31 | def generator_loss(disc_outputs): 32 | loss = 0 33 | gen_losses = [] 34 | for dg in disc_outputs: 35 | dg = dg.float() 36 | l = torch.mean((1 - dg) ** 2) 37 | gen_losses.append(l) 38 | loss += l 39 | 40 | return loss, gen_losses 41 | 42 | 43 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 44 | """ 45 | z_p, logs_q: [b, h, t_t] 46 | m_p, logs_p: [b, h, t_t] 47 | """ 48 | z_p = z_p.float() 49 | logs_q = logs_q.float() 50 | m_p = m_p.float() 51 | logs_p = logs_p.float() 52 | z_mask = z_mask.float() 53 | 54 | kl = logs_p - logs_q - 0.5 55 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 56 | kl = torch.sum(kl * z_mask) 57 | l = kl / torch.sum(z_mask) 58 | return l 59 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/train/mel_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data 3 | from librosa.filters import mel as librosa_mel_fn 4 | import logging 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | MAX_WAV_VALUE = 32768.0 9 | 10 | 11 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 12 | """ 13 | PARAMS 14 | ------ 15 | C: compression factor 16 | """ 17 | return torch.log(torch.clamp(x, min=clip_val) * C) 18 | 19 | 20 | def dynamic_range_decompression_torch(x, C=1): 21 | """ 22 | PARAMS 23 | ------ 24 | C: compression factor used to compress 25 | """ 26 | return torch.exp(x) / C 27 | 28 | 29 | def spectral_normalize_torch(magnitudes): 30 | return dynamic_range_compression_torch(magnitudes) 31 | 32 | 33 | def spectral_de_normalize_torch(magnitudes): 34 | return dynamic_range_decompression_torch(magnitudes) 35 | 36 | 37 | # Reusable banks 38 | mel_basis = {} 39 | hann_window = {} 40 | 41 | 42 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 43 | """Convert waveform into Linear-frequency Linear-amplitude spectrogram. 
44 | 45 | Args: 46 | y :: (B, T) - Audio waveforms 47 | n_fft 48 | sampling_rate 49 | hop_size 50 | win_size 51 | center 52 | Returns: 53 | :: (B, Freq, Frame) - Linear-frequency Linear-amplitude spectrogram 54 | """ 55 | 56 | # Window - Cache if needed 57 | global hann_window 58 | dtype_device = str(y.dtype) + "_" + str(y.device) 59 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 60 | if wnsize_dtype_device not in hann_window: 61 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 62 | dtype=y.dtype, device=y.device 63 | ) 64 | 65 | # Padding 66 | y = torch.nn.functional.pad( 67 | y.unsqueeze(1), 68 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 69 | mode="reflect", 70 | ) 71 | y = y.squeeze(1) 72 | 73 | # Complex Spectrogram :: (B, T) -> (B, Freq, Frame, RealComplex=2) 74 | spec = torch.stft( 75 | y, 76 | n_fft, 77 | hop_length=hop_size, 78 | win_length=win_size, 79 | window=hann_window[wnsize_dtype_device], 80 | center=center, 81 | pad_mode="reflect", 82 | normalized=False, 83 | onesided=True, 84 | return_complex=True, 85 | ) 86 | 87 | # Linear-frequency Linear-amplitude spectrogram :: (B, Freq, Frame, RealComplex=2) -> (B, Freq, Frame) 88 | spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6) 89 | return spec 90 | 91 | 92 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 93 | # MelBasis - Cache if needed 94 | global mel_basis 95 | dtype_device = str(spec.dtype) + "_" + str(spec.device) 96 | fmax_dtype_device = str(fmax) + "_" + dtype_device 97 | if fmax_dtype_device not in mel_basis: 98 | mel = librosa_mel_fn( 99 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax 100 | ) 101 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 102 | dtype=spec.dtype, device=spec.device 103 | ) 104 | 105 | # Mel-frequency Log-amplitude spectrogram :: (B, Freq=num_mels, Frame) 106 | melspec = torch.matmul(mel_basis[fmax_dtype_device], spec) 107 | melspec = spectral_normalize_torch(melspec) 108 | return melspec 109 | 110 | 111 | def mel_spectrogram_torch( 112 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False 113 | ): 114 | """Convert waveform into Mel-frequency Log-amplitude spectrogram. 115 | 116 | Args: 117 | y :: (B, T) - Waveforms 118 | Returns: 119 | melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram 120 | """ 121 | # Linear-frequency Linear-amplitude spectrogram :: (B, T) -> (B, Freq, Frame) 122 | spec = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center) 123 | 124 | # Mel-frequency Log-amplitude spectrogram :: (B, Freq, Frame) -> (B, Freq=num_mels, Frame) 125 | melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax) 126 | 127 | return melspec 128 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . 
import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py: -------------------------------------------------------------------------------- 1 | import torch 2 
| import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | 
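The `layers_*KB.py` variants above all expose the same small building blocks (Conv2DBNActiv, Encoder, Decoder, ASPPModule) that the `nets_*` files further below assemble into cascaded U-Nets. A minimal shape-check sketch of the Encoder/Decoder contract, assuming the import path below resolves and that `spec_utils.crop_center` leaves the skip tensor untouched when it already matches the upsampled size:

```python
# Sketch only; the import path and crop_center behaviour are assumptions.
import torch
from rvc.infer.lib.uvr5_pack.lib_v5 import layers_123821KB as layers

x = torch.randn(1, 2, 512, 256)            # (batch, channels, freq bins, frames)

enc = layers.Encoder(2, 32, ksize=3, stride=2, pad=1)
dec = layers.Decoder(32 + 32, 32, ksize=3, stride=1, pad=1)

h, skip = enc(x)        # conv1 keeps the input size (the skip), conv2 halves H and W
print(skip.shape)       # torch.Size([1, 32, 512, 256])
print(h.shape)          # torch.Size([1, 32, 256, 128])

y = dec(h, skip)        # upsample x2, crop skip to match, concat on channels, conv
print(y.shape)          # torch.Size([1, 32, 512, 256])
```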
-------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.conv6 = SeperableConv2DBNActiv( 104 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 105 | ) 106 | self.conv7 = SeperableConv2DBNActiv( 107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 108 | ) 109 | self.bottleneck = nn.Sequential( 110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 111 | ) 112 | 113 | def forward(self, x): 114 | _, _, h, w = x.size() 115 | feat1 = F.interpolate( 116 | 
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 117 | ) 118 | feat2 = self.conv2(x) 119 | feat3 = self.conv3(x) 120 | feat4 = self.conv4(x) 121 | feat5 = self.conv5(x) 122 | feat6 = self.conv6(x) 123 | feat7 = self.conv7(x) 124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) 125 | bottle = self.bottleneck(out) 126 | return bottle 127 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.conv6 = SeperableConv2DBNActiv( 104 | nin, nin, 3, 1, 
dilations[2], dilations[2], activ=activ 105 | ) 106 | self.conv7 = SeperableConv2DBNActiv( 107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 108 | ) 109 | self.bottleneck = nn.Sequential( 110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 111 | ) 112 | 113 | def forward(self, x): 114 | _, _, h, w = x.size() 115 | feat1 = F.interpolate( 116 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 117 | ) 118 | feat2 = self.conv2(x) 119 | feat3 = self.conv3(x) 120 | feat4 = self.conv4(x) 121 | feat5 = self.conv5(x) 122 | feat6 = self.conv6(x) 123 | feat7 = self.conv7(x) 124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) 125 | bottle = self.bottleneck(out) 126 | return bottle 127 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = 
SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.conv6 = SeperableConv2DBNActiv( 104 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 105 | ) 106 | self.conv7 = SeperableConv2DBNActiv( 107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 108 | ) 109 | self.bottleneck = nn.Sequential( 110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 111 | ) 112 | 113 | def forward(self, x): 114 | _, _, h, w = x.size() 115 | feat1 = F.interpolate( 116 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 117 | ) 118 | feat2 = self.conv2(x) 119 | feat3 = self.conv3(x) 120 | feat4 = self.conv4(x) 121 | feat5 = self.conv5(x) 122 | feat6 = self.conv6(x) 123 | feat7 = self.conv7(x) 124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) 125 | bottle = self.bottleneck(out) 126 | return bottle 127 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/layers_new.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class Encoder(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 31 | super(Encoder, self).__init__() 32 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ) 33 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) 34 | 35 | def __call__(self, x): 36 | h = self.conv1(x) 37 | h = self.conv2(h) 38 | 39 | return h 40 | 41 | 42 | class Decoder(nn.Module): 43 | def __init__( 44 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 45 | ): 46 | super(Decoder, self).__init__() 47 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 48 | # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) 49 | self.dropout = nn.Dropout2d(0.1) if dropout else None 50 | 51 | def __call__(self, x, skip=None): 52 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 53 | 54 | if skip is not None: 55 | skip = spec_utils.crop_center(skip, x) 56 | x = torch.cat([x, skip], dim=1) 57 | 58 | h = self.conv1(x) 59 | # h = self.conv2(h) 60 | 61 | if self.dropout is not None: 62 | h = self.dropout(h) 63 | 64 | return h 65 | 66 | 67 | class ASPPModule(nn.Module): 68 | def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False): 69 | super(ASPPModule, self).__init__() 70 | self.conv1 = nn.Sequential( 71 | nn.AdaptiveAvgPool2d((1, None)), 72 | Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ), 73 | ) 74 | self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ) 75 | self.conv3 = Conv2DBNActiv( 76 | nin, nout, 3, 1, dilations[0], dilations[0], 
activ=activ 77 | ) 78 | self.conv4 = Conv2DBNActiv( 79 | nin, nout, 3, 1, dilations[1], dilations[1], activ=activ 80 | ) 81 | self.conv5 = Conv2DBNActiv( 82 | nin, nout, 3, 1, dilations[2], dilations[2], activ=activ 83 | ) 84 | self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ) 85 | self.dropout = nn.Dropout2d(0.1) if dropout else None 86 | 87 | def forward(self, x): 88 | _, _, h, w = x.size() 89 | feat1 = F.interpolate( 90 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 91 | ) 92 | feat2 = self.conv2(x) 93 | feat3 = self.conv3(x) 94 | feat4 = self.conv4(x) 95 | feat5 = self.conv5(x) 96 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 97 | out = self.bottleneck(out) 98 | 99 | if self.dropout is not None: 100 | out = self.dropout(out) 101 | 102 | return out 103 | 104 | 105 | class LSTMModule(nn.Module): 106 | def __init__(self, nin_conv, nin_lstm, nout_lstm): 107 | super(LSTMModule, self).__init__() 108 | self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0) 109 | self.lstm = nn.LSTM( 110 | input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True 111 | ) 112 | self.dense = nn.Sequential( 113 | nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU() 114 | ) 115 | 116 | def forward(self, x): 117 | N, _, nbins, nframes = x.size() 118 | h = self.conv(x)[:, 0] # N, nbins, nframes 119 | h = h.permute(2, 0, 1) # nframes, N, nbins 120 | h, _ = self.lstm(h) 121 | h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins 122 | h = h.reshape(nframes, N, 1, nbins) 123 | h = h.permute(1, 2, 3, 0) 124 | 125 | return h 126 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/model_param_init.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pathlib 4 | 5 | default_param = {} 6 | default_param["bins"] = 768 7 | default_param["unstable_bins"] = 9 # training only 8 | default_param["reduction_bins"] = 762 # training only 9 | default_param["sr"] = 44100 10 | default_param["pre_filter_start"] = 757 11 | default_param["pre_filter_stop"] = 768 12 | default_param["band"] = {} 13 | 14 | 15 | default_param["band"][1] = { 16 | "sr": 11025, 17 | "hl": 128, 18 | "n_fft": 960, 19 | "crop_start": 0, 20 | "crop_stop": 245, 21 | "lpf_start": 61, # inference only 22 | "res_type": "polyphase", 23 | } 24 | 25 | default_param["band"][2] = { 26 | "sr": 44100, 27 | "hl": 512, 28 | "n_fft": 1536, 29 | "crop_start": 24, 30 | "crop_stop": 547, 31 | "hpf_start": 81, # inference only 32 | "res_type": "sinc_best", 33 | } 34 | 35 | 36 | def int_keys(d): 37 | r = {} 38 | for k, v in d: 39 | if k.isdigit(): 40 | k = int(k) 41 | r[k] = v 42 | return r 43 | 44 | 45 | class ModelParameters(object): 46 | def __init__(self, config_path=""): 47 | if ".pth" == pathlib.Path(config_path).suffix: 48 | import zipfile 49 | 50 | with zipfile.ZipFile(config_path, "r") as zip: 51 | self.param = json.loads( 52 | zip.read("param.json"), object_pairs_hook=int_keys 53 | ) 54 | elif ".json" == pathlib.Path(config_path).suffix: 55 | with open(config_path, "r") as f: 56 | self.param = json.loads(f.read(), object_pairs_hook=int_keys) 57 | else: 58 | self.param = default_param 59 | 60 | for k in [ 61 | "mid_side", 62 | "mid_side_b", 63 | "mid_side_b2", 64 | "stereo_w", 65 | "stereo_n", 66 | "reverse", 67 | ]: 68 | if not k in self.param: 69 | self.param[k] = False 70 | 
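`ModelParameters` above is how the JSON presets in `modelparams/` (listed next) are consumed: the file is parsed with `int_keys` so the band numbers become integers, and the stereo/mid-side flags are filled in as `False` when a preset omits them. A small usage sketch, with the import path and preset path as assumptions based on this repo layout:

```python
# Sketch only; the import path and file path below are assumptions.
from rvc.infer.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters

mp = ModelParameters("tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json")

print(mp.param["sr"])                        # sample rate of the full mix (44100)
for band, cfg in sorted(mp.param["band"].items()):
    # band keys are ints thanks to int_keys; each band has its own STFT settings
    print(band, cfg["sr"], cfg["n_fft"], cfg["hl"], cfg["res_type"])
print(mp.param["mid_side"])                  # missing flags default to False
```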
-------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 16000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 16000, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 32000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "kaiser_fast" 14 | } 15 | }, 16 | "sr": 32000, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 33075, 8 | "hl": 384, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 33075, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 1024, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 256, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 256, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 256, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 256, 18 | "pre_filter_stop": 256 19 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } 
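Reading these presets: each band entry fixes an STFT over one resampled copy of the input, and the crop and filter indices appear to be frequency-bin indices of that STFT. A rough arithmetic sketch for the 1band_sr44100_hl512 preset above (plain Python, not code from the repo):

```python
# Back-of-the-envelope numbers for the 1band_sr44100_hl512 preset above.
sr, n_fft, hl = 44100, 2048, 512

bins = n_fft // 2              # 1024 usable bins, matching "bins" / "crop_stop"
hz_per_bin = sr / n_fft        # ~21.5 Hz of spectrum per bin
frames_per_sec = sr / hl       # ~86.1 STFT frames per second of audio

print(bins, round(hz_per_bin, 1), round(frames_per_sec, 1))
```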
-------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 700, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 700 19 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 118, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 32000, 18 | "hl": 352, 19 | "n_fft": 1024, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 44, 23 | "hpf_stop": 23, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 32000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } 31 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 512, 3 | "unstable_bins": 7, 4 | "reduction_bins": 510, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 160, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 192, 12 | "lpf_start": 41, 13 | "lpf_stop": 139, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 44100, 18 | "hl": 640, 19 | "n_fft": 1024, 20 | "crop_start": 10, 21 | "crop_stop": 320, 22 | "hpf_start": 47, 23 | "hpf_stop": 15, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 44100, 28 | "pre_filter_start": 510, 29 | "pre_filter_stop": 512 30 | } 31 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 240, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 48000, 18 | "hl": 528, 19 | "n_fft": 1536, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 82, 23 | "hpf_stop": 22, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 48000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 5, 4 | "reduction_bins": 733, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 278, 12 | "lpf_start": 28, 13 | "lpf_stop": 140, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 22050, 18 | "hl": 256, 19 | "n_fft": 768, 
20 | "crop_start": 14, 21 | "crop_stop": 322, 22 | "hpf_start": 70, 23 | "hpf_stop": 14, 24 | "lpf_start": 283, 25 | "lpf_stop": 314, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 44100, 30 | "hl": 512, 31 | "n_fft": 768, 32 | "crop_start": 131, 33 | "crop_stop": 313, 34 | "hpf_start": 154, 35 | "hpf_stop": 141, 36 | "res_type": "sinc_medium" 37 | } 38 | }, 39 | "sr": 44100, 40 | "pre_filter_start": 757, 41 | "pre_filter_stop": 768 42 | } 43 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side": true, 3 | "bins": 768, 4 | "unstable_bins": 5, 5 | "reduction_bins": 733, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 768, 11 | "crop_start": 0, 12 | "crop_stop": 278, 13 | "lpf_start": 28, 14 | "lpf_stop": 140, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 256, 20 | "n_fft": 768, 21 | "crop_start": 14, 22 | "crop_stop": 322, 23 | "hpf_start": 70, 24 | "hpf_stop": 14, 25 | "lpf_start": 283, 26 | "lpf_stop": 314, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 512, 32 | "n_fft": 768, 33 | "crop_start": 131, 34 | "crop_stop": 313, 35 | "hpf_start": 154, 36 | "hpf_stop": 141, 37 | "res_type": "sinc_medium" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 757, 42 | "pre_filter_stop": 768 43 | } 44 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 640, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 187, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 768, 21 | "crop_start": 0, 22 | "crop_stop": 212, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 174, 26 | "lpf_stop": 209, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 640, 33 | "crop_start": 66, 34 | "crop_stop": 307, 35 | "hpf_start": 86, 36 | "hpf_stop": 72, 37 | "res_type": "kaiser_fast" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 639, 42 | "pre_filter_stop": 640 43 | } 44 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 668, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 1024, 10 | "crop_start": 0, 11 | "crop_stop": 186, 12 | "lpf_start": 37, 13 | "lpf_stop": 73, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 11025, 18 | "hl": 128, 19 | "n_fft": 512, 20 | "crop_start": 4, 21 | "crop_stop": 185, 22 | "hpf_start": 36, 23 | "hpf_stop": 18, 24 | "lpf_start": 93, 25 | "lpf_stop": 185, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 22050, 30 | "hl": 256, 31 | "n_fft": 512, 32 | "crop_start": 46, 33 | "crop_stop": 186, 34 | "hpf_start": 93, 35 | "hpf_stop": 46, 36 | "lpf_start": 164, 37 | "lpf_stop": 186, 38 | "res_type": "polyphase" 
39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 512, 43 | "n_fft": 768, 44 | "crop_start": 121, 45 | "crop_stop": 382, 46 | "hpf_start": 138, 47 | "hpf_stop": 123, 48 | "res_type": "sinc_medium" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 740, 53 | "pre_filter_stop": 768 54 | } 55 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "mid_side": true, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } 56 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 
24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json: -------------------------------------------------------------------------------- 1 | { 2 | "reverse": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json: -------------------------------------------------------------------------------- 1 | { 2 | "stereo_w": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | 
"hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "convert_channels": "stereo_n", 49 | "res_type": "kaiser_fast" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 668, 54 | "pre_filter_stop": 672 55 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 530, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- 
/tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 1280, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 2048, 11 | "crop_start": 0, 12 | "crop_stop": 374, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 1536, 21 | "crop_start": 0, 22 | "crop_stop": 424, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 348, 26 | "lpf_stop": 418, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 1280, 33 | "crop_start": 132, 34 | "crop_stop": 614, 35 | "hpf_start": 172, 36 | "hpf_stop": 144, 37 | "res_type": "polyphase" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 1280, 42 | "pre_filter_stop": 1280 43 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/nets.py: -------------------------------------------------------------------------------- 1 | import layers 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | from . import spec_utils 7 | 8 | 9 | class BaseASPPNet(nn.Module): 10 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 11 | super(BaseASPPNet, self).__init__() 12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 16 | 17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 18 | 19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 23 | 24 | def __call__(self, x): 25 | h, e1 = self.enc1(x) 26 | h, e2 = self.enc2(h) 27 | h, e3 = self.enc3(h) 28 | h, e4 = self.enc4(h) 29 | 30 | h = self.aspp(h) 31 | 32 | h = self.dec4(h, e4) 33 | h = self.dec3(h, e3) 34 | h = self.dec2(h, e2) 35 | h = self.dec1(h, e1) 36 | 37 | return h 38 | 39 | 40 | class CascadedASPPNet(nn.Module): 41 | def __init__(self, n_fft): 42 | super(CascadedASPPNet, self).__init__() 43 | self.stg1_low_band_net = BaseASPPNet(2, 16) 44 | self.stg1_high_band_net = BaseASPPNet(2, 16) 45 | 46 | self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) 47 | self.stg2_full_band_net = BaseASPPNet(8, 16) 48 | 49 | self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 50 | self.stg3_full_band_net = BaseASPPNet(16, 32) 51 | 52 | self.out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) 54 | self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) 55 | 56 | self.max_bin = n_fft // 2 57 | self.output_bin = n_fft // 2 + 1 58 | 59 | self.offset = 128 60 | 61 | def forward(self, x, aggressiveness=None): 62 | mix = x.detach() 63 | x = x.clone() 64 | 65 | x = x[:, :, : self.max_bin] 66 | 67 | bandw = x.size()[2] // 2 68 | aux1 = torch.cat( 69 | [ 70 | self.stg1_low_band_net(x[:, :, :bandw]), 71 | self.stg1_high_band_net(x[:, :, bandw:]), 72 | ], 73 | dim=2, 74 | ) 75 | 76 | h = torch.cat([x, aux1], dim=1) 77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 78 | 79 | h = torch.cat([x, aux1, aux2], dim=1) 80 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 81 | 82 | mask = 
torch.sigmoid(self.out(h)) 83 | mask = F.pad( 84 | input=mask, 85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 86 | mode="replicate", 87 | ) 88 | 89 | if self.training: 90 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 91 | aux1 = F.pad( 92 | input=aux1, 93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 94 | mode="replicate", 95 | ) 96 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 97 | aux2 = F.pad( 98 | input=aux2, 99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 100 | mode="replicate", 101 | ) 102 | return mask * mix, aux1 * mix, aux2 * mix 103 | else: 104 | if aggressiveness: 105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 106 | mask[:, :, : aggressiveness["split_bin"]], 107 | 1 + aggressiveness["value"] / 3, 108 | ) 109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 110 | mask[:, :, aggressiveness["split_bin"] :], 111 | 1 + aggressiveness["value"], 112 | ) 113 | 114 | return mask * mix 115 | 116 | def predict(self, x_mag, aggressiveness=None): 117 | h = self.forward(x_mag, aggressiveness) 118 | 119 | if self.offset > 0: 120 | h = h[:, :, :, self.offset : -self.offset] 121 | assert h.size()[3] > 0 122 | 123 | return h 124 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_123821KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 32) 43 | self.stg1_high_band_net = BaseASPPNet(2, 32) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(16, 32) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(32, 64) 50 | 51 | self.out = nn.Conv2d(64, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = 
torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_123821KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 32) 43 | self.stg1_high_band_net = BaseASPPNet(2, 32) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(16, 32) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(32, 64) 50 | 51 | self.out = nn.Conv2d(64, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 
66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . 
import layers_33966KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16, 32)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 16) 43 | self.stg1_high_band_net = BaseASPPNet(2, 16) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(8, 16) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(16, 32) 50 | 51 | self.out = nn.Conv2d(32, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- 
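Note: the nets_*KB.py variants in this directory differ only in channel widths and in which layers_* module they import; they all expose the same CascadedASPPNet(n_fft) interface. forward() takes a two-channel magnitude spectrogram of shape (batch, 2, n_fft // 2 + 1, frames) and returns a masked mix, and predict() additionally trims model.offset frames from both ends of the time axis. The usage sketch below is a minimal illustration, not part of the repo: the package path is an assumption mirroring the import style used elsewhere in these files, and the dummy shapes and aggressiveness keys are taken from the forward() code above.

import torch

# assumed package path (repo root on sys.path), mirroring imports used elsewhere here
from infer.lib.uvr5_pack.lib_v5.nets_33966KB import CascadedASPPNet

n_fft = 2048
model = CascadedASPPNet(n_fft)
model.eval()

# dummy magnitude spectrogram: (batch, 2 channels, n_fft // 2 + 1 bins, frames);
# frames must exceed 2 * model.offset (= 256 here) or predict() hits its assert
x_mag = torch.rand(1, 2, n_fft // 2 + 1, 512)

with torch.no_grad():
    # optional aggressiveness dict sharpens the mask below/above split_bin
    pred = model.predict(x_mag, aggressiveness={"split_bin": 256, "value": 0.1})

print(pred.shape)  # torch.Size([1, 2, 1025, 256]) - time axis trimmed by the offset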
/tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | from . import layers_537238KB as layers 7 | 8 | 9 | class BaseASPPNet(nn.Module): 10 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 11 | super(BaseASPPNet, self).__init__() 12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 16 | 17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 18 | 19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 23 | 24 | def __call__(self, x): 25 | h, e1 = self.enc1(x) 26 | h, e2 = self.enc2(h) 27 | h, e3 = self.enc3(h) 28 | h, e4 = self.enc4(h) 29 | 30 | h = self.aspp(h) 31 | 32 | h = self.dec4(h, e4) 33 | h = self.dec3(h, e3) 34 | h = self.dec2(h, e2) 35 | h = self.dec1(h, e1) 36 | 37 | return h 38 | 39 | 40 | class CascadedASPPNet(nn.Module): 41 | def __init__(self, n_fft): 42 | super(CascadedASPPNet, self).__init__() 43 | self.stg1_low_band_net = BaseASPPNet(2, 64) 44 | self.stg1_high_band_net = BaseASPPNet(2, 64) 45 | 46 | self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 47 | self.stg2_full_band_net = BaseASPPNet(32, 64) 48 | 49 | self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) 50 | self.stg3_full_band_net = BaseASPPNet(64, 128) 51 | 52 | self.out = nn.Conv2d(128, 2, 1, bias=False) 53 | self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) 54 | self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) 55 | 56 | self.max_bin = n_fft // 2 57 | self.output_bin = n_fft // 2 + 1 58 | 59 | self.offset = 128 60 | 61 | def forward(self, x, aggressiveness=None): 62 | mix = x.detach() 63 | x = x.clone() 64 | 65 | x = x[:, :, : self.max_bin] 66 | 67 | bandw = x.size()[2] // 2 68 | aux1 = torch.cat( 69 | [ 70 | self.stg1_low_band_net(x[:, :, :bandw]), 71 | self.stg1_high_band_net(x[:, :, bandw:]), 72 | ], 73 | dim=2, 74 | ) 75 | 76 | h = torch.cat([x, aux1], dim=1) 77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 78 | 79 | h = torch.cat([x, aux1, aux2], dim=1) 80 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 81 | 82 | mask = torch.sigmoid(self.out(h)) 83 | mask = F.pad( 84 | input=mask, 85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 86 | mode="replicate", 87 | ) 88 | 89 | if self.training: 90 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 91 | aux1 = F.pad( 92 | input=aux1, 93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 94 | mode="replicate", 95 | ) 96 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 97 | aux2 = F.pad( 98 | input=aux2, 99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 100 | mode="replicate", 101 | ) 102 | return mask * mix, aux1 * mix, aux2 * mix 103 | else: 104 | if aggressiveness: 105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 106 | mask[:, :, : aggressiveness["split_bin"]], 107 | 1 + aggressiveness["value"] / 3, 108 | ) 109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 110 | mask[:, :, aggressiveness["split_bin"] :], 111 | 1 + aggressiveness["value"], 112 | ) 113 | 114 | return mask * mix 115 | 116 | def predict(self, x_mag, aggressiveness=None): 117 | h = self.forward(x_mag, 
aggressiveness) 118 | 119 | if self.offset > 0: 120 | h = h[:, :, :, self.offset : -self.offset] 121 | assert h.size()[3] > 0 122 | 123 | return h 124 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | from . import layers_537238KB as layers 7 | 8 | 9 | class BaseASPPNet(nn.Module): 10 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 11 | super(BaseASPPNet, self).__init__() 12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 16 | 17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 18 | 19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 23 | 24 | def __call__(self, x): 25 | h, e1 = self.enc1(x) 26 | h, e2 = self.enc2(h) 27 | h, e3 = self.enc3(h) 28 | h, e4 = self.enc4(h) 29 | 30 | h = self.aspp(h) 31 | 32 | h = self.dec4(h, e4) 33 | h = self.dec3(h, e3) 34 | h = self.dec2(h, e2) 35 | h = self.dec1(h, e1) 36 | 37 | return h 38 | 39 | 40 | class CascadedASPPNet(nn.Module): 41 | def __init__(self, n_fft): 42 | super(CascadedASPPNet, self).__init__() 43 | self.stg1_low_band_net = BaseASPPNet(2, 64) 44 | self.stg1_high_band_net = BaseASPPNet(2, 64) 45 | 46 | self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 47 | self.stg2_full_band_net = BaseASPPNet(32, 64) 48 | 49 | self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) 50 | self.stg3_full_band_net = BaseASPPNet(64, 128) 51 | 52 | self.out = nn.Conv2d(128, 2, 1, bias=False) 53 | self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) 54 | self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) 55 | 56 | self.max_bin = n_fft // 2 57 | self.output_bin = n_fft // 2 + 1 58 | 59 | self.offset = 128 60 | 61 | def forward(self, x, aggressiveness=None): 62 | mix = x.detach() 63 | x = x.clone() 64 | 65 | x = x[:, :, : self.max_bin] 66 | 67 | bandw = x.size()[2] // 2 68 | aux1 = torch.cat( 69 | [ 70 | self.stg1_low_band_net(x[:, :, :bandw]), 71 | self.stg1_high_band_net(x[:, :, bandw:]), 72 | ], 73 | dim=2, 74 | ) 75 | 76 | h = torch.cat([x, aux1], dim=1) 77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 78 | 79 | h = torch.cat([x, aux1, aux2], dim=1) 80 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 81 | 82 | mask = torch.sigmoid(self.out(h)) 83 | mask = F.pad( 84 | input=mask, 85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 86 | mode="replicate", 87 | ) 88 | 89 | if self.training: 90 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 91 | aux1 = F.pad( 92 | input=aux1, 93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 94 | mode="replicate", 95 | ) 96 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 97 | aux2 = F.pad( 98 | input=aux2, 99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 100 | mode="replicate", 101 | ) 102 | return mask * mix, aux1 * mix, aux2 * mix 103 | else: 104 | if aggressiveness: 105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 106 | mask[:, :, : aggressiveness["split_bin"]], 107 | 1 + aggressiveness["value"] / 3, 108 | ) 109 | mask[:, :, aggressiveness["split_bin"] :] = 
torch.pow( 110 | mask[:, :, aggressiveness["split_bin"] :], 111 | 1 + aggressiveness["value"], 112 | ) 113 | 114 | return mask * mix 115 | 116 | def predict(self, x_mag, aggressiveness=None): 117 | h = self.forward(x_mag, aggressiveness) 118 | 119 | if self.offset > 0: 120 | h = h[:, :, :, self.offset : -self.offset] 121 | assert h.size()[3] > 0 122 | 123 | return h 124 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_123821KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 32) 43 | self.stg1_high_band_net = BaseASPPNet(2, 32) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(16, 32) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(32, 64) 50 | 51 | self.out = nn.Conv2d(64, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, 
:, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/nets_new.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_new 6 | 7 | 8 | class BaseNet(nn.Module): 9 | def __init__( 10 | self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6)) 11 | ): 12 | super(BaseNet, self).__init__() 13 | self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1) 14 | self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1) 15 | self.enc3 = layers_new.Encoder(nout * 2, nout * 4, 3, 2, 1) 16 | self.enc4 = layers_new.Encoder(nout * 4, nout * 6, 3, 2, 1) 17 | self.enc5 = layers_new.Encoder(nout * 6, nout * 8, 3, 2, 1) 18 | 19 | self.aspp = layers_new.ASPPModule(nout * 8, nout * 8, dilations, dropout=True) 20 | 21 | self.dec4 = layers_new.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1) 22 | self.dec3 = layers_new.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1) 23 | self.dec2 = layers_new.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1) 24 | self.lstm_dec2 = layers_new.LSTMModule(nout * 2, nin_lstm, nout_lstm) 25 | self.dec1 = layers_new.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1) 26 | 27 | def __call__(self, x): 28 | e1 = self.enc1(x) 29 | e2 = self.enc2(e1) 30 | e3 = self.enc3(e2) 31 | e4 = self.enc4(e3) 32 | e5 = self.enc5(e4) 33 | 34 | h = self.aspp(e5) 35 | 36 | h = self.dec4(h, e4) 37 | h = self.dec3(h, e3) 38 | h = self.dec2(h, e2) 39 | h = torch.cat([h, self.lstm_dec2(h)], dim=1) 40 | h = self.dec1(h, e1) 41 | 42 | return h 43 | 44 | 45 | class CascadedNet(nn.Module): 46 | def __init__(self, n_fft, nout=32, nout_lstm=128): 47 | super(CascadedNet, self).__init__() 48 | 49 | self.max_bin = n_fft // 2 50 | self.output_bin = n_fft // 2 + 1 51 | self.nin_lstm = self.max_bin // 2 52 | self.offset = 64 53 | 54 | self.stg1_low_band_net = nn.Sequential( 55 | BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm), 56 | layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0), 57 | ) 58 | 59 | self.stg1_high_band_net = BaseNet( 60 | 2, nout // 4, self.nin_lstm // 2, nout_lstm // 2 61 | ) 62 | 63 | self.stg2_low_band_net = nn.Sequential( 64 | BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm), 65 | layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0), 66 | ) 67 | self.stg2_high_band_net = BaseNet( 68 | nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2 69 | ) 70 | 71 | self.stg3_full_band_net = BaseNet( 72 | 3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm 73 | ) 74 | 75 | self.out = nn.Conv2d(nout, 2, 1, bias=False) 76 | self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False) 77 | 78 | def forward(self, x): 79 | x = x[:, :, : self.max_bin] 80 | 81 | bandw = x.size()[2] // 2 82 | l1_in = x[:, :, :bandw] 83 | h1_in = x[:, :, bandw:] 84 | l1 = self.stg1_low_band_net(l1_in) 85 | h1 = self.stg1_high_band_net(h1_in) 86 | aux1 = torch.cat([l1, 
h1], dim=2) 87 | 88 | l2_in = torch.cat([l1_in, l1], dim=1) 89 | h2_in = torch.cat([h1_in, h1], dim=1) 90 | l2 = self.stg2_low_band_net(l2_in) 91 | h2 = self.stg2_high_band_net(h2_in) 92 | aux2 = torch.cat([l2, h2], dim=2) 93 | 94 | f3_in = torch.cat([x, aux1, aux2], dim=1) 95 | f3 = self.stg3_full_band_net(f3_in) 96 | 97 | mask = torch.sigmoid(self.out(f3)) 98 | mask = F.pad( 99 | input=mask, 100 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 101 | mode="replicate", 102 | ) 103 | 104 | if self.training: 105 | aux = torch.cat([aux1, aux2], dim=1) 106 | aux = torch.sigmoid(self.aux_out(aux)) 107 | aux = F.pad( 108 | input=aux, 109 | pad=(0, 0, 0, self.output_bin - aux.size()[2]), 110 | mode="replicate", 111 | ) 112 | return mask, aux 113 | else: 114 | return mask 115 | 116 | def predict_mask(self, x): 117 | mask = self.forward(x) 118 | 119 | if self.offset > 0: 120 | mask = mask[:, :, :, self.offset : -self.offset] 121 | assert mask.size()[3] > 0 122 | 123 | return mask 124 | 125 | def predict(self, x, aggressiveness=None): 126 | mask = self.forward(x) 127 | pred_mag = x * mask 128 | 129 | if self.offset > 0: 130 | pred_mag = pred_mag[:, :, :, self.offset : -self.offset] 131 | assert pred_mag.size()[3] > 0 132 | 133 | return pred_mag 134 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import torch 5 | from tqdm import tqdm 6 | 7 | 8 | def load_data(file_name: str = "./infer/lib/uvr5_pack/name_params.json") -> dict: 9 | with open(file_name, "r") as f: 10 | data = json.load(f) 11 | 12 | return data 13 | 14 | 15 | def make_padding(width, cropsize, offset): 16 | left = offset 17 | roi_size = cropsize - left * 2 18 | if roi_size == 0: 19 | roi_size = cropsize 20 | right = roi_size - (width % roi_size) + left 21 | 22 | return left, right, roi_size 23 | 24 | 25 | def inference(X_spec, device, model, aggressiveness, data): 26 | """ 27 | data : dic configs 28 | """ 29 | 30 | def _execute( 31 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True 32 | ): 33 | model.eval() 34 | with torch.no_grad(): 35 | preds = [] 36 | 37 | iterations = [n_window] 38 | 39 | total_iterations = sum(iterations) 40 | for i in tqdm(range(n_window)): 41 | start = i * roi_size 42 | X_mag_window = X_mag_pad[ 43 | None, :, :, start : start + data["window_size"] 44 | ] 45 | X_mag_window = torch.from_numpy(X_mag_window) 46 | if is_half: 47 | X_mag_window = X_mag_window.half() 48 | X_mag_window = X_mag_window.to(device) 49 | 50 | pred = model.predict(X_mag_window, aggressiveness) 51 | 52 | pred = pred.detach().cpu().numpy() 53 | preds.append(pred[0]) 54 | 55 | pred = np.concatenate(preds, axis=2) 56 | return pred 57 | 58 | def preprocess(X_spec): 59 | X_mag = np.abs(X_spec) 60 | X_phase = np.angle(X_spec) 61 | 62 | return X_mag, X_phase 63 | 64 | X_mag, X_phase = preprocess(X_spec) 65 | 66 | coef = X_mag.max() 67 | X_mag_pre = X_mag / coef 68 | 69 | n_frame = X_mag_pre.shape[2] 70 | pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset) 71 | n_window = int(np.ceil(n_frame / roi_size)) 72 | 73 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 74 | 75 | if list(model.state_dict().values())[0].dtype == torch.float16: 76 | is_half = True 77 | else: 78 | is_half = False 79 | pred = _execute( 80 | X_mag_pad, roi_size, n_window, device, model, 
aggressiveness, is_half 81 | ) 82 | pred = pred[:, :, :n_frame] 83 | 84 | if data["tta"]: 85 | pad_l += roi_size // 2 86 | pad_r += roi_size // 2 87 | n_window += 1 88 | 89 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 90 | 91 | pred_tta = _execute( 92 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half 93 | ) 94 | pred_tta = pred_tta[:, :, roi_size // 2 :] 95 | pred_tta = pred_tta[:, :, :n_frame] 96 | 97 | return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase) 98 | else: 99 | return pred * coef, X_mag, np.exp(1.0j * X_phase) 100 | 101 | 102 | def _get_name_params(model_path, model_hash): 103 | data = load_data() 104 | flag = False 105 | ModelName = model_path 106 | for type in list(data): 107 | for model in list(data[type][0]): 108 | for i in range(len(data[type][0][model])): 109 | if str(data[type][0][model][i]["hash_name"]) == model_hash: 110 | flag = True 111 | elif str(data[type][0][model][i]["hash_name"]) in ModelName: 112 | flag = True 113 | 114 | if flag: 115 | model_params_auto = data[type][0][model][i]["model_params"] 116 | param_name_auto = data[type][0][model][i]["param_name"] 117 | if type == "equivalent": 118 | return param_name_auto, model_params_auto 119 | else: 120 | flag = False 121 | return param_name_auto, model_params_auto 122 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/modules/onnx/export.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM 4 | 5 | 6 | def export_onnx(ModelPath, ExportedPath): 7 | cpt = torch.load(ModelPath, map_location="cpu") 8 | cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] 9 | vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768 10 | 11 | test_phone = torch.rand(1, 200, vec_channels) # hidden unit 12 | test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用) 13 | test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹) 14 | test_pitchf = torch.rand(1, 200) # nsf基频 15 | test_ds = torch.LongTensor([0]) # 说话人ID 16 | test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子) 17 | 18 | device = "cpu" # 导出时设备(不影响使用模型) 19 | 20 | net_g = SynthesizerTrnMsNSFsidM( 21 | *cpt["config"], is_half=False, version=cpt.get("version", "v1") 22 | ) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16) 23 | net_g.load_state_dict(cpt["weight"], strict=False) 24 | input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"] 25 | output_names = [ 26 | "audio", 27 | ] 28 | # net_g.construct_spkmixmap(n_speaker) 多角色混合轨道导出 29 | torch.onnx.export( 30 | net_g, 31 | ( 32 | test_phone.to(device), 33 | test_phone_lengths.to(device), 34 | test_pitch.to(device), 35 | test_pitchf.to(device), 36 | test_ds.to(device), 37 | test_rnd.to(device), 38 | ), 39 | ExportedPath, 40 | dynamic_axes={ 41 | "phone": [1], 42 | "pitch": [1], 43 | "pitchf": [1], 44 | "rnd": [2], 45 | }, 46 | do_constant_folding=False, 47 | opset_version=13, 48 | verbose=False, 49 | input_names=input_names, 50 | output_names=output_names, 51 | ) 52 | return "Finished" 53 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/modules/train/extract/extract_f0_rmvpe.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | 5 | import parselmouth 6 | 7 | now_dir = os.getcwd() 8 | sys.path.append(now_dir) 
9 | import logging 10 | 11 | import numpy as np 12 | import pyworld 13 | 14 | from infer.lib.audio import load_audio 15 | 16 | logging.getLogger("numba").setLevel(logging.WARNING) 17 | 18 | n_part = int(sys.argv[1]) 19 | i_part = int(sys.argv[2]) 20 | i_gpu = sys.argv[3] 21 | os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu) 22 | exp_dir = sys.argv[4] 23 | is_half = sys.argv[5] 24 | f = open("%s/extract_f0_feature.log" % exp_dir, "a+") 25 | 26 | 27 | def printt(strr): 28 | print(strr) 29 | f.write("%s\n" % strr) 30 | f.flush() 31 | 32 | 33 | class FeatureInput(object): 34 | def __init__(self, samplerate=16000, hop_size=160): 35 | self.fs = samplerate 36 | self.hop = hop_size 37 | 38 | self.f0_bin = 256 39 | self.f0_max = 1100.0 40 | self.f0_min = 50.0 41 | self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) 42 | self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) 43 | 44 | def compute_f0(self, path, f0_method): 45 | x = load_audio(path, self.fs) 46 | # p_len = x.shape[0] // self.hop 47 | if f0_method == "rmvpe": 48 | if hasattr(self, "model_rmvpe") == False: 49 | from infer.lib.rmvpe import RMVPE 50 | 51 | print("Loading rmvpe model") 52 | self.model_rmvpe = RMVPE( 53 | "assets/rmvpe/rmvpe.pt", is_half=is_half, device="cuda" 54 | ) 55 | f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) 56 | return f0 57 | 58 | def coarse_f0(self, f0): 59 | f0_mel = 1127 * np.log(1 + f0 / 700) 60 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * ( 61 | self.f0_bin - 2 62 | ) / (self.f0_mel_max - self.f0_mel_min) + 1 63 | 64 | # use 0 or 1 65 | f0_mel[f0_mel <= 1] = 1 66 | f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1 67 | f0_coarse = np.rint(f0_mel).astype(int) 68 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( 69 | f0_coarse.max(), 70 | f0_coarse.min(), 71 | ) 72 | return f0_coarse 73 | 74 | def go(self, paths, f0_method): 75 | if len(paths) == 0: 76 | printt("no-f0-todo") 77 | else: 78 | printt("todo-f0-%s" % len(paths)) 79 | n = max(len(paths) // 5, 1) # 每个进程最多打印5条 80 | for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths): 81 | try: 82 | if idx % n == 0: 83 | printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path)) 84 | if ( 85 | os.path.exists(opt_path1 + ".npy") == True 86 | and os.path.exists(opt_path2 + ".npy") == True 87 | ): 88 | continue 89 | featur_pit = self.compute_f0(inp_path, f0_method) 90 | np.save( 91 | opt_path2, 92 | featur_pit, 93 | allow_pickle=False, 94 | ) # nsf 95 | coarse_pit = self.coarse_f0(featur_pit) 96 | np.save( 97 | opt_path1, 98 | coarse_pit, 99 | allow_pickle=False, 100 | ) # ori 101 | except: 102 | printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc())) 103 | 104 | 105 | if __name__ == "__main__": 106 | # exp_dir=r"E:\codes\py39\dataset\mi-test" 107 | # n_p=16 108 | # f = open("%s/log_extract_f0.log"%exp_dir, "w") 109 | printt(sys.argv) 110 | featureInput = FeatureInput() 111 | paths = [] 112 | inp_root = "%s/1_16k_wavs" % (exp_dir) 113 | opt_root1 = "%s/2a_f0" % (exp_dir) 114 | opt_root2 = "%s/2b-f0nsf" % (exp_dir) 115 | 116 | os.makedirs(opt_root1, exist_ok=True) 117 | os.makedirs(opt_root2, exist_ok=True) 118 | for name in sorted(list(os.listdir(inp_root))): 119 | inp_path = "%s/%s" % (inp_root, name) 120 | if "spec" in inp_path: 121 | continue 122 | opt_path1 = "%s/%s" % (opt_root1, name) 123 | opt_path2 = "%s/%s" % (opt_root2, name) 124 | paths.append([inp_path, opt_path1, opt_path2]) 125 | try: 126 | featureInput.go(paths[i_part::n_part], "rmvpe") 127 | except: 128 | printt("f0_all_fail-%s" % 
(traceback.format_exc())) 129 | # ps = [] 130 | # for i in range(n_p): 131 | # p = Process( 132 | # target=featureInput.go, 133 | # args=( 134 | # paths[i::n_p], 135 | # f0method, 136 | # ), 137 | # ) 138 | # ps.append(p) 139 | # p.start() 140 | # for i in range(n_p): 141 | # ps[i].join() 142 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/modules/train/extract/extract_f0_rmvpe_dml.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | 5 | import parselmouth 6 | 7 | now_dir = os.getcwd() 8 | sys.path.append(now_dir) 9 | import logging 10 | 11 | import numpy as np 12 | import pyworld 13 | 14 | from infer.lib.audio import load_audio 15 | 16 | logging.getLogger("numba").setLevel(logging.WARNING) 17 | 18 | exp_dir = sys.argv[1] 19 | import torch_directml 20 | 21 | device = torch_directml.device(torch_directml.default_device()) 22 | f = open("%s/extract_f0_feature.log" % exp_dir, "a+") 23 | 24 | 25 | def printt(strr): 26 | print(strr) 27 | f.write("%s\n" % strr) 28 | f.flush() 29 | 30 | 31 | class FeatureInput(object): 32 | def __init__(self, samplerate=16000, hop_size=160): 33 | self.fs = samplerate 34 | self.hop = hop_size 35 | 36 | self.f0_bin = 256 37 | self.f0_max = 1100.0 38 | self.f0_min = 50.0 39 | self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) 40 | self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) 41 | 42 | def compute_f0(self, path, f0_method): 43 | x = load_audio(path, self.fs) 44 | # p_len = x.shape[0] // self.hop 45 | if f0_method == "rmvpe": 46 | if hasattr(self, "model_rmvpe") == False: 47 | from infer.lib.rmvpe import RMVPE 48 | 49 | print("Loading rmvpe model") 50 | self.model_rmvpe = RMVPE( 51 | "assets/rmvpe/rmvpe.pt", is_half=False, device=device 52 | ) 53 | f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) 54 | return f0 55 | 56 | def coarse_f0(self, f0): 57 | f0_mel = 1127 * np.log(1 + f0 / 700) 58 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * ( 59 | self.f0_bin - 2 60 | ) / (self.f0_mel_max - self.f0_mel_min) + 1 61 | 62 | # use 0 or 1 63 | f0_mel[f0_mel <= 1] = 1 64 | f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1 65 | f0_coarse = np.rint(f0_mel).astype(int) 66 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( 67 | f0_coarse.max(), 68 | f0_coarse.min(), 69 | ) 70 | return f0_coarse 71 | 72 | def go(self, paths, f0_method): 73 | if len(paths) == 0: 74 | printt("no-f0-todo") 75 | else: 76 | printt("todo-f0-%s" % len(paths)) 77 | n = max(len(paths) // 5, 1) # 每个进程最多打印5条 78 | for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths): 79 | try: 80 | if idx % n == 0: 81 | printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path)) 82 | if ( 83 | os.path.exists(opt_path1 + ".npy") == True 84 | and os.path.exists(opt_path2 + ".npy") == True 85 | ): 86 | continue 87 | featur_pit = self.compute_f0(inp_path, f0_method) 88 | np.save( 89 | opt_path2, 90 | featur_pit, 91 | allow_pickle=False, 92 | ) # nsf 93 | coarse_pit = self.coarse_f0(featur_pit) 94 | np.save( 95 | opt_path1, 96 | coarse_pit, 97 | allow_pickle=False, 98 | ) # ori 99 | except: 100 | printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc())) 101 | 102 | 103 | if __name__ == "__main__": 104 | # exp_dir=r"E:\codes\py39\dataset\mi-test" 105 | # n_p=16 106 | # f = open("%s/log_extract_f0.log"%exp_dir, "w") 107 | printt(sys.argv) 108 | featureInput = FeatureInput() 109 | paths = [] 110 | inp_root = "%s/1_16k_wavs" % 
(exp_dir) 111 | opt_root1 = "%s/2a_f0" % (exp_dir) 112 | opt_root2 = "%s/2b-f0nsf" % (exp_dir) 113 | 114 | os.makedirs(opt_root1, exist_ok=True) 115 | os.makedirs(opt_root2, exist_ok=True) 116 | for name in sorted(list(os.listdir(inp_root))): 117 | inp_path = "%s/%s" % (inp_root, name) 118 | if "spec" in inp_path: 119 | continue 120 | opt_path1 = "%s/%s" % (opt_root1, name) 121 | opt_path2 = "%s/%s" % (opt_root2, name) 122 | paths.append([inp_path, opt_path1, opt_path2]) 123 | try: 124 | featureInput.go(paths, "rmvpe") 125 | except: 126 | printt("f0_all_fail-%s" % (traceback.format_exc())) 127 | # ps = [] 128 | # for i in range(n_p): 129 | # p = Process( 130 | # target=featureInput.go, 131 | # args=( 132 | # paths[i::n_p], 133 | # f0method, 134 | # ), 135 | # ) 136 | # ps.append(p) 137 | # p.start() 138 | # for i in range(n_p): 139 | # ps[i].join() 140 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/modules/train/extract_feature_print.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | 5 | os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" 6 | os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0" 7 | 8 | device = sys.argv[1] 9 | n_part = int(sys.argv[2]) 10 | i_part = int(sys.argv[3]) 11 | if len(sys.argv) == 6: 12 | exp_dir = sys.argv[4] 13 | version = sys.argv[5] 14 | else: 15 | i_gpu = sys.argv[4] 16 | exp_dir = sys.argv[5] 17 | os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu) 18 | version = sys.argv[6] 19 | import fairseq 20 | import numpy as np 21 | import soundfile as sf 22 | import torch 23 | import torch.nn.functional as F 24 | 25 | if "privateuseone" not in device: 26 | device = "cpu" 27 | if torch.cuda.is_available(): 28 | device = "cuda" 29 | elif torch.backends.mps.is_available(): 30 | device = "mps" 31 | else: 32 | import torch_directml 33 | 34 | device = torch_directml.device(torch_directml.default_device()) 35 | 36 | def forward_dml(ctx, x, scale): 37 | ctx.scale = scale 38 | res = x.clone().detach() 39 | return res 40 | 41 | fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml 42 | 43 | f = open("%s/extract_f0_feature.log" % exp_dir, "a+") 44 | 45 | 46 | def printt(strr): 47 | print(strr) 48 | f.write("%s\n" % strr) 49 | f.flush() 50 | 51 | 52 | printt(sys.argv) 53 | model_path = "assets/hubert/hubert_base.pt" 54 | 55 | printt(exp_dir) 56 | wavPath = "%s/1_16k_wavs" % exp_dir 57 | outPath = ( 58 | "%s/3_feature256" % exp_dir if version == "v1" else "%s/3_feature768" % exp_dir 59 | ) 60 | os.makedirs(outPath, exist_ok=True) 61 | 62 | 63 | # wave must be 16k, hop_size=320 64 | def readwave(wav_path, normalize=False): 65 | wav, sr = sf.read(wav_path) 66 | assert sr == 16000 67 | feats = torch.from_numpy(wav).float() 68 | if feats.dim() == 2: # double channels 69 | feats = feats.mean(-1) 70 | assert feats.dim() == 1, feats.dim() 71 | if normalize: 72 | with torch.no_grad(): 73 | feats = F.layer_norm(feats, feats.shape) 74 | feats = feats.view(1, -1) 75 | return feats 76 | 77 | 78 | # HuBERT model 79 | printt("load model(s) from {}".format(model_path)) 80 | # if hubert model is exist 81 | if os.access(model_path, os.F_OK) == False: 82 | printt( 83 | "Error: Extracting is shut down because %s does not exist, you may download it from https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main" 84 | % model_path 85 | ) 86 | exit(0) 87 | models, saved_cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task( 88 | 
[model_path], 89 | suffix="", 90 | ) 91 | model = models[0] 92 | model = model.to(device) 93 | printt("move model to %s" % device) 94 | if device not in ["mps", "cpu"]: 95 | model = model.half() 96 | model.eval() 97 | 98 | todo = sorted(list(os.listdir(wavPath)))[i_part::n_part] 99 | n = max(1, len(todo) // 10) # 最多打印十条 100 | if len(todo) == 0: 101 | printt("no-feature-todo") 102 | else: 103 | printt("all-feature-%s" % len(todo)) 104 | for idx, file in enumerate(todo): 105 | try: 106 | if file.endswith(".wav"): 107 | wav_path = "%s/%s" % (wavPath, file) 108 | out_path = "%s/%s" % (outPath, file.replace("wav", "npy")) 109 | 110 | if os.path.exists(out_path): 111 | continue 112 | 113 | feats = readwave(wav_path, normalize=saved_cfg.task.normalize) 114 | padding_mask = torch.BoolTensor(feats.shape).fill_(False) 115 | inputs = { 116 | "source": feats.half().to(device) 117 | if device not in ["mps", "cpu"] 118 | else feats.to(device), 119 | "padding_mask": padding_mask.to(device), 120 | "output_layer": 9 if version == "v1" else 12, # layer 9 121 | } 122 | with torch.no_grad(): 123 | logits = model.extract_features(**inputs) 124 | feats = ( 125 | model.final_proj(logits[0]) if version == "v1" else logits[0] 126 | ) 127 | 128 | feats = feats.squeeze(0).float().cpu().numpy() 129 | if np.isnan(feats).sum() == 0: 130 | np.save(out_path, feats, allow_pickle=False) 131 | else: 132 | printt("%s-contains nan" % file) 133 | if idx % n == 0: 134 | printt("now-%s,all-%s,%s,%s" % (len(todo), idx, file, feats.shape)) 135 | except: 136 | printt(traceback.format_exc()) 137 | printt("all-feature-done") 138 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/modules/train/preprocess.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import sys 4 | 5 | from scipy import signal 6 | 7 | now_dir = os.getcwd() 8 | sys.path.append(now_dir) 9 | print(sys.argv) 10 | inp_root = sys.argv[1] 11 | sr = int(sys.argv[2]) 12 | n_p = int(sys.argv[3]) 13 | exp_dir = sys.argv[4] 14 | noparallel = sys.argv[5] == "True" 15 | per = float(sys.argv[6]) 16 | import multiprocessing 17 | import os 18 | import traceback 19 | 20 | import librosa 21 | import numpy as np 22 | from scipy.io import wavfile 23 | 24 | from infer.lib.audio import load_audio 25 | from infer.lib.slicer2 import Slicer 26 | 27 | mutex = multiprocessing.Lock() 28 | f = open("%s/preprocess.log" % exp_dir, "a+") 29 | 30 | 31 | def println(strr): 32 | mutex.acquire() 33 | print(strr) 34 | f.write("%s\n" % strr) 35 | f.flush() 36 | mutex.release() 37 | 38 | 39 | class PreProcess: 40 | def __init__(self, sr, exp_dir, per=3.0): 41 | self.slicer = Slicer( 42 | sr=sr, 43 | threshold=-42, 44 | min_length=1500, 45 | min_interval=400, 46 | hop_size=15, 47 | max_sil_kept=500, 48 | ) 49 | self.sr = sr 50 | self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr) 51 | self.per = per 52 | self.overlap = 0.3 53 | self.tail = self.per + self.overlap 54 | self.max = 0.9 55 | self.alpha = 0.75 56 | self.exp_dir = exp_dir 57 | self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir 58 | self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir 59 | os.makedirs(self.exp_dir, exist_ok=True) 60 | os.makedirs(self.gt_wavs_dir, exist_ok=True) 61 | os.makedirs(self.wavs16k_dir, exist_ok=True) 62 | 63 | def norm_write(self, tmp_audio, idx0, idx1): 64 | tmp_max = np.abs(tmp_audio).max() 65 | if tmp_max > 2.5: 66 | print("%s-%s-%s-filtered" % (idx0, idx1, tmp_max)) 67 | return 68 
| tmp_audio = (tmp_audio / tmp_max * (self.max * self.alpha)) + ( 69 | 1 - self.alpha 70 | ) * tmp_audio 71 | wavfile.write( 72 | "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), 73 | self.sr, 74 | tmp_audio.astype(np.float32), 75 | ) 76 | tmp_audio = librosa.resample( 77 | tmp_audio, orig_sr=self.sr, target_sr=16000 78 | ) # , res_type="soxr_vhq" 79 | wavfile.write( 80 | "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), 81 | 16000, 82 | tmp_audio.astype(np.float32), 83 | ) 84 | 85 | def pipeline(self, path, idx0): 86 | try: 87 | audio = load_audio(path, self.sr) 88 | # zero phased digital filter cause pre-ringing noise... 89 | # audio = signal.filtfilt(self.bh, self.ah, audio) 90 | audio = signal.lfilter(self.bh, self.ah, audio) 91 | 92 | idx1 = 0 93 | for audio in self.slicer.slice(audio): 94 | i = 0 95 | while 1: 96 | start = int(self.sr * (self.per - self.overlap) * i) 97 | i += 1 98 | if len(audio[start:]) > self.tail * self.sr: 99 | tmp_audio = audio[start : start + int(self.per * self.sr)] 100 | self.norm_write(tmp_audio, idx0, idx1) 101 | idx1 += 1 102 | else: 103 | tmp_audio = audio[start:] 104 | idx1 += 1 105 | break 106 | self.norm_write(tmp_audio, idx0, idx1) 107 | println("%s->Suc." % path) 108 | except: 109 | println("%s->%s" % (path, traceback.format_exc())) 110 | 111 | def pipeline_mp(self, infos): 112 | for path, idx0 in infos: 113 | self.pipeline(path, idx0) 114 | 115 | def pipeline_mp_inp_dir(self, inp_root, n_p): 116 | try: 117 | infos = [ 118 | ("%s/%s" % (inp_root, name), idx) 119 | for idx, name in enumerate(sorted(list(os.listdir(inp_root)))) 120 | ] 121 | if noparallel: 122 | for i in range(n_p): 123 | self.pipeline_mp(infos[i::n_p]) 124 | else: 125 | ps = [] 126 | for i in range(n_p): 127 | p = multiprocessing.Process( 128 | target=self.pipeline_mp, args=(infos[i::n_p],) 129 | ) 130 | ps.append(p) 131 | p.start() 132 | for i in range(n_p): 133 | ps[i].join() 134 | except: 135 | println("Fail. 
%s" % traceback.format_exc()) 136 | 137 | 138 | def preprocess_trainset(inp_root, sr, n_p, exp_dir, per): 139 | pp = PreProcess(sr, exp_dir, per) 140 | println("start preprocess") 141 | println(sys.argv) 142 | pp.pipeline_mp_inp_dir(inp_root, n_p) 143 | println("end preprocess") 144 | 145 | 146 | if __name__ == "__main__": 147 | preprocess_trainset(inp_root, sr, n_p, exp_dir, per) 148 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/modules/uvr5/modules.py: -------------------------------------------------------------------------------- 1 | import os 2 | import traceback 3 | import logging 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | import ffmpeg 8 | import torch 9 | 10 | from configs.config import Config 11 | from infer.modules.uvr5.mdxnet import MDXNetDereverb 12 | from infer.modules.uvr5.vr import AudioPre, AudioPreDeEcho 13 | 14 | config = Config() 15 | 16 | 17 | def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0): 18 | infos = [] 19 | try: 20 | inp_root = inp_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 21 | save_root_vocal = ( 22 | save_root_vocal.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 23 | ) 24 | save_root_ins = ( 25 | save_root_ins.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 26 | ) 27 | if model_name == "onnx_dereverb_By_FoxJoy": 28 | pre_fun = MDXNetDereverb(15, config.device) 29 | else: 30 | func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho 31 | pre_fun = func( 32 | agg=int(agg), 33 | model_path=os.path.join( 34 | os.getenv("weight_uvr5_root"), model_name + ".pth" 35 | ), 36 | device=config.device, 37 | is_half=config.is_half, 38 | ) 39 | is_hp3 = "HP3" in model_name 40 | if inp_root != "": 41 | paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)] 42 | else: 43 | paths = [path.name for path in paths] 44 | for path in paths: 45 | inp_path = os.path.join(inp_root, path) 46 | need_reformat = 1 47 | done = 0 48 | try: 49 | info = ffmpeg.probe(inp_path, cmd="ffprobe") 50 | if ( 51 | info["streams"][0]["channels"] == 2 52 | and info["streams"][0]["sample_rate"] == "44100" 53 | ): 54 | need_reformat = 0 55 | pre_fun._path_audio_( 56 | inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3 57 | ) 58 | done = 1 59 | except: 60 | need_reformat = 1 61 | traceback.print_exc() 62 | if need_reformat == 1: 63 | tmp_path = "%s/%s.reformatted.wav" % ( 64 | os.path.join(os.environ["TEMP"]), 65 | os.path.basename(inp_path), 66 | ) 67 | os.system( 68 | "ffmpeg -i %s -vn -acodec pcm_s16le -ac 2 -ar 44100 %s -y" 69 | % (inp_path, tmp_path) 70 | ) 71 | inp_path = tmp_path 72 | try: 73 | if done == 0: 74 | pre_fun._path_audio_( 75 | inp_path, save_root_ins, save_root_vocal, format0 76 | ) 77 | infos.append("%s->Success" % (os.path.basename(inp_path))) 78 | yield "\n".join(infos) 79 | except: 80 | try: 81 | if done == 0: 82 | pre_fun._path_audio_( 83 | inp_path, save_root_ins, save_root_vocal, format0 84 | ) 85 | infos.append("%s->Success" % (os.path.basename(inp_path))) 86 | yield "\n".join(infos) 87 | except: 88 | infos.append( 89 | "%s->%s" % (os.path.basename(inp_path), traceback.format_exc()) 90 | ) 91 | yield "\n".join(infos) 92 | except: 93 | infos.append(traceback.format_exc()) 94 | yield "\n".join(infos) 95 | finally: 96 | try: 97 | if model_name == "onnx_dereverb_By_FoxJoy": 98 | del pre_fun.pred.model 99 | del pre_fun.pred.model_ 100 | else: 101 | del pre_fun.model 102 | del pre_fun 103 | except: 104 | 
traceback.print_exc() 105 | if torch.cuda.is_available(): 106 | torch.cuda.empty_cache() 107 | logger.info("Executed torch.cuda.empty_cache()") 108 | yield "\n".join(infos) 109 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/modules/vc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/infer/modules/vc/__init__.py -------------------------------------------------------------------------------- /tts-cli/rvc/infer/modules/vc/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from fairseq import checkpoint_utils 4 | 5 | 6 | def get_index_path_from_model(sid): 7 | return next( 8 | ( 9 | f 10 | for f in [ 11 | os.path.join(root, name) 12 | for root, _, files in os.walk(os.getenv("index_root"), topdown=False) 13 | for name in files 14 | if name.endswith(".index") and "trained" not in name 15 | ] 16 | if sid.split(".")[0] in f 17 | ), 18 | "", 19 | ) 20 | 21 | 22 | def load_hubert(config): 23 | models, _, _ = checkpoint_utils.load_model_ensemble_and_task( 24 | ["assets/hubert/hubert_base.pt"], 25 | suffix="", 26 | ) 27 | hubert_model = models[0] 28 | hubert_model = hubert_model.to(config.device) 29 | if config.is_half: 30 | hubert_model = hubert_model.half() 31 | else: 32 | hubert_model = hubert_model.float() 33 | return hubert_model.eval() 34 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/__pycache__/rvc_for_realtime.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/tools/__pycache__/rvc_for_realtime.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/tools/app.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | # os.system("wget -P cvec/ https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt") 5 | import gradio as gr 6 | from dotenv import load_dotenv 7 | 8 | from configs.config import Config 9 | from i18n.i18n import I18nAuto 10 | from infer.modules.vc.modules import VC 11 | 12 | logging.getLogger("numba").setLevel(logging.WARNING) 13 | logging.getLogger("markdown_it").setLevel(logging.WARNING) 14 | logging.getLogger("urllib3").setLevel(logging.WARNING) 15 | logging.getLogger("matplotlib").setLevel(logging.WARNING) 16 | logger = logging.getLogger(__name__) 17 | 18 | i18n = I18nAuto() 19 | logger.info(i18n) 20 | 21 | load_dotenv() 22 | config = Config() 23 | vc = VC(config) 24 | 25 | weight_root = os.getenv("weight_root") 26 | weight_uvr5_root = os.getenv("weight_uvr5_root") 27 | index_root = os.getenv("index_root") 28 | names = [] 29 | hubert_model = None 30 | for name in os.listdir(weight_root): 31 | if name.endswith(".pth"): 32 | names.append(name) 33 | index_paths = [] 34 | for root, dirs, files in os.walk(index_root, topdown=False): 35 | for name in files: 36 | if name.endswith(".index") and "trained" not in name: 37 | index_paths.append("%s/%s" % (root, name)) 38 | 39 | 40 | app = gr.Blocks() 41 | with app: 42 | with gr.Tabs(): 43 | with gr.TabItem("在线demo"): 44 | gr.Markdown( 45 | value=""" 46 | RVC 在线demo 47 | """ 48 | ) 49 | sid = gr.Dropdown(label=i18n("推理音色"), 
choices=sorted(names)) 50 | with gr.Column(): 51 | spk_item = gr.Slider( 52 | minimum=0, 53 | maximum=2333, 54 | step=1, 55 | label=i18n("请选择说话人id"), 56 | value=0, 57 | visible=False, 58 | interactive=True, 59 | ) 60 | sid.change(fn=vc.get_vc, inputs=[sid], outputs=[spk_item]) 61 | gr.Markdown( 62 | value=i18n("男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ") 63 | ) 64 | vc_input3 = gr.Audio(label="上传音频(长度小于90秒)") 65 | vc_transform0 = gr.Number(label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0) 66 | f0method0 = gr.Radio( 67 | label=i18n("选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU"), 68 | choices=["pm", "harvest", "crepe", "rmvpe"], 69 | value="pm", 70 | interactive=True, 71 | ) 72 | filter_radius0 = gr.Slider( 73 | minimum=0, 74 | maximum=7, 75 | label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"), 76 | value=3, 77 | step=1, 78 | interactive=True, 79 | ) 80 | with gr.Column(): 81 | file_index1 = gr.Textbox( 82 | label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"), 83 | value="", 84 | interactive=False, 85 | visible=False, 86 | ) 87 | file_index2 = gr.Dropdown( 88 | label=i18n("自动检测index路径,下拉式选择(dropdown)"), 89 | choices=sorted(index_paths), 90 | interactive=True, 91 | ) 92 | index_rate1 = gr.Slider( 93 | minimum=0, 94 | maximum=1, 95 | label=i18n("检索特征占比"), 96 | value=0.88, 97 | interactive=True, 98 | ) 99 | resample_sr0 = gr.Slider( 100 | minimum=0, 101 | maximum=48000, 102 | label=i18n("后处理重采样至最终采样率,0为不进行重采样"), 103 | value=0, 104 | step=1, 105 | interactive=True, 106 | ) 107 | rms_mix_rate0 = gr.Slider( 108 | minimum=0, 109 | maximum=1, 110 | label=i18n("输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"), 111 | value=1, 112 | interactive=True, 113 | ) 114 | protect0 = gr.Slider( 115 | minimum=0, 116 | maximum=0.5, 117 | label=i18n("保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"), 118 | value=0.33, 119 | step=0.01, 120 | interactive=True, 121 | ) 122 | f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调")) 123 | but0 = gr.Button(i18n("转换"), variant="primary") 124 | vc_output1 = gr.Textbox(label=i18n("输出信息")) 125 | vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)")) 126 | but0.click( 127 | vc.vc_single, 128 | [ 129 | spk_item, 130 | vc_input3, 131 | vc_transform0, 132 | f0_file, 133 | f0method0, 134 | file_index1, 135 | file_index2, 136 | # file_big_npy1, 137 | index_rate1, 138 | filter_radius0, 139 | resample_sr0, 140 | rms_mix_rate0, 141 | protect0, 142 | ], 143 | [vc_output1, vc_output2], 144 | ) 145 | 146 | 147 | app.launch() 148 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/calc_rvc_model_similarity.py: -------------------------------------------------------------------------------- 1 | # This code references https://huggingface.co/JosephusCheung/ASimilarityCalculatior/blob/main/qwerty.py 2 | # Fill in the path of the model to be queried and the root directory of the reference models, and this script will return the similarity between the model to be queried and all reference models. 
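# Usage note (not part of the original file; the example values mirror the __main__ block at the bottom):
#   set query_path to the .pth you want to inspect, e.g. r"assets\weights\mi v3.pth",
#   and reference_root to a folder of reference .pth files, e.g. r"assets\weights",
#   then run this file directly; the similarity to each reference model is logged as a percentage.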
3 | import os 4 | import logging 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | 12 | 13 | def cal_cross_attn(to_q, to_k, to_v, rand_input): 14 | hidden_dim, embed_dim = to_q.shape 15 | attn_to_q = nn.Linear(hidden_dim, embed_dim, bias=False) 16 | attn_to_k = nn.Linear(hidden_dim, embed_dim, bias=False) 17 | attn_to_v = nn.Linear(hidden_dim, embed_dim, bias=False) 18 | attn_to_q.load_state_dict({"weight": to_q}) 19 | attn_to_k.load_state_dict({"weight": to_k}) 20 | attn_to_v.load_state_dict({"weight": to_v}) 21 | 22 | return torch.einsum( 23 | "ik, jk -> ik", 24 | F.softmax( 25 | torch.einsum("ij, kj -> ik", attn_to_q(rand_input), attn_to_k(rand_input)), 26 | dim=-1, 27 | ), 28 | attn_to_v(rand_input), 29 | ) 30 | 31 | 32 | def model_hash(filename): 33 | try: 34 | with open(filename, "rb") as file: 35 | import hashlib 36 | 37 | m = hashlib.sha256() 38 | 39 | file.seek(0x100000) 40 | m.update(file.read(0x10000)) 41 | return m.hexdigest()[0:8] 42 | except FileNotFoundError: 43 | return "NOFILE" 44 | 45 | 46 | def eval(model, n, input): 47 | qk = f"enc_p.encoder.attn_layers.{n}.conv_q.weight" 48 | uk = f"enc_p.encoder.attn_layers.{n}.conv_k.weight" 49 | vk = f"enc_p.encoder.attn_layers.{n}.conv_v.weight" 50 | atoq, atok, atov = model[qk][:, :, 0], model[uk][:, :, 0], model[vk][:, :, 0] 51 | 52 | attn = cal_cross_attn(atoq, atok, atov, input) 53 | return attn 54 | 55 | 56 | def main(path, root): 57 | torch.manual_seed(114514) 58 | model_a = torch.load(path, map_location="cpu")["weight"] 59 | 60 | logger.info("Query:\t\t%s\t%s" % (path, model_hash(path))) 61 | 62 | map_attn_a = {} 63 | map_rand_input = {} 64 | for n in range(6): 65 | hidden_dim, embed_dim, _ = model_a[ 66 | f"enc_p.encoder.attn_layers.{n}.conv_v.weight" 67 | ].shape 68 | rand_input = torch.randn([embed_dim, hidden_dim]) 69 | 70 | map_attn_a[n] = eval(model_a, n, rand_input) 71 | map_rand_input[n] = rand_input 72 | 73 | del model_a 74 | 75 | for name in sorted(list(os.listdir(root))): 76 | path = "%s/%s" % (root, name) 77 | model_b = torch.load(path, map_location="cpu")["weight"] 78 | 79 | sims = [] 80 | for n in range(6): 81 | attn_a = map_attn_a[n] 82 | attn_b = eval(model_b, n, map_rand_input[n]) 83 | 84 | sim = torch.mean(torch.cosine_similarity(attn_a, attn_b)) 85 | sims.append(sim) 86 | 87 | logger.info( 88 | "Reference:\t%s\t%s\t%s" 89 | % (path, model_hash(path), f"{torch.mean(torch.stack(sims)) * 1e2:.2f}%") 90 | ) 91 | 92 | 93 | if __name__ == "__main__": 94 | query_path = r"assets\weights\mi v3.pth" 95 | reference_root = r"assets\weights" 96 | main(query_path, reference_root) 97 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/download_models - Kopie.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | import requests 4 | 5 | RVC_DOWNLOAD_LINK = "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/" 6 | 7 | BASE_DIR = Path(__file__).resolve().parent.parent 8 | 9 | 10 | def dl_model(link, model_name, dir_name): 11 | with requests.get(f"{link}{model_name}") as r: 12 | r.raise_for_status() 13 | os.makedirs(os.path.dirname(dir_name / model_name), exist_ok=True) 14 | with open(dir_name / model_name, "wb") as f: 15 | for chunk in r.iter_content(chunk_size=8192): 16 | f.write(chunk) 17 | 18 | 19 | if __name__ == "__main__": 20 | print("Downloading hubert_base.pt...") 21 | dl_model(RVC_DOWNLOAD_LINK, 
"hubert_base.pt", BASE_DIR / "assets/hubert") 22 | print("Downloading rmvpe.pt...") 23 | dl_model(RVC_DOWNLOAD_LINK, "rmvpe.pt", BASE_DIR / "assets/rmvpe") 24 | print("Downloading vocals.onnx...") 25 | dl_model( 26 | RVC_DOWNLOAD_LINK + "uvr5_weights/onnx_dereverb_By_FoxJoy/", 27 | "vocals.onnx", 28 | BASE_DIR / "assets/uvr5_weights/onnx_dereverb_By_FoxJoy", 29 | ) 30 | 31 | rvc_models_dir = BASE_DIR / "assets/pretrained" 32 | 33 | print("Downloading pretrained models:") 34 | 35 | model_names = [ 36 | "D32k.pth", 37 | "D40k.pth", 38 | "D48k.pth", 39 | "G32k.pth", 40 | "G40k.pth", 41 | "G48k.pth", 42 | "f0D32k.pth", 43 | "f0D40k.pth", 44 | "f0D48k.pth", 45 | "f0G32k.pth", 46 | "f0G40k.pth", 47 | "f0G48k.pth", 48 | ] 49 | for model in model_names: 50 | print(f"Downloading {model}...") 51 | dl_model(RVC_DOWNLOAD_LINK + "pretrained/", model, rvc_models_dir) 52 | 53 | rvc_models_dir = BASE_DIR / "assets/pretrained_v2" 54 | 55 | print("Downloading pretrained models v2:") 56 | 57 | for model in model_names: 58 | print(f"Downloading {model}...") 59 | dl_model(RVC_DOWNLOAD_LINK + "pretrained_v2/", model, rvc_models_dir) 60 | 61 | print("Downloading uvr5_weights:") 62 | 63 | rvc_models_dir = BASE_DIR / "assets/uvr5_weights" 64 | 65 | model_names = [ 66 | "HP2-%E4%BA%BA%E5%A3%B0vocals%2B%E9%9D%9E%E4%BA%BA%E5%A3%B0instrumentals.pth", 67 | "HP2_all_vocals.pth", 68 | "HP3_all_vocals.pth", 69 | "HP5-%E4%B8%BB%E6%97%8B%E5%BE%8B%E4%BA%BA%E5%A3%B0vocals%2B%E5%85%B6%E4%BB%96instrumentals.pth", 70 | "HP5_only_main_vocal.pth", 71 | "VR-DeEchoAggressive.pth", 72 | "VR-DeEchoDeReverb.pth", 73 | "VR-DeEchoNormal.pth", 74 | ] 75 | for model in model_names: 76 | print(f"Downloading {model}...") 77 | dl_model(RVC_DOWNLOAD_LINK + "uvr5_weights/", model, rvc_models_dir) 78 | 79 | print("All models downloaded!") 80 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/download_models.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | import requests 4 | 5 | RVC_DOWNLOAD_LINK = "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/" 6 | 7 | BASE_DIR = Path(__file__).resolve().parent.parent 8 | 9 | 10 | def dl_model(link, model_name, dir_name): 11 | with requests.get(f"{link}{model_name}") as r: 12 | r.raise_for_status() 13 | os.makedirs(os.path.dirname(dir_name / model_name), exist_ok=True) 14 | with open(dir_name / model_name, "wb") as f: 15 | for chunk in r.iter_content(chunk_size=8192): 16 | f.write(chunk) 17 | 18 | 19 | if __name__ == "__main__": 20 | print("Downloading hubert_base.pt...") 21 | # dl_model(RVC_DOWNLOAD_LINK, "hubert_base.pt", BASE_DIR / "assets/hubert") 22 | print("Downloading rmvpe.pt...") 23 | # dl_model(RVC_DOWNLOAD_LINK, "rmvpe.pt", BASE_DIR / "assets/rmvpe") 24 | print("Downloading vocals.onnx...") 25 | # dl_model( 26 | # RVC_DOWNLOAD_LINK + "uvr5_weights/onnx_dereverb_By_FoxJoy/", 27 | # "vocals.onnx", 28 | # BASE_DIR / "assets/uvr5_weights/onnx_dereverb_By_FoxJoy", 29 | # ) 30 | 31 | rvc_models_dir = BASE_DIR / "assets/pretrained" 32 | 33 | print("Downloading pretrained models:") 34 | 35 | model_names = [ 36 | # "D32k.pth", 37 | # "D40k.pth", 38 | # "D48k.pth", 39 | "G32k.pth", 40 | "G40k.pth", 41 | "G48k.pth", 42 | "f0D32k.pth", 43 | "f0D40k.pth", 44 | "f0D48k.pth", 45 | "f0G32k.pth", 46 | "f0G40k.pth", 47 | "f0G48k.pth", 48 | ] 49 | for model in model_names: 50 | print(f"Downloading {model}...") 51 | dl_model(RVC_DOWNLOAD_LINK + "pretrained/", model, 
rvc_models_dir) 52 | 53 | rvc_models_dir = BASE_DIR / "assets/pretrained_v2" 54 | 55 | print("Downloading pretrained models v2:") 56 | 57 | for model in model_names: 58 | print(f"Downloading {model}...") 59 | dl_model(RVC_DOWNLOAD_LINK + "pretrained_v2/", model, rvc_models_dir) 60 | 61 | print("Downloading uvr5_weights:") 62 | 63 | rvc_models_dir = BASE_DIR / "assets/uvr5_weights" 64 | 65 | model_names = [ 66 | "HP2-%E4%BA%BA%E5%A3%B0vocals%2B%E9%9D%9E%E4%BA%BA%E5%A3%B0instrumentals.pth", 67 | "HP2_all_vocals.pth", 68 | "HP3_all_vocals.pth", 69 | "HP5-%E4%B8%BB%E6%97%8B%E5%BE%8B%E4%BA%BA%E5%A3%B0vocals%2B%E5%85%B6%E4%BB%96instrumentals.pth", 70 | "HP5_only_main_vocal.pth", 71 | "VR-DeEchoAggressive.pth", 72 | "VR-DeEchoDeReverb.pth", 73 | "VR-DeEchoNormal.pth", 74 | ] 75 | for model in model_names: 76 | print(f"Downloading {model}...") 77 | dl_model(RVC_DOWNLOAD_LINK + "uvr5_weights/", model, rvc_models_dir) 78 | 79 | print("All models downloaded!") 80 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/export_onnx.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM 3 | 4 | if __name__ == "__main__": 5 | MoeVS = True # 模型是否为MoeVoiceStudio(原MoeSS)使用 6 | 7 | ModelPath = "Shiroha/shiroha.pth" # 模型路径 8 | ExportedPath = "model.onnx" # 输出路径 9 | hidden_channels = 256 # hidden_channels,为768Vec做准备 10 | cpt = torch.load(ModelPath, map_location="cpu") 11 | cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk 12 | print(*cpt["config"]) 13 | 14 | test_phone = torch.rand(1, 200, hidden_channels) # hidden unit 15 | test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用) 16 | test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹) 17 | test_pitchf = torch.rand(1, 200) # nsf基频 18 | test_ds = torch.LongTensor([0]) # 说话人ID 19 | test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子) 20 | 21 | device = "cpu" # 导出时设备(不影响使用模型) 22 | 23 | net_g = SynthesizerTrnMsNSFsidM( 24 | *cpt["config"], is_half=False 25 | ) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16) 26 | net_g.load_state_dict(cpt["weight"], strict=False) 27 | input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"] 28 | output_names = [ 29 | "audio", 30 | ] 31 | # net_g.construct_spkmixmap(n_speaker) 多角色混合轨道导出 32 | torch.onnx.export( 33 | net_g, 34 | ( 35 | test_phone.to(device), 36 | test_phone_lengths.to(device), 37 | test_pitch.to(device), 38 | test_pitchf.to(device), 39 | test_ds.to(device), 40 | test_rnd.to(device), 41 | ), 42 | ExportedPath, 43 | dynamic_axes={ 44 | "phone": [1], 45 | "pitch": [1], 46 | "pitchf": [1], 47 | "rnd": [2], 48 | }, 49 | do_constant_folding=False, 50 | opset_version=16, 51 | verbose=False, 52 | input_names=input_names, 53 | output_names=output_names, 54 | ) 55 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/infer/train-index-v2.py: -------------------------------------------------------------------------------- 1 | """ 2 | 格式:直接cid为自带的index位;aid放不下了,通过字典来查,反正就5w个 3 | """ 4 | import os 5 | import traceback 6 | import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | from multiprocessing import cpu_count 11 | 12 | import faiss 13 | import numpy as np 14 | from sklearn.cluster import MiniBatchKMeans 15 | 16 | # ###########如果是原始特征要先写save 17 | n_cpu = 0 18 | if n_cpu == 0: 19 | n_cpu = cpu_count() 20 | inp_root = r"./logs/anz/3_feature768" 21 | 
npys = [] 22 | listdir_res = list(os.listdir(inp_root)) 23 | for name in sorted(listdir_res): 24 | phone = np.load("%s/%s" % (inp_root, name)) 25 | npys.append(phone) 26 | big_npy = np.concatenate(npys, 0) 27 | big_npy_idx = np.arange(big_npy.shape[0]) 28 | np.random.shuffle(big_npy_idx) 29 | big_npy = big_npy[big_npy_idx] 30 | logger.debug(big_npy.shape) # (6196072, 192)#fp32#4.43G 31 | if big_npy.shape[0] > 2e5: 32 | # if(1): 33 | info = "Trying doing kmeans %s shape to 10k centers." % big_npy.shape[0] 34 | logger.info(info) 35 | try: 36 | big_npy = ( 37 | MiniBatchKMeans( 38 | n_clusters=10000, 39 | verbose=True, 40 | batch_size=256 * n_cpu, 41 | compute_labels=False, 42 | init="random", 43 | ) 44 | .fit(big_npy) 45 | .cluster_centers_ 46 | ) 47 | except: 48 | info = traceback.format_exc() 49 | logger.warning(info) 50 | 51 | np.save("tools/infer/big_src_feature_mi.npy", big_npy) 52 | 53 | ##################train+add 54 | # big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy") 55 | n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39) 56 | index = faiss.index_factory(768, "IVF%s,Flat" % n_ivf) # mi 57 | logger.info("Training...") 58 | index_ivf = faiss.extract_index_ivf(index) # 59 | index_ivf.nprobe = 1 60 | index.train(big_npy) 61 | faiss.write_index( 62 | index, "tools/infer/trained_IVF%s_Flat_baseline_src_feat_v2.index" % (n_ivf) 63 | ) 64 | logger.info("Adding...") 65 | batch_size_add = 8192 66 | for i in range(0, big_npy.shape[0], batch_size_add): 67 | index.add(big_npy[i : i + batch_size_add]) 68 | faiss.write_index( 69 | index, "tools/infer/added_IVF%s_Flat_mi_baseline_src_feat.index" % (n_ivf) 70 | ) 71 | """ 72 | 大小(都是FP32) 73 | big_src_feature 2.95G 74 | (3098036, 256) 75 | big_emb 4.43G 76 | (6196072, 192) 77 | big_emb双倍是因为求特征要repeat后再加pitch 78 | 79 | """ 80 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/infer/train-index.py: -------------------------------------------------------------------------------- 1 | """ 2 | 格式:直接cid为自带的index位;aid放不下了,通过字典来查,反正就5w个 3 | """ 4 | import os 5 | import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | import faiss 10 | import numpy as np 11 | 12 | # ###########如果是原始特征要先写save 13 | inp_root = r"E:\codes\py39\dataset\mi\2-co256" 14 | npys = [] 15 | for name in sorted(list(os.listdir(inp_root))): 16 | phone = np.load("%s/%s" % (inp_root, name)) 17 | npys.append(phone) 18 | big_npy = np.concatenate(npys, 0) 19 | logger.debug(big_npy.shape) # (6196072, 192)#fp32#4.43G 20 | np.save("infer/big_src_feature_mi.npy", big_npy) 21 | 22 | ##################train+add 23 | # big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy") 24 | logger.debug(big_npy.shape) 25 | index = faiss.index_factory(256, "IVF512,Flat") # mi 26 | logger.info("Training...") 27 | index_ivf = faiss.extract_index_ivf(index) # 28 | index_ivf.nprobe = 9 29 | index.train(big_npy) 30 | faiss.write_index(index, "infer/trained_IVF512_Flat_mi_baseline_src_feat.index") 31 | logger.info("Adding...") 32 | index.add(big_npy) 33 | faiss.write_index(index, "infer/added_IVF512_Flat_mi_baseline_src_feat.index") 34 | """ 35 | 大小(都是FP32) 36 | big_src_feature 2.95G 37 | (3098036, 256) 38 | big_emb 4.43G 39 | (6196072, 192) 40 | big_emb双倍是因为求特征要repeat后再加pitch 41 | 42 | """ 43 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/infer/trans_weights.py: 
-------------------------------------------------------------------------------- 1 | import pdb 2 | 3 | import torch 4 | 5 | # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-suc\G_1000.pth")["model"]#sim_nsf# 6 | # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder-flow-enc_q\G_1000.pth")["model"]#sim_nsf# 7 | # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder\G_1000.pth")["model"]#sim_nsf# 8 | # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-test\G_1000.pth")["model"]#sim_nsf# 9 | a = torch.load( 10 | r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-no_opt-no_dropout\G_1000.pth" 11 | )[ 12 | "model" 13 | ] # sim_nsf# 14 | for key in a.keys(): 15 | a[key] = a[key].half() 16 | # torch.save(a,"ft-mi-freeze-vocoder_true_1k.pt")# 17 | # torch.save(a,"ft-mi-sim1k.pt")# 18 | torch.save(a, "ft-mi-no_opt-no_dropout.pt") # 19 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/infer_batch_rvc.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | 5 | print("Command-line arguments:", sys.argv) 6 | 7 | now_dir = os.getcwd() 8 | sys.path.append(now_dir) 9 | import sys 10 | 11 | import tqdm as tq 12 | from dotenv import load_dotenv 13 | from scipy.io import wavfile 14 | 15 | from configs.config import Config 16 | from infer.modules.vc.modules import VC 17 | 18 | 19 | def arg_parse() -> tuple: 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("--f0up_key", type=int, default=0) 22 | parser.add_argument("--input_path", type=str, help="input path") 23 | parser.add_argument("--index_path", type=str, help="index path") 24 | parser.add_argument("--f0method", type=str, default="harvest", help="harvest or pm") 25 | parser.add_argument("--opt_path", type=str, help="opt path") 26 | parser.add_argument("--model_name", type=str, help="store in assets/weight_root") 27 | parser.add_argument("--index_rate", type=float, default=0.66, help="index rate") 28 | parser.add_argument("--device", type=str, help="device") 29 | parser.add_argument("--is_half", type=bool, help="use half -> True") 30 | parser.add_argument("--filter_radius", type=int, default=3, help="filter radius") 31 | parser.add_argument("--resample_sr", type=int, default=0, help="resample sr") 32 | parser.add_argument("--rms_mix_rate", type=float, default=1, help="rms mix rate") 33 | parser.add_argument("--protect", type=float, default=0.33, help="protect") 34 | 35 | args = parser.parse_args() 36 | sys.argv = sys.argv[:1] 37 | 38 | return args 39 | 40 | 41 | def main(): 42 | load_dotenv() 43 | args = arg_parse() 44 | config = Config() 45 | config.device = args.device if args.device else config.device 46 | config.is_half = args.is_half if args.is_half else config.is_half 47 | vc = VC(config) 48 | vc.get_vc(args.model_name) 49 | audios = os.listdir(args.input_path) 50 | for file in tq.tqdm(audios): 51 | if file.endswith(".wav"): 52 | file_path = os.path.join(args.input_path, file) 53 | _, wav_opt = vc.vc_single( 54 | 0, 55 | file_path, 56 | args.f0up_key, 57 | None, 58 | args.f0method, 59 | args.index_path, 60 | None, 61 | args.index_rate, 62 | args.filter_radius, 63 | args.resample_sr, 64 | args.rms_mix_rate, 65 | args.protect, 66 | ) 67 | out_path = os.path.join(args.opt_path, file) 68 | wavfile.write(out_path, wav_opt[0], wav_opt[1]) 69 | 70 | 71 | if __name__ == "__main__": 72 | main() 73 | 
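# A usage sketch (not part of the original repository). infer_batch_rvc.py above converts every
# .wav file in --input_path with the selected RVC model and writes the results to --opt_path.
# The flags below mirror its argparse definitions; the model name, folders and index path are
# placeholders, and the script is assumed to be launched from the rvc directory with the .env
# variables it loads via load_dotenv() already configured.
import subprocess
import sys

subprocess.run(
    [
        sys.executable, "tools/infer_batch_rvc.py",
        "--model_name", "my_voice.pth",        # expected under the weight_root folder (placeholder name)
        "--input_path", "input_wavs",          # folder containing the source .wav files (placeholder)
        "--opt_path", "output_wavs",           # existing folder for the converted files (placeholder)
        "--index_path", "logs/my_voice.index", # optional feature index (placeholder path)
        "--f0method", "harvest",               # default in the script; "pm" is the documented alternative
        "--f0up_key", "0",
    ],
    check=True,
)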
-------------------------------------------------------------------------------- /tts-cli/rvc/tools/infer_cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | 5 | now_dir = os.getcwd() 6 | sys.path.append(now_dir) 7 | from dotenv import load_dotenv 8 | from scipy.io import wavfile 9 | 10 | from configs.config import Config 11 | from infer.modules.vc.modules import VC 12 | 13 | #### 14 | # USAGE 15 | # 16 | # In your Terminal or CMD or whatever 17 | 18 | 19 | def arg_parse() -> tuple: 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("--f0up_key", type=int, default=0) 22 | parser.add_argument("--input_path", type=str, help="input path") 23 | parser.add_argument("--index_path", type=str, help="index path") 24 | parser.add_argument("--f0method", type=str, default="harvest", help="harvest or pm") 25 | parser.add_argument("--opt_path", type=str, help="opt path") 26 | parser.add_argument("--model_name", type=str, help="store in assets/weight_root") 27 | parser.add_argument("--index_rate", type=float, default=0.66, help="index rate") 28 | parser.add_argument("--device", type=str, help="device") 29 | parser.add_argument("--is_half", type=bool, help="use half -> True") 30 | parser.add_argument("--filter_radius", type=int, default=3, help="filter radius") 31 | parser.add_argument("--resample_sr", type=int, default=0, help="resample sr") 32 | parser.add_argument("--rms_mix_rate", type=float, default=1, help="rms mix rate") 33 | parser.add_argument("--protect", type=float, default=0.33, help="protect") 34 | 35 | args = parser.parse_args() 36 | sys.argv = sys.argv[:1] 37 | 38 | return args 39 | 40 | 41 | def main(): 42 | load_dotenv() 43 | args = arg_parse() 44 | config = Config() 45 | config.device = args.device if args.device else config.device 46 | config.is_half = args.is_half if args.is_half else config.is_half 47 | vc = VC(config) 48 | vc.get_vc(args.model_name) 49 | _, wav_opt = vc.vc_single( 50 | 0, 51 | args.input_path, 52 | args.f0up_key, 53 | None, 54 | args.f0method, 55 | args.index_path, 56 | None, 57 | args.index_rate, 58 | args.filter_radius, 59 | args.resample_sr, 60 | args.rms_mix_rate, 61 | args.protect, 62 | ) 63 | wavfile.write(args.opt_path, wav_opt[0], wav_opt[1]) 64 | 65 | 66 | if __name__ == "__main__": 67 | main() 68 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/onnx_inference_demo.py: -------------------------------------------------------------------------------- 1 | import soundfile 2 | 3 | from ..infer.lib.infer_pack.onnx_inference import OnnxRVC 4 | 5 | hop_size = 512 6 | sampling_rate = 40000 # 采样率 7 | f0_up_key = 0 # 升降调 8 | sid = 0 # 角色ID 9 | f0_method = "dio" # F0提取算法 10 | model_path = "ShirohaRVC.onnx" # 模型的完整路径 11 | vec_name = "vec-256-layer-9" # 内部自动补齐为 f"pretrained/{vec_name}.onnx" 需要onnx的vec模型 12 | wav_path = "123.wav" # 输入路径或ByteIO实例 13 | out_path = "out.wav" # 输出路径或ByteIO实例 14 | 15 | model = OnnxRVC( 16 | model_path, vec_path=vec_name, sr=sampling_rate, hop_size=hop_size, device="cuda" 17 | ) 18 | 19 | audio = model.inference(wav_path, sid, f0_method=f0_method, f0_up_key=f0_up_key) 20 | 21 | soundfile.write(out_path, audio, sampling_rate) 22 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/torchgate/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | TorchGating is a PyTorch-based implementation of 
Spectral Gating 3 | ================================================ 4 | Author: Asaf Zorea 5 | 6 | Contents 7 | -------- 8 | torchgate imports all the functions from PyTorch, and in addition provides: 9 | TorchGating --- A PyTorch module that applies a spectral gate to an input signal 10 | 11 | """ 12 | from .torchgate import TorchGate 13 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/torchgate/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/tools/torchgate/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/tools/torchgate/__pycache__/torchgate.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/tools/torchgate/__pycache__/torchgate.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/tools/torchgate/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/tools/torchgate/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/tools/torchgate/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.types import Number 3 | 4 | 5 | @torch.no_grad() 6 | def amp_to_db( 7 | x: torch.Tensor, eps=torch.finfo(torch.float64).eps, top_db=40 8 | ) -> torch.Tensor: 9 | """ 10 | Convert the input tensor from amplitude to decibel scale. 11 | 12 | Arguments: 13 | x {[torch.Tensor]} -- [Input tensor.] 14 | 15 | Keyword Arguments: 16 | eps {[float]} -- [Small value to avoid numerical instability.] 17 | (default: {torch.finfo(torch.float64).eps}) 18 | top_db {[float]} -- [threshold the output at ``top_db`` below the peak] 19 | ` (default: {40}) 20 | 21 | Returns: 22 | [torch.Tensor] -- [Output tensor in decibel scale.] 23 | """ 24 | x_db = 20 * torch.log10(x.abs() + eps) 25 | return torch.max(x_db, (x_db.max(-1).values - top_db).unsqueeze(-1)) 26 | 27 | 28 | @torch.no_grad() 29 | def temperature_sigmoid(x: torch.Tensor, x0: float, temp_coeff: float) -> torch.Tensor: 30 | """ 31 | Apply a sigmoid function with temperature scaling. 32 | 33 | Arguments: 34 | x {[torch.Tensor]} -- [Input tensor.] 35 | x0 {[float]} -- [Parameter that controls the threshold of the sigmoid.] 36 | temp_coeff {[float]} -- [Parameter that controls the slope of the sigmoid.] 37 | 38 | Returns: 39 | [torch.Tensor] -- [Output tensor after applying the sigmoid with temperature scaling.] 40 | """ 41 | return torch.sigmoid((x - x0) / temp_coeff) 42 | 43 | 44 | @torch.no_grad() 45 | def linspace( 46 | start: Number, stop: Number, num: int = 50, endpoint: bool = True, **kwargs 47 | ) -> torch.Tensor: 48 | """ 49 | Generate a linearly spaced 1-D tensor. 50 | 51 | Arguments: 52 | start {[Number]} -- [The starting value of the sequence.] 53 | stop {[Number]} -- [The end value of the sequence, unless `endpoint` is set to False. 
54 | In that case, the sequence consists of all but the last of ``num + 1`` 55 | evenly spaced samples, so that `stop` is excluded. Note that the step 56 | size changes when `endpoint` is False.] 57 | 58 | Keyword Arguments: 59 | num {[int]} -- [Number of samples to generate. Default is 50. Must be non-negative.] 60 | endpoint {[bool]} -- [If True, `stop` is the last sample. Otherwise, it is not included. 61 | Default is True.] 62 | **kwargs -- [Additional arguments to be passed to the underlying PyTorch `linspace` function.] 63 | 64 | Returns: 65 | [torch.Tensor] -- [1-D tensor of `num` equally spaced samples from `start` to `stop`.] 66 | """ 67 | if endpoint: 68 | return torch.linspace(start, stop, num, **kwargs) 69 | else: 70 | return torch.linspace(start, stop, num + 1, **kwargs)[:-1] 71 | -------------------------------------------------------------------------------- /tts-cli/server.py: -------------------------------------------------------------------------------- 1 | if __name__ == "__main__": 2 | import json 3 | import logging 4 | import threading 5 | import asyncio 6 | from queue import Queue, Empty 7 | import websockets 8 | from xtts_rvc_synthesizer import XTTSRVCSynthesizer 9 | 10 | # Initialize parameters 11 | xtts_model = "models/xtts/Lasinya" 12 | xtts_voice = "Lasinya_Reference.json" 13 | rvc_model = "models/rvc/Lasinya" 14 | use_logging = True 15 | 16 | # Use thread-safe Queues for audio chunks and control messages 17 | audio_queue = Queue() 18 | control_queue = Queue() 19 | 20 | # Set to store audio WebSocket connections 21 | audio_connections = set() 22 | 23 | # Event to signal the threads to stop 24 | stop_event = threading.Event() 25 | 26 | class TTSThread(threading.Thread): 27 | def __init__(self): 28 | super().__init__() 29 | self.tts = None 30 | 31 | def run(self): 32 | self.tts = XTTSRVCSynthesizer( 33 | xtts_model=xtts_model, 34 | xtts_voice=xtts_voice, 35 | rvc_model=rvc_model, 36 | rvc_sample_rate=40000, 37 | use_logging=use_logging, 38 | on_audio_chunk=self.on_audio_chunk 39 | ) 40 | while not stop_event.is_set(): 41 | try: 42 | data = control_queue.get(timeout=0.1) 43 | if data["type"] == "text": 44 | self.tts.push_text(data["content"]) 45 | elif data["type"] == "synthesize": 46 | self.tts.synthesize() 47 | except Empty: 48 | continue 49 | 50 | def on_audio_chunk(self, chunk): 51 | print("received chunk") 52 | audio_queue.put(chunk) 53 | 54 | async def process_audio_queue(): 55 | while True: 56 | try: 57 | chunk = audio_queue.get_nowait() 58 | print("Processing chunk from queue") 59 | await broadcast_audio_chunk(chunk) 60 | except Empty: 61 | await asyncio.sleep(0.01) 62 | 63 | async def broadcast_audio_chunk(chunk): 64 | print("broadcast_audio_chunk was called") 65 | for conn in list(audio_connections): 66 | try: 67 | await conn.send(chunk) 68 | except websockets.exceptions.ConnectionClosed: 69 | audio_connections.remove(conn) 70 | 71 | async def control_handler(websocket, path): 72 | try: 73 | async for message in websocket: 74 | data = json.loads(message) 75 | control_queue.put(data) 76 | await websocket.send(json.dumps({"type": f"{data['type']}_received"})) 77 | except websockets.exceptions.ConnectionClosed: 78 | logging.info("Control WebSocket connection closed") 79 | 80 | async def audio_handler(websocket, path): 81 | try: 82 | audio_connections.add(websocket) 83 | await websocket.wait_closed() 84 | finally: 85 | audio_connections.remove(websocket) 86 | 87 | async def main(): 88 | # Start the TTS thread 89 | tts_thread = TTSThread() 90 | 
tts_thread.start() 91 | 92 | # Start the audio processing task 93 | audio_task = asyncio.create_task(process_audio_queue()) 94 | 95 | control_server = await websockets.serve(control_handler, "localhost", 8000) 96 | audio_server = await websockets.serve(audio_handler, "localhost", 8001) 97 | 98 | print("Server CONTROL listening on ws://localhost:8000") 99 | print("Server AUDIO listening on ws://localhost:8001") 100 | 101 | try: 102 | await asyncio.gather(control_server.wait_closed(), audio_server.wait_closed(), audio_task) 103 | finally: 104 | stop_event.set() 105 | tts_thread.join() 106 | 107 | logging.basicConfig(level=logging.DEBUG if use_logging else logging.WARNING) 108 | asyncio.run(main()) -------------------------------------------------------------------------------- /tts-cli/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="tts-cli", 5 | version="0.1", 6 | packages=find_packages(), 7 | entry_points={ 8 | 'console_scripts': [ 9 | 'tts=tts_client:main', 10 | 'tts-server=start_tts_server:main', 11 | ], 12 | }, 13 | ) -------------------------------------------------------------------------------- /tts-cli/start_tts_server.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | 5 | def main(): 6 | # Get the current script's directory (should be tts-cli) 7 | script_dir = os.path.dirname(os.path.abspath(__file__)) 8 | 9 | # Move one directory up to access the venv 10 | root_dir = os.path.dirname(script_dir) 11 | os.chdir(root_dir) 12 | 13 | # Path to the virtual environment 14 | venv_path = os.path.join(root_dir, 'venv') 15 | 16 | # Path to the Python interpreter in the virtual environment 17 | if sys.platform == "win32": 18 | python_path = os.path.join(venv_path, 'Scripts', 'python.exe') 19 | else: 20 | python_path = os.path.join(venv_path, 'bin', 'python') 21 | 22 | # Change back to the tts-cli directory 23 | os.chdir(script_dir) 24 | 25 | # Prepare the command to run tts_server.py with all provided arguments 26 | command = [python_path, 'tts_server.py'] + sys.argv[1:] 27 | 28 | # Start the TTS server 29 | print("Starting TTS server...") 30 | print(f"Command: {command}") 31 | try: 32 | subprocess.run(command, check=True) 33 | except subprocess.CalledProcessError as e: 34 | print(f"Error starting TTS server: {e}") 35 | sys.exit(1) 36 | except FileNotFoundError: 37 | print(f"Error: Could not find Python interpreter at {python_path}") 38 | print("Make sure the virtual environment is set up correctly.") 39 | sys.exit(1) 40 | 41 | if __name__ == "__main__": 42 | main() 43 | 44 | # import os 45 | # import sys 46 | # import subprocess 47 | 48 | # def start_tts_server(): 49 | # # Get the current script's directory (should be tts-cli) 50 | # script_dir = os.path.dirname(os.path.abspath(__file__)) 51 | 52 | # # Move one directory up to access the venv 53 | # root_dir = os.path.dirname(script_dir) 54 | # os.chdir(root_dir) 55 | 56 | # # Path to the virtual environment 57 | # venv_path = os.path.join(root_dir, 'venv') 58 | 59 | # # Path to the Python interpreter in the virtual environment 60 | # if sys.platform == "win32": 61 | # python_path = os.path.join(venv_path, 'Scripts', 'python.exe') 62 | # else: 63 | # python_path = os.path.join(venv_path, 'bin', 'python') 64 | 65 | # # Change back to the tts-cli directory 66 | # os.chdir(script_dir) 67 | 68 | # # Start the TTS server 69 | # print("Starting 
TTS server...") 70 | # try: 71 | # subprocess.run([python_path, 'tts_server.py'], check=True) 72 | # except subprocess.CalledProcessError as e: 73 | # print(f"Error starting TTS server: {e}") 74 | # except FileNotFoundError: 75 | # print(f"Error: Could not find Python interpreter at {python_path}") 76 | # print("Make sure the virtual environment is set up correctly.") 77 | 78 | # def main(): 79 | # start_tts_server() 80 | 81 | # if __name__ == "__main__": 82 | # main() 83 | --------------------------------------------------------------------------------
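# A client sketch (illustrative only, not a file from the repository). tts-cli/server.py above
# exposes two WebSocket endpoints: JSON control messages ("text" with a "content" field, then
# "synthesize") go to ws://localhost:8000, and the synthesized audio chunks are broadcast as
# binary frames on ws://localhost:8001. This minimal client assumes the server is already
# running and uses only the message shapes visible in control_handler and TTSThread.run; the
# audio chunk format (sample rate/encoding) is not specified there, so a real client would
# need to match whatever XTTSRVCSynthesizer emits.
import asyncio
import json

import websockets


async def speak(text: str) -> None:
    async with websockets.connect("ws://localhost:8001") as audio_ws, \
               websockets.connect("ws://localhost:8000") as control_ws:
        # Queue the text, then trigger synthesis.
        await control_ws.send(json.dumps({"type": "text", "content": text}))
        print(await control_ws.recv())  # server acknowledges with {"type": "text_received"}
        await control_ws.send(json.dumps({"type": "synthesize"}))
        print(await control_ws.recv())  # server acknowledges with {"type": "synthesize_received"}
        # Read a handful of audio frames; a real client would stream them to a playback device.
        for _ in range(10):
            chunk = await audio_ws.recv()
            print(f"received audio frame of {len(chunk)} bytes")


if __name__ == "__main__":
    asyncio.run(speak("Hello from a minimal tts-cli client."))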