├── Readme.MD ├── _create_venv.bat ├── _install_cli_commands.bat ├── _install_dependencies.bat ├── _install_models.bat ├── _start_cli.bat ├── _start_llm_server.bat ├── _start_stt_server.bat ├── _start_tts_server.bat ├── _start_venv.bat ├── llm-cli ├── llm_client.py ├── llm_server.py ├── server.py ├── setup.py └── start_llm_server.py ├── requirements.txt ├── requirements_client.txt ├── stt-cli ├── server.py ├── setup.py ├── start_stt_server.py ├── stt_client.py └── stt_server.py └── tts-cli ├── bufferstream.py ├── download_models.py ├── requirements.txt ├── rvc ├── __pycache__ │ └── realtimervc.cpython-310.pyc ├── configs │ ├── __pycache__ │ │ └── config.cpython-310.pyc │ ├── config.json │ ├── config.py │ ├── v1 │ │ ├── 32k.json │ │ ├── 40k.json │ │ └── 48k.json │ └── v2 │ │ ├── 32k.json │ │ └── 48k.json ├── i18n │ ├── i18n.py │ ├── locale │ │ ├── en_US.json │ │ ├── es_ES.json │ │ ├── fr_FR.json │ │ ├── it_IT.json │ │ ├── ja_JP.json │ │ ├── ru_RU.json │ │ ├── tr_TR.json │ │ ├── zh_CN.json │ │ ├── zh_HK.json │ │ ├── zh_SG.json │ │ └── zh_TW.json │ ├── locale_diff.py │ └── scan_i18n.py ├── infer │ ├── lib │ │ ├── __pycache__ │ │ │ └── rmvpe.cpython-310.pyc │ │ ├── audio.py │ │ ├── infer_pack │ │ │ ├── __pycache__ │ │ │ │ ├── attentions.cpython-310.pyc │ │ │ │ ├── commons.cpython-310.pyc │ │ │ │ ├── models.cpython-310.pyc │ │ │ │ ├── modules.cpython-310.pyc │ │ │ │ └── transforms.cpython-310.pyc │ │ │ ├── attentions.py │ │ │ ├── commons.py │ │ │ ├── models.py │ │ │ ├── models_onnx.py │ │ │ ├── modules.py │ │ │ ├── modules │ │ │ │ └── F0Predictor │ │ │ │ │ ├── DioF0Predictor.py │ │ │ │ │ ├── F0Predictor.py │ │ │ │ │ ├── HarvestF0Predictor.py │ │ │ │ │ ├── PMF0Predictor.py │ │ │ │ │ └── __init__.py │ │ │ ├── onnx_inference.py │ │ │ └── transforms.py │ │ ├── jit │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-310.pyc │ │ │ │ └── get_synthesizer.cpython-310.pyc │ │ │ ├── get_hubert.py │ │ │ ├── get_rmvpe.py │ │ │ └── get_synthesizer.py │ │ ├── rmvpe.py │ │ ├── slicer2.py │ │ ├── train │ │ │ ├── data_utils.py │ │ │ ├── losses.py │ │ │ ├── mel_processing.py │ │ │ ├── process_ckpt.py │ │ │ └── utils.py │ │ └── uvr5_pack │ │ │ ├── lib_v5 │ │ │ ├── dataset.py │ │ │ ├── layers.py │ │ │ ├── layers_123812KB .py │ │ │ ├── layers_123821KB.py │ │ │ ├── layers_33966KB.py │ │ │ ├── layers_537227KB.py │ │ │ ├── layers_537238KB.py │ │ │ ├── layers_new.py │ │ │ ├── model_param_init.py │ │ │ ├── modelparams │ │ │ │ ├── 1band_sr16000_hl512.json │ │ │ │ ├── 1band_sr32000_hl512.json │ │ │ │ ├── 1band_sr33075_hl384.json │ │ │ │ ├── 1band_sr44100_hl1024.json │ │ │ │ ├── 1band_sr44100_hl256.json │ │ │ │ ├── 1band_sr44100_hl512.json │ │ │ │ ├── 1band_sr44100_hl512_cut.json │ │ │ │ ├── 2band_32000.json │ │ │ │ ├── 2band_44100_lofi.json │ │ │ │ ├── 2band_48000.json │ │ │ │ ├── 3band_44100.json │ │ │ │ ├── 3band_44100_mid.json │ │ │ │ ├── 3band_44100_msb2.json │ │ │ │ ├── 4band_44100.json │ │ │ │ ├── 4band_44100_mid.json │ │ │ │ ├── 4band_44100_msb.json │ │ │ │ ├── 4band_44100_msb2.json │ │ │ │ ├── 4band_44100_reverse.json │ │ │ │ ├── 4band_44100_sw.json │ │ │ │ ├── 4band_v2.json │ │ │ │ ├── 4band_v2_sn.json │ │ │ │ ├── 4band_v3.json │ │ │ │ └── ensemble.json │ │ │ ├── nets.py │ │ │ ├── nets_123812KB.py │ │ │ ├── nets_123821KB.py │ │ │ ├── nets_33966KB.py │ │ │ ├── nets_537227KB.py │ │ │ ├── nets_537238KB.py │ │ │ ├── nets_61968KB.py │ │ │ ├── nets_new.py │ │ │ └── spec_utils.py │ │ │ ├── name_params.json │ │ │ └── utils.py │ └── modules │ │ ├── ipex │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── 
gradscaler.py │ │ └── hijacks.py │ │ ├── onnx │ │ └── export.py │ │ ├── train │ │ ├── extract │ │ │ ├── extract_f0_print.py │ │ │ ├── extract_f0_rmvpe.py │ │ │ └── extract_f0_rmvpe_dml.py │ │ ├── extract_feature_print.py │ │ ├── preprocess.py │ │ └── train.py │ │ ├── uvr5 │ │ ├── mdxnet.py │ │ ├── modules.py │ │ └── vr.py │ │ └── vc │ │ ├── __init__.py │ │ ├── modules.py │ │ ├── pipeline.py │ │ └── utils.py ├── realtimervc.py └── tools │ ├── __pycache__ │ └── rvc_for_realtime.cpython-310.pyc │ ├── app.py │ ├── calc_rvc_model_similarity.py │ ├── dlmodels.bat │ ├── dlmodels.sh │ ├── download_models - Kopie.py │ ├── download_models.py │ ├── export_onnx.py │ ├── infer │ ├── infer-pm-index256.py │ ├── train-index-v2.py │ ├── train-index.py │ └── trans_weights.py │ ├── infer_batch_rvc.py │ ├── infer_cli.py │ ├── onnx_inference_demo.py │ ├── rvc_for_realtime.py │ └── torchgate │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── torchgate.cpython-310.pyc │ └── utils.cpython-310.pyc │ ├── torchgate.py │ └── utils.py ├── server.py ├── setup.py ├── start_tts_server.py ├── tts_client.py ├── tts_server.py ├── vanessa.json └── xtts_rvc_synthesizer.py /_create_venv.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | :: Set Python path (adjust this if needed) 4 | set PYTHON_EXE=python.exe 5 | 6 | 7 | echo Installing AI CLI Tools... 8 | setlocal enabledelayedexpansion 9 | 10 | :: Set current directory 11 | cd /d %~dp0 12 | 13 | echo Starting installation process... 14 | 15 | :: Create and activate virtual environment 16 | echo Creating and activating virtual environment... 17 | %PYTHON_EXE% -m venv venv 18 | call venv\Scripts\activate.bat 19 | 20 | :: Upgrade pip 21 | echo Upgrading pip... 22 | python -m pip install pip==23.3.1 23 | 24 | -------------------------------------------------------------------------------- /_install_cli_commands.bat: -------------------------------------------------------------------------------- 1 | echo Installing CLI commands 2 | cd llm-cli 3 | pip uninstall -y llm-cli 4 | pip install -e . 5 | cd .. 6 | cd stt-cli 7 | pip uninstall -y stt-cli 8 | pip install -e . 9 | cd .. 10 | cd tts-cli 11 | pip uninstall -y tts-cli 12 | pip install -e . 13 | cd .. 
14 | 15 | echo Installation of CLI commands finished -------------------------------------------------------------------------------- /_install_dependencies.bat: -------------------------------------------------------------------------------- 1 | REM call _start_venv.bat 2 | echo Installing basic dependencies 3 | pip install -r requirements.txt 4 | 5 | echo Upgrading torch to use GPU 6 | pip install torch==2.3.1+cu121 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121 7 | 8 | echo Installation of dependencies finished -------------------------------------------------------------------------------- /_install_models.bat: -------------------------------------------------------------------------------- 1 | REM call _start_venv.bat 2 | echo Downloading models 3 | cd tts-cli 4 | python download_models.py 5 | 6 | echo Download of models finished -------------------------------------------------------------------------------- /_start_cli.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | cmd 3 | -------------------------------------------------------------------------------- /_start_llm_server.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | llm-server 3 | cmd 4 | -------------------------------------------------------------------------------- /_start_stt_server.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | stt-server 3 | cmd -------------------------------------------------------------------------------- /_start_tts_server.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | tts-server 3 | cmd 4 | -------------------------------------------------------------------------------- /_start_venv.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | call venv\Scripts\activate.bat 3 | cmd 4 | -------------------------------------------------------------------------------- /llm-cli/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="llm-cli", 5 | version="0.1", 6 | packages=find_packages(), 7 | entry_points={ 8 | 'console_scripts': [ 9 | 'llm=llm_client:main', 10 | 'llm-server=start_llm_server:main', 11 | ], 12 | }, 13 | ) -------------------------------------------------------------------------------- /llm-cli/start_llm_server.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | 5 | def main(): 6 | # Get the current script's directory (should be llm-cli) 7 | script_dir = os.path.dirname(os.path.abspath(__file__)) 8 | 9 | # Move one directory up to access the venv 10 | root_dir = os.path.dirname(script_dir) 11 | os.chdir(root_dir) 12 | 13 | # Path to the virtual environment 14 | venv_path = os.path.join(root_dir, 'venv') 15 | 16 | # Path to the Python interpreter in the virtual environment 17 | if sys.platform == "win32": 18 | python_path = os.path.join(venv_path, 'Scripts', 'python.exe') 19 | else: 20 | python_path = os.path.join(venv_path, 'bin', 'python') 21 | 22 | # Change back to the llm-cli directory 23 | os.chdir(script_dir) 24 | 25 | # Prepare the command to run llm_server.py with all provided arguments 26 | command = [python_path, 'llm_server.py'] + sys.argv[1:] 27 | 28 | # Start the LLM server 29 | print("Starting LLM 
server...") 30 | print(f"Command: {command}") 31 | try: 32 | subprocess.run(command, check=True) 33 | except subprocess.CalledProcessError as e: 34 | print(f"Error starting LLM server: {e}") 35 | sys.exit(1) 36 | except FileNotFoundError: 37 | print(f"Error: Could not find Python interpreter at {python_path}") 38 | print("Make sure the virtual environment is set up correctly.") 39 | sys.exit(1) 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # stt server depencendies 2 | realtimestt 3 | 4 | # tts server depencendies 5 | realtimetts[all] 6 | 7 | # rvc (realtime voice change) depencendies 8 | fairseq 9 | faiss-cpu 10 | praat-parselmouth 11 | torchcrepe 12 | torchfcpe 13 | pyworld -------------------------------------------------------------------------------- /requirements_client.txt: -------------------------------------------------------------------------------- 1 | websockets 2 | pyaudio 3 | websocket-client 4 | colorama 5 | tqdm -------------------------------------------------------------------------------- /stt-cli/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="stt-cli", 5 | version="0.1", 6 | packages=find_packages(), 7 | entry_points={ 8 | 'console_scripts': [ 9 | 'stt=stt_client:main', 10 | 'stt-server=start_stt_server:main', 11 | ], 12 | }, 13 | ) -------------------------------------------------------------------------------- /stt-cli/start_stt_server.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | 5 | def main(): 6 | # Get the current script's directory (should be stt-cli) 7 | script_dir = os.path.dirname(os.path.abspath(__file__)) 8 | 9 | # Move one directory up to access the venv 10 | root_dir = os.path.dirname(script_dir) 11 | os.chdir(root_dir) 12 | 13 | # Path to the virtual environment 14 | venv_path = os.path.join(root_dir, 'venv') 15 | 16 | # Path to the Python interpreter in the virtual environment 17 | if sys.platform == "win32": 18 | python_path = os.path.join(venv_path, 'Scripts', 'python.exe') 19 | else: 20 | python_path = os.path.join(venv_path, 'bin', 'python') 21 | 22 | # Change back to the stt-cli directory 23 | os.chdir(script_dir) 24 | 25 | # Prepare the command to run stt_server.py with all provided arguments 26 | command = [python_path, 'stt_server.py'] + sys.argv[1:] 27 | 28 | # Start the STT server 29 | print("Starting STT server...") 30 | print(f"Command: {command}") 31 | try: 32 | subprocess.run(command, check=True) 33 | except subprocess.CalledProcessError as e: 34 | print(f"Error starting STT server: {e}") 35 | sys.exit(1) 36 | except FileNotFoundError: 37 | print(f"Error: Could not find Python interpreter at {python_path}") 38 | print("Make sure the virtual environment is set up correctly.") 39 | sys.exit(1) 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /stt-cli/stt_server.py: -------------------------------------------------------------------------------- 1 | if __name__ == '__main__': 2 | print("Starting server, please wait...") 3 | 4 | from RealtimeSTT import AudioToTextRecorder 5 | import asyncio 6 | import websockets 7 | import threading 8 | import numpy as np 9 | 
from scipy.signal import resample 10 | import json 11 | 12 | recorder = None 13 | recorder_ready = threading.Event() 14 | client_websocket = None 15 | 16 | async def send_to_client(message): 17 | if client_websocket: 18 | await client_websocket.send(message) 19 | 20 | def text_detected(text): 21 | asyncio.new_event_loop().run_until_complete( 22 | send_to_client( 23 | json.dumps({ 24 | 'type': 'realtime', 25 | 'text': text 26 | }) 27 | ) 28 | ) 29 | print(f"\r{text}", flush=True, end='') 30 | 31 | recorder_config = { 32 | 'spinner': False, 33 | 'use_microphone': False, 34 | 'model': 'large-v2', 35 | 'silero_sensitivity': 0.4, 36 | 'silero_deactivity_detection': True, 37 | 'webrtc_sensitivity': 3, 38 | 'post_speech_silence_duration': 0.25, 39 | 'min_length_of_recording': 0, 40 | 'min_gap_between_recordings': 0, 41 | 'enable_realtime_transcription': True, 42 | 'realtime_processing_pause': 0, 43 | 'realtime_model_type': 'medium', 44 | 'on_realtime_transcription_stabilized': text_detected, 45 | } 46 | 47 | def _recorder_thread(): 48 | global recorder 49 | print("Initializing RealtimeSTT...") 50 | recorder = AudioToTextRecorder(**recorder_config) 51 | print("RealtimeSTT initialized") 52 | recorder_ready.set() 53 | while True: 54 | full_sentence = recorder.text() 55 | asyncio.new_event_loop().run_until_complete( 56 | send_to_client( 57 | json.dumps({ 58 | 'type': 'fullSentence', 59 | 'text': full_sentence 60 | }) 61 | ) 62 | ) 63 | print(f"\rSentence: {full_sentence}") 64 | 65 | def decode_and_resample( 66 | audio_data, 67 | original_sample_rate, 68 | target_sample_rate): 69 | 70 | # Decode 16-bit PCM data to numpy array 71 | audio_np = np.frombuffer(audio_data, dtype=np.int16) 72 | 73 | # Calculate the number of samples after resampling 74 | num_original_samples = len(audio_np) 75 | num_target_samples = int(num_original_samples * target_sample_rate / 76 | original_sample_rate) 77 | 78 | # Resample the audio 79 | resampled_audio = resample(audio_np, num_target_samples) 80 | 81 | return resampled_audio.astype(np.int16).tobytes() 82 | 83 | async def echo(websocket, path): 84 | print("Client connected") 85 | global client_websocket 86 | client_websocket = websocket 87 | async for message in websocket: 88 | 89 | if not recorder_ready.is_set(): 90 | print("Recorder not ready") 91 | continue 92 | 93 | metadata_length = int.from_bytes(message[:4], byteorder='little') 94 | metadata_json = message[4:4+metadata_length].decode('utf-8') 95 | metadata = json.loads(metadata_json) 96 | sample_rate = metadata['sampleRate'] 97 | chunk = message[4+metadata_length:] 98 | resampled_chunk = decode_and_resample(chunk, sample_rate, 16000) 99 | recorder.feed_audio(resampled_chunk) 100 | 101 | 102 | def main(): 103 | # start_server = websockets.serve(echo, "0.0.0.0", 9001) 104 | start_server = websockets.serve(echo, "localhost", 8011) 105 | 106 | recorder_thread = threading.Thread(target=_recorder_thread) 107 | recorder_thread.start() 108 | recorder_ready.wait() 109 | 110 | print("Server started. 
Press Ctrl+C to stop the server.") 111 | asyncio.get_event_loop().run_until_complete(start_server) 112 | asyncio.get_event_loop().run_forever() 113 | 114 | main() 115 | 116 | -------------------------------------------------------------------------------- /tts-cli/bufferstream.py: -------------------------------------------------------------------------------- 1 | import queue 2 | import threading 3 | import uuid 4 | from typing import Generator, List, Any 5 | 6 | class BufferStream: 7 | def __init__(self): 8 | self.items: queue.Queue = queue.Queue() 9 | self._stop_event: threading.Event = threading.Event() 10 | self.stopped: bool = False 11 | self.stream_id: str = str(uuid.uuid4()) 12 | 13 | def add(self, item: Any) -> None: 14 | """Add an item to the buffer.""" 15 | self.items.put(item) 16 | 17 | def stop(self) -> None: 18 | """Signal to stop the buffer stream.""" 19 | self._stop_event.set() 20 | 21 | def snapshot(self) -> List[Any]: 22 | """Take a snapshot of all items in the buffer without exhausting it.""" 23 | with self.items.mutex: 24 | return list(self.items.queue) 25 | 26 | def gen(self) -> Generator[Any, None, None]: 27 | """Generate items from the buffer, yielding them one at a time.""" 28 | while not self._stop_event.is_set() or not self.items.empty(): 29 | try: 30 | yield self.items.get(timeout=0.1) 31 | except queue.Empty: 32 | continue 33 | self.stopped = True 34 | -------------------------------------------------------------------------------- /tts-cli/download_models.py: -------------------------------------------------------------------------------- 1 | """ 2 | Python script to download the ai models 3 | needed by linguflex from Huggingface's model hub. 4 | """ 5 | 6 | from huggingface_hub import hf_hub_download 7 | import os 8 | 9 | 10 | def create_directory(path): 11 | if not os.path.exists(path): 12 | os.makedirs(path) 13 | 14 | 15 | def create_directories(): 16 | create_directory("assets") 17 | create_directory("assets/hubert") 18 | create_directory("models") 19 | create_directory("models/rvc") 20 | create_directory("models/xtts") 21 | create_directory("models/xtts/Lasinya") 22 | 23 | 24 | def download_file( 25 | url, 26 | filename_server, 27 | path_local 28 | ): 29 | 30 | local_file = os.path.join(path_local, filename_server) 31 | if os.path.exists(local_file): 32 | print(f"File {filename_server} already exists in {path_local}.") 33 | return 34 | 35 | print(f"Downloading {filename_server} from repo {url} to {path_local}") 36 | hf_hub_download( 37 | repo_id=url, 38 | filename=filename_server, 39 | local_dir=path_local) 40 | 41 | 42 | create_directories() 43 | 44 | # download rvc base model (hubert) files 45 | print("Downloading hubert base model files") 46 | download_file( 47 | "KoljaB/RVC_Assets", "hubert_base.pt", "assets/hubert") 48 | download_file( 49 | "KoljaB/RVC_Assets", "hubert_inputs.pth", "assets/hubert") 50 | 51 | # download rvc trained model files 52 | print("Downloading rvc trained model files") 53 | download_file( 54 | "KoljaB/RVC_Models", "Lasinya.pth", "models/rvc") 55 | download_file( 56 | "KoljaB/RVC_Models", "Lasinya.index", "models/rvc") 57 | 58 | # download xtts trained model files 59 | from huggingface_hub import hf_hub_download 60 | import os 61 | 62 | def create_directory(path): 63 | if not os.path.exists(path): 64 | os.makedirs(path) 65 | 66 | def create_directories(): 67 | create_directory("assets") 68 | create_directory("assets/hubert") 69 | create_directory("models") 70 | create_directory("models/rvc") 71 | create_directory("models/xtts") 
72 | create_directory("models/xtts/v2.0.2") 73 | 74 | def download_file(url, filename_server, path_local): 75 | local_file = os.path.join(path_local, filename_server) 76 | if os.path.exists(local_file): 77 | print(f"File {filename_server} already exists in {path_local}.") 78 | return 79 | 80 | print(f"Downloading {filename_server} from repo {url} to {path_local}") 81 | hf_hub_download( 82 | repo_id=url, 83 | filename=filename_server, 84 | local_dir=path_local) 85 | 86 | create_directories() 87 | 88 | # download rvc base model (hubert) files 89 | print("Downloading hubert base model files") 90 | download_file( 91 | "KoljaB/RVC_Assets", "hubert_base.pt", "assets/hubert") 92 | download_file( 93 | "KoljaB/RVC_Assets", "hubert_inputs.pth", "assets/hubert") 94 | 95 | # download rvc trained model files 96 | print("Downloading rvc trained model files") 97 | download_file( 98 | "KoljaB/RVC_Models", "Lasinya.pth", "models/rvc") 99 | download_file( 100 | "KoljaB/RVC_Models", "Lasinya.index", "models/rvc") 101 | 102 | # download xtts v2 base model files 103 | print("Downloading XTTS v2 base model files") 104 | download_file( 105 | "coqui/XTTS-v2", "config.json", "models/xtts/v2.0.2") 106 | download_file( 107 | "coqui/XTTS-v2", "model.pth", "models/xtts/v2.0.2") 108 | download_file( 109 | "coqui/XTTS-v2", "vocab.json", "models/xtts/v2.0.2") 110 | download_file( 111 | "coqui/XTTS-v2", "speakers_xtts.pth", "models/xtts/v2.0.2") 112 | 113 | 114 | # print("Downloading xtts trained model files (Lasinya)") 115 | # download_file( 116 | # "KoljaB/XTTS_Lasinya", "config.json", "models/xtts/Lasinya") 117 | # download_file( 118 | # "KoljaB/XTTS_Lasinya", "vocab.json", "models/xtts/Lasinya") 119 | # download_file( 120 | # "KoljaB/XTTS_Lasinya", "speakers_xtts.pth", "models/xtts/Lasinya") 121 | # download_file( 122 | # "KoljaB/XTTS_Lasinya", "model.pth", "models/xtts/Lasinya") 123 | -------------------------------------------------------------------------------- /tts-cli/requirements.txt: -------------------------------------------------------------------------------- 1 | fairseq==0.12.2 2 | faiss-cpu==1.7.3 3 | tensorboardX==2.6.2.2 4 | torchcrepe==0.0.20 5 | torchfcpe==0.0.4 6 | praat-parselmouth==0.4.3 7 | pyworld==0.3.2 8 | huggingface_hub==0.24.5 9 | 10 | -------------------------------------------------------------------------------- /tts-cli/rvc/__pycache__/realtimervc.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/__pycache__/realtimervc.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/configs/__pycache__/config.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/configs/__pycache__/config.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/configs/config.json: -------------------------------------------------------------------------------- 1 | {"pth_path": "D:/lasinya1.pth", "index_path": "D:/added_IVF3778_Flat_nprobe_1_aerith_v2.index", "sg_input_device": "Mikrofon (2- ArctisX PnP Microp (MME)", "sg_output_device": "Lautsprecher (Realtek(R) Audio) (MME)", "sr_type": "sr_model", "threhold": -60.0, "pitch": 12.0, "rms_mix_rate": 0.5, "index_rate": 0.0, "block_time": 0.2, 
"crossfade_length": 0.08, "extra_time": 2.0, "n_cpu": 4.0, "use_jit": false, "use_pv": false, "f0method": "fcpe"} -------------------------------------------------------------------------------- /tts-cli/rvc/configs/v1/32k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": false, 11 | "lr_decay": 0.999875, 12 | "segment_size": 12800, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 32000, 21 | "filter_length": 1024, 22 | "hop_length": 320, 23 | "win_length": 1024, 24 | "n_mel_channels": 80, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,4,2,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16,16,4,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /tts-cli/rvc/configs/v1/40k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": false, 11 | "lr_decay": 0.999875, 12 | "segment_size": 12800, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 40000, 21 | "filter_length": 2048, 22 | "hop_length": 400, 23 | "win_length": 2048, 24 | "n_mel_channels": 125, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,10,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16,16,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /tts-cli/rvc/configs/v1/48k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": false, 11 | "lr_decay": 0.999875, 12 | "segment_size": 11520, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 48000, 21 | "filter_length": 2048, 22 | "hop_length": 480, 23 | "win_length": 2048, 24 | "n_mel_channels": 128, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | 
"hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,6,2,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16,16,4,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /tts-cli/rvc/configs/v2/32k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": false, 11 | "lr_decay": 0.999875, 12 | "segment_size": 12800, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 32000, 21 | "filter_length": 1024, 22 | "hop_length": 320, 23 | "win_length": 1024, 24 | "n_mel_channels": 80, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,8,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [20,16,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /tts-cli/rvc/configs/v2/48k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": false, 11 | "lr_decay": 0.999875, 12 | "segment_size": 17280, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 48000, 21 | "filter_length": 2048, 22 | "hop_length": 480, 23 | "win_length": 2048, 24 | "n_mel_channels": 128, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [12,10,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [24,20,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /tts-cli/rvc/i18n/i18n.py: -------------------------------------------------------------------------------- 1 | import json 2 | import locale 3 | import os 4 | 5 | 6 | def load_language_list(language): 7 | with open(f"./i18n/locale/{language}.json", "r", encoding="utf-8") as f: 8 | language_list = json.load(f) 9 | return language_list 10 | 11 | 12 | class I18nAuto: 13 | def 
__init__(self, language=None): 14 | if language in ["Auto", None]: 15 | language = locale.getdefaultlocale()[ 16 | 0 17 | ] # getlocale can't identify the system's language ((None, None)) 18 | if not os.path.exists(f"./i18n/locale/{language}.json"): 19 | language = "en_US" 20 | self.language = language 21 | self.language_map = load_language_list(language) 22 | 23 | def __call__(self, key): 24 | return self.language_map.get(key, key) 25 | 26 | def __repr__(self): 27 | return "Use Language: " + self.language 28 | -------------------------------------------------------------------------------- /tts-cli/rvc/i18n/locale_diff.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import OrderedDict 4 | 5 | # Define the standard file name 6 | standard_file = "locale/zh_CN.json" 7 | 8 | # Find all JSON files in the directory 9 | dir_path = "locale/" 10 | languages = [ 11 | os.path.join(dir_path, f) 12 | for f in os.listdir(dir_path) 13 | if f.endswith(".json") and f != standard_file 14 | ] 15 | 16 | # Load the standard file 17 | with open(standard_file, "r", encoding="utf-8") as f: 18 | standard_data = json.load(f, object_pairs_hook=OrderedDict) 19 | 20 | # Loop through each language file 21 | for lang_file in languages: 22 | # Load the language file 23 | with open(lang_file, "r", encoding="utf-8") as f: 24 | lang_data = json.load(f, object_pairs_hook=OrderedDict) 25 | 26 | # Find the difference between the language file and the standard file 27 | diff = set(standard_data.keys()) - set(lang_data.keys()) 28 | 29 | miss = set(lang_data.keys()) - set(standard_data.keys()) 30 | 31 | # Add any missing keys to the language file 32 | for key in diff: 33 | lang_data[key] = key 34 | 35 | # Del any extra keys to the language file 36 | for key in miss: 37 | del lang_data[key] 38 | 39 | # Sort the keys of the language file to match the order of the standard file 40 | lang_data = OrderedDict( 41 | sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0])) 42 | ) 43 | 44 | # Save the updated language file 45 | with open(lang_file, "w", encoding="utf-8") as f: 46 | json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True) 47 | f.write("\n") 48 | -------------------------------------------------------------------------------- /tts-cli/rvc/i18n/scan_i18n.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import glob 3 | import json 4 | from collections import OrderedDict 5 | 6 | 7 | def extract_i18n_strings(node): 8 | i18n_strings = [] 9 | 10 | if ( 11 | isinstance(node, ast.Call) 12 | and isinstance(node.func, ast.Name) 13 | and node.func.id == "i18n" 14 | ): 15 | for arg in node.args: 16 | if isinstance(arg, ast.Str): 17 | i18n_strings.append(arg.s) 18 | 19 | for child_node in ast.iter_child_nodes(node): 20 | i18n_strings.extend(extract_i18n_strings(child_node)) 21 | 22 | return i18n_strings 23 | 24 | 25 | # scan the directory for all .py files (recursively) 26 | # for each file, parse the code into an AST 27 | # for each AST, extract the i18n strings 28 | 29 | strings = [] 30 | for filename in glob.iglob("**/*.py", recursive=True): 31 | with open(filename, "r") as f: 32 | code = f.read() 33 | if "I18nAuto" in code: 34 | tree = ast.parse(code) 35 | i18n_strings = extract_i18n_strings(tree) 36 | print(filename, len(i18n_strings)) 37 | strings.extend(i18n_strings) 38 | code_keys = set(strings) 39 | """ 40 | n_i18n.py 41 | gui_v1.py 26 42 | app.py 16 
43 | infer-web.py 147 44 | scan_i18n.py 0 45 | i18n.py 0 46 | lib/train/process_ckpt.py 1 47 | """ 48 | print() 49 | print("Total unique:", len(code_keys)) 50 | 51 | 52 | standard_file = "i18n/locale/zh_CN.json" 53 | with open(standard_file, "r", encoding="utf-8") as f: 54 | standard_data = json.load(f, object_pairs_hook=OrderedDict) 55 | standard_keys = set(standard_data.keys()) 56 | 57 | # Define the standard file name 58 | unused_keys = standard_keys - code_keys 59 | print("Unused keys:", len(unused_keys)) 60 | for unused_key in unused_keys: 61 | print("\t", unused_key) 62 | 63 | missing_keys = code_keys - standard_keys 64 | print("Missing keys:", len(missing_keys)) 65 | for missing_key in missing_keys: 66 | print("\t", missing_key) 67 | 68 | code_keys_dict = OrderedDict() 69 | for s in strings: 70 | code_keys_dict[s] = s 71 | 72 | # write back 73 | with open(standard_file, "w", encoding="utf-8") as f: 74 | json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True) 75 | f.write("\n") 76 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/__pycache__/rmvpe.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/infer/lib/__pycache__/rmvpe.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/audio.py: -------------------------------------------------------------------------------- 1 | import os 2 | import traceback 3 | 4 | import librosa 5 | import numpy as np 6 | import av 7 | from io import BytesIO 8 | 9 | 10 | def wav2(i, o, format): 11 | inp = av.open(i, "rb") 12 | if format == "m4a": 13 | format = "mp4" 14 | out = av.open(o, "wb", format=format) 15 | if format == "ogg": 16 | format = "libvorbis" 17 | if format == "mp4": 18 | format = "aac" 19 | 20 | ostream = out.add_stream(format) 21 | 22 | for frame in inp.decode(audio=0): 23 | for p in ostream.encode(frame): 24 | out.mux(p) 25 | 26 | for p in ostream.encode(None): 27 | out.mux(p) 28 | 29 | out.close() 30 | inp.close() 31 | 32 | 33 | def audio2(i, o, format, sr): 34 | inp = av.open(i, "rb") 35 | out = av.open(o, "wb", format=format) 36 | if format == "ogg": 37 | format = "libvorbis" 38 | if format == "f32le": 39 | format = "pcm_f32le" 40 | 41 | ostream = out.add_stream(format, channels=1) 42 | ostream.sample_rate = sr 43 | 44 | for frame in inp.decode(audio=0): 45 | for p in ostream.encode(frame): 46 | out.mux(p) 47 | 48 | out.close() 49 | inp.close() 50 | 51 | 52 | def load_audio(file, sr): 53 | file = ( 54 | file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 55 | ) # 防止小白拷路径头尾带了空格和"和回车 56 | if os.path.exists(file) == False: 57 | raise RuntimeError( 58 | "You input a wrong audio path that does not exists, please fix it!" 
59 | ) 60 | try: 61 | with open(file, "rb") as f: 62 | with BytesIO() as out: 63 | audio2(f, out, "f32le", sr) 64 | return np.frombuffer(out.getvalue(), np.float32).flatten() 65 | 66 | except AttributeError: 67 | audio = file[1] / 32768.0 68 | if len(audio.shape) == 2: 69 | audio = np.mean(audio, -1) 70 | return librosa.resample(audio, orig_sr=file[0], target_sr=16000) 71 | 72 | except: 73 | raise RuntimeError(traceback.format_exc()) 74 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/infer_pack/__pycache__/attentions.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/infer/lib/infer_pack/__pycache__/attentions.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/infer_pack/__pycache__/commons.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/infer/lib/infer_pack/__pycache__/commons.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/infer_pack/__pycache__/models.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/infer/lib/infer_pack/__pycache__/models.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/infer_pack/__pycache__/modules.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/infer/lib/infer_pack/__pycache__/modules.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/infer_pack/__pycache__/transforms.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/infer/lib/infer_pack/__pycache__/transforms.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pyworld 3 | 4 | from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor 5 | 6 | 7 | class DioF0Predictor(F0Predictor): 8 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): 9 | self.hop_length = hop_length 10 | self.f0_min = f0_min 11 | self.f0_max = f0_max 12 | self.sampling_rate = sampling_rate 13 | 14 | def interpolate_f0(self, f0): 15 | """ 16 | 对F0进行插值处理 17 | """ 18 | 19 | data = np.reshape(f0, (f0.size, 1)) 20 | 21 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32) 22 | vuv_vector[data > 0.0] = 1.0 23 | vuv_vector[data <= 0.0] = 0.0 24 | 25 | ip_data = data 26 | 27 | frame_number = data.size 28 | last_value = 0.0 29 | for i in range(frame_number): 30 | if data[i] <= 0.0: 31 | j = i + 1 32 | for j in range(i + 1, frame_number): 33 | if data[j] > 0.0: 34 | break 35 | if j < 
frame_number - 1: 36 | if last_value > 0.0: 37 | step = (data[j] - data[i - 1]) / float(j - i) 38 | for k in range(i, j): 39 | ip_data[k] = data[i - 1] + step * (k - i + 1) 40 | else: 41 | for k in range(i, j): 42 | ip_data[k] = data[j] 43 | else: 44 | for k in range(i, frame_number): 45 | ip_data[k] = last_value 46 | else: 47 | ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝 48 | last_value = data[i] 49 | 50 | return ip_data[:, 0], vuv_vector[:, 0] 51 | 52 | def resize_f0(self, x, target_len): 53 | source = np.array(x) 54 | source[source < 0.001] = np.nan 55 | target = np.interp( 56 | np.arange(0, len(source) * target_len, len(source)) / target_len, 57 | np.arange(0, len(source)), 58 | source, 59 | ) 60 | res = np.nan_to_num(target) 61 | return res 62 | 63 | def compute_f0(self, wav, p_len=None): 64 | if p_len is None: 65 | p_len = wav.shape[0] // self.hop_length 66 | f0, t = pyworld.dio( 67 | wav.astype(np.double), 68 | fs=self.sampling_rate, 69 | f0_floor=self.f0_min, 70 | f0_ceil=self.f0_max, 71 | frame_period=1000 * self.hop_length / self.sampling_rate, 72 | ) 73 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 74 | for index, pitch in enumerate(f0): 75 | f0[index] = round(pitch, 1) 76 | return self.interpolate_f0(self.resize_f0(f0, p_len))[0] 77 | 78 | def compute_f0_uv(self, wav, p_len=None): 79 | if p_len is None: 80 | p_len = wav.shape[0] // self.hop_length 81 | f0, t = pyworld.dio( 82 | wav.astype(np.double), 83 | fs=self.sampling_rate, 84 | f0_floor=self.f0_min, 85 | f0_ceil=self.f0_max, 86 | frame_period=1000 * self.hop_length / self.sampling_rate, 87 | ) 88 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 89 | for index, pitch in enumerate(f0): 90 | f0[index] = round(pitch, 1) 91 | return self.interpolate_f0(self.resize_f0(f0, p_len)) 92 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/infer_pack/modules/F0Predictor/F0Predictor.py: -------------------------------------------------------------------------------- 1 | class F0Predictor(object): 2 | def compute_f0(self, wav, p_len): 3 | """ 4 | input: wav:[signal_length] 5 | p_len:int 6 | output: f0:[signal_length//hop_length] 7 | """ 8 | pass 9 | 10 | def compute_f0_uv(self, wav, p_len): 11 | """ 12 | input: wav:[signal_length] 13 | p_len:int 14 | output: f0:[signal_length//hop_length],uv:[signal_length//hop_length] 15 | """ 16 | pass 17 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pyworld 3 | 4 | from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor 5 | 6 | 7 | class HarvestF0Predictor(F0Predictor): 8 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): 9 | self.hop_length = hop_length 10 | self.f0_min = f0_min 11 | self.f0_max = f0_max 12 | self.sampling_rate = sampling_rate 13 | 14 | def interpolate_f0(self, f0): 15 | """ 16 | 对F0进行插值处理 17 | """ 18 | 19 | data = np.reshape(f0, (f0.size, 1)) 20 | 21 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32) 22 | vuv_vector[data > 0.0] = 1.0 23 | vuv_vector[data <= 0.0] = 0.0 24 | 25 | ip_data = data 26 | 27 | frame_number = data.size 28 | last_value = 0.0 29 | for i in range(frame_number): 30 | if data[i] <= 0.0: 31 | j = i + 1 32 | for j in range(i + 1, frame_number): 33 | if data[j] > 0.0: 34 | break 
35 | if j < frame_number - 1: 36 | if last_value > 0.0: 37 | step = (data[j] - data[i - 1]) / float(j - i) 38 | for k in range(i, j): 39 | ip_data[k] = data[i - 1] + step * (k - i + 1) 40 | else: 41 | for k in range(i, j): 42 | ip_data[k] = data[j] 43 | else: 44 | for k in range(i, frame_number): 45 | ip_data[k] = last_value 46 | else: 47 | ip_data[i] = data[i] # this copy may be unnecessary 48 | last_value = data[i] 49 | 50 | return ip_data[:, 0], vuv_vector[:, 0] 51 | 52 | def resize_f0(self, x, target_len): 53 | source = np.array(x) 54 | source[source < 0.001] = np.nan 55 | target = np.interp( 56 | np.arange(0, len(source) * target_len, len(source)) / target_len, 57 | np.arange(0, len(source)), 58 | source, 59 | ) 60 | res = np.nan_to_num(target) 61 | return res 62 | 63 | def compute_f0(self, wav, p_len=None): 64 | if p_len is None: 65 | p_len = wav.shape[0] // self.hop_length 66 | f0, t = pyworld.harvest( 67 | wav.astype(np.double), 68 | fs=self.sampling_rate, 69 | f0_ceil=self.f0_max, 70 | f0_floor=self.f0_min, 71 | frame_period=1000 * self.hop_length / self.sampling_rate, 72 | ) 73 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 74 | return self.interpolate_f0(self.resize_f0(f0, p_len))[0] 75 | 76 | def compute_f0_uv(self, wav, p_len=None): 77 | if p_len is None: 78 | p_len = wav.shape[0] // self.hop_length 79 | f0, t = pyworld.harvest( 80 | wav.astype(np.double), 81 | fs=self.sampling_rate, 82 | f0_floor=self.f0_min, 83 | f0_ceil=self.f0_max, 84 | frame_period=1000 * self.hop_length / self.sampling_rate, 85 | ) 86 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 87 | return self.interpolate_f0(self.resize_f0(f0, p_len)) 88 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import parselmouth 3 | 4 | from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor 5 | 6 | 7 | class PMF0Predictor(F0Predictor): 8 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): 9 | self.hop_length = hop_length 10 | self.f0_min = f0_min 11 | self.f0_max = f0_max 12 | self.sampling_rate = sampling_rate 13 | 14 | def interpolate_f0(self, f0): 15 | """ 16 | Interpolate the F0 over unvoiced frames 17 | """ 18 | 19 | data = np.reshape(f0, (f0.size, 1)) 20 | 21 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32) 22 | vuv_vector[data > 0.0] = 1.0 23 | vuv_vector[data <= 0.0] = 0.0 24 | 25 | ip_data = data 26 | 27 | frame_number = data.size 28 | last_value = 0.0 29 | for i in range(frame_number): 30 | if data[i] <= 0.0: 31 | j = i + 1 32 | for j in range(i + 1, frame_number): 33 | if data[j] > 0.0: 34 | break 35 | if j < frame_number - 1: 36 | if last_value > 0.0: 37 | step = (data[j] - data[i - 1]) / float(j - i) 38 | for k in range(i, j): 39 | ip_data[k] = data[i - 1] + step * (k - i + 1) 40 | else: 41 | for k in range(i, j): 42 | ip_data[k] = data[j] 43 | else: 44 | for k in range(i, frame_number): 45 | ip_data[k] = last_value 46 | else: 47 | ip_data[i] = data[i] # this copy may be unnecessary 48 | last_value = data[i] 49 | 50 | return ip_data[:, 0], vuv_vector[:, 0] 51 | 52 | def compute_f0(self, wav, p_len=None): 53 | x = wav 54 | if p_len is None: 55 | p_len = x.shape[0] // self.hop_length 56 | else: 57 | assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" 58 | time_step = self.hop_length / self.sampling_rate * 1000 59 | f0 = ( 60 | 
parselmouth.Sound(x, self.sampling_rate) 61 | .to_pitch_ac( 62 | time_step=time_step / 1000, 63 | voicing_threshold=0.6, 64 | pitch_floor=self.f0_min, 65 | pitch_ceiling=self.f0_max, 66 | ) 67 | .selected_array["frequency"] 68 | ) 69 | 70 | pad_size = (p_len - len(f0) + 1) // 2 71 | if pad_size > 0 or p_len - len(f0) - pad_size > 0: 72 | f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") 73 | f0, uv = self.interpolate_f0(f0) 74 | return f0 75 | 76 | def compute_f0_uv(self, wav, p_len=None): 77 | x = wav 78 | if p_len is None: 79 | p_len = x.shape[0] // self.hop_length 80 | else: 81 | assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" 82 | time_step = self.hop_length / self.sampling_rate * 1000 83 | f0 = ( 84 | parselmouth.Sound(x, self.sampling_rate) 85 | .to_pitch_ac( 86 | time_step=time_step / 1000, 87 | voicing_threshold=0.6, 88 | pitch_floor=self.f0_min, 89 | pitch_ceiling=self.f0_max, 90 | ) 91 | .selected_array["frequency"] 92 | ) 93 | 94 | pad_size = (p_len - len(f0) + 1) // 2 95 | if pad_size > 0 or p_len - len(f0) - pad_size > 0: 96 | f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") 97 | f0, uv = self.interpolate_f0(f0) 98 | return f0, uv 99 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/infer_pack/modules/F0Predictor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/infer/lib/infer_pack/modules/F0Predictor/__init__.py -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/jit/__init__.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | import pickle 3 | import time 4 | import torch 5 | from tqdm import tqdm 6 | from collections import OrderedDict 7 | 8 | 9 | def load_inputs(path, device, is_half=False): 10 | parm = torch.load(path, map_location=torch.device("cpu")) 11 | for key in parm.keys(): 12 | parm[key] = parm[key].to(device) 13 | if is_half and parm[key].dtype == torch.float32: 14 | parm[key] = parm[key].half() 15 | elif not is_half and parm[key].dtype == torch.float16: 16 | parm[key] = parm[key].float() 17 | return parm 18 | 19 | 20 | def benchmark( 21 | model, inputs_path, device=torch.device("cpu"), epoch=1000, is_half=False 22 | ): 23 | parm = load_inputs(inputs_path, device, is_half) 24 | total_ts = 0.0 25 | bar = tqdm(range(epoch)) 26 | for i in bar: 27 | start_time = time.perf_counter() 28 | o = model(**parm) 29 | total_ts += time.perf_counter() - start_time 30 | print(f"num_epoch: {epoch} | avg time(ms): {(total_ts*1000)/epoch}") 31 | 32 | 33 | def jit_warm_up(model, inputs_path, device=torch.device("cpu"), epoch=5, is_half=False): 34 | benchmark(model, inputs_path, device, epoch=epoch, is_half=is_half) 35 | 36 | 37 | def to_jit_model( 38 | model_path, 39 | model_type: str, 40 | mode: str = "trace", 41 | inputs_path: str = None, 42 | device=torch.device("cpu"), 43 | is_half=False, 44 | ): 45 | model = None 46 | if model_type.lower() == "synthesizer": 47 | from .get_synthesizer import get_synthesizer 48 | 49 | model, _ = get_synthesizer(model_path, device) 50 | model.forward = model.infer 51 | elif model_type.lower() == "rmvpe": 52 | from .get_rmvpe import get_rmvpe 53 | 54 | model = get_rmvpe(model_path, device) 55 | elif model_type.lower() == "hubert": 56 | from .get_hubert 
import get_hubert_model 57 | 58 | model = get_hubert_model(model_path, device) 59 | model.forward = model.infer 60 | else: 61 | raise ValueError(f"No model type named {model_type}") 62 | model = model.eval() 63 | model = model.half() if is_half else model.float() 64 | if mode == "trace": 65 | assert inputs_path is not None 66 | inputs = load_inputs(inputs_path, device, is_half) 67 | model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs) 68 | elif mode == "script": 69 | model_jit = torch.jit.script(model) 70 | model_jit.to(device) 71 | model_jit = model_jit.half() if is_half else model_jit.float() 72 | # model = model.half() if is_half else model.float() 73 | return (model, model_jit) 74 | 75 | 76 | def export( 77 | model: torch.nn.Module, 78 | mode: str = "trace", 79 | inputs: dict = None, 80 | device=torch.device("cpu"), 81 | is_half: bool = False, 82 | ) -> dict: 83 | model = model.half() if is_half else model.float() 84 | model.eval() 85 | if mode == "trace": 86 | assert inputs is not None 87 | model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs) 88 | elif mode == "script": 89 | model_jit = torch.jit.script(model) 90 | model_jit.to(device) 91 | model_jit = model_jit.half() if is_half else model_jit.float() 92 | buffer = BytesIO() 93 | # model_jit=model_jit.cpu() 94 | torch.jit.save(model_jit, buffer) 95 | del model_jit 96 | cpt = OrderedDict() 97 | cpt["model"] = buffer.getvalue() 98 | cpt["is_half"] = is_half 99 | return cpt 100 | 101 | 102 | def load(path: str): 103 | with open(path, "rb") as f: 104 | return pickle.load(f) 105 | 106 | 107 | def save(ckpt: dict, save_path: str): 108 | with open(save_path, "wb") as f: 109 | pickle.dump(ckpt, f) 110 | 111 | 112 | def rmvpe_jit_export( 113 | model_path: str, 114 | mode: str = "script", 115 | inputs_path: str = None, 116 | save_path: str = None, 117 | device=torch.device("cpu"), 118 | is_half=False, 119 | ): 120 | if not save_path: 121 | save_path = model_path.rstrip(".pth") 122 | save_path += ".half.jit" if is_half else ".jit" 123 | if "cuda" in str(device) and ":" not in str(device): 124 | device = torch.device("cuda:0") 125 | from .get_rmvpe import get_rmvpe 126 | 127 | model = get_rmvpe(model_path, device) 128 | inputs = None 129 | if mode == "trace": 130 | inputs = load_inputs(inputs_path, device, is_half) 131 | ckpt = export(model, mode, inputs, device, is_half) 132 | ckpt["device"] = str(device) 133 | save(ckpt, save_path) 134 | return ckpt 135 | 136 | 137 | def synthesizer_jit_export( 138 | model_path: str, 139 | mode: str = "script", 140 | inputs_path: str = None, 141 | save_path: str = None, 142 | device=torch.device("cpu"), 143 | is_half=False, 144 | ): 145 | if not save_path: 146 | save_path = model_path.rstrip(".pth") 147 | save_path += ".half.jit" if is_half else ".jit" 148 | if "cuda" in str(device) and ":" not in str(device): 149 | device = torch.device("cuda:0") 150 | from .get_synthesizer import get_synthesizer 151 | 152 | model, cpt = get_synthesizer(model_path, device) 153 | assert isinstance(cpt, dict) 154 | model.forward = model.infer 155 | inputs = None 156 | if mode == "trace": 157 | inputs = load_inputs(inputs_path, device, is_half) 158 | ckpt = export(model, mode, inputs, device, is_half) 159 | cpt.pop("weight") 160 | cpt["model"] = ckpt["model"] 161 | cpt["device"] = device 162 | save(cpt, save_path) 163 | return cpt 164 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/jit/__pycache__/__init__.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/infer/lib/jit/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/jit/__pycache__/get_synthesizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/infer/lib/jit/__pycache__/get_synthesizer.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/jit/get_rmvpe.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_rmvpe(model_path="rvc/assets/rmvpe/rmvpe.pt", device=torch.device("cpu")): 5 | from infer.lib.rmvpe import E2E 6 | 7 | model = E2E(4, 1, (2, 2)) 8 | ckpt = torch.load(model_path, map_location=device) 9 | model.load_state_dict(ckpt) 10 | model.eval() 11 | model = model.to(device) 12 | return model 13 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/jit/get_synthesizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_synthesizer(pth_path, device=torch.device("cpu")): 5 | from rvc.infer.lib.infer_pack.models import ( 6 | SynthesizerTrnMs256NSFsid, 7 | SynthesizerTrnMs256NSFsid_nono, 8 | SynthesizerTrnMs768NSFsid, 9 | SynthesizerTrnMs768NSFsid_nono, 10 | ) 11 | 12 | cpt = torch.load(pth_path, map_location=torch.device("cpu")) 13 | # tgt_sr = cpt["config"][-1] 14 | cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] 15 | if_f0 = cpt.get("f0", 1) 16 | version = cpt.get("version", "v1") 17 | if version == "v1": 18 | if if_f0 == 1: 19 | net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=False) 20 | else: 21 | net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) 22 | elif version == "v2": 23 | if if_f0 == 1: 24 | net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=False) 25 | else: 26 | net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) 27 | del net_g.enc_q 28 | # net_g.forward = net_g.infer 29 | # ckpt = {} 30 | # ckpt["config"] = cpt["config"] 31 | # ckpt["f0"] = if_f0 32 | # ckpt["version"] = version 33 | # ckpt["info"] = cpt.get("info", "0epoch") 34 | net_g.load_state_dict(cpt["weight"], strict=False) 35 | net_g = net_g.float() 36 | net_g.eval().to(device) 37 | net_g.remove_weight_norm() 38 | return net_g, cpt 39 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/train/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def feature_loss(fmap_r, fmap_g): 5 | loss = 0 6 | for dr, dg in zip(fmap_r, fmap_g): 7 | for rl, gl in zip(dr, dg): 8 | rl = rl.float().detach() 9 | gl = gl.float() 10 | loss += torch.mean(torch.abs(rl - gl)) 11 | 12 | return loss * 2 13 | 14 | 15 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 16 | loss = 0 17 | r_losses = [] 18 | g_losses = [] 19 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 20 | dr = dr.float() 21 | dg = dg.float() 22 | r_loss = torch.mean((1 - dr) ** 2) 23 | g_loss = torch.mean(dg**2) 24 | loss += r_loss + g_loss 25 | r_losses.append(r_loss.item()) 26 | g_losses.append(g_loss.item()) 
27 | 28 | return loss, r_losses, g_losses 29 | 30 | 31 | def generator_loss(disc_outputs): 32 | loss = 0 33 | gen_losses = [] 34 | for dg in disc_outputs: 35 | dg = dg.float() 36 | l = torch.mean((1 - dg) ** 2) 37 | gen_losses.append(l) 38 | loss += l 39 | 40 | return loss, gen_losses 41 | 42 | 43 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 44 | """ 45 | z_p, logs_q: [b, h, t_t] 46 | m_p, logs_p: [b, h, t_t] 47 | """ 48 | z_p = z_p.float() 49 | logs_q = logs_q.float() 50 | m_p = m_p.float() 51 | logs_p = logs_p.float() 52 | z_mask = z_mask.float() 53 | 54 | kl = logs_p - logs_q - 0.5 55 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 56 | kl = torch.sum(kl * z_mask) 57 | l = kl / torch.sum(z_mask) 58 | return l 59 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/train/mel_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data 3 | from librosa.filters import mel as librosa_mel_fn 4 | import logging 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | MAX_WAV_VALUE = 32768.0 9 | 10 | 11 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 12 | """ 13 | PARAMS 14 | ------ 15 | C: compression factor 16 | """ 17 | return torch.log(torch.clamp(x, min=clip_val) * C) 18 | 19 | 20 | def dynamic_range_decompression_torch(x, C=1): 21 | """ 22 | PARAMS 23 | ------ 24 | C: compression factor used to compress 25 | """ 26 | return torch.exp(x) / C 27 | 28 | 29 | def spectral_normalize_torch(magnitudes): 30 | return dynamic_range_compression_torch(magnitudes) 31 | 32 | 33 | def spectral_de_normalize_torch(magnitudes): 34 | return dynamic_range_decompression_torch(magnitudes) 35 | 36 | 37 | # Reusable banks 38 | mel_basis = {} 39 | hann_window = {} 40 | 41 | 42 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 43 | """Convert waveform into Linear-frequency Linear-amplitude spectrogram. 
44 | 45 | Args: 46 | y :: (B, T) - Audio waveforms 47 | n_fft 48 | sampling_rate 49 | hop_size 50 | win_size 51 | center 52 | Returns: 53 | :: (B, Freq, Frame) - Linear-frequency Linear-amplitude spectrogram 54 | """ 55 | 56 | # Window - Cache if needed 57 | global hann_window 58 | dtype_device = str(y.dtype) + "_" + str(y.device) 59 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 60 | if wnsize_dtype_device not in hann_window: 61 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 62 | dtype=y.dtype, device=y.device 63 | ) 64 | 65 | # Padding 66 | y = torch.nn.functional.pad( 67 | y.unsqueeze(1), 68 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 69 | mode="reflect", 70 | ) 71 | y = y.squeeze(1) 72 | 73 | # Complex Spectrogram :: (B, T) -> (B, Freq, Frame, RealComplex=2) 74 | spec = torch.stft( 75 | y, 76 | n_fft, 77 | hop_length=hop_size, 78 | win_length=win_size, 79 | window=hann_window[wnsize_dtype_device], 80 | center=center, 81 | pad_mode="reflect", 82 | normalized=False, 83 | onesided=True, 84 | return_complex=True, 85 | ) 86 | 87 | # Linear-frequency Linear-amplitude spectrogram :: (B, Freq, Frame, RealComplex=2) -> (B, Freq, Frame) 88 | spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6) 89 | return spec 90 | 91 | 92 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 93 | # MelBasis - Cache if needed 94 | global mel_basis 95 | dtype_device = str(spec.dtype) + "_" + str(spec.device) 96 | fmax_dtype_device = str(fmax) + "_" + dtype_device 97 | if fmax_dtype_device not in mel_basis: 98 | mel = librosa_mel_fn( 99 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax 100 | ) 101 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 102 | dtype=spec.dtype, device=spec.device 103 | ) 104 | 105 | # Mel-frequency Log-amplitude spectrogram :: (B, Freq=num_mels, Frame) 106 | melspec = torch.matmul(mel_basis[fmax_dtype_device], spec) 107 | melspec = spectral_normalize_torch(melspec) 108 | return melspec 109 | 110 | 111 | def mel_spectrogram_torch( 112 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False 113 | ): 114 | """Convert waveform into Mel-frequency Log-amplitude spectrogram. 115 | 116 | Args: 117 | y :: (B, T) - Waveforms 118 | Returns: 119 | melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram 120 | """ 121 | # Linear-frequency Linear-amplitude spectrogram :: (B, T) -> (B, Freq, Frame) 122 | spec = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center) 123 | 124 | # Mel-frequency Log-amplitude spectrogram :: (B, Freq, Frame) -> (B, Freq=num_mels, Frame) 125 | melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax) 126 | 127 | return melspec 128 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . 
import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py: -------------------------------------------------------------------------------- 1 | import torch 2 
| import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | 
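The `layers_*KB.py` variants above all expose the same small building blocks (Conv2DBNActiv, Encoder, Decoder, ASPPModule) that the `nets_*` files further below assemble into cascaded U-Nets. A minimal shape-check sketch of the Encoder/Decoder contract, assuming the import path below resolves and that `spec_utils.crop_center` leaves the skip tensor untouched when it already matches the upsampled size:

```python
# Sketch only; the import path and crop_center behaviour are assumptions.
import torch
from rvc.infer.lib.uvr5_pack.lib_v5 import layers_123821KB as layers

x = torch.randn(1, 2, 512, 256)            # (batch, channels, freq bins, frames)

enc = layers.Encoder(2, 32, ksize=3, stride=2, pad=1)
dec = layers.Decoder(32 + 32, 32, ksize=3, stride=1, pad=1)

h, skip = enc(x)        # conv1 keeps the input size (the skip), conv2 halves H and W
print(skip.shape)       # torch.Size([1, 32, 512, 256])
print(h.shape)          # torch.Size([1, 32, 256, 128])

y = dec(h, skip)        # upsample x2, crop skip to match, concat on channels, conv
print(y.shape)          # torch.Size([1, 32, 512, 256])
```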
-------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.conv6 = SeperableConv2DBNActiv( 104 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 105 | ) 106 | self.conv7 = SeperableConv2DBNActiv( 107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 108 | ) 109 | self.bottleneck = nn.Sequential( 110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 111 | ) 112 | 113 | def forward(self, x): 114 | _, _, h, w = x.size() 115 | feat1 = F.interpolate( 116 | 
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 117 | ) 118 | feat2 = self.conv2(x) 119 | feat3 = self.conv3(x) 120 | feat4 = self.conv4(x) 121 | feat5 = self.conv5(x) 122 | feat6 = self.conv6(x) 123 | feat7 = self.conv7(x) 124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) 125 | bottle = self.bottleneck(out) 126 | return bottle 127 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.conv6 = SeperableConv2DBNActiv( 104 | nin, nin, 3, 1, 
dilations[2], dilations[2], activ=activ 105 | ) 106 | self.conv7 = SeperableConv2DBNActiv( 107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 108 | ) 109 | self.bottleneck = nn.Sequential( 110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 111 | ) 112 | 113 | def forward(self, x): 114 | _, _, h, w = x.size() 115 | feat1 = F.interpolate( 116 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 117 | ) 118 | feat2 = self.conv2(x) 119 | feat3 = self.conv3(x) 120 | feat4 = self.conv4(x) 121 | feat5 = self.conv5(x) 122 | feat6 = self.conv6(x) 123 | feat7 = self.conv7(x) 124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) 125 | bottle = self.bottleneck(out) 126 | return bottle 127 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = 
SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.conv6 = SeperableConv2DBNActiv( 104 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 105 | ) 106 | self.conv7 = SeperableConv2DBNActiv( 107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 108 | ) 109 | self.bottleneck = nn.Sequential( 110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 111 | ) 112 | 113 | def forward(self, x): 114 | _, _, h, w = x.size() 115 | feat1 = F.interpolate( 116 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 117 | ) 118 | feat2 = self.conv2(x) 119 | feat3 = self.conv3(x) 120 | feat4 = self.conv4(x) 121 | feat5 = self.conv5(x) 122 | feat6 = self.conv6(x) 123 | feat7 = self.conv7(x) 124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) 125 | bottle = self.bottleneck(out) 126 | return bottle 127 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/layers_new.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class Encoder(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 31 | super(Encoder, self).__init__() 32 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ) 33 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) 34 | 35 | def __call__(self, x): 36 | h = self.conv1(x) 37 | h = self.conv2(h) 38 | 39 | return h 40 | 41 | 42 | class Decoder(nn.Module): 43 | def __init__( 44 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 45 | ): 46 | super(Decoder, self).__init__() 47 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 48 | # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) 49 | self.dropout = nn.Dropout2d(0.1) if dropout else None 50 | 51 | def __call__(self, x, skip=None): 52 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 53 | 54 | if skip is not None: 55 | skip = spec_utils.crop_center(skip, x) 56 | x = torch.cat([x, skip], dim=1) 57 | 58 | h = self.conv1(x) 59 | # h = self.conv2(h) 60 | 61 | if self.dropout is not None: 62 | h = self.dropout(h) 63 | 64 | return h 65 | 66 | 67 | class ASPPModule(nn.Module): 68 | def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False): 69 | super(ASPPModule, self).__init__() 70 | self.conv1 = nn.Sequential( 71 | nn.AdaptiveAvgPool2d((1, None)), 72 | Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ), 73 | ) 74 | self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ) 75 | self.conv3 = Conv2DBNActiv( 76 | nin, nout, 3, 1, dilations[0], dilations[0], 
activ=activ 77 | ) 78 | self.conv4 = Conv2DBNActiv( 79 | nin, nout, 3, 1, dilations[1], dilations[1], activ=activ 80 | ) 81 | self.conv5 = Conv2DBNActiv( 82 | nin, nout, 3, 1, dilations[2], dilations[2], activ=activ 83 | ) 84 | self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ) 85 | self.dropout = nn.Dropout2d(0.1) if dropout else None 86 | 87 | def forward(self, x): 88 | _, _, h, w = x.size() 89 | feat1 = F.interpolate( 90 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 91 | ) 92 | feat2 = self.conv2(x) 93 | feat3 = self.conv3(x) 94 | feat4 = self.conv4(x) 95 | feat5 = self.conv5(x) 96 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 97 | out = self.bottleneck(out) 98 | 99 | if self.dropout is not None: 100 | out = self.dropout(out) 101 | 102 | return out 103 | 104 | 105 | class LSTMModule(nn.Module): 106 | def __init__(self, nin_conv, nin_lstm, nout_lstm): 107 | super(LSTMModule, self).__init__() 108 | self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0) 109 | self.lstm = nn.LSTM( 110 | input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True 111 | ) 112 | self.dense = nn.Sequential( 113 | nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU() 114 | ) 115 | 116 | def forward(self, x): 117 | N, _, nbins, nframes = x.size() 118 | h = self.conv(x)[:, 0] # N, nbins, nframes 119 | h = h.permute(2, 0, 1) # nframes, N, nbins 120 | h, _ = self.lstm(h) 121 | h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins 122 | h = h.reshape(nframes, N, 1, nbins) 123 | h = h.permute(1, 2, 3, 0) 124 | 125 | return h 126 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/model_param_init.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pathlib 4 | 5 | default_param = {} 6 | default_param["bins"] = 768 7 | default_param["unstable_bins"] = 9 # training only 8 | default_param["reduction_bins"] = 762 # training only 9 | default_param["sr"] = 44100 10 | default_param["pre_filter_start"] = 757 11 | default_param["pre_filter_stop"] = 768 12 | default_param["band"] = {} 13 | 14 | 15 | default_param["band"][1] = { 16 | "sr": 11025, 17 | "hl": 128, 18 | "n_fft": 960, 19 | "crop_start": 0, 20 | "crop_stop": 245, 21 | "lpf_start": 61, # inference only 22 | "res_type": "polyphase", 23 | } 24 | 25 | default_param["band"][2] = { 26 | "sr": 44100, 27 | "hl": 512, 28 | "n_fft": 1536, 29 | "crop_start": 24, 30 | "crop_stop": 547, 31 | "hpf_start": 81, # inference only 32 | "res_type": "sinc_best", 33 | } 34 | 35 | 36 | def int_keys(d): 37 | r = {} 38 | for k, v in d: 39 | if k.isdigit(): 40 | k = int(k) 41 | r[k] = v 42 | return r 43 | 44 | 45 | class ModelParameters(object): 46 | def __init__(self, config_path=""): 47 | if ".pth" == pathlib.Path(config_path).suffix: 48 | import zipfile 49 | 50 | with zipfile.ZipFile(config_path, "r") as zip: 51 | self.param = json.loads( 52 | zip.read("param.json"), object_pairs_hook=int_keys 53 | ) 54 | elif ".json" == pathlib.Path(config_path).suffix: 55 | with open(config_path, "r") as f: 56 | self.param = json.loads(f.read(), object_pairs_hook=int_keys) 57 | else: 58 | self.param = default_param 59 | 60 | for k in [ 61 | "mid_side", 62 | "mid_side_b", 63 | "mid_side_b2", 64 | "stereo_w", 65 | "stereo_n", 66 | "reverse", 67 | ]: 68 | if not k in self.param: 69 | self.param[k] = False 70 | 
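`ModelParameters` above is how the JSON presets in `modelparams/` (listed next) are consumed: the file is parsed with `int_keys` so the band numbers become integers, and the stereo/mid-side flags are filled in as `False` when a preset omits them. A small usage sketch, with the import path and preset path as assumptions based on this repo layout:

```python
# Sketch only; the import path and file path below are assumptions.
from rvc.infer.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters

mp = ModelParameters("tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json")

print(mp.param["sr"])                        # sample rate of the full mix (44100)
for band, cfg in sorted(mp.param["band"].items()):
    # band keys are ints thanks to int_keys; each band has its own STFT settings
    print(band, cfg["sr"], cfg["n_fft"], cfg["hl"], cfg["res_type"])
print(mp.param["mid_side"])                  # missing flags default to False
```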
-------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 16000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 16000, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 32000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "kaiser_fast" 14 | } 15 | }, 16 | "sr": 32000, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 33075, 8 | "hl": 384, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 33075, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 1024, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 256, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 256, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 256, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 256, 18 | "pre_filter_stop": 256 19 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } 
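Reading these presets: each band entry fixes an STFT over one resampled copy of the input, and the crop and filter indices appear to be frequency-bin indices of that STFT. A rough arithmetic sketch for the 1band_sr44100_hl512 preset above (plain Python, not code from the repo):

```python
# Back-of-the-envelope numbers for the 1band_sr44100_hl512 preset above.
sr, n_fft, hl = 44100, 2048, 512

bins = n_fft // 2              # 1024 usable bins, matching "bins" / "crop_stop"
hz_per_bin = sr / n_fft        # ~21.5 Hz of spectrum per bin
frames_per_sec = sr / hl       # ~86.1 STFT frames per second of audio

print(bins, round(hz_per_bin, 1), round(frames_per_sec, 1))
```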
-------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 700, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 700 19 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 118, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 32000, 18 | "hl": 352, 19 | "n_fft": 1024, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 44, 23 | "hpf_stop": 23, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 32000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } 31 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 512, 3 | "unstable_bins": 7, 4 | "reduction_bins": 510, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 160, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 192, 12 | "lpf_start": 41, 13 | "lpf_stop": 139, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 44100, 18 | "hl": 640, 19 | "n_fft": 1024, 20 | "crop_start": 10, 21 | "crop_stop": 320, 22 | "hpf_start": 47, 23 | "hpf_stop": 15, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 44100, 28 | "pre_filter_start": 510, 29 | "pre_filter_stop": 512 30 | } 31 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 240, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 48000, 18 | "hl": 528, 19 | "n_fft": 1536, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 82, 23 | "hpf_stop": 22, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 48000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 5, 4 | "reduction_bins": 733, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 278, 12 | "lpf_start": 28, 13 | "lpf_stop": 140, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 22050, 18 | "hl": 256, 19 | "n_fft": 768, 
20 | "crop_start": 14, 21 | "crop_stop": 322, 22 | "hpf_start": 70, 23 | "hpf_stop": 14, 24 | "lpf_start": 283, 25 | "lpf_stop": 314, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 44100, 30 | "hl": 512, 31 | "n_fft": 768, 32 | "crop_start": 131, 33 | "crop_stop": 313, 34 | "hpf_start": 154, 35 | "hpf_stop": 141, 36 | "res_type": "sinc_medium" 37 | } 38 | }, 39 | "sr": 44100, 40 | "pre_filter_start": 757, 41 | "pre_filter_stop": 768 42 | } 43 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side": true, 3 | "bins": 768, 4 | "unstable_bins": 5, 5 | "reduction_bins": 733, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 768, 11 | "crop_start": 0, 12 | "crop_stop": 278, 13 | "lpf_start": 28, 14 | "lpf_stop": 140, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 256, 20 | "n_fft": 768, 21 | "crop_start": 14, 22 | "crop_stop": 322, 23 | "hpf_start": 70, 24 | "hpf_stop": 14, 25 | "lpf_start": 283, 26 | "lpf_stop": 314, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 512, 32 | "n_fft": 768, 33 | "crop_start": 131, 34 | "crop_stop": 313, 35 | "hpf_start": 154, 36 | "hpf_stop": 141, 37 | "res_type": "sinc_medium" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 757, 42 | "pre_filter_stop": 768 43 | } 44 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 640, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 187, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 768, 21 | "crop_start": 0, 22 | "crop_stop": 212, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 174, 26 | "lpf_stop": 209, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 640, 33 | "crop_start": 66, 34 | "crop_stop": 307, 35 | "hpf_start": 86, 36 | "hpf_stop": 72, 37 | "res_type": "kaiser_fast" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 639, 42 | "pre_filter_stop": 640 43 | } 44 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 668, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 1024, 10 | "crop_start": 0, 11 | "crop_stop": 186, 12 | "lpf_start": 37, 13 | "lpf_stop": 73, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 11025, 18 | "hl": 128, 19 | "n_fft": 512, 20 | "crop_start": 4, 21 | "crop_stop": 185, 22 | "hpf_start": 36, 23 | "hpf_stop": 18, 24 | "lpf_start": 93, 25 | "lpf_stop": 185, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 22050, 30 | "hl": 256, 31 | "n_fft": 512, 32 | "crop_start": 46, 33 | "crop_stop": 186, 34 | "hpf_start": 93, 35 | "hpf_stop": 46, 36 | "lpf_start": 164, 37 | "lpf_stop": 186, 38 | "res_type": "polyphase" 
39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 512, 43 | "n_fft": 768, 44 | "crop_start": 121, 45 | "crop_stop": 382, 46 | "hpf_start": 138, 47 | "hpf_stop": 123, 48 | "res_type": "sinc_medium" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 740, 53 | "pre_filter_stop": 768 54 | } 55 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "mid_side": true, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } 56 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 
24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json: -------------------------------------------------------------------------------- 1 | { 2 | "reverse": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json: -------------------------------------------------------------------------------- 1 | { 2 | "stereo_w": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | 
"hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "convert_channels": "stereo_n", 49 | "res_type": "kaiser_fast" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 668, 54 | "pre_filter_stop": 672 55 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 530, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- 
/tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 1280, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 2048, 11 | "crop_start": 0, 12 | "crop_stop": 374, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 1536, 21 | "crop_start": 0, 22 | "crop_stop": 424, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 348, 26 | "lpf_stop": 418, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 1280, 33 | "crop_start": 132, 34 | "crop_stop": 614, 35 | "hpf_start": 172, 36 | "hpf_stop": 144, 37 | "res_type": "polyphase" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 1280, 42 | "pre_filter_stop": 1280 43 | } -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/nets.py: -------------------------------------------------------------------------------- 1 | import layers 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | from . import spec_utils 7 | 8 | 9 | class BaseASPPNet(nn.Module): 10 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 11 | super(BaseASPPNet, self).__init__() 12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 16 | 17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 18 | 19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 23 | 24 | def __call__(self, x): 25 | h, e1 = self.enc1(x) 26 | h, e2 = self.enc2(h) 27 | h, e3 = self.enc3(h) 28 | h, e4 = self.enc4(h) 29 | 30 | h = self.aspp(h) 31 | 32 | h = self.dec4(h, e4) 33 | h = self.dec3(h, e3) 34 | h = self.dec2(h, e2) 35 | h = self.dec1(h, e1) 36 | 37 | return h 38 | 39 | 40 | class CascadedASPPNet(nn.Module): 41 | def __init__(self, n_fft): 42 | super(CascadedASPPNet, self).__init__() 43 | self.stg1_low_band_net = BaseASPPNet(2, 16) 44 | self.stg1_high_band_net = BaseASPPNet(2, 16) 45 | 46 | self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) 47 | self.stg2_full_band_net = BaseASPPNet(8, 16) 48 | 49 | self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 50 | self.stg3_full_band_net = BaseASPPNet(16, 32) 51 | 52 | self.out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) 54 | self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) 55 | 56 | self.max_bin = n_fft // 2 57 | self.output_bin = n_fft // 2 + 1 58 | 59 | self.offset = 128 60 | 61 | def forward(self, x, aggressiveness=None): 62 | mix = x.detach() 63 | x = x.clone() 64 | 65 | x = x[:, :, : self.max_bin] 66 | 67 | bandw = x.size()[2] // 2 68 | aux1 = torch.cat( 69 | [ 70 | self.stg1_low_band_net(x[:, :, :bandw]), 71 | self.stg1_high_band_net(x[:, :, bandw:]), 72 | ], 73 | dim=2, 74 | ) 75 | 76 | h = torch.cat([x, aux1], dim=1) 77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 78 | 79 | h = torch.cat([x, aux1, aux2], dim=1) 80 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 81 | 82 | mask = 
torch.sigmoid(self.out(h)) 83 | mask = F.pad( 84 | input=mask, 85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 86 | mode="replicate", 87 | ) 88 | 89 | if self.training: 90 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 91 | aux1 = F.pad( 92 | input=aux1, 93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 94 | mode="replicate", 95 | ) 96 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 97 | aux2 = F.pad( 98 | input=aux2, 99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 100 | mode="replicate", 101 | ) 102 | return mask * mix, aux1 * mix, aux2 * mix 103 | else: 104 | if aggressiveness: 105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 106 | mask[:, :, : aggressiveness["split_bin"]], 107 | 1 + aggressiveness["value"] / 3, 108 | ) 109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 110 | mask[:, :, aggressiveness["split_bin"] :], 111 | 1 + aggressiveness["value"], 112 | ) 113 | 114 | return mask * mix 115 | 116 | def predict(self, x_mag, aggressiveness=None): 117 | h = self.forward(x_mag, aggressiveness) 118 | 119 | if self.offset > 0: 120 | h = h[:, :, :, self.offset : -self.offset] 121 | assert h.size()[3] > 0 122 | 123 | return h 124 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_123821KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 32) 43 | self.stg1_high_band_net = BaseASPPNet(2, 32) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(16, 32) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(32, 64) 50 | 51 | self.out = nn.Conv2d(64, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = 
torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_123821KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 32) 43 | self.stg1_high_band_net = BaseASPPNet(2, 32) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(16, 32) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(32, 64) 50 | 51 | self.out = nn.Conv2d(64, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 
66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . 
import layers_33966KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16, 32)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 16) 43 | self.stg1_high_band_net = BaseASPPNet(2, 16) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(8, 16) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(16, 32) 50 | 51 | self.out = nn.Conv2d(32, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- 
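Note: the nets_*KB.py variants in this directory differ only in channel widths and in which layers_* module they import; they all expose the same CascadedASPPNet(n_fft) interface. forward() takes a two-channel magnitude spectrogram of shape (batch, 2, n_fft // 2 + 1, frames) and returns a masked mix, and predict() additionally trims model.offset frames from both ends of the time axis. The usage sketch below is a minimal illustration, not part of the repo: the package path is an assumption mirroring the import style used elsewhere in these files, and the dummy shapes and aggressiveness keys are taken from the forward() code above.

import torch

# assumed package path (repo root on sys.path), mirroring imports used elsewhere here
from infer.lib.uvr5_pack.lib_v5.nets_33966KB import CascadedASPPNet

n_fft = 2048
model = CascadedASPPNet(n_fft)
model.eval()

# dummy magnitude spectrogram: (batch, 2 channels, n_fft // 2 + 1 bins, frames);
# frames must exceed 2 * model.offset (= 256 here) or predict() hits its assert
x_mag = torch.rand(1, 2, n_fft // 2 + 1, 512)

with torch.no_grad():
    # optional aggressiveness dict sharpens the mask below/above split_bin
    pred = model.predict(x_mag, aggressiveness={"split_bin": 256, "value": 0.1})

print(pred.shape)  # torch.Size([1, 2, 1025, 256]) - time axis trimmed by the offset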
/tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | from . import layers_537238KB as layers 7 | 8 | 9 | class BaseASPPNet(nn.Module): 10 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 11 | super(BaseASPPNet, self).__init__() 12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 16 | 17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 18 | 19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 23 | 24 | def __call__(self, x): 25 | h, e1 = self.enc1(x) 26 | h, e2 = self.enc2(h) 27 | h, e3 = self.enc3(h) 28 | h, e4 = self.enc4(h) 29 | 30 | h = self.aspp(h) 31 | 32 | h = self.dec4(h, e4) 33 | h = self.dec3(h, e3) 34 | h = self.dec2(h, e2) 35 | h = self.dec1(h, e1) 36 | 37 | return h 38 | 39 | 40 | class CascadedASPPNet(nn.Module): 41 | def __init__(self, n_fft): 42 | super(CascadedASPPNet, self).__init__() 43 | self.stg1_low_band_net = BaseASPPNet(2, 64) 44 | self.stg1_high_band_net = BaseASPPNet(2, 64) 45 | 46 | self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 47 | self.stg2_full_band_net = BaseASPPNet(32, 64) 48 | 49 | self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) 50 | self.stg3_full_band_net = BaseASPPNet(64, 128) 51 | 52 | self.out = nn.Conv2d(128, 2, 1, bias=False) 53 | self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) 54 | self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) 55 | 56 | self.max_bin = n_fft // 2 57 | self.output_bin = n_fft // 2 + 1 58 | 59 | self.offset = 128 60 | 61 | def forward(self, x, aggressiveness=None): 62 | mix = x.detach() 63 | x = x.clone() 64 | 65 | x = x[:, :, : self.max_bin] 66 | 67 | bandw = x.size()[2] // 2 68 | aux1 = torch.cat( 69 | [ 70 | self.stg1_low_band_net(x[:, :, :bandw]), 71 | self.stg1_high_band_net(x[:, :, bandw:]), 72 | ], 73 | dim=2, 74 | ) 75 | 76 | h = torch.cat([x, aux1], dim=1) 77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 78 | 79 | h = torch.cat([x, aux1, aux2], dim=1) 80 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 81 | 82 | mask = torch.sigmoid(self.out(h)) 83 | mask = F.pad( 84 | input=mask, 85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 86 | mode="replicate", 87 | ) 88 | 89 | if self.training: 90 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 91 | aux1 = F.pad( 92 | input=aux1, 93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 94 | mode="replicate", 95 | ) 96 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 97 | aux2 = F.pad( 98 | input=aux2, 99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 100 | mode="replicate", 101 | ) 102 | return mask * mix, aux1 * mix, aux2 * mix 103 | else: 104 | if aggressiveness: 105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 106 | mask[:, :, : aggressiveness["split_bin"]], 107 | 1 + aggressiveness["value"] / 3, 108 | ) 109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 110 | mask[:, :, aggressiveness["split_bin"] :], 111 | 1 + aggressiveness["value"], 112 | ) 113 | 114 | return mask * mix 115 | 116 | def predict(self, x_mag, aggressiveness=None): 117 | h = self.forward(x_mag, 
aggressiveness) 118 | 119 | if self.offset > 0: 120 | h = h[:, :, :, self.offset : -self.offset] 121 | assert h.size()[3] > 0 122 | 123 | return h 124 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | from . import layers_537238KB as layers 7 | 8 | 9 | class BaseASPPNet(nn.Module): 10 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 11 | super(BaseASPPNet, self).__init__() 12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 16 | 17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 18 | 19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 23 | 24 | def __call__(self, x): 25 | h, e1 = self.enc1(x) 26 | h, e2 = self.enc2(h) 27 | h, e3 = self.enc3(h) 28 | h, e4 = self.enc4(h) 29 | 30 | h = self.aspp(h) 31 | 32 | h = self.dec4(h, e4) 33 | h = self.dec3(h, e3) 34 | h = self.dec2(h, e2) 35 | h = self.dec1(h, e1) 36 | 37 | return h 38 | 39 | 40 | class CascadedASPPNet(nn.Module): 41 | def __init__(self, n_fft): 42 | super(CascadedASPPNet, self).__init__() 43 | self.stg1_low_band_net = BaseASPPNet(2, 64) 44 | self.stg1_high_band_net = BaseASPPNet(2, 64) 45 | 46 | self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 47 | self.stg2_full_band_net = BaseASPPNet(32, 64) 48 | 49 | self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) 50 | self.stg3_full_band_net = BaseASPPNet(64, 128) 51 | 52 | self.out = nn.Conv2d(128, 2, 1, bias=False) 53 | self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) 54 | self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) 55 | 56 | self.max_bin = n_fft // 2 57 | self.output_bin = n_fft // 2 + 1 58 | 59 | self.offset = 128 60 | 61 | def forward(self, x, aggressiveness=None): 62 | mix = x.detach() 63 | x = x.clone() 64 | 65 | x = x[:, :, : self.max_bin] 66 | 67 | bandw = x.size()[2] // 2 68 | aux1 = torch.cat( 69 | [ 70 | self.stg1_low_band_net(x[:, :, :bandw]), 71 | self.stg1_high_band_net(x[:, :, bandw:]), 72 | ], 73 | dim=2, 74 | ) 75 | 76 | h = torch.cat([x, aux1], dim=1) 77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 78 | 79 | h = torch.cat([x, aux1, aux2], dim=1) 80 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 81 | 82 | mask = torch.sigmoid(self.out(h)) 83 | mask = F.pad( 84 | input=mask, 85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 86 | mode="replicate", 87 | ) 88 | 89 | if self.training: 90 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 91 | aux1 = F.pad( 92 | input=aux1, 93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 94 | mode="replicate", 95 | ) 96 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 97 | aux2 = F.pad( 98 | input=aux2, 99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 100 | mode="replicate", 101 | ) 102 | return mask * mix, aux1 * mix, aux2 * mix 103 | else: 104 | if aggressiveness: 105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 106 | mask[:, :, : aggressiveness["split_bin"]], 107 | 1 + aggressiveness["value"] / 3, 108 | ) 109 | mask[:, :, aggressiveness["split_bin"] :] = 
torch.pow( 110 | mask[:, :, aggressiveness["split_bin"] :], 111 | 1 + aggressiveness["value"], 112 | ) 113 | 114 | return mask * mix 115 | 116 | def predict(self, x_mag, aggressiveness=None): 117 | h = self.forward(x_mag, aggressiveness) 118 | 119 | if self.offset > 0: 120 | h = h[:, :, :, self.offset : -self.offset] 121 | assert h.size()[3] > 0 122 | 123 | return h 124 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_123821KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 32) 43 | self.stg1_high_band_net = BaseASPPNet(2, 32) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(16, 32) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(32, 64) 50 | 51 | self.out = nn.Conv2d(64, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, 
:, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/lib_v5/nets_new.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_new 6 | 7 | 8 | class BaseNet(nn.Module): 9 | def __init__( 10 | self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6)) 11 | ): 12 | super(BaseNet, self).__init__() 13 | self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1) 14 | self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1) 15 | self.enc3 = layers_new.Encoder(nout * 2, nout * 4, 3, 2, 1) 16 | self.enc4 = layers_new.Encoder(nout * 4, nout * 6, 3, 2, 1) 17 | self.enc5 = layers_new.Encoder(nout * 6, nout * 8, 3, 2, 1) 18 | 19 | self.aspp = layers_new.ASPPModule(nout * 8, nout * 8, dilations, dropout=True) 20 | 21 | self.dec4 = layers_new.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1) 22 | self.dec3 = layers_new.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1) 23 | self.dec2 = layers_new.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1) 24 | self.lstm_dec2 = layers_new.LSTMModule(nout * 2, nin_lstm, nout_lstm) 25 | self.dec1 = layers_new.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1) 26 | 27 | def __call__(self, x): 28 | e1 = self.enc1(x) 29 | e2 = self.enc2(e1) 30 | e3 = self.enc3(e2) 31 | e4 = self.enc4(e3) 32 | e5 = self.enc5(e4) 33 | 34 | h = self.aspp(e5) 35 | 36 | h = self.dec4(h, e4) 37 | h = self.dec3(h, e3) 38 | h = self.dec2(h, e2) 39 | h = torch.cat([h, self.lstm_dec2(h)], dim=1) 40 | h = self.dec1(h, e1) 41 | 42 | return h 43 | 44 | 45 | class CascadedNet(nn.Module): 46 | def __init__(self, n_fft, nout=32, nout_lstm=128): 47 | super(CascadedNet, self).__init__() 48 | 49 | self.max_bin = n_fft // 2 50 | self.output_bin = n_fft // 2 + 1 51 | self.nin_lstm = self.max_bin // 2 52 | self.offset = 64 53 | 54 | self.stg1_low_band_net = nn.Sequential( 55 | BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm), 56 | layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0), 57 | ) 58 | 59 | self.stg1_high_band_net = BaseNet( 60 | 2, nout // 4, self.nin_lstm // 2, nout_lstm // 2 61 | ) 62 | 63 | self.stg2_low_band_net = nn.Sequential( 64 | BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm), 65 | layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0), 66 | ) 67 | self.stg2_high_band_net = BaseNet( 68 | nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2 69 | ) 70 | 71 | self.stg3_full_band_net = BaseNet( 72 | 3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm 73 | ) 74 | 75 | self.out = nn.Conv2d(nout, 2, 1, bias=False) 76 | self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False) 77 | 78 | def forward(self, x): 79 | x = x[:, :, : self.max_bin] 80 | 81 | bandw = x.size()[2] // 2 82 | l1_in = x[:, :, :bandw] 83 | h1_in = x[:, :, bandw:] 84 | l1 = self.stg1_low_band_net(l1_in) 85 | h1 = self.stg1_high_band_net(h1_in) 86 | aux1 = torch.cat([l1, 
h1], dim=2) 87 | 88 | l2_in = torch.cat([l1_in, l1], dim=1) 89 | h2_in = torch.cat([h1_in, h1], dim=1) 90 | l2 = self.stg2_low_band_net(l2_in) 91 | h2 = self.stg2_high_band_net(h2_in) 92 | aux2 = torch.cat([l2, h2], dim=2) 93 | 94 | f3_in = torch.cat([x, aux1, aux2], dim=1) 95 | f3 = self.stg3_full_band_net(f3_in) 96 | 97 | mask = torch.sigmoid(self.out(f3)) 98 | mask = F.pad( 99 | input=mask, 100 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 101 | mode="replicate", 102 | ) 103 | 104 | if self.training: 105 | aux = torch.cat([aux1, aux2], dim=1) 106 | aux = torch.sigmoid(self.aux_out(aux)) 107 | aux = F.pad( 108 | input=aux, 109 | pad=(0, 0, 0, self.output_bin - aux.size()[2]), 110 | mode="replicate", 111 | ) 112 | return mask, aux 113 | else: 114 | return mask 115 | 116 | def predict_mask(self, x): 117 | mask = self.forward(x) 118 | 119 | if self.offset > 0: 120 | mask = mask[:, :, :, self.offset : -self.offset] 121 | assert mask.size()[3] > 0 122 | 123 | return mask 124 | 125 | def predict(self, x, aggressiveness=None): 126 | mask = self.forward(x) 127 | pred_mag = x * mask 128 | 129 | if self.offset > 0: 130 | pred_mag = pred_mag[:, :, :, self.offset : -self.offset] 131 | assert pred_mag.size()[3] > 0 132 | 133 | return pred_mag 134 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/lib/uvr5_pack/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import torch 5 | from tqdm import tqdm 6 | 7 | 8 | def load_data(file_name: str = "./infer/lib/uvr5_pack/name_params.json") -> dict: 9 | with open(file_name, "r") as f: 10 | data = json.load(f) 11 | 12 | return data 13 | 14 | 15 | def make_padding(width, cropsize, offset): 16 | left = offset 17 | roi_size = cropsize - left * 2 18 | if roi_size == 0: 19 | roi_size = cropsize 20 | right = roi_size - (width % roi_size) + left 21 | 22 | return left, right, roi_size 23 | 24 | 25 | def inference(X_spec, device, model, aggressiveness, data): 26 | """ 27 | data : dic configs 28 | """ 29 | 30 | def _execute( 31 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True 32 | ): 33 | model.eval() 34 | with torch.no_grad(): 35 | preds = [] 36 | 37 | iterations = [n_window] 38 | 39 | total_iterations = sum(iterations) 40 | for i in tqdm(range(n_window)): 41 | start = i * roi_size 42 | X_mag_window = X_mag_pad[ 43 | None, :, :, start : start + data["window_size"] 44 | ] 45 | X_mag_window = torch.from_numpy(X_mag_window) 46 | if is_half: 47 | X_mag_window = X_mag_window.half() 48 | X_mag_window = X_mag_window.to(device) 49 | 50 | pred = model.predict(X_mag_window, aggressiveness) 51 | 52 | pred = pred.detach().cpu().numpy() 53 | preds.append(pred[0]) 54 | 55 | pred = np.concatenate(preds, axis=2) 56 | return pred 57 | 58 | def preprocess(X_spec): 59 | X_mag = np.abs(X_spec) 60 | X_phase = np.angle(X_spec) 61 | 62 | return X_mag, X_phase 63 | 64 | X_mag, X_phase = preprocess(X_spec) 65 | 66 | coef = X_mag.max() 67 | X_mag_pre = X_mag / coef 68 | 69 | n_frame = X_mag_pre.shape[2] 70 | pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset) 71 | n_window = int(np.ceil(n_frame / roi_size)) 72 | 73 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 74 | 75 | if list(model.state_dict().values())[0].dtype == torch.float16: 76 | is_half = True 77 | else: 78 | is_half = False 79 | pred = _execute( 80 | X_mag_pad, roi_size, n_window, device, model, 
aggressiveness, is_half 81 | ) 82 | pred = pred[:, :, :n_frame] 83 | 84 | if data["tta"]: 85 | pad_l += roi_size // 2 86 | pad_r += roi_size // 2 87 | n_window += 1 88 | 89 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 90 | 91 | pred_tta = _execute( 92 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half 93 | ) 94 | pred_tta = pred_tta[:, :, roi_size // 2 :] 95 | pred_tta = pred_tta[:, :, :n_frame] 96 | 97 | return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase) 98 | else: 99 | return pred * coef, X_mag, np.exp(1.0j * X_phase) 100 | 101 | 102 | def _get_name_params(model_path, model_hash): 103 | data = load_data() 104 | flag = False 105 | ModelName = model_path 106 | for type in list(data): 107 | for model in list(data[type][0]): 108 | for i in range(len(data[type][0][model])): 109 | if str(data[type][0][model][i]["hash_name"]) == model_hash: 110 | flag = True 111 | elif str(data[type][0][model][i]["hash_name"]) in ModelName: 112 | flag = True 113 | 114 | if flag: 115 | model_params_auto = data[type][0][model][i]["model_params"] 116 | param_name_auto = data[type][0][model][i]["param_name"] 117 | if type == "equivalent": 118 | return param_name_auto, model_params_auto 119 | else: 120 | flag = False 121 | return param_name_auto, model_params_auto 122 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/modules/onnx/export.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM 4 | 5 | 6 | def export_onnx(ModelPath, ExportedPath): 7 | cpt = torch.load(ModelPath, map_location="cpu") 8 | cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] 9 | vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768 10 | 11 | test_phone = torch.rand(1, 200, vec_channels) # hidden unit 12 | test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用) 13 | test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹) 14 | test_pitchf = torch.rand(1, 200) # nsf基频 15 | test_ds = torch.LongTensor([0]) # 说话人ID 16 | test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子) 17 | 18 | device = "cpu" # 导出时设备(不影响使用模型) 19 | 20 | net_g = SynthesizerTrnMsNSFsidM( 21 | *cpt["config"], is_half=False, version=cpt.get("version", "v1") 22 | ) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16) 23 | net_g.load_state_dict(cpt["weight"], strict=False) 24 | input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"] 25 | output_names = [ 26 | "audio", 27 | ] 28 | # net_g.construct_spkmixmap(n_speaker) 多角色混合轨道导出 29 | torch.onnx.export( 30 | net_g, 31 | ( 32 | test_phone.to(device), 33 | test_phone_lengths.to(device), 34 | test_pitch.to(device), 35 | test_pitchf.to(device), 36 | test_ds.to(device), 37 | test_rnd.to(device), 38 | ), 39 | ExportedPath, 40 | dynamic_axes={ 41 | "phone": [1], 42 | "pitch": [1], 43 | "pitchf": [1], 44 | "rnd": [2], 45 | }, 46 | do_constant_folding=False, 47 | opset_version=13, 48 | verbose=False, 49 | input_names=input_names, 50 | output_names=output_names, 51 | ) 52 | return "Finished" 53 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/modules/train/extract/extract_f0_rmvpe.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | 5 | import parselmouth 6 | 7 | now_dir = os.getcwd() 8 | sys.path.append(now_dir) 
9 | import logging 10 | 11 | import numpy as np 12 | import pyworld 13 | 14 | from infer.lib.audio import load_audio 15 | 16 | logging.getLogger("numba").setLevel(logging.WARNING) 17 | 18 | n_part = int(sys.argv[1]) 19 | i_part = int(sys.argv[2]) 20 | i_gpu = sys.argv[3] 21 | os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu) 22 | exp_dir = sys.argv[4] 23 | is_half = sys.argv[5] 24 | f = open("%s/extract_f0_feature.log" % exp_dir, "a+") 25 | 26 | 27 | def printt(strr): 28 | print(strr) 29 | f.write("%s\n" % strr) 30 | f.flush() 31 | 32 | 33 | class FeatureInput(object): 34 | def __init__(self, samplerate=16000, hop_size=160): 35 | self.fs = samplerate 36 | self.hop = hop_size 37 | 38 | self.f0_bin = 256 39 | self.f0_max = 1100.0 40 | self.f0_min = 50.0 41 | self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) 42 | self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) 43 | 44 | def compute_f0(self, path, f0_method): 45 | x = load_audio(path, self.fs) 46 | # p_len = x.shape[0] // self.hop 47 | if f0_method == "rmvpe": 48 | if hasattr(self, "model_rmvpe") == False: 49 | from infer.lib.rmvpe import RMVPE 50 | 51 | print("Loading rmvpe model") 52 | self.model_rmvpe = RMVPE( 53 | "assets/rmvpe/rmvpe.pt", is_half=is_half, device="cuda" 54 | ) 55 | f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) 56 | return f0 57 | 58 | def coarse_f0(self, f0): 59 | f0_mel = 1127 * np.log(1 + f0 / 700) 60 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * ( 61 | self.f0_bin - 2 62 | ) / (self.f0_mel_max - self.f0_mel_min) + 1 63 | 64 | # use 0 or 1 65 | f0_mel[f0_mel <= 1] = 1 66 | f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1 67 | f0_coarse = np.rint(f0_mel).astype(int) 68 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( 69 | f0_coarse.max(), 70 | f0_coarse.min(), 71 | ) 72 | return f0_coarse 73 | 74 | def go(self, paths, f0_method): 75 | if len(paths) == 0: 76 | printt("no-f0-todo") 77 | else: 78 | printt("todo-f0-%s" % len(paths)) 79 | n = max(len(paths) // 5, 1) # 每个进程最多打印5条 80 | for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths): 81 | try: 82 | if idx % n == 0: 83 | printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path)) 84 | if ( 85 | os.path.exists(opt_path1 + ".npy") == True 86 | and os.path.exists(opt_path2 + ".npy") == True 87 | ): 88 | continue 89 | featur_pit = self.compute_f0(inp_path, f0_method) 90 | np.save( 91 | opt_path2, 92 | featur_pit, 93 | allow_pickle=False, 94 | ) # nsf 95 | coarse_pit = self.coarse_f0(featur_pit) 96 | np.save( 97 | opt_path1, 98 | coarse_pit, 99 | allow_pickle=False, 100 | ) # ori 101 | except: 102 | printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc())) 103 | 104 | 105 | if __name__ == "__main__": 106 | # exp_dir=r"E:\codes\py39\dataset\mi-test" 107 | # n_p=16 108 | # f = open("%s/log_extract_f0.log"%exp_dir, "w") 109 | printt(sys.argv) 110 | featureInput = FeatureInput() 111 | paths = [] 112 | inp_root = "%s/1_16k_wavs" % (exp_dir) 113 | opt_root1 = "%s/2a_f0" % (exp_dir) 114 | opt_root2 = "%s/2b-f0nsf" % (exp_dir) 115 | 116 | os.makedirs(opt_root1, exist_ok=True) 117 | os.makedirs(opt_root2, exist_ok=True) 118 | for name in sorted(list(os.listdir(inp_root))): 119 | inp_path = "%s/%s" % (inp_root, name) 120 | if "spec" in inp_path: 121 | continue 122 | opt_path1 = "%s/%s" % (opt_root1, name) 123 | opt_path2 = "%s/%s" % (opt_root2, name) 124 | paths.append([inp_path, opt_path1, opt_path2]) 125 | try: 126 | featureInput.go(paths[i_part::n_part], "rmvpe") 127 | except: 128 | printt("f0_all_fail-%s" % 
(traceback.format_exc())) 129 | # ps = [] 130 | # for i in range(n_p): 131 | # p = Process( 132 | # target=featureInput.go, 133 | # args=( 134 | # paths[i::n_p], 135 | # f0method, 136 | # ), 137 | # ) 138 | # ps.append(p) 139 | # p.start() 140 | # for i in range(n_p): 141 | # ps[i].join() 142 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/modules/train/extract/extract_f0_rmvpe_dml.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | 5 | import parselmouth 6 | 7 | now_dir = os.getcwd() 8 | sys.path.append(now_dir) 9 | import logging 10 | 11 | import numpy as np 12 | import pyworld 13 | 14 | from infer.lib.audio import load_audio 15 | 16 | logging.getLogger("numba").setLevel(logging.WARNING) 17 | 18 | exp_dir = sys.argv[1] 19 | import torch_directml 20 | 21 | device = torch_directml.device(torch_directml.default_device()) 22 | f = open("%s/extract_f0_feature.log" % exp_dir, "a+") 23 | 24 | 25 | def printt(strr): 26 | print(strr) 27 | f.write("%s\n" % strr) 28 | f.flush() 29 | 30 | 31 | class FeatureInput(object): 32 | def __init__(self, samplerate=16000, hop_size=160): 33 | self.fs = samplerate 34 | self.hop = hop_size 35 | 36 | self.f0_bin = 256 37 | self.f0_max = 1100.0 38 | self.f0_min = 50.0 39 | self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) 40 | self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) 41 | 42 | def compute_f0(self, path, f0_method): 43 | x = load_audio(path, self.fs) 44 | # p_len = x.shape[0] // self.hop 45 | if f0_method == "rmvpe": 46 | if hasattr(self, "model_rmvpe") == False: 47 | from infer.lib.rmvpe import RMVPE 48 | 49 | print("Loading rmvpe model") 50 | self.model_rmvpe = RMVPE( 51 | "assets/rmvpe/rmvpe.pt", is_half=False, device=device 52 | ) 53 | f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) 54 | return f0 55 | 56 | def coarse_f0(self, f0): 57 | f0_mel = 1127 * np.log(1 + f0 / 700) 58 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * ( 59 | self.f0_bin - 2 60 | ) / (self.f0_mel_max - self.f0_mel_min) + 1 61 | 62 | # use 0 or 1 63 | f0_mel[f0_mel <= 1] = 1 64 | f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1 65 | f0_coarse = np.rint(f0_mel).astype(int) 66 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( 67 | f0_coarse.max(), 68 | f0_coarse.min(), 69 | ) 70 | return f0_coarse 71 | 72 | def go(self, paths, f0_method): 73 | if len(paths) == 0: 74 | printt("no-f0-todo") 75 | else: 76 | printt("todo-f0-%s" % len(paths)) 77 | n = max(len(paths) // 5, 1) # 每个进程最多打印5条 78 | for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths): 79 | try: 80 | if idx % n == 0: 81 | printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path)) 82 | if ( 83 | os.path.exists(opt_path1 + ".npy") == True 84 | and os.path.exists(opt_path2 + ".npy") == True 85 | ): 86 | continue 87 | featur_pit = self.compute_f0(inp_path, f0_method) 88 | np.save( 89 | opt_path2, 90 | featur_pit, 91 | allow_pickle=False, 92 | ) # nsf 93 | coarse_pit = self.coarse_f0(featur_pit) 94 | np.save( 95 | opt_path1, 96 | coarse_pit, 97 | allow_pickle=False, 98 | ) # ori 99 | except: 100 | printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc())) 101 | 102 | 103 | if __name__ == "__main__": 104 | # exp_dir=r"E:\codes\py39\dataset\mi-test" 105 | # n_p=16 106 | # f = open("%s/log_extract_f0.log"%exp_dir, "w") 107 | printt(sys.argv) 108 | featureInput = FeatureInput() 109 | paths = [] 110 | inp_root = "%s/1_16k_wavs" % 
(exp_dir) 111 | opt_root1 = "%s/2a_f0" % (exp_dir) 112 | opt_root2 = "%s/2b-f0nsf" % (exp_dir) 113 | 114 | os.makedirs(opt_root1, exist_ok=True) 115 | os.makedirs(opt_root2, exist_ok=True) 116 | for name in sorted(list(os.listdir(inp_root))): 117 | inp_path = "%s/%s" % (inp_root, name) 118 | if "spec" in inp_path: 119 | continue 120 | opt_path1 = "%s/%s" % (opt_root1, name) 121 | opt_path2 = "%s/%s" % (opt_root2, name) 122 | paths.append([inp_path, opt_path1, opt_path2]) 123 | try: 124 | featureInput.go(paths, "rmvpe") 125 | except: 126 | printt("f0_all_fail-%s" % (traceback.format_exc())) 127 | # ps = [] 128 | # for i in range(n_p): 129 | # p = Process( 130 | # target=featureInput.go, 131 | # args=( 132 | # paths[i::n_p], 133 | # f0method, 134 | # ), 135 | # ) 136 | # ps.append(p) 137 | # p.start() 138 | # for i in range(n_p): 139 | # ps[i].join() 140 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/modules/train/extract_feature_print.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | 5 | os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" 6 | os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0" 7 | 8 | device = sys.argv[1] 9 | n_part = int(sys.argv[2]) 10 | i_part = int(sys.argv[3]) 11 | if len(sys.argv) == 6: 12 | exp_dir = sys.argv[4] 13 | version = sys.argv[5] 14 | else: 15 | i_gpu = sys.argv[4] 16 | exp_dir = sys.argv[5] 17 | os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu) 18 | version = sys.argv[6] 19 | import fairseq 20 | import numpy as np 21 | import soundfile as sf 22 | import torch 23 | import torch.nn.functional as F 24 | 25 | if "privateuseone" not in device: 26 | device = "cpu" 27 | if torch.cuda.is_available(): 28 | device = "cuda" 29 | elif torch.backends.mps.is_available(): 30 | device = "mps" 31 | else: 32 | import torch_directml 33 | 34 | device = torch_directml.device(torch_directml.default_device()) 35 | 36 | def forward_dml(ctx, x, scale): 37 | ctx.scale = scale 38 | res = x.clone().detach() 39 | return res 40 | 41 | fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml 42 | 43 | f = open("%s/extract_f0_feature.log" % exp_dir, "a+") 44 | 45 | 46 | def printt(strr): 47 | print(strr) 48 | f.write("%s\n" % strr) 49 | f.flush() 50 | 51 | 52 | printt(sys.argv) 53 | model_path = "assets/hubert/hubert_base.pt" 54 | 55 | printt(exp_dir) 56 | wavPath = "%s/1_16k_wavs" % exp_dir 57 | outPath = ( 58 | "%s/3_feature256" % exp_dir if version == "v1" else "%s/3_feature768" % exp_dir 59 | ) 60 | os.makedirs(outPath, exist_ok=True) 61 | 62 | 63 | # wave must be 16k, hop_size=320 64 | def readwave(wav_path, normalize=False): 65 | wav, sr = sf.read(wav_path) 66 | assert sr == 16000 67 | feats = torch.from_numpy(wav).float() 68 | if feats.dim() == 2: # double channels 69 | feats = feats.mean(-1) 70 | assert feats.dim() == 1, feats.dim() 71 | if normalize: 72 | with torch.no_grad(): 73 | feats = F.layer_norm(feats, feats.shape) 74 | feats = feats.view(1, -1) 75 | return feats 76 | 77 | 78 | # HuBERT model 79 | printt("load model(s) from {}".format(model_path)) 80 | # if hubert model is exist 81 | if os.access(model_path, os.F_OK) == False: 82 | printt( 83 | "Error: Extracting is shut down because %s does not exist, you may download it from https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main" 84 | % model_path 85 | ) 86 | exit(0) 87 | models, saved_cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task( 88 | 
[model_path], 89 | suffix="", 90 | ) 91 | model = models[0] 92 | model = model.to(device) 93 | printt("move model to %s" % device) 94 | if device not in ["mps", "cpu"]: 95 | model = model.half() 96 | model.eval() 97 | 98 | todo = sorted(list(os.listdir(wavPath)))[i_part::n_part] 99 | n = max(1, len(todo) // 10) # 最多打印十条 100 | if len(todo) == 0: 101 | printt("no-feature-todo") 102 | else: 103 | printt("all-feature-%s" % len(todo)) 104 | for idx, file in enumerate(todo): 105 | try: 106 | if file.endswith(".wav"): 107 | wav_path = "%s/%s" % (wavPath, file) 108 | out_path = "%s/%s" % (outPath, file.replace("wav", "npy")) 109 | 110 | if os.path.exists(out_path): 111 | continue 112 | 113 | feats = readwave(wav_path, normalize=saved_cfg.task.normalize) 114 | padding_mask = torch.BoolTensor(feats.shape).fill_(False) 115 | inputs = { 116 | "source": feats.half().to(device) 117 | if device not in ["mps", "cpu"] 118 | else feats.to(device), 119 | "padding_mask": padding_mask.to(device), 120 | "output_layer": 9 if version == "v1" else 12, # layer 9 121 | } 122 | with torch.no_grad(): 123 | logits = model.extract_features(**inputs) 124 | feats = ( 125 | model.final_proj(logits[0]) if version == "v1" else logits[0] 126 | ) 127 | 128 | feats = feats.squeeze(0).float().cpu().numpy() 129 | if np.isnan(feats).sum() == 0: 130 | np.save(out_path, feats, allow_pickle=False) 131 | else: 132 | printt("%s-contains nan" % file) 133 | if idx % n == 0: 134 | printt("now-%s,all-%s,%s,%s" % (len(todo), idx, file, feats.shape)) 135 | except: 136 | printt(traceback.format_exc()) 137 | printt("all-feature-done") 138 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/modules/train/preprocess.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import sys 4 | 5 | from scipy import signal 6 | 7 | now_dir = os.getcwd() 8 | sys.path.append(now_dir) 9 | print(sys.argv) 10 | inp_root = sys.argv[1] 11 | sr = int(sys.argv[2]) 12 | n_p = int(sys.argv[3]) 13 | exp_dir = sys.argv[4] 14 | noparallel = sys.argv[5] == "True" 15 | per = float(sys.argv[6]) 16 | import multiprocessing 17 | import os 18 | import traceback 19 | 20 | import librosa 21 | import numpy as np 22 | from scipy.io import wavfile 23 | 24 | from infer.lib.audio import load_audio 25 | from infer.lib.slicer2 import Slicer 26 | 27 | mutex = multiprocessing.Lock() 28 | f = open("%s/preprocess.log" % exp_dir, "a+") 29 | 30 | 31 | def println(strr): 32 | mutex.acquire() 33 | print(strr) 34 | f.write("%s\n" % strr) 35 | f.flush() 36 | mutex.release() 37 | 38 | 39 | class PreProcess: 40 | def __init__(self, sr, exp_dir, per=3.0): 41 | self.slicer = Slicer( 42 | sr=sr, 43 | threshold=-42, 44 | min_length=1500, 45 | min_interval=400, 46 | hop_size=15, 47 | max_sil_kept=500, 48 | ) 49 | self.sr = sr 50 | self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr) 51 | self.per = per 52 | self.overlap = 0.3 53 | self.tail = self.per + self.overlap 54 | self.max = 0.9 55 | self.alpha = 0.75 56 | self.exp_dir = exp_dir 57 | self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir 58 | self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir 59 | os.makedirs(self.exp_dir, exist_ok=True) 60 | os.makedirs(self.gt_wavs_dir, exist_ok=True) 61 | os.makedirs(self.wavs16k_dir, exist_ok=True) 62 | 63 | def norm_write(self, tmp_audio, idx0, idx1): 64 | tmp_max = np.abs(tmp_audio).max() 65 | if tmp_max > 2.5: 66 | print("%s-%s-%s-filtered" % (idx0, idx1, tmp_max)) 67 | return 68 
| tmp_audio = (tmp_audio / tmp_max * (self.max * self.alpha)) + ( 69 | 1 - self.alpha 70 | ) * tmp_audio 71 | wavfile.write( 72 | "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), 73 | self.sr, 74 | tmp_audio.astype(np.float32), 75 | ) 76 | tmp_audio = librosa.resample( 77 | tmp_audio, orig_sr=self.sr, target_sr=16000 78 | ) # , res_type="soxr_vhq" 79 | wavfile.write( 80 | "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), 81 | 16000, 82 | tmp_audio.astype(np.float32), 83 | ) 84 | 85 | def pipeline(self, path, idx0): 86 | try: 87 | audio = load_audio(path, self.sr) 88 | # zero phased digital filter cause pre-ringing noise... 89 | # audio = signal.filtfilt(self.bh, self.ah, audio) 90 | audio = signal.lfilter(self.bh, self.ah, audio) 91 | 92 | idx1 = 0 93 | for audio in self.slicer.slice(audio): 94 | i = 0 95 | while 1: 96 | start = int(self.sr * (self.per - self.overlap) * i) 97 | i += 1 98 | if len(audio[start:]) > self.tail * self.sr: 99 | tmp_audio = audio[start : start + int(self.per * self.sr)] 100 | self.norm_write(tmp_audio, idx0, idx1) 101 | idx1 += 1 102 | else: 103 | tmp_audio = audio[start:] 104 | idx1 += 1 105 | break 106 | self.norm_write(tmp_audio, idx0, idx1) 107 | println("%s->Suc." % path) 108 | except: 109 | println("%s->%s" % (path, traceback.format_exc())) 110 | 111 | def pipeline_mp(self, infos): 112 | for path, idx0 in infos: 113 | self.pipeline(path, idx0) 114 | 115 | def pipeline_mp_inp_dir(self, inp_root, n_p): 116 | try: 117 | infos = [ 118 | ("%s/%s" % (inp_root, name), idx) 119 | for idx, name in enumerate(sorted(list(os.listdir(inp_root)))) 120 | ] 121 | if noparallel: 122 | for i in range(n_p): 123 | self.pipeline_mp(infos[i::n_p]) 124 | else: 125 | ps = [] 126 | for i in range(n_p): 127 | p = multiprocessing.Process( 128 | target=self.pipeline_mp, args=(infos[i::n_p],) 129 | ) 130 | ps.append(p) 131 | p.start() 132 | for i in range(n_p): 133 | ps[i].join() 134 | except: 135 | println("Fail. 
%s" % traceback.format_exc()) 136 | 137 | 138 | def preprocess_trainset(inp_root, sr, n_p, exp_dir, per): 139 | pp = PreProcess(sr, exp_dir, per) 140 | println("start preprocess") 141 | println(sys.argv) 142 | pp.pipeline_mp_inp_dir(inp_root, n_p) 143 | println("end preprocess") 144 | 145 | 146 | if __name__ == "__main__": 147 | preprocess_trainset(inp_root, sr, n_p, exp_dir, per) 148 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/modules/uvr5/modules.py: -------------------------------------------------------------------------------- 1 | import os 2 | import traceback 3 | import logging 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | import ffmpeg 8 | import torch 9 | 10 | from configs.config import Config 11 | from infer.modules.uvr5.mdxnet import MDXNetDereverb 12 | from infer.modules.uvr5.vr import AudioPre, AudioPreDeEcho 13 | 14 | config = Config() 15 | 16 | 17 | def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0): 18 | infos = [] 19 | try: 20 | inp_root = inp_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 21 | save_root_vocal = ( 22 | save_root_vocal.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 23 | ) 24 | save_root_ins = ( 25 | save_root_ins.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 26 | ) 27 | if model_name == "onnx_dereverb_By_FoxJoy": 28 | pre_fun = MDXNetDereverb(15, config.device) 29 | else: 30 | func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho 31 | pre_fun = func( 32 | agg=int(agg), 33 | model_path=os.path.join( 34 | os.getenv("weight_uvr5_root"), model_name + ".pth" 35 | ), 36 | device=config.device, 37 | is_half=config.is_half, 38 | ) 39 | is_hp3 = "HP3" in model_name 40 | if inp_root != "": 41 | paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)] 42 | else: 43 | paths = [path.name for path in paths] 44 | for path in paths: 45 | inp_path = os.path.join(inp_root, path) 46 | need_reformat = 1 47 | done = 0 48 | try: 49 | info = ffmpeg.probe(inp_path, cmd="ffprobe") 50 | if ( 51 | info["streams"][0]["channels"] == 2 52 | and info["streams"][0]["sample_rate"] == "44100" 53 | ): 54 | need_reformat = 0 55 | pre_fun._path_audio_( 56 | inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3 57 | ) 58 | done = 1 59 | except: 60 | need_reformat = 1 61 | traceback.print_exc() 62 | if need_reformat == 1: 63 | tmp_path = "%s/%s.reformatted.wav" % ( 64 | os.path.join(os.environ["TEMP"]), 65 | os.path.basename(inp_path), 66 | ) 67 | os.system( 68 | "ffmpeg -i %s -vn -acodec pcm_s16le -ac 2 -ar 44100 %s -y" 69 | % (inp_path, tmp_path) 70 | ) 71 | inp_path = tmp_path 72 | try: 73 | if done == 0: 74 | pre_fun._path_audio_( 75 | inp_path, save_root_ins, save_root_vocal, format0 76 | ) 77 | infos.append("%s->Success" % (os.path.basename(inp_path))) 78 | yield "\n".join(infos) 79 | except: 80 | try: 81 | if done == 0: 82 | pre_fun._path_audio_( 83 | inp_path, save_root_ins, save_root_vocal, format0 84 | ) 85 | infos.append("%s->Success" % (os.path.basename(inp_path))) 86 | yield "\n".join(infos) 87 | except: 88 | infos.append( 89 | "%s->%s" % (os.path.basename(inp_path), traceback.format_exc()) 90 | ) 91 | yield "\n".join(infos) 92 | except: 93 | infos.append(traceback.format_exc()) 94 | yield "\n".join(infos) 95 | finally: 96 | try: 97 | if model_name == "onnx_dereverb_By_FoxJoy": 98 | del pre_fun.pred.model 99 | del pre_fun.pred.model_ 100 | else: 101 | del pre_fun.model 102 | del pre_fun 103 | except: 104 | 
traceback.print_exc() 105 | if torch.cuda.is_available(): 106 | torch.cuda.empty_cache() 107 | logger.info("Executed torch.cuda.empty_cache()") 108 | yield "\n".join(infos) 109 | -------------------------------------------------------------------------------- /tts-cli/rvc/infer/modules/vc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/infer/modules/vc/__init__.py -------------------------------------------------------------------------------- /tts-cli/rvc/infer/modules/vc/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from fairseq import checkpoint_utils 4 | 5 | 6 | def get_index_path_from_model(sid): 7 | return next( 8 | ( 9 | f 10 | for f in [ 11 | os.path.join(root, name) 12 | for root, _, files in os.walk(os.getenv("index_root"), topdown=False) 13 | for name in files 14 | if name.endswith(".index") and "trained" not in name 15 | ] 16 | if sid.split(".")[0] in f 17 | ), 18 | "", 19 | ) 20 | 21 | 22 | def load_hubert(config): 23 | models, _, _ = checkpoint_utils.load_model_ensemble_and_task( 24 | ["assets/hubert/hubert_base.pt"], 25 | suffix="", 26 | ) 27 | hubert_model = models[0] 28 | hubert_model = hubert_model.to(config.device) 29 | if config.is_half: 30 | hubert_model = hubert_model.half() 31 | else: 32 | hubert_model = hubert_model.float() 33 | return hubert_model.eval() 34 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/__pycache__/rvc_for_realtime.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/tools/__pycache__/rvc_for_realtime.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/tools/app.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | # os.system("wget -P cvec/ https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt") 5 | import gradio as gr 6 | from dotenv import load_dotenv 7 | 8 | from configs.config import Config 9 | from i18n.i18n import I18nAuto 10 | from infer.modules.vc.modules import VC 11 | 12 | logging.getLogger("numba").setLevel(logging.WARNING) 13 | logging.getLogger("markdown_it").setLevel(logging.WARNING) 14 | logging.getLogger("urllib3").setLevel(logging.WARNING) 15 | logging.getLogger("matplotlib").setLevel(logging.WARNING) 16 | logger = logging.getLogger(__name__) 17 | 18 | i18n = I18nAuto() 19 | logger.info(i18n) 20 | 21 | load_dotenv() 22 | config = Config() 23 | vc = VC(config) 24 | 25 | weight_root = os.getenv("weight_root") 26 | weight_uvr5_root = os.getenv("weight_uvr5_root") 27 | index_root = os.getenv("index_root") 28 | names = [] 29 | hubert_model = None 30 | for name in os.listdir(weight_root): 31 | if name.endswith(".pth"): 32 | names.append(name) 33 | index_paths = [] 34 | for root, dirs, files in os.walk(index_root, topdown=False): 35 | for name in files: 36 | if name.endswith(".index") and "trained" not in name: 37 | index_paths.append("%s/%s" % (root, name)) 38 | 39 | 40 | app = gr.Blocks() 41 | with app: 42 | with gr.Tabs(): 43 | with gr.TabItem("在线demo"): 44 | gr.Markdown( 45 | value=""" 46 | RVC 在线demo 47 | """ 48 | ) 49 | sid = gr.Dropdown(label=i18n("推理音色"), 
choices=sorted(names)) 50 | with gr.Column(): 51 | spk_item = gr.Slider( 52 | minimum=0, 53 | maximum=2333, 54 | step=1, 55 | label=i18n("请选择说话人id"), 56 | value=0, 57 | visible=False, 58 | interactive=True, 59 | ) 60 | sid.change(fn=vc.get_vc, inputs=[sid], outputs=[spk_item]) 61 | gr.Markdown( 62 | value=i18n("男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ") 63 | ) 64 | vc_input3 = gr.Audio(label="上传音频(长度小于90秒)") 65 | vc_transform0 = gr.Number(label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0) 66 | f0method0 = gr.Radio( 67 | label=i18n("选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU"), 68 | choices=["pm", "harvest", "crepe", "rmvpe"], 69 | value="pm", 70 | interactive=True, 71 | ) 72 | filter_radius0 = gr.Slider( 73 | minimum=0, 74 | maximum=7, 75 | label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"), 76 | value=3, 77 | step=1, 78 | interactive=True, 79 | ) 80 | with gr.Column(): 81 | file_index1 = gr.Textbox( 82 | label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"), 83 | value="", 84 | interactive=False, 85 | visible=False, 86 | ) 87 | file_index2 = gr.Dropdown( 88 | label=i18n("自动检测index路径,下拉式选择(dropdown)"), 89 | choices=sorted(index_paths), 90 | interactive=True, 91 | ) 92 | index_rate1 = gr.Slider( 93 | minimum=0, 94 | maximum=1, 95 | label=i18n("检索特征占比"), 96 | value=0.88, 97 | interactive=True, 98 | ) 99 | resample_sr0 = gr.Slider( 100 | minimum=0, 101 | maximum=48000, 102 | label=i18n("后处理重采样至最终采样率,0为不进行重采样"), 103 | value=0, 104 | step=1, 105 | interactive=True, 106 | ) 107 | rms_mix_rate0 = gr.Slider( 108 | minimum=0, 109 | maximum=1, 110 | label=i18n("输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"), 111 | value=1, 112 | interactive=True, 113 | ) 114 | protect0 = gr.Slider( 115 | minimum=0, 116 | maximum=0.5, 117 | label=i18n("保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"), 118 | value=0.33, 119 | step=0.01, 120 | interactive=True, 121 | ) 122 | f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调")) 123 | but0 = gr.Button(i18n("转换"), variant="primary") 124 | vc_output1 = gr.Textbox(label=i18n("输出信息")) 125 | vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)")) 126 | but0.click( 127 | vc.vc_single, 128 | [ 129 | spk_item, 130 | vc_input3, 131 | vc_transform0, 132 | f0_file, 133 | f0method0, 134 | file_index1, 135 | file_index2, 136 | # file_big_npy1, 137 | index_rate1, 138 | filter_radius0, 139 | resample_sr0, 140 | rms_mix_rate0, 141 | protect0, 142 | ], 143 | [vc_output1, vc_output2], 144 | ) 145 | 146 | 147 | app.launch() 148 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/calc_rvc_model_similarity.py: -------------------------------------------------------------------------------- 1 | # This code references https://huggingface.co/JosephusCheung/ASimilarityCalculatior/blob/main/qwerty.py 2 | # Fill in the path of the model to be queried and the root directory of the reference models, and this script will return the similarity between the model to be queried and all reference models. 
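# Usage note (not part of the original file; the example values mirror the __main__ block at the bottom):
#   set query_path to the .pth you want to inspect, e.g. r"assets\weights\mi v3.pth",
#   and reference_root to a folder of reference .pth files, e.g. r"assets\weights",
#   then run this file directly; the similarity to each reference model is logged as a percentage.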
3 | import os 4 | import logging 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | 12 | 13 | def cal_cross_attn(to_q, to_k, to_v, rand_input): 14 | hidden_dim, embed_dim = to_q.shape 15 | attn_to_q = nn.Linear(hidden_dim, embed_dim, bias=False) 16 | attn_to_k = nn.Linear(hidden_dim, embed_dim, bias=False) 17 | attn_to_v = nn.Linear(hidden_dim, embed_dim, bias=False) 18 | attn_to_q.load_state_dict({"weight": to_q}) 19 | attn_to_k.load_state_dict({"weight": to_k}) 20 | attn_to_v.load_state_dict({"weight": to_v}) 21 | 22 | return torch.einsum( 23 | "ik, jk -> ik", 24 | F.softmax( 25 | torch.einsum("ij, kj -> ik", attn_to_q(rand_input), attn_to_k(rand_input)), 26 | dim=-1, 27 | ), 28 | attn_to_v(rand_input), 29 | ) 30 | 31 | 32 | def model_hash(filename): 33 | try: 34 | with open(filename, "rb") as file: 35 | import hashlib 36 | 37 | m = hashlib.sha256() 38 | 39 | file.seek(0x100000) 40 | m.update(file.read(0x10000)) 41 | return m.hexdigest()[0:8] 42 | except FileNotFoundError: 43 | return "NOFILE" 44 | 45 | 46 | def eval(model, n, input): 47 | qk = f"enc_p.encoder.attn_layers.{n}.conv_q.weight" 48 | uk = f"enc_p.encoder.attn_layers.{n}.conv_k.weight" 49 | vk = f"enc_p.encoder.attn_layers.{n}.conv_v.weight" 50 | atoq, atok, atov = model[qk][:, :, 0], model[uk][:, :, 0], model[vk][:, :, 0] 51 | 52 | attn = cal_cross_attn(atoq, atok, atov, input) 53 | return attn 54 | 55 | 56 | def main(path, root): 57 | torch.manual_seed(114514) 58 | model_a = torch.load(path, map_location="cpu")["weight"] 59 | 60 | logger.info("Query:\t\t%s\t%s" % (path, model_hash(path))) 61 | 62 | map_attn_a = {} 63 | map_rand_input = {} 64 | for n in range(6): 65 | hidden_dim, embed_dim, _ = model_a[ 66 | f"enc_p.encoder.attn_layers.{n}.conv_v.weight" 67 | ].shape 68 | rand_input = torch.randn([embed_dim, hidden_dim]) 69 | 70 | map_attn_a[n] = eval(model_a, n, rand_input) 71 | map_rand_input[n] = rand_input 72 | 73 | del model_a 74 | 75 | for name in sorted(list(os.listdir(root))): 76 | path = "%s/%s" % (root, name) 77 | model_b = torch.load(path, map_location="cpu")["weight"] 78 | 79 | sims = [] 80 | for n in range(6): 81 | attn_a = map_attn_a[n] 82 | attn_b = eval(model_b, n, map_rand_input[n]) 83 | 84 | sim = torch.mean(torch.cosine_similarity(attn_a, attn_b)) 85 | sims.append(sim) 86 | 87 | logger.info( 88 | "Reference:\t%s\t%s\t%s" 89 | % (path, model_hash(path), f"{torch.mean(torch.stack(sims)) * 1e2:.2f}%") 90 | ) 91 | 92 | 93 | if __name__ == "__main__": 94 | query_path = r"assets\weights\mi v3.pth" 95 | reference_root = r"assets\weights" 96 | main(query_path, reference_root) 97 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/download_models - Kopie.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | import requests 4 | 5 | RVC_DOWNLOAD_LINK = "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/" 6 | 7 | BASE_DIR = Path(__file__).resolve().parent.parent 8 | 9 | 10 | def dl_model(link, model_name, dir_name): 11 | with requests.get(f"{link}{model_name}") as r: 12 | r.raise_for_status() 13 | os.makedirs(os.path.dirname(dir_name / model_name), exist_ok=True) 14 | with open(dir_name / model_name, "wb") as f: 15 | for chunk in r.iter_content(chunk_size=8192): 16 | f.write(chunk) 17 | 18 | 19 | if __name__ == "__main__": 20 | print("Downloading hubert_base.pt...") 21 | dl_model(RVC_DOWNLOAD_LINK, 
"hubert_base.pt", BASE_DIR / "assets/hubert") 22 | print("Downloading rmvpe.pt...") 23 | dl_model(RVC_DOWNLOAD_LINK, "rmvpe.pt", BASE_DIR / "assets/rmvpe") 24 | print("Downloading vocals.onnx...") 25 | dl_model( 26 | RVC_DOWNLOAD_LINK + "uvr5_weights/onnx_dereverb_By_FoxJoy/", 27 | "vocals.onnx", 28 | BASE_DIR / "assets/uvr5_weights/onnx_dereverb_By_FoxJoy", 29 | ) 30 | 31 | rvc_models_dir = BASE_DIR / "assets/pretrained" 32 | 33 | print("Downloading pretrained models:") 34 | 35 | model_names = [ 36 | "D32k.pth", 37 | "D40k.pth", 38 | "D48k.pth", 39 | "G32k.pth", 40 | "G40k.pth", 41 | "G48k.pth", 42 | "f0D32k.pth", 43 | "f0D40k.pth", 44 | "f0D48k.pth", 45 | "f0G32k.pth", 46 | "f0G40k.pth", 47 | "f0G48k.pth", 48 | ] 49 | for model in model_names: 50 | print(f"Downloading {model}...") 51 | dl_model(RVC_DOWNLOAD_LINK + "pretrained/", model, rvc_models_dir) 52 | 53 | rvc_models_dir = BASE_DIR / "assets/pretrained_v2" 54 | 55 | print("Downloading pretrained models v2:") 56 | 57 | for model in model_names: 58 | print(f"Downloading {model}...") 59 | dl_model(RVC_DOWNLOAD_LINK + "pretrained_v2/", model, rvc_models_dir) 60 | 61 | print("Downloading uvr5_weights:") 62 | 63 | rvc_models_dir = BASE_DIR / "assets/uvr5_weights" 64 | 65 | model_names = [ 66 | "HP2-%E4%BA%BA%E5%A3%B0vocals%2B%E9%9D%9E%E4%BA%BA%E5%A3%B0instrumentals.pth", 67 | "HP2_all_vocals.pth", 68 | "HP3_all_vocals.pth", 69 | "HP5-%E4%B8%BB%E6%97%8B%E5%BE%8B%E4%BA%BA%E5%A3%B0vocals%2B%E5%85%B6%E4%BB%96instrumentals.pth", 70 | "HP5_only_main_vocal.pth", 71 | "VR-DeEchoAggressive.pth", 72 | "VR-DeEchoDeReverb.pth", 73 | "VR-DeEchoNormal.pth", 74 | ] 75 | for model in model_names: 76 | print(f"Downloading {model}...") 77 | dl_model(RVC_DOWNLOAD_LINK + "uvr5_weights/", model, rvc_models_dir) 78 | 79 | print("All models downloaded!") 80 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/download_models.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | import requests 4 | 5 | RVC_DOWNLOAD_LINK = "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/" 6 | 7 | BASE_DIR = Path(__file__).resolve().parent.parent 8 | 9 | 10 | def dl_model(link, model_name, dir_name): 11 | with requests.get(f"{link}{model_name}") as r: 12 | r.raise_for_status() 13 | os.makedirs(os.path.dirname(dir_name / model_name), exist_ok=True) 14 | with open(dir_name / model_name, "wb") as f: 15 | for chunk in r.iter_content(chunk_size=8192): 16 | f.write(chunk) 17 | 18 | 19 | if __name__ == "__main__": 20 | print("Downloading hubert_base.pt...") 21 | # dl_model(RVC_DOWNLOAD_LINK, "hubert_base.pt", BASE_DIR / "assets/hubert") 22 | print("Downloading rmvpe.pt...") 23 | # dl_model(RVC_DOWNLOAD_LINK, "rmvpe.pt", BASE_DIR / "assets/rmvpe") 24 | print("Downloading vocals.onnx...") 25 | # dl_model( 26 | # RVC_DOWNLOAD_LINK + "uvr5_weights/onnx_dereverb_By_FoxJoy/", 27 | # "vocals.onnx", 28 | # BASE_DIR / "assets/uvr5_weights/onnx_dereverb_By_FoxJoy", 29 | # ) 30 | 31 | rvc_models_dir = BASE_DIR / "assets/pretrained" 32 | 33 | print("Downloading pretrained models:") 34 | 35 | model_names = [ 36 | # "D32k.pth", 37 | # "D40k.pth", 38 | # "D48k.pth", 39 | "G32k.pth", 40 | "G40k.pth", 41 | "G48k.pth", 42 | "f0D32k.pth", 43 | "f0D40k.pth", 44 | "f0D48k.pth", 45 | "f0G32k.pth", 46 | "f0G40k.pth", 47 | "f0G48k.pth", 48 | ] 49 | for model in model_names: 50 | print(f"Downloading {model}...") 51 | dl_model(RVC_DOWNLOAD_LINK + "pretrained/", model, 
rvc_models_dir) 52 | 53 | rvc_models_dir = BASE_DIR / "assets/pretrained_v2" 54 | 55 | print("Downloading pretrained models v2:") 56 | 57 | for model in model_names: 58 | print(f"Downloading {model}...") 59 | dl_model(RVC_DOWNLOAD_LINK + "pretrained_v2/", model, rvc_models_dir) 60 | 61 | print("Downloading uvr5_weights:") 62 | 63 | rvc_models_dir = BASE_DIR / "assets/uvr5_weights" 64 | 65 | model_names = [ 66 | "HP2-%E4%BA%BA%E5%A3%B0vocals%2B%E9%9D%9E%E4%BA%BA%E5%A3%B0instrumentals.pth", 67 | "HP2_all_vocals.pth", 68 | "HP3_all_vocals.pth", 69 | "HP5-%E4%B8%BB%E6%97%8B%E5%BE%8B%E4%BA%BA%E5%A3%B0vocals%2B%E5%85%B6%E4%BB%96instrumentals.pth", 70 | "HP5_only_main_vocal.pth", 71 | "VR-DeEchoAggressive.pth", 72 | "VR-DeEchoDeReverb.pth", 73 | "VR-DeEchoNormal.pth", 74 | ] 75 | for model in model_names: 76 | print(f"Downloading {model}...") 77 | dl_model(RVC_DOWNLOAD_LINK + "uvr5_weights/", model, rvc_models_dir) 78 | 79 | print("All models downloaded!") 80 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/export_onnx.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM 3 | 4 | if __name__ == "__main__": 5 | MoeVS = True # 模型是否为MoeVoiceStudio(原MoeSS)使用 6 | 7 | ModelPath = "Shiroha/shiroha.pth" # 模型路径 8 | ExportedPath = "model.onnx" # 输出路径 9 | hidden_channels = 256 # hidden_channels,为768Vec做准备 10 | cpt = torch.load(ModelPath, map_location="cpu") 11 | cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk 12 | print(*cpt["config"]) 13 | 14 | test_phone = torch.rand(1, 200, hidden_channels) # hidden unit 15 | test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用) 16 | test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹) 17 | test_pitchf = torch.rand(1, 200) # nsf基频 18 | test_ds = torch.LongTensor([0]) # 说话人ID 19 | test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子) 20 | 21 | device = "cpu" # 导出时设备(不影响使用模型) 22 | 23 | net_g = SynthesizerTrnMsNSFsidM( 24 | *cpt["config"], is_half=False 25 | ) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16) 26 | net_g.load_state_dict(cpt["weight"], strict=False) 27 | input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"] 28 | output_names = [ 29 | "audio", 30 | ] 31 | # net_g.construct_spkmixmap(n_speaker) 多角色混合轨道导出 32 | torch.onnx.export( 33 | net_g, 34 | ( 35 | test_phone.to(device), 36 | test_phone_lengths.to(device), 37 | test_pitch.to(device), 38 | test_pitchf.to(device), 39 | test_ds.to(device), 40 | test_rnd.to(device), 41 | ), 42 | ExportedPath, 43 | dynamic_axes={ 44 | "phone": [1], 45 | "pitch": [1], 46 | "pitchf": [1], 47 | "rnd": [2], 48 | }, 49 | do_constant_folding=False, 50 | opset_version=16, 51 | verbose=False, 52 | input_names=input_names, 53 | output_names=output_names, 54 | ) 55 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/infer/train-index-v2.py: -------------------------------------------------------------------------------- 1 | """ 2 | 格式:直接cid为自带的index位;aid放不下了,通过字典来查,反正就5w个 3 | """ 4 | import os 5 | import traceback 6 | import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | from multiprocessing import cpu_count 11 | 12 | import faiss 13 | import numpy as np 14 | from sklearn.cluster import MiniBatchKMeans 15 | 16 | # ###########如果是原始特征要先写save 17 | n_cpu = 0 18 | if n_cpu == 0: 19 | n_cpu = cpu_count() 20 | inp_root = r"./logs/anz/3_feature768" 21 | 
npys = [] 22 | listdir_res = list(os.listdir(inp_root)) 23 | for name in sorted(listdir_res): 24 | phone = np.load("%s/%s" % (inp_root, name)) 25 | npys.append(phone) 26 | big_npy = np.concatenate(npys, 0) 27 | big_npy_idx = np.arange(big_npy.shape[0]) 28 | np.random.shuffle(big_npy_idx) 29 | big_npy = big_npy[big_npy_idx] 30 | logger.debug(big_npy.shape) # (6196072, 192)#fp32#4.43G 31 | if big_npy.shape[0] > 2e5: 32 | # if(1): 33 | info = "Trying doing kmeans %s shape to 10k centers." % big_npy.shape[0] 34 | logger.info(info) 35 | try: 36 | big_npy = ( 37 | MiniBatchKMeans( 38 | n_clusters=10000, 39 | verbose=True, 40 | batch_size=256 * n_cpu, 41 | compute_labels=False, 42 | init="random", 43 | ) 44 | .fit(big_npy) 45 | .cluster_centers_ 46 | ) 47 | except: 48 | info = traceback.format_exc() 49 | logger.warning(info) 50 | 51 | np.save("tools/infer/big_src_feature_mi.npy", big_npy) 52 | 53 | ##################train+add 54 | # big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy") 55 | n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39) 56 | index = faiss.index_factory(768, "IVF%s,Flat" % n_ivf) # mi 57 | logger.info("Training...") 58 | index_ivf = faiss.extract_index_ivf(index) # 59 | index_ivf.nprobe = 1 60 | index.train(big_npy) 61 | faiss.write_index( 62 | index, "tools/infer/trained_IVF%s_Flat_baseline_src_feat_v2.index" % (n_ivf) 63 | ) 64 | logger.info("Adding...") 65 | batch_size_add = 8192 66 | for i in range(0, big_npy.shape[0], batch_size_add): 67 | index.add(big_npy[i : i + batch_size_add]) 68 | faiss.write_index( 69 | index, "tools/infer/added_IVF%s_Flat_mi_baseline_src_feat.index" % (n_ivf) 70 | ) 71 | """ 72 | 大小(都是FP32) 73 | big_src_feature 2.95G 74 | (3098036, 256) 75 | big_emb 4.43G 76 | (6196072, 192) 77 | big_emb双倍是因为求特征要repeat后再加pitch 78 | 79 | """ 80 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/infer/train-index.py: -------------------------------------------------------------------------------- 1 | """ 2 | 格式:直接cid为自带的index位;aid放不下了,通过字典来查,反正就5w个 3 | """ 4 | import os 5 | import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | import faiss 10 | import numpy as np 11 | 12 | # ###########如果是原始特征要先写save 13 | inp_root = r"E:\codes\py39\dataset\mi\2-co256" 14 | npys = [] 15 | for name in sorted(list(os.listdir(inp_root))): 16 | phone = np.load("%s/%s" % (inp_root, name)) 17 | npys.append(phone) 18 | big_npy = np.concatenate(npys, 0) 19 | logger.debug(big_npy.shape) # (6196072, 192)#fp32#4.43G 20 | np.save("infer/big_src_feature_mi.npy", big_npy) 21 | 22 | ##################train+add 23 | # big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy") 24 | logger.debug(big_npy.shape) 25 | index = faiss.index_factory(256, "IVF512,Flat") # mi 26 | logger.info("Training...") 27 | index_ivf = faiss.extract_index_ivf(index) # 28 | index_ivf.nprobe = 9 29 | index.train(big_npy) 30 | faiss.write_index(index, "infer/trained_IVF512_Flat_mi_baseline_src_feat.index") 31 | logger.info("Adding...") 32 | index.add(big_npy) 33 | faiss.write_index(index, "infer/added_IVF512_Flat_mi_baseline_src_feat.index") 34 | """ 35 | 大小(都是FP32) 36 | big_src_feature 2.95G 37 | (3098036, 256) 38 | big_emb 4.43G 39 | (6196072, 192) 40 | big_emb双倍是因为求特征要repeat后再加pitch 41 | 42 | """ 43 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/infer/trans_weights.py: 
-------------------------------------------------------------------------------- 1 | import pdb 2 | 3 | import torch 4 | 5 | # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-suc\G_1000.pth")["model"]#sim_nsf# 6 | # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder-flow-enc_q\G_1000.pth")["model"]#sim_nsf# 7 | # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder\G_1000.pth")["model"]#sim_nsf# 8 | # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-test\G_1000.pth")["model"]#sim_nsf# 9 | a = torch.load( 10 | r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-no_opt-no_dropout\G_1000.pth" 11 | )[ 12 | "model" 13 | ] # sim_nsf# 14 | for key in a.keys(): 15 | a[key] = a[key].half() 16 | # torch.save(a,"ft-mi-freeze-vocoder_true_1k.pt")# 17 | # torch.save(a,"ft-mi-sim1k.pt")# 18 | torch.save(a, "ft-mi-no_opt-no_dropout.pt") # 19 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/infer_batch_rvc.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | 5 | print("Command-line arguments:", sys.argv) 6 | 7 | now_dir = os.getcwd() 8 | sys.path.append(now_dir) 9 | import sys 10 | 11 | import tqdm as tq 12 | from dotenv import load_dotenv 13 | from scipy.io import wavfile 14 | 15 | from configs.config import Config 16 | from infer.modules.vc.modules import VC 17 | 18 | 19 | def arg_parse() -> tuple: 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("--f0up_key", type=int, default=0) 22 | parser.add_argument("--input_path", type=str, help="input path") 23 | parser.add_argument("--index_path", type=str, help="index path") 24 | parser.add_argument("--f0method", type=str, default="harvest", help="harvest or pm") 25 | parser.add_argument("--opt_path", type=str, help="opt path") 26 | parser.add_argument("--model_name", type=str, help="store in assets/weight_root") 27 | parser.add_argument("--index_rate", type=float, default=0.66, help="index rate") 28 | parser.add_argument("--device", type=str, help="device") 29 | parser.add_argument("--is_half", type=bool, help="use half -> True") 30 | parser.add_argument("--filter_radius", type=int, default=3, help="filter radius") 31 | parser.add_argument("--resample_sr", type=int, default=0, help="resample sr") 32 | parser.add_argument("--rms_mix_rate", type=float, default=1, help="rms mix rate") 33 | parser.add_argument("--protect", type=float, default=0.33, help="protect") 34 | 35 | args = parser.parse_args() 36 | sys.argv = sys.argv[:1] 37 | 38 | return args 39 | 40 | 41 | def main(): 42 | load_dotenv() 43 | args = arg_parse() 44 | config = Config() 45 | config.device = args.device if args.device else config.device 46 | config.is_half = args.is_half if args.is_half else config.is_half 47 | vc = VC(config) 48 | vc.get_vc(args.model_name) 49 | audios = os.listdir(args.input_path) 50 | for file in tq.tqdm(audios): 51 | if file.endswith(".wav"): 52 | file_path = os.path.join(args.input_path, file) 53 | _, wav_opt = vc.vc_single( 54 | 0, 55 | file_path, 56 | args.f0up_key, 57 | None, 58 | args.f0method, 59 | args.index_path, 60 | None, 61 | args.index_rate, 62 | args.filter_radius, 63 | args.resample_sr, 64 | args.rms_mix_rate, 65 | args.protect, 66 | ) 67 | out_path = os.path.join(args.opt_path, file) 68 | wavfile.write(out_path, wav_opt[0], wav_opt[1]) 69 | 70 | 71 | if __name__ == "__main__": 72 | main() 73 | 
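# A usage sketch (not part of the original repository). infer_batch_rvc.py above converts every
# .wav file in --input_path with the selected RVC model and writes the results to --opt_path.
# The flags below mirror its argparse definitions; the model name, folders and index path are
# placeholders, and the script is assumed to be launched from the rvc directory with the .env
# variables it loads via load_dotenv() already configured.
import subprocess
import sys

subprocess.run(
    [
        sys.executable, "tools/infer_batch_rvc.py",
        "--model_name", "my_voice.pth",        # expected under the weight_root folder (placeholder name)
        "--input_path", "input_wavs",          # folder containing the source .wav files (placeholder)
        "--opt_path", "output_wavs",           # existing folder for the converted files (placeholder)
        "--index_path", "logs/my_voice.index", # optional feature index (placeholder path)
        "--f0method", "harvest",               # default in the script; "pm" is the documented alternative
        "--f0up_key", "0",
    ],
    check=True,
)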
-------------------------------------------------------------------------------- /tts-cli/rvc/tools/infer_cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | 5 | now_dir = os.getcwd() 6 | sys.path.append(now_dir) 7 | from dotenv import load_dotenv 8 | from scipy.io import wavfile 9 | 10 | from configs.config import Config 11 | from infer.modules.vc.modules import VC 12 | 13 | #### 14 | # USAGE 15 | # 16 | # In your Terminal or CMD or whatever 17 | 18 | 19 | def arg_parse() -> tuple: 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("--f0up_key", type=int, default=0) 22 | parser.add_argument("--input_path", type=str, help="input path") 23 | parser.add_argument("--index_path", type=str, help="index path") 24 | parser.add_argument("--f0method", type=str, default="harvest", help="harvest or pm") 25 | parser.add_argument("--opt_path", type=str, help="opt path") 26 | parser.add_argument("--model_name", type=str, help="store in assets/weight_root") 27 | parser.add_argument("--index_rate", type=float, default=0.66, help="index rate") 28 | parser.add_argument("--device", type=str, help="device") 29 | parser.add_argument("--is_half", type=bool, help="use half -> True") 30 | parser.add_argument("--filter_radius", type=int, default=3, help="filter radius") 31 | parser.add_argument("--resample_sr", type=int, default=0, help="resample sr") 32 | parser.add_argument("--rms_mix_rate", type=float, default=1, help="rms mix rate") 33 | parser.add_argument("--protect", type=float, default=0.33, help="protect") 34 | 35 | args = parser.parse_args() 36 | sys.argv = sys.argv[:1] 37 | 38 | return args 39 | 40 | 41 | def main(): 42 | load_dotenv() 43 | args = arg_parse() 44 | config = Config() 45 | config.device = args.device if args.device else config.device 46 | config.is_half = args.is_half if args.is_half else config.is_half 47 | vc = VC(config) 48 | vc.get_vc(args.model_name) 49 | _, wav_opt = vc.vc_single( 50 | 0, 51 | args.input_path, 52 | args.f0up_key, 53 | None, 54 | args.f0method, 55 | args.index_path, 56 | None, 57 | args.index_rate, 58 | args.filter_radius, 59 | args.resample_sr, 60 | args.rms_mix_rate, 61 | args.protect, 62 | ) 63 | wavfile.write(args.opt_path, wav_opt[0], wav_opt[1]) 64 | 65 | 66 | if __name__ == "__main__": 67 | main() 68 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/onnx_inference_demo.py: -------------------------------------------------------------------------------- 1 | import soundfile 2 | 3 | from ..infer.lib.infer_pack.onnx_inference import OnnxRVC 4 | 5 | hop_size = 512 6 | sampling_rate = 40000 # 采样率 7 | f0_up_key = 0 # 升降调 8 | sid = 0 # 角色ID 9 | f0_method = "dio" # F0提取算法 10 | model_path = "ShirohaRVC.onnx" # 模型的完整路径 11 | vec_name = "vec-256-layer-9" # 内部自动补齐为 f"pretrained/{vec_name}.onnx" 需要onnx的vec模型 12 | wav_path = "123.wav" # 输入路径或ByteIO实例 13 | out_path = "out.wav" # 输出路径或ByteIO实例 14 | 15 | model = OnnxRVC( 16 | model_path, vec_path=vec_name, sr=sampling_rate, hop_size=hop_size, device="cuda" 17 | ) 18 | 19 | audio = model.inference(wav_path, sid, f0_method=f0_method, f0_up_key=f0_up_key) 20 | 21 | soundfile.write(out_path, audio, sampling_rate) 22 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/torchgate/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | TorchGating is a PyTorch-based implementation of 
Spectral Gating 3 | ================================================ 4 | Author: Asaf Zorea 5 | 6 | Contents 7 | -------- 8 | torchgate imports all the functions from PyTorch, and in addition provides: 9 | TorchGating --- A PyTorch module that applies a spectral gate to an input signal 10 | 11 | """ 12 | from .torchgate import TorchGate 13 | -------------------------------------------------------------------------------- /tts-cli/rvc/tools/torchgate/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/tools/torchgate/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/tools/torchgate/__pycache__/torchgate.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/tools/torchgate/__pycache__/torchgate.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/tools/torchgate/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KoljaB/ai_cli_tools/b488d0ece80d222c3d648677a404de57706ec6ed/tts-cli/rvc/tools/torchgate/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /tts-cli/rvc/tools/torchgate/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.types import Number 3 | 4 | 5 | @torch.no_grad() 6 | def amp_to_db( 7 | x: torch.Tensor, eps=torch.finfo(torch.float64).eps, top_db=40 8 | ) -> torch.Tensor: 9 | """ 10 | Convert the input tensor from amplitude to decibel scale. 11 | 12 | Arguments: 13 | x {[torch.Tensor]} -- [Input tensor.] 14 | 15 | Keyword Arguments: 16 | eps {[float]} -- [Small value to avoid numerical instability.] 17 | (default: {torch.finfo(torch.float64).eps}) 18 | top_db {[float]} -- [threshold the output at ``top_db`` below the peak] 19 | ` (default: {40}) 20 | 21 | Returns: 22 | [torch.Tensor] -- [Output tensor in decibel scale.] 23 | """ 24 | x_db = 20 * torch.log10(x.abs() + eps) 25 | return torch.max(x_db, (x_db.max(-1).values - top_db).unsqueeze(-1)) 26 | 27 | 28 | @torch.no_grad() 29 | def temperature_sigmoid(x: torch.Tensor, x0: float, temp_coeff: float) -> torch.Tensor: 30 | """ 31 | Apply a sigmoid function with temperature scaling. 32 | 33 | Arguments: 34 | x {[torch.Tensor]} -- [Input tensor.] 35 | x0 {[float]} -- [Parameter that controls the threshold of the sigmoid.] 36 | temp_coeff {[float]} -- [Parameter that controls the slope of the sigmoid.] 37 | 38 | Returns: 39 | [torch.Tensor] -- [Output tensor after applying the sigmoid with temperature scaling.] 40 | """ 41 | return torch.sigmoid((x - x0) / temp_coeff) 42 | 43 | 44 | @torch.no_grad() 45 | def linspace( 46 | start: Number, stop: Number, num: int = 50, endpoint: bool = True, **kwargs 47 | ) -> torch.Tensor: 48 | """ 49 | Generate a linearly spaced 1-D tensor. 50 | 51 | Arguments: 52 | start {[Number]} -- [The starting value of the sequence.] 53 | stop {[Number]} -- [The end value of the sequence, unless `endpoint` is set to False. 
54 | In that case, the sequence consists of all but the last of ``num + 1`` 55 | evenly spaced samples, so that `stop` is excluded. Note that the step 56 | size changes when `endpoint` is False.] 57 | 58 | Keyword Arguments: 59 | num {[int]} -- [Number of samples to generate. Default is 50. Must be non-negative.] 60 | endpoint {[bool]} -- [If True, `stop` is the last sample. Otherwise, it is not included. 61 | Default is True.] 62 | **kwargs -- [Additional arguments to be passed to the underlying PyTorch `linspace` function.] 63 | 64 | Returns: 65 | [torch.Tensor] -- [1-D tensor of `num` equally spaced samples from `start` to `stop`.] 66 | """ 67 | if endpoint: 68 | return torch.linspace(start, stop, num, **kwargs) 69 | else: 70 | return torch.linspace(start, stop, num + 1, **kwargs)[:-1] 71 | -------------------------------------------------------------------------------- /tts-cli/server.py: -------------------------------------------------------------------------------- 1 | if __name__ == "__main__": 2 | import json 3 | import logging 4 | import threading 5 | import asyncio 6 | from queue import Queue, Empty 7 | import websockets 8 | from xtts_rvc_synthesizer import XTTSRVCSynthesizer 9 | 10 | # Initialize parameters 11 | xtts_model = "models/xtts/Lasinya" 12 | xtts_voice = "Lasinya_Reference.json" 13 | rvc_model = "models/rvc/Lasinya" 14 | use_logging = True 15 | 16 | # Use thread-safe Queues for audio chunks and control messages 17 | audio_queue = Queue() 18 | control_queue = Queue() 19 | 20 | # Set to store audio WebSocket connections 21 | audio_connections = set() 22 | 23 | # Event to signal the threads to stop 24 | stop_event = threading.Event() 25 | 26 | class TTSThread(threading.Thread): 27 | def __init__(self): 28 | super().__init__() 29 | self.tts = None 30 | 31 | def run(self): 32 | self.tts = XTTSRVCSynthesizer( 33 | xtts_model=xtts_model, 34 | xtts_voice=xtts_voice, 35 | rvc_model=rvc_model, 36 | rvc_sample_rate=40000, 37 | use_logging=use_logging, 38 | on_audio_chunk=self.on_audio_chunk 39 | ) 40 | while not stop_event.is_set(): 41 | try: 42 | data = control_queue.get(timeout=0.1) 43 | if data["type"] == "text": 44 | self.tts.push_text(data["content"]) 45 | elif data["type"] == "synthesize": 46 | self.tts.synthesize() 47 | except Empty: 48 | continue 49 | 50 | def on_audio_chunk(self, chunk): 51 | print("received chunk") 52 | audio_queue.put(chunk) 53 | 54 | async def process_audio_queue(): 55 | while True: 56 | try: 57 | chunk = audio_queue.get_nowait() 58 | print("Processing chunk from queue") 59 | await broadcast_audio_chunk(chunk) 60 | except Empty: 61 | await asyncio.sleep(0.01) 62 | 63 | async def broadcast_audio_chunk(chunk): 64 | print("broadcast_audio_chunk was called") 65 | for conn in list(audio_connections): 66 | try: 67 | await conn.send(chunk) 68 | except websockets.exceptions.ConnectionClosed: 69 | audio_connections.remove(conn) 70 | 71 | async def control_handler(websocket, path): 72 | try: 73 | async for message in websocket: 74 | data = json.loads(message) 75 | control_queue.put(data) 76 | await websocket.send(json.dumps({"type": f"{data['type']}_received"})) 77 | except websockets.exceptions.ConnectionClosed: 78 | logging.info("Control WebSocket connection closed") 79 | 80 | async def audio_handler(websocket, path): 81 | try: 82 | audio_connections.add(websocket) 83 | await websocket.wait_closed() 84 | finally: 85 | audio_connections.remove(websocket) 86 | 87 | async def main(): 88 | # Start the TTS thread 89 | tts_thread = TTSThread() 90 | 
tts_thread.start() 91 | 92 | # Start the audio processing task 93 | audio_task = asyncio.create_task(process_audio_queue()) 94 | 95 | control_server = await websockets.serve(control_handler, "localhost", 8000) 96 | audio_server = await websockets.serve(audio_handler, "localhost", 8001) 97 | 98 | print("Server CONTROL listening on ws://localhost:8000") 99 | print("Server AUDIO listening on ws://localhost:8001") 100 | 101 | try: 102 | await asyncio.gather(control_server.wait_closed(), audio_server.wait_closed(), audio_task) 103 | finally: 104 | stop_event.set() 105 | tts_thread.join() 106 | 107 | logging.basicConfig(level=logging.DEBUG if use_logging else logging.WARNING) 108 | asyncio.run(main()) -------------------------------------------------------------------------------- /tts-cli/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="tts-cli", 5 | version="0.1", 6 | packages=find_packages(), 7 | entry_points={ 8 | 'console_scripts': [ 9 | 'tts=tts_client:main', 10 | 'tts-server=start_tts_server:main', 11 | ], 12 | }, 13 | ) -------------------------------------------------------------------------------- /tts-cli/start_tts_server.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | 5 | def main(): 6 | # Get the current script's directory (should be tts-cli) 7 | script_dir = os.path.dirname(os.path.abspath(__file__)) 8 | 9 | # Move one directory up to access the venv 10 | root_dir = os.path.dirname(script_dir) 11 | os.chdir(root_dir) 12 | 13 | # Path to the virtual environment 14 | venv_path = os.path.join(root_dir, 'venv') 15 | 16 | # Path to the Python interpreter in the virtual environment 17 | if sys.platform == "win32": 18 | python_path = os.path.join(venv_path, 'Scripts', 'python.exe') 19 | else: 20 | python_path = os.path.join(venv_path, 'bin', 'python') 21 | 22 | # Change back to the tts-cli directory 23 | os.chdir(script_dir) 24 | 25 | # Prepare the command to run tts_server.py with all provided arguments 26 | command = [python_path, 'tts_server.py'] + sys.argv[1:] 27 | 28 | # Start the TTS server 29 | print("Starting TTS server...") 30 | print(f"Command: {command}") 31 | try: 32 | subprocess.run(command, check=True) 33 | except subprocess.CalledProcessError as e: 34 | print(f"Error starting TTS server: {e}") 35 | sys.exit(1) 36 | except FileNotFoundError: 37 | print(f"Error: Could not find Python interpreter at {python_path}") 38 | print("Make sure the virtual environment is set up correctly.") 39 | sys.exit(1) 40 | 41 | if __name__ == "__main__": 42 | main() 43 | 44 | # import os 45 | # import sys 46 | # import subprocess 47 | 48 | # def start_tts_server(): 49 | # # Get the current script's directory (should be tts-cli) 50 | # script_dir = os.path.dirname(os.path.abspath(__file__)) 51 | 52 | # # Move one directory up to access the venv 53 | # root_dir = os.path.dirname(script_dir) 54 | # os.chdir(root_dir) 55 | 56 | # # Path to the virtual environment 57 | # venv_path = os.path.join(root_dir, 'venv') 58 | 59 | # # Path to the Python interpreter in the virtual environment 60 | # if sys.platform == "win32": 61 | # python_path = os.path.join(venv_path, 'Scripts', 'python.exe') 62 | # else: 63 | # python_path = os.path.join(venv_path, 'bin', 'python') 64 | 65 | # # Change back to the tts-cli directory 66 | # os.chdir(script_dir) 67 | 68 | # # Start the TTS server 69 | # print("Starting 
TTS server...") 70 | # try: 71 | # subprocess.run([python_path, 'tts_server.py'], check=True) 72 | # except subprocess.CalledProcessError as e: 73 | # print(f"Error starting TTS server: {e}") 74 | # except FileNotFoundError: 75 | # print(f"Error: Could not find Python interpreter at {python_path}") 76 | # print("Make sure the virtual environment is set up correctly.") 77 | 78 | # def main(): 79 | # start_tts_server() 80 | 81 | # if __name__ == "__main__": 82 | # main() 83 | --------------------------------------------------------------------------------
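# A client sketch (illustrative only, not a file from the repository). tts-cli/server.py above
# exposes two WebSocket endpoints: JSON control messages ("text" with a "content" field, then
# "synthesize") go to ws://localhost:8000, and the synthesized audio chunks are broadcast as
# binary frames on ws://localhost:8001. This minimal client assumes the server is already
# running and uses only the message shapes visible in control_handler and TTSThread.run; the
# audio chunk format (sample rate/encoding) is not specified there, so a real client would
# need to match whatever XTTSRVCSynthesizer emits.
import asyncio
import json

import websockets


async def speak(text: str) -> None:
    async with websockets.connect("ws://localhost:8001") as audio_ws, \
               websockets.connect("ws://localhost:8000") as control_ws:
        # Queue the text, then trigger synthesis.
        await control_ws.send(json.dumps({"type": "text", "content": text}))
        print(await control_ws.recv())  # server acknowledges with {"type": "text_received"}
        await control_ws.send(json.dumps({"type": "synthesize"}))
        print(await control_ws.recv())  # server acknowledges with {"type": "synthesize_received"}
        # Read a handful of audio frames; a real client would stream them to a playback device.
        for _ in range(10):
            chunk = await audio_ws.recv()
            print(f"received audio frame of {len(chunk)} bytes")


if __name__ == "__main__":
    asyncio.run(speak("Hello from a minimal tts-cli client."))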