├── .dockerignore ├── .gitattributes ├── .gitignore ├── .python-version ├── LICENSE ├── README.md ├── RVCVoiceChanger_colab.ipynb ├── cog.yaml ├── images ├── webui_dl_model.png ├── webui_generate.png └── webui_upload_model.png ├── predict.py ├── requirements.txt ├── rvc_models ├── MODELS.txt └── public_models.json └── src ├── configs ├── 32k.json ├── 32k_v2.json ├── 40k.json ├── 48k.json └── 48k_v2.json ├── download_models.py ├── infer_pack ├── attentions.py ├── commons.py ├── models.py ├── models_onnx.py ├── models_onnx_moess.py ├── modules.py └── transforms.py ├── main.py ├── mdx.py ├── my_utils.py ├── rmvpe.py ├── rvc.py ├── trainset_preprocess_pipeline_print.py ├── vc_infer_pipeline.py └── webui.py /.dockerignore: -------------------------------------------------------------------------------- 1 | # The .dockerignore file excludes files from the container build process. 2 | # 3 | # https://docs.docker.com/engine/reference/builder/#dockerignore-file 4 | 5 | # Exclude Git files 6 | .git 7 | .github 8 | .gitignore 9 | 10 | # Exclude Python cache files 11 | __pycache__ 12 | .mypy_cache 13 | .pytest_cache 14 | .ruff_cache 15 | 16 | # Exclude Python virtual environment 17 | /venv 18 | 19 | # Output 20 | voice_output/*/*.wav 21 | voice_output/*/*.mp3 -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # RVC Models 2 | rvc_models/*/*.pth 3 | rvc_models/*/*.index 4 | rvc_models/*/*.npy 5 | rvc_models/hubert_base.pt 6 | rvc_models/rmvpe.pt 7 | 8 | # Output 9 | voice_output/*/*.wav 10 | voice_output/*/*.mp3 11 | 12 | # Replicate 13 | nb.ipynb 14 | output.mp3 15 | *.zip 16 | *.wav 17 | rvc_models/*/*.json 18 | 19 | # Byte-compiled / optimized / DLL files 20 | __pycache__/ 21 | *.py[cod] 22 | *$py.class 23 | 24 | # C extensions 25 | *.so 26 | 27 | # Distribution / packaging 28 | .Python 29 | build/ 30 | develop-eggs/ 31 | dist/ 32 | downloads/ 33 | eggs/ 34 | .eggs/ 35 | lib/ 36 | lib64/ 37 | parts/ 38 | sdist/ 39 | var/ 40 | wheels/ 41 | share/python-wheels/ 42 | *.egg-info/ 43 | .installed.cfg 44 | *.egg 45 | MANIFEST 46 | 47 | # PyInstaller 48 | # Usually these files are written by a python script from a template 49 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
50 | *.manifest 51 | *.spec 52 | 53 | # Installer logs 54 | pip-log.txt 55 | pip-delete-this-directory.txt 56 | 57 | # Unit test / coverage reports 58 | htmlcov/ 59 | .tox/ 60 | .nox/ 61 | .coverage 62 | .coverage.* 63 | .cache 64 | nosetests.xml 65 | coverage.xml 66 | *.cover 67 | *.py,cover 68 | .hypothesis/ 69 | .pytest_cache/ 70 | cover/ 71 | 72 | # Translations 73 | *.mo 74 | *.pot 75 | 76 | # Django stuff: 77 | *.log 78 | local_settings.py 79 | db.sqlite3 80 | db.sqlite3-journal 81 | 82 | # Flask stuff: 83 | instance/ 84 | .webassets-cache 85 | 86 | # Scrapy stuff: 87 | .scrapy 88 | 89 | # Sphinx documentation 90 | docs/_build/ 91 | 92 | # PyBuilder 93 | .pybuilder/ 94 | target/ 95 | 96 | # Jupyter Notebook 97 | .ipynb_checkpoints 98 | 99 | # IPython 100 | profile_default/ 101 | ipython_config.py 102 | 103 | # pyenv 104 | # For a library or package, you might want to ignore these files since the code is 105 | # intended to run in multiple environments; otherwise, check them in: 106 | # .python-version 107 | 108 | # pipenv 109 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 110 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 111 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 112 | # install all needed dependencies. 113 | #Pipfile.lock 114 | 115 | # poetry 116 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 117 | # This is especially recommended for binary packages to ensure reproducibility, and is more 118 | # commonly ignored for libraries. 119 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 120 | #poetry.lock 121 | 122 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 123 | __pypackages__/ 124 | 125 | # Celery stuff 126 | celerybeat-schedule 127 | celerybeat.pid 128 | 129 | # SageMath parsed files 130 | *.sage.py 131 | 132 | # Environments 133 | .env 134 | .venv 135 | env/ 136 | venv/ 137 | ENV/ 138 | env.bak/ 139 | venv.bak/ 140 | 141 | # Spyder project settings 142 | .spyderproject 143 | .spyproject 144 | 145 | # Rope project settings 146 | .ropeproject 147 | 148 | # mkdocs documentation 149 | /site 150 | 151 | # mypy 152 | .mypy_cache/ 153 | .dmypy.json 154 | dmypy.json 155 | 156 | # Pyre type checker 157 | .pyre/ 158 | 159 | # pytype static type analyzer 160 | .pytype/ 161 | 162 | # Cython debug symbols 163 | cython_debug/ 164 | 165 | # PyCharm 166 | # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can 167 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 168 | # and can be added to the global gitignore or merged into this file. For a more nuclear 169 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
170 | .idea/ 171 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.9 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 SociallyIneptWeeb 4 | Copyright (c) 2024 PseudoRAM 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | 24 | This project is a substantial rewrite and repurpose of the original AICoverGen 25 | project by SociallyIneptWeeb. The rewrite was done by PseudoRAM in 2024. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RVC Voice Changer 2 | 3 | [![Replicate](https://replicate.com/pseudoram/rvc-v2/badge)](https://replicate.com/pseudoram/rvc-v2) 4 | 5 | An autonomous pipeline to change voices using any RVC v2 trained AI voice model. This tool can be used to apply voice conversion to any audio input. 6 | 7 | ![](images/webui_generate.png?raw=true) 8 | 9 | The WebUI is under constant development and testing, but you can try it out locally right now! 10 | 11 | ## Update RVC Voice Changer to latest version 12 | 13 | Pull the latest changes and install any new requirements by opening a command line window in the `RVC-v2-UI` directory and running the following commands. 14 | 15 | 16 | 17 | ``` 18 | git pull 19 | pip install -r requirements.txt 20 | ``` 21 | 22 | For Colab users, simply click `Runtime` in the top navigation bar of the Colab notebook and select `Disconnect and delete runtime` in the dropdown menu. 23 | Then follow the instructions in the notebook to run the WebUI. 24 | 25 | ## Colab notebook 26 | 27 | (Hopefully coming soon) 28 | 29 | ## Setup 30 | 31 | ### Install Git and Python 32 | 33 | 34 | Follow the instructions [here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) to install Git on your computer. Also follow this [guide](https://realpython.com/installing-python/) to install Python **VERSION 3.9** if you haven't already. Using other versions of Python may result in dependency conflicts. 35 | 36 | Alternatively, you can use pyenv to manage Python versions: 37 | 38 | 1. Install pyenv following the instructions [here](https://github.com/pyenv/pyenv#installation). 39 | 2.
Install Python 3.9: 40 | ``` 41 | pyenv install 3.9 42 | ``` 43 | 3. Set it as your local Python version: 44 | ``` 45 | pyenv local 3.9 46 | ``` 47 | 48 | 49 | ### Install ffmpeg 50 | 51 | Follow the instructions [here](https://www.hostinger.com/tutorials/how-to-install-ffmpeg) to install ffmpeg on your computer. 52 | 53 | 54 | ### Clone RVC-v2-UI repository and set up virtual environment 55 | 56 | Open a command line window and run these commands to clone this entire repository, create a virtual environment, and install the additional dependencies required. 57 | 58 | ``` 59 | git clone https://github.com/PseudoRAM/RVC-v2-UI 60 | cd RVC-v2-UI 61 | ``` 62 | #### Create and activate virtual environment 63 | ##### Using pyenv 64 | ``` 65 | pyenv exec python -m venv venv 66 | ``` 67 | ##### Not using pyenv 68 | ``` 69 | python -m venv venv 70 | ``` 71 | 72 | ##### Activate virtual environment 73 | ##### Windows 74 | ``` 75 | venv\Scripts\activate 76 | ``` 77 | ##### macOS and Linux 78 | ``` 79 | source venv/bin/activate 80 | ``` 81 | 82 | #### Install dependencies 83 | ``` 84 | pip install -r requirements.txt 85 | ``` 86 | 87 | ### Download required models 88 | 89 | Run the following command to download the required hubert base and rmvpe models. 90 | 91 | ``` 92 | python src/download_models.py 93 | ``` 94 | 95 | 96 | ## Usage with WebUI 97 | 98 | To run the RVC Voice Changer WebUI, run the following command. 99 | 100 | ``` 101 | python src/webui.py 102 | ``` 103 | 104 | | Flag | Description | 105 | |--------------------------------------------|-------------| 106 | | `-h`, `--help` | Show this help message and exit. | 107 | | `--share` | Create a public URL. This is useful for running the web UI on Google Colab. | 108 | | `--listen` | Make the web UI reachable from your local network. | 109 | | `--listen-host LISTEN_HOST` | The hostname that the server will use. | 110 | | `--listen-port LISTEN_PORT` | The listening port that the server will use. | 111 | 112 | Once the output message `Running on local URL: http://127.0.0.1:7860` appears, you can click on the link to open a tab with the WebUI. 113 | 114 | ### Download RVC models via WebUI 115 | 116 | ![](images/webui_dl_model.png?raw=true) 117 | 118 | Navigate to the `Download model` tab, paste the download link for the RVC model, and give it a unique name. 119 | You can search the [AI Hub Discord](https://discord.gg/aihub), where already trained voice models are available for download. You may refer to the examples for what the download link should look like. 120 | The downloaded zip file should contain the .pth model file and an optional .index file. 121 | 122 | Once the two input fields are filled in, simply click `Download`! Once the output message says `[NAME] Model successfully downloaded!`, you should be able to use it in the `Convert Voice` tab after clicking the refresh models button! 123 | 124 | ### Upload RVC models via WebUI 125 | 126 | ![](images/webui_upload_model.png?raw=true) 127 | 128 | This is for people who have trained RVC v2 models locally and would like to use them for voice conversion. 129 | Navigate to the `Upload model` tab, and follow the instructions. 130 | Once the output message says `[NAME] Model successfully uploaded!`, you should be able to use it in the `Convert Voice` tab after clicking the refresh models button! 131 | 132 | ### Running the pipeline via WebUI 133 | 134 | ![](images/webui_generate.png?raw=true) 135 | 136 | - From the Voice Models dropdown menu, select the voice model to use.
Click `Refresh Models` to refresh the list if you added the model files manually to the [rvc_models](rvc_models) directory. 137 | - In the Input Audio field, upload your audio file. 138 | - Adjust the pitch as needed. This changes the pitch of the output voice. 139 | - Other advanced options for voice conversion can be viewed by clicking the accordion arrow to expand them. 140 | 141 | Once all options are filled in, click `Convert` and the AI-generated voice should appear in a few moments, depending on your GPU. 142 | 143 | ## Usage with CLI 144 | 145 | ### Running the pipeline 146 | 147 | To run the voice conversion pipeline using the command line, run the following command: 148 | 149 | ``` 150 | python src/main.py input_audio rvc_model [pitch] [f0_method] [index_rate] [filter_radius] [rms_mix_rate] [protect] 151 | ``` 152 | 153 | | Parameter | Description | 154 | |----------------------------|-------------| 155 | | `input_audio` | Path to the input audio file. | 156 | | `rvc_model` | Name of the RVC model to use. | 157 | | `pitch` | (Optional) Pitch change in semitones. Default is 0. | 158 | | `f0_method` | (Optional) Pitch detection algorithm. Options: 'rmvpe' (default) or 'mangio-crepe'. | 159 | | `index_rate` | (Optional) Index rate for the voice conversion. Default is 0.5. Range: 0 to 1. | 160 | | `filter_radius` | (Optional) Filter radius for median filtering. Default is 3. Range: 0 to 7. | 161 | | `rms_mix_rate` | (Optional) RMS mix rate. Default is 0.25. Range: 0 to 1. | 162 | | `protect` | (Optional) Protect rate to preserve some original voice characteristics. Default is 0.33. Range: 0 to 0.5. | 163 | 164 | Example usage: 165 | ``` 166 | python src/main.py "path/to/input/audio.wav" "JohnDoe" 2 rmvpe 0.7 3 0.3 0.35 167 | ``` 168 | This command will convert the voice in "audio.wav" using the "JohnDoe" RVC model, raising the pitch by 2 semitones, using the 'rmvpe' pitch detection algorithm, with an index rate of 0.7, a filter radius of 3, an RMS mix rate of 0.3, and a protect rate of 0.35. A short sketch of calling the same pipeline from Python is included at the end of this README. 169 | 170 | 171 | ## Manual Download of RVC models 172 | 173 | Unzip (if needed) and transfer the `.pth` and `.index` files to a new folder in the [rvc_models](rvc_models) directory. Each folder should only contain one `.pth` and one `.index` file. 174 | 175 | The directory structure should look something like this: 176 | ``` 177 | ├── rvc_models 178 | │ ├── John 179 | │ │ ├── JohnV2.pth 180 | │ │ └── added_IVF2237_Flat_nprobe_1_v2.index 181 | │ ├── May 182 | │ │ ├── May.pth 183 | │ │ └── added_IVF2237_Flat_nprobe_1_v2.index 184 | │ ├── MODELS.txt 185 | │ └── hubert_base.pt 186 | ├── voice_output 187 | └── src 188 | ``` 189 | 190 | 191 | 192 | ## Terms of Use 193 | 194 | The use of the converted voice for the following purposes is prohibited. 195 | 196 | * Criticizing or attacking individuals. 197 | * Advocating for or opposing specific political positions, religions, or ideologies. 198 | * Publicly displaying strongly stimulating expressions without proper zoning. 199 | * Selling of voice models and generated voice clips. 200 | * Impersonation of the original owner of the voice with malicious intentions to harm/hurt others. 201 | * Fraudulent purposes that lead to identity theft or fraudulent phone calls. 202 | 203 | ## Disclaimer 204 | 205 | I am not liable for any direct, indirect, consequential, incidental, or special damages arising out of or in any way connected with the use/misuse or inability to use this software.
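## Programmatic usage (sketch)

The CLI and WebUI both wrap the conversion function in `src/main.py`. The snippet below is a minimal sketch of calling it from your own Python code, assuming the `voice_conversion` function and argument order that `predict.py` in this repository uses; treat the exact signature as illustrative rather than a stable API.

```python
# Minimal sketch, assuming src/main.py exposes voice_conversion()
# with the same argument order used by predict.py in this repository.
import os
import sys

# Make src/ importable, mirroring what predict.py does.
sys.path.insert(0, os.path.abspath("src"))

import main as m  # src/main.py

output_path = m.voice_conversion(
    "path/to/input/audio.wav",  # input audio file
    "JohnDoe",                  # model folder name under rvc_models/
    2,                          # pitch change in semitones
    "rmvpe",                    # f0 method: 'rmvpe' or 'mangio-crepe'
    0.7,                        # index rate (0 to 1)
    3,                          # filter radius (0 to 7)
    0.3,                        # RMS mix rate (0 to 1)
    0.35,                       # protect (0 to 0.5)
)
print(f"Converted audio written to {output_path}")
```

Make sure the base models have been downloaded (`python src/download_models.py`) and that the chosen model folder exists under `rvc_models/` before calling it.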
-------------------------------------------------------------------------------- /RVCVoiceChanger_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "YYVAKuNBc-X4" 7 | }, 8 | "source": [ 9 | "# RVC Voice Changer WebUI\n", 10 | "\n", 11 | "Simply click `Runtime` in the top navigation bar and `Run all`. Wait for the output of the final cell to show the public gradio url and click on it." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "cellView": "form", 19 | "id": "vC4gLMHI9xb3" 20 | }, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "Done Cloning Repository\n" 27 | ] 28 | }, 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | "Timer: 00:03:16" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "#@title Clone repository\n", 39 | "from IPython.display import clear_output, Javascript\n", 40 | "import codecs\n", 41 | "import threading\n", 42 | "import time\n", 43 | "cloneing=codecs.decode('uggcf://tvguho.pbz/LbheTvgUhoHfreanzr/EIP-i2-HV.tvg','rot_13')\n", 44 | "!git clone $cloneing RVC-v2-UI\n", 45 | "def update_timer_and_print():\n", 46 | " global timer\n", 47 | " while True:\n", 48 | " hours, remainder = divmod(timer, 3600)\n", 49 | " minutes, seconds = divmod(remainder, 60)\n", 50 | " timer_str = f'{hours:02}:{minutes:02}:{seconds:02}'\n", 51 | " print(f'\\rTimer: {timer_str}', end='', flush=True) # Print without a newline\n", 52 | " time.sleep(1)\n", 53 | " timer += 1\n", 54 | "timer = 0\n", 55 | "threading.Thread(target=update_timer_and_print, daemon=True).start()\n", 56 | "\n", 57 | "!rm -rf sample_data\n", 58 | "%cd RVC-v2-UI\n", 59 | "clear_output()\n", 60 | "print(\"Done Cloning Repository\")" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 3, 66 | "metadata": { 67 | "cellView": "form", 68 | "id": "odzpJHpr_PaF" 69 | }, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "Finished running this cell, proceed to the next cell\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "#@title Install requirements\n", 81 | "!pip install -q -r requirements.txt\n", 82 | "clear_output()\n", 83 | "print(\"Finished Installing Requirements\")\n", 84 | "!sudo apt update\n", 85 | "clear_output()\n", 86 | "print(\"Finished Updating\")\n", 87 | "!sudo apt install sox\n", 88 | "clear_output()\n", 89 | "print(\"Finished running this cell, proceed to the next cell\")" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 4, 95 | "metadata": { 96 | "cellView": "form", 97 | "id": "SLWpcJc0AHSZ" 98 | }, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "Finished Downloading Hubert Base Model\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "#@title Download Hubert Base Model\n", 110 | "models=codecs.decode('fep/qbjaybnq_zbqryf.cl','rot_13')\n", 111 | "!python $models\n", 112 | "clear_output()\n", 113 | "print(\"Finished Downloading Hubert Base Model\")" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "cellView": "form", 121 | "id": "NEglTq6Ya9d0" 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "#@title Run WebUI\n", 126 | "runpice=codecs.decode('fep/jrohv.cl','rot_13')\n", 127 | "!python $runpice --share" 128 | ] 129 | } 130 | ], 131 | "metadata": { 132 | 
"accelerator": "GPU", 133 | "colab": { 134 | "provenance": [] 135 | }, 136 | "kernelspec": { 137 | "display_name": "Python 3", 138 | "name": "python3" 139 | }, 140 | "language_info": { 141 | "codemirror_mode": { 142 | "name": "ipython", 143 | "version": 3 144 | }, 145 | "file_extension": ".py", 146 | "mimetype": "text/x-python", 147 | "name": "python", 148 | "nbconvert_exporter": "python", 149 | "pygments_lexer": "ipython3", 150 | "version": "3.9.13" 151 | } 152 | }, 153 | "nbformat": 4, 154 | "nbformat_minor": 0 155 | } 156 | -------------------------------------------------------------------------------- /cog.yaml: -------------------------------------------------------------------------------- 1 | # Configuration for Cog ⚙️ 2 | # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md 3 | 4 | build: 5 | # set to true if your model requires a GPU 6 | gpu: true 7 | cuda: "11.8" 8 | # a list of ubuntu apt packages to install 9 | system_packages: 10 | - "libgl1-mesa-glx" 11 | - "ffmpeg" 12 | 13 | # python version in the form '3.11' or '3.11.4' 14 | python_version: "3.9" 15 | 16 | # a list of packages in the format == 17 | python_packages: 18 | - "fairseq==0.12.2" 19 | - "faiss-cpu==1.7.3" 20 | - "ffmpeg-python>=0.2.0" 21 | - "gradio==4.37.1" 22 | - "librosa==0.9.1" 23 | - "numpy==1.26.4" 24 | - "onnxruntime_gpu" 25 | - "praat-parselmouth>=0.4.2" 26 | - "pyworld==0.3.4" 27 | - "Requests==2.31.0" 28 | - "scipy==1.13.1" 29 | - "soundfile==0.12.1" 30 | - "--find-links https://download.pytorch.org/whl/torch_stable.html" 31 | - "torch==2.0.1+cu118" 32 | - "torchcrepe==0.0.20" 33 | - "tqdm==4.65.0" 34 | 35 | # commands run after the environment is setup 36 | run: 37 | - pip install --upgrade pip 38 | - apt-get update && apt-get install -y ffmpeg 39 | - pip install imageio[ffmpeg] 40 | 41 | # predict.py defines how predictions are run on your model 42 | predict: "predict.py:Predictor" 43 | -------------------------------------------------------------------------------- /images/webui_dl_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PseudoRAM/RVC-v2-UI/2934eb37906e8f310c91ac7eb82510677fb4a2f1/images/webui_dl_model.png -------------------------------------------------------------------------------- /images/webui_generate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PseudoRAM/RVC-v2-UI/2934eb37906e8f310c91ac7eb82510677fb4a2f1/images/webui_generate.png -------------------------------------------------------------------------------- /images/webui_upload_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PseudoRAM/RVC-v2-UI/2934eb37906e8f310c91ac7eb82510677fb4a2f1/images/webui_upload_model.png -------------------------------------------------------------------------------- /predict.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import urllib.parse 4 | from argparse import Namespace 5 | from cog import BasePredictor, Input, Path as CogPath 6 | 7 | sys.path.insert(0, os.path.abspath("src")) 8 | 9 | import main as m 10 | 11 | class Predictor(BasePredictor): 12 | def setup(self): 13 | """Load the model into memory to make running multiple predictions efficient""" 14 | pass 15 | 16 | def predict( 17 | self, 18 | input_audio: CogPath = Input( 19 | description="Upload your audio file here.", 20 | 
default=None, 21 | ), 22 | rvc_model: str = Input( 23 | description="RVC model for a specific voice. If using a custom model, this should match the name of the downloaded model. If a 'custom_rvc_model_download_url' is provided, this will be automatically set to the name of the downloaded model.", 24 | default="Obama", 25 | choices=[ 26 | "Obama", 27 | "Trump", 28 | "Sandy", 29 | "Rogan", 30 | "Obama", 31 | "CUSTOM", 32 | ], 33 | ), 34 | custom_rvc_model_download_url: str = Input( 35 | description="URL to download a custom RVC model. If provided, the model will be downloaded (if it doesn't already exist) and used for prediction, regardless of the 'rvc_model' value.", 36 | default=None, 37 | ), 38 | pitch_change: float = Input( 39 | description="Adjust pitch of AI vocals in semitones. Use positive values to increase pitch, negative to decrease.", 40 | default=0, 41 | ), 42 | index_rate: float = Input( 43 | description="Control how much of the AI's accent to leave in the vocals.", 44 | default=0.5, 45 | ge=0, 46 | le=1, 47 | ), 48 | filter_radius: int = Input( 49 | description="If >=3: apply median filtering to the harvested pitch results.", 50 | default=3, 51 | ge=0, 52 | le=7, 53 | ), 54 | rms_mix_rate: float = Input( 55 | description="Control how much to use the original vocal's loudness (0) or a fixed loudness (1).", 56 | default=0.25, 57 | ge=0, 58 | le=1, 59 | ), 60 | f0_method: str = Input( 61 | description="Pitch detection algorithm. 'rmvpe' for clarity in vocals, 'mangio-crepe' for smoother vocals.", 62 | default="rmvpe", 63 | choices=["rmvpe", "mangio-crepe"], 64 | ), 65 | crepe_hop_length: int = Input( 66 | description="When `f0_method` is set to `mangio-crepe`, this controls how often it checks for pitch changes in milliseconds.", 67 | default=128, 68 | ), 69 | protect: float = Input( 70 | description="Control how much of the original vocals' breath and voiceless consonants to leave in the AI vocals. Set 0.5 to disable.", 71 | default=0.33, 72 | ge=0, 73 | le=0.5, 74 | ), 75 | output_format: str = Input( 76 | description="wav for best quality and large file size, mp3 for decent quality and small file size.", 77 | default="mp3", 78 | choices=["mp3", "wav"], 79 | ), 80 | ) -> CogPath: 81 | """ 82 | Runs a single prediction on the model. 83 | """ 84 | if custom_rvc_model_download_url: 85 | custom_rvc_model_download_name = urllib.parse.unquote( 86 | custom_rvc_model_download_url.split("/")[-1] 87 | ) 88 | custom_rvc_model_download_name = os.path.splitext( 89 | custom_rvc_model_download_name 90 | )[0] 91 | print( 92 | f"[!] The model will be downloaded as '{custom_rvc_model_download_name}'." 93 | ) 94 | m.download_online_model( 95 | url=custom_rvc_model_download_url, 96 | dir_name=custom_rvc_model_download_name, 97 | overwrite=True 98 | ) 99 | rvc_model = custom_rvc_model_download_name 100 | else: 101 | print( 102 | "[!] Since no URL was provided, we will use the selected RVC model." 103 | ) 104 | 105 | rvc_dirname = rvc_model 106 | if not os.path.exists(os.path.join(m.rvc_models_dir, rvc_dirname)): 107 | raise Exception( 108 | f"The folder {os.path.join(m.rvc_models_dir, rvc_dirname)} does not exist." 
109 | ) 110 | 111 | output_path = m.voice_conversion( 112 | str(input_audio), 113 | rvc_dirname, 114 | pitch_change, 115 | f0_method, 116 | index_rate, 117 | filter_radius, 118 | rms_mix_rate, 119 | protect 120 | ) 121 | print(f"[+] Converted audio generated at {output_path}") 122 | 123 | # Return the output path 124 | return CogPath(output_path) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fairseq==0.12.2 2 | faiss-cpu==1.7.3 3 | ffmpeg-python>=0.2.0 4 | gradio==4.37.1 5 | librosa==0.9.1 6 | numpy==1.26.4 7 | onnxruntime_gpu 8 | praat-parselmouth>=0.4.2 9 | pydantic==2.5.2 10 | pyworld==0.3.4 11 | Requests==2.31.0 12 | scipy==1.13.1 13 | soundfile==0.12.1 14 | --find-links https://download.pytorch.org/whl/torch_stable.html 15 | torch==2.0.1+cu118 16 | torchcrepe==0.0.20 17 | tqdm==4.65.0 18 | -------------------------------------------------------------------------------- /rvc_models/MODELS.txt: -------------------------------------------------------------------------------- 1 | RVC Models can be added as a folder here. Each folder should contain the model file (.pth extension), and an index file (.index extension). 2 | For example, a folder called Maya, containing 2 files, Maya.pth and added_IVF1905_Flat_nprobe_Maya_v2.index. -------------------------------------------------------------------------------- /rvc_models/public_models.json: -------------------------------------------------------------------------------- 1 | { 2 | "tags": { 3 | "English": "Character speaks English", 4 | "Japanese": "Character speaks Japanese", 5 | "Other Language": "The character speaks Other Language", 6 | "Anime": "Character from anime", 7 | "Vtuber": "Character is a vtuber", 8 | "Real person": "A person who exists in the real world", 9 | "Game character": "A character from the game" 10 | }, 11 | "voice_models": [ 12 | { 13 | "name": "Emilia", 14 | "url": "https://huggingface.co/RinkaEmina/RVC_Sharing/resolve/main/Emilia%20V2%2048000.zip", 15 | "description": "Emilia from Re:Zero", 16 | "added": "2023-07-31", 17 | "credit": "rinka4759", 18 | "tags": [ 19 | "Anime" 20 | ] 21 | }, 22 | { 23 | "name": "Klee", 24 | "url": "https://huggingface.co/qweshkka/Klee/resolve/main/Klee.zip", 25 | "description": "Klee from Genshin Impact", 26 | "added": "2023-07-31", 27 | "credit": "qweshsmashjuicefruity", 28 | "tags": [ 29 | "Game character", 30 | "Japanese" 31 | ] 32 | }, 33 | { 34 | "name": "Yelan", 35 | "url": "https://huggingface.co/iroaK/RVC2_Yelan_GenshinImpact/resolve/main/YelanJP.zip", 36 | "description": "Yelan from Genshin Impact", 37 | "added": "2023-07-31", 38 | "credit": "iroak", 39 | "tags": [ 40 | "Game character", 41 | "Japanese" 42 | ] 43 | }, 44 | { 45 | "name": "Yae Miko", 46 | "url": "https://huggingface.co/iroaK/RVC2_YaeMiko_GenshinImpact/resolve/main/Yae_MikoJP.zip", 47 | "description": "Yae Miko from Genshin Impact", 48 | "added": "2023-07-31", 49 | "credit": "iroak", 50 | "tags": [ 51 | "Game character", 52 | "Japanese" 53 | ] 54 | }, 55 | { 56 | "name": "Lisa", 57 | "url": "https://huggingface.co/qweshkka/Lisa2ver/resolve/main/Lisa.zip", 58 | "description": "Lisa from Genshin Impact", 59 | "added": "2023-07-31", 60 | "credit": "qweshsmashjuicefruity", 61 | "tags": [ 62 | "Game character", 63 | "English" 64 | ] 65 | }, 66 | { 67 | "name": "Kazuha", 68 | "url": "https://huggingface.co/iroaK/RVC2_Kazuha_GenshinImpact/resolve/main/Kazuha.zip", 69 | 
"description": "Kaedehara Kazuha from Genshin Impact", 70 | "added": "2023-07-31", 71 | "credit": "iroak", 72 | "tags": [ 73 | "Game character", 74 | "Japanese" 75 | ] 76 | }, 77 | { 78 | "name": "Barbara", 79 | "url": "https://huggingface.co/iroaK/RVC2_Barbara_GenshinImpact/resolve/main/BarbaraJP.zip", 80 | "description": "Barbara from Genshin Impact", 81 | "added": "2023-07-31", 82 | "credit": "iroak", 83 | "tags": [ 84 | "Game character", 85 | "Japanese" 86 | ] 87 | }, 88 | { 89 | "name": "Tom Holland", 90 | "url": "https://huggingface.co/TJKAI/TomHolland/resolve/main/TomHolland.zip", 91 | "description": "Tom Holland (Spider-Man)", 92 | "added": "2023-08-03", 93 | "credit": "tjkcreative", 94 | "tags": [ 95 | "Real person", 96 | "English" 97 | ] 98 | }, 99 | { 100 | "name": "Kamisato Ayaka", 101 | "url": "https://huggingface.co/benitheworld/ayaka-cn/resolve/main/ayaka-cn.zip", 102 | "description": "Kamisato Ayaka from Genshin Impact - CN voice actor", 103 | "added": "2023-08-03", 104 | "credit": "kannysoap", 105 | "tags": [ 106 | "Game character", 107 | "Other Language" 108 | ] 109 | }, 110 | { 111 | "name": "Amai Odayaka", 112 | "url": "https://huggingface.co/NoIdea4Username/NoIdeaRVCCollection/resolve/main/Amai-Odayaka.zip", 113 | "description": "Amai Odayaka from Yandere Simulator", 114 | "added": "2023-08-03", 115 | "credit": "minecraftian47", 116 | "tags": [ 117 | "Anime", 118 | "English" 119 | ] 120 | }, 121 | { 122 | "name": "Compa - Hyperdimension Neptunia", 123 | "url": "https://huggingface.co/zeerowiibu/WiibuRVCCollection/resolve/main/Compa%20(Choujigen%20Game%20Neptunia)%20(JPN)%20(RVC%20v2)%20(150%20Epochs).zip", 124 | "description": "Compa from Choujigen Game Neptune (aka Hyperdimension Neptunia)", 125 | "added": "2023-08-03", 126 | "credit": "zeerowiibu", 127 | "tags": [ 128 | "Anime", 129 | "Japanese" 130 | ] 131 | }, 132 | { 133 | "name": "Fu Xuan", 134 | "url": "https://huggingface.co/Juneuarie/FuXuan/resolve/main/FuXuan.zip", 135 | "description": "Fu Xuan from Honkai Star Rail (HSR)", 136 | "added": "2023-08-03", 137 | "credit": "__june", 138 | "tags": [ 139 | "Game character", 140 | "English" 141 | ] 142 | }, 143 | { 144 | "name": "Xinyan", 145 | "url": "https://huggingface.co/AnimeSessions/rvc_voice_models/resolve/main/XinyanRVC.zip", 146 | "description": "Xinyan from Genshin Impact", 147 | "added": "2023-08-03", 148 | "credit": "shyelijah", 149 | "tags": [ 150 | "Game character", 151 | "English" 152 | ] 153 | }, 154 | { 155 | "name": "Enterprise", 156 | "url": "https://huggingface.co/NoIdea4Username/NoIdeaRVCCollection/resolve/main/Enterprise-JP.zip", 157 | "description": "Enterprise from Azur Lane", 158 | "added": "2023-08-03", 159 | "credit": "minecraftian47", 160 | "tags": [ 161 | "Anime", 162 | "Japanese" 163 | ] 164 | }, 165 | { 166 | "name": "Kurt Cobain", 167 | "url": "https://huggingface.co/Florstie/Kurt_Cobain_byFlorst/resolve/main/Kurt_Florst.zip", 168 | "description": "singer Kurt Cobain", 169 | "added": "2023-08-03", 170 | "credit": "florst", 171 | "tags": [ 172 | "Real person", 173 | "English" 174 | ] 175 | }, 176 | { 177 | "name": "Ironmouse", 178 | "url": "https://huggingface.co/Tempo-Hawk/IronmouseV2/resolve/main/IronmouseV2.zip", 179 | "description": "Ironmouse", 180 | "added": "2023-08-03", 181 | "credit": "ladyimpa", 182 | "tags": [ 183 | "Vtuber", 184 | "English" 185 | ] 186 | }, 187 | { 188 | "name": "Bratishkinoff", 189 | "url": "https://huggingface.co/JHmashups/Bratishkinoff/resolve/main/bratishkin.zip", 190 | "description": "Bratishkinoff 
(Bratishkin | Братишкин) - russian steamer ", 191 | "added": "2023-08-03", 192 | "credit": ".caddii", 193 | "tags": [ 194 | "Real person", 195 | "Other Language" 196 | ] 197 | }, 198 | { 199 | "name": "Yagami Light", 200 | "url": "https://huggingface.co/geekdom-tr/Yagami-Light/resolve/main/Yagami-Light.zip", 201 | "description": "Yagami Light (Miyano Mamoru) from death note", 202 | "added": "2023-08-03", 203 | "credit": "takka / takka#7700", 204 | "tags": [ 205 | "Anime", 206 | "Japanese" 207 | ] 208 | }, 209 | { 210 | "name": "Itashi", 211 | "url": "https://huggingface.co/4uGGun/4uGGunRVC/resolve/main/itashi.zip", 212 | "description": "Itashi (Russian fandubber AniLibria) ", 213 | "added": "2023-08-03", 214 | "credit": "BelochkaOff", 215 | "tags": [ 216 | "Anime", 217 | "Other Language", 218 | "Real person" 219 | ] 220 | }, 221 | { 222 | "name": "Michiru Kagemori", 223 | "url": "https://huggingface.co/WolfMK/MichiruKagemori/resolve/main/MichiruKagemori_RVC_V2.zip", 224 | "description": "Michiru Kagemori from Brand New Animal (300 Epochs)", 225 | "added": "2023-08-03", 226 | "credit": "wolfmk", 227 | "tags": [ 228 | "Anime", 229 | "English" 230 | ] 231 | } 232 | , 233 | { 234 | "name": "Kaeya", 235 | "url": "https://huggingface.co/nlordqting4444/nlordqtingRVC/resolve/main/Kaeya.zip", 236 | "description": "Kaeya (VA: Kohsuke Toriumi) from Genshin Impact (300 Epochs)", 237 | "added": "2023-08-03", 238 | "credit": "nlordqting4444", 239 | "tags": [ 240 | "Game character", 241 | "Japanese" 242 | ] 243 | }, 244 | { 245 | "name": "Mona Megistus", 246 | "url": "https://huggingface.co/AnimeSessions/rvc_voice_models/resolve/main/MonaRVC.zip", 247 | "description": "Mona Megistus (VA: Felecia Angelle) from Genshin Impact (250 Epochs)", 248 | "added": "2023-08-03", 249 | "credit": "shyelijah", 250 | "tags": [ 251 | "Game character", 252 | "English" 253 | ] 254 | }, 255 | { 256 | "name": "Klee", 257 | "url": "https://huggingface.co/hardbop/AI_MODEL_THINGY/resolve/main/kleeeng_rvc.zip", 258 | "description": "Klee from Genshin Impact (400 Epochs)", 259 | "added": "2023-08-03", 260 | "credit": "hardbop", 261 | "tags": [ 262 | "Game character", 263 | "English" 264 | ] 265 | }, 266 | { 267 | "name": "Sakurakoji Kinako", 268 | "url": "https://huggingface.co/Gorodogi/RVC2MangioCrepe/resolve/main/kinakobetatwo700.zip", 269 | "description": "Sakurakoji Kinako (Suzuhara Nozomi) from Love Live! Superstar!! 
(700 Epoch)", 270 | "added": "2023-08-03", 271 | "credit": "ck1089", 272 | "tags": [ 273 | "Anime", 274 | "Japanese" 275 | ] 276 | }, 277 | { 278 | "name": "Minamo Kurosawa", 279 | "url": "https://huggingface.co/timothy10583/RVC/resolve/main/minamo-kurosawa.zip", 280 | "description": "Minamo (Nyamo) Kurosawa (Azumanga Daioh US DUB) (300 Epochs)", 281 | "added": "2023-08-03", 282 | "credit": "timothy10583", 283 | "tags": [ 284 | "Anime" 285 | ] 286 | }, 287 | { 288 | "name": "Neco Arc", 289 | "url": "https://huggingface.co/Ozzy-Helix/Neko_Arc_Neko_Aruku.RVCv2/resolve/main/Neko_Arc-V3-E600.zip", 290 | "description": "Neco Arc (Neco-Aruku) (Epochs 600)", 291 | "added": "2023-08-03", 292 | "credit": "ozzy_helix_", 293 | "tags": [ 294 | "Anime" 295 | ] 296 | }, 297 | { 298 | "name": "Makima", 299 | "url": "https://huggingface.co/andolei/makimaen/resolve/main/makima-en-dub.zip", 300 | "description": "Makima from Chainsaw Man (300 Epochs)", 301 | "added": "2023-08-03", 302 | "credit": "andpproximately", 303 | "tags": [ 304 | "Anime", 305 | "English" 306 | ] 307 | }, 308 | { 309 | "name": "PomPom", 310 | "url": "https://huggingface.co/benitheworld/pom-pom/resolve/main/pom-pom.zip", 311 | "description": "PomPom from Honkai Star Rail (HSR) (200 Epochs)", 312 | "added": "2023-08-03", 313 | "credit": "kannysoap", 314 | "tags": [ 315 | "Game character", 316 | "English" 317 | ] 318 | }, 319 | { 320 | "name": "Asuka Langley Soryu", 321 | "url": "https://huggingface.co/Piegirl/asukaadv/resolve/main/asuka.zip", 322 | "description": "Asuka Langley Soryu/Tiffany Grant from Neon Genesis Evangelion (400 Epochs)", 323 | "added": "2023-08-03", 324 | "credit": "piegirl", 325 | "tags": [ 326 | "Anime", 327 | "English" 328 | ] 329 | }, 330 | { 331 | "name": "Ochaco Uraraka", 332 | "url": "https://huggingface.co/legitdark/JP-Uraraka-By-Dan/resolve/main/JP-Uraraka-By-Dan.zip", 333 | "description": "Ochaco Uraraka from Boku no Hero Academia (320 Epochs)", 334 | "added": "2023-08-03", 335 | "credit": "danthevegetable", 336 | "tags": [ 337 | "Anime", 338 | "Japanese" 339 | ] 340 | }, 341 | { 342 | "name": "Sunaokami Shiroko", 343 | "url": "https://huggingface.co/LordDavis778/BlueArchivevoicemodels/resolve/main/SunaokamiShiroko.zip", 344 | "description": "Sunaokami Shiroko from Blue Archive (500 Epochs)", 345 | "added": "2023-08-03", 346 | "credit": "lorddavis778", 347 | "tags": [ 348 | "Anime" 349 | ] 350 | }, 351 | { 352 | "name": "Dainsleif", 353 | "url": "https://huggingface.co/Nasleyy/NasleyRVC/resolve/main/Voices/Dainsleif/Dainsleif.zip", 354 | "description": "Dainsleif from Genshin Impact (335 Epochs)", 355 | "added": "2023-08-03", 356 | "credit": "nasley", 357 | "tags": [ 358 | "Game character", 359 | "English" 360 | ] 361 | }, 362 | { 363 | "name": "Mae Asmr", 364 | "url": "https://huggingface.co/ctian/VRC/resolve/main/MaeASMR.zip", 365 | "description": "Mae Asmr - harvest mommy voice (YOUTUBE) (300 Epochs)", 366 | "added": "2023-08-03", 367 | "credit": "ctian_04", 368 | "tags": [ 369 | "English", 370 | "Real person", 371 | "Vtuber" 372 | ] 373 | }, 374 | { 375 | "name": "Hana Shirosaki ", 376 | "url": "https://huggingface.co/Pawlik17/HanaWataten/resolve/main/HanaWATATEN.zip", 377 | "description": "Hana Shirosaki / 白 咲 花 From Watashi ni Tenshi ga Maiorita! 
(570 Epochs)", 378 | "added": "2023-08-03", 379 | "credit": "tamalik", 380 | "tags": [ 381 | "Anime", 382 | "Japanese" 383 | ] 384 | }, 385 | { 386 | "name": "Kaguya Shinomiya ", 387 | "url": "https://huggingface.co/1ski/1skiRVCModels/resolve/main/kaguyav5.zip", 388 | "description": "Kaguya Shinomiya from Kaguya-Sama Love is war (200 Epochs)", 389 | "added": "2023-08-03", 390 | "credit": "1ski", 391 | "tags": [ 392 | "Anime", 393 | "Japanese" 394 | ] 395 | }, 396 | { 397 | "name": "Nai Shiro", 398 | "url": "https://huggingface.co/kuushiro/Shiro-RVC-No-Game-No-Life/resolve/main/shiro-jp-360-epochs.zip", 399 | "description": "Nai Shiro (Ai Kayano) from No Game No Life (360 Epochs)", 400 | "added": "2023-08-03", 401 | "credit": "kxouyou", 402 | "tags": [ 403 | "Anime", 404 | "Japanese" 405 | ] 406 | }, 407 | { 408 | "name": "Yuigahama Yui", 409 | "url": "https://huggingface.co/Zerokano/Yuigahama_Yui-RVCv2/resolve/main/Yuigahama_Yui.zip", 410 | "description": "Yuigahama Yui from Yahari Ore no Seishun Love Comedy wa Machigatteiru (250 Epochs)", 411 | "added": "2023-08-03", 412 | "credit": "zerokano", 413 | "tags": [ 414 | "Anime", 415 | "Japanese" 416 | ] 417 | }, 418 | { 419 | "name": "Fuwawa Abyssgard", 420 | "url": "https://huggingface.co/megaaziib/my-rvc-models-collection/resolve/main/fuwawa.zip", 421 | "description": "Fuwawa Abyssgard (FUWAMOCO) from Hololive gen 3 (250 Epochs)", 422 | "added": "2023-08-03", 423 | "credit": "megaaziib", 424 | "tags": [ 425 | "Vtuber", 426 | "English" 427 | ] 428 | }, 429 | { 430 | "name": "Kana Arima", 431 | "url": "https://huggingface.co/ddoumakunn/arimakanna/resolve/main/arimakanna.zip", 432 | "description": "Kana Arima from Oshi no Ko (250 Epochs)", 433 | "added": "2023-08-03", 434 | "credit": "ddoumakunn", 435 | "tags": [ 436 | "Anime", 437 | "Japanese" 438 | ] 439 | }, 440 | { 441 | "name": "Raiden Shogun", 442 | "url": "https://huggingface.co/Nasleyy/NasleyRVC/resolve/main/Voices/RaidenShogun/RaidenShogun.zip", 443 | "description": "Raiden Shogun from Genshin Impact (310 Epochs)", 444 | "added": "2023-08-03", 445 | "credit": "nasley", 446 | "tags": [ 447 | "Game character", 448 | "English" 449 | ] 450 | }, 451 | { 452 | "name": "Alhaitham", 453 | "url": "https://huggingface.co/Nasleyy/NasleyRVC/resolve/main/Voices/Alhaitham/Alhaitham.zip", 454 | "description": "Alhaitham from Genshin Impact (320 Epochs)", 455 | "added": "2023-08-03", 456 | "credit": "nasley", 457 | "tags": [ 458 | "Game character", 459 | "English" 460 | ] 461 | }, 462 | { 463 | "name": "Izuku Midoriya", 464 | "url": "https://huggingface.co/BigGuy635/MHA/resolve/main/DekuJP.zip", 465 | "description": "Izuku Midoriya from Boku no Hero Academia (100 Epochs)", 466 | "added": "2023-08-03", 467 | "credit": "khjjnoffical", 468 | "tags": [ 469 | "Anime", 470 | "Japanese" 471 | ] 472 | }, 473 | { 474 | "name": "Kurumi Shiratori", 475 | "url": "https://huggingface.co/HarunaKasuga/YoshikoTsushima/resolve/main/KurumiShiratori.zip", 476 | "description": "Kurumi Shiratori (VA: Ruka Fukagawa) from D4DJ (500 Epochs)", 477 | "added": "2023-08-03", 478 | "credit": "seakrait", 479 | "tags": [ 480 | "Anime", 481 | "Japanese" 482 | ] 483 | }, 484 | { 485 | "name": "Veibae", 486 | "url": "https://huggingface.co/datasets/Papaquans/Veibae/resolve/main/veibae_e165_s125565.zip", 487 | "description": "Veibae (165 Epochs)", 488 | "added": "2023-08-03", 489 | "credit": "recairo", 490 | "tags": [ 491 | "Vtuber", 492 | "English" 493 | ] 494 | }, 495 | { 496 | "name": "Black Panther", 497 | "url": 
"https://huggingface.co/TJKAI/BlackPannther/resolve/main/BlackPanther.zip", 498 | "description": "Black Panther (Chadwick Boseman) (300 Epochs)", 499 | "added": "2023-08-03", 500 | "credit": "tjkcreative", 501 | "tags": [ 502 | "Real person", 503 | "English" 504 | ] 505 | }, 506 | { 507 | "name": "Gawr Gura", 508 | "url": "https://pixeldrain.com/u/3tJmABXA", 509 | "description": "Gawr Gura from Hololive EN", 510 | "added": "2023-08-05", 511 | "credit": "dacoolkid44 & hijack", 512 | "tags": [ 513 | "Vtuber" 514 | ] 515 | }, 516 | { 517 | "name": "Houshou Marine", 518 | "url": "https://pixeldrain.com/u/L1YLfZyU", 519 | "description": "Houshou Marine from Hololive JP", 520 | "added": "2023-08-05", 521 | "credit": "dacoolkid44 & hijack", 522 | "tags": [ 523 | "Vtuber", 524 | "Japanese" 525 | ] 526 | }, 527 | { 528 | "name": "Hoshimachi Suisei", 529 | "url": "https://pixeldrain.com/u/YP89C21u", 530 | "description": "Hoshimachi Suisei from Hololive JP", 531 | "added": "2023-08-05", 532 | "credit": "dacoolkid44 & hijack & Maki Ligon", 533 | "tags": [ 534 | "Vtuber", 535 | "Japanese" 536 | ] 537 | }, 538 | { 539 | "name": "Laplus Darkness", 540 | "url": "https://pixeldrain.com/u/zmuxv5Bf", 541 | "description": "Laplus Darkness from Hololive JP", 542 | "added": "2023-08-05", 543 | "credit": "dacoolkid44 & hijack", 544 | "tags": [ 545 | "Vtuber", 546 | "Japanese" 547 | ] 548 | }, 549 | { 550 | "name": "AZKi", 551 | "url": "https://huggingface.co/Kit-Lemonfoot/kitlemonfoot_rvc_models/resolve/main/AZKi%20(Hybrid).zip", 552 | "description": "AZKi from Hololive JP", 553 | "added": "2023-08-05", 554 | "credit": "Kit Lemonfoot / NSHFB", 555 | "tags": [ 556 | "Vtuber", 557 | "Japanese" 558 | ] 559 | }, 560 | { 561 | "name": "Ado", 562 | "url": "https://huggingface.co/pjesek/AdoRVCv2/resolve/main/AdoRVCv2.zip", 563 | "description": "Talented JP artist (500 epochs using every song from her first album)", 564 | "added": "2023-08-05", 565 | "credit": "pjesek", 566 | "tags": [ 567 | "Real person", 568 | "Japanese" 569 | ] 570 | }, 571 | { 572 | "name": "LiSA", 573 | "url": "https://huggingface.co/phant0m4r/LiSA/resolve/main/LiSA.zip", 574 | "description": "Talented JP artist (400 epochs)", 575 | "added": "2023-08-05", 576 | "credit": "Phant0m", 577 | "tags": [ 578 | "Real person", 579 | "Japanese" 580 | ] 581 | }, 582 | { 583 | "name": "Kokomi", 584 | "url": "https://huggingface.co/benitheworld/kokomi-kr/resolve/main/kokomi-kr.zip", 585 | "description": "Kokomi from Genshin Impact KR (300 Epochs)", 586 | "added": "2023-08-09", 587 | "credit": "kannysoap", 588 | "tags": [ 589 | "Game character", 590 | "Other Language" 591 | ] 592 | }, 593 | { 594 | "name": "Ivanzolo", 595 | "url": "https://huggingface.co/fenikkusugosuto/IvanZolo2004/resolve/main/ivanZolo.zip", 596 | "description": "Ivanzolo2004 russian streamer | Иван Золо 2004", 597 | "added": "2023-08-09", 598 | "credit": "prezervativ_naruto2009", 599 | "tags": [ 600 | "Other Language", 601 | "Real person" 602 | ] 603 | }, 604 | { 605 | "name": "Nilou", 606 | "url": "https://huggingface.co/benitheworld/nilou-kr/resolve/main/nilou-kr.zip", 607 | "description": "Nilou from Genshin Impact KR (300 Epochs)", 608 | "added": "2023-08-09", 609 | "credit": "kannysoap", 610 | "tags": [ 611 | "Game character", 612 | "Other Language" 613 | ] 614 | }, 615 | { 616 | "name": "Dr. Doofenshmirtz", 617 | "url": "https://huggingface.co/Argax/doofenshmirtz-RUS/resolve/main/doofenshmirtz.zip", 618 | "description": "RUS Dr. 
Doofenshmirtz from Phineas and Ferb (300 epochs)", 619 | "added": "2023-08-09", 620 | "credit": "argaxus", 621 | "tags": [ 622 | "Other Language" 623 | ] 624 | } 625 | ] 626 | } 627 | -------------------------------------------------------------------------------- /src/configs/32k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": false, 11 | "lr_decay": 0.999875, 12 | "segment_size": 12800, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 32000, 21 | "filter_length": 1024, 22 | "hop_length": 320, 23 | "win_length": 1024, 24 | "n_mel_channels": 80, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,4,2,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16,16,4,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/configs/32k_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": true, 11 | "lr_decay": 0.999875, 12 | "segment_size": 12800, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 32000, 21 | "filter_length": 1024, 22 | "hop_length": 320, 23 | "win_length": 1024, 24 | "n_mel_channels": 80, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,8,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [20,16,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/configs/40k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": false, 11 | "lr_decay": 0.999875, 12 | "segment_size": 12800, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 40000, 21 | "filter_length": 2048, 22 | "hop_length": 400, 23 | "win_length": 2048, 24 | "n_mel_channels": 125, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 
29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,10,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16,16,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/configs/48k.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": false, 11 | "lr_decay": 0.999875, 12 | "segment_size": 11520, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 48000, 21 | "filter_length": 2048, 22 | "hop_length": 480, 23 | "win_length": 2048, 24 | "n_mel_channels": 128, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [10,6,2,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [16,16,4,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/configs/48k_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "seed": 1234, 5 | "epochs": 20000, 6 | "learning_rate": 1e-4, 7 | "betas": [0.8, 0.99], 8 | "eps": 1e-9, 9 | "batch_size": 4, 10 | "fp16_run": true, 11 | "lr_decay": 0.999875, 12 | "segment_size": 17280, 13 | "init_lr_ratio": 1, 14 | "warmup_epochs": 0, 15 | "c_mel": 45, 16 | "c_kl": 1.0 17 | }, 18 | "data": { 19 | "max_wav_value": 32768.0, 20 | "sampling_rate": 48000, 21 | "filter_length": 2048, 22 | "hop_length": 480, 23 | "win_length": 2048, 24 | "n_mel_channels": 128, 25 | "mel_fmin": 0.0, 26 | "mel_fmax": null 27 | }, 28 | "model": { 29 | "inter_channels": 192, 30 | "hidden_channels": 192, 31 | "filter_channels": 768, 32 | "n_heads": 2, 33 | "n_layers": 6, 34 | "kernel_size": 3, 35 | "p_dropout": 0, 36 | "resblock": "1", 37 | "resblock_kernel_sizes": [3,7,11], 38 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 39 | "upsample_rates": [12,10,2,2], 40 | "upsample_initial_channel": 512, 41 | "upsample_kernel_sizes": [24,20,4,4], 42 | "use_spectral_norm": false, 43 | "gin_channels": 256, 44 | "spk_embed_dim": 109 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/download_models.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import requests 3 | 4 | RVC_DOWNLOAD_LINK = 'https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/' 5 | 6 | BASE_DIR = Path(__file__).resolve().parent.parent 7 | rvc_models_dir = BASE_DIR / 'rvc_models' 8 | 9 | 10 | 
def dl_model(link, model_name, dir_name): 11 | with requests.get(f'{link}{model_name}') as r: 12 | r.raise_for_status() 13 | with open(dir_name / model_name, 'wb') as f: 14 | for chunk in r.iter_content(chunk_size=8192): 15 | f.write(chunk) 16 | 17 | 18 | if __name__ == '__main__': 19 | rvc_model_names = ['hubert_base.pt', 'rmvpe.pt'] 20 | for model in rvc_model_names: 21 | print(f'Downloading {model}...') 22 | dl_model(RVC_DOWNLOAD_LINK, model, rvc_models_dir) 23 | 24 | print('All models downloaded!') 25 | -------------------------------------------------------------------------------- /src/infer_pack/attentions.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | from infer_pack import commons 9 | from infer_pack import modules 10 | from infer_pack.modules import LayerNorm 11 | 12 | 13 | class Encoder(nn.Module): 14 | def __init__( 15 | self, 16 | hidden_channels, 17 | filter_channels, 18 | n_heads, 19 | n_layers, 20 | kernel_size=1, 21 | p_dropout=0.0, 22 | window_size=10, 23 | **kwargs 24 | ): 25 | super().__init__() 26 | self.hidden_channels = hidden_channels 27 | self.filter_channels = filter_channels 28 | self.n_heads = n_heads 29 | self.n_layers = n_layers 30 | self.kernel_size = kernel_size 31 | self.p_dropout = p_dropout 32 | self.window_size = window_size 33 | 34 | self.drop = nn.Dropout(p_dropout) 35 | self.attn_layers = nn.ModuleList() 36 | self.norm_layers_1 = nn.ModuleList() 37 | self.ffn_layers = nn.ModuleList() 38 | self.norm_layers_2 = nn.ModuleList() 39 | for i in range(self.n_layers): 40 | self.attn_layers.append( 41 | MultiHeadAttention( 42 | hidden_channels, 43 | hidden_channels, 44 | n_heads, 45 | p_dropout=p_dropout, 46 | window_size=window_size, 47 | ) 48 | ) 49 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 50 | self.ffn_layers.append( 51 | FFN( 52 | hidden_channels, 53 | hidden_channels, 54 | filter_channels, 55 | kernel_size, 56 | p_dropout=p_dropout, 57 | ) 58 | ) 59 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 60 | 61 | def forward(self, x, x_mask): 62 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 63 | x = x * x_mask 64 | for i in range(self.n_layers): 65 | y = self.attn_layers[i](x, x, attn_mask) 66 | y = self.drop(y) 67 | x = self.norm_layers_1[i](x + y) 68 | 69 | y = self.ffn_layers[i](x, x_mask) 70 | y = self.drop(y) 71 | x = self.norm_layers_2[i](x + y) 72 | x = x * x_mask 73 | return x 74 | 75 | 76 | class Decoder(nn.Module): 77 | def __init__( 78 | self, 79 | hidden_channels, 80 | filter_channels, 81 | n_heads, 82 | n_layers, 83 | kernel_size=1, 84 | p_dropout=0.0, 85 | proximal_bias=False, 86 | proximal_init=True, 87 | **kwargs 88 | ): 89 | super().__init__() 90 | self.hidden_channels = hidden_channels 91 | self.filter_channels = filter_channels 92 | self.n_heads = n_heads 93 | self.n_layers = n_layers 94 | self.kernel_size = kernel_size 95 | self.p_dropout = p_dropout 96 | self.proximal_bias = proximal_bias 97 | self.proximal_init = proximal_init 98 | 99 | self.drop = nn.Dropout(p_dropout) 100 | self.self_attn_layers = nn.ModuleList() 101 | self.norm_layers_0 = nn.ModuleList() 102 | self.encdec_attn_layers = nn.ModuleList() 103 | self.norm_layers_1 = nn.ModuleList() 104 | self.ffn_layers = nn.ModuleList() 105 | self.norm_layers_2 = nn.ModuleList() 106 | for i in range(self.n_layers): 107 | self.self_attn_layers.append( 108 | MultiHeadAttention( 
109 | hidden_channels, 110 | hidden_channels, 111 | n_heads, 112 | p_dropout=p_dropout, 113 | proximal_bias=proximal_bias, 114 | proximal_init=proximal_init, 115 | ) 116 | ) 117 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 118 | self.encdec_attn_layers.append( 119 | MultiHeadAttention( 120 | hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout 121 | ) 122 | ) 123 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 124 | self.ffn_layers.append( 125 | FFN( 126 | hidden_channels, 127 | hidden_channels, 128 | filter_channels, 129 | kernel_size, 130 | p_dropout=p_dropout, 131 | causal=True, 132 | ) 133 | ) 134 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 135 | 136 | def forward(self, x, x_mask, h, h_mask): 137 | """ 138 | x: decoder input 139 | h: encoder output 140 | """ 141 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( 142 | device=x.device, dtype=x.dtype 143 | ) 144 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 145 | x = x * x_mask 146 | for i in range(self.n_layers): 147 | y = self.self_attn_layers[i](x, x, self_attn_mask) 148 | y = self.drop(y) 149 | x = self.norm_layers_0[i](x + y) 150 | 151 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) 152 | y = self.drop(y) 153 | x = self.norm_layers_1[i](x + y) 154 | 155 | y = self.ffn_layers[i](x, x_mask) 156 | y = self.drop(y) 157 | x = self.norm_layers_2[i](x + y) 158 | x = x * x_mask 159 | return x 160 | 161 | 162 | class MultiHeadAttention(nn.Module): 163 | def __init__( 164 | self, 165 | channels, 166 | out_channels, 167 | n_heads, 168 | p_dropout=0.0, 169 | window_size=None, 170 | heads_share=True, 171 | block_length=None, 172 | proximal_bias=False, 173 | proximal_init=False, 174 | ): 175 | super().__init__() 176 | assert channels % n_heads == 0 177 | 178 | self.channels = channels 179 | self.out_channels = out_channels 180 | self.n_heads = n_heads 181 | self.p_dropout = p_dropout 182 | self.window_size = window_size 183 | self.heads_share = heads_share 184 | self.block_length = block_length 185 | self.proximal_bias = proximal_bias 186 | self.proximal_init = proximal_init 187 | self.attn = None 188 | 189 | self.k_channels = channels // n_heads 190 | self.conv_q = nn.Conv1d(channels, channels, 1) 191 | self.conv_k = nn.Conv1d(channels, channels, 1) 192 | self.conv_v = nn.Conv1d(channels, channels, 1) 193 | self.conv_o = nn.Conv1d(channels, out_channels, 1) 194 | self.drop = nn.Dropout(p_dropout) 195 | 196 | if window_size is not None: 197 | n_heads_rel = 1 if heads_share else n_heads 198 | rel_stddev = self.k_channels**-0.5 199 | self.emb_rel_k = nn.Parameter( 200 | torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) 201 | * rel_stddev 202 | ) 203 | self.emb_rel_v = nn.Parameter( 204 | torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) 205 | * rel_stddev 206 | ) 207 | 208 | nn.init.xavier_uniform_(self.conv_q.weight) 209 | nn.init.xavier_uniform_(self.conv_k.weight) 210 | nn.init.xavier_uniform_(self.conv_v.weight) 211 | if proximal_init: 212 | with torch.no_grad(): 213 | self.conv_k.weight.copy_(self.conv_q.weight) 214 | self.conv_k.bias.copy_(self.conv_q.bias) 215 | 216 | def forward(self, x, c, attn_mask=None): 217 | q = self.conv_q(x) 218 | k = self.conv_k(c) 219 | v = self.conv_v(c) 220 | 221 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 222 | 223 | x = self.conv_o(x) 224 | return x 225 | 226 | def attention(self, query, key, value, mask=None): 227 | # reshape [b, d, t] -> [b, n_h, t, d_k] 228 | b, d, t_s, t_t = (*key.size(), 
query.size(2)) 229 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 230 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 231 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 232 | 233 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) 234 | if self.window_size is not None: 235 | assert ( 236 | t_s == t_t 237 | ), "Relative attention is only available for self-attention." 238 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 239 | rel_logits = self._matmul_with_relative_keys( 240 | query / math.sqrt(self.k_channels), key_relative_embeddings 241 | ) 242 | scores_local = self._relative_position_to_absolute_position(rel_logits) 243 | scores = scores + scores_local 244 | if self.proximal_bias: 245 | assert t_s == t_t, "Proximal bias is only available for self-attention." 246 | scores = scores + self._attention_bias_proximal(t_s).to( 247 | device=scores.device, dtype=scores.dtype 248 | ) 249 | if mask is not None: 250 | scores = scores.masked_fill(mask == 0, -1e4) 251 | if self.block_length is not None: 252 | assert ( 253 | t_s == t_t 254 | ), "Local attention is only available for self-attention." 255 | block_mask = ( 256 | torch.ones_like(scores) 257 | .triu(-self.block_length) 258 | .tril(self.block_length) 259 | ) 260 | scores = scores.masked_fill(block_mask == 0, -1e4) 261 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 262 | p_attn = self.drop(p_attn) 263 | output = torch.matmul(p_attn, value) 264 | if self.window_size is not None: 265 | relative_weights = self._absolute_position_to_relative_position(p_attn) 266 | value_relative_embeddings = self._get_relative_embeddings( 267 | self.emb_rel_v, t_s 268 | ) 269 | output = output + self._matmul_with_relative_values( 270 | relative_weights, value_relative_embeddings 271 | ) 272 | output = ( 273 | output.transpose(2, 3).contiguous().view(b, d, t_t) 274 | ) # [b, n_h, t_t, d_k] -> [b, d, t_t] 275 | return output, p_attn 276 | 277 | def _matmul_with_relative_values(self, x, y): 278 | """ 279 | x: [b, h, l, m] 280 | y: [h or 1, m, d] 281 | ret: [b, h, l, d] 282 | """ 283 | ret = torch.matmul(x, y.unsqueeze(0)) 284 | return ret 285 | 286 | def _matmul_with_relative_keys(self, x, y): 287 | """ 288 | x: [b, h, l, d] 289 | y: [h or 1, m, d] 290 | ret: [b, h, l, m] 291 | """ 292 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 293 | return ret 294 | 295 | def _get_relative_embeddings(self, relative_embeddings, length): 296 | max_relative_position = 2 * self.window_size + 1 297 | # Pad first before slice to avoid using cond ops. 298 | pad_length = max(length - (self.window_size + 1), 0) 299 | slice_start_position = max((self.window_size + 1) - length, 0) 300 | slice_end_position = slice_start_position + 2 * length - 1 301 | if pad_length > 0: 302 | padded_relative_embeddings = F.pad( 303 | relative_embeddings, 304 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), 305 | ) 306 | else: 307 | padded_relative_embeddings = relative_embeddings 308 | used_relative_embeddings = padded_relative_embeddings[ 309 | :, slice_start_position:slice_end_position 310 | ] 311 | return used_relative_embeddings 312 | 313 | def _relative_position_to_absolute_position(self, x): 314 | """ 315 | x: [b, h, l, 2*l-1] 316 | ret: [b, h, l, l] 317 | """ 318 | batch, heads, length, _ = x.size() 319 | # Concat columns of pad to shift from relative to absolute indexing. 
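# The pad/flatten/reshape sequence below is the usual "skewing" trick for relative attention: append one zero column (width becomes 2*l), flatten to length 2*l*l, pad with l-1 zeros, then view as [b, h, l+1, 2*l-1]; each row is now shifted by one, so slicing [:, :, :l, l-1:] yields the [b, h, l, l] absolute-position logits.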
320 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) 321 | 322 | # Concat extra elements so to add up to shape (len+1, 2*len-1). 323 | x_flat = x.view([batch, heads, length * 2 * length]) 324 | x_flat = F.pad( 325 | x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) 326 | ) 327 | 328 | # Reshape and slice out the padded elements. 329 | x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ 330 | :, :, :length, length - 1 : 331 | ] 332 | return x_final 333 | 334 | def _absolute_position_to_relative_position(self, x): 335 | """ 336 | x: [b, h, l, l] 337 | ret: [b, h, l, 2*l-1] 338 | """ 339 | batch, heads, length, _ = x.size() 340 | # padd along column 341 | x = F.pad( 342 | x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) 343 | ) 344 | x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) 345 | # add 0's in the beginning that will skew the elements after reshape 346 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 347 | x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] 348 | return x_final 349 | 350 | def _attention_bias_proximal(self, length): 351 | """Bias for self-attention to encourage attention to close positions. 352 | Args: 353 | length: an integer scalar. 354 | Returns: 355 | a Tensor with shape [1, 1, length, length] 356 | """ 357 | r = torch.arange(length, dtype=torch.float32) 358 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 359 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 360 | 361 | 362 | class FFN(nn.Module): 363 | def __init__( 364 | self, 365 | in_channels, 366 | out_channels, 367 | filter_channels, 368 | kernel_size, 369 | p_dropout=0.0, 370 | activation=None, 371 | causal=False, 372 | ): 373 | super().__init__() 374 | self.in_channels = in_channels 375 | self.out_channels = out_channels 376 | self.filter_channels = filter_channels 377 | self.kernel_size = kernel_size 378 | self.p_dropout = p_dropout 379 | self.activation = activation 380 | self.causal = causal 381 | 382 | if causal: 383 | self.padding = self._causal_padding 384 | else: 385 | self.padding = self._same_padding 386 | 387 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) 388 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) 389 | self.drop = nn.Dropout(p_dropout) 390 | 391 | def forward(self, x, x_mask): 392 | x = self.conv_1(self.padding(x * x_mask)) 393 | if self.activation == "gelu": 394 | x = x * torch.sigmoid(1.702 * x) 395 | else: 396 | x = torch.relu(x) 397 | x = self.drop(x) 398 | x = self.conv_2(self.padding(x * x_mask)) 399 | return x * x_mask 400 | 401 | def _causal_padding(self, x): 402 | if self.kernel_size == 1: 403 | return x 404 | pad_l = self.kernel_size - 1 405 | pad_r = 0 406 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 407 | x = F.pad(x, commons.convert_pad_shape(padding)) 408 | return x 409 | 410 | def _same_padding(self, x): 411 | if self.kernel_size == 1: 412 | return x 413 | pad_l = (self.kernel_size - 1) // 2 414 | pad_r = self.kernel_size // 2 415 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 416 | x = F.pad(x, commons.convert_pad_shape(padding)) 417 | return x 418 | -------------------------------------------------------------------------------- /src/infer_pack/commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | from torch.nn import 
functional as F 6 | 7 | 8 | def init_weights(m, mean=0.0, std=0.01): 9 | classname = m.__class__.__name__ 10 | if classname.find("Conv") != -1: 11 | m.weight.data.normal_(mean, std) 12 | 13 | 14 | def get_padding(kernel_size, dilation=1): 15 | return int((kernel_size * dilation - dilation) / 2) 16 | 17 | 18 | def convert_pad_shape(pad_shape): 19 | l = pad_shape[::-1] 20 | pad_shape = [item for sublist in l for item in sublist] 21 | return pad_shape 22 | 23 | 24 | def kl_divergence(m_p, logs_p, m_q, logs_q): 25 | """KL(P||Q)""" 26 | kl = (logs_q - logs_p) - 0.5 27 | kl += ( 28 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) 29 | ) 30 | return kl 31 | 32 | 33 | def rand_gumbel(shape): 34 | """Sample from the Gumbel distribution, protect from overflows.""" 35 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 36 | return -torch.log(-torch.log(uniform_samples)) 37 | 38 | 39 | def rand_gumbel_like(x): 40 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 41 | return g 42 | 43 | 44 | def slice_segments(x, ids_str, segment_size=4): 45 | ret = torch.zeros_like(x[:, :, :segment_size]) 46 | for i in range(x.size(0)): 47 | idx_str = ids_str[i] 48 | idx_end = idx_str + segment_size 49 | ret[i] = x[i, :, idx_str:idx_end] 50 | return ret 51 | 52 | 53 | def slice_segments2(x, ids_str, segment_size=4): 54 | ret = torch.zeros_like(x[:, :segment_size]) 55 | for i in range(x.size(0)): 56 | idx_str = ids_str[i] 57 | idx_end = idx_str + segment_size 58 | ret[i] = x[i, idx_str:idx_end] 59 | return ret 60 | 61 | 62 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 63 | b, d, t = x.size() 64 | if x_lengths is None: 65 | x_lengths = t 66 | ids_str_max = x_lengths - segment_size + 1 67 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 68 | ret = slice_segments(x, ids_str, segment_size) 69 | return ret, ids_str 70 | 71 | 72 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): 73 | position = torch.arange(length, dtype=torch.float) 74 | num_timescales = channels // 2 75 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( 76 | num_timescales - 1 77 | ) 78 | inv_timescales = min_timescale * torch.exp( 79 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment 80 | ) 81 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 82 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) 83 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 84 | signal = signal.view(1, channels, length) 85 | return signal 86 | 87 | 88 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 89 | b, channels, length = x.size() 90 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 91 | return x + signal.to(dtype=x.dtype, device=x.device) 92 | 93 | 94 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 95 | b, channels, length = x.size() 96 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 97 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 98 | 99 | 100 | def subsequent_mask(length): 101 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 102 | return mask 103 | 104 | 105 | @torch.jit.script 106 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 107 | n_channels_int = n_channels[0] 108 | in_act = input_a + input_b 109 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 
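# WaveNet-style gated activation: the first n_channels form the tanh "filter" half, the remaining channels (next line) form the sigmoid "gate", and the layer output is their elementwise product.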
110 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 111 | acts = t_act * s_act 112 | return acts 113 | 114 | 115 | def convert_pad_shape(pad_shape): 116 | l = pad_shape[::-1] 117 | pad_shape = [item for sublist in l for item in sublist] 118 | return pad_shape 119 | 120 | 121 | def shift_1d(x): 122 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 123 | return x 124 | 125 | 126 | def sequence_mask(length, max_length=None): 127 | if max_length is None: 128 | max_length = length.max() 129 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 130 | return x.unsqueeze(0) < length.unsqueeze(1) 131 | 132 | 133 | def generate_path(duration, mask): 134 | """ 135 | duration: [b, 1, t_x] 136 | mask: [b, 1, t_y, t_x] 137 | """ 138 | device = duration.device 139 | 140 | b, _, t_y, t_x = mask.shape 141 | cum_duration = torch.cumsum(duration, -1) 142 | 143 | cum_duration_flat = cum_duration.view(b * t_x) 144 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 145 | path = path.view(b, t_x, t_y) 146 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 147 | path = path.unsqueeze(1).transpose(2, 3) * mask 148 | return path 149 | 150 | 151 | def clip_grad_value_(parameters, clip_value, norm_type=2): 152 | if isinstance(parameters, torch.Tensor): 153 | parameters = [parameters] 154 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 155 | norm_type = float(norm_type) 156 | if clip_value is not None: 157 | clip_value = float(clip_value) 158 | 159 | total_norm = 0 160 | for p in parameters: 161 | param_norm = p.grad.data.norm(norm_type) 162 | total_norm += param_norm.item() ** norm_type 163 | if clip_value is not None: 164 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 165 | total_norm = total_norm ** (1.0 / norm_type) 166 | return total_norm 167 | -------------------------------------------------------------------------------- /src/infer_pack/models_onnx.py: -------------------------------------------------------------------------------- 1 | import math, pdb, os 2 | from time import time as ttime 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | from infer_pack import modules 7 | from infer_pack import attentions 8 | from infer_pack import commons 9 | from infer_pack.commons import init_weights, get_padding 10 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 11 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 12 | from infer_pack.commons import init_weights 13 | import numpy as np 14 | from infer_pack import commons 15 | 16 | 17 | class TextEncoder256(nn.Module): 18 | def __init__( 19 | self, 20 | out_channels, 21 | hidden_channels, 22 | filter_channels, 23 | n_heads, 24 | n_layers, 25 | kernel_size, 26 | p_dropout, 27 | f0=True, 28 | ): 29 | super().__init__() 30 | self.out_channels = out_channels 31 | self.hidden_channels = hidden_channels 32 | self.filter_channels = filter_channels 33 | self.n_heads = n_heads 34 | self.n_layers = n_layers 35 | self.kernel_size = kernel_size 36 | self.p_dropout = p_dropout 37 | self.emb_phone = nn.Linear(256, hidden_channels) 38 | self.lrelu = nn.LeakyReLU(0.1, inplace=True) 39 | if f0 == True: 40 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 41 | self.encoder = attentions.Encoder( 42 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout 43 | ) 44 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 45 | 46 | def forward(self, phone, 
pitch, lengths): 47 | if pitch == None: 48 | x = self.emb_phone(phone) 49 | else: 50 | x = self.emb_phone(phone) + self.emb_pitch(pitch) 51 | x = x * math.sqrt(self.hidden_channels) # [b, t, h] 52 | x = self.lrelu(x) 53 | x = torch.transpose(x, 1, -1) # [b, h, t] 54 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( 55 | x.dtype 56 | ) 57 | x = self.encoder(x * x_mask, x_mask) 58 | stats = self.proj(x) * x_mask 59 | 60 | m, logs = torch.split(stats, self.out_channels, dim=1) 61 | return m, logs, x_mask 62 | 63 | 64 | class TextEncoder768(nn.Module): 65 | def __init__( 66 | self, 67 | out_channels, 68 | hidden_channels, 69 | filter_channels, 70 | n_heads, 71 | n_layers, 72 | kernel_size, 73 | p_dropout, 74 | f0=True, 75 | ): 76 | super().__init__() 77 | self.out_channels = out_channels 78 | self.hidden_channels = hidden_channels 79 | self.filter_channels = filter_channels 80 | self.n_heads = n_heads 81 | self.n_layers = n_layers 82 | self.kernel_size = kernel_size 83 | self.p_dropout = p_dropout 84 | self.emb_phone = nn.Linear(768, hidden_channels) 85 | self.lrelu = nn.LeakyReLU(0.1, inplace=True) 86 | if f0 == True: 87 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 88 | self.encoder = attentions.Encoder( 89 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout 90 | ) 91 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 92 | 93 | def forward(self, phone, pitch, lengths): 94 | if pitch == None: 95 | x = self.emb_phone(phone) 96 | else: 97 | x = self.emb_phone(phone) + self.emb_pitch(pitch) 98 | x = x * math.sqrt(self.hidden_channels) # [b, t, h] 99 | x = self.lrelu(x) 100 | x = torch.transpose(x, 1, -1) # [b, h, t] 101 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( 102 | x.dtype 103 | ) 104 | x = self.encoder(x * x_mask, x_mask) 105 | stats = self.proj(x) * x_mask 106 | 107 | m, logs = torch.split(stats, self.out_channels, dim=1) 108 | return m, logs, x_mask 109 | 110 | 111 | class ResidualCouplingBlock(nn.Module): 112 | def __init__( 113 | self, 114 | channels, 115 | hidden_channels, 116 | kernel_size, 117 | dilation_rate, 118 | n_layers, 119 | n_flows=4, 120 | gin_channels=0, 121 | ): 122 | super().__init__() 123 | self.channels = channels 124 | self.hidden_channels = hidden_channels 125 | self.kernel_size = kernel_size 126 | self.dilation_rate = dilation_rate 127 | self.n_layers = n_layers 128 | self.n_flows = n_flows 129 | self.gin_channels = gin_channels 130 | 131 | self.flows = nn.ModuleList() 132 | for i in range(n_flows): 133 | self.flows.append( 134 | modules.ResidualCouplingLayer( 135 | channels, 136 | hidden_channels, 137 | kernel_size, 138 | dilation_rate, 139 | n_layers, 140 | gin_channels=gin_channels, 141 | mean_only=True, 142 | ) 143 | ) 144 | self.flows.append(modules.Flip()) 145 | 146 | def forward(self, x, x_mask, g=None, reverse=False): 147 | if not reverse: 148 | for flow in self.flows: 149 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 150 | else: 151 | for flow in reversed(self.flows): 152 | x = flow(x, x_mask, g=g, reverse=reverse) 153 | return x 154 | 155 | def remove_weight_norm(self): 156 | for i in range(self.n_flows): 157 | self.flows[i * 2].remove_weight_norm() 158 | 159 | 160 | class PosteriorEncoder(nn.Module): 161 | def __init__( 162 | self, 163 | in_channels, 164 | out_channels, 165 | hidden_channels, 166 | kernel_size, 167 | dilation_rate, 168 | n_layers, 169 | gin_channels=0, 170 | ): 171 | super().__init__() 172 | self.in_channels = in_channels 
173 | self.out_channels = out_channels 174 | self.hidden_channels = hidden_channels 175 | self.kernel_size = kernel_size 176 | self.dilation_rate = dilation_rate 177 | self.n_layers = n_layers 178 | self.gin_channels = gin_channels 179 | 180 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 181 | self.enc = modules.WN( 182 | hidden_channels, 183 | kernel_size, 184 | dilation_rate, 185 | n_layers, 186 | gin_channels=gin_channels, 187 | ) 188 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 189 | 190 | def forward(self, x, x_lengths, g=None): 191 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( 192 | x.dtype 193 | ) 194 | x = self.pre(x) * x_mask 195 | x = self.enc(x, x_mask, g=g) 196 | stats = self.proj(x) * x_mask 197 | m, logs = torch.split(stats, self.out_channels, dim=1) 198 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 199 | return z, m, logs, x_mask 200 | 201 | def remove_weight_norm(self): 202 | self.enc.remove_weight_norm() 203 | 204 | 205 | class Generator(torch.nn.Module): 206 | def __init__( 207 | self, 208 | initial_channel, 209 | resblock, 210 | resblock_kernel_sizes, 211 | resblock_dilation_sizes, 212 | upsample_rates, 213 | upsample_initial_channel, 214 | upsample_kernel_sizes, 215 | gin_channels=0, 216 | ): 217 | super(Generator, self).__init__() 218 | self.num_kernels = len(resblock_kernel_sizes) 219 | self.num_upsamples = len(upsample_rates) 220 | self.conv_pre = Conv1d( 221 | initial_channel, upsample_initial_channel, 7, 1, padding=3 222 | ) 223 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 224 | 225 | self.ups = nn.ModuleList() 226 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 227 | self.ups.append( 228 | weight_norm( 229 | ConvTranspose1d( 230 | upsample_initial_channel // (2**i), 231 | upsample_initial_channel // (2 ** (i + 1)), 232 | k, 233 | u, 234 | padding=(k - u) // 2, 235 | ) 236 | ) 237 | ) 238 | 239 | self.resblocks = nn.ModuleList() 240 | for i in range(len(self.ups)): 241 | ch = upsample_initial_channel // (2 ** (i + 1)) 242 | for j, (k, d) in enumerate( 243 | zip(resblock_kernel_sizes, resblock_dilation_sizes) 244 | ): 245 | self.resblocks.append(resblock(ch, k, d)) 246 | 247 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 248 | self.ups.apply(init_weights) 249 | 250 | if gin_channels != 0: 251 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 252 | 253 | def forward(self, x, g=None): 254 | x = self.conv_pre(x) 255 | if g is not None: 256 | x = x + self.cond(g) 257 | 258 | for i in range(self.num_upsamples): 259 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 260 | x = self.ups[i](x) 261 | xs = None 262 | for j in range(self.num_kernels): 263 | if xs is None: 264 | xs = self.resblocks[i * self.num_kernels + j](x) 265 | else: 266 | xs += self.resblocks[i * self.num_kernels + j](x) 267 | x = xs / self.num_kernels 268 | x = F.leaky_relu(x) 269 | x = self.conv_post(x) 270 | x = torch.tanh(x) 271 | 272 | return x 273 | 274 | def remove_weight_norm(self): 275 | for l in self.ups: 276 | remove_weight_norm(l) 277 | for l in self.resblocks: 278 | l.remove_weight_norm() 279 | 280 | 281 | class SineGen(torch.nn.Module): 282 | """Definition of sine generator 283 | SineGen(samp_rate, harmonic_num = 0, 284 | sine_amp = 0.1, noise_std = 0.003, 285 | voiced_threshold = 0, 286 | flag_for_pulse=False) 287 | samp_rate: sampling rate in Hz 288 | harmonic_num: number of harmonic overtones (default 0) 289 | sine_amp: amplitude of 
sine-waveform (default 0.1) 290 | noise_std: std of Gaussian noise (default 0.003) 291 | voiced_threshold: F0 threshold for U/V classification (default 0) 292 | flag_for_pulse: this SineGen is used inside PulseGen (default False) 293 | Note: when flag_for_pulse is True, the first time step of a voiced 294 | segment is always sin(np.pi) or cos(0) 295 | """ 296 | 297 | def __init__( 298 | self, 299 | samp_rate, 300 | harmonic_num=0, 301 | sine_amp=0.1, 302 | noise_std=0.003, 303 | voiced_threshold=0, 304 | flag_for_pulse=False, 305 | ): 306 | super(SineGen, self).__init__() 307 | self.sine_amp = sine_amp 308 | self.noise_std = noise_std 309 | self.harmonic_num = harmonic_num 310 | self.dim = self.harmonic_num + 1 311 | self.sampling_rate = samp_rate 312 | self.voiced_threshold = voiced_threshold 313 | 314 | def _f02uv(self, f0): 315 | # generate uv signal 316 | uv = torch.ones_like(f0) 317 | uv = uv * (f0 > self.voiced_threshold) 318 | return uv 319 | 320 | def forward(self, f0, upp): 321 | """sine_tensor, uv = forward(f0) 322 | input F0: tensor(batchsize=1, length, dim=1) 323 | f0 for unvoiced steps should be 0 324 | output sine_tensor: tensor(batchsize=1, length, dim) 325 | output uv: tensor(batchsize=1, length, 1) 326 | """ 327 | with torch.no_grad(): 328 | f0 = f0[:, None].transpose(1, 2) 329 | f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) 330 | # fundamental component 331 | f0_buf[:, :, 0] = f0[:, :, 0] 332 | for idx in np.arange(self.harmonic_num): 333 | f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( 334 | idx + 2 335 | ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic 336 | rad_values = (f0_buf / self.sampling_rate) % 1 # the % 1 here means the n_har products cannot be optimized away in a later step 337 | rand_ini = torch.rand( 338 | f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device 339 | ) 340 | rand_ini[:, 0] = 0 341 | rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini 342 | tmp_over_one = torch.cumsum(rad_values, 1) # % 1 (taking % 1 here would keep the cumsum below from being optimized) 343 | tmp_over_one *= upp 344 | tmp_over_one = F.interpolate( 345 | tmp_over_one.transpose(2, 1), 346 | scale_factor=upp, 347 | mode="linear", 348 | align_corners=True, 349 | ).transpose(2, 1) 350 | rad_values = F.interpolate( 351 | rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" 352 | ).transpose( 353 | 2, 1 354 | ) ####### 355 | tmp_over_one %= 1 356 | tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 357 | cumsum_shift = torch.zeros_like(rad_values) 358 | cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 359 | sine_waves = torch.sin( 360 | torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi 361 | ) 362 | sine_waves = sine_waves * self.sine_amp 363 | uv = self._f02uv(f0) 364 | uv = F.interpolate( 365 | uv.transpose(2, 1), scale_factor=upp, mode="nearest" 366 | ).transpose(2, 1) 367 | noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 368 | noise = noise_amp * torch.randn_like(sine_waves) 369 | sine_waves = sine_waves * uv + noise 370 | return sine_waves, uv, noise 371 | 372 | 373 | class SourceModuleHnNSF(torch.nn.Module): 374 | """SourceModule for hn-nsf 375 | SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, 376 | add_noise_std=0.003, voiced_threshod=0) 377 | sampling_rate: sampling_rate in Hz 378 | harmonic_num: number of harmonics above F0 (default: 0) 379 | sine_amp: amplitude of sine source signal (default: 0.1) 380 | add_noise_std: std of additive Gaussian noise (default: 0.003) 381 | note that amplitude of noise in unvoiced is decided 382 | by sine_amp 383 | voiced_threshold: threshold
to set U/V given F0 (default: 0) 384 | Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) 385 | F0_sampled (batchsize, length, 1) 386 | Sine_source (batchsize, length, 1) 387 | noise_source (batchsize, length 1) 388 | uv (batchsize, length, 1) 389 | """ 390 | 391 | def __init__( 392 | self, 393 | sampling_rate, 394 | harmonic_num=0, 395 | sine_amp=0.1, 396 | add_noise_std=0.003, 397 | voiced_threshod=0, 398 | is_half=True, 399 | ): 400 | super(SourceModuleHnNSF, self).__init__() 401 | 402 | self.sine_amp = sine_amp 403 | self.noise_std = add_noise_std 404 | self.is_half = is_half 405 | # to produce sine waveforms 406 | self.l_sin_gen = SineGen( 407 | sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod 408 | ) 409 | 410 | # to merge source harmonics into a single excitation 411 | self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) 412 | self.l_tanh = torch.nn.Tanh() 413 | 414 | def forward(self, x, upp=None): 415 | sine_wavs, uv, _ = self.l_sin_gen(x, upp) 416 | if self.is_half: 417 | sine_wavs = sine_wavs.half() 418 | sine_merge = self.l_tanh(self.l_linear(sine_wavs)) 419 | return sine_merge, None, None # noise, uv 420 | 421 | 422 | class GeneratorNSF(torch.nn.Module): 423 | def __init__( 424 | self, 425 | initial_channel, 426 | resblock, 427 | resblock_kernel_sizes, 428 | resblock_dilation_sizes, 429 | upsample_rates, 430 | upsample_initial_channel, 431 | upsample_kernel_sizes, 432 | gin_channels, 433 | sr, 434 | is_half=False, 435 | ): 436 | super(GeneratorNSF, self).__init__() 437 | self.num_kernels = len(resblock_kernel_sizes) 438 | self.num_upsamples = len(upsample_rates) 439 | 440 | self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) 441 | self.m_source = SourceModuleHnNSF( 442 | sampling_rate=sr, harmonic_num=0, is_half=is_half 443 | ) 444 | self.noise_convs = nn.ModuleList() 445 | self.conv_pre = Conv1d( 446 | initial_channel, upsample_initial_channel, 7, 1, padding=3 447 | ) 448 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 449 | 450 | self.ups = nn.ModuleList() 451 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 452 | c_cur = upsample_initial_channel // (2 ** (i + 1)) 453 | self.ups.append( 454 | weight_norm( 455 | ConvTranspose1d( 456 | upsample_initial_channel // (2**i), 457 | upsample_initial_channel // (2 ** (i + 1)), 458 | k, 459 | u, 460 | padding=(k - u) // 2, 461 | ) 462 | ) 463 | ) 464 | if i + 1 < len(upsample_rates): 465 | stride_f0 = np.prod(upsample_rates[i + 1 :]) 466 | self.noise_convs.append( 467 | Conv1d( 468 | 1, 469 | c_cur, 470 | kernel_size=stride_f0 * 2, 471 | stride=stride_f0, 472 | padding=stride_f0 // 2, 473 | ) 474 | ) 475 | else: 476 | self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) 477 | 478 | self.resblocks = nn.ModuleList() 479 | for i in range(len(self.ups)): 480 | ch = upsample_initial_channel // (2 ** (i + 1)) 481 | for j, (k, d) in enumerate( 482 | zip(resblock_kernel_sizes, resblock_dilation_sizes) 483 | ): 484 | self.resblocks.append(resblock(ch, k, d)) 485 | 486 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 487 | self.ups.apply(init_weights) 488 | 489 | if gin_channels != 0: 490 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 491 | 492 | self.upp = np.prod(upsample_rates) 493 | 494 | def forward(self, x, f0, g=None): 495 | har_source, noi_source, uv = self.m_source(f0, self.upp) 496 | har_source = har_source.transpose(1, 2) 497 | x = self.conv_pre(x) 498 | if g is not None: 499 | x = x + self.cond(g) 
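# NSF-style decoding: at every upsampling stage below, the harmonic source signal har_source (generated by SourceModuleHnNSF from the F0 contour) is matched to the current temporal resolution by noise_convs[i] and added to the feature map before the parallel ResBlocks, whose outputs are averaged.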
500 | 501 | for i in range(self.num_upsamples): 502 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 503 | x = self.ups[i](x) 504 | x_source = self.noise_convs[i](har_source) 505 | x = x + x_source 506 | xs = None 507 | for j in range(self.num_kernels): 508 | if xs is None: 509 | xs = self.resblocks[i * self.num_kernels + j](x) 510 | else: 511 | xs += self.resblocks[i * self.num_kernels + j](x) 512 | x = xs / self.num_kernels 513 | x = F.leaky_relu(x) 514 | x = self.conv_post(x) 515 | x = torch.tanh(x) 516 | return x 517 | 518 | def remove_weight_norm(self): 519 | for l in self.ups: 520 | remove_weight_norm(l) 521 | for l in self.resblocks: 522 | l.remove_weight_norm() 523 | 524 | 525 | sr2sr = { 526 | "32k": 32000, 527 | "40k": 40000, 528 | "48k": 48000, 529 | } 530 | 531 | 532 | class SynthesizerTrnMsNSFsidM(nn.Module): 533 | def __init__( 534 | self, 535 | spec_channels, 536 | segment_size, 537 | inter_channels, 538 | hidden_channels, 539 | filter_channels, 540 | n_heads, 541 | n_layers, 542 | kernel_size, 543 | p_dropout, 544 | resblock, 545 | resblock_kernel_sizes, 546 | resblock_dilation_sizes, 547 | upsample_rates, 548 | upsample_initial_channel, 549 | upsample_kernel_sizes, 550 | spk_embed_dim, 551 | gin_channels, 552 | sr, 553 | **kwargs 554 | ): 555 | super().__init__() 556 | if type(sr) == type("strr"): 557 | sr = sr2sr[sr] 558 | self.spec_channels = spec_channels 559 | self.inter_channels = inter_channels 560 | self.hidden_channels = hidden_channels 561 | self.filter_channels = filter_channels 562 | self.n_heads = n_heads 563 | self.n_layers = n_layers 564 | self.kernel_size = kernel_size 565 | self.p_dropout = p_dropout 566 | self.resblock = resblock 567 | self.resblock_kernel_sizes = resblock_kernel_sizes 568 | self.resblock_dilation_sizes = resblock_dilation_sizes 569 | self.upsample_rates = upsample_rates 570 | self.upsample_initial_channel = upsample_initial_channel 571 | self.upsample_kernel_sizes = upsample_kernel_sizes 572 | self.segment_size = segment_size 573 | self.gin_channels = gin_channels 574 | # self.hop_length = hop_length# 575 | self.spk_embed_dim = spk_embed_dim 576 | if self.gin_channels == 256: 577 | self.enc_p = TextEncoder256( 578 | inter_channels, 579 | hidden_channels, 580 | filter_channels, 581 | n_heads, 582 | n_layers, 583 | kernel_size, 584 | p_dropout, 585 | ) 586 | else: 587 | self.enc_p = TextEncoder768( 588 | inter_channels, 589 | hidden_channels, 590 | filter_channels, 591 | n_heads, 592 | n_layers, 593 | kernel_size, 594 | p_dropout, 595 | ) 596 | self.dec = GeneratorNSF( 597 | inter_channels, 598 | resblock, 599 | resblock_kernel_sizes, 600 | resblock_dilation_sizes, 601 | upsample_rates, 602 | upsample_initial_channel, 603 | upsample_kernel_sizes, 604 | gin_channels=gin_channels, 605 | sr=sr, 606 | is_half=kwargs["is_half"], 607 | ) 608 | self.enc_q = PosteriorEncoder( 609 | spec_channels, 610 | inter_channels, 611 | hidden_channels, 612 | 5, 613 | 1, 614 | 16, 615 | gin_channels=gin_channels, 616 | ) 617 | self.flow = ResidualCouplingBlock( 618 | inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels 619 | ) 620 | self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) 621 | self.speaker_map = None 622 | print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) 623 | 624 | def remove_weight_norm(self): 625 | self.dec.remove_weight_norm() 626 | self.flow.remove_weight_norm() 627 | self.enc_q.remove_weight_norm() 628 | 629 | def construct_spkmixmap(self, n_speaker): 630 | self.speaker_map = 
torch.zeros((n_speaker, 1, 1, self.gin_channels)) 631 | for i in range(n_speaker): 632 | self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]])) 633 | self.speaker_map = self.speaker_map.unsqueeze(0) 634 | 635 | def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None): 636 | if self.speaker_map is not None: # [N, S] * [S, B, 1, H] 637 | g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1] 638 | g = g * self.speaker_map # [N, S, B, 1, H] 639 | g = torch.sum(g, dim=1) # [N, 1, B, 1, H] 640 | g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N] 641 | else: 642 | g = g.unsqueeze(0) 643 | g = self.emb_g(g).transpose(1, 2) 644 | 645 | m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) 646 | z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask 647 | z = self.flow(z_p, x_mask, g=g, reverse=True) 648 | o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) 649 | return o 650 | 651 | 652 | class MultiPeriodDiscriminator(torch.nn.Module): 653 | def __init__(self, use_spectral_norm=False): 654 | super(MultiPeriodDiscriminator, self).__init__() 655 | periods = [2, 3, 5, 7, 11, 17] 656 | # periods = [3, 5, 7, 11, 17, 23, 37] 657 | 658 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 659 | discs = discs + [ 660 | DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods 661 | ] 662 | self.discriminators = nn.ModuleList(discs) 663 | 664 | def forward(self, y, y_hat): 665 | y_d_rs = [] # 666 | y_d_gs = [] 667 | fmap_rs = [] 668 | fmap_gs = [] 669 | for i, d in enumerate(self.discriminators): 670 | y_d_r, fmap_r = d(y) 671 | y_d_g, fmap_g = d(y_hat) 672 | # for j in range(len(fmap_r)): 673 | # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) 674 | y_d_rs.append(y_d_r) 675 | y_d_gs.append(y_d_g) 676 | fmap_rs.append(fmap_r) 677 | fmap_gs.append(fmap_g) 678 | 679 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 680 | 681 | 682 | class MultiPeriodDiscriminatorV2(torch.nn.Module): 683 | def __init__(self, use_spectral_norm=False): 684 | super(MultiPeriodDiscriminatorV2, self).__init__() 685 | # periods = [2, 3, 5, 7, 11, 17] 686 | periods = [2, 3, 5, 7, 11, 17, 23, 37] 687 | 688 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 689 | discs = discs + [ 690 | DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods 691 | ] 692 | self.discriminators = nn.ModuleList(discs) 693 | 694 | def forward(self, y, y_hat): 695 | y_d_rs = [] # 696 | y_d_gs = [] 697 | fmap_rs = [] 698 | fmap_gs = [] 699 | for i, d in enumerate(self.discriminators): 700 | y_d_r, fmap_r = d(y) 701 | y_d_g, fmap_g = d(y_hat) 702 | # for j in range(len(fmap_r)): 703 | # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) 704 | y_d_rs.append(y_d_r) 705 | y_d_gs.append(y_d_g) 706 | fmap_rs.append(fmap_r) 707 | fmap_gs.append(fmap_g) 708 | 709 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 710 | 711 | 712 | class DiscriminatorS(torch.nn.Module): 713 | def __init__(self, use_spectral_norm=False): 714 | super(DiscriminatorS, self).__init__() 715 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 716 | self.convs = nn.ModuleList( 717 | [ 718 | norm_f(Conv1d(1, 16, 15, 1, padding=7)), 719 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), 720 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), 721 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), 722 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), 723 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 724 | ] 725 | ) 726 | self.conv_post 
= norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 727 | 728 | def forward(self, x): 729 | fmap = [] 730 | 731 | for l in self.convs: 732 | x = l(x) 733 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 734 | fmap.append(x) 735 | x = self.conv_post(x) 736 | fmap.append(x) 737 | x = torch.flatten(x, 1, -1) 738 | 739 | return x, fmap 740 | 741 | 742 | class DiscriminatorP(torch.nn.Module): 743 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 744 | super(DiscriminatorP, self).__init__() 745 | self.period = period 746 | self.use_spectral_norm = use_spectral_norm 747 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 748 | self.convs = nn.ModuleList( 749 | [ 750 | norm_f( 751 | Conv2d( 752 | 1, 753 | 32, 754 | (kernel_size, 1), 755 | (stride, 1), 756 | padding=(get_padding(kernel_size, 1), 0), 757 | ) 758 | ), 759 | norm_f( 760 | Conv2d( 761 | 32, 762 | 128, 763 | (kernel_size, 1), 764 | (stride, 1), 765 | padding=(get_padding(kernel_size, 1), 0), 766 | ) 767 | ), 768 | norm_f( 769 | Conv2d( 770 | 128, 771 | 512, 772 | (kernel_size, 1), 773 | (stride, 1), 774 | padding=(get_padding(kernel_size, 1), 0), 775 | ) 776 | ), 777 | norm_f( 778 | Conv2d( 779 | 512, 780 | 1024, 781 | (kernel_size, 1), 782 | (stride, 1), 783 | padding=(get_padding(kernel_size, 1), 0), 784 | ) 785 | ), 786 | norm_f( 787 | Conv2d( 788 | 1024, 789 | 1024, 790 | (kernel_size, 1), 791 | 1, 792 | padding=(get_padding(kernel_size, 1), 0), 793 | ) 794 | ), 795 | ] 796 | ) 797 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 798 | 799 | def forward(self, x): 800 | fmap = [] 801 | 802 | # 1d to 2d 803 | b, c, t = x.shape 804 | if t % self.period != 0: # pad first 805 | n_pad = self.period - (t % self.period) 806 | x = F.pad(x, (0, n_pad), "reflect") 807 | t = t + n_pad 808 | x = x.view(b, c, t // self.period, self.period) 809 | 810 | for l in self.convs: 811 | x = l(x) 812 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 813 | fmap.append(x) 814 | x = self.conv_post(x) 815 | fmap.append(x) 816 | x = torch.flatten(x, 1, -1) 817 | 818 | return x, fmap 819 | -------------------------------------------------------------------------------- /src/infer_pack/modules.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | import scipy 5 | import torch 6 | from torch import nn 7 | from torch.nn import functional as F 8 | 9 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 10 | from torch.nn.utils import weight_norm, remove_weight_norm 11 | 12 | from infer_pack import commons 13 | from infer_pack.commons import init_weights, get_padding 14 | from infer_pack.transforms import piecewise_rational_quadratic_transform 15 | 16 | 17 | LRELU_SLOPE = 0.1 18 | 19 | 20 | class LayerNorm(nn.Module): 21 | def __init__(self, channels, eps=1e-5): 22 | super().__init__() 23 | self.channels = channels 24 | self.eps = eps 25 | 26 | self.gamma = nn.Parameter(torch.ones(channels)) 27 | self.beta = nn.Parameter(torch.zeros(channels)) 28 | 29 | def forward(self, x): 30 | x = x.transpose(1, -1) 31 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 32 | return x.transpose(1, -1) 33 | 34 | 35 | class ConvReluNorm(nn.Module): 36 | def __init__( 37 | self, 38 | in_channels, 39 | hidden_channels, 40 | out_channels, 41 | kernel_size, 42 | n_layers, 43 | p_dropout, 44 | ): 45 | super().__init__() 46 | self.in_channels = in_channels 47 | self.hidden_channels = hidden_channels 48 | 
self.out_channels = out_channels 49 | self.kernel_size = kernel_size 50 | self.n_layers = n_layers 51 | self.p_dropout = p_dropout 52 | assert n_layers > 1, "Number of layers should be larger than 1." 53 | 54 | self.conv_layers = nn.ModuleList() 55 | self.norm_layers = nn.ModuleList() 56 | self.conv_layers.append( 57 | nn.Conv1d( 58 | in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 59 | ) 60 | ) 61 | self.norm_layers.append(LayerNorm(hidden_channels)) 62 | self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) 63 | for _ in range(n_layers - 1): 64 | self.conv_layers.append( 65 | nn.Conv1d( 66 | hidden_channels, 67 | hidden_channels, 68 | kernel_size, 69 | padding=kernel_size // 2, 70 | ) 71 | ) 72 | self.norm_layers.append(LayerNorm(hidden_channels)) 73 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 74 | self.proj.weight.data.zero_() 75 | self.proj.bias.data.zero_() 76 | 77 | def forward(self, x, x_mask): 78 | x_org = x 79 | for i in range(self.n_layers): 80 | x = self.conv_layers[i](x * x_mask) 81 | x = self.norm_layers[i](x) 82 | x = self.relu_drop(x) 83 | x = x_org + self.proj(x) 84 | return x * x_mask 85 | 86 | 87 | class DDSConv(nn.Module): 88 | """ 89 | Dilated and Depth-Separable Convolution 90 | """ 91 | 92 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): 93 | super().__init__() 94 | self.channels = channels 95 | self.kernel_size = kernel_size 96 | self.n_layers = n_layers 97 | self.p_dropout = p_dropout 98 | 99 | self.drop = nn.Dropout(p_dropout) 100 | self.convs_sep = nn.ModuleList() 101 | self.convs_1x1 = nn.ModuleList() 102 | self.norms_1 = nn.ModuleList() 103 | self.norms_2 = nn.ModuleList() 104 | for i in range(n_layers): 105 | dilation = kernel_size**i 106 | padding = (kernel_size * dilation - dilation) // 2 107 | self.convs_sep.append( 108 | nn.Conv1d( 109 | channels, 110 | channels, 111 | kernel_size, 112 | groups=channels, 113 | dilation=dilation, 114 | padding=padding, 115 | ) 116 | ) 117 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) 118 | self.norms_1.append(LayerNorm(channels)) 119 | self.norms_2.append(LayerNorm(channels)) 120 | 121 | def forward(self, x, x_mask, g=None): 122 | if g is not None: 123 | x = x + g 124 | for i in range(self.n_layers): 125 | y = self.convs_sep[i](x * x_mask) 126 | y = self.norms_1[i](y) 127 | y = F.gelu(y) 128 | y = self.convs_1x1[i](y) 129 | y = self.norms_2[i](y) 130 | y = F.gelu(y) 131 | y = self.drop(y) 132 | x = x + y 133 | return x * x_mask 134 | 135 | 136 | class WN(torch.nn.Module): 137 | def __init__( 138 | self, 139 | hidden_channels, 140 | kernel_size, 141 | dilation_rate, 142 | n_layers, 143 | gin_channels=0, 144 | p_dropout=0, 145 | ): 146 | super(WN, self).__init__() 147 | assert kernel_size % 2 == 1 148 | self.hidden_channels = hidden_channels 149 | self.kernel_size = (kernel_size,) 150 | self.dilation_rate = dilation_rate 151 | self.n_layers = n_layers 152 | self.gin_channels = gin_channels 153 | self.p_dropout = p_dropout 154 | 155 | self.in_layers = torch.nn.ModuleList() 156 | self.res_skip_layers = torch.nn.ModuleList() 157 | self.drop = nn.Dropout(p_dropout) 158 | 159 | if gin_channels != 0: 160 | cond_layer = torch.nn.Conv1d( 161 | gin_channels, 2 * hidden_channels * n_layers, 1 162 | ) 163 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") 164 | 165 | for i in range(n_layers): 166 | dilation = dilation_rate**i 167 | padding = int((kernel_size * dilation - dilation) / 2) 168 | in_layer = torch.nn.Conv1d( 169 |
hidden_channels, 170 | 2 * hidden_channels, 171 | kernel_size, 172 | dilation=dilation, 173 | padding=padding, 174 | ) 175 | in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") 176 | self.in_layers.append(in_layer) 177 | 178 | # last one is not necessary 179 | if i < n_layers - 1: 180 | res_skip_channels = 2 * hidden_channels 181 | else: 182 | res_skip_channels = hidden_channels 183 | 184 | res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 185 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") 186 | self.res_skip_layers.append(res_skip_layer) 187 | 188 | def forward(self, x, x_mask, g=None, **kwargs): 189 | output = torch.zeros_like(x) 190 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 191 | 192 | if g is not None: 193 | g = self.cond_layer(g) 194 | 195 | for i in range(self.n_layers): 196 | x_in = self.in_layers[i](x) 197 | if g is not None: 198 | cond_offset = i * 2 * self.hidden_channels 199 | g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] 200 | else: 201 | g_l = torch.zeros_like(x_in) 202 | 203 | acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) 204 | acts = self.drop(acts) 205 | 206 | res_skip_acts = self.res_skip_layers[i](acts) 207 | if i < self.n_layers - 1: 208 | res_acts = res_skip_acts[:, : self.hidden_channels, :] 209 | x = (x + res_acts) * x_mask 210 | output = output + res_skip_acts[:, self.hidden_channels :, :] 211 | else: 212 | output = output + res_skip_acts 213 | return output * x_mask 214 | 215 | def remove_weight_norm(self): 216 | if self.gin_channels != 0: 217 | torch.nn.utils.remove_weight_norm(self.cond_layer) 218 | for l in self.in_layers: 219 | torch.nn.utils.remove_weight_norm(l) 220 | for l in self.res_skip_layers: 221 | torch.nn.utils.remove_weight_norm(l) 222 | 223 | 224 | class ResBlock1(torch.nn.Module): 225 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 226 | super(ResBlock1, self).__init__() 227 | self.convs1 = nn.ModuleList( 228 | [ 229 | weight_norm( 230 | Conv1d( 231 | channels, 232 | channels, 233 | kernel_size, 234 | 1, 235 | dilation=dilation[0], 236 | padding=get_padding(kernel_size, dilation[0]), 237 | ) 238 | ), 239 | weight_norm( 240 | Conv1d( 241 | channels, 242 | channels, 243 | kernel_size, 244 | 1, 245 | dilation=dilation[1], 246 | padding=get_padding(kernel_size, dilation[1]), 247 | ) 248 | ), 249 | weight_norm( 250 | Conv1d( 251 | channels, 252 | channels, 253 | kernel_size, 254 | 1, 255 | dilation=dilation[2], 256 | padding=get_padding(kernel_size, dilation[2]), 257 | ) 258 | ), 259 | ] 260 | ) 261 | self.convs1.apply(init_weights) 262 | 263 | self.convs2 = nn.ModuleList( 264 | [ 265 | weight_norm( 266 | Conv1d( 267 | channels, 268 | channels, 269 | kernel_size, 270 | 1, 271 | dilation=1, 272 | padding=get_padding(kernel_size, 1), 273 | ) 274 | ), 275 | weight_norm( 276 | Conv1d( 277 | channels, 278 | channels, 279 | kernel_size, 280 | 1, 281 | dilation=1, 282 | padding=get_padding(kernel_size, 1), 283 | ) 284 | ), 285 | weight_norm( 286 | Conv1d( 287 | channels, 288 | channels, 289 | kernel_size, 290 | 1, 291 | dilation=1, 292 | padding=get_padding(kernel_size, 1), 293 | ) 294 | ), 295 | ] 296 | ) 297 | self.convs2.apply(init_weights) 298 | 299 | def forward(self, x, x_mask=None): 300 | for c1, c2 in zip(self.convs1, self.convs2): 301 | xt = F.leaky_relu(x, LRELU_SLOPE) 302 | if x_mask is not None: 303 | xt = xt * x_mask 304 | xt = c1(xt) 305 | xt = F.leaky_relu(xt, LRELU_SLOPE) 306 | if x_mask is 
not None: 307 | xt = xt * x_mask 308 | xt = c2(xt) 309 | x = xt + x 310 | if x_mask is not None: 311 | x = x * x_mask 312 | return x 313 | 314 | def remove_weight_norm(self): 315 | for l in self.convs1: 316 | remove_weight_norm(l) 317 | for l in self.convs2: 318 | remove_weight_norm(l) 319 | 320 | 321 | class ResBlock2(torch.nn.Module): 322 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 323 | super(ResBlock2, self).__init__() 324 | self.convs = nn.ModuleList( 325 | [ 326 | weight_norm( 327 | Conv1d( 328 | channels, 329 | channels, 330 | kernel_size, 331 | 1, 332 | dilation=dilation[0], 333 | padding=get_padding(kernel_size, dilation[0]), 334 | ) 335 | ), 336 | weight_norm( 337 | Conv1d( 338 | channels, 339 | channels, 340 | kernel_size, 341 | 1, 342 | dilation=dilation[1], 343 | padding=get_padding(kernel_size, dilation[1]), 344 | ) 345 | ), 346 | ] 347 | ) 348 | self.convs.apply(init_weights) 349 | 350 | def forward(self, x, x_mask=None): 351 | for c in self.convs: 352 | xt = F.leaky_relu(x, LRELU_SLOPE) 353 | if x_mask is not None: 354 | xt = xt * x_mask 355 | xt = c(xt) 356 | x = xt + x 357 | if x_mask is not None: 358 | x = x * x_mask 359 | return x 360 | 361 | def remove_weight_norm(self): 362 | for l in self.convs: 363 | remove_weight_norm(l) 364 | 365 | 366 | class Log(nn.Module): 367 | def forward(self, x, x_mask, reverse=False, **kwargs): 368 | if not reverse: 369 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask 370 | logdet = torch.sum(-y, [1, 2]) 371 | return y, logdet 372 | else: 373 | x = torch.exp(x) * x_mask 374 | return x 375 | 376 | 377 | class Flip(nn.Module): 378 | def forward(self, x, *args, reverse=False, **kwargs): 379 | x = torch.flip(x, [1]) 380 | if not reverse: 381 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 382 | return x, logdet 383 | else: 384 | return x 385 | 386 | 387 | class ElementwiseAffine(nn.Module): 388 | def __init__(self, channels): 389 | super().__init__() 390 | self.channels = channels 391 | self.m = nn.Parameter(torch.zeros(channels, 1)) 392 | self.logs = nn.Parameter(torch.zeros(channels, 1)) 393 | 394 | def forward(self, x, x_mask, reverse=False, **kwargs): 395 | if not reverse: 396 | y = self.m + torch.exp(self.logs) * x 397 | y = y * x_mask 398 | logdet = torch.sum(self.logs * x_mask, [1, 2]) 399 | return y, logdet 400 | else: 401 | x = (x - self.m) * torch.exp(-self.logs) * x_mask 402 | return x 403 | 404 | 405 | class ResidualCouplingLayer(nn.Module): 406 | def __init__( 407 | self, 408 | channels, 409 | hidden_channels, 410 | kernel_size, 411 | dilation_rate, 412 | n_layers, 413 | p_dropout=0, 414 | gin_channels=0, 415 | mean_only=False, 416 | ): 417 | assert channels % 2 == 0, "channels should be divisible by 2" 418 | super().__init__() 419 | self.channels = channels 420 | self.hidden_channels = hidden_channels 421 | self.kernel_size = kernel_size 422 | self.dilation_rate = dilation_rate 423 | self.n_layers = n_layers 424 | self.half_channels = channels // 2 425 | self.mean_only = mean_only 426 | 427 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 428 | self.enc = WN( 429 | hidden_channels, 430 | kernel_size, 431 | dilation_rate, 432 | n_layers, 433 | p_dropout=p_dropout, 434 | gin_channels=gin_channels, 435 | ) 436 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) 437 | self.post.weight.data.zero_() 438 | self.post.bias.data.zero_() 439 | 440 | def forward(self, x, x_mask, g=None, reverse=False): 441 | x0, x1 = torch.split(x, [self.half_channels] * 
2, 1) 442 | h = self.pre(x0) * x_mask 443 | h = self.enc(h, x_mask, g=g) 444 | stats = self.post(h) * x_mask 445 | if not self.mean_only: 446 | m, logs = torch.split(stats, [self.half_channels] * 2, 1) 447 | else: 448 | m = stats 449 | logs = torch.zeros_like(m) 450 | 451 | if not reverse: 452 | x1 = m + x1 * torch.exp(logs) * x_mask 453 | x = torch.cat([x0, x1], 1) 454 | logdet = torch.sum(logs, [1, 2]) 455 | return x, logdet 456 | else: 457 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 458 | x = torch.cat([x0, x1], 1) 459 | return x 460 | 461 | def remove_weight_norm(self): 462 | self.enc.remove_weight_norm() 463 | 464 | 465 | class ConvFlow(nn.Module): 466 | def __init__( 467 | self, 468 | in_channels, 469 | filter_channels, 470 | kernel_size, 471 | n_layers, 472 | num_bins=10, 473 | tail_bound=5.0, 474 | ): 475 | super().__init__() 476 | self.in_channels = in_channels 477 | self.filter_channels = filter_channels 478 | self.kernel_size = kernel_size 479 | self.n_layers = n_layers 480 | self.num_bins = num_bins 481 | self.tail_bound = tail_bound 482 | self.half_channels = in_channels // 2 483 | 484 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) 485 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0) 486 | self.proj = nn.Conv1d( 487 | filter_channels, self.half_channels * (num_bins * 3 - 1), 1 488 | ) 489 | self.proj.weight.data.zero_() 490 | self.proj.bias.data.zero_() 491 | 492 | def forward(self, x, x_mask, g=None, reverse=False): 493 | x0, x1 = torch.split(x, [self.half_channels] * 2, 1) 494 | h = self.pre(x0) 495 | h = self.convs(h, x_mask, g=g) 496 | h = self.proj(h) * x_mask 497 | 498 | b, c, t = x0.shape 499 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 500 | 501 | unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels) 502 | unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt( 503 | self.filter_channels 504 | ) 505 | unnormalized_derivatives = h[..., 2 * self.num_bins :] 506 | 507 | x1, logabsdet = piecewise_rational_quadratic_transform( 508 | x1, 509 | unnormalized_widths, 510 | unnormalized_heights, 511 | unnormalized_derivatives, 512 | inverse=reverse, 513 | tails="linear", 514 | tail_bound=self.tail_bound, 515 | ) 516 | 517 | x = torch.cat([x0, x1], 1) * x_mask 518 | logdet = torch.sum(logabsdet * x_mask, [1, 2]) 519 | if not reverse: 520 | return x, logdet 521 | else: 522 | return x 523 | -------------------------------------------------------------------------------- /src/infer_pack/transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | import numpy as np 5 | 6 | 7 | DEFAULT_MIN_BIN_WIDTH = 1e-3 8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3 9 | DEFAULT_MIN_DERIVATIVE = 1e-3 10 | 11 | 12 | def piecewise_rational_quadratic_transform( 13 | inputs, 14 | unnormalized_widths, 15 | unnormalized_heights, 16 | unnormalized_derivatives, 17 | inverse=False, 18 | tails=None, 19 | tail_bound=1.0, 20 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 21 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 22 | min_derivative=DEFAULT_MIN_DERIVATIVE, 23 | ): 24 | if tails is None: 25 | spline_fn = rational_quadratic_spline 26 | spline_kwargs = {} 27 | else: 28 | spline_fn = unconstrained_rational_quadratic_spline 29 | spline_kwargs = {"tails": tails, "tail_bound": tail_bound} 30 | 31 | outputs, logabsdet = spline_fn( 32 | inputs=inputs, 33 | unnormalized_widths=unnormalized_widths, 34 | 
unnormalized_heights=unnormalized_heights, 35 | unnormalized_derivatives=unnormalized_derivatives, 36 | inverse=inverse, 37 | min_bin_width=min_bin_width, 38 | min_bin_height=min_bin_height, 39 | min_derivative=min_derivative, 40 | **spline_kwargs 41 | ) 42 | return outputs, logabsdet 43 | 44 | 45 | def searchsorted(bin_locations, inputs, eps=1e-6): 46 | bin_locations[..., -1] += eps 47 | return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1 48 | 49 | 50 | def unconstrained_rational_quadratic_spline( 51 | inputs, 52 | unnormalized_widths, 53 | unnormalized_heights, 54 | unnormalized_derivatives, 55 | inverse=False, 56 | tails="linear", 57 | tail_bound=1.0, 58 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 59 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 60 | min_derivative=DEFAULT_MIN_DERIVATIVE, 61 | ): 62 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) 63 | outside_interval_mask = ~inside_interval_mask 64 | 65 | outputs = torch.zeros_like(inputs) 66 | logabsdet = torch.zeros_like(inputs) 67 | 68 | if tails == "linear": 69 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) 70 | constant = np.log(np.exp(1 - min_derivative) - 1) 71 | unnormalized_derivatives[..., 0] = constant 72 | unnormalized_derivatives[..., -1] = constant 73 | 74 | outputs[outside_interval_mask] = inputs[outside_interval_mask] 75 | logabsdet[outside_interval_mask] = 0 76 | else: 77 | raise RuntimeError("{} tails are not implemented.".format(tails)) 78 | 79 | ( 80 | outputs[inside_interval_mask], 81 | logabsdet[inside_interval_mask], 82 | ) = rational_quadratic_spline( 83 | inputs=inputs[inside_interval_mask], 84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :], 85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :], 86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], 87 | inverse=inverse, 88 | left=-tail_bound, 89 | right=tail_bound, 90 | bottom=-tail_bound, 91 | top=tail_bound, 92 | min_bin_width=min_bin_width, 93 | min_bin_height=min_bin_height, 94 | min_derivative=min_derivative, 95 | ) 96 | 97 | return outputs, logabsdet 98 | 99 | 100 | def rational_quadratic_spline( 101 | inputs, 102 | unnormalized_widths, 103 | unnormalized_heights, 104 | unnormalized_derivatives, 105 | inverse=False, 106 | left=0.0, 107 | right=1.0, 108 | bottom=0.0, 109 | top=1.0, 110 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 111 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 112 | min_derivative=DEFAULT_MIN_DERIVATIVE, 113 | ): 114 | if torch.min(inputs) < left or torch.max(inputs) > right: 115 | raise ValueError("Input to a transform is not within its domain") 116 | 117 | num_bins = unnormalized_widths.shape[-1] 118 | 119 | if min_bin_width * num_bins > 1.0: 120 | raise ValueError("Minimal bin width too large for the number of bins") 121 | if min_bin_height * num_bins > 1.0: 122 | raise ValueError("Minimal bin height too large for the number of bins") 123 | 124 | widths = F.softmax(unnormalized_widths, dim=-1) 125 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths 126 | cumwidths = torch.cumsum(widths, dim=-1) 127 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) 128 | cumwidths = (right - left) * cumwidths + left 129 | cumwidths[..., 0] = left 130 | cumwidths[..., -1] = right 131 | widths = cumwidths[..., 1:] - cumwidths[..., :-1] 132 | 133 | derivatives = min_derivative + F.softplus(unnormalized_derivatives) 134 | 135 | heights = F.softmax(unnormalized_heights, dim=-1) 136 | heights = min_bin_height + (1 - 
min_bin_height * num_bins) * heights 137 | cumheights = torch.cumsum(heights, dim=-1) 138 | cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0) 139 | cumheights = (top - bottom) * cumheights + bottom 140 | cumheights[..., 0] = bottom 141 | cumheights[..., -1] = top 142 | heights = cumheights[..., 1:] - cumheights[..., :-1] 143 | 144 | if inverse: 145 | bin_idx = searchsorted(cumheights, inputs)[..., None] 146 | else: 147 | bin_idx = searchsorted(cumwidths, inputs)[..., None] 148 | 149 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] 150 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0] 151 | 152 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] 153 | delta = heights / widths 154 | input_delta = delta.gather(-1, bin_idx)[..., 0] 155 | 156 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] 157 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] 158 | 159 | input_heights = heights.gather(-1, bin_idx)[..., 0] 160 | 161 | if inverse: 162 | a = (inputs - input_cumheights) * ( 163 | input_derivatives + input_derivatives_plus_one - 2 * input_delta 164 | ) + input_heights * (input_delta - input_derivatives) 165 | b = input_heights * input_derivatives - (inputs - input_cumheights) * ( 166 | input_derivatives + input_derivatives_plus_one - 2 * input_delta 167 | ) 168 | c = -input_delta * (inputs - input_cumheights) 169 | 170 | discriminant = b.pow(2) - 4 * a * c 171 | assert (discriminant >= 0).all() 172 | 173 | root = (2 * c) / (-b - torch.sqrt(discriminant)) 174 | outputs = root * input_bin_widths + input_cumwidths 175 | 176 | theta_one_minus_theta = root * (1 - root) 177 | denominator = input_delta + ( 178 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta) 179 | * theta_one_minus_theta 180 | ) 181 | derivative_numerator = input_delta.pow(2) * ( 182 | input_derivatives_plus_one * root.pow(2) 183 | + 2 * input_delta * theta_one_minus_theta 184 | + input_derivatives * (1 - root).pow(2) 185 | ) 186 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 187 | 188 | return outputs, -logabsdet 189 | else: 190 | theta = (inputs - input_cumwidths) / input_bin_widths 191 | theta_one_minus_theta = theta * (1 - theta) 192 | 193 | numerator = input_heights * ( 194 | input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta 195 | ) 196 | denominator = input_delta + ( 197 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta) 198 | * theta_one_minus_theta 199 | ) 200 | outputs = input_cumheights + numerator / denominator 201 | 202 | derivative_numerator = input_delta.pow(2) * ( 203 | input_derivatives_plus_one * theta.pow(2) 204 | + 2 * input_delta * theta_one_minus_theta 205 | + input_derivatives * (1 - theta).pow(2) 206 | ) 207 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 208 | 209 | return outputs, logabsdet 210 | -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import urllib.parse 5 | import urllib.request 6 | import shutil 7 | import zipfile 8 | from rvc import rvc_infer, load_hubert, get_vc, Config 9 | 10 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 11 | rvc_models_dir = os.path.join(BASE_DIR, 'rvc_models') 12 | output_dir = os.path.join(BASE_DIR, 'voice_output') 13 | 14 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 
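# Example invocation (file and model names here are placeholders), matching the argument parsing in the __main__ block further down:
#   python main.py input.wav MyVoiceModel 0 rmvpe 0.5 3 0.25 0.33
# Half-precision (fp16) inference is enabled only when a CUDA device is available; on CPU the next line keeps everything in fp32.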
15 | is_half = False if device == "cpu" else True  # fp16 is only used on GPU; CPU inference stays in float32
16 | 
17 | def download_online_model(url, dir_name, overwrite=False):
18 |     try:
19 |         # Parse the URL and extract the filename
20 |         parsed_url = urllib.parse.urlparse(url)
21 |         zip_name = os.path.basename(parsed_url.path)
22 | 
23 |         # Remove any query parameters from the filename
24 |         zip_name = zip_name.split('?')[0]
25 | 
26 |         extraction_folder = os.path.join(rvc_models_dir, dir_name)
27 |         if os.path.exists(extraction_folder):
28 |             if overwrite:
29 |                 print(f"[!] Voice model directory {dir_name} already exists. Overwriting...")
30 |                 shutil.rmtree(extraction_folder)
31 |             else:
32 |                 print(f"[!] Voice model directory {dir_name} already exists. Using existing model.")
33 |                 return f"[+] Using existing model: {dir_name}"
34 | 
35 |         # Download the file
36 |         print(f"[*] Downloading model from {url}...")
37 |         urllib.request.urlretrieve(url, zip_name)
38 | 
39 |         # Extract the zip file
40 |         print(f"[*] Extracting model to {extraction_folder}...")
41 |         with zipfile.ZipFile(zip_name, 'r') as zip_ref:
42 |             zip_ref.extractall(extraction_folder)
43 | 
44 |         # Remove the zip file
45 |         os.remove(zip_name)
46 | 
47 |         return f'[+] {dir_name} Model successfully downloaded and extracted!'
48 |     except Exception as e:
49 |         raise Exception(f"Error downloading model: {str(e)}")
50 | 
51 | def get_rvc_model(voice_model):
52 |     model_dir = os.path.join(rvc_models_dir, voice_model)
53 |     for file in os.listdir(model_dir):
54 |         if file.endswith('.pth'):
55 |             return os.path.join(model_dir, file)
56 |     raise FileNotFoundError(f"No .pth file found in RVC model directory: {model_dir}")
57 | 
58 | def voice_conversion(input_audio, rvc_model, pitch=0, f0_method='rmvpe', index_rate=0.5, filter_radius=3, rms_mix_rate=0.25, protect=0.33):
59 |     try:
60 |         hubert_model = load_hubert(device, is_half, os.path.join(rvc_models_dir, "hubert_base.pt"))
61 |         model_path = get_rvc_model(rvc_model)
62 |         cpt, version, net_g, tgt_sr, vc = get_vc(device, is_half, Config(device, is_half), model_path)
63 | 
64 |         output_filename = os.path.join(output_dir, f"converted_{os.path.basename(input_audio)}")
65 |         output_filename = os.path.splitext(output_filename)[0] + '.wav'
66 |         os.makedirs(output_dir, exist_ok=True)
67 | 
68 |         rvc_infer("", index_rate, input_audio, output_filename, pitch, f0_method, cpt, version, net_g,
69 |                   filter_radius, tgt_sr, rms_mix_rate, protect, 160, vc, hubert_model)
70 | 
71 |         return output_filename
72 |     except Exception as e:
73 |         raise Exception(f"Voice conversion failed: {str(e)}")
74 | 
75 | def print_example_usage():
76 |     print("\nUsage:")
77 |     print('python main.py input_audio rvc_model [pitch] [f0_method] [index_rate] [filter_radius] [rms_mix_rate] [protect]')
78 |     print("\nRequired arguments:")
79 |     print(" input_audio: path to input audio file")
80 |     print(" rvc_model: name of the RVC model to use")
81 |     print("\nOptional arguments:")
82 |     print(" pitch: pitch shift (default: 0)")
83 |     print(" f0_method: pitch extraction method (default: 'rmvpe')")
84 |     print(" index_rate: index rate (default: 0.5)")
85 |     print(" filter_radius: filter radius (default: 3)")
86 |     print(" rms_mix_rate: RMS mix rate (default: 0.25)")
87 |     print(" protect: protect rate (default: 0.33)")
88 | 
89 | if __name__ == '__main__':
90 |     if len(sys.argv) < 3:
91 |         print("Error: Insufficient arguments.")
92 |         print_example_usage()
93 |         sys.exit(1)
94 | 
95 |     try:
96 |         input_audio = sys.argv[1]
97 |         rvc_model = sys.argv[2]
98 |         pitch = int(sys.argv[3]) if len(sys.argv) > 3 else 0
99 |         f0_method = sys.argv[4] if len(sys.argv) > 4 else
'rmvpe' 100 | index_rate = float(sys.argv[5]) if len(sys.argv) > 5 else 0.5 101 | filter_radius = int(sys.argv[6]) if len(sys.argv) > 6 else 3 102 | rms_mix_rate = float(sys.argv[7]) if len(sys.argv) > 7 else 0.25 103 | protect = float(sys.argv[8]) if len(sys.argv) > 8 else 0.33 104 | 105 | output_path = voice_conversion(input_audio, rvc_model, pitch, f0_method, index_rate, filter_radius, rms_mix_rate, protect) 106 | print(f"Converted audio saved to: {output_path}") 107 | except Exception as e: 108 | print(f"Error: {str(e)}") 109 | print_example_usage() 110 | -------------------------------------------------------------------------------- /src/mdx.py: -------------------------------------------------------------------------------- 1 | # This module is no longer used in the current implementation 2 | # Kept for potential future use 3 | 4 | def run_mdx(*args, **kwargs): 5 | raise NotImplementedError("MDX functionality is not used in the current version") 6 | -------------------------------------------------------------------------------- /src/my_utils.py: -------------------------------------------------------------------------------- 1 | import ffmpeg 2 | import numpy as np 3 | 4 | 5 | def load_audio(file, sr): 6 | try: 7 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 8 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary. 9 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 10 | file = ( 11 | file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 12 | ) # 防止小白拷路径头尾带了空格和"和回车 13 | out, _ = ( 14 | ffmpeg.input(file, threads=0) 15 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) 16 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) 17 | ) 18 | except Exception as e: 19 | raise RuntimeError(f"Failed to load audio: {e}") 20 | 21 | return np.frombuffer(out, np.float32).flatten() 22 | -------------------------------------------------------------------------------- /src/rmvpe.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from librosa.filters import mel 6 | 7 | 8 | class BiGRU(nn.Module): 9 | def __init__(self, input_features, hidden_features, num_layers): 10 | super(BiGRU, self).__init__() 11 | self.gru = nn.GRU( 12 | input_features, 13 | hidden_features, 14 | num_layers=num_layers, 15 | batch_first=True, 16 | bidirectional=True, 17 | ) 18 | 19 | def forward(self, x): 20 | return self.gru(x)[0] 21 | 22 | 23 | class ConvBlockRes(nn.Module): 24 | def __init__(self, in_channels, out_channels, momentum=0.01): 25 | super(ConvBlockRes, self).__init__() 26 | self.conv = nn.Sequential( 27 | nn.Conv2d( 28 | in_channels=in_channels, 29 | out_channels=out_channels, 30 | kernel_size=(3, 3), 31 | stride=(1, 1), 32 | padding=(1, 1), 33 | bias=False, 34 | ), 35 | nn.BatchNorm2d(out_channels, momentum=momentum), 36 | nn.ReLU(), 37 | nn.Conv2d( 38 | in_channels=out_channels, 39 | out_channels=out_channels, 40 | kernel_size=(3, 3), 41 | stride=(1, 1), 42 | padding=(1, 1), 43 | bias=False, 44 | ), 45 | nn.BatchNorm2d(out_channels, momentum=momentum), 46 | nn.ReLU(), 47 | ) 48 | if in_channels != out_channels: 49 | self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1)) 50 | self.is_shortcut = True 51 | else: 52 | self.is_shortcut = False 53 | 54 | def forward(self, x): 55 | if self.is_shortcut: 56 | return self.conv(x) + self.shortcut(x) 57 
| else: 58 | return self.conv(x) + x 59 | 60 | 61 | class Encoder(nn.Module): 62 | def __init__( 63 | self, 64 | in_channels, 65 | in_size, 66 | n_encoders, 67 | kernel_size, 68 | n_blocks, 69 | out_channels=16, 70 | momentum=0.01, 71 | ): 72 | super(Encoder, self).__init__() 73 | self.n_encoders = n_encoders 74 | self.bn = nn.BatchNorm2d(in_channels, momentum=momentum) 75 | self.layers = nn.ModuleList() 76 | self.latent_channels = [] 77 | for i in range(self.n_encoders): 78 | self.layers.append( 79 | ResEncoderBlock( 80 | in_channels, out_channels, kernel_size, n_blocks, momentum=momentum 81 | ) 82 | ) 83 | self.latent_channels.append([out_channels, in_size]) 84 | in_channels = out_channels 85 | out_channels *= 2 86 | in_size //= 2 87 | self.out_size = in_size 88 | self.out_channel = out_channels 89 | 90 | def forward(self, x): 91 | concat_tensors = [] 92 | x = self.bn(x) 93 | for i in range(self.n_encoders): 94 | _, x = self.layers[i](x) 95 | concat_tensors.append(_) 96 | return x, concat_tensors 97 | 98 | 99 | class ResEncoderBlock(nn.Module): 100 | def __init__( 101 | self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01 102 | ): 103 | super(ResEncoderBlock, self).__init__() 104 | self.n_blocks = n_blocks 105 | self.conv = nn.ModuleList() 106 | self.conv.append(ConvBlockRes(in_channels, out_channels, momentum)) 107 | for i in range(n_blocks - 1): 108 | self.conv.append(ConvBlockRes(out_channels, out_channels, momentum)) 109 | self.kernel_size = kernel_size 110 | if self.kernel_size is not None: 111 | self.pool = nn.AvgPool2d(kernel_size=kernel_size) 112 | 113 | def forward(self, x): 114 | for i in range(self.n_blocks): 115 | x = self.conv[i](x) 116 | if self.kernel_size is not None: 117 | return x, self.pool(x) 118 | else: 119 | return x 120 | 121 | 122 | class Intermediate(nn.Module): # 123 | def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01): 124 | super(Intermediate, self).__init__() 125 | self.n_inters = n_inters 126 | self.layers = nn.ModuleList() 127 | self.layers.append( 128 | ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum) 129 | ) 130 | for i in range(self.n_inters - 1): 131 | self.layers.append( 132 | ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum) 133 | ) 134 | 135 | def forward(self, x): 136 | for i in range(self.n_inters): 137 | x = self.layers[i](x) 138 | return x 139 | 140 | 141 | class ResDecoderBlock(nn.Module): 142 | def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01): 143 | super(ResDecoderBlock, self).__init__() 144 | out_padding = (0, 1) if stride == (1, 2) else (1, 1) 145 | self.n_blocks = n_blocks 146 | self.conv1 = nn.Sequential( 147 | nn.ConvTranspose2d( 148 | in_channels=in_channels, 149 | out_channels=out_channels, 150 | kernel_size=(3, 3), 151 | stride=stride, 152 | padding=(1, 1), 153 | output_padding=out_padding, 154 | bias=False, 155 | ), 156 | nn.BatchNorm2d(out_channels, momentum=momentum), 157 | nn.ReLU(), 158 | ) 159 | self.conv2 = nn.ModuleList() 160 | self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum)) 161 | for i in range(n_blocks - 1): 162 | self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum)) 163 | 164 | def forward(self, x, concat_tensor): 165 | x = self.conv1(x) 166 | x = torch.cat((x, concat_tensor), dim=1) 167 | for i in range(self.n_blocks): 168 | x = self.conv2[i](x) 169 | return x 170 | 171 | 172 | class Decoder(nn.Module): 173 | def __init__(self, in_channels, n_decoders, stride, 
n_blocks, momentum=0.01): 174 | super(Decoder, self).__init__() 175 | self.layers = nn.ModuleList() 176 | self.n_decoders = n_decoders 177 | for i in range(self.n_decoders): 178 | out_channels = in_channels // 2 179 | self.layers.append( 180 | ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum) 181 | ) 182 | in_channels = out_channels 183 | 184 | def forward(self, x, concat_tensors): 185 | for i in range(self.n_decoders): 186 | x = self.layers[i](x, concat_tensors[-1 - i]) 187 | return x 188 | 189 | 190 | class DeepUnet(nn.Module): 191 | def __init__( 192 | self, 193 | kernel_size, 194 | n_blocks, 195 | en_de_layers=5, 196 | inter_layers=4, 197 | in_channels=1, 198 | en_out_channels=16, 199 | ): 200 | super(DeepUnet, self).__init__() 201 | self.encoder = Encoder( 202 | in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels 203 | ) 204 | self.intermediate = Intermediate( 205 | self.encoder.out_channel // 2, 206 | self.encoder.out_channel, 207 | inter_layers, 208 | n_blocks, 209 | ) 210 | self.decoder = Decoder( 211 | self.encoder.out_channel, en_de_layers, kernel_size, n_blocks 212 | ) 213 | 214 | def forward(self, x): 215 | x, concat_tensors = self.encoder(x) 216 | x = self.intermediate(x) 217 | x = self.decoder(x, concat_tensors) 218 | return x 219 | 220 | 221 | class E2E(nn.Module): 222 | def __init__( 223 | self, 224 | n_blocks, 225 | n_gru, 226 | kernel_size, 227 | en_de_layers=5, 228 | inter_layers=4, 229 | in_channels=1, 230 | en_out_channels=16, 231 | ): 232 | super(E2E, self).__init__() 233 | self.unet = DeepUnet( 234 | kernel_size, 235 | n_blocks, 236 | en_de_layers, 237 | inter_layers, 238 | in_channels, 239 | en_out_channels, 240 | ) 241 | self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) 242 | if n_gru: 243 | self.fc = nn.Sequential( 244 | BiGRU(3 * 128, 256, n_gru), 245 | nn.Linear(512, 360), 246 | nn.Dropout(0.25), 247 | nn.Sigmoid(), 248 | ) 249 | else: 250 | self.fc = nn.Sequential( 251 | nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid() 252 | ) 253 | 254 | def forward(self, mel): 255 | mel = mel.transpose(-1, -2).unsqueeze(1) 256 | x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2) 257 | x = self.fc(x) 258 | return x 259 | 260 | 261 | class MelSpectrogram(torch.nn.Module): 262 | def __init__( 263 | self, 264 | is_half, 265 | n_mel_channels, 266 | sampling_rate, 267 | win_length, 268 | hop_length, 269 | n_fft=None, 270 | mel_fmin=0, 271 | mel_fmax=None, 272 | clamp=1e-5, 273 | ): 274 | super().__init__() 275 | n_fft = win_length if n_fft is None else n_fft 276 | self.hann_window = {} 277 | mel_basis = mel( 278 | sr=sampling_rate, 279 | n_fft=n_fft, 280 | n_mels=n_mel_channels, 281 | fmin=mel_fmin, 282 | fmax=mel_fmax, 283 | htk=True, 284 | ) 285 | mel_basis = torch.from_numpy(mel_basis).float() 286 | self.register_buffer("mel_basis", mel_basis) 287 | self.n_fft = win_length if n_fft is None else n_fft 288 | self.hop_length = hop_length 289 | self.win_length = win_length 290 | self.sampling_rate = sampling_rate 291 | self.n_mel_channels = n_mel_channels 292 | self.clamp = clamp 293 | self.is_half = is_half 294 | 295 | def forward(self, audio, keyshift=0, speed=1, center=True): 296 | factor = 2 ** (keyshift / 12) 297 | n_fft_new = int(np.round(self.n_fft * factor)) 298 | win_length_new = int(np.round(self.win_length * factor)) 299 | hop_length_new = int(np.round(self.hop_length * speed)) 300 | keyshift_key = str(keyshift) + "_" + str(audio.device) 301 | if keyshift_key not in self.hann_window: 302 | 
self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to( 303 | audio.device 304 | ) 305 | fft = torch.stft( 306 | audio, 307 | n_fft=n_fft_new, 308 | hop_length=hop_length_new, 309 | win_length=win_length_new, 310 | window=self.hann_window[keyshift_key], 311 | center=center, 312 | return_complex=True, 313 | ) 314 | magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) 315 | if keyshift != 0: 316 | size = self.n_fft // 2 + 1 317 | resize = magnitude.size(1) 318 | if resize < size: 319 | magnitude = F.pad(magnitude, (0, 0, 0, size - resize)) 320 | magnitude = magnitude[:, :size, :] * self.win_length / win_length_new 321 | mel_output = torch.matmul(self.mel_basis, magnitude) 322 | if self.is_half == True: 323 | mel_output = mel_output.half() 324 | log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp)) 325 | return log_mel_spec 326 | 327 | 328 | class RMVPE: 329 | def __init__(self, model_path, is_half, device=None): 330 | self.resample_kernel = {} 331 | model = E2E(4, 1, (2, 2)) 332 | ckpt = torch.load(model_path, map_location="cpu") 333 | model.load_state_dict(ckpt) 334 | model.eval() 335 | if is_half == True: 336 | model = model.half() 337 | self.model = model 338 | self.resample_kernel = {} 339 | self.is_half = is_half 340 | if device is None: 341 | device = "cuda" if torch.cuda.is_available() else "cpu" 342 | self.device = device 343 | self.mel_extractor = MelSpectrogram( 344 | is_half, 128, 16000, 1024, 160, None, 30, 8000 345 | ).to(device) 346 | self.model = self.model.to(device) 347 | cents_mapping = 20 * np.arange(360) + 1997.3794084376191 348 | self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368 349 | 350 | def mel2hidden(self, mel): 351 | with torch.no_grad(): 352 | n_frames = mel.shape[-1] 353 | mel = F.pad( 354 | mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect" 355 | ) 356 | hidden = self.model(mel) 357 | return hidden[:, :n_frames] 358 | 359 | def decode(self, hidden, thred=0.03): 360 | cents_pred = self.to_local_average_cents(hidden, thred=thred) 361 | f0 = 10 * (2 ** (cents_pred / 1200)) 362 | f0[f0 == 10] = 0 363 | # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred]) 364 | return f0 365 | 366 | def infer_from_audio(self, audio, thred=0.03): 367 | audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0) 368 | # torch.cuda.synchronize() 369 | # t0=ttime() 370 | mel = self.mel_extractor(audio, center=True) 371 | # torch.cuda.synchronize() 372 | # t1=ttime() 373 | hidden = self.mel2hidden(mel) 374 | # torch.cuda.synchronize() 375 | # t2=ttime() 376 | hidden = hidden.squeeze(0).cpu().numpy() 377 | if self.is_half == True: 378 | hidden = hidden.astype("float32") 379 | f0 = self.decode(hidden, thred=thred) 380 | # torch.cuda.synchronize() 381 | # t3=ttime() 382 | # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0)) 383 | return f0 384 | 385 | def to_local_average_cents(self, salience, thred=0.05): 386 | # t0 = ttime() 387 | center = np.argmax(salience, axis=1) # 帧长#index 388 | salience = np.pad(salience, ((0, 0), (4, 4))) # 帧长,368 389 | # t1 = ttime() 390 | center += 4 391 | todo_salience = [] 392 | todo_cents_mapping = [] 393 | starts = center - 4 394 | ends = center + 5 395 | for idx in range(salience.shape[0]): 396 | todo_salience.append(salience[:, starts[idx] : ends[idx]][idx]) 397 | todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]]) 398 | # t2 = ttime() 399 | todo_salience = np.array(todo_salience) # 帧长,9 400 | todo_cents_mapping = 
np.array(todo_cents_mapping) # 帧长,9 401 | product_sum = np.sum(todo_salience * todo_cents_mapping, 1) 402 | weight_sum = np.sum(todo_salience, 1) # 帧长 403 | devided = product_sum / weight_sum # 帧长 404 | # t3 = ttime() 405 | maxx = np.max(salience, axis=1) # 帧长 406 | devided[maxx <= thred] = 0 407 | # t4 = ttime() 408 | # print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) 409 | return devided 410 | -------------------------------------------------------------------------------- /src/rvc.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import cpu_count 2 | from pathlib import Path 3 | 4 | import torch 5 | from fairseq import checkpoint_utils 6 | from scipy.io import wavfile 7 | 8 | from infer_pack.models import ( 9 | SynthesizerTrnMs256NSFsid, 10 | SynthesizerTrnMs256NSFsid_nono, 11 | SynthesizerTrnMs768NSFsid, 12 | SynthesizerTrnMs768NSFsid_nono, 13 | ) 14 | from my_utils import load_audio 15 | from vc_infer_pipeline import VC 16 | 17 | BASE_DIR = Path(__file__).resolve().parent.parent 18 | 19 | 20 | class Config: 21 | def __init__(self, device, is_half): 22 | self.device = device 23 | self.is_half = is_half 24 | self.n_cpu = 0 25 | self.gpu_name = None 26 | self.gpu_mem = None 27 | self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() 28 | 29 | def device_config(self) -> tuple: 30 | if torch.cuda.is_available(): 31 | i_device = int(self.device.split(":")[-1]) 32 | self.gpu_name = torch.cuda.get_device_name(i_device) 33 | if ( 34 | ("16" in self.gpu_name and "V100" not in self.gpu_name.upper()) 35 | or "P40" in self.gpu_name.upper() 36 | or "1060" in self.gpu_name 37 | or "1070" in self.gpu_name 38 | or "1080" in self.gpu_name 39 | ): 40 | print("16 series/10 series P40 forced single precision") 41 | self.is_half = False 42 | for config_file in ["32k.json", "40k.json", "48k.json"]: 43 | with open(BASE_DIR / "src" / "configs" / config_file, "r") as f: 44 | strr = f.read().replace("true", "false") 45 | with open(BASE_DIR / "src" / "configs" / config_file, "w") as f: 46 | f.write(strr) 47 | with open(BASE_DIR / "src" / "trainset_preprocess_pipeline_print.py", "r") as f: 48 | strr = f.read().replace("3.7", "3.0") 49 | with open(BASE_DIR / "src" / "trainset_preprocess_pipeline_print.py", "w") as f: 50 | f.write(strr) 51 | else: 52 | self.gpu_name = None 53 | self.gpu_mem = int( 54 | torch.cuda.get_device_properties(i_device).total_memory 55 | / 1024 56 | / 1024 57 | / 1024 58 | + 0.4 59 | ) 60 | if self.gpu_mem <= 4: 61 | with open(BASE_DIR / "src" / "trainset_preprocess_pipeline_print.py", "r") as f: 62 | strr = f.read().replace("3.7", "3.0") 63 | with open(BASE_DIR / "src" / "trainset_preprocess_pipeline_print.py", "w") as f: 64 | f.write(strr) 65 | elif torch.backends.mps.is_available(): 66 | print("No supported N-card found, use MPS for inference") 67 | self.device = "mps" 68 | else: 69 | print("No supported N-card found, use CPU for inference") 70 | self.device = "cpu" 71 | self.is_half = True 72 | 73 | if self.n_cpu == 0: 74 | self.n_cpu = cpu_count() 75 | 76 | if self.is_half: 77 | # 6G memory config 78 | x_pad = 3 79 | x_query = 10 80 | x_center = 60 81 | x_max = 65 82 | else: 83 | # 5G memory config 84 | x_pad = 1 85 | x_query = 6 86 | x_center = 38 87 | x_max = 41 88 | 89 | if self.gpu_mem != None and self.gpu_mem <= 4: 90 | x_pad = 1 91 | x_query = 5 92 | x_center = 30 93 | x_max = 32 94 | 95 | return x_pad, x_query, x_center, x_max 96 | 97 | 98 | def load_hubert(device, is_half, 
model_path): 99 | models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task([model_path], suffix='', ) 100 | hubert = models[0] 101 | hubert = hubert.to(device) 102 | 103 | if is_half: 104 | hubert = hubert.half() 105 | else: 106 | hubert = hubert.float() 107 | 108 | hubert.eval() 109 | return hubert 110 | 111 | 112 | def get_vc(device, is_half, config, model_path): 113 | cpt = torch.load(model_path, map_location='cpu') 114 | if "config" not in cpt or "weight" not in cpt: 115 | raise ValueError(f'Incorrect format for {model_path}. Use a voice model trained using RVC v2 instead.') 116 | 117 | tgt_sr = cpt["config"][-1] 118 | cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] 119 | if_f0 = cpt.get("f0", 1) 120 | version = cpt.get("version", "v1") 121 | 122 | if version == "v1": 123 | if if_f0 == 1: 124 | net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half) 125 | else: 126 | net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) 127 | elif version == "v2": 128 | if if_f0 == 1: 129 | net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=is_half) 130 | else: 131 | net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) 132 | 133 | del net_g.enc_q 134 | print(net_g.load_state_dict(cpt["weight"], strict=False)) 135 | net_g.eval().to(device) 136 | 137 | if is_half: 138 | net_g = net_g.half() 139 | else: 140 | net_g = net_g.float() 141 | 142 | vc = VC(tgt_sr, config) 143 | return cpt, version, net_g, tgt_sr, vc 144 | 145 | 146 | def rvc_infer(index_path, index_rate, input_path, output_path, pitch_change, f0_method, cpt, version, net_g, filter_radius, tgt_sr, rms_mix_rate, protect, crepe_hop_length, vc, hubert_model): 147 | audio = load_audio(input_path, 16000) 148 | times = [0, 0, 0] 149 | if_f0 = cpt.get('f0', 1) 150 | audio_opt = vc.pipeline(hubert_model, net_g, 0, audio, input_path, times, pitch_change, f0_method, index_path, index_rate, if_f0, filter_radius, tgt_sr, 0, rms_mix_rate, version, protect, crepe_hop_length) 151 | wavfile.write(output_path, tgt_sr, audio_opt) 152 | -------------------------------------------------------------------------------- /src/trainset_preprocess_pipeline_print.py: -------------------------------------------------------------------------------- 1 | import sys, os, multiprocessing 2 | from scipy import signal 3 | 4 | now_dir = os.getcwd() 5 | sys.path.append(now_dir) 6 | 7 | inp_root = sys.argv[1] 8 | sr = int(sys.argv[2]) 9 | n_p = int(sys.argv[3]) 10 | exp_dir = sys.argv[4] 11 | noparallel = sys.argv[5] == "True" 12 | import numpy as np, os, traceback 13 | from slicer2 import Slicer 14 | import librosa, traceback 15 | from scipy.io import wavfile 16 | import multiprocessing 17 | from my_utils import load_audio 18 | import tqdm 19 | 20 | DoFormant = False 21 | Quefrency = 1.0 22 | Timbre = 1.0 23 | 24 | mutex = multiprocessing.Lock() 25 | f = open("%s/preprocess.log" % exp_dir, "a+") 26 | 27 | 28 | def println(strr): 29 | mutex.acquire() 30 | print(strr) 31 | f.write("%s\n" % strr) 32 | f.flush() 33 | mutex.release() 34 | 35 | 36 | class PreProcess: 37 | def __init__(self, sr, exp_dir): 38 | self.slicer = Slicer( 39 | sr=sr, 40 | threshold=-42, 41 | min_length=1500, 42 | min_interval=400, 43 | hop_size=15, 44 | max_sil_kept=500, 45 | ) 46 | self.sr = sr 47 | self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr) 48 | self.per = 3.0 49 | self.overlap = 0.3 50 | self.tail = self.per + self.overlap 51 | self.max = 0.9 52 | self.alpha = 0.75 53 | self.exp_dir = exp_dir 54 | self.gt_wavs_dir = "%s/0_gt_wavs" % 
exp_dir
 55 |         self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir
 56 |         os.makedirs(self.exp_dir, exist_ok=True)
 57 |         os.makedirs(self.gt_wavs_dir, exist_ok=True)
 58 |         os.makedirs(self.wavs16k_dir, exist_ok=True)
 59 | 
 60 |     def norm_write(self, tmp_audio, idx0, idx1):
 61 |         tmp_max = np.abs(tmp_audio).max()
 62 |         if tmp_max > 2.5:
 63 |             print("%s-%s-%s-filtered" % (idx0, idx1, tmp_max))
 64 |             return
 65 |         tmp_audio = (tmp_audio / tmp_max * (self.max * self.alpha)) + (
 66 |             1 - self.alpha
 67 |         ) * tmp_audio
 68 |         wavfile.write(
 69 |             "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1),
 70 |             self.sr,
 71 |             tmp_audio.astype(np.float32),
 72 |         )
 73 |         tmp_audio = librosa.resample(
 74 |             tmp_audio, orig_sr=self.sr, target_sr=16000
 75 |         )  # , res_type="soxr_vhq"
 76 |         wavfile.write(
 77 |             "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
 78 |             16000,
 79 |             tmp_audio.astype(np.float32),
 80 |         )
 81 | 
 82 |     def pipeline(self, path, idx0):
 83 |         try:
 84 |             audio = load_audio(path, self.sr)  # my_utils.load_audio only accepts (file, sr); the unused formant-shift arguments were dropped
 85 |             # a zero-phase digital filter causes pre-ringing noise...
 86 |             # audio = signal.filtfilt(self.bh, self.ah, audio)
 87 |             audio = signal.lfilter(self.bh, self.ah, audio)
 88 | 
 89 |             idx1 = 0
 90 |             for audio in self.slicer.slice(audio):
 91 |                 i = 0
 92 |                 while 1:
 93 |                     start = int(self.sr * (self.per - self.overlap) * i)
 94 |                     i += 1
 95 |                     if len(audio[start:]) > self.tail * self.sr:
 96 |                         tmp_audio = audio[start : start + int(self.per * self.sr)]
 97 |                         self.norm_write(tmp_audio, idx0, idx1)
 98 |                         idx1 += 1
 99 |                     else:
100 |                         tmp_audio = audio[start:]
101 |                         idx1 += 1
102 |                         break
103 |                 self.norm_write(tmp_audio, idx0, idx1)
104 |             # println("%s->Suc." % path)
105 |         except:
106 |             println("%s->%s" % (path, traceback.format_exc()))
107 | 
108 |     def pipeline_mp(self, infos, thread_n):
109 |         for path, idx0 in tqdm.tqdm(
110 |             infos, position=thread_n, leave=True, desc="thread:%s" % thread_n
111 |         ):
112 |             self.pipeline(path, idx0)
113 | 
114 |     def pipeline_mp_inp_dir(self, inp_root, n_p):
115 |         try:
116 |             infos = [
117 |                 ("%s/%s" % (inp_root, name), idx)
118 |                 for idx, name in enumerate(sorted(list(os.listdir(inp_root))))
119 |             ]
120 |             if noparallel:
121 |                 for i in range(n_p):
122 |                     self.pipeline_mp(infos[i::n_p], i)  # pipeline_mp also needs the thread index
123 |             else:
124 |                 ps = []
125 |                 for i in range(n_p):
126 |                     p = multiprocessing.Process(
127 |                         target=self.pipeline_mp, args=(infos[i::n_p], i)
128 |                     )
129 |                     ps.append(p)
130 |                     p.start()
131 |                 for i in range(n_p):
132 |                     ps[i].join()
133 |         except:
134 |             println("Fail. %s" % traceback.format_exc())
135 | 
136 | 
137 | def preprocess_trainset(inp_root, sr, n_p, exp_dir):
138 |     pp = PreProcess(sr, exp_dir)
139 |     println("start preprocess")
140 |     println(sys.argv)
141 |     pp.pipeline_mp_inp_dir(inp_root, n_p)
142 |     println("end preprocess")
143 | 
144 | 
145 | if __name__ == "__main__":
146 |     preprocess_trainset(inp_root, sr, n_p, exp_dir)
147 | 
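Note: the preprocessing script above is driven entirely by positional sys.argv values (inp_root, sr, n_p, exp_dir, noparallel) and also needs a `slicer2` module importable from src/. A purely illustrative invocation, with placeholder paths and a 40000 Hz target rate, might look like:

    python src/trainset_preprocess_pipeline_print.py /data/raw_vocals 40000 2 /data/exp1 False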
%s" % traceback.format_exc()) 135 | 136 | 137 | def preprocess_trainset(inp_root, sr, n_p, exp_dir): 138 | pp = PreProcess(sr, exp_dir) 139 | println("start preprocess") 140 | println(sys.argv) 141 | pp.pipeline_mp_inp_dir(inp_root, n_p) 142 | println("end preprocess") 143 | 144 | 145 | if __name__ == "__main__": 146 | preprocess_trainset(inp_root, sr, n_p, exp_dir) 147 | -------------------------------------------------------------------------------- /src/vc_infer_pipeline.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | from time import time as ttime 3 | 4 | import faiss 5 | import librosa 6 | import numpy as np 7 | import os 8 | import parselmouth 9 | import pyworld 10 | import sys 11 | import torch 12 | import torch.nn.functional as F 13 | import torchcrepe 14 | import traceback 15 | from scipy import signal 16 | from torch import Tensor 17 | 18 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 19 | now_dir = os.path.join(BASE_DIR, 'src') 20 | sys.path.append(now_dir) 21 | 22 | bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) 23 | 24 | input_audio_path2wav = {} 25 | 26 | 27 | @lru_cache(maxsize=None) 28 | def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period): 29 | audio = input_audio_path2wav[input_audio_path] 30 | f0, t = pyworld.harvest( 31 | audio, 32 | fs=fs, 33 | f0_ceil=f0max, 34 | f0_floor=f0min, 35 | frame_period=frame_period, 36 | ) 37 | f0 = pyworld.stonemask(audio, f0, t, fs) 38 | return f0 39 | 40 | 41 | def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate是2的占比 42 | # print(data1.max(),data2.max()) 43 | rms1 = librosa.feature.rms( 44 | y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2 45 | ) # 每半秒一个点 46 | rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2) 47 | rms1 = torch.from_numpy(rms1) 48 | rms1 = F.interpolate( 49 | rms1.unsqueeze(0), size=data2.shape[0], mode="linear" 50 | ).squeeze() 51 | rms2 = torch.from_numpy(rms2) 52 | rms2 = F.interpolate( 53 | rms2.unsqueeze(0), size=data2.shape[0], mode="linear" 54 | ).squeeze() 55 | rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6) 56 | data2 *= ( 57 | torch.pow(rms1, torch.tensor(1 - rate)) 58 | * torch.pow(rms2, torch.tensor(rate - 1)) 59 | ).numpy() 60 | return data2 61 | 62 | 63 | class VC(object): 64 | def __init__(self, tgt_sr, config): 65 | self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = ( 66 | config.x_pad, 67 | config.x_query, 68 | config.x_center, 69 | config.x_max, 70 | config.is_half, 71 | ) 72 | self.sr = 16000 # hubert输入采样率 73 | self.window = 160 # 每帧点数 74 | self.t_pad = self.sr * self.x_pad # 每条前后pad时间 75 | self.t_pad_tgt = tgt_sr * self.x_pad 76 | self.t_pad2 = self.t_pad * 2 77 | self.t_query = self.sr * self.x_query # 查询切点前后查询时间 78 | self.t_center = self.sr * self.x_center # 查询切点位置 79 | self.t_max = self.sr * self.x_max # 免查询时长阈值 80 | self.device = config.device 81 | 82 | # Fork Feature: Get the best torch device to use for f0 algorithms that require a torch device. Will return the type (torch.device) 83 | def get_optimal_torch_device(self, index: int = 0) -> torch.device: 84 | # Get cuda device 85 | if torch.cuda.is_available(): 86 | return torch.device( 87 | f"cuda:{index % torch.cuda.device_count()}" 88 | ) # Very fast 89 | elif torch.backends.mps.is_available(): 90 | return torch.device("mps") 91 | # Insert an else here to grab "xla" devices if available. TO DO later. 
Requires the torch_xla.core.xla_model library 92 | # Else wise return the "cpu" as a torch device, 93 | return torch.device("cpu") 94 | 95 | # Fork Feature: Compute f0 with the crepe method 96 | def get_f0_crepe_computation( 97 | self, 98 | x, 99 | f0_min, 100 | f0_max, 101 | p_len, 102 | hop_length=160, # 512 before. Hop length changes the speed that the voice jumps to a different dramatic pitch. Lower hop lengths means more pitch accuracy but longer inference time. 103 | model="full", # Either use crepe-tiny "tiny" or crepe "full". Default is full 104 | ): 105 | x = x.astype( 106 | np.float32 107 | ) # fixes the F.conv2D exception. We needed to convert double to float. 108 | x /= np.quantile(np.abs(x), 0.999) 109 | torch_device = self.get_optimal_torch_device() 110 | audio = torch.from_numpy(x).to(torch_device, copy=True) 111 | audio = torch.unsqueeze(audio, dim=0) 112 | if audio.ndim == 2 and audio.shape[0] > 1: 113 | audio = torch.mean(audio, dim=0, keepdim=True).detach() 114 | audio = audio.detach() 115 | print("Initiating prediction with a crepe_hop_length of: " + str(hop_length)) 116 | pitch: Tensor = torchcrepe.predict( 117 | audio, 118 | self.sr, 119 | hop_length, 120 | f0_min, 121 | f0_max, 122 | model, 123 | batch_size=hop_length * 2, 124 | device=torch_device, 125 | pad=True, 126 | ) 127 | p_len = p_len or x.shape[0] // hop_length 128 | # Resize the pitch for final f0 129 | source = np.array(pitch.squeeze(0).cpu().float().numpy()) 130 | source[source < 0.001] = np.nan 131 | target = np.interp( 132 | np.arange(0, len(source) * p_len, len(source)) / p_len, 133 | np.arange(0, len(source)), 134 | source, 135 | ) 136 | f0 = np.nan_to_num(target) 137 | return f0 # Resized f0 138 | 139 | def get_f0_official_crepe_computation( 140 | self, 141 | x, 142 | f0_min, 143 | f0_max, 144 | model="full", 145 | ): 146 | # Pick a batch size that doesn't cause memory errors on your gpu 147 | batch_size = 512 148 | # Compute pitch using first gpu 149 | audio = torch.tensor(np.copy(x))[None].float() 150 | f0, pd = torchcrepe.predict( 151 | audio, 152 | self.sr, 153 | self.window, 154 | f0_min, 155 | f0_max, 156 | model, 157 | batch_size=batch_size, 158 | device=self.device, 159 | return_periodicity=True, 160 | ) 161 | pd = torchcrepe.filter.median(pd, 3) 162 | f0 = torchcrepe.filter.mean(f0, 3) 163 | f0[pd < 0.1] = 0 164 | f0 = f0[0].cpu().numpy() 165 | return f0 166 | 167 | # Fork Feature: Compute pYIN f0 method 168 | def get_f0_pyin_computation(self, x, f0_min, f0_max): 169 | y, sr = librosa.load("saudio/Sidney.wav", self.sr, mono=True) 170 | f0, _, _ = librosa.pyin(y, sr=self.sr, fmin=f0_min, fmax=f0_max) 171 | f0 = f0[1:] # Get rid of extra first frame 172 | return f0 173 | 174 | # Fork Feature: Acquire median hybrid f0 estimation calculation 175 | def get_f0_hybrid_computation( 176 | self, 177 | methods_str, 178 | input_audio_path, 179 | x, 180 | f0_min, 181 | f0_max, 182 | p_len, 183 | filter_radius, 184 | crepe_hop_length, 185 | time_step, 186 | ): 187 | # Get various f0 methods from input to use in the computation stack 188 | s = methods_str 189 | s = s.split("hybrid")[1] 190 | s = s.replace("[", "").replace("]", "") 191 | methods = s.split("+") 192 | f0_computation_stack = [] 193 | 194 | print("Calculating f0 pitch estimations for methods: %s" % str(methods)) 195 | x = x.astype(np.float32) 196 | x /= np.quantile(np.abs(x), 0.999) 197 | # Get f0 calculations for all methods specified 198 | for method in methods: 199 | f0 = None 200 | if method == "pm": 201 | f0 = ( 202 | parselmouth.Sound(x, 
self.sr) 203 | .to_pitch_ac( 204 | time_step=time_step / 1000, 205 | voicing_threshold=0.6, 206 | pitch_floor=f0_min, 207 | pitch_ceiling=f0_max, 208 | ) 209 | .selected_array["frequency"] 210 | ) 211 | pad_size = (p_len - len(f0) + 1) // 2 212 | if pad_size > 0 or p_len - len(f0) - pad_size > 0: 213 | f0 = np.pad( 214 | f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" 215 | ) 216 | elif method == "crepe": 217 | f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max) 218 | f0 = f0[1:] # Get rid of extra first frame 219 | elif method == "crepe-tiny": 220 | f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny") 221 | f0 = f0[1:] # Get rid of extra first frame 222 | elif method == "mangio-crepe": 223 | f0 = self.get_f0_crepe_computation( 224 | x, f0_min, f0_max, p_len, crepe_hop_length 225 | ) 226 | elif method == "mangio-crepe-tiny": 227 | f0 = self.get_f0_crepe_computation( 228 | x, f0_min, f0_max, p_len, crepe_hop_length, "tiny" 229 | ) 230 | elif method == "harvest": 231 | f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10) 232 | if filter_radius > 2: 233 | f0 = signal.medfilt(f0, 3) 234 | f0 = f0[1:] # Get rid of first frame. 235 | elif method == "dio": # Potentially buggy? 236 | f0, t = pyworld.dio( 237 | x.astype(np.double), 238 | fs=self.sr, 239 | f0_ceil=f0_max, 240 | f0_floor=f0_min, 241 | frame_period=10, 242 | ) 243 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr) 244 | f0 = signal.medfilt(f0, 3) 245 | f0 = f0[1:] 246 | # elif method == "pyin": Not Working just yet 247 | # f0 = self.get_f0_pyin_computation(x, f0_min, f0_max) 248 | # Push method to the stack 249 | f0_computation_stack.append(f0) 250 | 251 | for fc in f0_computation_stack: 252 | print(len(fc)) 253 | 254 | print("Calculating hybrid median f0 from the stack of: %s" % str(methods)) 255 | f0_median_hybrid = None 256 | if len(f0_computation_stack) == 1: 257 | f0_median_hybrid = f0_computation_stack[0] 258 | else: 259 | f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0) 260 | return f0_median_hybrid 261 | 262 | def get_f0( 263 | self, 264 | input_audio_path, 265 | x, 266 | p_len, 267 | f0_up_key, 268 | f0_method, 269 | filter_radius, 270 | crepe_hop_length, 271 | inp_f0=None, 272 | ): 273 | global input_audio_path2wav 274 | time_step = self.window / self.sr * 1000 275 | f0_min = 50 276 | f0_max = 1100 277 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 278 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 279 | if f0_method == "pm": 280 | f0 = ( 281 | parselmouth.Sound(x, self.sr) 282 | .to_pitch_ac( 283 | time_step=time_step / 1000, 284 | voicing_threshold=0.6, 285 | pitch_floor=f0_min, 286 | pitch_ceiling=f0_max, 287 | ) 288 | .selected_array["frequency"] 289 | ) 290 | pad_size = (p_len - len(f0) + 1) // 2 291 | if pad_size > 0 or p_len - len(f0) - pad_size > 0: 292 | f0 = np.pad( 293 | f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" 294 | ) 295 | elif f0_method == "harvest": 296 | input_audio_path2wav[input_audio_path] = x.astype(np.double) 297 | f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10) 298 | if filter_radius > 2: 299 | f0 = signal.medfilt(f0, 3) 300 | elif f0_method == "dio": # Potentially Buggy? 
301 | f0, t = pyworld.dio( 302 | x.astype(np.double), 303 | fs=self.sr, 304 | f0_ceil=f0_max, 305 | f0_floor=f0_min, 306 | frame_period=10, 307 | ) 308 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr) 309 | f0 = signal.medfilt(f0, 3) 310 | elif f0_method == "crepe": 311 | f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max) 312 | elif f0_method == "crepe-tiny": 313 | f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny") 314 | elif f0_method == "mangio-crepe": 315 | f0 = self.get_f0_crepe_computation( 316 | x, f0_min, f0_max, p_len, crepe_hop_length 317 | ) 318 | elif f0_method == "mangio-crepe-tiny": 319 | f0 = self.get_f0_crepe_computation( 320 | x, f0_min, f0_max, p_len, crepe_hop_length, "tiny" 321 | ) 322 | elif f0_method == "rmvpe": 323 | if hasattr(self, "model_rmvpe") == False: 324 | from rmvpe import RMVPE 325 | 326 | self.model_rmvpe = RMVPE( 327 | os.path.join(BASE_DIR, 'rvc_models', 'rmvpe.pt'), is_half=self.is_half, device=self.device 328 | ) 329 | f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) 330 | 331 | elif "hybrid" in f0_method: 332 | # Perform hybrid median pitch estimation 333 | input_audio_path2wav[input_audio_path] = x.astype(np.double) 334 | f0 = self.get_f0_hybrid_computation( 335 | f0_method, 336 | input_audio_path, 337 | x, 338 | f0_min, 339 | f0_max, 340 | p_len, 341 | filter_radius, 342 | crepe_hop_length, 343 | time_step, 344 | ) 345 | 346 | f0 *= pow(2, f0_up_key / 12) 347 | # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) 348 | tf0 = self.sr // self.window # 每秒f0点数 349 | if inp_f0 is not None: 350 | delta_t = np.round( 351 | (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 352 | ).astype("int16") 353 | replace_f0 = np.interp( 354 | list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1] 355 | ) 356 | shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0] 357 | f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[ 358 | :shape 359 | ] 360 | # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) 361 | f0bak = f0.copy() 362 | f0_mel = 1127 * np.log(1 + f0 / 700) 363 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( 364 | f0_mel_max - f0_mel_min 365 | ) + 1 366 | f0_mel[f0_mel <= 1] = 1 367 | f0_mel[f0_mel > 255] = 255 368 | f0_coarse = np.rint(f0_mel).astype(int) 369 | 370 | return f0_coarse, f0bak # 1-0 371 | 372 | def vc( 373 | self, 374 | model, 375 | net_g, 376 | sid, 377 | audio0, 378 | pitch, 379 | pitchf, 380 | times, 381 | index, 382 | big_npy, 383 | index_rate, 384 | version, 385 | protect, 386 | ): # ,file_index,file_big_npy 387 | feats = torch.from_numpy(audio0) 388 | if self.is_half: 389 | feats = feats.half() 390 | else: 391 | feats = feats.float() 392 | if feats.dim() == 2: # double channels 393 | feats = feats.mean(-1) 394 | assert feats.dim() == 1, feats.dim() 395 | feats = feats.view(1, -1) 396 | padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) 397 | 398 | inputs = { 399 | "source": feats.to(self.device), 400 | "padding_mask": padding_mask, 401 | "output_layer": 9 if version == "v1" else 12, 402 | } 403 | t0 = ttime() 404 | with torch.no_grad(): 405 | logits = model.extract_features(**inputs) 406 | feats = model.final_proj(logits[0]) if version == "v1" else logits[0] 407 | if protect < 0.5 and pitch != None and pitchf != None: 408 | feats0 = feats.clone() 409 | if ( 410 | isinstance(index, type(None)) == False 411 | and isinstance(big_npy, type(None)) == 
False 412 | and index_rate != 0 413 | ): 414 | npy = feats[0].cpu().numpy() 415 | if self.is_half: 416 | npy = npy.astype("float32") 417 | 418 | # _, I = index.search(npy, 1) 419 | # npy = big_npy[I.squeeze()] 420 | 421 | score, ix = index.search(npy, k=8) 422 | weight = np.square(1 / score) 423 | weight /= weight.sum(axis=1, keepdims=True) 424 | npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) 425 | 426 | if self.is_half: 427 | npy = npy.astype("float16") 428 | feats = ( 429 | torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate 430 | + (1 - index_rate) * feats 431 | ) 432 | 433 | feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) 434 | if protect < 0.5 and pitch != None and pitchf != None: 435 | feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute( 436 | 0, 2, 1 437 | ) 438 | t1 = ttime() 439 | p_len = audio0.shape[0] // self.window 440 | if feats.shape[1] < p_len: 441 | p_len = feats.shape[1] 442 | if pitch != None and pitchf != None: 443 | pitch = pitch[:, :p_len] 444 | pitchf = pitchf[:, :p_len] 445 | 446 | if protect < 0.5 and pitch != None and pitchf != None: 447 | pitchff = pitchf.clone() 448 | pitchff[pitchf > 0] = 1 449 | pitchff[pitchf < 1] = protect 450 | pitchff = pitchff.unsqueeze(-1) 451 | feats = feats * pitchff + feats0 * (1 - pitchff) 452 | feats = feats.to(feats0.dtype) 453 | p_len = torch.tensor([p_len], device=self.device).long() 454 | with torch.no_grad(): 455 | if pitch != None and pitchf != None: 456 | audio1 = ( 457 | (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]) 458 | .data.cpu() 459 | .float() 460 | .numpy() 461 | ) 462 | else: 463 | audio1 = ( 464 | (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy() 465 | ) 466 | del feats, p_len, padding_mask 467 | if torch.cuda.is_available(): 468 | torch.cuda.empty_cache() 469 | t2 = ttime() 470 | times[0] += t1 - t0 471 | times[2] += t2 - t1 472 | return audio1 473 | 474 | def pipeline( 475 | self, 476 | model, 477 | net_g, 478 | sid, 479 | audio, 480 | input_audio_path, 481 | times, 482 | f0_up_key, 483 | f0_method, 484 | file_index, 485 | # file_big_npy, 486 | index_rate, 487 | if_f0, 488 | filter_radius, 489 | tgt_sr, 490 | resample_sr, 491 | rms_mix_rate, 492 | version, 493 | protect, 494 | crepe_hop_length, 495 | f0_file=None, 496 | ): 497 | if ( 498 | file_index != "" 499 | # and file_big_npy != "" 500 | # and os.path.exists(file_big_npy) == True 501 | and os.path.exists(file_index) == True 502 | and index_rate != 0 503 | ): 504 | try: 505 | index = faiss.read_index(file_index) 506 | # big_npy = np.load(file_big_npy) 507 | big_npy = index.reconstruct_n(0, index.ntotal) 508 | except: 509 | traceback.print_exc() 510 | index = big_npy = None 511 | else: 512 | index = big_npy = None 513 | audio = signal.filtfilt(bh, ah, audio) 514 | audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") 515 | opt_ts = [] 516 | if audio_pad.shape[0] > self.t_max: 517 | audio_sum = np.zeros_like(audio) 518 | for i in range(self.window): 519 | audio_sum += audio_pad[i : i - self.window] 520 | for t in range(self.t_center, audio.shape[0], self.t_center): 521 | opt_ts.append( 522 | t 523 | - self.t_query 524 | + np.where( 525 | np.abs(audio_sum[t - self.t_query : t + self.t_query]) 526 | == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min() 527 | )[0][0] 528 | ) 529 | s = 0 530 | audio_opt = [] 531 | t = None 532 | t1 = ttime() 533 | audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") 534 | 
p_len = audio_pad.shape[0] // self.window 535 | inp_f0 = None 536 | if hasattr(f0_file, "name") == True: 537 | try: 538 | with open(f0_file.name, "r") as f: 539 | lines = f.read().strip("\n").split("\n") 540 | inp_f0 = [] 541 | for line in lines: 542 | inp_f0.append([float(i) for i in line.split(",")]) 543 | inp_f0 = np.array(inp_f0, dtype="float32") 544 | except: 545 | traceback.print_exc() 546 | sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() 547 | pitch, pitchf = None, None 548 | if if_f0 == 1: 549 | pitch, pitchf = self.get_f0( 550 | input_audio_path, 551 | audio_pad, 552 | p_len, 553 | f0_up_key, 554 | f0_method, 555 | filter_radius, 556 | crepe_hop_length, 557 | inp_f0, 558 | ) 559 | pitch = pitch[:p_len] 560 | pitchf = pitchf[:p_len] 561 | if self.device == "mps": 562 | pitchf = pitchf.astype(np.float32) 563 | pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() 564 | pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() 565 | t2 = ttime() 566 | times[1] += t2 - t1 567 | for t in opt_ts: 568 | t = t // self.window * self.window 569 | if if_f0 == 1: 570 | audio_opt.append( 571 | self.vc( 572 | model, 573 | net_g, 574 | sid, 575 | audio_pad[s : t + self.t_pad2 + self.window], 576 | pitch[:, s // self.window : (t + self.t_pad2) // self.window], 577 | pitchf[:, s // self.window : (t + self.t_pad2) // self.window], 578 | times, 579 | index, 580 | big_npy, 581 | index_rate, 582 | version, 583 | protect, 584 | )[self.t_pad_tgt : -self.t_pad_tgt] 585 | ) 586 | else: 587 | audio_opt.append( 588 | self.vc( 589 | model, 590 | net_g, 591 | sid, 592 | audio_pad[s : t + self.t_pad2 + self.window], 593 | None, 594 | None, 595 | times, 596 | index, 597 | big_npy, 598 | index_rate, 599 | version, 600 | protect, 601 | )[self.t_pad_tgt : -self.t_pad_tgt] 602 | ) 603 | s = t 604 | if if_f0 == 1: 605 | audio_opt.append( 606 | self.vc( 607 | model, 608 | net_g, 609 | sid, 610 | audio_pad[t:], 611 | pitch[:, t // self.window :] if t is not None else pitch, 612 | pitchf[:, t // self.window :] if t is not None else pitchf, 613 | times, 614 | index, 615 | big_npy, 616 | index_rate, 617 | version, 618 | protect, 619 | )[self.t_pad_tgt : -self.t_pad_tgt] 620 | ) 621 | else: 622 | audio_opt.append( 623 | self.vc( 624 | model, 625 | net_g, 626 | sid, 627 | audio_pad[t:], 628 | None, 629 | None, 630 | times, 631 | index, 632 | big_npy, 633 | index_rate, 634 | version, 635 | protect, 636 | )[self.t_pad_tgt : -self.t_pad_tgt] 637 | ) 638 | audio_opt = np.concatenate(audio_opt) 639 | if rms_mix_rate != 1: 640 | audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate) 641 | if resample_sr >= 16000 and tgt_sr != resample_sr: 642 | audio_opt = librosa.resample( 643 | audio_opt, orig_sr=tgt_sr, target_sr=resample_sr 644 | ) 645 | audio_max = np.abs(audio_opt).max() / 0.99 646 | max_int16 = 32768 647 | if audio_max > 1: 648 | max_int16 /= audio_max 649 | audio_opt = (audio_opt * max_int16).astype(np.int16) 650 | del pitch, pitchf, sid 651 | if torch.cuda.is_available(): 652 | torch.cuda.empty_cache() 653 | return audio_opt 654 | -------------------------------------------------------------------------------- /src/webui.py: -------------------------------------------------------------------------------- 1 | import os 2 | import urllib.request 3 | import zipfile 4 | import torch 5 | from rvc import rvc_infer, load_hubert, get_vc, Config 6 | import urllib.parse 7 | import urllib.request 8 | import gradio as gr 9 | import logging 10 | 11 | 
logging.basicConfig(level=logging.DEBUG) 12 | logger = logging.getLogger(__name__) 13 | 14 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 15 | rvc_models_dir = os.path.join(BASE_DIR, 'rvc_models') 16 | output_dir = os.path.join(BASE_DIR, 'voice_output') 17 | 18 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 19 | is_half = False if device == "cpu" else True 20 | 21 | def get_current_models(models_dir): 22 | models_list = os.listdir(models_dir) 23 | items_to_remove = ['hubert_base.pt', 'MODELS.txt', 'public_models.json', 'rmvpe.pt'] 24 | return [item for item in models_list if item not in items_to_remove] 25 | 26 | def update_models_list(): 27 | models_l = get_current_models(rvc_models_dir) 28 | return gr.Dropdown(choices=models_l, value=models_l[0] if models_l else None) 29 | 30 | def extract_zip(extraction_folder, zip_name): 31 | with zipfile.ZipFile(zip_name, 'r') as zip_ref: 32 | zip_ref.extractall(extraction_folder) 33 | os.remove(zip_name) 34 | 35 | def download_online_model(url, dir_name, progress=gr.Progress()): 36 | try: 37 | # Parse the URL and extract the filename 38 | parsed_url = urllib.parse.urlparse(url) 39 | zip_name = os.path.basename(parsed_url.path) 40 | 41 | # Remove any query parameters from the filename 42 | zip_name = zip_name.split('?')[0] 43 | 44 | extraction_folder = os.path.join(rvc_models_dir, dir_name) 45 | if os.path.exists(extraction_folder): 46 | raise gr.Error(f'Voice model directory {dir_name} already exists!') 47 | 48 | # Custom opener to report download progress 49 | class DownloadProgressBar(): 50 | def __init__(self): 51 | self.pbar = None 52 | 53 | def __call__(self, block_num, block_size, total_size): 54 | if not self.pbar: 55 | self.pbar = 0 56 | downloaded = block_num * block_size 57 | if downloaded < total_size: 58 | progress(downloaded / total_size, desc="Downloading...") 59 | else: 60 | progress(1.0, desc="Download complete") 61 | 62 | # Download the file with progress bar 63 | urllib.request.urlretrieve(url, zip_name, DownloadProgressBar()) 64 | 65 | progress(0, desc="Extracting...") 66 | extract_zip(extraction_folder, zip_name) 67 | progress(1.0, desc="Extraction complete") 68 | 69 | return f'[+] {dir_name} Model successfully downloaded and extracted!' 70 | except Exception as e: 71 | raise gr.Error(str(e)) 72 | 73 | def upload_local_model(zip_file, dir_name, progress=gr.Progress()): 74 | try: 75 | extraction_folder = os.path.join(rvc_models_dir, dir_name) 76 | if os.path.exists(extraction_folder): 77 | raise gr.Error(f'Voice model directory {dir_name} already exists!') 78 | 79 | extract_zip(extraction_folder, zip_file.name) 80 | return f'[+] {dir_name} Model successfully uploaded!' 
81 | except Exception as e: 82 | return f"Error: {str(e)}" 83 | 84 | def load_rvc_model(rvc_model): 85 | model_dir = os.path.join(rvc_models_dir, rvc_model) 86 | model_path = os.path.join(model_dir, "model.pth") 87 | if not os.path.exists(model_path): 88 | pth_files = [f for f in os.listdir(model_dir) if f.endswith('.pth')] 89 | if pth_files: 90 | model_path = os.path.join(model_dir, pth_files[0]) 91 | else: 92 | raise FileNotFoundError(f"No .pth file found in RVC model directory: {model_dir}") 93 | 94 | config = Config(device, is_half) 95 | return get_vc(device, is_half, config, model_path) 96 | 97 | def voice_conversion(input_audio, rvc_model, pitch, f0_method, index_rate, filter_radius, rms_mix_rate, protect): 98 | try: 99 | hubert_model = load_hubert(device, is_half, os.path.join(rvc_models_dir, "hubert_base.pt")) 100 | cpt, version, net_g, tgt_sr, vc = load_rvc_model(rvc_model) 101 | 102 | output_filename = os.path.join(output_dir, f"converted_{os.path.basename(input_audio)}") 103 | output_filename = os.path.splitext(output_filename)[0] + '.wav' 104 | os.makedirs(output_dir, exist_ok=True) 105 | 106 | rvc_infer("", index_rate, input_audio, output_filename, pitch, f0_method, cpt, version, net_g, 107 | filter_radius, tgt_sr, rms_mix_rate, protect, 160, vc, hubert_model) 108 | 109 | return output_filename 110 | except Exception as e: 111 | raise gr.Error(f"Voice conversion failed: {str(e)}") 112 | 113 | if __name__ == '__main__': 114 | voice_models = get_current_models(rvc_models_dir) 115 | 116 | with gr.Blocks(title='RVC Voice Changer') as app: 117 | with gr.Tab("Convert Voice"): 118 | with gr.Row(): 119 | with gr.Column(): 120 | input_audio = gr.Audio(label='Input Audio', type='filepath') 121 | rvc_model = gr.Dropdown(voice_models, label='Voice Models') 122 | gr.Markdown("Select the AI voice model you want to use for conversion. Models are stored in the 'rvc_models' folder.") 123 | ref_btn = gr.Button('Refresh Models 🔁', variant='primary') 124 | 125 | with gr.Column(): 126 | pitch = gr.Slider(-22, 22, value=0, step=1, label='Pitch Change') 127 | gr.Markdown("Adjust the pitch of the output voice. Higher values make the voice higher, lower values make it deeper.") 128 | 129 | f0_method = gr.Dropdown(['rmvpe', 'mangio-crepe'], value='rmvpe', label='Pitch detection algorithm') 130 | gr.Markdown("Choose the algorithm for pitch detection. RMVPE is generally good for clarity, while Mangio-Crepe can produce smoother vocals.") 131 | 132 | index_rate = gr.Slider(0, 1, value=0.5, label='Index Rate') 133 | gr.Markdown("Controls how much of the AI voice's characteristics to keep. Higher values result in output closer to the AI voice.") 134 | 135 | filter_radius = gr.Slider(0, 7, value=3, step=1, label='Filter radius') 136 | gr.Markdown("Applies median filtering to pitch results. Can help reduce breathiness. Higher values smooth out the pitch more.") 137 | 138 | rms_mix_rate = gr.Slider(0, 1, value=0.25, label='RMS mix rate') 139 | gr.Markdown("Controls how much to mimic the original vocal's volume envelope. Higher values preserve more of the original dynamics.") 140 | 141 | protect = gr.Slider(0, 0.5, value=0.33, label='Protect rate') 142 | gr.Markdown("Protects voiceless consonants and breath sounds from being converted. 
Set to 0.5 to disable protection.") 143 | 144 | with gr.Row(): 145 | clear_btn = gr.Button("Clear") 146 | convert_btn = gr.Button("Convert", variant='primary') 147 | output_audio = gr.Audio(label='Converted Audio', type='filepath') 148 | 149 | ref_btn.click(update_models_list, None, outputs=rvc_model) 150 | convert_btn.click(voice_conversion, 151 | inputs=[input_audio, rvc_model, pitch, f0_method, index_rate, filter_radius, rms_mix_rate, protect], 152 | outputs=[output_audio]) 153 | clear_btn.click( 154 | lambda: [None, None, 0, 'rmvpe', 0.5, 3, 0.25, 0.33], 155 | outputs=[input_audio, output_audio, pitch, f0_method, index_rate, filter_radius, rms_mix_rate, protect] 156 | ) 157 | 158 | with gr.Tab('Download model'): 159 | model_zip_link = gr.Text(label='Download link to model') 160 | gr.Markdown("Provide a direct download link to a zip file containing the voice model. The zip should include a .pth model file and an optional .index file.") 161 | 162 | model_name = gr.Text(label='Name your model') 163 | gr.Markdown("Give your new model a unique name. This will be used as the folder name for the model files.") 164 | 165 | download_btn = gr.Button('Download 🌐', variant='primary') 166 | dl_output_message = gr.Text(label='Output Message', interactive=False) 167 | download_btn.click(download_online_model, inputs=[model_zip_link, model_name], outputs=dl_output_message) 168 | 169 | with gr.Tab('Upload model'): 170 | upload_zip = gr.File(label='Upload zip file', file_types=['.zip']) 171 | gr.Markdown("Upload a zip file containing your voice model. The zip should include a .pth model file and an optional .index file.") 172 | 173 | upload_model_name = gr.Text(label='Model name') 174 | gr.Markdown("Give your uploaded model a unique name. This will be used as the folder name for the model files.") 175 | 176 | upload_btn = gr.Button('Upload model', variant='primary') 177 | upload_output_message = gr.Text(label='Output Message', interactive=False) 178 | upload_btn.click(upload_local_model, inputs=[upload_zip, upload_model_name], outputs=upload_output_message) 179 | 180 | app.launch() 181 | --------------------------------------------------------------------------------
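For reference, a minimal usage sketch of the entry points in src/main.py (this is an editorial example, not a file in the repository; the model name, download URL, and audio path are placeholders, and it assumes rvc_models/hubert_base.pt and rvc_models/rmvpe.pt are already in place):

    # hypothetical usage_example.py, run from the repository root
    import sys
    sys.path.append('src')

    from main import download_online_model, voice_conversion

    # Fetch a voice model zip (any direct link to a .zip containing a .pth file) into rvc_models/MyVoice/
    download_online_model('https://example.com/MyVoice.zip', 'MyVoice')

    # Convert a local recording; the result is written to voice_output/converted_song.wav
    print(voice_conversion('song.wav', 'MyVoice', pitch=0, f0_method='rmvpe'))

The equivalent command-line call, using the defaults documented in print_example_usage(), would be:

    python src/main.py song.wav MyVoice 0 rmvpe 0.5 3 0.25 0.33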