├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── app.py ├── assets ├── hubert │ └── req-hubert.txt └── rvmpe │ └── req-rvmpe.txt ├── docs ├── COMMAND_LINE_ARGUMENTS.md ├── HOW_TO_USE.md └── img │ └── ui_example.jpg ├── lib ├── config │ └── config.py ├── infer_pack │ ├── attentions.py │ ├── commons.py │ ├── models.py │ ├── models_dml.py │ ├── models_onnx.py │ ├── modules.py │ ├── modules │ │ └── F0Predictor │ │ │ ├── DioF0Predictor.py │ │ │ ├── F0Predictor.py │ │ │ ├── HarvestF0Predictor.py │ │ │ ├── PMF0Predictor.py │ │ │ └── __init__.py │ ├── onnx_inference.py │ └── transforms.py └── vc │ ├── audio.py │ ├── rmvpe.py │ ├── settings.py │ ├── utils.py │ └── vc_infer_pipeline.py ├── requirements.txt ├── run.bat ├── run.sh ├── start.bat ├── start.sh └── weights ├── folder_info.json └── model_pack └── model_info.json /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Ww][Ii][Nn]32/ 27 | [Aa][Rr][Mm]/ 28 | [Aa][Rr][Mm]64/ 29 | bld/ 30 | [Bb]in/ 31 | [Oo]bj/ 32 | [Oo]ut/ 33 | [Ll]og/ 34 | [Ll]ogs/ 35 | infer_pack\__pycache__ 36 | # Visual Studio 2015/2017 cache/options directory 37 | .vs/ 38 | # Uncomment if you have tasks that create the project's static files in wwwroot 39 | #wwwroot/ 40 | 41 | # Visual Studio 2017 auto generated files 42 | Generated\ Files/ 43 | 44 | # MSTest test Results 45 | [Tt]est[Rr]esult*/ 46 | [Bb]uild[Ll]og.* 47 | 48 | # NUnit 49 | *.VisualState.xml 50 | TestResult.xml 51 | nunit-*.xml 52 | 53 | # Build Results of an ATL Project 54 | [Dd]ebugPS/ 55 | [Rr]eleasePS/ 56 | dlldata.c 57 | 58 | # Benchmark Results 59 | BenchmarkDotNet.Artifacts/ 60 | 61 | # .NET Core 62 | project.lock.json 63 | project.fragment.lock.json 64 | artifacts/ 65 | 66 | # ASP.NET Scaffolding 67 | ScaffoldingReadMe.txt 68 | 69 | # StyleCop 70 | StyleCopReport.xml 71 | 72 | # Files built by Visual Studio 73 | *_i.c 74 | *_p.c 75 | *_h.h 76 | *.ilk 77 | *.meta 78 | *.obj 79 | *.iobj 80 | *.pch 81 | *.pdb 82 | *.ipdb 83 | *.pgc 84 | *.pgd 85 | *.rsp 86 | *.sbr 87 | *.tlb 88 | *.tli 89 | *.tlh 90 | *.tmp 91 | *.tmp_proj 92 | *_wpftmp.csproj 93 | *.log 94 | *.vspscc 95 | *.vssscc 96 | .builds 97 | *.pidb 98 | *.svclog 99 | *.scc 100 | 101 | # Chutzpah Test files 102 | _Chutzpah* 103 | 104 | # Visual C++ cache files 105 | ipch/ 106 | *.aps 107 | *.ncb 108 | *.opendb 109 | *.opensdf 110 | *.sdf 111 | *.cachefile 112 | *.VC.db 113 | *.VC.VC.opendb 114 | 115 | # Visual Studio profiler 116 | *.psess 117 | *.vsp 118 | *.vspx 119 | *.sap 120 | 121 | # Visual Studio Trace Files 122 | *.e2e 123 | 124 | # TFS 2012 Local Workspace 125 | $tf/ 126 | 127 | # Guidance Automation Toolkit 128 | *.gpState 129 | 130 | # ReSharper is a .NET coding add-in 131 | _ReSharper*/ 132 | *.[Rr]e[Ss]harper 133 | *.DotSettings.user 134 | 135 | # TeamCity is a build add-in 136 | _TeamCity* 137 | 138 | # DotCover is a Code Coverage Tool 139 | *.dotCover 140 | 141 | # AxoCover is a Code Coverage Tool 142 | .axoCover/* 143 | !.axoCover/settings.json 144 | 145 | # Coverlet is a free, cross platform Code Coverage Tool 146 | coverage*.json 147 | coverage*.xml 148 | coverage*.info 149 | 150 | # Visual Studio code coverage results 151 | *.coverage 152 | *.coveragexml 153 | 154 | # NCrunch 155 | _NCrunch_* 156 | .*crunch*.local.xml 157 | nCrunchTemp_* 158 | 159 | # MightyMoose 160 | *.mm.* 161 | AutoTest.Net/ 162 | 163 | # Web workbench (sass) 164 | .sass-cache/ 165 | 166 | # Installshield output folder 167 | [Ee]xpress/ 168 | 169 | # DocProject is a documentation generator add-in 170 | DocProject/buildhelp/ 171 | DocProject/Help/*.HxT 172 | DocProject/Help/*.HxC 173 | DocProject/Help/*.hhc 174 | DocProject/Help/*.hhk 175 | DocProject/Help/*.hhp 176 | DocProject/Help/Html2 177 | DocProject/Help/html 178 | 179 | # Click-Once directory 180 | publish/ 181 | 182 | # Publish Web Output 183 | *.[Pp]ublish.xml 184 | *.azurePubxml 185 | # Note: Comment the next line if you want to checkin your web deploy settings, 186 | # but database connection strings (with potential passwords) will be unencrypted 187 | *.pubxml 188 | *.publishproj 189 | 190 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 191 | # checkin your Azure Web App publish settings, but sensitive information contained 192 | # in these scripts will be unencrypted 193 | PublishScripts/ 194 | 195 | # NuGet Packages 196 | *.nupkg 197 | # NuGet Symbol Packages 198 | *.snupkg 199 | # The packages folder can be ignored because of Package Restore 200 | **/[Pp]ackages/* 201 | # except build/, which is used as an MSBuild target. 202 | !**/[Pp]ackages/build/ 203 | # Uncomment if necessary however generally it will be regenerated when needed 204 | #!**/[Pp]ackages/repositories.config 205 | # NuGet v3's project.json files produces more ignorable files 206 | *.nuget.props 207 | *.nuget.targets 208 | 209 | # Microsoft Azure Build Output 210 | csx/ 211 | *.build.csdef 212 | 213 | # Microsoft Azure Emulator 214 | ecf/ 215 | rcf/ 216 | 217 | # Windows Store app package directories and files 218 | AppPackages/ 219 | BundleArtifacts/ 220 | Package.StoreAssociation.xml 221 | _pkginfo.txt 222 | *.appx 223 | *.appxbundle 224 | *.appxupload 225 | 226 | # Visual Studio cache files 227 | # files ending in .cache can be ignored 228 | *.[Cc]ache 229 | # but keep track of directories ending in .cache 230 | !?*.[Cc]ache/ 231 | 232 | # Others 233 | ClientBin/ 234 | ~$* 235 | *~ 236 | *.dbmdl 237 | *.dbproj.schemaview 238 | *.jfm 239 | *.pfx 240 | *.publishsettings 241 | orleans.codegen.cs 242 | 243 | # Including strong name files can present a security risk 244 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 245 | #*.snk 246 | 247 | # Since there are multiple workflows, uncomment next line to ignore bower_components 248 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 249 | #bower_components/ 250 | 251 | # RIA/Silverlight projects 252 | Generated_Code/ 253 | 254 | # Backup & report files from converting an old project file 255 | # to a newer Visual Studio version. Backup files are not needed, 256 | # because we have git ;-) 257 | _UpgradeReport_Files/ 258 | Backup*/ 259 | UpgradeLog*.XML 260 | UpgradeLog*.htm 261 | ServiceFabricBackup/ 262 | *.rptproj.bak 263 | 264 | # SQL Server files 265 | *.mdf 266 | *.ldf 267 | *.ndf 268 | 269 | # Business Intelligence projects 270 | *.rdl.data 271 | *.bim.layout 272 | *.bim_*.settings 273 | *.rptproj.rsuser 274 | *- [Bb]ackup.rdl 275 | *- [Bb]ackup ([0-9]).rdl 276 | *- [Bb]ackup ([0-9][0-9]).rdl 277 | 278 | # Microsoft Fakes 279 | FakesAssemblies/ 280 | 281 | # GhostDoc plugin setting file 282 | *.GhostDoc.xml 283 | 284 | # Node.js Tools for Visual Studio 285 | .ntvs_analysis.dat 286 | node_modules/ 287 | 288 | # Visual Studio 6 build log 289 | *.plg 290 | 291 | # Visual Studio 6 workspace options file 292 | *.opt 293 | 294 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 295 | *.vbw 296 | 297 | # Visual Studio LightSwitch build output 298 | **/*.HTMLClient/GeneratedArtifacts 299 | **/*.DesktopClient/GeneratedArtifacts 300 | **/*.DesktopClient/ModelManifest.xml 301 | **/*.Server/GeneratedArtifacts 302 | **/*.Server/ModelManifest.xml 303 | _Pvt_Extensions 304 | 305 | # Paket dependency manager 306 | .paket/paket.exe 307 | paket-files/ 308 | 309 | # FAKE - F# Make 310 | .fake/ 311 | 312 | # CodeRush personal settings 313 | .cr/personal 314 | 315 | # Python Tools for Visual Studio (PTVS) 316 | __pycache__/ 317 | 318 | 319 | # Cake - Uncomment if you are using it 320 | # tools/** 321 | # !tools/packages.config 322 | 323 | # Tabs Studio 324 | *.tss 325 | 326 | # Telerik's JustMock configuration file 327 | *.jmconfig 328 | 329 | # BizTalk build output 330 | *.btp.cs 331 | *.btm.cs 332 | *.odx.cs 333 | *.xsd.cs 334 | 335 | # OpenCover UI analysis results 336 | OpenCover/ 337 | 338 | # Azure Stream Analytics local run output 339 | ASALocalRun/ 340 | 341 | # MSBuild Binary and Structured Log 342 | *.binlog 343 | 344 | # NVidia Nsight GPU debugger configuration file 345 | *.nvuser 346 | 347 | # MFractors (Xamarin productivity tool) working folder 348 | .mfractor/ 349 | 350 | # Local History for Visual Studio 351 | .localhistory/ 352 | 353 | # BeatPulse healthcheck temp database 354 | healthchecksdb 355 | 356 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 357 | MigrationBackup/ 358 | 359 | # Ionide (cross platform F# VS Code tools) working folder 360 | .ionide/ 361 | 362 | # Fody - auto-generated XML schema 363 | FodyWeavers.xsd 364 | 365 | # build 366 | build 367 | monotonic_align/core.c 368 | *.o 369 | *.so 370 | *.dll 371 | 372 | # data 373 | /config.json 374 | /*.pth 375 | *.wav 376 | /monotonic_align/monotonic_align 377 | /resources 378 | /MoeGoe.spec 379 | /dist/MoeGoe 380 | /dist 381 | 382 | /env 383 | .idea 384 | .venv 385 | *.pt 386 | output -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 02/03/2024 Changelog:
2 | - Rearrange code and update dependencies. 3 | - Add batch and shell command for easy installation. 4 | 5 | 12/09/2023 Changelog:
6 | - Added documentation. 7 | - Support for non json file. 8 | 9 | 13/08/2023 Changelog:
10 | - Fix bugs. 11 | 12 | 08/08/2023 Changelog:
13 | - Limitation changes. 14 | - UI Changes for Youtube Input. 15 | - Added instrument volume. 16 | 17 | 29/07/2023 Changelog:
18 | - UI Changes for Non Limitation. 19 | - Added More Splitter Model. 20 | - Separate Youtube Download and Splitter. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 arkandash 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # Multi-Model RVC Inference 4 | ### Simplified RVC Inference for HuggingFace or Google Colab 5 | 6 | [![License](https://img.shields.io/github/license/arkandash/Multi-Model-RVC-Inference?style=for-the-badge)](https://github.com/ArkanDash/Multi-Model-RVC-Inference/blob/master/LICENSE) 7 | [![Repository](https://img.shields.io/badge/Github-Multi%20Model%20RVC%20Inference-blue?style=for-the-badge&logo=github)](https://github.com/ArkanDash/Multi-Model-RVC-Inference) 8 |
9 | 10 | ### Information 11 | Please support the original RVC, without it, this inference wont be possible to make.
12 | [![Original RVC Repository](https://img.shields.io/badge/Github-Original%20RVC%20Repository-blue?style=for-the-badge&logo=github)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI) 13 | #### Features 14 | - Support V1 & V2 Model ✅ 15 | - Youtube Audio Downloader ✅ 16 | - Demucs (Voice Splitter) [Internet required for downloading model] ✅ 17 | - TTS Support ✅ 18 | - Microphone Support ✅ 19 | - HuggingFace Spaces Inference [for CPU Tier only] ✅ 20 | - Remove Youtube & Input Path ✅ 21 | - Remove Crepe Support due to gpu requirement ✅ 22 | 23 | ### Automatic Installation 24 | Install [ffmpeg](https://ffmpeg.org/) first before running these command. 25 | - Windows 26 | Run the `start.bat` to download the model and dependencies.
27 | Run the `run.bat` to run the inference 28 | - MacOS & Linux 29 | For MacOS. before running the script, please install [wget](https://formulae.brew.sh/formula/wget)
30 | Run the `start.sh` to download the model and dependencies.
31 | Run the `run.sh` to run the inference 32 | 33 | ### Manual Installation 34 | 35 | 1. Install Pytorch
36 | - CPU only (any OS) 37 | ```bash 38 | pip install torch torchvision torchaudio 39 | ``` 40 | - Nvidia (CUDA used) 41 | ```bash 42 | # For Windows (Due to flashv2 not supported in windows, Issue: https://github.com/Dao-AILab/flash-attention/issues/345#issuecomment-1747473481) 43 | pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121 44 | # Other (Linux, etc) 45 | pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 46 | ``` 47 | 48 | 2. Install [ffmpeg](https://ffmpeg.org/) 49 | 50 | 3. Install Dependencies
51 | ```bash 52 | pip install -r requirements.txt 53 | ``` 54 | 55 | 4. Download Pre-model 56 | ```bash 57 | # Hubert Model 58 | https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/hubert_base.pt 59 | # Save it to /assets/hubert/hubert_base.pt 60 | 61 | # RVMPE (rmvpe pitch extraction, Optional) 62 | https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/rmvpe.pt 63 | # Save it to /assets/rvmpe/rmvpe.pt 64 | ``` 65 | 66 | 5. Run WebUI
67 | ```bash 68 | python app.py 69 | ``` 70 | 71 | ### [How to use](docs/HOW_TO_USE.md) 72 | ### [Command Line Arguments](docs/COMMAND_LINE_ARGUMENTS.md) 73 | 74 | # Other Inference 75 | [![Advanced RVC Inference](https://img.shields.io/badge/Github-Advanced_RVC_Inference-blue?style=for-the-badge&logo=github)](https://github.com/ArkanDash/Advanced-RVC-Inference) -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import json 4 | import traceback 5 | import logging 6 | import gradio as gr 7 | import numpy as np 8 | import librosa 9 | import torch 10 | import asyncio 11 | import edge_tts 12 | import sys 13 | import io 14 | 15 | from datetime import datetime 16 | from lib.config.config import Config 17 | from lib.vc.vc_infer_pipeline import VC 18 | from lib.vc.settings import change_audio_mode 19 | from lib.vc.audio import load_audio 20 | from lib.infer_pack.models import ( 21 | SynthesizerTrnMs256NSFsid, 22 | SynthesizerTrnMs256NSFsid_nono, 23 | SynthesizerTrnMs768NSFsid, 24 | SynthesizerTrnMs768NSFsid_nono, 25 | ) 26 | from lib.vc.utils import ( 27 | combine_vocal_and_inst, 28 | cut_vocal_and_inst, 29 | download_audio, 30 | load_hubert 31 | ) 32 | 33 | config = Config() 34 | logging.getLogger("numba").setLevel(logging.WARNING) 35 | logger = logging.getLogger(__name__) 36 | spaces = os.getenv("SYSTEM") == "spaces" 37 | force_support = None 38 | if config.unsupported is False: 39 | if config.device == "mps" or config.device == "cpu": 40 | force_support = False 41 | else: 42 | force_support = True 43 | 44 | audio_mode = [] 45 | f0method_mode = [] 46 | f0method_info = "" 47 | hubert_model = load_hubert(config) 48 | 49 | if force_support is False or spaces is True: 50 | if spaces is True: 51 | audio_mode = ["Upload audio", "TTS Audio"] 52 | else: 53 | audio_mode = ["Input path", "Upload audio", "TTS Audio"] 54 | f0method_mode = ["pm", "harvest"] 55 | f0method_info = "PM is fast, Harvest is good but extremely slow, Rvmpe is alternative to harvest (might be better). (Default: PM)" 56 | else: 57 | audio_mode = ["Input path", "Upload audio", "Youtube", "TTS Audio"] 58 | f0method_mode = ["pm", "harvest", "crepe"] 59 | f0method_info = "PM is fast, Harvest is good but extremely slow, Rvmpe is alternative to harvest (might be better), and Crepe effect is good but requires GPU (Default: PM)" 60 | 61 | if os.path.isfile("rmvpe.pt"): 62 | f0method_mode.insert(2, "rmvpe") 63 | 64 | def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index): 65 | def vc_fn( 66 | vc_audio_mode, 67 | vc_input, 68 | vc_upload, 69 | tts_text, 70 | tts_voice, 71 | f0_up_key, 72 | f0_method, 73 | index_rate, 74 | filter_radius, 75 | resample_sr, 76 | rms_mix_rate, 77 | protect, 78 | ): 79 | try: 80 | logs = [] 81 | logger.info(f"Converting using {model_name}...") 82 | logs.append(f"Converting using {model_name}...") 83 | yield "\n".join(logs), None 84 | if vc_audio_mode == "Input path" or "Youtube" and vc_input != "": 85 | audio = load_audio(vc_input, 16000) 86 | audio_max = np.abs(audio).max() / 0.95 87 | if audio_max > 1: 88 | audio /= audio_max 89 | elif vc_audio_mode == "Upload audio": 90 | if vc_upload is None: 91 | return "You need to upload an audio", None 92 | sampling_rate, audio = vc_upload 93 | duration = audio.shape[0] / sampling_rate 94 | if duration > 20 and spaces: 95 | return "Please upload an audio file that is less than 20 seconds. If you need to generate a longer audio file, please use Colab.", None 96 | audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32) 97 | if len(audio.shape) > 1: 98 | audio = librosa.to_mono(audio.transpose(1, 0)) 99 | if sampling_rate != 16000: 100 | audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000) 101 | elif vc_audio_mode == "TTS Audio": 102 | if len(tts_text) > 100 and spaces: 103 | return "Text is too long", None 104 | if tts_text is None or tts_voice is None: 105 | return "You need to enter text and select a voice", None 106 | os.makedirs("output", exist_ok=True) 107 | os.makedirs(os.path.join("output", "tts"), exist_ok=True) 108 | asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save(os.path.join("output", "tts", "tts.mp3"))) 109 | audio, sr = librosa.load(os.path.join("output", "tts", "tts.mp3"), sr=16000, mono=True) 110 | vc_input = os.path.join("output", "tts", "tts.mp3") 111 | times = [0, 0, 0] 112 | f0_up_key = int(f0_up_key) 113 | audio_opt = vc.pipeline( 114 | hubert_model, 115 | net_g, 116 | 0, 117 | audio, 118 | vc_input, 119 | times, 120 | f0_up_key, 121 | f0_method, 122 | file_index, 123 | # file_big_npy, 124 | index_rate, 125 | if_f0, 126 | filter_radius, 127 | tgt_sr, 128 | resample_sr, 129 | rms_mix_rate, 130 | version, 131 | protect, 132 | f0_file=None, 133 | ) 134 | info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s" 135 | logger.info(f"{model_name} | {info}") 136 | logs.append(f"Successfully Convert {model_name}\n{info}") 137 | yield "\n".join(logs), (tgt_sr, audio_opt) 138 | except Exception as err: 139 | info = traceback.format_exc() 140 | logger.error(info) 141 | logger.error(f"Error when using {model_name}.\n{str(err)}") 142 | yield info, None 143 | return vc_fn 144 | 145 | def load_model(): 146 | categories = [] 147 | category_count = 0 148 | if os.path.isfile("weights/folder_info.json"): 149 | with open("weights/folder_info.json", "r", encoding="utf-8") as f: 150 | folder_info = json.load(f) 151 | for category_name, category_info in folder_info.items(): 152 | if not category_info['enable']: 153 | continue 154 | category_title = category_info['title'] 155 | category_folder = category_info['folder_path'] 156 | description = category_info['description'] 157 | models = [] 158 | with open(f"weights/{category_folder}/model_info.json", "r", encoding="utf-8") as f: 159 | models_info = json.load(f) 160 | for character_name, info in models_info.items(): 161 | if not info['enable']: 162 | continue 163 | model_title = info['title'] 164 | model_name = info['model_path'] 165 | model_author = info.get("author", None) 166 | model_cover = f"weights/{category_folder}/{character_name}/{info['cover']}" 167 | model_index = f"weights/{category_folder}/{character_name}/{info['feature_retrieval_library']}" 168 | cpt = torch.load(f"weights/{category_folder}/{character_name}/{model_name}", map_location="cpu") 169 | tgt_sr = cpt["config"][-1] 170 | cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk 171 | if_f0 = cpt.get("f0", 1) 172 | version = cpt.get("version", "v1") 173 | if version == "v1": 174 | if if_f0 == 1: 175 | net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half) 176 | else: 177 | net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) 178 | model_version = "V1" 179 | elif version == "v2": 180 | if if_f0 == 1: 181 | net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half) 182 | else: 183 | net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) 184 | model_version = "V2" 185 | del net_g.enc_q 186 | logger.info(net_g.load_state_dict(cpt["weight"], strict=False)) 187 | net_g.eval().to(config.device) 188 | if config.is_half: 189 | net_g = net_g.half() 190 | else: 191 | net_g = net_g.float() 192 | vc = VC(tgt_sr, config) 193 | logger.info(f"Model loaded: {character_name} / {info['feature_retrieval_library']} | ({model_version})") 194 | models.append((character_name, model_title, model_author, model_cover, model_version, create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, model_index))) 195 | category_count += 1 196 | categories.append([category_title, description, models]) 197 | elif os.path.exists("weights"): 198 | models = [] 199 | for w_root, w_dirs, _ in os.walk("weights"): 200 | model_count = 1 201 | for sub_dir in w_dirs: 202 | pth_files = glob.glob(f"weights/{sub_dir}/*.pth") 203 | index_files = glob.glob(f"weights/{sub_dir}/*.index") 204 | if pth_files == []: 205 | logger.debug(f"Model [{model_count}/{len(w_dirs)}]: No Model file detected, skipping...") 206 | continue 207 | cpt = torch.load(pth_files[0]) 208 | tgt_sr = cpt["config"][-1] 209 | cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk 210 | if_f0 = cpt.get("f0", 1) 211 | version = cpt.get("version", "v1") 212 | if version == "v1": 213 | if if_f0 == 1: 214 | net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half) 215 | else: 216 | net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) 217 | model_version = "V1" 218 | elif version == "v2": 219 | if if_f0 == 1: 220 | net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half) 221 | else: 222 | net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) 223 | model_version = "V2" 224 | del net_g.enc_q 225 | logger.info(net_g.load_state_dict(cpt["weight"], strict=False)) 226 | net_g.eval().to(config.device) 227 | if config.is_half: 228 | net_g = net_g.half() 229 | else: 230 | net_g = net_g.float() 231 | vc = VC(tgt_sr, config) 232 | if index_files == []: 233 | logger.warning("No Index file detected!") 234 | index_info = "None" 235 | model_index = "" 236 | else: 237 | index_info = index_files[0] 238 | model_index = index_files[0] 239 | logger.info(f"Model loaded [{model_count}/{len(w_dirs)}]: {index_files[0]} / {index_info} | ({model_version})") 240 | model_count += 1 241 | models.append((index_files[0][:-4], index_files[0][:-4], "", "", model_version, create_vc_fn(index_files[0], tgt_sr, net_g, vc, if_f0, version, model_index))) 242 | categories.append(["Models", "", models]) 243 | else: 244 | categories = [] 245 | return categories 246 | 247 | if __name__ == '__main__': 248 | categories = load_model() 249 | tts_voice_list = asyncio.new_event_loop().run_until_complete(edge_tts.list_voices()) 250 | voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list] 251 | with gr.Blocks() as app: 252 | gr.Markdown( 253 | "
\n\n"+ 254 | "# Multi Model RVC Inference\n\n"+ 255 | "[![Repository](https://img.shields.io/badge/Github-Multi%20Model%20RVC%20Inference-blue?style=for-the-badge&logo=github)](https://github.com/ArkanDash/Multi-Model-RVC-Inference)\n\n"+ 256 | "
" 257 | ) 258 | if categories == []: 259 | gr.Markdown( 260 | "
\n\n"+ 261 | "## No model found, please add the model into weights folder\n\n"+ 262 | "
" 263 | ) 264 | for (folder_title, description, models) in categories: 265 | with gr.TabItem(folder_title): 266 | if description: 267 | gr.Markdown(f"###
{description}") 268 | with gr.Tabs(): 269 | if not models: 270 | gr.Markdown("#
No Model Loaded.") 271 | gr.Markdown("##
Please add the model or fix your model path.") 272 | continue 273 | for (name, title, author, cover, model_version, vc_fn) in models: 274 | with gr.TabItem(name): 275 | with gr.Row(): 276 | gr.Markdown( 277 | '
' 278 | f'
{title}
\n'+ 279 | f'
RVC {model_version} Model
\n'+ 280 | (f'
Model author: {author}
' if author else "")+ 281 | (f'' if cover else "")+ 282 | '
' 283 | ) 284 | with gr.Row(): 285 | if spaces is False: 286 | with gr.TabItem("Input"): 287 | with gr.Row(): 288 | with gr.Column(): 289 | vc_audio_mode = gr.Dropdown(label="Input voice", choices=audio_mode, allow_custom_value=False, value="Upload audio") 290 | # Input 291 | vc_input = gr.Textbox(label="Input audio path", visible=False) 292 | # Upload 293 | vc_upload = gr.Audio(label="Upload audio file", sources=["upload", "microphone"], visible=True, interactive=True) 294 | # Youtube 295 | vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)") 296 | vc_link = gr.Textbox(label="Youtube URL", visible=False, info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A", placeholder="https://www.youtube.com/watch?v=...") 297 | vc_log_yt = gr.Textbox(label="Output Information", visible=False, interactive=False) 298 | vc_download_button = gr.Button("Download Audio", variant="primary", visible=False) 299 | vc_audio_preview = gr.Audio(label="Audio Preview", visible=False) 300 | # TTS 301 | tts_text = gr.Textbox(label="TTS text", info="Text to speech input", visible=False) 302 | tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female") 303 | with gr.Column(): 304 | vc_split_model = gr.Dropdown(label="Splitter Model", choices=["hdemucs_mmi", "htdemucs", "htdemucs_ft", "mdx", "mdx_q", "mdx_extra_q"], allow_custom_value=False, visible=False, value="htdemucs", info="Select the splitter model (Default: htdemucs)") 305 | vc_split_log = gr.Textbox(label="Output Information", visible=False, interactive=False) 306 | vc_split = gr.Button("Split Audio", variant="primary", visible=False) 307 | vc_vocal_preview = gr.Audio(label="Vocal Preview", visible=False) 308 | vc_inst_preview = gr.Audio(label="Instrumental Preview", visible=False) 309 | with gr.TabItem("Convert"): 310 | with gr.Row(): 311 | with gr.Column(): 312 | vc_transform0 = gr.Number(label="Transpose", value=0, info='Type "12" to change from male to female voice. Type "-12" to change female to male voice') 313 | f0method0 = gr.Radio( 314 | label="Pitch extraction algorithm", 315 | info=f0method_info, 316 | choices=f0method_mode, 317 | value="pm", 318 | interactive=True 319 | ) 320 | index_rate1 = gr.Slider( 321 | minimum=0, 322 | maximum=1, 323 | label="Retrieval feature ratio", 324 | info="(Default: 0.7)", 325 | value=0.7, 326 | interactive=True, 327 | ) 328 | filter_radius0 = gr.Slider( 329 | minimum=0, 330 | maximum=7, 331 | label="Apply Median Filtering", 332 | info="The value represents the filter radius and can reduce breathiness.", 333 | value=3, 334 | step=1, 335 | interactive=True, 336 | ) 337 | resample_sr0 = gr.Slider( 338 | minimum=0, 339 | maximum=48000, 340 | label="Resample the output audio", 341 | info="Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling", 342 | value=0, 343 | step=1, 344 | interactive=True, 345 | ) 346 | rms_mix_rate0 = gr.Slider( 347 | minimum=0, 348 | maximum=1, 349 | label="Volume Envelope", 350 | info="Use the volume envelope of the input to replace or mix with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is used", 351 | value=1, 352 | interactive=True, 353 | ) 354 | protect0 = gr.Slider( 355 | minimum=0, 356 | maximum=0.5, 357 | label="Voice Protection", 358 | info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy", 359 | value=0.5, 360 | step=0.01, 361 | interactive=True, 362 | ) 363 | with gr.Column(): 364 | vc_log = gr.Textbox(label="Output Information", interactive=False) 365 | vc_output = gr.Audio(label="Output Audio", interactive=False) 366 | vc_convert = gr.Button("Convert", variant="primary") 367 | vc_vocal_volume = gr.Slider( 368 | minimum=0, 369 | maximum=10, 370 | label="Vocal volume", 371 | value=1, 372 | interactive=True, 373 | step=1, 374 | info="Adjust vocal volume (Default: 1}", 375 | visible=False 376 | ) 377 | vc_inst_volume = gr.Slider( 378 | minimum=0, 379 | maximum=10, 380 | label="Instrument volume", 381 | value=1, 382 | interactive=True, 383 | step=1, 384 | info="Adjust instrument volume (Default: 1}", 385 | visible=False 386 | ) 387 | vc_combined_output = gr.Audio(label="Output Combined Audio", visible=False) 388 | vc_combine = gr.Button("Combine",variant="primary", visible=False) 389 | else: 390 | with gr.Column(): 391 | vc_audio_mode = gr.Dropdown(label="Input voice", choices=audio_mode, allow_custom_value=False, value="Upload audio") 392 | # Input 393 | vc_input = gr.Textbox(label="Input audio path", visible=False) 394 | # Upload 395 | vc_upload = gr.Audio(label="Upload audio file", sources=["upload", "microphone"], visible=True, interactive=True) 396 | # Youtube 397 | vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)") 398 | vc_link = gr.Textbox(label="Youtube URL", visible=False, info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A", placeholder="https://www.youtube.com/watch?v=...") 399 | vc_log_yt = gr.Textbox(label="Output Information", visible=False, interactive=False) 400 | vc_download_button = gr.Button("Download Audio", variant="primary", visible=False) 401 | vc_audio_preview = gr.Audio(label="Audio Preview", visible=False) 402 | # Splitter 403 | vc_split_model = gr.Dropdown(label="Splitter Model", choices=["hdemucs_mmi", "htdemucs", "htdemucs_ft", "mdx", "mdx_q", "mdx_extra_q"], allow_custom_value=False, visible=False, value="htdemucs", info="Select the splitter model (Default: htdemucs)") 404 | vc_split_log = gr.Textbox(label="Output Information", visible=False, interactive=False) 405 | vc_split = gr.Button("Split Audio", variant="primary", visible=False) 406 | vc_vocal_preview = gr.Audio(label="Vocal Preview", visible=False) 407 | vc_inst_preview = gr.Audio(label="Instrumental Preview", visible=False) 408 | # TTS 409 | tts_text = gr.Textbox(label="TTS text", info="Text to speech input", visible=False) 410 | tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female") 411 | with gr.Column(): 412 | vc_transform0 = gr.Number(label="Transpose", value=0, info='Type "12" to change from male to female voice. Type "-12" to change female to male voice') 413 | f0method0 = gr.Radio( 414 | label="Pitch extraction algorithm", 415 | info=f0method_info, 416 | choices=f0method_mode, 417 | value="pm", 418 | interactive=True 419 | ) 420 | index_rate1 = gr.Slider( 421 | minimum=0, 422 | maximum=1, 423 | label="Retrieval feature ratio", 424 | info="(Default: 0.7)", 425 | value=0.7, 426 | interactive=True, 427 | ) 428 | filter_radius0 = gr.Slider( 429 | minimum=0, 430 | maximum=7, 431 | label="Apply Median Filtering", 432 | info="The value represents the filter radius and can reduce breathiness.", 433 | value=3, 434 | step=1, 435 | interactive=True, 436 | ) 437 | resample_sr0 = gr.Slider( 438 | minimum=0, 439 | maximum=48000, 440 | label="Resample the output audio", 441 | info="Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling", 442 | value=0, 443 | step=1, 444 | interactive=True, 445 | ) 446 | rms_mix_rate0 = gr.Slider( 447 | minimum=0, 448 | maximum=1, 449 | label="Volume Envelope", 450 | info="Use the volume envelope of the input to replace or mix with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is used", 451 | value=1, 452 | interactive=True, 453 | ) 454 | protect0 = gr.Slider( 455 | minimum=0, 456 | maximum=0.5, 457 | label="Voice Protection", 458 | info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy", 459 | value=0.5, 460 | step=0.01, 461 | interactive=True, 462 | ) 463 | with gr.Column(): 464 | vc_log = gr.Textbox(label="Output Information", interactive=False) 465 | vc_output = gr.Audio(label="Output Audio", interactive=False) 466 | vc_convert = gr.Button("Convert", variant="primary") 467 | vc_vocal_volume = gr.Slider( 468 | minimum=0, 469 | maximum=10, 470 | label="Vocal volume", 471 | value=1, 472 | interactive=True, 473 | step=1, 474 | info="Adjust vocal volume (Default: 1}", 475 | visible=False 476 | ) 477 | vc_inst_volume = gr.Slider( 478 | minimum=0, 479 | maximum=10, 480 | label="Instrument volume", 481 | value=1, 482 | interactive=True, 483 | step=1, 484 | info="Adjust instrument volume (Default: 1}", 485 | visible=False 486 | ) 487 | vc_combined_output = gr.Audio(label="Output Combined Audio", visible=False) 488 | vc_combine = gr.Button("Combine",variant="primary", visible=False) 489 | vc_convert.click( 490 | fn=vc_fn, 491 | inputs=[ 492 | vc_audio_mode, 493 | vc_input, 494 | vc_upload, 495 | tts_text, 496 | tts_voice, 497 | vc_transform0, 498 | f0method0, 499 | index_rate1, 500 | filter_radius0, 501 | resample_sr0, 502 | rms_mix_rate0, 503 | protect0, 504 | ], 505 | outputs=[vc_log ,vc_output] 506 | ) 507 | vc_download_button.click( 508 | fn=download_audio, 509 | inputs=[vc_link, vc_download_audio], 510 | outputs=[vc_audio_preview, vc_log_yt] 511 | ) 512 | vc_split.click( 513 | fn=cut_vocal_and_inst, 514 | inputs=[vc_split_model], 515 | outputs=[vc_split_log, vc_vocal_preview, vc_inst_preview, vc_input] 516 | ) 517 | vc_combine.click( 518 | fn=combine_vocal_and_inst, 519 | inputs=[vc_output, vc_vocal_volume, vc_inst_volume, vc_split_model], 520 | outputs=[vc_combined_output] 521 | ) 522 | vc_audio_mode.change( 523 | fn=change_audio_mode, 524 | inputs=[vc_audio_mode], 525 | outputs=[ 526 | vc_input, 527 | vc_upload, 528 | vc_download_audio, 529 | vc_link, 530 | vc_log_yt, 531 | vc_download_button, 532 | vc_split_model, 533 | vc_split_log, 534 | vc_split, 535 | vc_audio_preview, 536 | vc_vocal_preview, 537 | vc_inst_preview, 538 | vc_vocal_volume, 539 | vc_inst_volume, 540 | vc_combined_output, 541 | vc_combine, 542 | tts_text, 543 | tts_voice 544 | ] 545 | ) 546 | app.queue( 547 | max_size=20, 548 | api_open=config.api, 549 | ).launch( 550 | share=config.share, 551 | max_threads=1, 552 | allowed_paths=["weights"] 553 | ) -------------------------------------------------------------------------------- /assets/hubert/req-hubert.txt: -------------------------------------------------------------------------------- 1 | put hubert_base.pt here -------------------------------------------------------------------------------- /assets/rvmpe/req-rvmpe.txt: -------------------------------------------------------------------------------- 1 | this is optional for pitch extraction algorithm 2 | put rvmpe.pt here -------------------------------------------------------------------------------- /docs/COMMAND_LINE_ARGUMENTS.md: -------------------------------------------------------------------------------- 1 | ## List of Command Line Argument 2 | 3 | | Option | Description | 4 | |-----------------|--------------------------------------| 5 | | `--share` | Launch with public link | 6 | | `--api` | Launch with api | 7 | | `--unsupported` | Force unsupported feature due to device being unsupported| 8 | -------------------------------------------------------------------------------- /docs/HOW_TO_USE.md: -------------------------------------------------------------------------------- 1 | ## How to use? 2 | 3 | ### Simple 4 | 5 | 1. Delete the all file and folder inside the weights folder 6 | 2. Put all your model inside weights 7 | 8 | ``` 9 | rvc-inference/ 10 | ├─ weights/ 11 | │ ├─ [your model folder #1]/ (Put your pth and index file here) 12 | │ ├─ [your model folder #2]/ (Put your pth and index file here) 13 | │ ├─ ...other model 14 | ├─ ...other stuff 15 | ``` 16 | 4. Done 17 | 18 | Note: Custom character/model name, image file and author may not be available for this. 19 | 20 | ### Advanced Only 21 | 22 | This method is only for advanced user only. 23 | 1. Create folder_info.json inside weights and 24 | create a category folder that contains list of character model. 25 | 26 | ``` 27 | rvc-inference/ 28 | ├─ weights/ 29 | │ ├─ genshin impact/ 30 | │ ├─ folder_info.json 31 | ├─ ...other stuff 32 | ``` 33 | 34 | 2. Inside folder_info.json 35 | ```json 36 | "CATEGORY_TAB_NAME":{ 37 | "enable": true, 38 | "title": "CATEGORY_TITLE", 39 | "folder_path": "CATEGORY_FOLDER_PATH", 40 | "description": "CATEGORY_DESCRIPTION" 41 | } 42 | ``` 43 | 44 | folder_info.json info: 45 | - CATEGORY_TAB_NAME = an category tab name (this one is just a name without spaces, but it wont affect the ui category title) [Required] 46 | - enable = Enabled/Disabled cat [Required] 47 | - title = Title of the category (this one affect the ui category title) [Required] 48 | - folder_path = folder path to the category folder (ex. Genshin Impact) [Required] 49 | - description = Description below the selected tab [Optional] 50 | 51 | 3. Create model_info.json inside the category folder 52 | 53 | ``` 54 | rvc-inference/ 55 | ├─ weights/ 56 | │ ├─ genshin impact/ 57 | │ │ ├─ model_info.json 58 | │ ├─ folder_info.json 59 | ├─ ...other stuff 60 | ``` 61 | 4. Inside model_info.json (If you have more than one model just duplicate and change the value) 62 | ```json 63 | "FOLDER_PATH": { 64 | "enable": true, 65 | "model_path": "CHARACTER_BASEMODEL", 66 | "title": "CHARACTER_NAME", 67 | "cover": "CHARACTER_IMAGE", 68 | "feature_retrieval_library": "CHARACTER_MODEL_INDEX", 69 | "author": "MODEL_AUTHOR" 70 | } 71 | ``` 72 | model_info.json info: 73 | - FOLDER_PATH = folder path to the model [Required] 74 | - enable = Enabled/Disabled model [Required] 75 | - model_path = path to model file (ex. "venti.pth") [Required + must shown the extension] 76 | - title = Title of the character/model (this one affect the ui category title) [Required + must shown the extension] 77 | - cover = folder path to the image file (ex. "image.png") [Optional + must shown the extension] 78 | - feature_retrieval_library = path to index file (ex. "added_IVF4198_Flat_nprobe_1_zhongli-jp_v2.index") [Required] 79 | - author = Author of the model [Optional] 80 | 81 | 1. Put your desired model to your category folder 82 | Example: 83 | ``` 84 | rvc-inference/ 85 | ├─ weights/ 86 | │ ├─ genshin impact/ 87 | │ │ ├─ [your model folder #1]/ (Put your pth and index file here) 88 | │ │ ├─ [your model folder #2]/ (Put your pth and index file here) 89 | │ │ ├─ ...other model 90 | │ │ ├─ model_info.json 91 | │ ├─ folder_info.json 92 | ├─ ...other stuff 93 | ``` 94 | 1. Done. 95 | 96 | Note: 97 | - To add image to the ui just put your image into the model folder and setting up the image path in the folder info.json. 98 | 99 | More detail stuff: 100 | ![ui_example.jpg](./img/ui_example.jpg) -------------------------------------------------------------------------------- /docs/img/ui_example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Multi-Model-RVC-Inference/661936e4dce121c8ad84113f7637308d1642c887/docs/img/ui_example.jpg -------------------------------------------------------------------------------- /lib/config/config.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import torch 4 | from multiprocessing import cpu_count 5 | 6 | class Config: 7 | def __init__(self): 8 | self.device = "cuda:0" 9 | self.is_half = True 10 | self.n_cpu = 0 11 | self.gpu_name = None 12 | self.gpu_mem = None 13 | ( 14 | self.share, 15 | self.api, 16 | self.unsupported, 17 | ) = self.arg_parse() 18 | self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() 19 | 20 | @staticmethod 21 | def arg_parse() -> tuple: 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument("--share", action="store_true", help="Launch with public link") 24 | parser.add_argument("--api", action="store_true", help="Launch with api") 25 | parser.add_argument("--unsupported", action="store_true", help="Enable unsupported feature") 26 | cmd_opts = parser.parse_args() 27 | 28 | return ( 29 | cmd_opts.share, 30 | cmd_opts.api, 31 | cmd_opts.unsupported, 32 | ) 33 | 34 | # has_mps is only available in nightly pytorch (for now) and MasOS 12.3+. 35 | # check `getattr` and try it for compatibility 36 | @staticmethod 37 | def has_mps() -> bool: 38 | if not torch.backends.mps.is_available(): 39 | return False 40 | try: 41 | torch.zeros(1).to(torch.device("mps")) 42 | return True 43 | except Exception: 44 | return False 45 | 46 | def device_config(self) -> tuple: 47 | if torch.cuda.is_available(): 48 | i_device = int(self.device.split(":")[-1]) 49 | self.gpu_name = torch.cuda.get_device_name(i_device) 50 | if ( 51 | ("16" in self.gpu_name and "V100" not in self.gpu_name.upper()) 52 | or "P40" in self.gpu_name.upper() 53 | or "1060" in self.gpu_name 54 | or "1070" in self.gpu_name 55 | or "1080" in self.gpu_name 56 | ): 57 | print("INFO: Found GPU", self.gpu_name, ", force to fp32") 58 | self.is_half = False 59 | else: 60 | print("INFO: Found GPU", self.gpu_name) 61 | self.gpu_mem = int( 62 | torch.cuda.get_device_properties(i_device).total_memory 63 | / 1024 64 | / 1024 65 | / 1024 66 | + 0.4 67 | ) 68 | elif self.has_mps(): 69 | print("INFO: No supported Nvidia GPU found, use MPS instead") 70 | self.device = "mps" 71 | self.is_half = False 72 | else: 73 | print("INFO: No supported Nvidia GPU found, use CPU instead") 74 | self.device = "cpu" 75 | self.is_half = False 76 | 77 | if self.n_cpu == 0: 78 | self.n_cpu = cpu_count() 79 | 80 | if self.is_half: 81 | # 6G显存配置 82 | x_pad = 3 83 | x_query = 10 84 | x_center = 60 85 | x_max = 65 86 | else: 87 | # 5G显存配置 88 | x_pad = 1 89 | x_query = 6 90 | x_center = 38 91 | x_max = 41 92 | 93 | if self.gpu_mem != None and self.gpu_mem <= 4: 94 | x_pad = 1 95 | x_query = 5 96 | x_center = 30 97 | x_max = 32 98 | 99 | return x_pad, x_query, x_center, x_max 100 | -------------------------------------------------------------------------------- /lib/infer_pack/attentions.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | from lib.infer_pack import commons 9 | from lib.infer_pack import modules 10 | from lib.infer_pack.modules import LayerNorm 11 | 12 | 13 | class Encoder(nn.Module): 14 | def __init__( 15 | self, 16 | hidden_channels, 17 | filter_channels, 18 | n_heads, 19 | n_layers, 20 | kernel_size=1, 21 | p_dropout=0.0, 22 | window_size=10, 23 | **kwargs 24 | ): 25 | super().__init__() 26 | self.hidden_channels = hidden_channels 27 | self.filter_channels = filter_channels 28 | self.n_heads = n_heads 29 | self.n_layers = n_layers 30 | self.kernel_size = kernel_size 31 | self.p_dropout = p_dropout 32 | self.window_size = window_size 33 | 34 | self.drop = nn.Dropout(p_dropout) 35 | self.attn_layers = nn.ModuleList() 36 | self.norm_layers_1 = nn.ModuleList() 37 | self.ffn_layers = nn.ModuleList() 38 | self.norm_layers_2 = nn.ModuleList() 39 | for i in range(self.n_layers): 40 | self.attn_layers.append( 41 | MultiHeadAttention( 42 | hidden_channels, 43 | hidden_channels, 44 | n_heads, 45 | p_dropout=p_dropout, 46 | window_size=window_size, 47 | ) 48 | ) 49 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 50 | self.ffn_layers.append( 51 | FFN( 52 | hidden_channels, 53 | hidden_channels, 54 | filter_channels, 55 | kernel_size, 56 | p_dropout=p_dropout, 57 | ) 58 | ) 59 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 60 | 61 | def forward(self, x, x_mask): 62 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 63 | x = x * x_mask 64 | for i in range(self.n_layers): 65 | y = self.attn_layers[i](x, x, attn_mask) 66 | y = self.drop(y) 67 | x = self.norm_layers_1[i](x + y) 68 | 69 | y = self.ffn_layers[i](x, x_mask) 70 | y = self.drop(y) 71 | x = self.norm_layers_2[i](x + y) 72 | x = x * x_mask 73 | return x 74 | 75 | 76 | class Decoder(nn.Module): 77 | def __init__( 78 | self, 79 | hidden_channels, 80 | filter_channels, 81 | n_heads, 82 | n_layers, 83 | kernel_size=1, 84 | p_dropout=0.0, 85 | proximal_bias=False, 86 | proximal_init=True, 87 | **kwargs 88 | ): 89 | super().__init__() 90 | self.hidden_channels = hidden_channels 91 | self.filter_channels = filter_channels 92 | self.n_heads = n_heads 93 | self.n_layers = n_layers 94 | self.kernel_size = kernel_size 95 | self.p_dropout = p_dropout 96 | self.proximal_bias = proximal_bias 97 | self.proximal_init = proximal_init 98 | 99 | self.drop = nn.Dropout(p_dropout) 100 | self.self_attn_layers = nn.ModuleList() 101 | self.norm_layers_0 = nn.ModuleList() 102 | self.encdec_attn_layers = nn.ModuleList() 103 | self.norm_layers_1 = nn.ModuleList() 104 | self.ffn_layers = nn.ModuleList() 105 | self.norm_layers_2 = nn.ModuleList() 106 | for i in range(self.n_layers): 107 | self.self_attn_layers.append( 108 | MultiHeadAttention( 109 | hidden_channels, 110 | hidden_channels, 111 | n_heads, 112 | p_dropout=p_dropout, 113 | proximal_bias=proximal_bias, 114 | proximal_init=proximal_init, 115 | ) 116 | ) 117 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 118 | self.encdec_attn_layers.append( 119 | MultiHeadAttention( 120 | hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout 121 | ) 122 | ) 123 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 124 | self.ffn_layers.append( 125 | FFN( 126 | hidden_channels, 127 | hidden_channels, 128 | filter_channels, 129 | kernel_size, 130 | p_dropout=p_dropout, 131 | causal=True, 132 | ) 133 | ) 134 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 135 | 136 | def forward(self, x, x_mask, h, h_mask): 137 | """ 138 | x: decoder input 139 | h: encoder output 140 | """ 141 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( 142 | device=x.device, dtype=x.dtype 143 | ) 144 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 145 | x = x * x_mask 146 | for i in range(self.n_layers): 147 | y = self.self_attn_layers[i](x, x, self_attn_mask) 148 | y = self.drop(y) 149 | x = self.norm_layers_0[i](x + y) 150 | 151 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) 152 | y = self.drop(y) 153 | x = self.norm_layers_1[i](x + y) 154 | 155 | y = self.ffn_layers[i](x, x_mask) 156 | y = self.drop(y) 157 | x = self.norm_layers_2[i](x + y) 158 | x = x * x_mask 159 | return x 160 | 161 | 162 | class MultiHeadAttention(nn.Module): 163 | def __init__( 164 | self, 165 | channels, 166 | out_channels, 167 | n_heads, 168 | p_dropout=0.0, 169 | window_size=None, 170 | heads_share=True, 171 | block_length=None, 172 | proximal_bias=False, 173 | proximal_init=False, 174 | ): 175 | super().__init__() 176 | assert channels % n_heads == 0 177 | 178 | self.channels = channels 179 | self.out_channels = out_channels 180 | self.n_heads = n_heads 181 | self.p_dropout = p_dropout 182 | self.window_size = window_size 183 | self.heads_share = heads_share 184 | self.block_length = block_length 185 | self.proximal_bias = proximal_bias 186 | self.proximal_init = proximal_init 187 | self.attn = None 188 | 189 | self.k_channels = channels // n_heads 190 | self.conv_q = nn.Conv1d(channels, channels, 1) 191 | self.conv_k = nn.Conv1d(channels, channels, 1) 192 | self.conv_v = nn.Conv1d(channels, channels, 1) 193 | self.conv_o = nn.Conv1d(channels, out_channels, 1) 194 | self.drop = nn.Dropout(p_dropout) 195 | 196 | if window_size is not None: 197 | n_heads_rel = 1 if heads_share else n_heads 198 | rel_stddev = self.k_channels**-0.5 199 | self.emb_rel_k = nn.Parameter( 200 | torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) 201 | * rel_stddev 202 | ) 203 | self.emb_rel_v = nn.Parameter( 204 | torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) 205 | * rel_stddev 206 | ) 207 | 208 | nn.init.xavier_uniform_(self.conv_q.weight) 209 | nn.init.xavier_uniform_(self.conv_k.weight) 210 | nn.init.xavier_uniform_(self.conv_v.weight) 211 | if proximal_init: 212 | with torch.no_grad(): 213 | self.conv_k.weight.copy_(self.conv_q.weight) 214 | self.conv_k.bias.copy_(self.conv_q.bias) 215 | 216 | def forward(self, x, c, attn_mask=None): 217 | q = self.conv_q(x) 218 | k = self.conv_k(c) 219 | v = self.conv_v(c) 220 | 221 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 222 | 223 | x = self.conv_o(x) 224 | return x 225 | 226 | def attention(self, query, key, value, mask=None): 227 | # reshape [b, d, t] -> [b, n_h, t, d_k] 228 | b, d, t_s, t_t = (*key.size(), query.size(2)) 229 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 230 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 231 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 232 | 233 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) 234 | if self.window_size is not None: 235 | assert ( 236 | t_s == t_t 237 | ), "Relative attention is only available for self-attention." 238 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 239 | rel_logits = self._matmul_with_relative_keys( 240 | query / math.sqrt(self.k_channels), key_relative_embeddings 241 | ) 242 | scores_local = self._relative_position_to_absolute_position(rel_logits) 243 | scores = scores + scores_local 244 | if self.proximal_bias: 245 | assert t_s == t_t, "Proximal bias is only available for self-attention." 246 | scores = scores + self._attention_bias_proximal(t_s).to( 247 | device=scores.device, dtype=scores.dtype 248 | ) 249 | if mask is not None: 250 | scores = scores.masked_fill(mask == 0, -1e4) 251 | if self.block_length is not None: 252 | assert ( 253 | t_s == t_t 254 | ), "Local attention is only available for self-attention." 255 | block_mask = ( 256 | torch.ones_like(scores) 257 | .triu(-self.block_length) 258 | .tril(self.block_length) 259 | ) 260 | scores = scores.masked_fill(block_mask == 0, -1e4) 261 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 262 | p_attn = self.drop(p_attn) 263 | output = torch.matmul(p_attn, value) 264 | if self.window_size is not None: 265 | relative_weights = self._absolute_position_to_relative_position(p_attn) 266 | value_relative_embeddings = self._get_relative_embeddings( 267 | self.emb_rel_v, t_s 268 | ) 269 | output = output + self._matmul_with_relative_values( 270 | relative_weights, value_relative_embeddings 271 | ) 272 | output = ( 273 | output.transpose(2, 3).contiguous().view(b, d, t_t) 274 | ) # [b, n_h, t_t, d_k] -> [b, d, t_t] 275 | return output, p_attn 276 | 277 | def _matmul_with_relative_values(self, x, y): 278 | """ 279 | x: [b, h, l, m] 280 | y: [h or 1, m, d] 281 | ret: [b, h, l, d] 282 | """ 283 | ret = torch.matmul(x, y.unsqueeze(0)) 284 | return ret 285 | 286 | def _matmul_with_relative_keys(self, x, y): 287 | """ 288 | x: [b, h, l, d] 289 | y: [h or 1, m, d] 290 | ret: [b, h, l, m] 291 | """ 292 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 293 | return ret 294 | 295 | def _get_relative_embeddings(self, relative_embeddings, length): 296 | max_relative_position = 2 * self.window_size + 1 297 | # Pad first before slice to avoid using cond ops. 298 | pad_length = max(length - (self.window_size + 1), 0) 299 | slice_start_position = max((self.window_size + 1) - length, 0) 300 | slice_end_position = slice_start_position + 2 * length - 1 301 | if pad_length > 0: 302 | padded_relative_embeddings = F.pad( 303 | relative_embeddings, 304 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), 305 | ) 306 | else: 307 | padded_relative_embeddings = relative_embeddings 308 | used_relative_embeddings = padded_relative_embeddings[ 309 | :, slice_start_position:slice_end_position 310 | ] 311 | return used_relative_embeddings 312 | 313 | def _relative_position_to_absolute_position(self, x): 314 | """ 315 | x: [b, h, l, 2*l-1] 316 | ret: [b, h, l, l] 317 | """ 318 | batch, heads, length, _ = x.size() 319 | # Concat columns of pad to shift from relative to absolute indexing. 320 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) 321 | 322 | # Concat extra elements so to add up to shape (len+1, 2*len-1). 323 | x_flat = x.view([batch, heads, length * 2 * length]) 324 | x_flat = F.pad( 325 | x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) 326 | ) 327 | 328 | # Reshape and slice out the padded elements. 329 | x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ 330 | :, :, :length, length - 1 : 331 | ] 332 | return x_final 333 | 334 | def _absolute_position_to_relative_position(self, x): 335 | """ 336 | x: [b, h, l, l] 337 | ret: [b, h, l, 2*l-1] 338 | """ 339 | batch, heads, length, _ = x.size() 340 | # padd along column 341 | x = F.pad( 342 | x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) 343 | ) 344 | x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) 345 | # add 0's in the beginning that will skew the elements after reshape 346 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 347 | x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] 348 | return x_final 349 | 350 | def _attention_bias_proximal(self, length): 351 | """Bias for self-attention to encourage attention to close positions. 352 | Args: 353 | length: an integer scalar. 354 | Returns: 355 | a Tensor with shape [1, 1, length, length] 356 | """ 357 | r = torch.arange(length, dtype=torch.float32) 358 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 359 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 360 | 361 | 362 | class FFN(nn.Module): 363 | def __init__( 364 | self, 365 | in_channels, 366 | out_channels, 367 | filter_channels, 368 | kernel_size, 369 | p_dropout=0.0, 370 | activation=None, 371 | causal=False, 372 | ): 373 | super().__init__() 374 | self.in_channels = in_channels 375 | self.out_channels = out_channels 376 | self.filter_channels = filter_channels 377 | self.kernel_size = kernel_size 378 | self.p_dropout = p_dropout 379 | self.activation = activation 380 | self.causal = causal 381 | 382 | if causal: 383 | self.padding = self._causal_padding 384 | else: 385 | self.padding = self._same_padding 386 | 387 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) 388 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) 389 | self.drop = nn.Dropout(p_dropout) 390 | 391 | def forward(self, x, x_mask): 392 | x = self.conv_1(self.padding(x * x_mask)) 393 | if self.activation == "gelu": 394 | x = x * torch.sigmoid(1.702 * x) 395 | else: 396 | x = torch.relu(x) 397 | x = self.drop(x) 398 | x = self.conv_2(self.padding(x * x_mask)) 399 | return x * x_mask 400 | 401 | def _causal_padding(self, x): 402 | if self.kernel_size == 1: 403 | return x 404 | pad_l = self.kernel_size - 1 405 | pad_r = 0 406 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 407 | x = F.pad(x, commons.convert_pad_shape(padding)) 408 | return x 409 | 410 | def _same_padding(self, x): 411 | if self.kernel_size == 1: 412 | return x 413 | pad_l = (self.kernel_size - 1) // 2 414 | pad_r = self.kernel_size // 2 415 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 416 | x = F.pad(x, commons.convert_pad_shape(padding)) 417 | return x 418 | -------------------------------------------------------------------------------- /lib/infer_pack/commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | 8 | def init_weights(m, mean=0.0, std=0.01): 9 | classname = m.__class__.__name__ 10 | if classname.find("Conv") != -1: 11 | m.weight.data.normal_(mean, std) 12 | 13 | 14 | def get_padding(kernel_size, dilation=1): 15 | return int((kernel_size * dilation - dilation) / 2) 16 | 17 | 18 | def convert_pad_shape(pad_shape): 19 | l = pad_shape[::-1] 20 | pad_shape = [item for sublist in l for item in sublist] 21 | return pad_shape 22 | 23 | 24 | def kl_divergence(m_p, logs_p, m_q, logs_q): 25 | """KL(P||Q)""" 26 | kl = (logs_q - logs_p) - 0.5 27 | kl += ( 28 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) 29 | ) 30 | return kl 31 | 32 | 33 | def rand_gumbel(shape): 34 | """Sample from the Gumbel distribution, protect from overflows.""" 35 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 36 | return -torch.log(-torch.log(uniform_samples)) 37 | 38 | 39 | def rand_gumbel_like(x): 40 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 41 | return g 42 | 43 | 44 | def slice_segments(x, ids_str, segment_size=4): 45 | ret = torch.zeros_like(x[:, :, :segment_size]) 46 | for i in range(x.size(0)): 47 | idx_str = ids_str[i] 48 | idx_end = idx_str + segment_size 49 | ret[i] = x[i, :, idx_str:idx_end] 50 | return ret 51 | 52 | 53 | def slice_segments2(x, ids_str, segment_size=4): 54 | ret = torch.zeros_like(x[:, :segment_size]) 55 | for i in range(x.size(0)): 56 | idx_str = ids_str[i] 57 | idx_end = idx_str + segment_size 58 | ret[i] = x[i, idx_str:idx_end] 59 | return ret 60 | 61 | 62 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 63 | b, d, t = x.size() 64 | if x_lengths is None: 65 | x_lengths = t 66 | ids_str_max = x_lengths - segment_size + 1 67 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 68 | ret = slice_segments(x, ids_str, segment_size) 69 | return ret, ids_str 70 | 71 | 72 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): 73 | position = torch.arange(length, dtype=torch.float) 74 | num_timescales = channels // 2 75 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( 76 | num_timescales - 1 77 | ) 78 | inv_timescales = min_timescale * torch.exp( 79 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment 80 | ) 81 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 82 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) 83 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 84 | signal = signal.view(1, channels, length) 85 | return signal 86 | 87 | 88 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 89 | b, channels, length = x.size() 90 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 91 | return x + signal.to(dtype=x.dtype, device=x.device) 92 | 93 | 94 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 95 | b, channels, length = x.size() 96 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 97 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 98 | 99 | 100 | def subsequent_mask(length): 101 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 102 | return mask 103 | 104 | 105 | @torch.jit.script 106 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 107 | n_channels_int = n_channels[0] 108 | in_act = input_a + input_b 109 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 110 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 111 | acts = t_act * s_act 112 | return acts 113 | 114 | 115 | def convert_pad_shape(pad_shape): 116 | l = pad_shape[::-1] 117 | pad_shape = [item for sublist in l for item in sublist] 118 | return pad_shape 119 | 120 | 121 | def shift_1d(x): 122 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 123 | return x 124 | 125 | 126 | def sequence_mask(length, max_length=None): 127 | if max_length is None: 128 | max_length = length.max() 129 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 130 | return x.unsqueeze(0) < length.unsqueeze(1) 131 | 132 | 133 | def generate_path(duration, mask): 134 | """ 135 | duration: [b, 1, t_x] 136 | mask: [b, 1, t_y, t_x] 137 | """ 138 | device = duration.device 139 | 140 | b, _, t_y, t_x = mask.shape 141 | cum_duration = torch.cumsum(duration, -1) 142 | 143 | cum_duration_flat = cum_duration.view(b * t_x) 144 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 145 | path = path.view(b, t_x, t_y) 146 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 147 | path = path.unsqueeze(1).transpose(2, 3) * mask 148 | return path 149 | 150 | 151 | def clip_grad_value_(parameters, clip_value, norm_type=2): 152 | if isinstance(parameters, torch.Tensor): 153 | parameters = [parameters] 154 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 155 | norm_type = float(norm_type) 156 | if clip_value is not None: 157 | clip_value = float(clip_value) 158 | 159 | total_norm = 0 160 | for p in parameters: 161 | param_norm = p.grad.data.norm(norm_type) 162 | total_norm += param_norm.item() ** norm_type 163 | if clip_value is not None: 164 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 165 | total_norm = total_norm ** (1.0 / norm_type) 166 | return total_norm 167 | -------------------------------------------------------------------------------- /lib/infer_pack/models_onnx.py: -------------------------------------------------------------------------------- 1 | import math, pdb, os 2 | from time import time as ttime 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | from lib.infer_pack import modules 7 | from lib.infer_pack import attentions 8 | from lib.infer_pack import commons 9 | from lib.infer_pack.commons import init_weights, get_padding 10 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 11 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 12 | from lib.infer_pack.commons import init_weights 13 | import numpy as np 14 | from lib.infer_pack import commons 15 | 16 | 17 | class TextEncoder256(nn.Module): 18 | def __init__( 19 | self, 20 | out_channels, 21 | hidden_channels, 22 | filter_channels, 23 | n_heads, 24 | n_layers, 25 | kernel_size, 26 | p_dropout, 27 | f0=True, 28 | ): 29 | super().__init__() 30 | self.out_channels = out_channels 31 | self.hidden_channels = hidden_channels 32 | self.filter_channels = filter_channels 33 | self.n_heads = n_heads 34 | self.n_layers = n_layers 35 | self.kernel_size = kernel_size 36 | self.p_dropout = p_dropout 37 | self.emb_phone = nn.Linear(256, hidden_channels) 38 | self.lrelu = nn.LeakyReLU(0.1, inplace=True) 39 | if f0 == True: 40 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 41 | self.encoder = attentions.Encoder( 42 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout 43 | ) 44 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 45 | 46 | def forward(self, phone, pitch, lengths): 47 | if pitch == None: 48 | x = self.emb_phone(phone) 49 | else: 50 | x = self.emb_phone(phone) + self.emb_pitch(pitch) 51 | x = x * math.sqrt(self.hidden_channels) # [b, t, h] 52 | x = self.lrelu(x) 53 | x = torch.transpose(x, 1, -1) # [b, h, t] 54 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( 55 | x.dtype 56 | ) 57 | x = self.encoder(x * x_mask, x_mask) 58 | stats = self.proj(x) * x_mask 59 | 60 | m, logs = torch.split(stats, self.out_channels, dim=1) 61 | return m, logs, x_mask 62 | 63 | 64 | class TextEncoder768(nn.Module): 65 | def __init__( 66 | self, 67 | out_channels, 68 | hidden_channels, 69 | filter_channels, 70 | n_heads, 71 | n_layers, 72 | kernel_size, 73 | p_dropout, 74 | f0=True, 75 | ): 76 | super().__init__() 77 | self.out_channels = out_channels 78 | self.hidden_channels = hidden_channels 79 | self.filter_channels = filter_channels 80 | self.n_heads = n_heads 81 | self.n_layers = n_layers 82 | self.kernel_size = kernel_size 83 | self.p_dropout = p_dropout 84 | self.emb_phone = nn.Linear(768, hidden_channels) 85 | self.lrelu = nn.LeakyReLU(0.1, inplace=True) 86 | if f0 == True: 87 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 88 | self.encoder = attentions.Encoder( 89 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout 90 | ) 91 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 92 | 93 | def forward(self, phone, pitch, lengths): 94 | if pitch == None: 95 | x = self.emb_phone(phone) 96 | else: 97 | x = self.emb_phone(phone) + self.emb_pitch(pitch) 98 | x = x * math.sqrt(self.hidden_channels) # [b, t, h] 99 | x = self.lrelu(x) 100 | x = torch.transpose(x, 1, -1) # [b, h, t] 101 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( 102 | x.dtype 103 | ) 104 | x = self.encoder(x * x_mask, x_mask) 105 | stats = self.proj(x) * x_mask 106 | 107 | m, logs = torch.split(stats, self.out_channels, dim=1) 108 | return m, logs, x_mask 109 | 110 | 111 | class ResidualCouplingBlock(nn.Module): 112 | def __init__( 113 | self, 114 | channels, 115 | hidden_channels, 116 | kernel_size, 117 | dilation_rate, 118 | n_layers, 119 | n_flows=4, 120 | gin_channels=0, 121 | ): 122 | super().__init__() 123 | self.channels = channels 124 | self.hidden_channels = hidden_channels 125 | self.kernel_size = kernel_size 126 | self.dilation_rate = dilation_rate 127 | self.n_layers = n_layers 128 | self.n_flows = n_flows 129 | self.gin_channels = gin_channels 130 | 131 | self.flows = nn.ModuleList() 132 | for i in range(n_flows): 133 | self.flows.append( 134 | modules.ResidualCouplingLayer( 135 | channels, 136 | hidden_channels, 137 | kernel_size, 138 | dilation_rate, 139 | n_layers, 140 | gin_channels=gin_channels, 141 | mean_only=True, 142 | ) 143 | ) 144 | self.flows.append(modules.Flip()) 145 | 146 | def forward(self, x, x_mask, g=None, reverse=False): 147 | if not reverse: 148 | for flow in self.flows: 149 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 150 | else: 151 | for flow in reversed(self.flows): 152 | x = flow(x, x_mask, g=g, reverse=reverse) 153 | return x 154 | 155 | def remove_weight_norm(self): 156 | for i in range(self.n_flows): 157 | self.flows[i * 2].remove_weight_norm() 158 | 159 | 160 | class PosteriorEncoder(nn.Module): 161 | def __init__( 162 | self, 163 | in_channels, 164 | out_channels, 165 | hidden_channels, 166 | kernel_size, 167 | dilation_rate, 168 | n_layers, 169 | gin_channels=0, 170 | ): 171 | super().__init__() 172 | self.in_channels = in_channels 173 | self.out_channels = out_channels 174 | self.hidden_channels = hidden_channels 175 | self.kernel_size = kernel_size 176 | self.dilation_rate = dilation_rate 177 | self.n_layers = n_layers 178 | self.gin_channels = gin_channels 179 | 180 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 181 | self.enc = modules.WN( 182 | hidden_channels, 183 | kernel_size, 184 | dilation_rate, 185 | n_layers, 186 | gin_channels=gin_channels, 187 | ) 188 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 189 | 190 | def forward(self, x, x_lengths, g=None): 191 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( 192 | x.dtype 193 | ) 194 | x = self.pre(x) * x_mask 195 | x = self.enc(x, x_mask, g=g) 196 | stats = self.proj(x) * x_mask 197 | m, logs = torch.split(stats, self.out_channels, dim=1) 198 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 199 | return z, m, logs, x_mask 200 | 201 | def remove_weight_norm(self): 202 | self.enc.remove_weight_norm() 203 | 204 | 205 | class Generator(torch.nn.Module): 206 | def __init__( 207 | self, 208 | initial_channel, 209 | resblock, 210 | resblock_kernel_sizes, 211 | resblock_dilation_sizes, 212 | upsample_rates, 213 | upsample_initial_channel, 214 | upsample_kernel_sizes, 215 | gin_channels=0, 216 | ): 217 | super(Generator, self).__init__() 218 | self.num_kernels = len(resblock_kernel_sizes) 219 | self.num_upsamples = len(upsample_rates) 220 | self.conv_pre = Conv1d( 221 | initial_channel, upsample_initial_channel, 7, 1, padding=3 222 | ) 223 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 224 | 225 | self.ups = nn.ModuleList() 226 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 227 | self.ups.append( 228 | weight_norm( 229 | ConvTranspose1d( 230 | upsample_initial_channel // (2**i), 231 | upsample_initial_channel // (2 ** (i + 1)), 232 | k, 233 | u, 234 | padding=(k - u) // 2, 235 | ) 236 | ) 237 | ) 238 | 239 | self.resblocks = nn.ModuleList() 240 | for i in range(len(self.ups)): 241 | ch = upsample_initial_channel // (2 ** (i + 1)) 242 | for j, (k, d) in enumerate( 243 | zip(resblock_kernel_sizes, resblock_dilation_sizes) 244 | ): 245 | self.resblocks.append(resblock(ch, k, d)) 246 | 247 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 248 | self.ups.apply(init_weights) 249 | 250 | if gin_channels != 0: 251 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 252 | 253 | def forward(self, x, g=None): 254 | x = self.conv_pre(x) 255 | if g is not None: 256 | x = x + self.cond(g) 257 | 258 | for i in range(self.num_upsamples): 259 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 260 | x = self.ups[i](x) 261 | xs = None 262 | for j in range(self.num_kernels): 263 | if xs is None: 264 | xs = self.resblocks[i * self.num_kernels + j](x) 265 | else: 266 | xs += self.resblocks[i * self.num_kernels + j](x) 267 | x = xs / self.num_kernels 268 | x = F.leaky_relu(x) 269 | x = self.conv_post(x) 270 | x = torch.tanh(x) 271 | 272 | return x 273 | 274 | def remove_weight_norm(self): 275 | for l in self.ups: 276 | remove_weight_norm(l) 277 | for l in self.resblocks: 278 | l.remove_weight_norm() 279 | 280 | 281 | class SineGen(torch.nn.Module): 282 | """Definition of sine generator 283 | SineGen(samp_rate, harmonic_num = 0, 284 | sine_amp = 0.1, noise_std = 0.003, 285 | voiced_threshold = 0, 286 | flag_for_pulse=False) 287 | samp_rate: sampling rate in Hz 288 | harmonic_num: number of harmonic overtones (default 0) 289 | sine_amp: amplitude of sine-wavefrom (default 0.1) 290 | noise_std: std of Gaussian noise (default 0.003) 291 | voiced_thoreshold: F0 threshold for U/V classification (default 0) 292 | flag_for_pulse: this SinGen is used inside PulseGen (default False) 293 | Note: when flag_for_pulse is True, the first time step of a voiced 294 | segment is always sin(np.pi) or cos(0) 295 | """ 296 | 297 | def __init__( 298 | self, 299 | samp_rate, 300 | harmonic_num=0, 301 | sine_amp=0.1, 302 | noise_std=0.003, 303 | voiced_threshold=0, 304 | flag_for_pulse=False, 305 | ): 306 | super(SineGen, self).__init__() 307 | self.sine_amp = sine_amp 308 | self.noise_std = noise_std 309 | self.harmonic_num = harmonic_num 310 | self.dim = self.harmonic_num + 1 311 | self.sampling_rate = samp_rate 312 | self.voiced_threshold = voiced_threshold 313 | 314 | def _f02uv(self, f0): 315 | # generate uv signal 316 | uv = torch.ones_like(f0) 317 | uv = uv * (f0 > self.voiced_threshold) 318 | return uv 319 | 320 | def forward(self, f0, upp): 321 | """sine_tensor, uv = forward(f0) 322 | input F0: tensor(batchsize=1, length, dim=1) 323 | f0 for unvoiced steps should be 0 324 | output sine_tensor: tensor(batchsize=1, length, dim) 325 | output uv: tensor(batchsize=1, length, 1) 326 | """ 327 | with torch.no_grad(): 328 | f0 = f0[:, None].transpose(1, 2) 329 | f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) 330 | # fundamental component 331 | f0_buf[:, :, 0] = f0[:, :, 0] 332 | for idx in np.arange(self.harmonic_num): 333 | f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( 334 | idx + 2 335 | ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic 336 | rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 337 | rand_ini = torch.rand( 338 | f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device 339 | ) 340 | rand_ini[:, 0] = 0 341 | rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini 342 | tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 343 | tmp_over_one *= upp 344 | tmp_over_one = F.interpolate( 345 | tmp_over_one.transpose(2, 1), 346 | scale_factor=upp, 347 | mode="linear", 348 | align_corners=True, 349 | ).transpose(2, 1) 350 | rad_values = F.interpolate( 351 | rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" 352 | ).transpose( 353 | 2, 1 354 | ) ####### 355 | tmp_over_one %= 1 356 | tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 357 | cumsum_shift = torch.zeros_like(rad_values) 358 | cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 359 | sine_waves = torch.sin( 360 | torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi 361 | ) 362 | sine_waves = sine_waves * self.sine_amp 363 | uv = self._f02uv(f0) 364 | uv = F.interpolate( 365 | uv.transpose(2, 1), scale_factor=upp, mode="nearest" 366 | ).transpose(2, 1) 367 | noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 368 | noise = noise_amp * torch.randn_like(sine_waves) 369 | sine_waves = sine_waves * uv + noise 370 | return sine_waves, uv, noise 371 | 372 | 373 | class SourceModuleHnNSF(torch.nn.Module): 374 | """SourceModule for hn-nsf 375 | SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, 376 | add_noise_std=0.003, voiced_threshod=0) 377 | sampling_rate: sampling_rate in Hz 378 | harmonic_num: number of harmonic above F0 (default: 0) 379 | sine_amp: amplitude of sine source signal (default: 0.1) 380 | add_noise_std: std of additive Gaussian noise (default: 0.003) 381 | note that amplitude of noise in unvoiced is decided 382 | by sine_amp 383 | voiced_threshold: threhold to set U/V given F0 (default: 0) 384 | Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) 385 | F0_sampled (batchsize, length, 1) 386 | Sine_source (batchsize, length, 1) 387 | noise_source (batchsize, length 1) 388 | uv (batchsize, length, 1) 389 | """ 390 | 391 | def __init__( 392 | self, 393 | sampling_rate, 394 | harmonic_num=0, 395 | sine_amp=0.1, 396 | add_noise_std=0.003, 397 | voiced_threshod=0, 398 | is_half=True, 399 | ): 400 | super(SourceModuleHnNSF, self).__init__() 401 | 402 | self.sine_amp = sine_amp 403 | self.noise_std = add_noise_std 404 | self.is_half = is_half 405 | # to produce sine waveforms 406 | self.l_sin_gen = SineGen( 407 | sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod 408 | ) 409 | 410 | # to merge source harmonics into a single excitation 411 | self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) 412 | self.l_tanh = torch.nn.Tanh() 413 | 414 | def forward(self, x, upp=None): 415 | sine_wavs, uv, _ = self.l_sin_gen(x, upp) 416 | if self.is_half: 417 | sine_wavs = sine_wavs.half() 418 | sine_merge = self.l_tanh(self.l_linear(sine_wavs)) 419 | return sine_merge, None, None # noise, uv 420 | 421 | 422 | class GeneratorNSF(torch.nn.Module): 423 | def __init__( 424 | self, 425 | initial_channel, 426 | resblock, 427 | resblock_kernel_sizes, 428 | resblock_dilation_sizes, 429 | upsample_rates, 430 | upsample_initial_channel, 431 | upsample_kernel_sizes, 432 | gin_channels, 433 | sr, 434 | is_half=False, 435 | ): 436 | super(GeneratorNSF, self).__init__() 437 | self.num_kernels = len(resblock_kernel_sizes) 438 | self.num_upsamples = len(upsample_rates) 439 | 440 | self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) 441 | self.m_source = SourceModuleHnNSF( 442 | sampling_rate=sr, harmonic_num=0, is_half=is_half 443 | ) 444 | self.noise_convs = nn.ModuleList() 445 | self.conv_pre = Conv1d( 446 | initial_channel, upsample_initial_channel, 7, 1, padding=3 447 | ) 448 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 449 | 450 | self.ups = nn.ModuleList() 451 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 452 | c_cur = upsample_initial_channel // (2 ** (i + 1)) 453 | self.ups.append( 454 | weight_norm( 455 | ConvTranspose1d( 456 | upsample_initial_channel // (2**i), 457 | upsample_initial_channel // (2 ** (i + 1)), 458 | k, 459 | u, 460 | padding=(k - u) // 2, 461 | ) 462 | ) 463 | ) 464 | if i + 1 < len(upsample_rates): 465 | stride_f0 = np.prod(upsample_rates[i + 1 :]) 466 | self.noise_convs.append( 467 | Conv1d( 468 | 1, 469 | c_cur, 470 | kernel_size=stride_f0 * 2, 471 | stride=stride_f0, 472 | padding=stride_f0 // 2, 473 | ) 474 | ) 475 | else: 476 | self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) 477 | 478 | self.resblocks = nn.ModuleList() 479 | for i in range(len(self.ups)): 480 | ch = upsample_initial_channel // (2 ** (i + 1)) 481 | for j, (k, d) in enumerate( 482 | zip(resblock_kernel_sizes, resblock_dilation_sizes) 483 | ): 484 | self.resblocks.append(resblock(ch, k, d)) 485 | 486 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 487 | self.ups.apply(init_weights) 488 | 489 | if gin_channels != 0: 490 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 491 | 492 | self.upp = np.prod(upsample_rates) 493 | 494 | def forward(self, x, f0, g=None): 495 | har_source, noi_source, uv = self.m_source(f0, self.upp) 496 | har_source = har_source.transpose(1, 2) 497 | x = self.conv_pre(x) 498 | if g is not None: 499 | x = x + self.cond(g) 500 | 501 | for i in range(self.num_upsamples): 502 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 503 | x = self.ups[i](x) 504 | x_source = self.noise_convs[i](har_source) 505 | x = x + x_source 506 | xs = None 507 | for j in range(self.num_kernels): 508 | if xs is None: 509 | xs = self.resblocks[i * self.num_kernels + j](x) 510 | else: 511 | xs += self.resblocks[i * self.num_kernels + j](x) 512 | x = xs / self.num_kernels 513 | x = F.leaky_relu(x) 514 | x = self.conv_post(x) 515 | x = torch.tanh(x) 516 | return x 517 | 518 | def remove_weight_norm(self): 519 | for l in self.ups: 520 | remove_weight_norm(l) 521 | for l in self.resblocks: 522 | l.remove_weight_norm() 523 | 524 | 525 | sr2sr = { 526 | "32k": 32000, 527 | "40k": 40000, 528 | "48k": 48000, 529 | } 530 | 531 | 532 | class SynthesizerTrnMsNSFsidM(nn.Module): 533 | def __init__( 534 | self, 535 | spec_channels, 536 | segment_size, 537 | inter_channels, 538 | hidden_channels, 539 | filter_channels, 540 | n_heads, 541 | n_layers, 542 | kernel_size, 543 | p_dropout, 544 | resblock, 545 | resblock_kernel_sizes, 546 | resblock_dilation_sizes, 547 | upsample_rates, 548 | upsample_initial_channel, 549 | upsample_kernel_sizes, 550 | spk_embed_dim, 551 | gin_channels, 552 | sr, 553 | version, 554 | **kwargs 555 | ): 556 | super().__init__() 557 | if type(sr) == type("strr"): 558 | sr = sr2sr[sr] 559 | self.spec_channels = spec_channels 560 | self.inter_channels = inter_channels 561 | self.hidden_channels = hidden_channels 562 | self.filter_channels = filter_channels 563 | self.n_heads = n_heads 564 | self.n_layers = n_layers 565 | self.kernel_size = kernel_size 566 | self.p_dropout = p_dropout 567 | self.resblock = resblock 568 | self.resblock_kernel_sizes = resblock_kernel_sizes 569 | self.resblock_dilation_sizes = resblock_dilation_sizes 570 | self.upsample_rates = upsample_rates 571 | self.upsample_initial_channel = upsample_initial_channel 572 | self.upsample_kernel_sizes = upsample_kernel_sizes 573 | self.segment_size = segment_size 574 | self.gin_channels = gin_channels 575 | # self.hop_length = hop_length# 576 | self.spk_embed_dim = spk_embed_dim 577 | if version == "v1": 578 | self.enc_p = TextEncoder256( 579 | inter_channels, 580 | hidden_channels, 581 | filter_channels, 582 | n_heads, 583 | n_layers, 584 | kernel_size, 585 | p_dropout, 586 | ) 587 | else: 588 | self.enc_p = TextEncoder768( 589 | inter_channels, 590 | hidden_channels, 591 | filter_channels, 592 | n_heads, 593 | n_layers, 594 | kernel_size, 595 | p_dropout, 596 | ) 597 | self.dec = GeneratorNSF( 598 | inter_channels, 599 | resblock, 600 | resblock_kernel_sizes, 601 | resblock_dilation_sizes, 602 | upsample_rates, 603 | upsample_initial_channel, 604 | upsample_kernel_sizes, 605 | gin_channels=gin_channels, 606 | sr=sr, 607 | is_half=kwargs["is_half"], 608 | ) 609 | self.enc_q = PosteriorEncoder( 610 | spec_channels, 611 | inter_channels, 612 | hidden_channels, 613 | 5, 614 | 1, 615 | 16, 616 | gin_channels=gin_channels, 617 | ) 618 | self.flow = ResidualCouplingBlock( 619 | inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels 620 | ) 621 | self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) 622 | self.speaker_map = None 623 | print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) 624 | 625 | def remove_weight_norm(self): 626 | self.dec.remove_weight_norm() 627 | self.flow.remove_weight_norm() 628 | self.enc_q.remove_weight_norm() 629 | 630 | def construct_spkmixmap(self, n_speaker): 631 | self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels)) 632 | for i in range(n_speaker): 633 | self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]])) 634 | self.speaker_map = self.speaker_map.unsqueeze(0) 635 | 636 | def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None): 637 | if self.speaker_map is not None: # [N, S] * [S, B, 1, H] 638 | g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1] 639 | g = g * self.speaker_map # [N, S, B, 1, H] 640 | g = torch.sum(g, dim=1) # [N, 1, B, 1, H] 641 | g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N] 642 | else: 643 | g = g.unsqueeze(0) 644 | g = self.emb_g(g).transpose(1, 2) 645 | 646 | m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) 647 | z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask 648 | z = self.flow(z_p, x_mask, g=g, reverse=True) 649 | o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) 650 | return o 651 | 652 | 653 | class MultiPeriodDiscriminator(torch.nn.Module): 654 | def __init__(self, use_spectral_norm=False): 655 | super(MultiPeriodDiscriminator, self).__init__() 656 | periods = [2, 3, 5, 7, 11, 17] 657 | # periods = [3, 5, 7, 11, 17, 23, 37] 658 | 659 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 660 | discs = discs + [ 661 | DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods 662 | ] 663 | self.discriminators = nn.ModuleList(discs) 664 | 665 | def forward(self, y, y_hat): 666 | y_d_rs = [] # 667 | y_d_gs = [] 668 | fmap_rs = [] 669 | fmap_gs = [] 670 | for i, d in enumerate(self.discriminators): 671 | y_d_r, fmap_r = d(y) 672 | y_d_g, fmap_g = d(y_hat) 673 | # for j in range(len(fmap_r)): 674 | # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) 675 | y_d_rs.append(y_d_r) 676 | y_d_gs.append(y_d_g) 677 | fmap_rs.append(fmap_r) 678 | fmap_gs.append(fmap_g) 679 | 680 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 681 | 682 | 683 | class MultiPeriodDiscriminatorV2(torch.nn.Module): 684 | def __init__(self, use_spectral_norm=False): 685 | super(MultiPeriodDiscriminatorV2, self).__init__() 686 | # periods = [2, 3, 5, 7, 11, 17] 687 | periods = [2, 3, 5, 7, 11, 17, 23, 37] 688 | 689 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 690 | discs = discs + [ 691 | DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods 692 | ] 693 | self.discriminators = nn.ModuleList(discs) 694 | 695 | def forward(self, y, y_hat): 696 | y_d_rs = [] # 697 | y_d_gs = [] 698 | fmap_rs = [] 699 | fmap_gs = [] 700 | for i, d in enumerate(self.discriminators): 701 | y_d_r, fmap_r = d(y) 702 | y_d_g, fmap_g = d(y_hat) 703 | # for j in range(len(fmap_r)): 704 | # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) 705 | y_d_rs.append(y_d_r) 706 | y_d_gs.append(y_d_g) 707 | fmap_rs.append(fmap_r) 708 | fmap_gs.append(fmap_g) 709 | 710 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 711 | 712 | 713 | class DiscriminatorS(torch.nn.Module): 714 | def __init__(self, use_spectral_norm=False): 715 | super(DiscriminatorS, self).__init__() 716 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 717 | self.convs = nn.ModuleList( 718 | [ 719 | norm_f(Conv1d(1, 16, 15, 1, padding=7)), 720 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), 721 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), 722 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), 723 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), 724 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 725 | ] 726 | ) 727 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 728 | 729 | def forward(self, x): 730 | fmap = [] 731 | 732 | for l in self.convs: 733 | x = l(x) 734 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 735 | fmap.append(x) 736 | x = self.conv_post(x) 737 | fmap.append(x) 738 | x = torch.flatten(x, 1, -1) 739 | 740 | return x, fmap 741 | 742 | 743 | class DiscriminatorP(torch.nn.Module): 744 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 745 | super(DiscriminatorP, self).__init__() 746 | self.period = period 747 | self.use_spectral_norm = use_spectral_norm 748 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 749 | self.convs = nn.ModuleList( 750 | [ 751 | norm_f( 752 | Conv2d( 753 | 1, 754 | 32, 755 | (kernel_size, 1), 756 | (stride, 1), 757 | padding=(get_padding(kernel_size, 1), 0), 758 | ) 759 | ), 760 | norm_f( 761 | Conv2d( 762 | 32, 763 | 128, 764 | (kernel_size, 1), 765 | (stride, 1), 766 | padding=(get_padding(kernel_size, 1), 0), 767 | ) 768 | ), 769 | norm_f( 770 | Conv2d( 771 | 128, 772 | 512, 773 | (kernel_size, 1), 774 | (stride, 1), 775 | padding=(get_padding(kernel_size, 1), 0), 776 | ) 777 | ), 778 | norm_f( 779 | Conv2d( 780 | 512, 781 | 1024, 782 | (kernel_size, 1), 783 | (stride, 1), 784 | padding=(get_padding(kernel_size, 1), 0), 785 | ) 786 | ), 787 | norm_f( 788 | Conv2d( 789 | 1024, 790 | 1024, 791 | (kernel_size, 1), 792 | 1, 793 | padding=(get_padding(kernel_size, 1), 0), 794 | ) 795 | ), 796 | ] 797 | ) 798 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 799 | 800 | def forward(self, x): 801 | fmap = [] 802 | 803 | # 1d to 2d 804 | b, c, t = x.shape 805 | if t % self.period != 0: # pad first 806 | n_pad = self.period - (t % self.period) 807 | x = F.pad(x, (0, n_pad), "reflect") 808 | t = t + n_pad 809 | x = x.view(b, c, t // self.period, self.period) 810 | 811 | for l in self.convs: 812 | x = l(x) 813 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 814 | fmap.append(x) 815 | x = self.conv_post(x) 816 | fmap.append(x) 817 | x = torch.flatten(x, 1, -1) 818 | 819 | return x, fmap 820 | -------------------------------------------------------------------------------- /lib/infer_pack/modules.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | import scipy 5 | import torch 6 | from torch import nn 7 | from torch.nn import functional as F 8 | 9 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 10 | from torch.nn.utils import weight_norm, remove_weight_norm 11 | 12 | from lib.infer_pack import commons 13 | from lib.infer_pack.commons import init_weights, get_padding 14 | from lib.infer_pack.transforms import piecewise_rational_quadratic_transform 15 | 16 | 17 | LRELU_SLOPE = 0.1 18 | 19 | 20 | class LayerNorm(nn.Module): 21 | def __init__(self, channels, eps=1e-5): 22 | super().__init__() 23 | self.channels = channels 24 | self.eps = eps 25 | 26 | self.gamma = nn.Parameter(torch.ones(channels)) 27 | self.beta = nn.Parameter(torch.zeros(channels)) 28 | 29 | def forward(self, x): 30 | x = x.transpose(1, -1) 31 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 32 | return x.transpose(1, -1) 33 | 34 | 35 | class ConvReluNorm(nn.Module): 36 | def __init__( 37 | self, 38 | in_channels, 39 | hidden_channels, 40 | out_channels, 41 | kernel_size, 42 | n_layers, 43 | p_dropout, 44 | ): 45 | super().__init__() 46 | self.in_channels = in_channels 47 | self.hidden_channels = hidden_channels 48 | self.out_channels = out_channels 49 | self.kernel_size = kernel_size 50 | self.n_layers = n_layers 51 | self.p_dropout = p_dropout 52 | assert n_layers > 1, "Number of layers should be larger than 0." 53 | 54 | self.conv_layers = nn.ModuleList() 55 | self.norm_layers = nn.ModuleList() 56 | self.conv_layers.append( 57 | nn.Conv1d( 58 | in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 59 | ) 60 | ) 61 | self.norm_layers.append(LayerNorm(hidden_channels)) 62 | self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) 63 | for _ in range(n_layers - 1): 64 | self.conv_layers.append( 65 | nn.Conv1d( 66 | hidden_channels, 67 | hidden_channels, 68 | kernel_size, 69 | padding=kernel_size // 2, 70 | ) 71 | ) 72 | self.norm_layers.append(LayerNorm(hidden_channels)) 73 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 74 | self.proj.weight.data.zero_() 75 | self.proj.bias.data.zero_() 76 | 77 | def forward(self, x, x_mask): 78 | x_org = x 79 | for i in range(self.n_layers): 80 | x = self.conv_layers[i](x * x_mask) 81 | x = self.norm_layers[i](x) 82 | x = self.relu_drop(x) 83 | x = x_org + self.proj(x) 84 | return x * x_mask 85 | 86 | 87 | class DDSConv(nn.Module): 88 | """ 89 | Dialted and Depth-Separable Convolution 90 | """ 91 | 92 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): 93 | super().__init__() 94 | self.channels = channels 95 | self.kernel_size = kernel_size 96 | self.n_layers = n_layers 97 | self.p_dropout = p_dropout 98 | 99 | self.drop = nn.Dropout(p_dropout) 100 | self.convs_sep = nn.ModuleList() 101 | self.convs_1x1 = nn.ModuleList() 102 | self.norms_1 = nn.ModuleList() 103 | self.norms_2 = nn.ModuleList() 104 | for i in range(n_layers): 105 | dilation = kernel_size**i 106 | padding = (kernel_size * dilation - dilation) // 2 107 | self.convs_sep.append( 108 | nn.Conv1d( 109 | channels, 110 | channels, 111 | kernel_size, 112 | groups=channels, 113 | dilation=dilation, 114 | padding=padding, 115 | ) 116 | ) 117 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) 118 | self.norms_1.append(LayerNorm(channels)) 119 | self.norms_2.append(LayerNorm(channels)) 120 | 121 | def forward(self, x, x_mask, g=None): 122 | if g is not None: 123 | x = x + g 124 | for i in range(self.n_layers): 125 | y = self.convs_sep[i](x * x_mask) 126 | y = self.norms_1[i](y) 127 | y = F.gelu(y) 128 | y = self.convs_1x1[i](y) 129 | y = self.norms_2[i](y) 130 | y = F.gelu(y) 131 | y = self.drop(y) 132 | x = x + y 133 | return x * x_mask 134 | 135 | 136 | class WN(torch.nn.Module): 137 | def __init__( 138 | self, 139 | hidden_channels, 140 | kernel_size, 141 | dilation_rate, 142 | n_layers, 143 | gin_channels=0, 144 | p_dropout=0, 145 | ): 146 | super(WN, self).__init__() 147 | assert kernel_size % 2 == 1 148 | self.hidden_channels = hidden_channels 149 | self.kernel_size = (kernel_size,) 150 | self.dilation_rate = dilation_rate 151 | self.n_layers = n_layers 152 | self.gin_channels = gin_channels 153 | self.p_dropout = p_dropout 154 | 155 | self.in_layers = torch.nn.ModuleList() 156 | self.res_skip_layers = torch.nn.ModuleList() 157 | self.drop = nn.Dropout(p_dropout) 158 | 159 | if gin_channels != 0: 160 | cond_layer = torch.nn.Conv1d( 161 | gin_channels, 2 * hidden_channels * n_layers, 1 162 | ) 163 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") 164 | 165 | for i in range(n_layers): 166 | dilation = dilation_rate**i 167 | padding = int((kernel_size * dilation - dilation) / 2) 168 | in_layer = torch.nn.Conv1d( 169 | hidden_channels, 170 | 2 * hidden_channels, 171 | kernel_size, 172 | dilation=dilation, 173 | padding=padding, 174 | ) 175 | in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") 176 | self.in_layers.append(in_layer) 177 | 178 | # last one is not necessary 179 | if i < n_layers - 1: 180 | res_skip_channels = 2 * hidden_channels 181 | else: 182 | res_skip_channels = hidden_channels 183 | 184 | res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 185 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") 186 | self.res_skip_layers.append(res_skip_layer) 187 | 188 | def forward(self, x, x_mask, g=None, **kwargs): 189 | output = torch.zeros_like(x) 190 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 191 | 192 | if g is not None: 193 | g = self.cond_layer(g) 194 | 195 | for i in range(self.n_layers): 196 | x_in = self.in_layers[i](x) 197 | if g is not None: 198 | cond_offset = i * 2 * self.hidden_channels 199 | g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] 200 | else: 201 | g_l = torch.zeros_like(x_in) 202 | 203 | acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) 204 | acts = self.drop(acts) 205 | 206 | res_skip_acts = self.res_skip_layers[i](acts) 207 | if i < self.n_layers - 1: 208 | res_acts = res_skip_acts[:, : self.hidden_channels, :] 209 | x = (x + res_acts) * x_mask 210 | output = output + res_skip_acts[:, self.hidden_channels :, :] 211 | else: 212 | output = output + res_skip_acts 213 | return output * x_mask 214 | 215 | def remove_weight_norm(self): 216 | if self.gin_channels != 0: 217 | torch.nn.utils.remove_weight_norm(self.cond_layer) 218 | for l in self.in_layers: 219 | torch.nn.utils.remove_weight_norm(l) 220 | for l in self.res_skip_layers: 221 | torch.nn.utils.remove_weight_norm(l) 222 | 223 | 224 | class ResBlock1(torch.nn.Module): 225 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 226 | super(ResBlock1, self).__init__() 227 | self.convs1 = nn.ModuleList( 228 | [ 229 | weight_norm( 230 | Conv1d( 231 | channels, 232 | channels, 233 | kernel_size, 234 | 1, 235 | dilation=dilation[0], 236 | padding=get_padding(kernel_size, dilation[0]), 237 | ) 238 | ), 239 | weight_norm( 240 | Conv1d( 241 | channels, 242 | channels, 243 | kernel_size, 244 | 1, 245 | dilation=dilation[1], 246 | padding=get_padding(kernel_size, dilation[1]), 247 | ) 248 | ), 249 | weight_norm( 250 | Conv1d( 251 | channels, 252 | channels, 253 | kernel_size, 254 | 1, 255 | dilation=dilation[2], 256 | padding=get_padding(kernel_size, dilation[2]), 257 | ) 258 | ), 259 | ] 260 | ) 261 | self.convs1.apply(init_weights) 262 | 263 | self.convs2 = nn.ModuleList( 264 | [ 265 | weight_norm( 266 | Conv1d( 267 | channels, 268 | channels, 269 | kernel_size, 270 | 1, 271 | dilation=1, 272 | padding=get_padding(kernel_size, 1), 273 | ) 274 | ), 275 | weight_norm( 276 | Conv1d( 277 | channels, 278 | channels, 279 | kernel_size, 280 | 1, 281 | dilation=1, 282 | padding=get_padding(kernel_size, 1), 283 | ) 284 | ), 285 | weight_norm( 286 | Conv1d( 287 | channels, 288 | channels, 289 | kernel_size, 290 | 1, 291 | dilation=1, 292 | padding=get_padding(kernel_size, 1), 293 | ) 294 | ), 295 | ] 296 | ) 297 | self.convs2.apply(init_weights) 298 | 299 | def forward(self, x, x_mask=None): 300 | for c1, c2 in zip(self.convs1, self.convs2): 301 | xt = F.leaky_relu(x, LRELU_SLOPE) 302 | if x_mask is not None: 303 | xt = xt * x_mask 304 | xt = c1(xt) 305 | xt = F.leaky_relu(xt, LRELU_SLOPE) 306 | if x_mask is not None: 307 | xt = xt * x_mask 308 | xt = c2(xt) 309 | x = xt + x 310 | if x_mask is not None: 311 | x = x * x_mask 312 | return x 313 | 314 | def remove_weight_norm(self): 315 | for l in self.convs1: 316 | remove_weight_norm(l) 317 | for l in self.convs2: 318 | remove_weight_norm(l) 319 | 320 | 321 | class ResBlock2(torch.nn.Module): 322 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 323 | super(ResBlock2, self).__init__() 324 | self.convs = nn.ModuleList( 325 | [ 326 | weight_norm( 327 | Conv1d( 328 | channels, 329 | channels, 330 | kernel_size, 331 | 1, 332 | dilation=dilation[0], 333 | padding=get_padding(kernel_size, dilation[0]), 334 | ) 335 | ), 336 | weight_norm( 337 | Conv1d( 338 | channels, 339 | channels, 340 | kernel_size, 341 | 1, 342 | dilation=dilation[1], 343 | padding=get_padding(kernel_size, dilation[1]), 344 | ) 345 | ), 346 | ] 347 | ) 348 | self.convs.apply(init_weights) 349 | 350 | def forward(self, x, x_mask=None): 351 | for c in self.convs: 352 | xt = F.leaky_relu(x, LRELU_SLOPE) 353 | if x_mask is not None: 354 | xt = xt * x_mask 355 | xt = c(xt) 356 | x = xt + x 357 | if x_mask is not None: 358 | x = x * x_mask 359 | return x 360 | 361 | def remove_weight_norm(self): 362 | for l in self.convs: 363 | remove_weight_norm(l) 364 | 365 | 366 | class Log(nn.Module): 367 | def forward(self, x, x_mask, reverse=False, **kwargs): 368 | if not reverse: 369 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask 370 | logdet = torch.sum(-y, [1, 2]) 371 | return y, logdet 372 | else: 373 | x = torch.exp(x) * x_mask 374 | return x 375 | 376 | 377 | class Flip(nn.Module): 378 | def forward(self, x, *args, reverse=False, **kwargs): 379 | x = torch.flip(x, [1]) 380 | if not reverse: 381 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 382 | return x, logdet 383 | else: 384 | return x 385 | 386 | 387 | class ElementwiseAffine(nn.Module): 388 | def __init__(self, channels): 389 | super().__init__() 390 | self.channels = channels 391 | self.m = nn.Parameter(torch.zeros(channels, 1)) 392 | self.logs = nn.Parameter(torch.zeros(channels, 1)) 393 | 394 | def forward(self, x, x_mask, reverse=False, **kwargs): 395 | if not reverse: 396 | y = self.m + torch.exp(self.logs) * x 397 | y = y * x_mask 398 | logdet = torch.sum(self.logs * x_mask, [1, 2]) 399 | return y, logdet 400 | else: 401 | x = (x - self.m) * torch.exp(-self.logs) * x_mask 402 | return x 403 | 404 | 405 | class ResidualCouplingLayer(nn.Module): 406 | def __init__( 407 | self, 408 | channels, 409 | hidden_channels, 410 | kernel_size, 411 | dilation_rate, 412 | n_layers, 413 | p_dropout=0, 414 | gin_channels=0, 415 | mean_only=False, 416 | ): 417 | assert channels % 2 == 0, "channels should be divisible by 2" 418 | super().__init__() 419 | self.channels = channels 420 | self.hidden_channels = hidden_channels 421 | self.kernel_size = kernel_size 422 | self.dilation_rate = dilation_rate 423 | self.n_layers = n_layers 424 | self.half_channels = channels // 2 425 | self.mean_only = mean_only 426 | 427 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 428 | self.enc = WN( 429 | hidden_channels, 430 | kernel_size, 431 | dilation_rate, 432 | n_layers, 433 | p_dropout=p_dropout, 434 | gin_channels=gin_channels, 435 | ) 436 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) 437 | self.post.weight.data.zero_() 438 | self.post.bias.data.zero_() 439 | 440 | def forward(self, x, x_mask, g=None, reverse=False): 441 | x0, x1 = torch.split(x, [self.half_channels] * 2, 1) 442 | h = self.pre(x0) * x_mask 443 | h = self.enc(h, x_mask, g=g) 444 | stats = self.post(h) * x_mask 445 | if not self.mean_only: 446 | m, logs = torch.split(stats, [self.half_channels] * 2, 1) 447 | else: 448 | m = stats 449 | logs = torch.zeros_like(m) 450 | 451 | if not reverse: 452 | x1 = m + x1 * torch.exp(logs) * x_mask 453 | x = torch.cat([x0, x1], 1) 454 | logdet = torch.sum(logs, [1, 2]) 455 | return x, logdet 456 | else: 457 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 458 | x = torch.cat([x0, x1], 1) 459 | return x 460 | 461 | def remove_weight_norm(self): 462 | self.enc.remove_weight_norm() 463 | 464 | 465 | class ConvFlow(nn.Module): 466 | def __init__( 467 | self, 468 | in_channels, 469 | filter_channels, 470 | kernel_size, 471 | n_layers, 472 | num_bins=10, 473 | tail_bound=5.0, 474 | ): 475 | super().__init__() 476 | self.in_channels = in_channels 477 | self.filter_channels = filter_channels 478 | self.kernel_size = kernel_size 479 | self.n_layers = n_layers 480 | self.num_bins = num_bins 481 | self.tail_bound = tail_bound 482 | self.half_channels = in_channels // 2 483 | 484 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) 485 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0) 486 | self.proj = nn.Conv1d( 487 | filter_channels, self.half_channels * (num_bins * 3 - 1), 1 488 | ) 489 | self.proj.weight.data.zero_() 490 | self.proj.bias.data.zero_() 491 | 492 | def forward(self, x, x_mask, g=None, reverse=False): 493 | x0, x1 = torch.split(x, [self.half_channels] * 2, 1) 494 | h = self.pre(x0) 495 | h = self.convs(h, x_mask, g=g) 496 | h = self.proj(h) * x_mask 497 | 498 | b, c, t = x0.shape 499 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 500 | 501 | unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels) 502 | unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt( 503 | self.filter_channels 504 | ) 505 | unnormalized_derivatives = h[..., 2 * self.num_bins :] 506 | 507 | x1, logabsdet = piecewise_rational_quadratic_transform( 508 | x1, 509 | unnormalized_widths, 510 | unnormalized_heights, 511 | unnormalized_derivatives, 512 | inverse=reverse, 513 | tails="linear", 514 | tail_bound=self.tail_bound, 515 | ) 516 | 517 | x = torch.cat([x0, x1], 1) * x_mask 518 | logdet = torch.sum(logabsdet * x_mask, [1, 2]) 519 | if not reverse: 520 | return x, logdet 521 | else: 522 | return x 523 | -------------------------------------------------------------------------------- /lib/infer_pack/modules/F0Predictor/DioF0Predictor.py: -------------------------------------------------------------------------------- 1 | from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor 2 | import pyworld 3 | import numpy as np 4 | 5 | 6 | class DioF0Predictor(F0Predictor): 7 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): 8 | self.hop_length = hop_length 9 | self.f0_min = f0_min 10 | self.f0_max = f0_max 11 | self.sampling_rate = sampling_rate 12 | 13 | def interpolate_f0(self, f0): 14 | """ 15 | 对F0进行插值处理 16 | """ 17 | 18 | data = np.reshape(f0, (f0.size, 1)) 19 | 20 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32) 21 | vuv_vector[data > 0.0] = 1.0 22 | vuv_vector[data <= 0.0] = 0.0 23 | 24 | ip_data = data 25 | 26 | frame_number = data.size 27 | last_value = 0.0 28 | for i in range(frame_number): 29 | if data[i] <= 0.0: 30 | j = i + 1 31 | for j in range(i + 1, frame_number): 32 | if data[j] > 0.0: 33 | break 34 | if j < frame_number - 1: 35 | if last_value > 0.0: 36 | step = (data[j] - data[i - 1]) / float(j - i) 37 | for k in range(i, j): 38 | ip_data[k] = data[i - 1] + step * (k - i + 1) 39 | else: 40 | for k in range(i, j): 41 | ip_data[k] = data[j] 42 | else: 43 | for k in range(i, frame_number): 44 | ip_data[k] = last_value 45 | else: 46 | ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝 47 | last_value = data[i] 48 | 49 | return ip_data[:, 0], vuv_vector[:, 0] 50 | 51 | def resize_f0(self, x, target_len): 52 | source = np.array(x) 53 | source[source < 0.001] = np.nan 54 | target = np.interp( 55 | np.arange(0, len(source) * target_len, len(source)) / target_len, 56 | np.arange(0, len(source)), 57 | source, 58 | ) 59 | res = np.nan_to_num(target) 60 | return res 61 | 62 | def compute_f0(self, wav, p_len=None): 63 | if p_len is None: 64 | p_len = wav.shape[0] // self.hop_length 65 | f0, t = pyworld.dio( 66 | wav.astype(np.double), 67 | fs=self.sampling_rate, 68 | f0_floor=self.f0_min, 69 | f0_ceil=self.f0_max, 70 | frame_period=1000 * self.hop_length / self.sampling_rate, 71 | ) 72 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 73 | for index, pitch in enumerate(f0): 74 | f0[index] = round(pitch, 1) 75 | return self.interpolate_f0(self.resize_f0(f0, p_len))[0] 76 | 77 | def compute_f0_uv(self, wav, p_len=None): 78 | if p_len is None: 79 | p_len = wav.shape[0] // self.hop_length 80 | f0, t = pyworld.dio( 81 | wav.astype(np.double), 82 | fs=self.sampling_rate, 83 | f0_floor=self.f0_min, 84 | f0_ceil=self.f0_max, 85 | frame_period=1000 * self.hop_length / self.sampling_rate, 86 | ) 87 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 88 | for index, pitch in enumerate(f0): 89 | f0[index] = round(pitch, 1) 90 | return self.interpolate_f0(self.resize_f0(f0, p_len)) 91 | -------------------------------------------------------------------------------- /lib/infer_pack/modules/F0Predictor/F0Predictor.py: -------------------------------------------------------------------------------- 1 | class F0Predictor(object): 2 | def compute_f0(self, wav, p_len): 3 | """ 4 | input: wav:[signal_length] 5 | p_len:int 6 | output: f0:[signal_length//hop_length] 7 | """ 8 | pass 9 | 10 | def compute_f0_uv(self, wav, p_len): 11 | """ 12 | input: wav:[signal_length] 13 | p_len:int 14 | output: f0:[signal_length//hop_length],uv:[signal_length//hop_length] 15 | """ 16 | pass 17 | -------------------------------------------------------------------------------- /lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py: -------------------------------------------------------------------------------- 1 | from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor 2 | import pyworld 3 | import numpy as np 4 | 5 | 6 | class HarvestF0Predictor(F0Predictor): 7 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): 8 | self.hop_length = hop_length 9 | self.f0_min = f0_min 10 | self.f0_max = f0_max 11 | self.sampling_rate = sampling_rate 12 | 13 | def interpolate_f0(self, f0): 14 | """ 15 | 对F0进行插值处理 16 | """ 17 | 18 | data = np.reshape(f0, (f0.size, 1)) 19 | 20 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32) 21 | vuv_vector[data > 0.0] = 1.0 22 | vuv_vector[data <= 0.0] = 0.0 23 | 24 | ip_data = data 25 | 26 | frame_number = data.size 27 | last_value = 0.0 28 | for i in range(frame_number): 29 | if data[i] <= 0.0: 30 | j = i + 1 31 | for j in range(i + 1, frame_number): 32 | if data[j] > 0.0: 33 | break 34 | if j < frame_number - 1: 35 | if last_value > 0.0: 36 | step = (data[j] - data[i - 1]) / float(j - i) 37 | for k in range(i, j): 38 | ip_data[k] = data[i - 1] + step * (k - i + 1) 39 | else: 40 | for k in range(i, j): 41 | ip_data[k] = data[j] 42 | else: 43 | for k in range(i, frame_number): 44 | ip_data[k] = last_value 45 | else: 46 | ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝 47 | last_value = data[i] 48 | 49 | return ip_data[:, 0], vuv_vector[:, 0] 50 | 51 | def resize_f0(self, x, target_len): 52 | source = np.array(x) 53 | source[source < 0.001] = np.nan 54 | target = np.interp( 55 | np.arange(0, len(source) * target_len, len(source)) / target_len, 56 | np.arange(0, len(source)), 57 | source, 58 | ) 59 | res = np.nan_to_num(target) 60 | return res 61 | 62 | def compute_f0(self, wav, p_len=None): 63 | if p_len is None: 64 | p_len = wav.shape[0] // self.hop_length 65 | f0, t = pyworld.harvest( 66 | wav.astype(np.double), 67 | fs=self.hop_length, 68 | f0_ceil=self.f0_max, 69 | f0_floor=self.f0_min, 70 | frame_period=1000 * self.hop_length / self.sampling_rate, 71 | ) 72 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.fs) 73 | return self.interpolate_f0(self.resize_f0(f0, p_len))[0] 74 | 75 | def compute_f0_uv(self, wav, p_len=None): 76 | if p_len is None: 77 | p_len = wav.shape[0] // self.hop_length 78 | f0, t = pyworld.harvest( 79 | wav.astype(np.double), 80 | fs=self.sampling_rate, 81 | f0_floor=self.f0_min, 82 | f0_ceil=self.f0_max, 83 | frame_period=1000 * self.hop_length / self.sampling_rate, 84 | ) 85 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 86 | return self.interpolate_f0(self.resize_f0(f0, p_len)) 87 | -------------------------------------------------------------------------------- /lib/infer_pack/modules/F0Predictor/PMF0Predictor.py: -------------------------------------------------------------------------------- 1 | from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor 2 | import parselmouth 3 | import numpy as np 4 | 5 | 6 | class PMF0Predictor(F0Predictor): 7 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): 8 | self.hop_length = hop_length 9 | self.f0_min = f0_min 10 | self.f0_max = f0_max 11 | self.sampling_rate = sampling_rate 12 | 13 | def interpolate_f0(self, f0): 14 | """ 15 | 对F0进行插值处理 16 | """ 17 | 18 | data = np.reshape(f0, (f0.size, 1)) 19 | 20 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32) 21 | vuv_vector[data > 0.0] = 1.0 22 | vuv_vector[data <= 0.0] = 0.0 23 | 24 | ip_data = data 25 | 26 | frame_number = data.size 27 | last_value = 0.0 28 | for i in range(frame_number): 29 | if data[i] <= 0.0: 30 | j = i + 1 31 | for j in range(i + 1, frame_number): 32 | if data[j] > 0.0: 33 | break 34 | if j < frame_number - 1: 35 | if last_value > 0.0: 36 | step = (data[j] - data[i - 1]) / float(j - i) 37 | for k in range(i, j): 38 | ip_data[k] = data[i - 1] + step * (k - i + 1) 39 | else: 40 | for k in range(i, j): 41 | ip_data[k] = data[j] 42 | else: 43 | for k in range(i, frame_number): 44 | ip_data[k] = last_value 45 | else: 46 | ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝 47 | last_value = data[i] 48 | 49 | return ip_data[:, 0], vuv_vector[:, 0] 50 | 51 | def compute_f0(self, wav, p_len=None): 52 | x = wav 53 | if p_len is None: 54 | p_len = x.shape[0] // self.hop_length 55 | else: 56 | assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" 57 | time_step = self.hop_length / self.sampling_rate * 1000 58 | f0 = ( 59 | parselmouth.Sound(x, self.sampling_rate) 60 | .to_pitch_ac( 61 | time_step=time_step / 1000, 62 | voicing_threshold=0.6, 63 | pitch_floor=self.f0_min, 64 | pitch_ceiling=self.f0_max, 65 | ) 66 | .selected_array["frequency"] 67 | ) 68 | 69 | pad_size = (p_len - len(f0) + 1) // 2 70 | if pad_size > 0 or p_len - len(f0) - pad_size > 0: 71 | f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") 72 | f0, uv = self.interpolate_f0(f0) 73 | return f0 74 | 75 | def compute_f0_uv(self, wav, p_len=None): 76 | x = wav 77 | if p_len is None: 78 | p_len = x.shape[0] // self.hop_length 79 | else: 80 | assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" 81 | time_step = self.hop_length / self.sampling_rate * 1000 82 | f0 = ( 83 | parselmouth.Sound(x, self.sampling_rate) 84 | .to_pitch_ac( 85 | time_step=time_step / 1000, 86 | voicing_threshold=0.6, 87 | pitch_floor=self.f0_min, 88 | pitch_ceiling=self.f0_max, 89 | ) 90 | .selected_array["frequency"] 91 | ) 92 | 93 | pad_size = (p_len - len(f0) + 1) // 2 94 | if pad_size > 0 or p_len - len(f0) - pad_size > 0: 95 | f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") 96 | f0, uv = self.interpolate_f0(f0) 97 | return f0, uv 98 | -------------------------------------------------------------------------------- /lib/infer_pack/modules/F0Predictor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Multi-Model-RVC-Inference/661936e4dce121c8ad84113f7637308d1642c887/lib/infer_pack/modules/F0Predictor/__init__.py -------------------------------------------------------------------------------- /lib/infer_pack/onnx_inference.py: -------------------------------------------------------------------------------- 1 | import onnxruntime 2 | import librosa 3 | import numpy as np 4 | import soundfile 5 | 6 | 7 | class ContentVec: 8 | def __init__(self, vec_path="pretrained/vec-768-layer-12.onnx", device=None): 9 | print("load model(s) from {}".format(vec_path)) 10 | if device == "cpu" or device is None: 11 | providers = ["CPUExecutionProvider"] 12 | elif device == "cuda": 13 | providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] 14 | elif device == "dml": 15 | providers = ["DmlExecutionProvider"] 16 | else: 17 | raise RuntimeError("Unsportted Device") 18 | self.model = onnxruntime.InferenceSession(vec_path, providers=providers) 19 | 20 | def __call__(self, wav): 21 | return self.forward(wav) 22 | 23 | def forward(self, wav): 24 | feats = wav 25 | if feats.ndim == 2: # double channels 26 | feats = feats.mean(-1) 27 | assert feats.ndim == 1, feats.ndim 28 | feats = np.expand_dims(np.expand_dims(feats, 0), 0) 29 | onnx_input = {self.model.get_inputs()[0].name: feats} 30 | logits = self.model.run(None, onnx_input)[0] 31 | return logits.transpose(0, 2, 1) 32 | 33 | 34 | def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs): 35 | if f0_predictor == "pm": 36 | from lib.infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor 37 | 38 | f0_predictor_object = PMF0Predictor( 39 | hop_length=hop_length, sampling_rate=sampling_rate 40 | ) 41 | elif f0_predictor == "harvest": 42 | from lib.infer_pack.modules.F0Predictor.HarvestF0Predictor import ( 43 | HarvestF0Predictor, 44 | ) 45 | 46 | f0_predictor_object = HarvestF0Predictor( 47 | hop_length=hop_length, sampling_rate=sampling_rate 48 | ) 49 | elif f0_predictor == "dio": 50 | from lib.infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor 51 | 52 | f0_predictor_object = DioF0Predictor( 53 | hop_length=hop_length, sampling_rate=sampling_rate 54 | ) 55 | else: 56 | raise Exception("Unknown f0 predictor") 57 | return f0_predictor_object 58 | 59 | 60 | class OnnxRVC: 61 | def __init__( 62 | self, 63 | model_path, 64 | sr=40000, 65 | hop_size=512, 66 | vec_path="vec-768-layer-12", 67 | device="cpu", 68 | ): 69 | vec_path = f"pretrained/{vec_path}.onnx" 70 | self.vec_model = ContentVec(vec_path, device) 71 | if device == "cpu" or device is None: 72 | providers = ["CPUExecutionProvider"] 73 | elif device == "cuda": 74 | providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] 75 | elif device == "dml": 76 | providers = ["DmlExecutionProvider"] 77 | else: 78 | raise RuntimeError("Unsportted Device") 79 | self.model = onnxruntime.InferenceSession(model_path, providers=providers) 80 | self.sampling_rate = sr 81 | self.hop_size = hop_size 82 | 83 | def forward(self, hubert, hubert_length, pitch, pitchf, ds, rnd): 84 | onnx_input = { 85 | self.model.get_inputs()[0].name: hubert, 86 | self.model.get_inputs()[1].name: hubert_length, 87 | self.model.get_inputs()[2].name: pitch, 88 | self.model.get_inputs()[3].name: pitchf, 89 | self.model.get_inputs()[4].name: ds, 90 | self.model.get_inputs()[5].name: rnd, 91 | } 92 | return (self.model.run(None, onnx_input)[0] * 32767).astype(np.int16) 93 | 94 | def inference( 95 | self, 96 | raw_path, 97 | sid, 98 | f0_method="dio", 99 | f0_up_key=0, 100 | pad_time=0.5, 101 | cr_threshold=0.02, 102 | ): 103 | f0_min = 50 104 | f0_max = 1100 105 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 106 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 107 | f0_predictor = get_f0_predictor( 108 | f0_method, 109 | hop_length=self.hop_size, 110 | sampling_rate=self.sampling_rate, 111 | threshold=cr_threshold, 112 | ) 113 | wav, sr = librosa.load(raw_path, sr=self.sampling_rate) 114 | org_length = len(wav) 115 | if org_length / sr > 50.0: 116 | raise RuntimeError("Reached Max Length") 117 | 118 | wav16k = librosa.resample(wav, orig_sr=self.sampling_rate, target_sr=16000) 119 | wav16k = wav16k 120 | 121 | hubert = self.vec_model(wav16k) 122 | hubert = np.repeat(hubert, 2, axis=2).transpose(0, 2, 1).astype(np.float32) 123 | hubert_length = hubert.shape[1] 124 | 125 | pitchf = f0_predictor.compute_f0(wav, hubert_length) 126 | pitchf = pitchf * 2 ** (f0_up_key / 12) 127 | pitch = pitchf.copy() 128 | f0_mel = 1127 * np.log(1 + pitch / 700) 129 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( 130 | f0_mel_max - f0_mel_min 131 | ) + 1 132 | f0_mel[f0_mel <= 1] = 1 133 | f0_mel[f0_mel > 255] = 255 134 | pitch = np.rint(f0_mel).astype(np.int64) 135 | 136 | pitchf = pitchf.reshape(1, len(pitchf)).astype(np.float32) 137 | pitch = pitch.reshape(1, len(pitch)) 138 | ds = np.array([sid]).astype(np.int64) 139 | 140 | rnd = np.random.randn(1, 192, hubert_length).astype(np.float32) 141 | hubert_length = np.array([hubert_length]).astype(np.int64) 142 | 143 | out_wav = self.forward(hubert, hubert_length, pitch, pitchf, ds, rnd).squeeze() 144 | out_wav = np.pad(out_wav, (0, 2 * self.hop_size), "constant") 145 | return out_wav[0:org_length] 146 | -------------------------------------------------------------------------------- /lib/infer_pack/transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | import numpy as np 5 | 6 | 7 | DEFAULT_MIN_BIN_WIDTH = 1e-3 8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3 9 | DEFAULT_MIN_DERIVATIVE = 1e-3 10 | 11 | 12 | def piecewise_rational_quadratic_transform( 13 | inputs, 14 | unnormalized_widths, 15 | unnormalized_heights, 16 | unnormalized_derivatives, 17 | inverse=False, 18 | tails=None, 19 | tail_bound=1.0, 20 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 21 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 22 | min_derivative=DEFAULT_MIN_DERIVATIVE, 23 | ): 24 | if tails is None: 25 | spline_fn = rational_quadratic_spline 26 | spline_kwargs = {} 27 | else: 28 | spline_fn = unconstrained_rational_quadratic_spline 29 | spline_kwargs = {"tails": tails, "tail_bound": tail_bound} 30 | 31 | outputs, logabsdet = spline_fn( 32 | inputs=inputs, 33 | unnormalized_widths=unnormalized_widths, 34 | unnormalized_heights=unnormalized_heights, 35 | unnormalized_derivatives=unnormalized_derivatives, 36 | inverse=inverse, 37 | min_bin_width=min_bin_width, 38 | min_bin_height=min_bin_height, 39 | min_derivative=min_derivative, 40 | **spline_kwargs 41 | ) 42 | return outputs, logabsdet 43 | 44 | 45 | def searchsorted(bin_locations, inputs, eps=1e-6): 46 | bin_locations[..., -1] += eps 47 | return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1 48 | 49 | 50 | def unconstrained_rational_quadratic_spline( 51 | inputs, 52 | unnormalized_widths, 53 | unnormalized_heights, 54 | unnormalized_derivatives, 55 | inverse=False, 56 | tails="linear", 57 | tail_bound=1.0, 58 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 59 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 60 | min_derivative=DEFAULT_MIN_DERIVATIVE, 61 | ): 62 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) 63 | outside_interval_mask = ~inside_interval_mask 64 | 65 | outputs = torch.zeros_like(inputs) 66 | logabsdet = torch.zeros_like(inputs) 67 | 68 | if tails == "linear": 69 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) 70 | constant = np.log(np.exp(1 - min_derivative) - 1) 71 | unnormalized_derivatives[..., 0] = constant 72 | unnormalized_derivatives[..., -1] = constant 73 | 74 | outputs[outside_interval_mask] = inputs[outside_interval_mask] 75 | logabsdet[outside_interval_mask] = 0 76 | else: 77 | raise RuntimeError("{} tails are not implemented.".format(tails)) 78 | 79 | ( 80 | outputs[inside_interval_mask], 81 | logabsdet[inside_interval_mask], 82 | ) = rational_quadratic_spline( 83 | inputs=inputs[inside_interval_mask], 84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :], 85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :], 86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], 87 | inverse=inverse, 88 | left=-tail_bound, 89 | right=tail_bound, 90 | bottom=-tail_bound, 91 | top=tail_bound, 92 | min_bin_width=min_bin_width, 93 | min_bin_height=min_bin_height, 94 | min_derivative=min_derivative, 95 | ) 96 | 97 | return outputs, logabsdet 98 | 99 | 100 | def rational_quadratic_spline( 101 | inputs, 102 | unnormalized_widths, 103 | unnormalized_heights, 104 | unnormalized_derivatives, 105 | inverse=False, 106 | left=0.0, 107 | right=1.0, 108 | bottom=0.0, 109 | top=1.0, 110 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 111 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 112 | min_derivative=DEFAULT_MIN_DERIVATIVE, 113 | ): 114 | if torch.min(inputs) < left or torch.max(inputs) > right: 115 | raise ValueError("Input to a transform is not within its domain") 116 | 117 | num_bins = unnormalized_widths.shape[-1] 118 | 119 | if min_bin_width * num_bins > 1.0: 120 | raise ValueError("Minimal bin width too large for the number of bins") 121 | if min_bin_height * num_bins > 1.0: 122 | raise ValueError("Minimal bin height too large for the number of bins") 123 | 124 | widths = F.softmax(unnormalized_widths, dim=-1) 125 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths 126 | cumwidths = torch.cumsum(widths, dim=-1) 127 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) 128 | cumwidths = (right - left) * cumwidths + left 129 | cumwidths[..., 0] = left 130 | cumwidths[..., -1] = right 131 | widths = cumwidths[..., 1:] - cumwidths[..., :-1] 132 | 133 | derivatives = min_derivative + F.softplus(unnormalized_derivatives) 134 | 135 | heights = F.softmax(unnormalized_heights, dim=-1) 136 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights 137 | cumheights = torch.cumsum(heights, dim=-1) 138 | cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0) 139 | cumheights = (top - bottom) * cumheights + bottom 140 | cumheights[..., 0] = bottom 141 | cumheights[..., -1] = top 142 | heights = cumheights[..., 1:] - cumheights[..., :-1] 143 | 144 | if inverse: 145 | bin_idx = searchsorted(cumheights, inputs)[..., None] 146 | else: 147 | bin_idx = searchsorted(cumwidths, inputs)[..., None] 148 | 149 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] 150 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0] 151 | 152 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] 153 | delta = heights / widths 154 | input_delta = delta.gather(-1, bin_idx)[..., 0] 155 | 156 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] 157 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] 158 | 159 | input_heights = heights.gather(-1, bin_idx)[..., 0] 160 | 161 | if inverse: 162 | a = (inputs - input_cumheights) * ( 163 | input_derivatives + input_derivatives_plus_one - 2 * input_delta 164 | ) + input_heights * (input_delta - input_derivatives) 165 | b = input_heights * input_derivatives - (inputs - input_cumheights) * ( 166 | input_derivatives + input_derivatives_plus_one - 2 * input_delta 167 | ) 168 | c = -input_delta * (inputs - input_cumheights) 169 | 170 | discriminant = b.pow(2) - 4 * a * c 171 | assert (discriminant >= 0).all() 172 | 173 | root = (2 * c) / (-b - torch.sqrt(discriminant)) 174 | outputs = root * input_bin_widths + input_cumwidths 175 | 176 | theta_one_minus_theta = root * (1 - root) 177 | denominator = input_delta + ( 178 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta) 179 | * theta_one_minus_theta 180 | ) 181 | derivative_numerator = input_delta.pow(2) * ( 182 | input_derivatives_plus_one * root.pow(2) 183 | + 2 * input_delta * theta_one_minus_theta 184 | + input_derivatives * (1 - root).pow(2) 185 | ) 186 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 187 | 188 | return outputs, -logabsdet 189 | else: 190 | theta = (inputs - input_cumwidths) / input_bin_widths 191 | theta_one_minus_theta = theta * (1 - theta) 192 | 193 | numerator = input_heights * ( 194 | input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta 195 | ) 196 | denominator = input_delta + ( 197 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta) 198 | * theta_one_minus_theta 199 | ) 200 | outputs = input_cumheights + numerator / denominator 201 | 202 | derivative_numerator = input_delta.pow(2) * ( 203 | input_derivatives_plus_one * theta.pow(2) 204 | + 2 * input_delta * theta_one_minus_theta 205 | + input_derivatives * (1 - theta).pow(2) 206 | ) 207 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 208 | 209 | return outputs, logabsdet 210 | -------------------------------------------------------------------------------- /lib/vc/audio.py: -------------------------------------------------------------------------------- 1 | import os 2 | import traceback 3 | 4 | import librosa 5 | import numpy as np 6 | import av 7 | from io import BytesIO 8 | 9 | 10 | def wav2(i, o, format): 11 | inp = av.open(i, "rb") 12 | if format == "m4a": 13 | format = "mp4" 14 | out = av.open(o, "wb", format=format) 15 | if format == "ogg": 16 | format = "libvorbis" 17 | if format == "mp4": 18 | format = "aac" 19 | 20 | ostream = out.add_stream(format) 21 | 22 | for frame in inp.decode(audio=0): 23 | for p in ostream.encode(frame): 24 | out.mux(p) 25 | 26 | for p in ostream.encode(None): 27 | out.mux(p) 28 | 29 | out.close() 30 | inp.close() 31 | 32 | 33 | def audio2(i, o, format, sr): 34 | inp = av.open(i, "rb") 35 | out = av.open(o, "wb", format=format) 36 | if format == "ogg": 37 | format = "libvorbis" 38 | if format == "f32le": 39 | format = "pcm_f32le" 40 | 41 | ostream = out.add_stream(format, channels=1) 42 | ostream.sample_rate = sr 43 | 44 | for frame in inp.decode(audio=0): 45 | for p in ostream.encode(frame): 46 | out.mux(p) 47 | 48 | out.close() 49 | inp.close() 50 | 51 | 52 | def load_audio(file, sr): 53 | file = ( 54 | file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 55 | ) # 防止小白拷路径头尾带了空格和"和回车 56 | if os.path.exists(file) == False: 57 | raise RuntimeError( 58 | "You input a wrong audio path that does not exists, please fix it!" 59 | ) 60 | try: 61 | with open(file, "rb") as f: 62 | with BytesIO() as out: 63 | audio2(f, out, "f32le", sr) 64 | return np.frombuffer(out.getvalue(), np.float32).flatten() 65 | 66 | except AttributeError: 67 | audio = file[1] / 32768.0 68 | if len(audio.shape) == 2: 69 | audio = np.mean(audio, -1) 70 | return librosa.resample(audio, orig_sr=file[0], target_sr=16000) 71 | 72 | except: 73 | raise RuntimeError(traceback.format_exc()) 74 | -------------------------------------------------------------------------------- /lib/vc/rmvpe.py: -------------------------------------------------------------------------------- 1 | import sys, torch, numpy as np, traceback, pdb 2 | import torch.nn as nn 3 | from time import time as ttime 4 | import torch.nn.functional as F 5 | 6 | 7 | class BiGRU(nn.Module): 8 | def __init__(self, input_features, hidden_features, num_layers): 9 | super(BiGRU, self).__init__() 10 | self.gru = nn.GRU( 11 | input_features, 12 | hidden_features, 13 | num_layers=num_layers, 14 | batch_first=True, 15 | bidirectional=True, 16 | ) 17 | 18 | def forward(self, x): 19 | return self.gru(x)[0] 20 | 21 | 22 | class ConvBlockRes(nn.Module): 23 | def __init__(self, in_channels, out_channels, momentum=0.01): 24 | super(ConvBlockRes, self).__init__() 25 | self.conv = nn.Sequential( 26 | nn.Conv2d( 27 | in_channels=in_channels, 28 | out_channels=out_channels, 29 | kernel_size=(3, 3), 30 | stride=(1, 1), 31 | padding=(1, 1), 32 | bias=False, 33 | ), 34 | nn.BatchNorm2d(out_channels, momentum=momentum), 35 | nn.ReLU(), 36 | nn.Conv2d( 37 | in_channels=out_channels, 38 | out_channels=out_channels, 39 | kernel_size=(3, 3), 40 | stride=(1, 1), 41 | padding=(1, 1), 42 | bias=False, 43 | ), 44 | nn.BatchNorm2d(out_channels, momentum=momentum), 45 | nn.ReLU(), 46 | ) 47 | if in_channels != out_channels: 48 | self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1)) 49 | self.is_shortcut = True 50 | else: 51 | self.is_shortcut = False 52 | 53 | def forward(self, x): 54 | if self.is_shortcut: 55 | return self.conv(x) + self.shortcut(x) 56 | else: 57 | return self.conv(x) + x 58 | 59 | 60 | class Encoder(nn.Module): 61 | def __init__( 62 | self, 63 | in_channels, 64 | in_size, 65 | n_encoders, 66 | kernel_size, 67 | n_blocks, 68 | out_channels=16, 69 | momentum=0.01, 70 | ): 71 | super(Encoder, self).__init__() 72 | self.n_encoders = n_encoders 73 | self.bn = nn.BatchNorm2d(in_channels, momentum=momentum) 74 | self.layers = nn.ModuleList() 75 | self.latent_channels = [] 76 | for i in range(self.n_encoders): 77 | self.layers.append( 78 | ResEncoderBlock( 79 | in_channels, out_channels, kernel_size, n_blocks, momentum=momentum 80 | ) 81 | ) 82 | self.latent_channels.append([out_channels, in_size]) 83 | in_channels = out_channels 84 | out_channels *= 2 85 | in_size //= 2 86 | self.out_size = in_size 87 | self.out_channel = out_channels 88 | 89 | def forward(self, x): 90 | concat_tensors = [] 91 | x = self.bn(x) 92 | for i in range(self.n_encoders): 93 | _, x = self.layers[i](x) 94 | concat_tensors.append(_) 95 | return x, concat_tensors 96 | 97 | 98 | class ResEncoderBlock(nn.Module): 99 | def __init__( 100 | self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01 101 | ): 102 | super(ResEncoderBlock, self).__init__() 103 | self.n_blocks = n_blocks 104 | self.conv = nn.ModuleList() 105 | self.conv.append(ConvBlockRes(in_channels, out_channels, momentum)) 106 | for i in range(n_blocks - 1): 107 | self.conv.append(ConvBlockRes(out_channels, out_channels, momentum)) 108 | self.kernel_size = kernel_size 109 | if self.kernel_size is not None: 110 | self.pool = nn.AvgPool2d(kernel_size=kernel_size) 111 | 112 | def forward(self, x): 113 | for i in range(self.n_blocks): 114 | x = self.conv[i](x) 115 | if self.kernel_size is not None: 116 | return x, self.pool(x) 117 | else: 118 | return x 119 | 120 | 121 | class Intermediate(nn.Module): # 122 | def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01): 123 | super(Intermediate, self).__init__() 124 | self.n_inters = n_inters 125 | self.layers = nn.ModuleList() 126 | self.layers.append( 127 | ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum) 128 | ) 129 | for i in range(self.n_inters - 1): 130 | self.layers.append( 131 | ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum) 132 | ) 133 | 134 | def forward(self, x): 135 | for i in range(self.n_inters): 136 | x = self.layers[i](x) 137 | return x 138 | 139 | 140 | class ResDecoderBlock(nn.Module): 141 | def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01): 142 | super(ResDecoderBlock, self).__init__() 143 | out_padding = (0, 1) if stride == (1, 2) else (1, 1) 144 | self.n_blocks = n_blocks 145 | self.conv1 = nn.Sequential( 146 | nn.ConvTranspose2d( 147 | in_channels=in_channels, 148 | out_channels=out_channels, 149 | kernel_size=(3, 3), 150 | stride=stride, 151 | padding=(1, 1), 152 | output_padding=out_padding, 153 | bias=False, 154 | ), 155 | nn.BatchNorm2d(out_channels, momentum=momentum), 156 | nn.ReLU(), 157 | ) 158 | self.conv2 = nn.ModuleList() 159 | self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum)) 160 | for i in range(n_blocks - 1): 161 | self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum)) 162 | 163 | def forward(self, x, concat_tensor): 164 | x = self.conv1(x) 165 | x = torch.cat((x, concat_tensor), dim=1) 166 | for i in range(self.n_blocks): 167 | x = self.conv2[i](x) 168 | return x 169 | 170 | 171 | class Decoder(nn.Module): 172 | def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01): 173 | super(Decoder, self).__init__() 174 | self.layers = nn.ModuleList() 175 | self.n_decoders = n_decoders 176 | for i in range(self.n_decoders): 177 | out_channels = in_channels // 2 178 | self.layers.append( 179 | ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum) 180 | ) 181 | in_channels = out_channels 182 | 183 | def forward(self, x, concat_tensors): 184 | for i in range(self.n_decoders): 185 | x = self.layers[i](x, concat_tensors[-1 - i]) 186 | return x 187 | 188 | 189 | class DeepUnet(nn.Module): 190 | def __init__( 191 | self, 192 | kernel_size, 193 | n_blocks, 194 | en_de_layers=5, 195 | inter_layers=4, 196 | in_channels=1, 197 | en_out_channels=16, 198 | ): 199 | super(DeepUnet, self).__init__() 200 | self.encoder = Encoder( 201 | in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels 202 | ) 203 | self.intermediate = Intermediate( 204 | self.encoder.out_channel // 2, 205 | self.encoder.out_channel, 206 | inter_layers, 207 | n_blocks, 208 | ) 209 | self.decoder = Decoder( 210 | self.encoder.out_channel, en_de_layers, kernel_size, n_blocks 211 | ) 212 | 213 | def forward(self, x): 214 | x, concat_tensors = self.encoder(x) 215 | x = self.intermediate(x) 216 | x = self.decoder(x, concat_tensors) 217 | return x 218 | 219 | 220 | class E2E(nn.Module): 221 | def __init__( 222 | self, 223 | n_blocks, 224 | n_gru, 225 | kernel_size, 226 | en_de_layers=5, 227 | inter_layers=4, 228 | in_channels=1, 229 | en_out_channels=16, 230 | ): 231 | super(E2E, self).__init__() 232 | self.unet = DeepUnet( 233 | kernel_size, 234 | n_blocks, 235 | en_de_layers, 236 | inter_layers, 237 | in_channels, 238 | en_out_channels, 239 | ) 240 | self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) 241 | if n_gru: 242 | self.fc = nn.Sequential( 243 | BiGRU(3 * 128, 256, n_gru), 244 | nn.Linear(512, 360), 245 | nn.Dropout(0.25), 246 | nn.Sigmoid(), 247 | ) 248 | else: 249 | self.fc = nn.Sequential( 250 | nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid() 251 | ) 252 | 253 | def forward(self, mel): 254 | mel = mel.transpose(-1, -2).unsqueeze(1) 255 | x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2) 256 | x = self.fc(x) 257 | return x 258 | 259 | 260 | from librosa.filters import mel 261 | 262 | 263 | class MelSpectrogram(torch.nn.Module): 264 | def __init__( 265 | self, 266 | is_half, 267 | n_mel_channels, 268 | sampling_rate, 269 | win_length, 270 | hop_length, 271 | n_fft=None, 272 | mel_fmin=0, 273 | mel_fmax=None, 274 | clamp=1e-5, 275 | ): 276 | super().__init__() 277 | n_fft = win_length if n_fft is None else n_fft 278 | self.hann_window = {} 279 | mel_basis = mel( 280 | sr=sampling_rate, 281 | n_fft=n_fft, 282 | n_mels=n_mel_channels, 283 | fmin=mel_fmin, 284 | fmax=mel_fmax, 285 | htk=True, 286 | ) 287 | mel_basis = torch.from_numpy(mel_basis).float() 288 | self.register_buffer("mel_basis", mel_basis) 289 | self.n_fft = win_length if n_fft is None else n_fft 290 | self.hop_length = hop_length 291 | self.win_length = win_length 292 | self.sampling_rate = sampling_rate 293 | self.n_mel_channels = n_mel_channels 294 | self.clamp = clamp 295 | self.is_half = is_half 296 | 297 | def forward(self, audio, keyshift=0, speed=1, center=True): 298 | factor = 2 ** (keyshift / 12) 299 | n_fft_new = int(np.round(self.n_fft * factor)) 300 | win_length_new = int(np.round(self.win_length * factor)) 301 | hop_length_new = int(np.round(self.hop_length * speed)) 302 | keyshift_key = str(keyshift) + "_" + str(audio.device) 303 | if keyshift_key not in self.hann_window: 304 | self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to( 305 | audio.device 306 | ) 307 | fft = torch.stft( 308 | audio, 309 | n_fft=n_fft_new, 310 | hop_length=hop_length_new, 311 | win_length=win_length_new, 312 | window=self.hann_window[keyshift_key], 313 | center=center, 314 | return_complex=True, 315 | ) 316 | magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) 317 | if keyshift != 0: 318 | size = self.n_fft // 2 + 1 319 | resize = magnitude.size(1) 320 | if resize < size: 321 | magnitude = F.pad(magnitude, (0, 0, 0, size - resize)) 322 | magnitude = magnitude[:, :size, :] * self.win_length / win_length_new 323 | mel_output = torch.matmul(self.mel_basis, magnitude) 324 | if self.is_half == True: 325 | mel_output = mel_output.half() 326 | log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp)) 327 | return log_mel_spec 328 | 329 | 330 | class RMVPE: 331 | def __init__(self, model_path, is_half, device=None): 332 | self.resample_kernel = {} 333 | model = E2E(4, 1, (2, 2)) 334 | ckpt = torch.load(model_path, map_location="cpu") 335 | model.load_state_dict(ckpt) 336 | model.eval() 337 | if is_half == True: 338 | model = model.half() 339 | self.model = model 340 | self.resample_kernel = {} 341 | self.is_half = is_half 342 | if device is None: 343 | device = "cuda" if torch.cuda.is_available() else "cpu" 344 | self.device = device 345 | self.mel_extractor = MelSpectrogram( 346 | is_half, 128, 16000, 1024, 160, None, 30, 8000 347 | ).to(device) 348 | self.model = self.model.to(device) 349 | cents_mapping = 20 * np.arange(360) + 1997.3794084376191 350 | self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368 351 | 352 | def mel2hidden(self, mel): 353 | with torch.no_grad(): 354 | n_frames = mel.shape[-1] 355 | mel = F.pad( 356 | mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect" 357 | ) 358 | hidden = self.model(mel) 359 | return hidden[:, :n_frames] 360 | 361 | def decode(self, hidden, thred=0.03): 362 | cents_pred = self.to_local_average_cents(hidden, thred=thred) 363 | f0 = 10 * (2 ** (cents_pred / 1200)) 364 | f0[f0 == 10] = 0 365 | # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred]) 366 | return f0 367 | 368 | def infer_from_audio(self, audio, thred=0.03): 369 | audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0) 370 | # torch.cuda.synchronize() 371 | # t0=ttime() 372 | mel = self.mel_extractor(audio, center=True) 373 | # torch.cuda.synchronize() 374 | # t1=ttime() 375 | hidden = self.mel2hidden(mel) 376 | # torch.cuda.synchronize() 377 | # t2=ttime() 378 | hidden = hidden.squeeze(0).cpu().numpy() 379 | if self.is_half == True: 380 | hidden = hidden.astype("float32") 381 | f0 = self.decode(hidden, thred=thred) 382 | # torch.cuda.synchronize() 383 | # t3=ttime() 384 | # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0)) 385 | return f0 386 | 387 | def to_local_average_cents(self, salience, thred=0.05): 388 | # t0 = ttime() 389 | center = np.argmax(salience, axis=1) # 帧长#index 390 | salience = np.pad(salience, ((0, 0), (4, 4))) # 帧长,368 391 | # t1 = ttime() 392 | center += 4 393 | todo_salience = [] 394 | todo_cents_mapping = [] 395 | starts = center - 4 396 | ends = center + 5 397 | for idx in range(salience.shape[0]): 398 | todo_salience.append(salience[:, starts[idx] : ends[idx]][idx]) 399 | todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]]) 400 | # t2 = ttime() 401 | todo_salience = np.array(todo_salience) # 帧长,9 402 | todo_cents_mapping = np.array(todo_cents_mapping) # 帧长,9 403 | product_sum = np.sum(todo_salience * todo_cents_mapping, 1) 404 | weight_sum = np.sum(todo_salience, 1) # 帧长 405 | devided = product_sum / weight_sum # 帧长 406 | # t3 = ttime() 407 | maxx = np.max(salience, axis=1) # 帧长 408 | devided[maxx <= thred] = 0 409 | # t4 = ttime() 410 | # print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) 411 | return devided 412 | 413 | 414 | # if __name__ == '__main__': 415 | # audio, sampling_rate = sf.read("卢本伟语录~1.wav") 416 | # if len(audio.shape) > 1: 417 | # audio = librosa.to_mono(audio.transpose(1, 0)) 418 | # audio_bak = audio.copy() 419 | # if sampling_rate != 16000: 420 | # audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000) 421 | # model_path = "/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/test-RMVPE/weights/rmvpe_llc_half.pt" 422 | # thred = 0.03 # 0.01 423 | # device = 'cuda' if torch.cuda.is_available() else 'cpu' 424 | # rmvpe = RMVPE(model_path,is_half=False, device=device) 425 | # t0=ttime() 426 | # f0 = rmvpe.infer_from_audio(audio, thred=thred) 427 | # f0 = rmvpe.infer_from_audio(audio, thred=thred) 428 | # f0 = rmvpe.infer_from_audio(audio, thred=thred) 429 | # f0 = rmvpe.infer_from_audio(audio, thred=thred) 430 | # f0 = rmvpe.infer_from_audio(audio, thred=thred) 431 | # t1=ttime() 432 | # print(f0.shape,t1-t0) 433 | -------------------------------------------------------------------------------- /lib/vc/settings.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | 3 | def change_audio_mode(vc_audio_mode): 4 | if vc_audio_mode == "Input path": 5 | return ( 6 | # Input & Upload 7 | gr.Textbox(visible=True), 8 | gr.Audio(visible=False), 9 | # Youtube 10 | gr.Dropdown(visible=False), 11 | gr.Textbox(visible=False), 12 | gr.Textbox(visible=False), 13 | gr.Button(visible=False), 14 | # Splitter 15 | gr.Dropdown(visible=False), 16 | gr.Textbox(visible=False), 17 | gr.Button(visible=False), 18 | gr.Audio(visible=False), 19 | gr.Audio(visible=False), 20 | gr.Audio(visible=False), 21 | gr.Slider(visible=False), 22 | gr.Slider(visible=False), 23 | gr.Audio(visible=False), 24 | gr.Button(visible=False), 25 | # TTS 26 | gr.Textbox(visible=False), 27 | gr.Dropdown(visible=False) 28 | ) 29 | elif vc_audio_mode == "Upload audio": 30 | return ( 31 | # Input & Upload 32 | gr.Textbox(visible=False), 33 | gr.Audio(visible=True), 34 | # Youtube 35 | gr.Dropdown(visible=False), 36 | gr.Textbox(visible=False), 37 | gr.Textbox(visible=False), 38 | gr.Button(visible=False), 39 | # Splitter 40 | gr.Dropdown(visible=False), 41 | gr.Textbox(visible=False), 42 | gr.Button(visible=False), 43 | gr.Audio(visible=False), 44 | gr.Audio(visible=False), 45 | gr.Audio(visible=False), 46 | gr.Slider(visible=False), 47 | gr.Slider(visible=False), 48 | gr.Audio(visible=False), 49 | gr.Button(visible=False), 50 | # TTS 51 | gr.Textbox(visible=False), 52 | gr.Dropdown(visible=False) 53 | ) 54 | elif vc_audio_mode == "Youtube": 55 | return ( 56 | # Input & Upload 57 | gr.Textbox(visible=False), 58 | gr.Audio(visible=False), 59 | # Youtube 60 | gr.Dropdown(visible=True), 61 | gr.Textbox(visible=True), 62 | gr.Textbox(visible=True), 63 | gr.Button(visible=True), 64 | # Splitter 65 | gr.Dropdown(visible=True), 66 | gr.Textbox(visible=True), 67 | gr.Button(visible=True), 68 | gr.Audio(visible=True), 69 | gr.Audio(visible=True), 70 | gr.Audio(visible=True), 71 | gr.Slider(visible=True), 72 | gr.Slider(visible=True), 73 | gr.Audio(visible=True), 74 | gr.Button(visible=True), 75 | # TTS 76 | gr.Textbox(visible=False), 77 | gr.Dropdown(visible=False) 78 | ) 79 | elif vc_audio_mode == "TTS Audio": 80 | return ( 81 | # Input & Upload 82 | gr.Textbox(visible=False), 83 | gr.Audio(visible=False), 84 | # Youtube 85 | gr.Dropdown(visible=False), 86 | gr.Textbox(visible=False), 87 | gr.Textbox(visible=False), 88 | gr.Button(visible=False), 89 | # Splitter 90 | gr.Dropdown(visible=False), 91 | gr.Textbox(visible=False), 92 | gr.Button(visible=False), 93 | gr.Audio(visible=False), 94 | gr.Audio(visible=False), 95 | gr.Audio(visible=False), 96 | gr.Slider(visible=False), 97 | gr.Slider(visible=False), 98 | gr.Audio(visible=False), 99 | gr.Button(visible=False), 100 | # TTS 101 | gr.Textbox(visible=True), 102 | gr.Dropdown(visible=True) 103 | ) -------------------------------------------------------------------------------- /lib/vc/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import wave 3 | import subprocess 4 | import yt_dlp 5 | import ffmpeg 6 | import logging 7 | from fairseq import checkpoint_utils 8 | logger = logging.getLogger(__name__) 9 | 10 | def load_hubert(config): 11 | path_check = os.path.exists("assets/hubert/hubert_base.pt") 12 | if path_check is False: 13 | logger.warn("hubert_base.pt is missing. Please check the documentation for to get it.") 14 | else: 15 | logger.info("hubert_base.pt found.") 16 | models, _, _ = checkpoint_utils.load_model_ensemble_and_task( 17 | [os.path.join("assets", "hubert", "hubert_base.pt")], 18 | suffix="", 19 | ) 20 | hubert_model = models[0] 21 | hubert_model = hubert_model.to(config.device) 22 | if config.is_half: 23 | hubert_model = hubert_model.half() 24 | else: 25 | hubert_model = hubert_model.float() 26 | hubert_model.eval() 27 | return hubert_model 28 | 29 | def download_audio(url, audio_provider): 30 | logs = [] 31 | if url == "": 32 | logs.append("URL required!") 33 | yield None, "\n".join(logs) 34 | return None, "\n".join(logs) 35 | if not os.path.exists("yt"): 36 | os.mkdir("yt") 37 | if audio_provider == "Youtube": 38 | logs.append("Downloading the audio...") 39 | yield None, "\n".join(logs) 40 | ydl_opts = { 41 | 'noplaylist': True, 42 | 'format': 'bestaudio/best', 43 | 'postprocessors': [{ 44 | 'key': 'FFmpegExtractAudio', 45 | 'preferredcodec': 'wav', 46 | }], 47 | "outtmpl": 'yt/audio', 48 | } 49 | audio_path = "yt/audio.wav" 50 | with yt_dlp.YoutubeDL(ydl_opts) as ydl: 51 | ydl.download([url]) 52 | logs.append("Download Complete.") 53 | yield audio_path, "\n".join(logs) 54 | 55 | def cut_vocal_and_inst(split_model): 56 | logs = [] 57 | logs.append("Starting the audio splitting process...") 58 | yield "\n".join(logs), None, None, None 59 | command = f"demucs --two-stems=vocals -n {split_model} yt/audio.wav -o output" 60 | result = subprocess.Popen(command.split(), stdout=subprocess.PIPE, text=True) 61 | for line in result.stdout: 62 | logs.append(line) 63 | yield "\n".join(logs), None, None, None 64 | logger.info(result.stdout) 65 | vocal = f"output/{split_model}/audio/vocals.wav" 66 | inst = f"output/{split_model}/audio/no_vocals.wav" 67 | logs.append("Audio splitting complete.") 68 | yield "\n".join(logs), vocal, inst, vocal 69 | 70 | def combine_vocal_and_inst(audio_data, vocal_volume, inst_volume, split_model): 71 | if not os.path.exists("output/result"): 72 | os.mkdir("output/result") 73 | vocal_path = "output/result/output.wav" 74 | output_path = "output/result/combine.mp3" 75 | inst_path = f"output/{split_model}/audio/no_vocals.wav" 76 | with wave.open(vocal_path, "w") as wave_file: 77 | wave_file.setnchannels(1) 78 | wave_file.setsampwidth(2) 79 | wave_file.setframerate(audio_data[0]) 80 | wave_file.writeframes(audio_data[1].tobytes()) 81 | command = f'ffmpeg -y -i {inst_path} -i {vocal_path} -filter_complex [0:a]volume={inst_volume}[i];[1:a]volume={vocal_volume}[v];[i][v]amix=inputs=2:duration=longest[a] -map [a] -b:a 320k -c:a libmp3lame {output_path}' 82 | result = subprocess.run(command.split(), stdout=subprocess.PIPE) 83 | logger.info(result.stdout.decode()) 84 | return output_path -------------------------------------------------------------------------------- /lib/vc/vc_infer_pipeline.py: -------------------------------------------------------------------------------- 1 | import numpy as np, parselmouth, torch, pdb, sys, os 2 | from time import time as ttime 3 | import torch.nn.functional as F 4 | import scipy.signal as signal 5 | import pyworld, os, traceback, faiss, librosa, torchcrepe 6 | from scipy import signal 7 | from functools import lru_cache 8 | 9 | now_dir = os.getcwd() 10 | sys.path.append(now_dir) 11 | 12 | bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) 13 | 14 | input_audio_path2wav = {} 15 | 16 | 17 | @lru_cache 18 | def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period): 19 | audio = input_audio_path2wav[input_audio_path] 20 | f0, t = pyworld.harvest( 21 | audio, 22 | fs=fs, 23 | f0_ceil=f0max, 24 | f0_floor=f0min, 25 | frame_period=frame_period, 26 | ) 27 | f0 = pyworld.stonemask(audio, f0, t, fs) 28 | return f0 29 | 30 | 31 | def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate是2的占比 32 | # print(data1.max(),data2.max()) 33 | rms1 = librosa.feature.rms( 34 | y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2 35 | ) # 每半秒一个点 36 | rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2) 37 | rms1 = torch.from_numpy(rms1) 38 | rms1 = F.interpolate( 39 | rms1.unsqueeze(0), size=data2.shape[0], mode="linear" 40 | ).squeeze() 41 | rms2 = torch.from_numpy(rms2) 42 | rms2 = F.interpolate( 43 | rms2.unsqueeze(0), size=data2.shape[0], mode="linear" 44 | ).squeeze() 45 | rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6) 46 | data2 *= ( 47 | torch.pow(rms1, torch.tensor(1 - rate)) 48 | * torch.pow(rms2, torch.tensor(rate - 1)) 49 | ).numpy() 50 | return data2 51 | 52 | 53 | class VC(object): 54 | def __init__(self, tgt_sr, config): 55 | self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = ( 56 | config.x_pad, 57 | config.x_query, 58 | config.x_center, 59 | config.x_max, 60 | config.is_half, 61 | ) 62 | self.sr = 16000 # hubert输入采样率 63 | self.window = 160 # 每帧点数 64 | self.t_pad = self.sr * self.x_pad # 每条前后pad时间 65 | self.t_pad_tgt = tgt_sr * self.x_pad 66 | self.t_pad2 = self.t_pad * 2 67 | self.t_query = self.sr * self.x_query # 查询切点前后查询时间 68 | self.t_center = self.sr * self.x_center # 查询切点位置 69 | self.t_max = self.sr * self.x_max # 免查询时长阈值 70 | self.device = config.device 71 | 72 | def get_f0( 73 | self, 74 | input_audio_path, 75 | x, 76 | p_len, 77 | f0_up_key, 78 | f0_method, 79 | filter_radius, 80 | inp_f0=None, 81 | ): 82 | global input_audio_path2wav 83 | time_step = self.window / self.sr * 1000 84 | f0_min = 50 85 | f0_max = 1100 86 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 87 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 88 | if f0_method == "pm": 89 | f0 = ( 90 | parselmouth.Sound(x, self.sr) 91 | .to_pitch_ac( 92 | time_step=time_step / 1000, 93 | voicing_threshold=0.6, 94 | pitch_floor=f0_min, 95 | pitch_ceiling=f0_max, 96 | ) 97 | .selected_array["frequency"] 98 | ) 99 | pad_size = (p_len - len(f0) + 1) // 2 100 | if pad_size > 0 or p_len - len(f0) - pad_size > 0: 101 | f0 = np.pad( 102 | f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" 103 | ) 104 | elif f0_method == "harvest": 105 | input_audio_path2wav[input_audio_path] = x.astype(np.double) 106 | f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10) 107 | if filter_radius > 2: 108 | f0 = signal.medfilt(f0, 3) 109 | elif f0_method == "crepe": 110 | model = "full" 111 | # Pick a batch size that doesn't cause memory errors on your gpu 112 | batch_size = 512 113 | # Compute pitch using first gpu 114 | audio = torch.tensor(np.copy(x))[None].float() 115 | f0, pd = torchcrepe.predict( 116 | audio, 117 | self.sr, 118 | self.window, 119 | f0_min, 120 | f0_max, 121 | model, 122 | batch_size=batch_size, 123 | device=self.device, 124 | return_periodicity=True, 125 | ) 126 | pd = torchcrepe.filter.median(pd, 3) 127 | f0 = torchcrepe.filter.mean(f0, 3) 128 | f0[pd < 0.1] = 0 129 | f0 = f0[0].cpu().numpy() 130 | elif f0_method == "rmvpe": 131 | if hasattr(self, "model_rmvpe") == False: 132 | from rmvpe import RMVPE 133 | 134 | print("loading rmvpe model") 135 | self.model_rmvpe = RMVPE( 136 | os.path.join("assets", "rvmpe", "rmvpe.pt"), is_half=self.is_half, device=self.device 137 | ) 138 | f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) 139 | f0 *= pow(2, f0_up_key / 12) 140 | # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) 141 | tf0 = self.sr // self.window # 每秒f0点数 142 | if inp_f0 is not None: 143 | delta_t = np.round( 144 | (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 145 | ).astype("int16") 146 | replace_f0 = np.interp( 147 | list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1] 148 | ) 149 | shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0] 150 | f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[ 151 | :shape 152 | ] 153 | # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) 154 | f0bak = f0.copy() 155 | f0_mel = 1127 * np.log(1 + f0 / 700) 156 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( 157 | f0_mel_max - f0_mel_min 158 | ) + 1 159 | f0_mel[f0_mel <= 1] = 1 160 | f0_mel[f0_mel > 255] = 255 161 | f0_coarse = np.rint(f0_mel).astype(np.int) 162 | return f0_coarse, f0bak # 1-0 163 | 164 | def vc( 165 | self, 166 | model, 167 | net_g, 168 | sid, 169 | audio0, 170 | pitch, 171 | pitchf, 172 | times, 173 | index, 174 | big_npy, 175 | index_rate, 176 | version, 177 | protect, 178 | ): # ,file_index,file_big_npy 179 | feats = torch.from_numpy(audio0) 180 | if self.is_half: 181 | feats = feats.half() 182 | else: 183 | feats = feats.float() 184 | if feats.dim() == 2: # double channels 185 | feats = feats.mean(-1) 186 | assert feats.dim() == 1, feats.dim() 187 | feats = feats.view(1, -1) 188 | padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) 189 | 190 | inputs = { 191 | "source": feats.to(self.device), 192 | "padding_mask": padding_mask, 193 | "output_layer": 9 if version == "v1" else 12, 194 | } 195 | t0 = ttime() 196 | with torch.no_grad(): 197 | logits = model.extract_features(**inputs) 198 | feats = model.final_proj(logits[0]) if version == "v1" else logits[0] 199 | if protect < 0.5 and pitch != None and pitchf != None: 200 | feats0 = feats.clone() 201 | if ( 202 | isinstance(index, type(None)) == False 203 | and isinstance(big_npy, type(None)) == False 204 | and index_rate != 0 205 | ): 206 | npy = feats[0].cpu().numpy() 207 | if self.is_half: 208 | npy = npy.astype("float32") 209 | 210 | # _, I = index.search(npy, 1) 211 | # npy = big_npy[I.squeeze()] 212 | 213 | score, ix = index.search(npy, k=8) 214 | weight = np.square(1 / score) 215 | weight /= weight.sum(axis=1, keepdims=True) 216 | npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) 217 | 218 | if self.is_half: 219 | npy = npy.astype("float16") 220 | feats = ( 221 | torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate 222 | + (1 - index_rate) * feats 223 | ) 224 | 225 | feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) 226 | if protect < 0.5 and pitch != None and pitchf != None: 227 | feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute( 228 | 0, 2, 1 229 | ) 230 | t1 = ttime() 231 | p_len = audio0.shape[0] // self.window 232 | if feats.shape[1] < p_len: 233 | p_len = feats.shape[1] 234 | if pitch != None and pitchf != None: 235 | pitch = pitch[:, :p_len] 236 | pitchf = pitchf[:, :p_len] 237 | 238 | if protect < 0.5 and pitch != None and pitchf != None: 239 | pitchff = pitchf.clone() 240 | pitchff[pitchf > 0] = 1 241 | pitchff[pitchf < 1] = protect 242 | pitchff = pitchff.unsqueeze(-1) 243 | feats = feats * pitchff + feats0 * (1 - pitchff) 244 | feats = feats.to(feats0.dtype) 245 | p_len = torch.tensor([p_len], device=self.device).long() 246 | with torch.no_grad(): 247 | if pitch != None and pitchf != None: 248 | audio1 = ( 249 | (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]) 250 | .data.cpu() 251 | .float() 252 | .numpy() 253 | ) 254 | else: 255 | audio1 = ( 256 | (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy() 257 | ) 258 | del feats, p_len, padding_mask 259 | if torch.cuda.is_available(): 260 | torch.cuda.empty_cache() 261 | t2 = ttime() 262 | times[0] += t1 - t0 263 | times[2] += t2 - t1 264 | return audio1 265 | 266 | def pipeline( 267 | self, 268 | model, 269 | net_g, 270 | sid, 271 | audio, 272 | input_audio_path, 273 | times, 274 | f0_up_key, 275 | f0_method, 276 | file_index, 277 | # file_big_npy, 278 | index_rate, 279 | if_f0, 280 | filter_radius, 281 | tgt_sr, 282 | resample_sr, 283 | rms_mix_rate, 284 | version, 285 | protect, 286 | f0_file=None, 287 | ): 288 | if ( 289 | file_index != "" 290 | # and file_big_npy != "" 291 | # and os.path.exists(file_big_npy) == True 292 | and os.path.exists(file_index) == True 293 | and index_rate != 0 294 | ): 295 | try: 296 | index = faiss.read_index(file_index) 297 | # big_npy = np.load(file_big_npy) 298 | big_npy = index.reconstruct_n(0, index.ntotal) 299 | except: 300 | traceback.print_exc() 301 | index = big_npy = None 302 | else: 303 | index = big_npy = None 304 | audio = signal.filtfilt(bh, ah, audio) 305 | audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") 306 | opt_ts = [] 307 | if audio_pad.shape[0] > self.t_max: 308 | audio_sum = np.zeros_like(audio) 309 | for i in range(self.window): 310 | audio_sum += audio_pad[i : i - self.window] 311 | for t in range(self.t_center, audio.shape[0], self.t_center): 312 | opt_ts.append( 313 | t 314 | - self.t_query 315 | + np.where( 316 | np.abs(audio_sum[t - self.t_query : t + self.t_query]) 317 | == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min() 318 | )[0][0] 319 | ) 320 | s = 0 321 | audio_opt = [] 322 | t = None 323 | t1 = ttime() 324 | audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") 325 | p_len = audio_pad.shape[0] // self.window 326 | inp_f0 = None 327 | if hasattr(f0_file, "name") == True: 328 | try: 329 | with open(f0_file.name, "r") as f: 330 | lines = f.read().strip("\n").split("\n") 331 | inp_f0 = [] 332 | for line in lines: 333 | inp_f0.append([float(i) for i in line.split(",")]) 334 | inp_f0 = np.array(inp_f0, dtype="float32") 335 | except: 336 | traceback.print_exc() 337 | sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() 338 | pitch, pitchf = None, None 339 | if if_f0 == 1: 340 | pitch, pitchf = self.get_f0( 341 | input_audio_path, 342 | audio_pad, 343 | p_len, 344 | f0_up_key, 345 | f0_method, 346 | filter_radius, 347 | inp_f0, 348 | ) 349 | pitch = pitch[:p_len] 350 | pitchf = pitchf[:p_len] 351 | if self.device == "mps": 352 | pitchf = pitchf.astype(np.float32) 353 | pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() 354 | pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() 355 | t2 = ttime() 356 | times[1] += t2 - t1 357 | for t in opt_ts: 358 | t = t // self.window * self.window 359 | if if_f0 == 1: 360 | audio_opt.append( 361 | self.vc( 362 | model, 363 | net_g, 364 | sid, 365 | audio_pad[s : t + self.t_pad2 + self.window], 366 | pitch[:, s // self.window : (t + self.t_pad2) // self.window], 367 | pitchf[:, s // self.window : (t + self.t_pad2) // self.window], 368 | times, 369 | index, 370 | big_npy, 371 | index_rate, 372 | version, 373 | protect, 374 | )[self.t_pad_tgt : -self.t_pad_tgt] 375 | ) 376 | else: 377 | audio_opt.append( 378 | self.vc( 379 | model, 380 | net_g, 381 | sid, 382 | audio_pad[s : t + self.t_pad2 + self.window], 383 | None, 384 | None, 385 | times, 386 | index, 387 | big_npy, 388 | index_rate, 389 | version, 390 | protect, 391 | )[self.t_pad_tgt : -self.t_pad_tgt] 392 | ) 393 | s = t 394 | if if_f0 == 1: 395 | audio_opt.append( 396 | self.vc( 397 | model, 398 | net_g, 399 | sid, 400 | audio_pad[t:], 401 | pitch[:, t // self.window :] if t is not None else pitch, 402 | pitchf[:, t // self.window :] if t is not None else pitchf, 403 | times, 404 | index, 405 | big_npy, 406 | index_rate, 407 | version, 408 | protect, 409 | )[self.t_pad_tgt : -self.t_pad_tgt] 410 | ) 411 | else: 412 | audio_opt.append( 413 | self.vc( 414 | model, 415 | net_g, 416 | sid, 417 | audio_pad[t:], 418 | None, 419 | None, 420 | times, 421 | index, 422 | big_npy, 423 | index_rate, 424 | version, 425 | protect, 426 | )[self.t_pad_tgt : -self.t_pad_tgt] 427 | ) 428 | audio_opt = np.concatenate(audio_opt) 429 | if rms_mix_rate != 1: 430 | audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate) 431 | if resample_sr >= 16000 and tgt_sr != resample_sr: 432 | audio_opt = librosa.resample( 433 | audio_opt, orig_sr=tgt_sr, target_sr=resample_sr 434 | ) 435 | audio_max = np.abs(audio_opt).max() / 0.99 436 | max_int16 = 32768 437 | if audio_max > 1: 438 | max_int16 /= audio_max 439 | audio_opt = (audio_opt * max_int16).astype(np.int16) 440 | del pitch, pitchf, sid 441 | if torch.cuda.is_available(): 442 | torch.cuda.empty_cache() 443 | return audio_opt 444 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | wheel 2 | setuptools 3 | ffmpeg 4 | numba==0.56.4 5 | numpy==1.23.5 6 | scipy 7 | librosa==0.9.1 8 | fairseq==0.12.2 9 | faiss-cpu==1.7.3 10 | gradio>==4.19.2 11 | pyworld==0.3.2 12 | soundfile>=0.12.1 13 | praat-parselmouth>=0.4.2 14 | httpx==0.23.0 15 | tensorboard 16 | tensorboardX 17 | torchcrepe 18 | onnxruntime 19 | demucs 20 | edge-tts 21 | yt_dlp 22 | pytube 23 | av 24 | -------------------------------------------------------------------------------- /run.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Activate virtual environment 4 | call venv\Scripts\activate 5 | 6 | REM Run the inference script 7 | python app.py -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | # Activate virtual environment 2 | source my_env/bin/activate 3 | 4 | # Run the inference script 5 | python app.py 6 | -------------------------------------------------------------------------------- /start.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | rem Check if Python exists 4 | python --version > NUL 2>&1 5 | IF ERRORLEVEL 1 ( 6 | ECHO Python is not installed. Please install Python before running this script. 7 | EXIT /B 1 8 | ) 9 | 10 | rem Create virtual environment (.venv) 11 | python -m venv .venv 12 | 13 | rem Activate virtual environment 14 | .venv\Scripts\activate 15 | 16 | rem Check for Nvidia GPU using nvidia-smi 17 | nvidia-smi > NUL 2>&1 18 | IF ERRORLEVEL 1 ( 19 | rem Install CPU version 20 | pip install torch torchvision torchaudio -f https://download.pytorch.org/whl/cpu 21 | ) ELSE ( 22 | rem Install GPU version 23 | pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121 24 | ) 25 | 26 | rem Install dependencies from requirements.txt 27 | pip install -r requirements.txt 28 | 29 | rem Download requirement voice model 30 | powershell -Command "Invoke-WebRequest https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt?download=true -OutFile assets/hubert/hubert_base.pt" 31 | powershell -Command "Invoke-WebRequest https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt?download=true -OutFile assets/rvmpe/rmvpe.pt" 32 | 33 | rem Run the inference app 34 | python app.py 35 | 36 | ECHO Finished! -------------------------------------------------------------------------------- /start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if Python exists 4 | if ! command -v python &> /dev/null; then 5 | echo "Python is not installed. Please install Python before running this script." 6 | exit 1 7 | fi 8 | 9 | # Create virtual environment (.venv) 10 | python -m venv .venv 11 | 12 | # Activate virtual environment 13 | source .venv/bin/activate 14 | 15 | # Check for Nvidia GPU using nvidia-smi 16 | if nvidia-smi &> /dev/null; then 17 | # Install GPU version 18 | pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121 19 | else 20 | # Install CPU version 21 | pip install torch torchvision torchaudio -f https://download.pytorch.org/whl/cpu 22 | fi 23 | 24 | # Install dependencies from requirements.txt 25 | pip install -r requirements.txt 26 | 27 | # Download requirement voice model 28 | wget https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt?download=true -O assets/hubert/hubert_base.pt 29 | wget https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt?download=true -O assets/rvmpe/rmvpe.pt 30 | 31 | 32 | # Run the inference app 33 | python app.py 34 | 35 | echo "Finished!" 36 | -------------------------------------------------------------------------------- /weights/folder_info.json: -------------------------------------------------------------------------------- 1 | { 2 | "CATEGORY_TAB_NAME":{ 3 | "enable": true, 4 | "title": "CATEGORY_TITLE", 5 | "folder_path": "CATEGORY_FOLDER_PATH", 6 | "description": "CATEGORY_DESCRIPTION" 7 | } 8 | } -------------------------------------------------------------------------------- /weights/model_pack/model_info.json: -------------------------------------------------------------------------------- 1 | { 2 | "FOLDER_PATH": { 3 | "enable": true, 4 | "model_path": "CHARACTER_BASEMODEL", 5 | "title": "CHARACTER_NAME", 6 | "cover": "CHARACTER_IMAGE", 7 | "feature_retrieval_library": "CHARACTER_MODEL_INDEX", 8 | "author": "MODEL_AUTHOR" 9 | } 10 | } --------------------------------------------------------------------------------