├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── app.py ├── assets ├── hubert │ └── req-hubert.txt └── rvmpe │ └── req-rvmpe.txt ├── docs ├── COMMAND_LINE_ARGUMENTS.md ├── HOW_TO_USE.md └── img │ └── ui_example.jpg ├── lib ├── config │ └── config.py ├── infer_pack │ ├── attentions.py │ ├── commons.py │ ├── models.py │ ├── models_dml.py │ ├── models_onnx.py │ ├── modules.py │ ├── modules │ │ └── F0Predictor │ │ │ ├── DioF0Predictor.py │ │ │ ├── F0Predictor.py │ │ │ ├── HarvestF0Predictor.py │ │ │ ├── PMF0Predictor.py │ │ │ └── __init__.py │ ├── onnx_inference.py │ └── transforms.py └── vc │ ├── audio.py │ ├── rmvpe.py │ ├── settings.py │ ├── utils.py │ └── vc_infer_pipeline.py ├── requirements.txt ├── run.bat ├── run.sh ├── start.bat ├── start.sh └── weights ├── folder_info.json └── model_pack └── model_info.json /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Ww][Ii][Nn]32/ 27 | [Aa][Rr][Mm]/ 28 | [Aa][Rr][Mm]64/ 29 | bld/ 30 | [Bb]in/ 31 | [Oo]bj/ 32 | [Oo]ut/ 33 | [Ll]og/ 34 | [Ll]ogs/ 35 | infer_pack\__pycache__ 36 | # Visual Studio 2015/2017 cache/options directory 37 | .vs/ 38 | # Uncomment if you have tasks that create the project's static files in wwwroot 39 | #wwwroot/ 40 | 41 | # Visual Studio 2017 auto generated files 42 | Generated\ Files/ 43 | 44 | # MSTest test Results 45 | [Tt]est[Rr]esult*/ 46 | [Bb]uild[Ll]og.* 47 | 48 | # NUnit 49 | *.VisualState.xml 50 | TestResult.xml 51 | nunit-*.xml 52 | 53 | # Build Results of an ATL Project 54 | [Dd]ebugPS/ 55 | [Rr]eleasePS/ 56 | dlldata.c 57 | 58 | # Benchmark Results 59 | BenchmarkDotNet.Artifacts/ 60 | 61 | # .NET Core 62 | project.lock.json 63 | project.fragment.lock.json 64 | artifacts/ 65 | 66 | # ASP.NET Scaffolding 67 | ScaffoldingReadMe.txt 68 | 69 | # StyleCop 70 | StyleCopReport.xml 71 | 72 | # Files built by Visual Studio 73 | *_i.c 74 | *_p.c 75 | *_h.h 76 | *.ilk 77 | *.meta 78 | *.obj 79 | *.iobj 80 | *.pch 81 | *.pdb 82 | *.ipdb 83 | *.pgc 84 | *.pgd 85 | *.rsp 86 | *.sbr 87 | *.tlb 88 | *.tli 89 | *.tlh 90 | *.tmp 91 | *.tmp_proj 92 | *_wpftmp.csproj 93 | *.log 94 | *.vspscc 95 | *.vssscc 96 | .builds 97 | *.pidb 98 | *.svclog 99 | *.scc 100 | 101 | # Chutzpah Test files 102 | _Chutzpah* 103 | 104 | # Visual C++ cache files 105 | ipch/ 106 | *.aps 107 | *.ncb 108 | *.opendb 109 | *.opensdf 110 | *.sdf 111 | *.cachefile 112 | *.VC.db 113 | *.VC.VC.opendb 114 | 115 | # Visual Studio profiler 116 | *.psess 117 | *.vsp 118 | *.vspx 119 | *.sap 120 | 121 | # Visual Studio Trace Files 122 | *.e2e 123 | 124 | # TFS 2012 Local Workspace 125 | $tf/ 126 | 127 | # Guidance Automation Toolkit 128 | *.gpState 129 | 130 | # ReSharper is a .NET coding add-in 131 | _ReSharper*/ 132 | *.[Rr]e[Ss]harper 133 | *.DotSettings.user 134 | 135 | # TeamCity is a build add-in 136 | _TeamCity* 137 | 138 | # DotCover is a Code Coverage Tool 139 | *.dotCover 140 | 141 | # AxoCover is a Code Coverage Tool 142 | 
.axoCover/* 143 | !.axoCover/settings.json 144 | 145 | # Coverlet is a free, cross platform Code Coverage Tool 146 | coverage*.json 147 | coverage*.xml 148 | coverage*.info 149 | 150 | # Visual Studio code coverage results 151 | *.coverage 152 | *.coveragexml 153 | 154 | # NCrunch 155 | _NCrunch_* 156 | .*crunch*.local.xml 157 | nCrunchTemp_* 158 | 159 | # MightyMoose 160 | *.mm.* 161 | AutoTest.Net/ 162 | 163 | # Web workbench (sass) 164 | .sass-cache/ 165 | 166 | # Installshield output folder 167 | [Ee]xpress/ 168 | 169 | # DocProject is a documentation generator add-in 170 | DocProject/buildhelp/ 171 | DocProject/Help/*.HxT 172 | DocProject/Help/*.HxC 173 | DocProject/Help/*.hhc 174 | DocProject/Help/*.hhk 175 | DocProject/Help/*.hhp 176 | DocProject/Help/Html2 177 | DocProject/Help/html 178 | 179 | # Click-Once directory 180 | publish/ 181 | 182 | # Publish Web Output 183 | *.[Pp]ublish.xml 184 | *.azurePubxml 185 | # Note: Comment the next line if you want to checkin your web deploy settings, 186 | # but database connection strings (with potential passwords) will be unencrypted 187 | *.pubxml 188 | *.publishproj 189 | 190 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 191 | # checkin your Azure Web App publish settings, but sensitive information contained 192 | # in these scripts will be unencrypted 193 | PublishScripts/ 194 | 195 | # NuGet Packages 196 | *.nupkg 197 | # NuGet Symbol Packages 198 | *.snupkg 199 | # The packages folder can be ignored because of Package Restore 200 | **/[Pp]ackages/* 201 | # except build/, which is used as an MSBuild target. 202 | !**/[Pp]ackages/build/ 203 | # Uncomment if necessary however generally it will be regenerated when needed 204 | #!**/[Pp]ackages/repositories.config 205 | # NuGet v3's project.json files produces more ignorable files 206 | *.nuget.props 207 | *.nuget.targets 208 | 209 | # Microsoft Azure Build Output 210 | csx/ 211 | *.build.csdef 212 | 213 | # Microsoft Azure Emulator 214 | ecf/ 215 | rcf/ 216 | 217 | # Windows Store app package directories and files 218 | AppPackages/ 219 | BundleArtifacts/ 220 | Package.StoreAssociation.xml 221 | _pkginfo.txt 222 | *.appx 223 | *.appxbundle 224 | *.appxupload 225 | 226 | # Visual Studio cache files 227 | # files ending in .cache can be ignored 228 | *.[Cc]ache 229 | # but keep track of directories ending in .cache 230 | !?*.[Cc]ache/ 231 | 232 | # Others 233 | ClientBin/ 234 | ~$* 235 | *~ 236 | *.dbmdl 237 | *.dbproj.schemaview 238 | *.jfm 239 | *.pfx 240 | *.publishsettings 241 | orleans.codegen.cs 242 | 243 | # Including strong name files can present a security risk 244 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 245 | #*.snk 246 | 247 | # Since there are multiple workflows, uncomment next line to ignore bower_components 248 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 249 | #bower_components/ 250 | 251 | # RIA/Silverlight projects 252 | Generated_Code/ 253 | 254 | # Backup & report files from converting an old project file 255 | # to a newer Visual Studio version. 
Backup files are not needed, 256 | # because we have git ;-) 257 | _UpgradeReport_Files/ 258 | Backup*/ 259 | UpgradeLog*.XML 260 | UpgradeLog*.htm 261 | ServiceFabricBackup/ 262 | *.rptproj.bak 263 | 264 | # SQL Server files 265 | *.mdf 266 | *.ldf 267 | *.ndf 268 | 269 | # Business Intelligence projects 270 | *.rdl.data 271 | *.bim.layout 272 | *.bim_*.settings 273 | *.rptproj.rsuser 274 | *- [Bb]ackup.rdl 275 | *- [Bb]ackup ([0-9]).rdl 276 | *- [Bb]ackup ([0-9][0-9]).rdl 277 | 278 | # Microsoft Fakes 279 | FakesAssemblies/ 280 | 281 | # GhostDoc plugin setting file 282 | *.GhostDoc.xml 283 | 284 | # Node.js Tools for Visual Studio 285 | .ntvs_analysis.dat 286 | node_modules/ 287 | 288 | # Visual Studio 6 build log 289 | *.plg 290 | 291 | # Visual Studio 6 workspace options file 292 | *.opt 293 | 294 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 295 | *.vbw 296 | 297 | # Visual Studio LightSwitch build output 298 | **/*.HTMLClient/GeneratedArtifacts 299 | **/*.DesktopClient/GeneratedArtifacts 300 | **/*.DesktopClient/ModelManifest.xml 301 | **/*.Server/GeneratedArtifacts 302 | **/*.Server/ModelManifest.xml 303 | _Pvt_Extensions 304 | 305 | # Paket dependency manager 306 | .paket/paket.exe 307 | paket-files/ 308 | 309 | # FAKE - F# Make 310 | .fake/ 311 | 312 | # CodeRush personal settings 313 | .cr/personal 314 | 315 | # Python Tools for Visual Studio (PTVS) 316 | __pycache__/ 317 | 318 | 319 | # Cake - Uncomment if you are using it 320 | # tools/** 321 | # !tools/packages.config 322 | 323 | # Tabs Studio 324 | *.tss 325 | 326 | # Telerik's JustMock configuration file 327 | *.jmconfig 328 | 329 | # BizTalk build output 330 | *.btp.cs 331 | *.btm.cs 332 | *.odx.cs 333 | *.xsd.cs 334 | 335 | # OpenCover UI analysis results 336 | OpenCover/ 337 | 338 | # Azure Stream Analytics local run output 339 | ASALocalRun/ 340 | 341 | # MSBuild Binary and Structured Log 342 | *.binlog 343 | 344 | # NVidia Nsight GPU debugger configuration file 345 | *.nvuser 346 | 347 | # MFractors (Xamarin productivity tool) working folder 348 | .mfractor/ 349 | 350 | # Local History for Visual Studio 351 | .localhistory/ 352 | 353 | # BeatPulse healthcheck temp database 354 | healthchecksdb 355 | 356 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 357 | MigrationBackup/ 358 | 359 | # Ionide (cross platform F# VS Code tools) working folder 360 | .ionide/ 361 | 362 | # Fody - auto-generated XML schema 363 | FodyWeavers.xsd 364 | 365 | # build 366 | build 367 | monotonic_align/core.c 368 | *.o 369 | *.so 370 | *.dll 371 | 372 | # data 373 | /config.json 374 | /*.pth 375 | *.wav 376 | /monotonic_align/monotonic_align 377 | /resources 378 | /MoeGoe.spec 379 | /dist/MoeGoe 380 | /dist 381 | 382 | /env 383 | .idea 384 | .venv 385 | *.pt 386 | output -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 02/03/2024 Changelog:
2 | - Rearranged code and updated dependencies. 3 | - Added batch and shell scripts for easy installation. 4 | 5 | 12/09/2023 Changelog:
6 | - Added documentation. 7 | - Added support for loading models without JSON config files. 8 | 9 | 13/08/2023 Changelog:
10 | - Fixed bugs. 11 | 12 | 08/08/2023 Changelog:
13 | - Adjusted limitations. 14 | - UI changes for YouTube input. 15 | - Added instrument volume control. 16 | 17 | 29/07/2023 Changelog:
18 | - UI Changes for Non Limitation. 19 | - Added More Splitter Model. 20 | - Separate Youtube Download and Splitter. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 arkandash 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # Multi-Model RVC Inference 4 | ### Simplified RVC Inference for HuggingFace or Google Colab 5 | 6 | [![License](https://img.shields.io/github/license/arkandash/Multi-Model-RVC-Inference?style=for-the-badge)](https://github.com/ArkanDash/Multi-Model-RVC-Inference/blob/master/LICENSE) 7 | [![Repository](https://img.shields.io/badge/Github-Multi%20Model%20RVC%20Inference-blue?style=for-the-badge&logo=github)](https://github.com/ArkanDash/Multi-Model-RVC-Inference) 8 |
9 | 10 | ### Information 11 | Please support the original RVC project; without it, this inference would not have been possible.
12 | [![Original RVC Repository](https://img.shields.io/badge/Github-Original%20RVC%20Repository-blue?style=for-the-badge&logo=github)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI) 13 | #### Features 14 | - Supports V1 & V2 models ✅ 15 | - YouTube audio downloader ✅ 16 | - Demucs (voice splitter) [internet required to download the model] ✅ 17 | - TTS support ✅ 18 | - Microphone support ✅ 19 | - HuggingFace Spaces inference [CPU tier only] ✅ 20 | - YouTube and input-path modes removed on HuggingFace Spaces ✅ 21 | - Crepe support removed due to its GPU requirement ✅ 22 | 23 | ### Automatic Installation 24 | Install [ffmpeg](https://ffmpeg.org/) before running these commands. 25 | - Windows 26 | Run `start.bat` to download the models and dependencies.
27 | Run `run.bat` to start the inference. 28 | - macOS & Linux 29 | On macOS, install [wget](https://formulae.brew.sh/formula/wget) before running the script.
30 | Run `start.sh` to download the models and dependencies.
31 | Run `run.sh` to start the inference. 32 | 33 | ### Manual Installation 34 | 35 | 1. Install PyTorch
36 | - CPU only (any OS) 37 | ```bash 38 | pip install torch torchvision torchaudio 39 | ``` 40 | - NVIDIA (CUDA) 41 | ```bash 42 | # For Windows (flash-attention v2 is not supported on Windows; see https://github.com/Dao-AILab/flash-attention/issues/345#issuecomment-1747473481) 43 | pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121 44 | # Other (Linux, etc.) 45 | pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 46 | ``` 47 | 48 | 2. Install [ffmpeg](https://ffmpeg.org/) 49 | 50 | 3. Install Dependencies
51 | ```bash 52 | pip install -r requirements.txt 53 | ``` 54 | 55 | 4. Download the pre-trained models 56 | ```bash 57 | # Hubert model 58 | https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/hubert_base.pt 59 | # Save it to /assets/hubert/hubert_base.pt 60 | 61 | # RMVPE (rmvpe pitch extraction, optional) 62 | https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/rmvpe.pt 63 | # Save it to /assets/rvmpe/rmvpe.pt 64 | 65 | 66 | 5. Run WebUI
67 | ```bash 68 | python app.py 69 | ``` 70 | 71 | ### [How to use](docs/HOW_TO_USE.md) 72 | ### [Command Line Arguments](docs/COMMAND_LINE_ARGUMENTS.md) 73 | 74 | # Other Inference 75 | [![Advanced RVC Inference](https://img.shields.io/badge/Github-Advanced_RVC_Inference-blue?style=for-the-badge&logo=github)](https://github.com/ArkanDash/Advanced-RVC-Inference) -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import json 4 | import traceback 5 | import logging 6 | import gradio as gr 7 | import numpy as np 8 | import librosa 9 | import torch 10 | import asyncio 11 | import edge_tts 12 | import sys 13 | import io 14 | 15 | from datetime import datetime 16 | from lib.config.config import Config 17 | from lib.vc.vc_infer_pipeline import VC 18 | from lib.vc.settings import change_audio_mode 19 | from lib.vc.audio import load_audio 20 | from lib.infer_pack.models import ( 21 | SynthesizerTrnMs256NSFsid, 22 | SynthesizerTrnMs256NSFsid_nono, 23 | SynthesizerTrnMs768NSFsid, 24 | SynthesizerTrnMs768NSFsid_nono, 25 | ) 26 | from lib.vc.utils import ( 27 | combine_vocal_and_inst, 28 | cut_vocal_and_inst, 29 | download_audio, 30 | load_hubert 31 | ) 32 | 33 | config = Config() 34 | logging.getLogger("numba").setLevel(logging.WARNING) 35 | logger = logging.getLogger(__name__) 36 | spaces = os.getenv("SYSTEM") == "spaces" 37 | force_support = None 38 | if config.unsupported is False: 39 | if config.device == "mps" or config.device == "cpu": 40 | force_support = False 41 | else: 42 | force_support = True 43 | 44 | audio_mode = [] 45 | f0method_mode = [] 46 | f0method_info = "" 47 | hubert_model = load_hubert(config) 48 | 49 | if force_support is False or spaces is True: 50 | if spaces is True: 51 | audio_mode = ["Upload audio", "TTS Audio"] 52 | else: 53 | audio_mode = ["Input path", "Upload audio", "TTS Audio"] 54 | f0method_mode = ["pm", "harvest"] 55 | f0method_info = "PM is fast, Harvest is good but extremely slow, Rvmpe is alternative to harvest (might be better). (Default: PM)" 56 | else: 57 | audio_mode = ["Input path", "Upload audio", "Youtube", "TTS Audio"] 58 | f0method_mode = ["pm", "harvest", "crepe"] 59 | f0method_info = "PM is fast, Harvest is good but extremely slow, Rvmpe is alternative to harvest (might be better), and Crepe effect is good but requires GPU (Default: PM)" 60 | 61 | if os.path.isfile("rmvpe.pt"): 62 | f0method_mode.insert(2, "rmvpe") 63 | 64 | def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index): 65 | def vc_fn( 66 | vc_audio_mode, 67 | vc_input, 68 | vc_upload, 69 | tts_text, 70 | tts_voice, 71 | f0_up_key, 72 | f0_method, 73 | index_rate, 74 | filter_radius, 75 | resample_sr, 76 | rms_mix_rate, 77 | protect, 78 | ): 79 | try: 80 | logs = [] 81 | logger.info(f"Converting using {model_name}...") 82 | logs.append(f"Converting using {model_name}...") 83 | yield "\n".join(logs), None 84 | if vc_audio_mode == "Input path" or "Youtube" and vc_input != "": 85 | audio = load_audio(vc_input, 16000) 86 | audio_max = np.abs(audio).max() / 0.95 87 | if audio_max > 1: 88 | audio /= audio_max 89 | elif vc_audio_mode == "Upload audio": 90 | if vc_upload is None: 91 | return "You need to upload an audio", None 92 | sampling_rate, audio = vc_upload 93 | duration = audio.shape[0] / sampling_rate 94 | if duration > 20 and spaces: 95 | return "Please upload an audio file that is less than 20 seconds. 
If you need to generate a longer audio file, please use Colab.", None 96 | audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32) 97 | if len(audio.shape) > 1: 98 | audio = librosa.to_mono(audio.transpose(1, 0)) 99 | if sampling_rate != 16000: 100 | audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000) 101 | elif vc_audio_mode == "TTS Audio": 102 | if len(tts_text) > 100 and spaces: 103 | return "Text is too long", None 104 | if tts_text is None or tts_voice is None: 105 | return "You need to enter text and select a voice", None 106 | os.makedirs("output", exist_ok=True) 107 | os.makedirs(os.path.join("output", "tts"), exist_ok=True) 108 | asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save(os.path.join("output", "tts", "tts.mp3"))) 109 | audio, sr = librosa.load(os.path.join("output", "tts", "tts.mp3"), sr=16000, mono=True) 110 | vc_input = os.path.join("output", "tts", "tts.mp3") 111 | times = [0, 0, 0] 112 | f0_up_key = int(f0_up_key) 113 | audio_opt = vc.pipeline( 114 | hubert_model, 115 | net_g, 116 | 0, 117 | audio, 118 | vc_input, 119 | times, 120 | f0_up_key, 121 | f0_method, 122 | file_index, 123 | # file_big_npy, 124 | index_rate, 125 | if_f0, 126 | filter_radius, 127 | tgt_sr, 128 | resample_sr, 129 | rms_mix_rate, 130 | version, 131 | protect, 132 | f0_file=None, 133 | ) 134 | info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s" 135 | logger.info(f"{model_name} | {info}") 136 | logs.append(f"Successfully Convert {model_name}\n{info}") 137 | yield "\n".join(logs), (tgt_sr, audio_opt) 138 | except Exception as err: 139 | info = traceback.format_exc() 140 | logger.error(info) 141 | logger.error(f"Error when using {model_name}.\n{str(err)}") 142 | yield info, None 143 | return vc_fn 144 | 145 | def load_model(): 146 | categories = [] 147 | category_count = 0 148 | if os.path.isfile("weights/folder_info.json"): 149 | with open("weights/folder_info.json", "r", encoding="utf-8") as f: 150 | folder_info = json.load(f) 151 | for category_name, category_info in folder_info.items(): 152 | if not category_info['enable']: 153 | continue 154 | category_title = category_info['title'] 155 | category_folder = category_info['folder_path'] 156 | description = category_info['description'] 157 | models = [] 158 | with open(f"weights/{category_folder}/model_info.json", "r", encoding="utf-8") as f: 159 | models_info = json.load(f) 160 | for character_name, info in models_info.items(): 161 | if not info['enable']: 162 | continue 163 | model_title = info['title'] 164 | model_name = info['model_path'] 165 | model_author = info.get("author", None) 166 | model_cover = f"weights/{category_folder}/{character_name}/{info['cover']}" 167 | model_index = f"weights/{category_folder}/{character_name}/{info['feature_retrieval_library']}" 168 | cpt = torch.load(f"weights/{category_folder}/{character_name}/{model_name}", map_location="cpu") 169 | tgt_sr = cpt["config"][-1] 170 | cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk 171 | if_f0 = cpt.get("f0", 1) 172 | version = cpt.get("version", "v1") 173 | if version == "v1": 174 | if if_f0 == 1: 175 | net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half) 176 | else: 177 | net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) 178 | model_version = "V1" 179 | elif version == "v2": 180 | if if_f0 == 1: 181 | net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half) 182 | else: 183 | net_g = 
SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) 184 | model_version = "V2" 185 | del net_g.enc_q 186 | logger.info(net_g.load_state_dict(cpt["weight"], strict=False)) 187 | net_g.eval().to(config.device) 188 | if config.is_half: 189 | net_g = net_g.half() 190 | else: 191 | net_g = net_g.float() 192 | vc = VC(tgt_sr, config) 193 | logger.info(f"Model loaded: {character_name} / {info['feature_retrieval_library']} | ({model_version})") 194 | models.append((character_name, model_title, model_author, model_cover, model_version, create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, model_index))) 195 | category_count += 1 196 | categories.append([category_title, description, models]) 197 | elif os.path.exists("weights"): 198 | models = [] 199 | for w_root, w_dirs, _ in os.walk("weights"): 200 | model_count = 1 201 | for sub_dir in w_dirs: 202 | pth_files = glob.glob(f"weights/{sub_dir}/*.pth") 203 | index_files = glob.glob(f"weights/{sub_dir}/*.index") 204 | if pth_files == []: 205 | logger.debug(f"Model [{model_count}/{len(w_dirs)}]: No Model file detected, skipping...") 206 | continue 207 | cpt = torch.load(pth_files[0]) 208 | tgt_sr = cpt["config"][-1] 209 | cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk 210 | if_f0 = cpt.get("f0", 1) 211 | version = cpt.get("version", "v1") 212 | if version == "v1": 213 | if if_f0 == 1: 214 | net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half) 215 | else: 216 | net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) 217 | model_version = "V1" 218 | elif version == "v2": 219 | if if_f0 == 1: 220 | net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half) 221 | else: 222 | net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) 223 | model_version = "V2" 224 | del net_g.enc_q 225 | logger.info(net_g.load_state_dict(cpt["weight"], strict=False)) 226 | net_g.eval().to(config.device) 227 | if config.is_half: 228 | net_g = net_g.half() 229 | else: 230 | net_g = net_g.float() 231 | vc = VC(tgt_sr, config) 232 | if index_files == []: 233 | logger.warning("No Index file detected!") 234 | index_info = "None" 235 | model_index = "" 236 | else: 237 | index_info = index_files[0] 238 | model_index = index_files[0] 239 | logger.info(f"Model loaded [{model_count}/{len(w_dirs)}]: {index_files[0]} / {index_info} | ({model_version})") 240 | model_count += 1 241 | models.append((index_files[0][:-4], index_files[0][:-4], "", "", model_version, create_vc_fn(index_files[0], tgt_sr, net_g, vc, if_f0, version, model_index))) 242 | categories.append(["Models", "", models]) 243 | else: 244 | categories = [] 245 | return categories 246 | 247 | if __name__ == '__main__': 248 | categories = load_model() 249 | tts_voice_list = asyncio.new_event_loop().run_until_complete(edge_tts.list_voices()) 250 | voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list] 251 | with gr.Blocks() as app: 252 | gr.Markdown( 253 | "
\n\n"+ 254 | "# Multi Model RVC Inference\n\n"+ 255 | "[![Repository](https://img.shields.io/badge/Github-Multi%20Model%20RVC%20Inference-blue?style=for-the-badge&logo=github)](https://github.com/ArkanDash/Multi-Model-RVC-Inference)\n\n"+ 256 | "
" 257 | ) 258 | if categories == []: 259 | gr.Markdown( 260 | "
\n\n"+ 261 | "## No model found, please add the model into weights folder\n\n"+ 262 | "
" 263 | ) 264 | for (folder_title, description, models) in categories: 265 | with gr.TabItem(folder_title): 266 | if description: 267 | gr.Markdown(f"###
{description}") 268 | with gr.Tabs(): 269 | if not models: 270 | gr.Markdown("#
No Model Loaded.") 271 | gr.Markdown("##
Please add the model or fix your model path.") 272 | continue 273 | for (name, title, author, cover, model_version, vc_fn) in models: 274 | with gr.TabItem(name): 275 | with gr.Row(): 276 | gr.Markdown( 277 | '
' 278 | f'
{title}
\n'+ 279 | f'
RVC {model_version} Model
\n'+ 280 | (f'
Model author: {author}
' if author else "")+ 281 | (f'' if cover else "")+ 282 | '
' 283 | ) 284 | with gr.Row(): 285 | if spaces is False: 286 | with gr.TabItem("Input"): 287 | with gr.Row(): 288 | with gr.Column(): 289 | vc_audio_mode = gr.Dropdown(label="Input voice", choices=audio_mode, allow_custom_value=False, value="Upload audio") 290 | # Input 291 | vc_input = gr.Textbox(label="Input audio path", visible=False) 292 | # Upload 293 | vc_upload = gr.Audio(label="Upload audio file", sources=["upload", "microphone"], visible=True, interactive=True) 294 | # Youtube 295 | vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)") 296 | vc_link = gr.Textbox(label="Youtube URL", visible=False, info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A", placeholder="https://www.youtube.com/watch?v=...") 297 | vc_log_yt = gr.Textbox(label="Output Information", visible=False, interactive=False) 298 | vc_download_button = gr.Button("Download Audio", variant="primary", visible=False) 299 | vc_audio_preview = gr.Audio(label="Audio Preview", visible=False) 300 | # TTS 301 | tts_text = gr.Textbox(label="TTS text", info="Text to speech input", visible=False) 302 | tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female") 303 | with gr.Column(): 304 | vc_split_model = gr.Dropdown(label="Splitter Model", choices=["hdemucs_mmi", "htdemucs", "htdemucs_ft", "mdx", "mdx_q", "mdx_extra_q"], allow_custom_value=False, visible=False, value="htdemucs", info="Select the splitter model (Default: htdemucs)") 305 | vc_split_log = gr.Textbox(label="Output Information", visible=False, interactive=False) 306 | vc_split = gr.Button("Split Audio", variant="primary", visible=False) 307 | vc_vocal_preview = gr.Audio(label="Vocal Preview", visible=False) 308 | vc_inst_preview = gr.Audio(label="Instrumental Preview", visible=False) 309 | with gr.TabItem("Convert"): 310 | with gr.Row(): 311 | with gr.Column(): 312 | vc_transform0 = gr.Number(label="Transpose", value=0, info='Type "12" to change from male to female voice. Type "-12" to change female to male voice') 313 | f0method0 = gr.Radio( 314 | label="Pitch extraction algorithm", 315 | info=f0method_info, 316 | choices=f0method_mode, 317 | value="pm", 318 | interactive=True 319 | ) 320 | index_rate1 = gr.Slider( 321 | minimum=0, 322 | maximum=1, 323 | label="Retrieval feature ratio", 324 | info="(Default: 0.7)", 325 | value=0.7, 326 | interactive=True, 327 | ) 328 | filter_radius0 = gr.Slider( 329 | minimum=0, 330 | maximum=7, 331 | label="Apply Median Filtering", 332 | info="The value represents the filter radius and can reduce breathiness.", 333 | value=3, 334 | step=1, 335 | interactive=True, 336 | ) 337 | resample_sr0 = gr.Slider( 338 | minimum=0, 339 | maximum=48000, 340 | label="Resample the output audio", 341 | info="Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling", 342 | value=0, 343 | step=1, 344 | interactive=True, 345 | ) 346 | rms_mix_rate0 = gr.Slider( 347 | minimum=0, 348 | maximum=1, 349 | label="Volume Envelope", 350 | info="Use the volume envelope of the input to replace or mix with the volume envelope of the output. 
The closer the ratio is to 1, the more the output envelope is used", 351 | value=1, 352 | interactive=True, 353 | ) 354 | protect0 = gr.Slider( 355 | minimum=0, 356 | maximum=0.5, 357 | label="Voice Protection", 358 | info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy", 359 | value=0.5, 360 | step=0.01, 361 | interactive=True, 362 | ) 363 | with gr.Column(): 364 | vc_log = gr.Textbox(label="Output Information", interactive=False) 365 | vc_output = gr.Audio(label="Output Audio", interactive=False) 366 | vc_convert = gr.Button("Convert", variant="primary") 367 | vc_vocal_volume = gr.Slider( 368 | minimum=0, 369 | maximum=10, 370 | label="Vocal volume", 371 | value=1, 372 | interactive=True, 373 | step=1, 374 | info="Adjust vocal volume (Default: 1}", 375 | visible=False 376 | ) 377 | vc_inst_volume = gr.Slider( 378 | minimum=0, 379 | maximum=10, 380 | label="Instrument volume", 381 | value=1, 382 | interactive=True, 383 | step=1, 384 | info="Adjust instrument volume (Default: 1}", 385 | visible=False 386 | ) 387 | vc_combined_output = gr.Audio(label="Output Combined Audio", visible=False) 388 | vc_combine = gr.Button("Combine",variant="primary", visible=False) 389 | else: 390 | with gr.Column(): 391 | vc_audio_mode = gr.Dropdown(label="Input voice", choices=audio_mode, allow_custom_value=False, value="Upload audio") 392 | # Input 393 | vc_input = gr.Textbox(label="Input audio path", visible=False) 394 | # Upload 395 | vc_upload = gr.Audio(label="Upload audio file", sources=["upload", "microphone"], visible=True, interactive=True) 396 | # Youtube 397 | vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)") 398 | vc_link = gr.Textbox(label="Youtube URL", visible=False, info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A", placeholder="https://www.youtube.com/watch?v=...") 399 | vc_log_yt = gr.Textbox(label="Output Information", visible=False, interactive=False) 400 | vc_download_button = gr.Button("Download Audio", variant="primary", visible=False) 401 | vc_audio_preview = gr.Audio(label="Audio Preview", visible=False) 402 | # Splitter 403 | vc_split_model = gr.Dropdown(label="Splitter Model", choices=["hdemucs_mmi", "htdemucs", "htdemucs_ft", "mdx", "mdx_q", "mdx_extra_q"], allow_custom_value=False, visible=False, value="htdemucs", info="Select the splitter model (Default: htdemucs)") 404 | vc_split_log = gr.Textbox(label="Output Information", visible=False, interactive=False) 405 | vc_split = gr.Button("Split Audio", variant="primary", visible=False) 406 | vc_vocal_preview = gr.Audio(label="Vocal Preview", visible=False) 407 | vc_inst_preview = gr.Audio(label="Instrumental Preview", visible=False) 408 | # TTS 409 | tts_text = gr.Textbox(label="TTS text", info="Text to speech input", visible=False) 410 | tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female") 411 | with gr.Column(): 412 | vc_transform0 = gr.Number(label="Transpose", value=0, info='Type "12" to change from male to female voice. 
Type "-12" to change female to male voice') 413 | f0method0 = gr.Radio( 414 | label="Pitch extraction algorithm", 415 | info=f0method_info, 416 | choices=f0method_mode, 417 | value="pm", 418 | interactive=True 419 | ) 420 | index_rate1 = gr.Slider( 421 | minimum=0, 422 | maximum=1, 423 | label="Retrieval feature ratio", 424 | info="(Default: 0.7)", 425 | value=0.7, 426 | interactive=True, 427 | ) 428 | filter_radius0 = gr.Slider( 429 | minimum=0, 430 | maximum=7, 431 | label="Apply Median Filtering", 432 | info="The value represents the filter radius and can reduce breathiness.", 433 | value=3, 434 | step=1, 435 | interactive=True, 436 | ) 437 | resample_sr0 = gr.Slider( 438 | minimum=0, 439 | maximum=48000, 440 | label="Resample the output audio", 441 | info="Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling", 442 | value=0, 443 | step=1, 444 | interactive=True, 445 | ) 446 | rms_mix_rate0 = gr.Slider( 447 | minimum=0, 448 | maximum=1, 449 | label="Volume Envelope", 450 | info="Use the volume envelope of the input to replace or mix with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is used", 451 | value=1, 452 | interactive=True, 453 | ) 454 | protect0 = gr.Slider( 455 | minimum=0, 456 | maximum=0.5, 457 | label="Voice Protection", 458 | info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy", 459 | value=0.5, 460 | step=0.01, 461 | interactive=True, 462 | ) 463 | with gr.Column(): 464 | vc_log = gr.Textbox(label="Output Information", interactive=False) 465 | vc_output = gr.Audio(label="Output Audio", interactive=False) 466 | vc_convert = gr.Button("Convert", variant="primary") 467 | vc_vocal_volume = gr.Slider( 468 | minimum=0, 469 | maximum=10, 470 | label="Vocal volume", 471 | value=1, 472 | interactive=True, 473 | step=1, 474 | info="Adjust vocal volume (Default: 1}", 475 | visible=False 476 | ) 477 | vc_inst_volume = gr.Slider( 478 | minimum=0, 479 | maximum=10, 480 | label="Instrument volume", 481 | value=1, 482 | interactive=True, 483 | step=1, 484 | info="Adjust instrument volume (Default: 1}", 485 | visible=False 486 | ) 487 | vc_combined_output = gr.Audio(label="Output Combined Audio", visible=False) 488 | vc_combine = gr.Button("Combine",variant="primary", visible=False) 489 | vc_convert.click( 490 | fn=vc_fn, 491 | inputs=[ 492 | vc_audio_mode, 493 | vc_input, 494 | vc_upload, 495 | tts_text, 496 | tts_voice, 497 | vc_transform0, 498 | f0method0, 499 | index_rate1, 500 | filter_radius0, 501 | resample_sr0, 502 | rms_mix_rate0, 503 | protect0, 504 | ], 505 | outputs=[vc_log ,vc_output] 506 | ) 507 | vc_download_button.click( 508 | fn=download_audio, 509 | inputs=[vc_link, vc_download_audio], 510 | outputs=[vc_audio_preview, vc_log_yt] 511 | ) 512 | vc_split.click( 513 | fn=cut_vocal_and_inst, 514 | inputs=[vc_split_model], 515 | outputs=[vc_split_log, vc_vocal_preview, vc_inst_preview, vc_input] 516 | ) 517 | vc_combine.click( 518 | fn=combine_vocal_and_inst, 519 | inputs=[vc_output, vc_vocal_volume, vc_inst_volume, vc_split_model], 520 | outputs=[vc_combined_output] 521 | ) 522 | vc_audio_mode.change( 523 | fn=change_audio_mode, 524 | inputs=[vc_audio_mode], 525 | outputs=[ 526 | vc_input, 527 | vc_upload, 528 | vc_download_audio, 529 | vc_link, 530 | vc_log_yt, 531 | vc_download_button, 532 | vc_split_model, 533 | 
vc_split_log, 534 | vc_split, 535 | vc_audio_preview, 536 | vc_vocal_preview, 537 | vc_inst_preview, 538 | vc_vocal_volume, 539 | vc_inst_volume, 540 | vc_combined_output, 541 | vc_combine, 542 | tts_text, 543 | tts_voice 544 | ] 545 | ) 546 | app.queue( 547 | max_size=20, 548 | api_open=config.api, 549 | ).launch( 550 | share=config.share, 551 | max_threads=1, 552 | allowed_paths=["weights"] 553 | ) -------------------------------------------------------------------------------- /assets/hubert/req-hubert.txt: -------------------------------------------------------------------------------- 1 | put hubert_base.pt here -------------------------------------------------------------------------------- /assets/rvmpe/req-rvmpe.txt: -------------------------------------------------------------------------------- 1 | this is optional for pitch extraction algorithm 2 | put rvmpe.pt here -------------------------------------------------------------------------------- /docs/COMMAND_LINE_ARGUMENTS.md: -------------------------------------------------------------------------------- 1 | ## List of Command Line Argument 2 | 3 | | Option | Description | 4 | |-----------------|--------------------------------------| 5 | | `--share` | Launch with public link | 6 | | `--api` | Launch with api | 7 | | `--unsupported` | Force unsupported feature due to device being unsupported| 8 | -------------------------------------------------------------------------------- /docs/HOW_TO_USE.md: -------------------------------------------------------------------------------- 1 | ## How to use? 2 | 3 | ### Simple 4 | 5 | 1. Delete the all file and folder inside the weights folder 6 | 2. Put all your model inside weights 7 | 8 | ``` 9 | rvc-inference/ 10 | ├─ weights/ 11 | │ ├─ [your model folder #1]/ (Put your pth and index file here) 12 | │ ├─ [your model folder #2]/ (Put your pth and index file here) 13 | │ ├─ ...other model 14 | ├─ ...other stuff 15 | ``` 16 | 4. Done 17 | 18 | Note: Custom character/model name, image file and author may not be available for this. 19 | 20 | ### Advanced Only 21 | 22 | This method is only for advanced user only. 23 | 1. Create folder_info.json inside weights and 24 | create a category folder that contains list of character model. 25 | 26 | ``` 27 | rvc-inference/ 28 | ├─ weights/ 29 | │ ├─ genshin impact/ 30 | │ ├─ folder_info.json 31 | ├─ ...other stuff 32 | ``` 33 | 34 | 2. Inside folder_info.json 35 | ```json 36 | "CATEGORY_TAB_NAME":{ 37 | "enable": true, 38 | "title": "CATEGORY_TITLE", 39 | "folder_path": "CATEGORY_FOLDER_PATH", 40 | "description": "CATEGORY_DESCRIPTION" 41 | } 42 | ``` 43 | 44 | folder_info.json info: 45 | - CATEGORY_TAB_NAME = an category tab name (this one is just a name without spaces, but it wont affect the ui category title) [Required] 46 | - enable = Enabled/Disabled cat [Required] 47 | - title = Title of the category (this one affect the ui category title) [Required] 48 | - folder_path = folder path to the category folder (ex. Genshin Impact) [Required] 49 | - description = Description below the selected tab [Optional] 50 | 51 | 3. Create model_info.json inside the category folder 52 | 53 | ``` 54 | rvc-inference/ 55 | ├─ weights/ 56 | │ ├─ genshin impact/ 57 | │ │ ├─ model_info.json 58 | │ ├─ folder_info.json 59 | ├─ ...other stuff 60 | ``` 61 | 4. 
Inside model_info.json (If you have more than one model just duplicate and change the value) 62 | ```json 63 | "FOLDER_PATH": { 64 | "enable": true, 65 | "model_path": "CHARACTER_BASEMODEL", 66 | "title": "CHARACTER_NAME", 67 | "cover": "CHARACTER_IMAGE", 68 | "feature_retrieval_library": "CHARACTER_MODEL_INDEX", 69 | "author": "MODEL_AUTHOR" 70 | } 71 | ``` 72 | model_info.json info: 73 | - FOLDER_PATH = folder path to the model [Required] 74 | - enable = Enabled/Disabled model [Required] 75 | - model_path = path to model file (ex. "venti.pth") [Required + must shown the extension] 76 | - title = Title of the character/model (this one affect the ui category title) [Required + must shown the extension] 77 | - cover = folder path to the image file (ex. "image.png") [Optional + must shown the extension] 78 | - feature_retrieval_library = path to index file (ex. "added_IVF4198_Flat_nprobe_1_zhongli-jp_v2.index") [Required] 79 | - author = Author of the model [Optional] 80 | 81 | 1. Put your desired model to your category folder 82 | Example: 83 | ``` 84 | rvc-inference/ 85 | ├─ weights/ 86 | │ ├─ genshin impact/ 87 | │ │ ├─ [your model folder #1]/ (Put your pth and index file here) 88 | │ │ ├─ [your model folder #2]/ (Put your pth and index file here) 89 | │ │ ├─ ...other model 90 | │ │ ├─ model_info.json 91 | │ ├─ folder_info.json 92 | ├─ ...other stuff 93 | ``` 94 | 1. Done. 95 | 96 | Note: 97 | - To add image to the ui just put your image into the model folder and setting up the image path in the folder info.json. 98 | 99 | More detail stuff: 100 | ![ui_example.jpg](./img/ui_example.jpg) -------------------------------------------------------------------------------- /docs/img/ui_example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Multi-Model-RVC-Inference/661936e4dce121c8ad84113f7637308d1642c887/docs/img/ui_example.jpg -------------------------------------------------------------------------------- /lib/config/config.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import torch 4 | from multiprocessing import cpu_count 5 | 6 | class Config: 7 | def __init__(self): 8 | self.device = "cuda:0" 9 | self.is_half = True 10 | self.n_cpu = 0 11 | self.gpu_name = None 12 | self.gpu_mem = None 13 | ( 14 | self.share, 15 | self.api, 16 | self.unsupported, 17 | ) = self.arg_parse() 18 | self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() 19 | 20 | @staticmethod 21 | def arg_parse() -> tuple: 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument("--share", action="store_true", help="Launch with public link") 24 | parser.add_argument("--api", action="store_true", help="Launch with api") 25 | parser.add_argument("--unsupported", action="store_true", help="Enable unsupported feature") 26 | cmd_opts = parser.parse_args() 27 | 28 | return ( 29 | cmd_opts.share, 30 | cmd_opts.api, 31 | cmd_opts.unsupported, 32 | ) 33 | 34 | # has_mps is only available in nightly pytorch (for now) and MasOS 12.3+. 
35 | # check `getattr` and try it for compatibility 36 | @staticmethod 37 | def has_mps() -> bool: 38 | if not torch.backends.mps.is_available(): 39 | return False 40 | try: 41 | torch.zeros(1).to(torch.device("mps")) 42 | return True 43 | except Exception: 44 | return False 45 | 46 | def device_config(self) -> tuple: 47 | if torch.cuda.is_available(): 48 | i_device = int(self.device.split(":")[-1]) 49 | self.gpu_name = torch.cuda.get_device_name(i_device) 50 | if ( 51 | ("16" in self.gpu_name and "V100" not in self.gpu_name.upper()) 52 | or "P40" in self.gpu_name.upper() 53 | or "1060" in self.gpu_name 54 | or "1070" in self.gpu_name 55 | or "1080" in self.gpu_name 56 | ): 57 | print("INFO: Found GPU", self.gpu_name, ", force to fp32") 58 | self.is_half = False 59 | else: 60 | print("INFO: Found GPU", self.gpu_name) 61 | self.gpu_mem = int( 62 | torch.cuda.get_device_properties(i_device).total_memory 63 | / 1024 64 | / 1024 65 | / 1024 66 | + 0.4 67 | ) 68 | elif self.has_mps(): 69 | print("INFO: No supported Nvidia GPU found, use MPS instead") 70 | self.device = "mps" 71 | self.is_half = False 72 | else: 73 | print("INFO: No supported Nvidia GPU found, use CPU instead") 74 | self.device = "cpu" 75 | self.is_half = False 76 | 77 | if self.n_cpu == 0: 78 | self.n_cpu = cpu_count() 79 | 80 | if self.is_half: 81 | # 6G显存配置 82 | x_pad = 3 83 | x_query = 10 84 | x_center = 60 85 | x_max = 65 86 | else: 87 | # 5G显存配置 88 | x_pad = 1 89 | x_query = 6 90 | x_center = 38 91 | x_max = 41 92 | 93 | if self.gpu_mem != None and self.gpu_mem <= 4: 94 | x_pad = 1 95 | x_query = 5 96 | x_center = 30 97 | x_max = 32 98 | 99 | return x_pad, x_query, x_center, x_max 100 | -------------------------------------------------------------------------------- /lib/infer_pack/attentions.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | from lib.infer_pack import commons 9 | from lib.infer_pack import modules 10 | from lib.infer_pack.modules import LayerNorm 11 | 12 | 13 | class Encoder(nn.Module): 14 | def __init__( 15 | self, 16 | hidden_channels, 17 | filter_channels, 18 | n_heads, 19 | n_layers, 20 | kernel_size=1, 21 | p_dropout=0.0, 22 | window_size=10, 23 | **kwargs 24 | ): 25 | super().__init__() 26 | self.hidden_channels = hidden_channels 27 | self.filter_channels = filter_channels 28 | self.n_heads = n_heads 29 | self.n_layers = n_layers 30 | self.kernel_size = kernel_size 31 | self.p_dropout = p_dropout 32 | self.window_size = window_size 33 | 34 | self.drop = nn.Dropout(p_dropout) 35 | self.attn_layers = nn.ModuleList() 36 | self.norm_layers_1 = nn.ModuleList() 37 | self.ffn_layers = nn.ModuleList() 38 | self.norm_layers_2 = nn.ModuleList() 39 | for i in range(self.n_layers): 40 | self.attn_layers.append( 41 | MultiHeadAttention( 42 | hidden_channels, 43 | hidden_channels, 44 | n_heads, 45 | p_dropout=p_dropout, 46 | window_size=window_size, 47 | ) 48 | ) 49 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 50 | self.ffn_layers.append( 51 | FFN( 52 | hidden_channels, 53 | hidden_channels, 54 | filter_channels, 55 | kernel_size, 56 | p_dropout=p_dropout, 57 | ) 58 | ) 59 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 60 | 61 | def forward(self, x, x_mask): 62 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 63 | x = x * x_mask 64 | for i in range(self.n_layers): 65 | y = self.attn_layers[i](x, x, attn_mask) 
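            # Post-norm Transformer block: the attention output y is dropped out, added back
            # to x as a residual connection, and layer-normalised; the FFN sub-layer below
            # repeats the same dropout + residual + LayerNorm pattern.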
66 | y = self.drop(y) 67 | x = self.norm_layers_1[i](x + y) 68 | 69 | y = self.ffn_layers[i](x, x_mask) 70 | y = self.drop(y) 71 | x = self.norm_layers_2[i](x + y) 72 | x = x * x_mask 73 | return x 74 | 75 | 76 | class Decoder(nn.Module): 77 | def __init__( 78 | self, 79 | hidden_channels, 80 | filter_channels, 81 | n_heads, 82 | n_layers, 83 | kernel_size=1, 84 | p_dropout=0.0, 85 | proximal_bias=False, 86 | proximal_init=True, 87 | **kwargs 88 | ): 89 | super().__init__() 90 | self.hidden_channels = hidden_channels 91 | self.filter_channels = filter_channels 92 | self.n_heads = n_heads 93 | self.n_layers = n_layers 94 | self.kernel_size = kernel_size 95 | self.p_dropout = p_dropout 96 | self.proximal_bias = proximal_bias 97 | self.proximal_init = proximal_init 98 | 99 | self.drop = nn.Dropout(p_dropout) 100 | self.self_attn_layers = nn.ModuleList() 101 | self.norm_layers_0 = nn.ModuleList() 102 | self.encdec_attn_layers = nn.ModuleList() 103 | self.norm_layers_1 = nn.ModuleList() 104 | self.ffn_layers = nn.ModuleList() 105 | self.norm_layers_2 = nn.ModuleList() 106 | for i in range(self.n_layers): 107 | self.self_attn_layers.append( 108 | MultiHeadAttention( 109 | hidden_channels, 110 | hidden_channels, 111 | n_heads, 112 | p_dropout=p_dropout, 113 | proximal_bias=proximal_bias, 114 | proximal_init=proximal_init, 115 | ) 116 | ) 117 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 118 | self.encdec_attn_layers.append( 119 | MultiHeadAttention( 120 | hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout 121 | ) 122 | ) 123 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 124 | self.ffn_layers.append( 125 | FFN( 126 | hidden_channels, 127 | hidden_channels, 128 | filter_channels, 129 | kernel_size, 130 | p_dropout=p_dropout, 131 | causal=True, 132 | ) 133 | ) 134 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 135 | 136 | def forward(self, x, x_mask, h, h_mask): 137 | """ 138 | x: decoder input 139 | h: encoder output 140 | """ 141 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( 142 | device=x.device, dtype=x.dtype 143 | ) 144 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 145 | x = x * x_mask 146 | for i in range(self.n_layers): 147 | y = self.self_attn_layers[i](x, x, self_attn_mask) 148 | y = self.drop(y) 149 | x = self.norm_layers_0[i](x + y) 150 | 151 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) 152 | y = self.drop(y) 153 | x = self.norm_layers_1[i](x + y) 154 | 155 | y = self.ffn_layers[i](x, x_mask) 156 | y = self.drop(y) 157 | x = self.norm_layers_2[i](x + y) 158 | x = x * x_mask 159 | return x 160 | 161 | 162 | class MultiHeadAttention(nn.Module): 163 | def __init__( 164 | self, 165 | channels, 166 | out_channels, 167 | n_heads, 168 | p_dropout=0.0, 169 | window_size=None, 170 | heads_share=True, 171 | block_length=None, 172 | proximal_bias=False, 173 | proximal_init=False, 174 | ): 175 | super().__init__() 176 | assert channels % n_heads == 0 177 | 178 | self.channels = channels 179 | self.out_channels = out_channels 180 | self.n_heads = n_heads 181 | self.p_dropout = p_dropout 182 | self.window_size = window_size 183 | self.heads_share = heads_share 184 | self.block_length = block_length 185 | self.proximal_bias = proximal_bias 186 | self.proximal_init = proximal_init 187 | self.attn = None 188 | 189 | self.k_channels = channels // n_heads 190 | self.conv_q = nn.Conv1d(channels, channels, 1) 191 | self.conv_k = nn.Conv1d(channels, channels, 1) 192 | self.conv_v = nn.Conv1d(channels, channels, 1) 193 | 
self.conv_o = nn.Conv1d(channels, out_channels, 1) 194 | self.drop = nn.Dropout(p_dropout) 195 | 196 | if window_size is not None: 197 | n_heads_rel = 1 if heads_share else n_heads 198 | rel_stddev = self.k_channels**-0.5 199 | self.emb_rel_k = nn.Parameter( 200 | torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) 201 | * rel_stddev 202 | ) 203 | self.emb_rel_v = nn.Parameter( 204 | torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) 205 | * rel_stddev 206 | ) 207 | 208 | nn.init.xavier_uniform_(self.conv_q.weight) 209 | nn.init.xavier_uniform_(self.conv_k.weight) 210 | nn.init.xavier_uniform_(self.conv_v.weight) 211 | if proximal_init: 212 | with torch.no_grad(): 213 | self.conv_k.weight.copy_(self.conv_q.weight) 214 | self.conv_k.bias.copy_(self.conv_q.bias) 215 | 216 | def forward(self, x, c, attn_mask=None): 217 | q = self.conv_q(x) 218 | k = self.conv_k(c) 219 | v = self.conv_v(c) 220 | 221 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 222 | 223 | x = self.conv_o(x) 224 | return x 225 | 226 | def attention(self, query, key, value, mask=None): 227 | # reshape [b, d, t] -> [b, n_h, t, d_k] 228 | b, d, t_s, t_t = (*key.size(), query.size(2)) 229 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 230 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 231 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 232 | 233 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) 234 | if self.window_size is not None: 235 | assert ( 236 | t_s == t_t 237 | ), "Relative attention is only available for self-attention." 238 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 239 | rel_logits = self._matmul_with_relative_keys( 240 | query / math.sqrt(self.k_channels), key_relative_embeddings 241 | ) 242 | scores_local = self._relative_position_to_absolute_position(rel_logits) 243 | scores = scores + scores_local 244 | if self.proximal_bias: 245 | assert t_s == t_t, "Proximal bias is only available for self-attention." 246 | scores = scores + self._attention_bias_proximal(t_s).to( 247 | device=scores.device, dtype=scores.dtype 248 | ) 249 | if mask is not None: 250 | scores = scores.masked_fill(mask == 0, -1e4) 251 | if self.block_length is not None: 252 | assert ( 253 | t_s == t_t 254 | ), "Local attention is only available for self-attention." 
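                # Block-local attention: build a banded mask that keeps only keys within
                # block_length positions of each query; scores outside the band are set to
                # -1e4 so they contribute almost nothing after the softmax.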
255 | block_mask = ( 256 | torch.ones_like(scores) 257 | .triu(-self.block_length) 258 | .tril(self.block_length) 259 | ) 260 | scores = scores.masked_fill(block_mask == 0, -1e4) 261 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 262 | p_attn = self.drop(p_attn) 263 | output = torch.matmul(p_attn, value) 264 | if self.window_size is not None: 265 | relative_weights = self._absolute_position_to_relative_position(p_attn) 266 | value_relative_embeddings = self._get_relative_embeddings( 267 | self.emb_rel_v, t_s 268 | ) 269 | output = output + self._matmul_with_relative_values( 270 | relative_weights, value_relative_embeddings 271 | ) 272 | output = ( 273 | output.transpose(2, 3).contiguous().view(b, d, t_t) 274 | ) # [b, n_h, t_t, d_k] -> [b, d, t_t] 275 | return output, p_attn 276 | 277 | def _matmul_with_relative_values(self, x, y): 278 | """ 279 | x: [b, h, l, m] 280 | y: [h or 1, m, d] 281 | ret: [b, h, l, d] 282 | """ 283 | ret = torch.matmul(x, y.unsqueeze(0)) 284 | return ret 285 | 286 | def _matmul_with_relative_keys(self, x, y): 287 | """ 288 | x: [b, h, l, d] 289 | y: [h or 1, m, d] 290 | ret: [b, h, l, m] 291 | """ 292 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 293 | return ret 294 | 295 | def _get_relative_embeddings(self, relative_embeddings, length): 296 | max_relative_position = 2 * self.window_size + 1 297 | # Pad first before slice to avoid using cond ops. 298 | pad_length = max(length - (self.window_size + 1), 0) 299 | slice_start_position = max((self.window_size + 1) - length, 0) 300 | slice_end_position = slice_start_position + 2 * length - 1 301 | if pad_length > 0: 302 | padded_relative_embeddings = F.pad( 303 | relative_embeddings, 304 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), 305 | ) 306 | else: 307 | padded_relative_embeddings = relative_embeddings 308 | used_relative_embeddings = padded_relative_embeddings[ 309 | :, slice_start_position:slice_end_position 310 | ] 311 | return used_relative_embeddings 312 | 313 | def _relative_position_to_absolute_position(self, x): 314 | """ 315 | x: [b, h, l, 2*l-1] 316 | ret: [b, h, l, l] 317 | """ 318 | batch, heads, length, _ = x.size() 319 | # Concat columns of pad to shift from relative to absolute indexing. 320 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) 321 | 322 | # Concat extra elements so to add up to shape (len+1, 2*len-1). 323 | x_flat = x.view([batch, heads, length * 2 * length]) 324 | x_flat = F.pad( 325 | x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) 326 | ) 327 | 328 | # Reshape and slice out the padded elements. 329 | x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ 330 | :, :, :length, length - 1 : 331 | ] 332 | return x_final 333 | 334 | def _absolute_position_to_relative_position(self, x): 335 | """ 336 | x: [b, h, l, l] 337 | ret: [b, h, l, 2*l-1] 338 | """ 339 | batch, heads, length, _ = x.size() 340 | # padd along column 341 | x = F.pad( 342 | x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) 343 | ) 344 | x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) 345 | # add 0's in the beginning that will skew the elements after reshape 346 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 347 | x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] 348 | return x_final 349 | 350 | def _attention_bias_proximal(self, length): 351 | """Bias for self-attention to encourage attention to close positions. 
352 | Args: 353 | length: an integer scalar. 354 | Returns: 355 | a Tensor with shape [1, 1, length, length] 356 | """ 357 | r = torch.arange(length, dtype=torch.float32) 358 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 359 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 360 | 361 | 362 | class FFN(nn.Module): 363 | def __init__( 364 | self, 365 | in_channels, 366 | out_channels, 367 | filter_channels, 368 | kernel_size, 369 | p_dropout=0.0, 370 | activation=None, 371 | causal=False, 372 | ): 373 | super().__init__() 374 | self.in_channels = in_channels 375 | self.out_channels = out_channels 376 | self.filter_channels = filter_channels 377 | self.kernel_size = kernel_size 378 | self.p_dropout = p_dropout 379 | self.activation = activation 380 | self.causal = causal 381 | 382 | if causal: 383 | self.padding = self._causal_padding 384 | else: 385 | self.padding = self._same_padding 386 | 387 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) 388 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) 389 | self.drop = nn.Dropout(p_dropout) 390 | 391 | def forward(self, x, x_mask): 392 | x = self.conv_1(self.padding(x * x_mask)) 393 | if self.activation == "gelu": 394 | x = x * torch.sigmoid(1.702 * x) 395 | else: 396 | x = torch.relu(x) 397 | x = self.drop(x) 398 | x = self.conv_2(self.padding(x * x_mask)) 399 | return x * x_mask 400 | 401 | def _causal_padding(self, x): 402 | if self.kernel_size == 1: 403 | return x 404 | pad_l = self.kernel_size - 1 405 | pad_r = 0 406 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 407 | x = F.pad(x, commons.convert_pad_shape(padding)) 408 | return x 409 | 410 | def _same_padding(self, x): 411 | if self.kernel_size == 1: 412 | return x 413 | pad_l = (self.kernel_size - 1) // 2 414 | pad_r = self.kernel_size // 2 415 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 416 | x = F.pad(x, commons.convert_pad_shape(padding)) 417 | return x 418 | -------------------------------------------------------------------------------- /lib/infer_pack/commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | 8 | def init_weights(m, mean=0.0, std=0.01): 9 | classname = m.__class__.__name__ 10 | if classname.find("Conv") != -1: 11 | m.weight.data.normal_(mean, std) 12 | 13 | 14 | def get_padding(kernel_size, dilation=1): 15 | return int((kernel_size * dilation - dilation) / 2) 16 | 17 | 18 | def convert_pad_shape(pad_shape): 19 | l = pad_shape[::-1] 20 | pad_shape = [item for sublist in l for item in sublist] 21 | return pad_shape 22 | 23 | 24 | def kl_divergence(m_p, logs_p, m_q, logs_q): 25 | """KL(P||Q)""" 26 | kl = (logs_q - logs_p) - 0.5 27 | kl += ( 28 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) 29 | ) 30 | return kl 31 | 32 | 33 | def rand_gumbel(shape): 34 | """Sample from the Gumbel distribution, protect from overflows.""" 35 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 36 | return -torch.log(-torch.log(uniform_samples)) 37 | 38 | 39 | def rand_gumbel_like(x): 40 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 41 | return g 42 | 43 | 44 | def slice_segments(x, ids_str, segment_size=4): 45 | ret = torch.zeros_like(x[:, :, :segment_size]) 46 | for i in range(x.size(0)): 47 | idx_str = ids_str[i] 48 | idx_end = idx_str + segment_size 49 | ret[i] = x[i, :, idx_str:idx_end] 50 | 
return ret 51 | 52 | 53 | def slice_segments2(x, ids_str, segment_size=4): 54 | ret = torch.zeros_like(x[:, :segment_size]) 55 | for i in range(x.size(0)): 56 | idx_str = ids_str[i] 57 | idx_end = idx_str + segment_size 58 | ret[i] = x[i, idx_str:idx_end] 59 | return ret 60 | 61 | 62 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 63 | b, d, t = x.size() 64 | if x_lengths is None: 65 | x_lengths = t 66 | ids_str_max = x_lengths - segment_size + 1 67 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 68 | ret = slice_segments(x, ids_str, segment_size) 69 | return ret, ids_str 70 | 71 | 72 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): 73 | position = torch.arange(length, dtype=torch.float) 74 | num_timescales = channels // 2 75 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( 76 | num_timescales - 1 77 | ) 78 | inv_timescales = min_timescale * torch.exp( 79 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment 80 | ) 81 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 82 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) 83 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 84 | signal = signal.view(1, channels, length) 85 | return signal 86 | 87 | 88 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 89 | b, channels, length = x.size() 90 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 91 | return x + signal.to(dtype=x.dtype, device=x.device) 92 | 93 | 94 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 95 | b, channels, length = x.size() 96 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 97 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 98 | 99 | 100 | def subsequent_mask(length): 101 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 102 | return mask 103 | 104 | 105 | @torch.jit.script 106 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 107 | n_channels_int = n_channels[0] 108 | in_act = input_a + input_b 109 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 110 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 111 | acts = t_act * s_act 112 | return acts 113 | 114 | 115 | def convert_pad_shape(pad_shape): 116 | l = pad_shape[::-1] 117 | pad_shape = [item for sublist in l for item in sublist] 118 | return pad_shape 119 | 120 | 121 | def shift_1d(x): 122 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 123 | return x 124 | 125 | 126 | def sequence_mask(length, max_length=None): 127 | if max_length is None: 128 | max_length = length.max() 129 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 130 | return x.unsqueeze(0) < length.unsqueeze(1) 131 | 132 | 133 | def generate_path(duration, mask): 134 | """ 135 | duration: [b, 1, t_x] 136 | mask: [b, 1, t_y, t_x] 137 | """ 138 | device = duration.device 139 | 140 | b, _, t_y, t_x = mask.shape 141 | cum_duration = torch.cumsum(duration, -1) 142 | 143 | cum_duration_flat = cum_duration.view(b * t_x) 144 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 145 | path = path.view(b, t_x, t_y) 146 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 147 | path = path.unsqueeze(1).transpose(2, 3) * mask 148 | return path 149 | 150 | 151 | def clip_grad_value_(parameters, clip_value, norm_type=2): 
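A small, hypothetical shape check (values invented for illustration) for two of the helpers above: sequence_mask builds a boolean frame mask from per-utterance lengths, and rand_slice_segments cuts a random fixed-size training window out of each item in a batch:

import torch
from lib.infer_pack import commons

lengths = torch.tensor([3, 5])
print(commons.sequence_mask(lengths))
# tensor([[ True,  True,  True, False, False],
#         [ True,  True,  True,  True,  True]])

x = torch.randn(2, 192, 100)  # [batch, channels, frames]
segments, ids_str = commons.rand_slice_segments(x, x_lengths=torch.tensor([100, 80]), segment_size=32)
print(segments.shape, ids_str)  # torch.Size([2, 192, 32]) plus the chosen start frames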
152 | if isinstance(parameters, torch.Tensor): 153 | parameters = [parameters] 154 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 155 | norm_type = float(norm_type) 156 | if clip_value is not None: 157 | clip_value = float(clip_value) 158 | 159 | total_norm = 0 160 | for p in parameters: 161 | param_norm = p.grad.data.norm(norm_type) 162 | total_norm += param_norm.item() ** norm_type 163 | if clip_value is not None: 164 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 165 | total_norm = total_norm ** (1.0 / norm_type) 166 | return total_norm 167 | -------------------------------------------------------------------------------- /lib/infer_pack/models_onnx.py: -------------------------------------------------------------------------------- 1 | import math, pdb, os 2 | from time import time as ttime 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | from lib.infer_pack import modules 7 | from lib.infer_pack import attentions 8 | from lib.infer_pack import commons 9 | from lib.infer_pack.commons import init_weights, get_padding 10 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 11 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 12 | from lib.infer_pack.commons import init_weights 13 | import numpy as np 14 | from lib.infer_pack import commons 15 | 16 | 17 | class TextEncoder256(nn.Module): 18 | def __init__( 19 | self, 20 | out_channels, 21 | hidden_channels, 22 | filter_channels, 23 | n_heads, 24 | n_layers, 25 | kernel_size, 26 | p_dropout, 27 | f0=True, 28 | ): 29 | super().__init__() 30 | self.out_channels = out_channels 31 | self.hidden_channels = hidden_channels 32 | self.filter_channels = filter_channels 33 | self.n_heads = n_heads 34 | self.n_layers = n_layers 35 | self.kernel_size = kernel_size 36 | self.p_dropout = p_dropout 37 | self.emb_phone = nn.Linear(256, hidden_channels) 38 | self.lrelu = nn.LeakyReLU(0.1, inplace=True) 39 | if f0 == True: 40 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 41 | self.encoder = attentions.Encoder( 42 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout 43 | ) 44 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 45 | 46 | def forward(self, phone, pitch, lengths): 47 | if pitch == None: 48 | x = self.emb_phone(phone) 49 | else: 50 | x = self.emb_phone(phone) + self.emb_pitch(pitch) 51 | x = x * math.sqrt(self.hidden_channels) # [b, t, h] 52 | x = self.lrelu(x) 53 | x = torch.transpose(x, 1, -1) # [b, h, t] 54 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( 55 | x.dtype 56 | ) 57 | x = self.encoder(x * x_mask, x_mask) 58 | stats = self.proj(x) * x_mask 59 | 60 | m, logs = torch.split(stats, self.out_channels, dim=1) 61 | return m, logs, x_mask 62 | 63 | 64 | class TextEncoder768(nn.Module): 65 | def __init__( 66 | self, 67 | out_channels, 68 | hidden_channels, 69 | filter_channels, 70 | n_heads, 71 | n_layers, 72 | kernel_size, 73 | p_dropout, 74 | f0=True, 75 | ): 76 | super().__init__() 77 | self.out_channels = out_channels 78 | self.hidden_channels = hidden_channels 79 | self.filter_channels = filter_channels 80 | self.n_heads = n_heads 81 | self.n_layers = n_layers 82 | self.kernel_size = kernel_size 83 | self.p_dropout = p_dropout 84 | self.emb_phone = nn.Linear(768, hidden_channels) 85 | self.lrelu = nn.LeakyReLU(0.1, inplace=True) 86 | if f0 == True: 87 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 88 | self.encoder = 
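A hedged shape walk-through for TextEncoder256 above; the hyper-parameters mirror a typical RVC v1 configuration but are assumptions, not values read from this repository. phone carries 256-dimensional content features, pitch carries coarse pitch ids in [0, 255], and the encoder returns the prior mean, log-variance and frame mask:

import torch
from lib.infer_pack.models_onnx import TextEncoder256

enc = TextEncoder256(out_channels=192, hidden_channels=192, filter_channels=768,
                     n_heads=2, n_layers=6, kernel_size=3, p_dropout=0.0)
phone = torch.randn(1, 120, 256)            # [batch, frames, feature_dim]
pitch = torch.randint(0, 256, (1, 120))     # coarse pitch ids
lengths = torch.tensor([120])
m, logs, x_mask = enc(phone, pitch, lengths)
print(m.shape, logs.shape, x_mask.shape)    # [1, 192, 120], [1, 192, 120], [1, 1, 120]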
attentions.Encoder( 89 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout 90 | ) 91 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 92 | 93 | def forward(self, phone, pitch, lengths): 94 | if pitch == None: 95 | x = self.emb_phone(phone) 96 | else: 97 | x = self.emb_phone(phone) + self.emb_pitch(pitch) 98 | x = x * math.sqrt(self.hidden_channels) # [b, t, h] 99 | x = self.lrelu(x) 100 | x = torch.transpose(x, 1, -1) # [b, h, t] 101 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( 102 | x.dtype 103 | ) 104 | x = self.encoder(x * x_mask, x_mask) 105 | stats = self.proj(x) * x_mask 106 | 107 | m, logs = torch.split(stats, self.out_channels, dim=1) 108 | return m, logs, x_mask 109 | 110 | 111 | class ResidualCouplingBlock(nn.Module): 112 | def __init__( 113 | self, 114 | channels, 115 | hidden_channels, 116 | kernel_size, 117 | dilation_rate, 118 | n_layers, 119 | n_flows=4, 120 | gin_channels=0, 121 | ): 122 | super().__init__() 123 | self.channels = channels 124 | self.hidden_channels = hidden_channels 125 | self.kernel_size = kernel_size 126 | self.dilation_rate = dilation_rate 127 | self.n_layers = n_layers 128 | self.n_flows = n_flows 129 | self.gin_channels = gin_channels 130 | 131 | self.flows = nn.ModuleList() 132 | for i in range(n_flows): 133 | self.flows.append( 134 | modules.ResidualCouplingLayer( 135 | channels, 136 | hidden_channels, 137 | kernel_size, 138 | dilation_rate, 139 | n_layers, 140 | gin_channels=gin_channels, 141 | mean_only=True, 142 | ) 143 | ) 144 | self.flows.append(modules.Flip()) 145 | 146 | def forward(self, x, x_mask, g=None, reverse=False): 147 | if not reverse: 148 | for flow in self.flows: 149 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 150 | else: 151 | for flow in reversed(self.flows): 152 | x = flow(x, x_mask, g=g, reverse=reverse) 153 | return x 154 | 155 | def remove_weight_norm(self): 156 | for i in range(self.n_flows): 157 | self.flows[i * 2].remove_weight_norm() 158 | 159 | 160 | class PosteriorEncoder(nn.Module): 161 | def __init__( 162 | self, 163 | in_channels, 164 | out_channels, 165 | hidden_channels, 166 | kernel_size, 167 | dilation_rate, 168 | n_layers, 169 | gin_channels=0, 170 | ): 171 | super().__init__() 172 | self.in_channels = in_channels 173 | self.out_channels = out_channels 174 | self.hidden_channels = hidden_channels 175 | self.kernel_size = kernel_size 176 | self.dilation_rate = dilation_rate 177 | self.n_layers = n_layers 178 | self.gin_channels = gin_channels 179 | 180 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 181 | self.enc = modules.WN( 182 | hidden_channels, 183 | kernel_size, 184 | dilation_rate, 185 | n_layers, 186 | gin_channels=gin_channels, 187 | ) 188 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 189 | 190 | def forward(self, x, x_lengths, g=None): 191 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( 192 | x.dtype 193 | ) 194 | x = self.pre(x) * x_mask 195 | x = self.enc(x, x_mask, g=g) 196 | stats = self.proj(x) * x_mask 197 | m, logs = torch.split(stats, self.out_channels, dim=1) 198 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 199 | return z, m, logs, x_mask 200 | 201 | def remove_weight_norm(self): 202 | self.enc.remove_weight_norm() 203 | 204 | 205 | class Generator(torch.nn.Module): 206 | def __init__( 207 | self, 208 | initial_channel, 209 | resblock, 210 | resblock_kernel_sizes, 211 | resblock_dilation_sizes, 212 | upsample_rates, 213 | upsample_initial_channel, 214 | 
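A rough sanity-check sketch (shapes assumed, not repository code) for the flow defined above: because every coupling layer is invertible, running ResidualCouplingBlock forward and then with reverse=True reproduces its input up to numerical error, which is what the synthesizer relies on when it maps the prior sample back through the flow at inference time:

import torch
from lib.infer_pack.models_onnx import ResidualCouplingBlock

flow = ResidualCouplingBlock(channels=192, hidden_channels=192, kernel_size=5,
                             dilation_rate=1, n_layers=3, gin_channels=256)
z = torch.randn(1, 192, 40)
x_mask = torch.ones(1, 1, 40)
g = torch.randn(1, 256, 1)          # speaker conditioning vector
z_p = flow(z, x_mask, g=g)          # forward through all coupling layers and flips
z_rec = flow(z_p, x_mask, g=g, reverse=True)
print(torch.allclose(z, z_rec, atol=1e-4))  # True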
upsample_kernel_sizes, 215 | gin_channels=0, 216 | ): 217 | super(Generator, self).__init__() 218 | self.num_kernels = len(resblock_kernel_sizes) 219 | self.num_upsamples = len(upsample_rates) 220 | self.conv_pre = Conv1d( 221 | initial_channel, upsample_initial_channel, 7, 1, padding=3 222 | ) 223 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 224 | 225 | self.ups = nn.ModuleList() 226 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 227 | self.ups.append( 228 | weight_norm( 229 | ConvTranspose1d( 230 | upsample_initial_channel // (2**i), 231 | upsample_initial_channel // (2 ** (i + 1)), 232 | k, 233 | u, 234 | padding=(k - u) // 2, 235 | ) 236 | ) 237 | ) 238 | 239 | self.resblocks = nn.ModuleList() 240 | for i in range(len(self.ups)): 241 | ch = upsample_initial_channel // (2 ** (i + 1)) 242 | for j, (k, d) in enumerate( 243 | zip(resblock_kernel_sizes, resblock_dilation_sizes) 244 | ): 245 | self.resblocks.append(resblock(ch, k, d)) 246 | 247 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 248 | self.ups.apply(init_weights) 249 | 250 | if gin_channels != 0: 251 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 252 | 253 | def forward(self, x, g=None): 254 | x = self.conv_pre(x) 255 | if g is not None: 256 | x = x + self.cond(g) 257 | 258 | for i in range(self.num_upsamples): 259 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 260 | x = self.ups[i](x) 261 | xs = None 262 | for j in range(self.num_kernels): 263 | if xs is None: 264 | xs = self.resblocks[i * self.num_kernels + j](x) 265 | else: 266 | xs += self.resblocks[i * self.num_kernels + j](x) 267 | x = xs / self.num_kernels 268 | x = F.leaky_relu(x) 269 | x = self.conv_post(x) 270 | x = torch.tanh(x) 271 | 272 | return x 273 | 274 | def remove_weight_norm(self): 275 | for l in self.ups: 276 | remove_weight_norm(l) 277 | for l in self.resblocks: 278 | l.remove_weight_norm() 279 | 280 | 281 | class SineGen(torch.nn.Module): 282 | """Definition of sine generator 283 | SineGen(samp_rate, harmonic_num = 0, 284 | sine_amp = 0.1, noise_std = 0.003, 285 | voiced_threshold = 0, 286 | flag_for_pulse=False) 287 | samp_rate: sampling rate in Hz 288 | harmonic_num: number of harmonic overtones (default 0) 289 | sine_amp: amplitude of sine-wavefrom (default 0.1) 290 | noise_std: std of Gaussian noise (default 0.003) 291 | voiced_thoreshold: F0 threshold for U/V classification (default 0) 292 | flag_for_pulse: this SinGen is used inside PulseGen (default False) 293 | Note: when flag_for_pulse is True, the first time step of a voiced 294 | segment is always sin(np.pi) or cos(0) 295 | """ 296 | 297 | def __init__( 298 | self, 299 | samp_rate, 300 | harmonic_num=0, 301 | sine_amp=0.1, 302 | noise_std=0.003, 303 | voiced_threshold=0, 304 | flag_for_pulse=False, 305 | ): 306 | super(SineGen, self).__init__() 307 | self.sine_amp = sine_amp 308 | self.noise_std = noise_std 309 | self.harmonic_num = harmonic_num 310 | self.dim = self.harmonic_num + 1 311 | self.sampling_rate = samp_rate 312 | self.voiced_threshold = voiced_threshold 313 | 314 | def _f02uv(self, f0): 315 | # generate uv signal 316 | uv = torch.ones_like(f0) 317 | uv = uv * (f0 > self.voiced_threshold) 318 | return uv 319 | 320 | def forward(self, f0, upp): 321 | """sine_tensor, uv = forward(f0) 322 | input F0: tensor(batchsize=1, length, dim=1) 323 | f0 for unvoiced steps should be 0 324 | output sine_tensor: tensor(batchsize=1, length, dim) 325 | output uv: tensor(batchsize=1, length, 1) 326 | """ 327 | with 
torch.no_grad(): 328 | f0 = f0[:, None].transpose(1, 2) 329 | f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) 330 | # fundamental component 331 | f0_buf[:, :, 0] = f0[:, :, 0] 332 | for idx in np.arange(self.harmonic_num): 333 | f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( 334 | idx + 2 335 | ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic 336 | rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 337 | rand_ini = torch.rand( 338 | f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device 339 | ) 340 | rand_ini[:, 0] = 0 341 | rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini 342 | tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 343 | tmp_over_one *= upp 344 | tmp_over_one = F.interpolate( 345 | tmp_over_one.transpose(2, 1), 346 | scale_factor=upp, 347 | mode="linear", 348 | align_corners=True, 349 | ).transpose(2, 1) 350 | rad_values = F.interpolate( 351 | rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" 352 | ).transpose( 353 | 2, 1 354 | ) ####### 355 | tmp_over_one %= 1 356 | tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 357 | cumsum_shift = torch.zeros_like(rad_values) 358 | cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 359 | sine_waves = torch.sin( 360 | torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi 361 | ) 362 | sine_waves = sine_waves * self.sine_amp 363 | uv = self._f02uv(f0) 364 | uv = F.interpolate( 365 | uv.transpose(2, 1), scale_factor=upp, mode="nearest" 366 | ).transpose(2, 1) 367 | noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 368 | noise = noise_amp * torch.randn_like(sine_waves) 369 | sine_waves = sine_waves * uv + noise 370 | return sine_waves, uv, noise 371 | 372 | 373 | class SourceModuleHnNSF(torch.nn.Module): 374 | """SourceModule for hn-nsf 375 | SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, 376 | add_noise_std=0.003, voiced_threshod=0) 377 | sampling_rate: sampling_rate in Hz 378 | harmonic_num: number of harmonic above F0 (default: 0) 379 | sine_amp: amplitude of sine source signal (default: 0.1) 380 | add_noise_std: std of additive Gaussian noise (default: 0.003) 381 | note that amplitude of noise in unvoiced is decided 382 | by sine_amp 383 | voiced_threshold: threhold to set U/V given F0 (default: 0) 384 | Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) 385 | F0_sampled (batchsize, length, 1) 386 | Sine_source (batchsize, length, 1) 387 | noise_source (batchsize, length 1) 388 | uv (batchsize, length, 1) 389 | """ 390 | 391 | def __init__( 392 | self, 393 | sampling_rate, 394 | harmonic_num=0, 395 | sine_amp=0.1, 396 | add_noise_std=0.003, 397 | voiced_threshod=0, 398 | is_half=True, 399 | ): 400 | super(SourceModuleHnNSF, self).__init__() 401 | 402 | self.sine_amp = sine_amp 403 | self.noise_std = add_noise_std 404 | self.is_half = is_half 405 | # to produce sine waveforms 406 | self.l_sin_gen = SineGen( 407 | sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod 408 | ) 409 | 410 | # to merge source harmonics into a single excitation 411 | self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) 412 | self.l_tanh = torch.nn.Tanh() 413 | 414 | def forward(self, x, upp=None): 415 | sine_wavs, uv, _ = self.l_sin_gen(x, upp) 416 | if self.is_half: 417 | sine_wavs = sine_wavs.half() 418 | sine_merge = self.l_tanh(self.l_linear(sine_wavs)) 419 | return sine_merge, None, None # noise, uv 420 | 421 | 422 | class GeneratorNSF(torch.nn.Module): 423 | def __init__( 424 | self, 425 | 
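A hedged usage sketch (numbers invented) for the source module above: SineGen upsamples a frame-level F0 track by upp samples per frame and synthesizes the harmonics, and SourceModuleHnNSF merges them into a single tanh-squashed excitation channel that GeneratorNSF later adds back in at every upsampling stage:

import torch
from lib.infer_pack.models_onnx import SourceModuleHnNSF

source = SourceModuleHnNSF(sampling_rate=40000, harmonic_num=0, is_half=False)
f0 = torch.full((1, 100), 220.0)        # 100 frames of a steady 220 Hz pitch, [batch, frames]
upp = 400                                # samples per frame, i.e. prod(upsample_rates)
sine_merge, _, _ = source(f0, upp=upp)
print(sine_merge.shape)                  # torch.Size([1, 40000, 1]) -- sample-level excitation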
initial_channel, 426 | resblock, 427 | resblock_kernel_sizes, 428 | resblock_dilation_sizes, 429 | upsample_rates, 430 | upsample_initial_channel, 431 | upsample_kernel_sizes, 432 | gin_channels, 433 | sr, 434 | is_half=False, 435 | ): 436 | super(GeneratorNSF, self).__init__() 437 | self.num_kernels = len(resblock_kernel_sizes) 438 | self.num_upsamples = len(upsample_rates) 439 | 440 | self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) 441 | self.m_source = SourceModuleHnNSF( 442 | sampling_rate=sr, harmonic_num=0, is_half=is_half 443 | ) 444 | self.noise_convs = nn.ModuleList() 445 | self.conv_pre = Conv1d( 446 | initial_channel, upsample_initial_channel, 7, 1, padding=3 447 | ) 448 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 449 | 450 | self.ups = nn.ModuleList() 451 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 452 | c_cur = upsample_initial_channel // (2 ** (i + 1)) 453 | self.ups.append( 454 | weight_norm( 455 | ConvTranspose1d( 456 | upsample_initial_channel // (2**i), 457 | upsample_initial_channel // (2 ** (i + 1)), 458 | k, 459 | u, 460 | padding=(k - u) // 2, 461 | ) 462 | ) 463 | ) 464 | if i + 1 < len(upsample_rates): 465 | stride_f0 = np.prod(upsample_rates[i + 1 :]) 466 | self.noise_convs.append( 467 | Conv1d( 468 | 1, 469 | c_cur, 470 | kernel_size=stride_f0 * 2, 471 | stride=stride_f0, 472 | padding=stride_f0 // 2, 473 | ) 474 | ) 475 | else: 476 | self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) 477 | 478 | self.resblocks = nn.ModuleList() 479 | for i in range(len(self.ups)): 480 | ch = upsample_initial_channel // (2 ** (i + 1)) 481 | for j, (k, d) in enumerate( 482 | zip(resblock_kernel_sizes, resblock_dilation_sizes) 483 | ): 484 | self.resblocks.append(resblock(ch, k, d)) 485 | 486 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 487 | self.ups.apply(init_weights) 488 | 489 | if gin_channels != 0: 490 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 491 | 492 | self.upp = np.prod(upsample_rates) 493 | 494 | def forward(self, x, f0, g=None): 495 | har_source, noi_source, uv = self.m_source(f0, self.upp) 496 | har_source = har_source.transpose(1, 2) 497 | x = self.conv_pre(x) 498 | if g is not None: 499 | x = x + self.cond(g) 500 | 501 | for i in range(self.num_upsamples): 502 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 503 | x = self.ups[i](x) 504 | x_source = self.noise_convs[i](har_source) 505 | x = x + x_source 506 | xs = None 507 | for j in range(self.num_kernels): 508 | if xs is None: 509 | xs = self.resblocks[i * self.num_kernels + j](x) 510 | else: 511 | xs += self.resblocks[i * self.num_kernels + j](x) 512 | x = xs / self.num_kernels 513 | x = F.leaky_relu(x) 514 | x = self.conv_post(x) 515 | x = torch.tanh(x) 516 | return x 517 | 518 | def remove_weight_norm(self): 519 | for l in self.ups: 520 | remove_weight_norm(l) 521 | for l in self.resblocks: 522 | l.remove_weight_norm() 523 | 524 | 525 | sr2sr = { 526 | "32k": 32000, 527 | "40k": 40000, 528 | "48k": 48000, 529 | } 530 | 531 | 532 | class SynthesizerTrnMsNSFsidM(nn.Module): 533 | def __init__( 534 | self, 535 | spec_channels, 536 | segment_size, 537 | inter_channels, 538 | hidden_channels, 539 | filter_channels, 540 | n_heads, 541 | n_layers, 542 | kernel_size, 543 | p_dropout, 544 | resblock, 545 | resblock_kernel_sizes, 546 | resblock_dilation_sizes, 547 | upsample_rates, 548 | upsample_initial_channel, 549 | upsample_kernel_sizes, 550 | spk_embed_dim, 551 | gin_channels, 552 | sr, 553 | 
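GeneratorNSF's noise_convs re-downsample the sample-rate harmonic excitation so it can be added to the intermediate feature maps of each upsampling stage. Illustrative arithmetic only; the 40 kHz upsample_rates below are an assumption, not read from this file:

import numpy as np

upsample_rates = [10, 10, 2, 2]
print(int(np.prod(upsample_rates)))            # 400 -> self.upp, samples per input frame
for i in range(len(upsample_rates) - 1):
    stride_f0 = int(np.prod(upsample_rates[i + 1:]))
    print(i, stride_f0)                        # strides 40, 4, 2; the last stage uses a 1x1 conv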
version, 554 | **kwargs 555 | ): 556 | super().__init__() 557 | if type(sr) == type("strr"): 558 | sr = sr2sr[sr] 559 | self.spec_channels = spec_channels 560 | self.inter_channels = inter_channels 561 | self.hidden_channels = hidden_channels 562 | self.filter_channels = filter_channels 563 | self.n_heads = n_heads 564 | self.n_layers = n_layers 565 | self.kernel_size = kernel_size 566 | self.p_dropout = p_dropout 567 | self.resblock = resblock 568 | self.resblock_kernel_sizes = resblock_kernel_sizes 569 | self.resblock_dilation_sizes = resblock_dilation_sizes 570 | self.upsample_rates = upsample_rates 571 | self.upsample_initial_channel = upsample_initial_channel 572 | self.upsample_kernel_sizes = upsample_kernel_sizes 573 | self.segment_size = segment_size 574 | self.gin_channels = gin_channels 575 | # self.hop_length = hop_length# 576 | self.spk_embed_dim = spk_embed_dim 577 | if version == "v1": 578 | self.enc_p = TextEncoder256( 579 | inter_channels, 580 | hidden_channels, 581 | filter_channels, 582 | n_heads, 583 | n_layers, 584 | kernel_size, 585 | p_dropout, 586 | ) 587 | else: 588 | self.enc_p = TextEncoder768( 589 | inter_channels, 590 | hidden_channels, 591 | filter_channels, 592 | n_heads, 593 | n_layers, 594 | kernel_size, 595 | p_dropout, 596 | ) 597 | self.dec = GeneratorNSF( 598 | inter_channels, 599 | resblock, 600 | resblock_kernel_sizes, 601 | resblock_dilation_sizes, 602 | upsample_rates, 603 | upsample_initial_channel, 604 | upsample_kernel_sizes, 605 | gin_channels=gin_channels, 606 | sr=sr, 607 | is_half=kwargs["is_half"], 608 | ) 609 | self.enc_q = PosteriorEncoder( 610 | spec_channels, 611 | inter_channels, 612 | hidden_channels, 613 | 5, 614 | 1, 615 | 16, 616 | gin_channels=gin_channels, 617 | ) 618 | self.flow = ResidualCouplingBlock( 619 | inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels 620 | ) 621 | self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) 622 | self.speaker_map = None 623 | print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) 624 | 625 | def remove_weight_norm(self): 626 | self.dec.remove_weight_norm() 627 | self.flow.remove_weight_norm() 628 | self.enc_q.remove_weight_norm() 629 | 630 | def construct_spkmixmap(self, n_speaker): 631 | self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels)) 632 | for i in range(n_speaker): 633 | self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]])) 634 | self.speaker_map = self.speaker_map.unsqueeze(0) 635 | 636 | def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None): 637 | if self.speaker_map is not None: # [N, S] * [S, B, 1, H] 638 | g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1] 639 | g = g * self.speaker_map # [N, S, B, 1, H] 640 | g = torch.sum(g, dim=1) # [N, 1, B, 1, H] 641 | g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N] 642 | else: 643 | g = g.unsqueeze(0) 644 | g = self.emb_g(g).transpose(1, 2) 645 | 646 | m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) 647 | z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask 648 | z = self.flow(z_p, x_mask, g=g, reverse=True) 649 | o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) 650 | return o 651 | 652 | 653 | class MultiPeriodDiscriminator(torch.nn.Module): 654 | def __init__(self, use_spectral_norm=False): 655 | super(MultiPeriodDiscriminator, self).__init__() 656 | periods = [2, 3, 5, 7, 11, 17] 657 | # periods = [3, 5, 7, 11, 17, 23, 37] 658 | 659 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 660 | discs 
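A hedged end-to-end inference sketch for the synthesizer above. The constructor arguments mimic a common RVC v1 40k configuration and the tensor shapes are invented; none of these numbers are taken from this repository:

import torch
from lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM

net_g = SynthesizerTrnMsNSFsidM(
    1025, 32, 192, 192, 768, 2, 6, 3, 0, "1",
    [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    [10, 10, 2, 2], 512, [16, 16, 4, 4],
    109, 256, "40k", "v1", is_half=False,
)
phone = torch.randn(1, 120, 256)             # content features
phone_lengths = torch.tensor([120])
pitch = torch.randint(0, 256, (1, 120))      # coarse pitch ids
nsff0 = torch.full((1, 120), 220.0)          # fine pitch in Hz for the NSF source
sid = torch.tensor([0])                      # speaker id
rnd = torch.randn(1, 192, 120)               # noise for the prior sample
audio = net_g(phone, phone_lengths, pitch, nsff0, sid, rnd)
print(audio.shape)                           # torch.Size([1, 1, 48000]), i.e. 120 frames * 400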
= discs + [ 661 | DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods 662 | ] 663 | self.discriminators = nn.ModuleList(discs) 664 | 665 | def forward(self, y, y_hat): 666 | y_d_rs = [] # 667 | y_d_gs = [] 668 | fmap_rs = [] 669 | fmap_gs = [] 670 | for i, d in enumerate(self.discriminators): 671 | y_d_r, fmap_r = d(y) 672 | y_d_g, fmap_g = d(y_hat) 673 | # for j in range(len(fmap_r)): 674 | # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) 675 | y_d_rs.append(y_d_r) 676 | y_d_gs.append(y_d_g) 677 | fmap_rs.append(fmap_r) 678 | fmap_gs.append(fmap_g) 679 | 680 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 681 | 682 | 683 | class MultiPeriodDiscriminatorV2(torch.nn.Module): 684 | def __init__(self, use_spectral_norm=False): 685 | super(MultiPeriodDiscriminatorV2, self).__init__() 686 | # periods = [2, 3, 5, 7, 11, 17] 687 | periods = [2, 3, 5, 7, 11, 17, 23, 37] 688 | 689 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 690 | discs = discs + [ 691 | DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods 692 | ] 693 | self.discriminators = nn.ModuleList(discs) 694 | 695 | def forward(self, y, y_hat): 696 | y_d_rs = [] # 697 | y_d_gs = [] 698 | fmap_rs = [] 699 | fmap_gs = [] 700 | for i, d in enumerate(self.discriminators): 701 | y_d_r, fmap_r = d(y) 702 | y_d_g, fmap_g = d(y_hat) 703 | # for j in range(len(fmap_r)): 704 | # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) 705 | y_d_rs.append(y_d_r) 706 | y_d_gs.append(y_d_g) 707 | fmap_rs.append(fmap_r) 708 | fmap_gs.append(fmap_g) 709 | 710 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 711 | 712 | 713 | class DiscriminatorS(torch.nn.Module): 714 | def __init__(self, use_spectral_norm=False): 715 | super(DiscriminatorS, self).__init__() 716 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 717 | self.convs = nn.ModuleList( 718 | [ 719 | norm_f(Conv1d(1, 16, 15, 1, padding=7)), 720 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), 721 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), 722 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), 723 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), 724 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 725 | ] 726 | ) 727 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 728 | 729 | def forward(self, x): 730 | fmap = [] 731 | 732 | for l in self.convs: 733 | x = l(x) 734 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 735 | fmap.append(x) 736 | x = self.conv_post(x) 737 | fmap.append(x) 738 | x = torch.flatten(x, 1, -1) 739 | 740 | return x, fmap 741 | 742 | 743 | class DiscriminatorP(torch.nn.Module): 744 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 745 | super(DiscriminatorP, self).__init__() 746 | self.period = period 747 | self.use_spectral_norm = use_spectral_norm 748 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 749 | self.convs = nn.ModuleList( 750 | [ 751 | norm_f( 752 | Conv2d( 753 | 1, 754 | 32, 755 | (kernel_size, 1), 756 | (stride, 1), 757 | padding=(get_padding(kernel_size, 1), 0), 758 | ) 759 | ), 760 | norm_f( 761 | Conv2d( 762 | 32, 763 | 128, 764 | (kernel_size, 1), 765 | (stride, 1), 766 | padding=(get_padding(kernel_size, 1), 0), 767 | ) 768 | ), 769 | norm_f( 770 | Conv2d( 771 | 128, 772 | 512, 773 | (kernel_size, 1), 774 | (stride, 1), 775 | padding=(get_padding(kernel_size, 1), 0), 776 | ) 777 | ), 778 | norm_f( 779 | Conv2d( 780 | 512, 781 | 1024, 782 | (kernel_size, 1), 783 | (stride, 1), 
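A small illustration (toy numbers) of the 1-D to 2-D reshape that DiscriminatorP.forward performs before its 2-D convolutions: the waveform is reflect-padded up to a multiple of the period and viewed as [batch, channels, frames, period]:

import torch
import torch.nn.functional as F

period = 3
x = torch.arange(8.0).view(1, 1, 8)        # t = 8 is not a multiple of the period
n_pad = period - (x.shape[-1] % period)    # one extra sample of reflect padding
x = F.pad(x, (0, n_pad), "reflect")
x = x.view(1, 1, -1, period)
print(x.shape)                             # torch.Size([1, 1, 3, 3])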
784 | padding=(get_padding(kernel_size, 1), 0), 785 | ) 786 | ), 787 | norm_f( 788 | Conv2d( 789 | 1024, 790 | 1024, 791 | (kernel_size, 1), 792 | 1, 793 | padding=(get_padding(kernel_size, 1), 0), 794 | ) 795 | ), 796 | ] 797 | ) 798 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 799 | 800 | def forward(self, x): 801 | fmap = [] 802 | 803 | # 1d to 2d 804 | b, c, t = x.shape 805 | if t % self.period != 0: # pad first 806 | n_pad = self.period - (t % self.period) 807 | x = F.pad(x, (0, n_pad), "reflect") 808 | t = t + n_pad 809 | x = x.view(b, c, t // self.period, self.period) 810 | 811 | for l in self.convs: 812 | x = l(x) 813 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 814 | fmap.append(x) 815 | x = self.conv_post(x) 816 | fmap.append(x) 817 | x = torch.flatten(x, 1, -1) 818 | 819 | return x, fmap 820 | -------------------------------------------------------------------------------- /lib/infer_pack/modules.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | import scipy 5 | import torch 6 | from torch import nn 7 | from torch.nn import functional as F 8 | 9 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 10 | from torch.nn.utils import weight_norm, remove_weight_norm 11 | 12 | from lib.infer_pack import commons 13 | from lib.infer_pack.commons import init_weights, get_padding 14 | from lib.infer_pack.transforms import piecewise_rational_quadratic_transform 15 | 16 | 17 | LRELU_SLOPE = 0.1 18 | 19 | 20 | class LayerNorm(nn.Module): 21 | def __init__(self, channels, eps=1e-5): 22 | super().__init__() 23 | self.channels = channels 24 | self.eps = eps 25 | 26 | self.gamma = nn.Parameter(torch.ones(channels)) 27 | self.beta = nn.Parameter(torch.zeros(channels)) 28 | 29 | def forward(self, x): 30 | x = x.transpose(1, -1) 31 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 32 | return x.transpose(1, -1) 33 | 34 | 35 | class ConvReluNorm(nn.Module): 36 | def __init__( 37 | self, 38 | in_channels, 39 | hidden_channels, 40 | out_channels, 41 | kernel_size, 42 | n_layers, 43 | p_dropout, 44 | ): 45 | super().__init__() 46 | self.in_channels = in_channels 47 | self.hidden_channels = hidden_channels 48 | self.out_channels = out_channels 49 | self.kernel_size = kernel_size 50 | self.n_layers = n_layers 51 | self.p_dropout = p_dropout 52 | assert n_layers > 1, "Number of layers should be larger than 0." 
53 | 54 | self.conv_layers = nn.ModuleList() 55 | self.norm_layers = nn.ModuleList() 56 | self.conv_layers.append( 57 | nn.Conv1d( 58 | in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 59 | ) 60 | ) 61 | self.norm_layers.append(LayerNorm(hidden_channels)) 62 | self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) 63 | for _ in range(n_layers - 1): 64 | self.conv_layers.append( 65 | nn.Conv1d( 66 | hidden_channels, 67 | hidden_channels, 68 | kernel_size, 69 | padding=kernel_size // 2, 70 | ) 71 | ) 72 | self.norm_layers.append(LayerNorm(hidden_channels)) 73 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 74 | self.proj.weight.data.zero_() 75 | self.proj.bias.data.zero_() 76 | 77 | def forward(self, x, x_mask): 78 | x_org = x 79 | for i in range(self.n_layers): 80 | x = self.conv_layers[i](x * x_mask) 81 | x = self.norm_layers[i](x) 82 | x = self.relu_drop(x) 83 | x = x_org + self.proj(x) 84 | return x * x_mask 85 | 86 | 87 | class DDSConv(nn.Module): 88 | """ 89 | Dialted and Depth-Separable Convolution 90 | """ 91 | 92 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): 93 | super().__init__() 94 | self.channels = channels 95 | self.kernel_size = kernel_size 96 | self.n_layers = n_layers 97 | self.p_dropout = p_dropout 98 | 99 | self.drop = nn.Dropout(p_dropout) 100 | self.convs_sep = nn.ModuleList() 101 | self.convs_1x1 = nn.ModuleList() 102 | self.norms_1 = nn.ModuleList() 103 | self.norms_2 = nn.ModuleList() 104 | for i in range(n_layers): 105 | dilation = kernel_size**i 106 | padding = (kernel_size * dilation - dilation) // 2 107 | self.convs_sep.append( 108 | nn.Conv1d( 109 | channels, 110 | channels, 111 | kernel_size, 112 | groups=channels, 113 | dilation=dilation, 114 | padding=padding, 115 | ) 116 | ) 117 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) 118 | self.norms_1.append(LayerNorm(channels)) 119 | self.norms_2.append(LayerNorm(channels)) 120 | 121 | def forward(self, x, x_mask, g=None): 122 | if g is not None: 123 | x = x + g 124 | for i in range(self.n_layers): 125 | y = self.convs_sep[i](x * x_mask) 126 | y = self.norms_1[i](y) 127 | y = F.gelu(y) 128 | y = self.convs_1x1[i](y) 129 | y = self.norms_2[i](y) 130 | y = F.gelu(y) 131 | y = self.drop(y) 132 | x = x + y 133 | return x * x_mask 134 | 135 | 136 | class WN(torch.nn.Module): 137 | def __init__( 138 | self, 139 | hidden_channels, 140 | kernel_size, 141 | dilation_rate, 142 | n_layers, 143 | gin_channels=0, 144 | p_dropout=0, 145 | ): 146 | super(WN, self).__init__() 147 | assert kernel_size % 2 == 1 148 | self.hidden_channels = hidden_channels 149 | self.kernel_size = (kernel_size,) 150 | self.dilation_rate = dilation_rate 151 | self.n_layers = n_layers 152 | self.gin_channels = gin_channels 153 | self.p_dropout = p_dropout 154 | 155 | self.in_layers = torch.nn.ModuleList() 156 | self.res_skip_layers = torch.nn.ModuleList() 157 | self.drop = nn.Dropout(p_dropout) 158 | 159 | if gin_channels != 0: 160 | cond_layer = torch.nn.Conv1d( 161 | gin_channels, 2 * hidden_channels * n_layers, 1 162 | ) 163 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") 164 | 165 | for i in range(n_layers): 166 | dilation = dilation_rate**i 167 | padding = int((kernel_size * dilation - dilation) / 2) 168 | in_layer = torch.nn.Conv1d( 169 | hidden_channels, 170 | 2 * hidden_channels, 171 | kernel_size, 172 | dilation=dilation, 173 | padding=padding, 174 | ) 175 | in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") 176 | 
self.in_layers.append(in_layer) 177 | 178 | # last one is not necessary 179 | if i < n_layers - 1: 180 | res_skip_channels = 2 * hidden_channels 181 | else: 182 | res_skip_channels = hidden_channels 183 | 184 | res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 185 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") 186 | self.res_skip_layers.append(res_skip_layer) 187 | 188 | def forward(self, x, x_mask, g=None, **kwargs): 189 | output = torch.zeros_like(x) 190 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 191 | 192 | if g is not None: 193 | g = self.cond_layer(g) 194 | 195 | for i in range(self.n_layers): 196 | x_in = self.in_layers[i](x) 197 | if g is not None: 198 | cond_offset = i * 2 * self.hidden_channels 199 | g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] 200 | else: 201 | g_l = torch.zeros_like(x_in) 202 | 203 | acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) 204 | acts = self.drop(acts) 205 | 206 | res_skip_acts = self.res_skip_layers[i](acts) 207 | if i < self.n_layers - 1: 208 | res_acts = res_skip_acts[:, : self.hidden_channels, :] 209 | x = (x + res_acts) * x_mask 210 | output = output + res_skip_acts[:, self.hidden_channels :, :] 211 | else: 212 | output = output + res_skip_acts 213 | return output * x_mask 214 | 215 | def remove_weight_norm(self): 216 | if self.gin_channels != 0: 217 | torch.nn.utils.remove_weight_norm(self.cond_layer) 218 | for l in self.in_layers: 219 | torch.nn.utils.remove_weight_norm(l) 220 | for l in self.res_skip_layers: 221 | torch.nn.utils.remove_weight_norm(l) 222 | 223 | 224 | class ResBlock1(torch.nn.Module): 225 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 226 | super(ResBlock1, self).__init__() 227 | self.convs1 = nn.ModuleList( 228 | [ 229 | weight_norm( 230 | Conv1d( 231 | channels, 232 | channels, 233 | kernel_size, 234 | 1, 235 | dilation=dilation[0], 236 | padding=get_padding(kernel_size, dilation[0]), 237 | ) 238 | ), 239 | weight_norm( 240 | Conv1d( 241 | channels, 242 | channels, 243 | kernel_size, 244 | 1, 245 | dilation=dilation[1], 246 | padding=get_padding(kernel_size, dilation[1]), 247 | ) 248 | ), 249 | weight_norm( 250 | Conv1d( 251 | channels, 252 | channels, 253 | kernel_size, 254 | 1, 255 | dilation=dilation[2], 256 | padding=get_padding(kernel_size, dilation[2]), 257 | ) 258 | ), 259 | ] 260 | ) 261 | self.convs1.apply(init_weights) 262 | 263 | self.convs2 = nn.ModuleList( 264 | [ 265 | weight_norm( 266 | Conv1d( 267 | channels, 268 | channels, 269 | kernel_size, 270 | 1, 271 | dilation=1, 272 | padding=get_padding(kernel_size, 1), 273 | ) 274 | ), 275 | weight_norm( 276 | Conv1d( 277 | channels, 278 | channels, 279 | kernel_size, 280 | 1, 281 | dilation=1, 282 | padding=get_padding(kernel_size, 1), 283 | ) 284 | ), 285 | weight_norm( 286 | Conv1d( 287 | channels, 288 | channels, 289 | kernel_size, 290 | 1, 291 | dilation=1, 292 | padding=get_padding(kernel_size, 1), 293 | ) 294 | ), 295 | ] 296 | ) 297 | self.convs2.apply(init_weights) 298 | 299 | def forward(self, x, x_mask=None): 300 | for c1, c2 in zip(self.convs1, self.convs2): 301 | xt = F.leaky_relu(x, LRELU_SLOPE) 302 | if x_mask is not None: 303 | xt = xt * x_mask 304 | xt = c1(xt) 305 | xt = F.leaky_relu(xt, LRELU_SLOPE) 306 | if x_mask is not None: 307 | xt = xt * x_mask 308 | xt = c2(xt) 309 | x = xt + x 310 | if x_mask is not None: 311 | x = x * x_mask 312 | return x 313 | 314 | def remove_weight_norm(self): 315 | for l in 
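A quick equivalence check (not repository code) for the fused gate that WN uses above: fused_add_tanh_sigmoid_multiply splits the summed activations into a tanh half and a sigmoid half and multiplies them; n_channels is passed as an IntTensor because the helper is TorchScript-compiled:

import torch
from lib.infer_pack import commons

hidden = 4
a = torch.randn(2, 2 * hidden, 5)
b = torch.zeros_like(a)                      # e.g. no speaker conditioning
fused = commons.fused_add_tanh_sigmoid_multiply(a, b, torch.IntTensor([hidden]))
ref = torch.tanh(a[:, :hidden]) * torch.sigmoid(a[:, hidden:])
print(torch.allclose(fused, ref))            # True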
self.convs1: 316 | remove_weight_norm(l) 317 | for l in self.convs2: 318 | remove_weight_norm(l) 319 | 320 | 321 | class ResBlock2(torch.nn.Module): 322 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 323 | super(ResBlock2, self).__init__() 324 | self.convs = nn.ModuleList( 325 | [ 326 | weight_norm( 327 | Conv1d( 328 | channels, 329 | channels, 330 | kernel_size, 331 | 1, 332 | dilation=dilation[0], 333 | padding=get_padding(kernel_size, dilation[0]), 334 | ) 335 | ), 336 | weight_norm( 337 | Conv1d( 338 | channels, 339 | channels, 340 | kernel_size, 341 | 1, 342 | dilation=dilation[1], 343 | padding=get_padding(kernel_size, dilation[1]), 344 | ) 345 | ), 346 | ] 347 | ) 348 | self.convs.apply(init_weights) 349 | 350 | def forward(self, x, x_mask=None): 351 | for c in self.convs: 352 | xt = F.leaky_relu(x, LRELU_SLOPE) 353 | if x_mask is not None: 354 | xt = xt * x_mask 355 | xt = c(xt) 356 | x = xt + x 357 | if x_mask is not None: 358 | x = x * x_mask 359 | return x 360 | 361 | def remove_weight_norm(self): 362 | for l in self.convs: 363 | remove_weight_norm(l) 364 | 365 | 366 | class Log(nn.Module): 367 | def forward(self, x, x_mask, reverse=False, **kwargs): 368 | if not reverse: 369 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask 370 | logdet = torch.sum(-y, [1, 2]) 371 | return y, logdet 372 | else: 373 | x = torch.exp(x) * x_mask 374 | return x 375 | 376 | 377 | class Flip(nn.Module): 378 | def forward(self, x, *args, reverse=False, **kwargs): 379 | x = torch.flip(x, [1]) 380 | if not reverse: 381 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 382 | return x, logdet 383 | else: 384 | return x 385 | 386 | 387 | class ElementwiseAffine(nn.Module): 388 | def __init__(self, channels): 389 | super().__init__() 390 | self.channels = channels 391 | self.m = nn.Parameter(torch.zeros(channels, 1)) 392 | self.logs = nn.Parameter(torch.zeros(channels, 1)) 393 | 394 | def forward(self, x, x_mask, reverse=False, **kwargs): 395 | if not reverse: 396 | y = self.m + torch.exp(self.logs) * x 397 | y = y * x_mask 398 | logdet = torch.sum(self.logs * x_mask, [1, 2]) 399 | return y, logdet 400 | else: 401 | x = (x - self.m) * torch.exp(-self.logs) * x_mask 402 | return x 403 | 404 | 405 | class ResidualCouplingLayer(nn.Module): 406 | def __init__( 407 | self, 408 | channels, 409 | hidden_channels, 410 | kernel_size, 411 | dilation_rate, 412 | n_layers, 413 | p_dropout=0, 414 | gin_channels=0, 415 | mean_only=False, 416 | ): 417 | assert channels % 2 == 0, "channels should be divisible by 2" 418 | super().__init__() 419 | self.channels = channels 420 | self.hidden_channels = hidden_channels 421 | self.kernel_size = kernel_size 422 | self.dilation_rate = dilation_rate 423 | self.n_layers = n_layers 424 | self.half_channels = channels // 2 425 | self.mean_only = mean_only 426 | 427 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 428 | self.enc = WN( 429 | hidden_channels, 430 | kernel_size, 431 | dilation_rate, 432 | n_layers, 433 | p_dropout=p_dropout, 434 | gin_channels=gin_channels, 435 | ) 436 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) 437 | self.post.weight.data.zero_() 438 | self.post.bias.data.zero_() 439 | 440 | def forward(self, x, x_mask, g=None, reverse=False): 441 | x0, x1 = torch.split(x, [self.half_channels] * 2, 1) 442 | h = self.pre(x0) * x_mask 443 | h = self.enc(h, x_mask, g=g) 444 | stats = self.post(h) * x_mask 445 | if not self.mean_only: 446 | m, logs = torch.split(stats, 
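A hedged sketch (shapes assumed) for ElementwiseAffine above: it is an invertible per-channel affine map, so applying it and then calling it with reverse=True recovers the input, and the log-determinant is simply the sum of self.logs over the masked positions (zero at initialization):

import torch
from lib.infer_pack.modules import ElementwiseAffine

aff = ElementwiseAffine(channels=4)
x = torch.randn(1, 4, 10)
x_mask = torch.ones(1, 1, 10)
y, logdet = aff(x, x_mask)
x_rec = aff(y, x_mask, reverse=True)
print(torch.allclose(x, x_rec), logdet.item())   # True 0.0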
[self.half_channels] * 2, 1) 447 | else: 448 | m = stats 449 | logs = torch.zeros_like(m) 450 | 451 | if not reverse: 452 | x1 = m + x1 * torch.exp(logs) * x_mask 453 | x = torch.cat([x0, x1], 1) 454 | logdet = torch.sum(logs, [1, 2]) 455 | return x, logdet 456 | else: 457 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 458 | x = torch.cat([x0, x1], 1) 459 | return x 460 | 461 | def remove_weight_norm(self): 462 | self.enc.remove_weight_norm() 463 | 464 | 465 | class ConvFlow(nn.Module): 466 | def __init__( 467 | self, 468 | in_channels, 469 | filter_channels, 470 | kernel_size, 471 | n_layers, 472 | num_bins=10, 473 | tail_bound=5.0, 474 | ): 475 | super().__init__() 476 | self.in_channels = in_channels 477 | self.filter_channels = filter_channels 478 | self.kernel_size = kernel_size 479 | self.n_layers = n_layers 480 | self.num_bins = num_bins 481 | self.tail_bound = tail_bound 482 | self.half_channels = in_channels // 2 483 | 484 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) 485 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0) 486 | self.proj = nn.Conv1d( 487 | filter_channels, self.half_channels * (num_bins * 3 - 1), 1 488 | ) 489 | self.proj.weight.data.zero_() 490 | self.proj.bias.data.zero_() 491 | 492 | def forward(self, x, x_mask, g=None, reverse=False): 493 | x0, x1 = torch.split(x, [self.half_channels] * 2, 1) 494 | h = self.pre(x0) 495 | h = self.convs(h, x_mask, g=g) 496 | h = self.proj(h) * x_mask 497 | 498 | b, c, t = x0.shape 499 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 500 | 501 | unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels) 502 | unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt( 503 | self.filter_channels 504 | ) 505 | unnormalized_derivatives = h[..., 2 * self.num_bins :] 506 | 507 | x1, logabsdet = piecewise_rational_quadratic_transform( 508 | x1, 509 | unnormalized_widths, 510 | unnormalized_heights, 511 | unnormalized_derivatives, 512 | inverse=reverse, 513 | tails="linear", 514 | tail_bound=self.tail_bound, 515 | ) 516 | 517 | x = torch.cat([x0, x1], 1) * x_mask 518 | logdet = torch.sum(logabsdet * x_mask, [1, 2]) 519 | if not reverse: 520 | return x, logdet 521 | else: 522 | return x 523 | -------------------------------------------------------------------------------- /lib/infer_pack/modules/F0Predictor/DioF0Predictor.py: -------------------------------------------------------------------------------- 1 | from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor 2 | import pyworld 3 | import numpy as np 4 | 5 | 6 | class DioF0Predictor(F0Predictor): 7 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): 8 | self.hop_length = hop_length 9 | self.f0_min = f0_min 10 | self.f0_max = f0_max 11 | self.sampling_rate = sampling_rate 12 | 13 | def interpolate_f0(self, f0): 14 | """ 15 | 对F0进行插值处理 16 | """ 17 | 18 | data = np.reshape(f0, (f0.size, 1)) 19 | 20 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32) 21 | vuv_vector[data > 0.0] = 1.0 22 | vuv_vector[data <= 0.0] = 0.0 23 | 24 | ip_data = data 25 | 26 | frame_number = data.size 27 | last_value = 0.0 28 | for i in range(frame_number): 29 | if data[i] <= 0.0: 30 | j = i + 1 31 | for j in range(i + 1, frame_number): 32 | if data[j] > 0.0: 33 | break 34 | if j < frame_number - 1: 35 | if last_value > 0.0: 36 | step = (data[j] - data[i - 1]) / float(j - i) 37 | for k in range(i, j): 38 | ip_data[k] = data[i - 1] + step * (k - i + 1) 
39 | else: 40 | for k in range(i, j): 41 | ip_data[k] = data[j] 42 | else: 43 | for k in range(i, frame_number): 44 | ip_data[k] = last_value 45 | else: 46 | ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝 47 | last_value = data[i] 48 | 49 | return ip_data[:, 0], vuv_vector[:, 0] 50 | 51 | def resize_f0(self, x, target_len): 52 | source = np.array(x) 53 | source[source < 0.001] = np.nan 54 | target = np.interp( 55 | np.arange(0, len(source) * target_len, len(source)) / target_len, 56 | np.arange(0, len(source)), 57 | source, 58 | ) 59 | res = np.nan_to_num(target) 60 | return res 61 | 62 | def compute_f0(self, wav, p_len=None): 63 | if p_len is None: 64 | p_len = wav.shape[0] // self.hop_length 65 | f0, t = pyworld.dio( 66 | wav.astype(np.double), 67 | fs=self.sampling_rate, 68 | f0_floor=self.f0_min, 69 | f0_ceil=self.f0_max, 70 | frame_period=1000 * self.hop_length / self.sampling_rate, 71 | ) 72 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 73 | for index, pitch in enumerate(f0): 74 | f0[index] = round(pitch, 1) 75 | return self.interpolate_f0(self.resize_f0(f0, p_len))[0] 76 | 77 | def compute_f0_uv(self, wav, p_len=None): 78 | if p_len is None: 79 | p_len = wav.shape[0] // self.hop_length 80 | f0, t = pyworld.dio( 81 | wav.astype(np.double), 82 | fs=self.sampling_rate, 83 | f0_floor=self.f0_min, 84 | f0_ceil=self.f0_max, 85 | frame_period=1000 * self.hop_length / self.sampling_rate, 86 | ) 87 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 88 | for index, pitch in enumerate(f0): 89 | f0[index] = round(pitch, 1) 90 | return self.interpolate_f0(self.resize_f0(f0, p_len)) 91 | -------------------------------------------------------------------------------- /lib/infer_pack/modules/F0Predictor/F0Predictor.py: -------------------------------------------------------------------------------- 1 | class F0Predictor(object): 2 | def compute_f0(self, wav, p_len): 3 | """ 4 | input: wav:[signal_length] 5 | p_len:int 6 | output: f0:[signal_length//hop_length] 7 | """ 8 | pass 9 | 10 | def compute_f0_uv(self, wav, p_len): 11 | """ 12 | input: wav:[signal_length] 13 | p_len:int 14 | output: f0:[signal_length//hop_length],uv:[signal_length//hop_length] 15 | """ 16 | pass 17 | -------------------------------------------------------------------------------- /lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py: -------------------------------------------------------------------------------- 1 | from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor 2 | import pyworld 3 | import numpy as np 4 | 5 | 6 | class HarvestF0Predictor(F0Predictor): 7 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): 8 | self.hop_length = hop_length 9 | self.f0_min = f0_min 10 | self.f0_max = f0_max 11 | self.sampling_rate = sampling_rate 12 | 13 | def interpolate_f0(self, f0): 14 | """ 15 | 对F0进行插值处理 16 | """ 17 | 18 | data = np.reshape(f0, (f0.size, 1)) 19 | 20 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32) 21 | vuv_vector[data > 0.0] = 1.0 22 | vuv_vector[data <= 0.0] = 0.0 23 | 24 | ip_data = data 25 | 26 | frame_number = data.size 27 | last_value = 0.0 28 | for i in range(frame_number): 29 | if data[i] <= 0.0: 30 | j = i + 1 31 | for j in range(i + 1, frame_number): 32 | if data[j] > 0.0: 33 | break 34 | if j < frame_number - 1: 35 | if last_value > 0.0: 36 | step = (data[j] - data[i - 1]) / float(j - i) 37 | for k in range(i, j): 38 | ip_data[k] = data[i - 1] + step * (k - i + 1) 39 | else: 40 | for k in range(i, 
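An illustrative run (synthetic audio, assumed settings, pyworld required) of the DIO predictor defined above: compute_f0_uv estimates F0 with WORLD DIO plus StoneMask, resizes the track to p_len frames, fills unvoiced gaps by linear interpolation and also returns the voiced/unvoiced mask:

import numpy as np
from lib.infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor

sr = 16000
t = np.arange(sr) / sr
wav = 0.5 * np.sin(2 * np.pi * 220.0 * t)          # one second of a 220 Hz tone
predictor = DioF0Predictor(hop_length=160, sampling_rate=sr)
f0, uv = predictor.compute_f0_uv(wav, p_len=len(wav) // 160)
print(f0.shape, uv.shape)                          # (100,) (100,)
print(float(np.median(f0[uv > 0])))                # close to 220 if the tone is tracked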
j): 41 | ip_data[k] = data[j] 42 | else: 43 | for k in range(i, frame_number): 44 | ip_data[k] = last_value 45 | else: 46 | ip_data[i] = data[i] # this copy may be unnecessary 47 | last_value = data[i] 48 | 49 | return ip_data[:, 0], vuv_vector[:, 0] 50 | 51 | def resize_f0(self, x, target_len): 52 | source = np.array(x) 53 | source[source < 0.001] = np.nan 54 | target = np.interp( 55 | np.arange(0, len(source) * target_len, len(source)) / target_len, 56 | np.arange(0, len(source)), 57 | source, 58 | ) 59 | res = np.nan_to_num(target) 60 | return res 61 | 62 | def compute_f0(self, wav, p_len=None): 63 | if p_len is None: 64 | p_len = wav.shape[0] // self.hop_length 65 | f0, t = pyworld.harvest( 66 | wav.astype(np.double), 67 | fs=self.sampling_rate, 68 | f0_ceil=self.f0_max, 69 | f0_floor=self.f0_min, 70 | frame_period=1000 * self.hop_length / self.sampling_rate, 71 | ) 72 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 73 | return self.interpolate_f0(self.resize_f0(f0, p_len))[0] 74 | 75 | def compute_f0_uv(self, wav, p_len=None): 76 | if p_len is None: 77 | p_len = wav.shape[0] // self.hop_length 78 | f0, t = pyworld.harvest( 79 | wav.astype(np.double), 80 | fs=self.sampling_rate, 81 | f0_floor=self.f0_min, 82 | f0_ceil=self.f0_max, 83 | frame_period=1000 * self.hop_length / self.sampling_rate, 84 | ) 85 | f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) 86 | return self.interpolate_f0(self.resize_f0(f0, p_len)) 87 | -------------------------------------------------------------------------------- /lib/infer_pack/modules/F0Predictor/PMF0Predictor.py: -------------------------------------------------------------------------------- 1 | from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor 2 | import parselmouth 3 | import numpy as np 4 | 5 | 6 | class PMF0Predictor(F0Predictor): 7 | def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): 8 | self.hop_length = hop_length 9 | self.f0_min = f0_min 10 | self.f0_max = f0_max 11 | self.sampling_rate = sampling_rate 12 | 13 | def interpolate_f0(self, f0): 14 | """ 15 | Interpolate the F0 contour over unvoiced frames 16 | """ 17 | 18 | data = np.reshape(f0, (f0.size, 1)) 19 | 20 | vuv_vector = np.zeros((data.size, 1), dtype=np.float32) 21 | vuv_vector[data > 0.0] = 1.0 22 | vuv_vector[data <= 0.0] = 0.0 23 | 24 | ip_data = data 25 | 26 | frame_number = data.size 27 | last_value = 0.0 28 | for i in range(frame_number): 29 | if data[i] <= 0.0: 30 | j = i + 1 31 | for j in range(i + 1, frame_number): 32 | if data[j] > 0.0: 33 | break 34 | if j < frame_number - 1: 35 | if last_value > 0.0: 36 | step = (data[j] - data[i - 1]) / float(j - i) 37 | for k in range(i, j): 38 | ip_data[k] = data[i - 1] + step * (k - i + 1) 39 | else: 40 | for k in range(i, j): 41 | ip_data[k] = data[j] 42 | else: 43 | for k in range(i, frame_number): 44 | ip_data[k] = last_value 45 | else: 46 | ip_data[i] = data[i] # this copy may be unnecessary 47 | last_value = data[i] 48 | 49 | return ip_data[:, 0], vuv_vector[:, 0] 50 | 51 | def compute_f0(self, wav, p_len=None): 52 | x = wav 53 | if p_len is None: 54 | p_len = x.shape[0] // self.hop_length 55 | else: 56 | assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" 57 | time_step = self.hop_length / self.sampling_rate * 1000 58 | f0 = ( 59 | parselmouth.Sound(x, self.sampling_rate) 60 | .to_pitch_ac( 61 | time_step=time_step / 1000, 62 | voicing_threshold=0.6, 63 | pitch_floor=self.f0_min, 64 | pitch_ceiling=self.f0_max, 65 | ) 66 | .selected_array["frequency"] 67 | ) 68 | 69 | pad_size =
(p_len - len(f0) + 1) // 2 70 | if pad_size > 0 or p_len - len(f0) - pad_size > 0: 71 | f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") 72 | f0, uv = self.interpolate_f0(f0) 73 | return f0 74 | 75 | def compute_f0_uv(self, wav, p_len=None): 76 | x = wav 77 | if p_len is None: 78 | p_len = x.shape[0] // self.hop_length 79 | else: 80 | assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" 81 | time_step = self.hop_length / self.sampling_rate * 1000 82 | f0 = ( 83 | parselmouth.Sound(x, self.sampling_rate) 84 | .to_pitch_ac( 85 | time_step=time_step / 1000, 86 | voicing_threshold=0.6, 87 | pitch_floor=self.f0_min, 88 | pitch_ceiling=self.f0_max, 89 | ) 90 | .selected_array["frequency"] 91 | ) 92 | 93 | pad_size = (p_len - len(f0) + 1) // 2 94 | if pad_size > 0 or p_len - len(f0) - pad_size > 0: 95 | f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") 96 | f0, uv = self.interpolate_f0(f0) 97 | return f0, uv 98 | -------------------------------------------------------------------------------- /lib/infer_pack/modules/F0Predictor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArkanDash/Multi-Model-RVC-Inference/661936e4dce121c8ad84113f7637308d1642c887/lib/infer_pack/modules/F0Predictor/__init__.py -------------------------------------------------------------------------------- /lib/infer_pack/onnx_inference.py: -------------------------------------------------------------------------------- 1 | import onnxruntime 2 | import librosa 3 | import numpy as np 4 | import soundfile 5 | 6 | 7 | class ContentVec: 8 | def __init__(self, vec_path="pretrained/vec-768-layer-12.onnx", device=None): 9 | print("load model(s) from {}".format(vec_path)) 10 | if device == "cpu" or device is None: 11 | providers = ["CPUExecutionProvider"] 12 | elif device == "cuda": 13 | providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] 14 | elif device == "dml": 15 | providers = ["DmlExecutionProvider"] 16 | else: 17 | raise RuntimeError("Unsportted Device") 18 | self.model = onnxruntime.InferenceSession(vec_path, providers=providers) 19 | 20 | def __call__(self, wav): 21 | return self.forward(wav) 22 | 23 | def forward(self, wav): 24 | feats = wav 25 | if feats.ndim == 2: # double channels 26 | feats = feats.mean(-1) 27 | assert feats.ndim == 1, feats.ndim 28 | feats = np.expand_dims(np.expand_dims(feats, 0), 0) 29 | onnx_input = {self.model.get_inputs()[0].name: feats} 30 | logits = self.model.run(None, onnx_input)[0] 31 | return logits.transpose(0, 2, 1) 32 | 33 | 34 | def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs): 35 | if f0_predictor == "pm": 36 | from lib.infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor 37 | 38 | f0_predictor_object = PMF0Predictor( 39 | hop_length=hop_length, sampling_rate=sampling_rate 40 | ) 41 | elif f0_predictor == "harvest": 42 | from lib.infer_pack.modules.F0Predictor.HarvestF0Predictor import ( 43 | HarvestF0Predictor, 44 | ) 45 | 46 | f0_predictor_object = HarvestF0Predictor( 47 | hop_length=hop_length, sampling_rate=sampling_rate 48 | ) 49 | elif f0_predictor == "dio": 50 | from lib.infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor 51 | 52 | f0_predictor_object = DioF0Predictor( 53 | hop_length=hop_length, sampling_rate=sampling_rate 54 | ) 55 | else: 56 | raise Exception("Unknown f0 predictor") 57 | return f0_predictor_object 58 | 59 | 60 | class OnnxRVC: 61 | def __init__( 62 | 
self, 63 | model_path, 64 | sr=40000, 65 | hop_size=512, 66 | vec_path="vec-768-layer-12", 67 | device="cpu", 68 | ): 69 | vec_path = f"pretrained/{vec_path}.onnx" 70 | self.vec_model = ContentVec(vec_path, device) 71 | if device == "cpu" or device is None: 72 | providers = ["CPUExecutionProvider"] 73 | elif device == "cuda": 74 | providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] 75 | elif device == "dml": 76 | providers = ["DmlExecutionProvider"] 77 | else: 78 | raise RuntimeError("Unsportted Device") 79 | self.model = onnxruntime.InferenceSession(model_path, providers=providers) 80 | self.sampling_rate = sr 81 | self.hop_size = hop_size 82 | 83 | def forward(self, hubert, hubert_length, pitch, pitchf, ds, rnd): 84 | onnx_input = { 85 | self.model.get_inputs()[0].name: hubert, 86 | self.model.get_inputs()[1].name: hubert_length, 87 | self.model.get_inputs()[2].name: pitch, 88 | self.model.get_inputs()[3].name: pitchf, 89 | self.model.get_inputs()[4].name: ds, 90 | self.model.get_inputs()[5].name: rnd, 91 | } 92 | return (self.model.run(None, onnx_input)[0] * 32767).astype(np.int16) 93 | 94 | def inference( 95 | self, 96 | raw_path, 97 | sid, 98 | f0_method="dio", 99 | f0_up_key=0, 100 | pad_time=0.5, 101 | cr_threshold=0.02, 102 | ): 103 | f0_min = 50 104 | f0_max = 1100 105 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 106 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 107 | f0_predictor = get_f0_predictor( 108 | f0_method, 109 | hop_length=self.hop_size, 110 | sampling_rate=self.sampling_rate, 111 | threshold=cr_threshold, 112 | ) 113 | wav, sr = librosa.load(raw_path, sr=self.sampling_rate) 114 | org_length = len(wav) 115 | if org_length / sr > 50.0: 116 | raise RuntimeError("Reached Max Length") 117 | 118 | wav16k = librosa.resample(wav, orig_sr=self.sampling_rate, target_sr=16000) 119 | wav16k = wav16k 120 | 121 | hubert = self.vec_model(wav16k) 122 | hubert = np.repeat(hubert, 2, axis=2).transpose(0, 2, 1).astype(np.float32) 123 | hubert_length = hubert.shape[1] 124 | 125 | pitchf = f0_predictor.compute_f0(wav, hubert_length) 126 | pitchf = pitchf * 2 ** (f0_up_key / 12) 127 | pitch = pitchf.copy() 128 | f0_mel = 1127 * np.log(1 + pitch / 700) 129 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( 130 | f0_mel_max - f0_mel_min 131 | ) + 1 132 | f0_mel[f0_mel <= 1] = 1 133 | f0_mel[f0_mel > 255] = 255 134 | pitch = np.rint(f0_mel).astype(np.int64) 135 | 136 | pitchf = pitchf.reshape(1, len(pitchf)).astype(np.float32) 137 | pitch = pitch.reshape(1, len(pitch)) 138 | ds = np.array([sid]).astype(np.int64) 139 | 140 | rnd = np.random.randn(1, 192, hubert_length).astype(np.float32) 141 | hubert_length = np.array([hubert_length]).astype(np.int64) 142 | 143 | out_wav = self.forward(hubert, hubert_length, pitch, pitchf, ds, rnd).squeeze() 144 | out_wav = np.pad(out_wav, (0, 2 * self.hop_size), "constant") 145 | return out_wav[0:org_length] 146 | -------------------------------------------------------------------------------- /lib/infer_pack/transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | import numpy as np 5 | 6 | 7 | DEFAULT_MIN_BIN_WIDTH = 1e-3 8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3 9 | DEFAULT_MIN_DERIVATIVE = 1e-3 10 | 11 | 12 | def piecewise_rational_quadratic_transform( 13 | inputs, 14 | unnormalized_widths, 15 | unnormalized_heights, 16 | unnormalized_derivatives, 17 | inverse=False, 18 | tails=None, 19 | tail_bound=1.0, 20 | 
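A hedged usage sketch for OnnxRVC above. The file names are placeholders and the ContentVec ONNX model is assumed to sit under pretrained/ as the constructor expects; nothing here is taken from the repository's own scripts:

import soundfile
from lib.infer_pack.onnx_inference import OnnxRVC

model = OnnxRVC("weights/my_model.onnx",          # placeholder path to an exported RVC model
                sr=40000, hop_size=512,
                vec_path="vec-768-layer-12", device="cpu")
audio = model.inference("input.wav", sid=0, f0_method="dio", f0_up_key=0)
soundfile.write("output.wav", audio, 40000)        # int16 samples at the model sample rate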
min_bin_width=DEFAULT_MIN_BIN_WIDTH, 21 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 22 | min_derivative=DEFAULT_MIN_DERIVATIVE, 23 | ): 24 | if tails is None: 25 | spline_fn = rational_quadratic_spline 26 | spline_kwargs = {} 27 | else: 28 | spline_fn = unconstrained_rational_quadratic_spline 29 | spline_kwargs = {"tails": tails, "tail_bound": tail_bound} 30 | 31 | outputs, logabsdet = spline_fn( 32 | inputs=inputs, 33 | unnormalized_widths=unnormalized_widths, 34 | unnormalized_heights=unnormalized_heights, 35 | unnormalized_derivatives=unnormalized_derivatives, 36 | inverse=inverse, 37 | min_bin_width=min_bin_width, 38 | min_bin_height=min_bin_height, 39 | min_derivative=min_derivative, 40 | **spline_kwargs 41 | ) 42 | return outputs, logabsdet 43 | 44 | 45 | def searchsorted(bin_locations, inputs, eps=1e-6): 46 | bin_locations[..., -1] += eps 47 | return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1 48 | 49 | 50 | def unconstrained_rational_quadratic_spline( 51 | inputs, 52 | unnormalized_widths, 53 | unnormalized_heights, 54 | unnormalized_derivatives, 55 | inverse=False, 56 | tails="linear", 57 | tail_bound=1.0, 58 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 59 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 60 | min_derivative=DEFAULT_MIN_DERIVATIVE, 61 | ): 62 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) 63 | outside_interval_mask = ~inside_interval_mask 64 | 65 | outputs = torch.zeros_like(inputs) 66 | logabsdet = torch.zeros_like(inputs) 67 | 68 | if tails == "linear": 69 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) 70 | constant = np.log(np.exp(1 - min_derivative) - 1) 71 | unnormalized_derivatives[..., 0] = constant 72 | unnormalized_derivatives[..., -1] = constant 73 | 74 | outputs[outside_interval_mask] = inputs[outside_interval_mask] 75 | logabsdet[outside_interval_mask] = 0 76 | else: 77 | raise RuntimeError("{} tails are not implemented.".format(tails)) 78 | 79 | ( 80 | outputs[inside_interval_mask], 81 | logabsdet[inside_interval_mask], 82 | ) = rational_quadratic_spline( 83 | inputs=inputs[inside_interval_mask], 84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :], 85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :], 86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], 87 | inverse=inverse, 88 | left=-tail_bound, 89 | right=tail_bound, 90 | bottom=-tail_bound, 91 | top=tail_bound, 92 | min_bin_width=min_bin_width, 93 | min_bin_height=min_bin_height, 94 | min_derivative=min_derivative, 95 | ) 96 | 97 | return outputs, logabsdet 98 | 99 | 100 | def rational_quadratic_spline( 101 | inputs, 102 | unnormalized_widths, 103 | unnormalized_heights, 104 | unnormalized_derivatives, 105 | inverse=False, 106 | left=0.0, 107 | right=1.0, 108 | bottom=0.0, 109 | top=1.0, 110 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 111 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 112 | min_derivative=DEFAULT_MIN_DERIVATIVE, 113 | ): 114 | if torch.min(inputs) < left or torch.max(inputs) > right: 115 | raise ValueError("Input to a transform is not within its domain") 116 | 117 | num_bins = unnormalized_widths.shape[-1] 118 | 119 | if min_bin_width * num_bins > 1.0: 120 | raise ValueError("Minimal bin width too large for the number of bins") 121 | if min_bin_height * num_bins > 1.0: 122 | raise ValueError("Minimal bin height too large for the number of bins") 123 | 124 | widths = F.softmax(unnormalized_widths, dim=-1) 125 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths 126 | 
cumwidths = torch.cumsum(widths, dim=-1) 127 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) 128 | cumwidths = (right - left) * cumwidths + left 129 | cumwidths[..., 0] = left 130 | cumwidths[..., -1] = right 131 | widths = cumwidths[..., 1:] - cumwidths[..., :-1] 132 | 133 | derivatives = min_derivative + F.softplus(unnormalized_derivatives) 134 | 135 | heights = F.softmax(unnormalized_heights, dim=-1) 136 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights 137 | cumheights = torch.cumsum(heights, dim=-1) 138 | cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0) 139 | cumheights = (top - bottom) * cumheights + bottom 140 | cumheights[..., 0] = bottom 141 | cumheights[..., -1] = top 142 | heights = cumheights[..., 1:] - cumheights[..., :-1] 143 | 144 | if inverse: 145 | bin_idx = searchsorted(cumheights, inputs)[..., None] 146 | else: 147 | bin_idx = searchsorted(cumwidths, inputs)[..., None] 148 | 149 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] 150 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0] 151 | 152 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] 153 | delta = heights / widths 154 | input_delta = delta.gather(-1, bin_idx)[..., 0] 155 | 156 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] 157 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] 158 | 159 | input_heights = heights.gather(-1, bin_idx)[..., 0] 160 | 161 | if inverse: 162 | a = (inputs - input_cumheights) * ( 163 | input_derivatives + input_derivatives_plus_one - 2 * input_delta 164 | ) + input_heights * (input_delta - input_derivatives) 165 | b = input_heights * input_derivatives - (inputs - input_cumheights) * ( 166 | input_derivatives + input_derivatives_plus_one - 2 * input_delta 167 | ) 168 | c = -input_delta * (inputs - input_cumheights) 169 | 170 | discriminant = b.pow(2) - 4 * a * c 171 | assert (discriminant >= 0).all() 172 | 173 | root = (2 * c) / (-b - torch.sqrt(discriminant)) 174 | outputs = root * input_bin_widths + input_cumwidths 175 | 176 | theta_one_minus_theta = root * (1 - root) 177 | denominator = input_delta + ( 178 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta) 179 | * theta_one_minus_theta 180 | ) 181 | derivative_numerator = input_delta.pow(2) * ( 182 | input_derivatives_plus_one * root.pow(2) 183 | + 2 * input_delta * theta_one_minus_theta 184 | + input_derivatives * (1 - root).pow(2) 185 | ) 186 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 187 | 188 | return outputs, -logabsdet 189 | else: 190 | theta = (inputs - input_cumwidths) / input_bin_widths 191 | theta_one_minus_theta = theta * (1 - theta) 192 | 193 | numerator = input_heights * ( 194 | input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta 195 | ) 196 | denominator = input_delta + ( 197 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta) 198 | * theta_one_minus_theta 199 | ) 200 | outputs = input_cumheights + numerator / denominator 201 | 202 | derivative_numerator = input_delta.pow(2) * ( 203 | input_derivatives_plus_one * theta.pow(2) 204 | + 2 * input_delta * theta_one_minus_theta 205 | + input_derivatives * (1 - theta).pow(2) 206 | ) 207 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 208 | 209 | return outputs, logabsdet 210 | -------------------------------------------------------------------------------- /lib/vc/audio.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import traceback 3 | 4 | import librosa 5 | import numpy as np 6 | import av 7 | from io import BytesIO 8 | 9 | 10 | def wav2(i, o, format): 11 | inp = av.open(i, "rb") 12 | if format == "m4a": 13 | format = "mp4" 14 | out = av.open(o, "wb", format=format) 15 | if format == "ogg": 16 | format = "libvorbis" 17 | if format == "mp4": 18 | format = "aac" 19 | 20 | ostream = out.add_stream(format) 21 | 22 | for frame in inp.decode(audio=0): 23 | for p in ostream.encode(frame): 24 | out.mux(p) 25 | 26 | for p in ostream.encode(None): 27 | out.mux(p) 28 | 29 | out.close() 30 | inp.close() 31 | 32 | 33 | def audio2(i, o, format, sr): 34 | inp = av.open(i, "rb") 35 | out = av.open(o, "wb", format=format) 36 | if format == "ogg": 37 | format = "libvorbis" 38 | if format == "f32le": 39 | format = "pcm_f32le" 40 | 41 | ostream = out.add_stream(format, channels=1) 42 | ostream.sample_rate = sr 43 | 44 | for frame in inp.decode(audio=0): 45 | for p in ostream.encode(frame): 46 | out.mux(p) 47 | 48 | out.close() 49 | inp.close() 50 | 51 | 52 | def load_audio(file, sr): 53 | file = ( 54 | file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 55 | ) # 防止小白拷路径头尾带了空格和"和回车 56 | if os.path.exists(file) == False: 57 | raise RuntimeError( 58 | "You input a wrong audio path that does not exists, please fix it!" 59 | ) 60 | try: 61 | with open(file, "rb") as f: 62 | with BytesIO() as out: 63 | audio2(f, out, "f32le", sr) 64 | return np.frombuffer(out.getvalue(), np.float32).flatten() 65 | 66 | except AttributeError: 67 | audio = file[1] / 32768.0 68 | if len(audio.shape) == 2: 69 | audio = np.mean(audio, -1) 70 | return librosa.resample(audio, orig_sr=file[0], target_sr=16000) 71 | 72 | except: 73 | raise RuntimeError(traceback.format_exc()) 74 | -------------------------------------------------------------------------------- /lib/vc/rmvpe.py: -------------------------------------------------------------------------------- 1 | import sys, torch, numpy as np, traceback, pdb 2 | import torch.nn as nn 3 | from time import time as ttime 4 | import torch.nn.functional as F 5 | 6 | 7 | class BiGRU(nn.Module): 8 | def __init__(self, input_features, hidden_features, num_layers): 9 | super(BiGRU, self).__init__() 10 | self.gru = nn.GRU( 11 | input_features, 12 | hidden_features, 13 | num_layers=num_layers, 14 | batch_first=True, 15 | bidirectional=True, 16 | ) 17 | 18 | def forward(self, x): 19 | return self.gru(x)[0] 20 | 21 | 22 | class ConvBlockRes(nn.Module): 23 | def __init__(self, in_channels, out_channels, momentum=0.01): 24 | super(ConvBlockRes, self).__init__() 25 | self.conv = nn.Sequential( 26 | nn.Conv2d( 27 | in_channels=in_channels, 28 | out_channels=out_channels, 29 | kernel_size=(3, 3), 30 | stride=(1, 1), 31 | padding=(1, 1), 32 | bias=False, 33 | ), 34 | nn.BatchNorm2d(out_channels, momentum=momentum), 35 | nn.ReLU(), 36 | nn.Conv2d( 37 | in_channels=out_channels, 38 | out_channels=out_channels, 39 | kernel_size=(3, 3), 40 | stride=(1, 1), 41 | padding=(1, 1), 42 | bias=False, 43 | ), 44 | nn.BatchNorm2d(out_channels, momentum=momentum), 45 | nn.ReLU(), 46 | ) 47 | if in_channels != out_channels: 48 | self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1)) 49 | self.is_shortcut = True 50 | else: 51 | self.is_shortcut = False 52 | 53 | def forward(self, x): 54 | if self.is_shortcut: 55 | return self.conv(x) + self.shortcut(x) 56 | else: 57 | return self.conv(x) + x 58 | 59 | 60 | class 
Encoder(nn.Module): 61 | def __init__( 62 | self, 63 | in_channels, 64 | in_size, 65 | n_encoders, 66 | kernel_size, 67 | n_blocks, 68 | out_channels=16, 69 | momentum=0.01, 70 | ): 71 | super(Encoder, self).__init__() 72 | self.n_encoders = n_encoders 73 | self.bn = nn.BatchNorm2d(in_channels, momentum=momentum) 74 | self.layers = nn.ModuleList() 75 | self.latent_channels = [] 76 | for i in range(self.n_encoders): 77 | self.layers.append( 78 | ResEncoderBlock( 79 | in_channels, out_channels, kernel_size, n_blocks, momentum=momentum 80 | ) 81 | ) 82 | self.latent_channels.append([out_channels, in_size]) 83 | in_channels = out_channels 84 | out_channels *= 2 85 | in_size //= 2 86 | self.out_size = in_size 87 | self.out_channel = out_channels 88 | 89 | def forward(self, x): 90 | concat_tensors = [] 91 | x = self.bn(x) 92 | for i in range(self.n_encoders): 93 | _, x = self.layers[i](x) 94 | concat_tensors.append(_) 95 | return x, concat_tensors 96 | 97 | 98 | class ResEncoderBlock(nn.Module): 99 | def __init__( 100 | self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01 101 | ): 102 | super(ResEncoderBlock, self).__init__() 103 | self.n_blocks = n_blocks 104 | self.conv = nn.ModuleList() 105 | self.conv.append(ConvBlockRes(in_channels, out_channels, momentum)) 106 | for i in range(n_blocks - 1): 107 | self.conv.append(ConvBlockRes(out_channels, out_channels, momentum)) 108 | self.kernel_size = kernel_size 109 | if self.kernel_size is not None: 110 | self.pool = nn.AvgPool2d(kernel_size=kernel_size) 111 | 112 | def forward(self, x): 113 | for i in range(self.n_blocks): 114 | x = self.conv[i](x) 115 | if self.kernel_size is not None: 116 | return x, self.pool(x) 117 | else: 118 | return x 119 | 120 | 121 | class Intermediate(nn.Module): # 122 | def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01): 123 | super(Intermediate, self).__init__() 124 | self.n_inters = n_inters 125 | self.layers = nn.ModuleList() 126 | self.layers.append( 127 | ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum) 128 | ) 129 | for i in range(self.n_inters - 1): 130 | self.layers.append( 131 | ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum) 132 | ) 133 | 134 | def forward(self, x): 135 | for i in range(self.n_inters): 136 | x = self.layers[i](x) 137 | return x 138 | 139 | 140 | class ResDecoderBlock(nn.Module): 141 | def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01): 142 | super(ResDecoderBlock, self).__init__() 143 | out_padding = (0, 1) if stride == (1, 2) else (1, 1) 144 | self.n_blocks = n_blocks 145 | self.conv1 = nn.Sequential( 146 | nn.ConvTranspose2d( 147 | in_channels=in_channels, 148 | out_channels=out_channels, 149 | kernel_size=(3, 3), 150 | stride=stride, 151 | padding=(1, 1), 152 | output_padding=out_padding, 153 | bias=False, 154 | ), 155 | nn.BatchNorm2d(out_channels, momentum=momentum), 156 | nn.ReLU(), 157 | ) 158 | self.conv2 = nn.ModuleList() 159 | self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum)) 160 | for i in range(n_blocks - 1): 161 | self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum)) 162 | 163 | def forward(self, x, concat_tensor): 164 | x = self.conv1(x) 165 | x = torch.cat((x, concat_tensor), dim=1) 166 | for i in range(self.n_blocks): 167 | x = self.conv2[i](x) 168 | return x 169 | 170 | 171 | class Decoder(nn.Module): 172 | def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01): 173 | super(Decoder, 
self).__init__() 174 | self.layers = nn.ModuleList() 175 | self.n_decoders = n_decoders 176 | for i in range(self.n_decoders): 177 | out_channels = in_channels // 2 178 | self.layers.append( 179 | ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum) 180 | ) 181 | in_channels = out_channels 182 | 183 | def forward(self, x, concat_tensors): 184 | for i in range(self.n_decoders): 185 | x = self.layers[i](x, concat_tensors[-1 - i]) 186 | return x 187 | 188 | 189 | class DeepUnet(nn.Module): 190 | def __init__( 191 | self, 192 | kernel_size, 193 | n_blocks, 194 | en_de_layers=5, 195 | inter_layers=4, 196 | in_channels=1, 197 | en_out_channels=16, 198 | ): 199 | super(DeepUnet, self).__init__() 200 | self.encoder = Encoder( 201 | in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels 202 | ) 203 | self.intermediate = Intermediate( 204 | self.encoder.out_channel // 2, 205 | self.encoder.out_channel, 206 | inter_layers, 207 | n_blocks, 208 | ) 209 | self.decoder = Decoder( 210 | self.encoder.out_channel, en_de_layers, kernel_size, n_blocks 211 | ) 212 | 213 | def forward(self, x): 214 | x, concat_tensors = self.encoder(x) 215 | x = self.intermediate(x) 216 | x = self.decoder(x, concat_tensors) 217 | return x 218 | 219 | 220 | class E2E(nn.Module): 221 | def __init__( 222 | self, 223 | n_blocks, 224 | n_gru, 225 | kernel_size, 226 | en_de_layers=5, 227 | inter_layers=4, 228 | in_channels=1, 229 | en_out_channels=16, 230 | ): 231 | super(E2E, self).__init__() 232 | self.unet = DeepUnet( 233 | kernel_size, 234 | n_blocks, 235 | en_de_layers, 236 | inter_layers, 237 | in_channels, 238 | en_out_channels, 239 | ) 240 | self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) 241 | if n_gru: 242 | self.fc = nn.Sequential( 243 | BiGRU(3 * 128, 256, n_gru), 244 | nn.Linear(512, 360), 245 | nn.Dropout(0.25), 246 | nn.Sigmoid(), 247 | ) 248 | else: 249 | self.fc = nn.Sequential( 250 | nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid() 251 | ) 252 | 253 | def forward(self, mel): 254 | mel = mel.transpose(-1, -2).unsqueeze(1) 255 | x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2) 256 | x = self.fc(x) 257 | return x 258 | 259 | 260 | from librosa.filters import mel 261 | 262 | 263 | class MelSpectrogram(torch.nn.Module): 264 | def __init__( 265 | self, 266 | is_half, 267 | n_mel_channels, 268 | sampling_rate, 269 | win_length, 270 | hop_length, 271 | n_fft=None, 272 | mel_fmin=0, 273 | mel_fmax=None, 274 | clamp=1e-5, 275 | ): 276 | super().__init__() 277 | n_fft = win_length if n_fft is None else n_fft 278 | self.hann_window = {} 279 | mel_basis = mel( 280 | sr=sampling_rate, 281 | n_fft=n_fft, 282 | n_mels=n_mel_channels, 283 | fmin=mel_fmin, 284 | fmax=mel_fmax, 285 | htk=True, 286 | ) 287 | mel_basis = torch.from_numpy(mel_basis).float() 288 | self.register_buffer("mel_basis", mel_basis) 289 | self.n_fft = win_length if n_fft is None else n_fft 290 | self.hop_length = hop_length 291 | self.win_length = win_length 292 | self.sampling_rate = sampling_rate 293 | self.n_mel_channels = n_mel_channels 294 | self.clamp = clamp 295 | self.is_half = is_half 296 | 297 | def forward(self, audio, keyshift=0, speed=1, center=True): 298 | factor = 2 ** (keyshift / 12) 299 | n_fft_new = int(np.round(self.n_fft * factor)) 300 | win_length_new = int(np.round(self.win_length * factor)) 301 | hop_length_new = int(np.round(self.hop_length * speed)) 302 | keyshift_key = str(keyshift) + "_" + str(audio.device) 303 | if keyshift_key not in self.hann_window: 304 | 
self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to( 305 | audio.device 306 | ) 307 | fft = torch.stft( 308 | audio, 309 | n_fft=n_fft_new, 310 | hop_length=hop_length_new, 311 | win_length=win_length_new, 312 | window=self.hann_window[keyshift_key], 313 | center=center, 314 | return_complex=True, 315 | ) 316 | magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) 317 | if keyshift != 0: 318 | size = self.n_fft // 2 + 1 319 | resize = magnitude.size(1) 320 | if resize < size: 321 | magnitude = F.pad(magnitude, (0, 0, 0, size - resize)) 322 | magnitude = magnitude[:, :size, :] * self.win_length / win_length_new 323 | mel_output = torch.matmul(self.mel_basis, magnitude) 324 | if self.is_half == True: 325 | mel_output = mel_output.half() 326 | log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp)) 327 | return log_mel_spec 328 | 329 | 330 | class RMVPE: 331 | def __init__(self, model_path, is_half, device=None): 332 | self.resample_kernel = {} 333 | model = E2E(4, 1, (2, 2)) 334 | ckpt = torch.load(model_path, map_location="cpu") 335 | model.load_state_dict(ckpt) 336 | model.eval() 337 | if is_half == True: 338 | model = model.half() 339 | self.model = model 340 | self.resample_kernel = {} 341 | self.is_half = is_half 342 | if device is None: 343 | device = "cuda" if torch.cuda.is_available() else "cpu" 344 | self.device = device 345 | self.mel_extractor = MelSpectrogram( 346 | is_half, 128, 16000, 1024, 160, None, 30, 8000 347 | ).to(device) 348 | self.model = self.model.to(device) 349 | cents_mapping = 20 * np.arange(360) + 1997.3794084376191 350 | self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368 351 | 352 | def mel2hidden(self, mel): 353 | with torch.no_grad(): 354 | n_frames = mel.shape[-1] 355 | mel = F.pad( 356 | mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect" 357 | ) 358 | hidden = self.model(mel) 359 | return hidden[:, :n_frames] 360 | 361 | def decode(self, hidden, thred=0.03): 362 | cents_pred = self.to_local_average_cents(hidden, thred=thred) 363 | f0 = 10 * (2 ** (cents_pred / 1200)) 364 | f0[f0 == 10] = 0 365 | # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred]) 366 | return f0 367 | 368 | def infer_from_audio(self, audio, thred=0.03): 369 | audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0) 370 | # torch.cuda.synchronize() 371 | # t0=ttime() 372 | mel = self.mel_extractor(audio, center=True) 373 | # torch.cuda.synchronize() 374 | # t1=ttime() 375 | hidden = self.mel2hidden(mel) 376 | # torch.cuda.synchronize() 377 | # t2=ttime() 378 | hidden = hidden.squeeze(0).cpu().numpy() 379 | if self.is_half == True: 380 | hidden = hidden.astype("float32") 381 | f0 = self.decode(hidden, thred=thred) 382 | # torch.cuda.synchronize() 383 | # t3=ttime() 384 | # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0)) 385 | return f0 386 | 387 | def to_local_average_cents(self, salience, thred=0.05): 388 | # t0 = ttime() 389 | center = np.argmax(salience, axis=1) # 帧长#index 390 | salience = np.pad(salience, ((0, 0), (4, 4))) # 帧长,368 391 | # t1 = ttime() 392 | center += 4 393 | todo_salience = [] 394 | todo_cents_mapping = [] 395 | starts = center - 4 396 | ends = center + 5 397 | for idx in range(salience.shape[0]): 398 | todo_salience.append(salience[:, starts[idx] : ends[idx]][idx]) 399 | todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]]) 400 | # t2 = ttime() 401 | todo_salience = np.array(todo_salience) # 帧长,9 402 | todo_cents_mapping = 
np.array(todo_cents_mapping) # 帧长,9 403 | product_sum = np.sum(todo_salience * todo_cents_mapping, 1) 404 | weight_sum = np.sum(todo_salience, 1) # 帧长 405 | devided = product_sum / weight_sum # 帧长 406 | # t3 = ttime() 407 | maxx = np.max(salience, axis=1) # 帧长 408 | devided[maxx <= thred] = 0 409 | # t4 = ttime() 410 | # print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) 411 | return devided 412 | 413 | 414 | # if __name__ == '__main__': 415 | # audio, sampling_rate = sf.read("卢本伟语录~1.wav") 416 | # if len(audio.shape) > 1: 417 | # audio = librosa.to_mono(audio.transpose(1, 0)) 418 | # audio_bak = audio.copy() 419 | # if sampling_rate != 16000: 420 | # audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000) 421 | # model_path = "/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/test-RMVPE/weights/rmvpe_llc_half.pt" 422 | # thred = 0.03 # 0.01 423 | # device = 'cuda' if torch.cuda.is_available() else 'cpu' 424 | # rmvpe = RMVPE(model_path,is_half=False, device=device) 425 | # t0=ttime() 426 | # f0 = rmvpe.infer_from_audio(audio, thred=thred) 427 | # f0 = rmvpe.infer_from_audio(audio, thred=thred) 428 | # f0 = rmvpe.infer_from_audio(audio, thred=thred) 429 | # f0 = rmvpe.infer_from_audio(audio, thred=thred) 430 | # f0 = rmvpe.infer_from_audio(audio, thred=thred) 431 | # t1=ttime() 432 | # print(f0.shape,t1-t0) 433 | -------------------------------------------------------------------------------- /lib/vc/settings.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | 3 | def change_audio_mode(vc_audio_mode): 4 | if vc_audio_mode == "Input path": 5 | return ( 6 | # Input & Upload 7 | gr.Textbox(visible=True), 8 | gr.Audio(visible=False), 9 | # Youtube 10 | gr.Dropdown(visible=False), 11 | gr.Textbox(visible=False), 12 | gr.Textbox(visible=False), 13 | gr.Button(visible=False), 14 | # Splitter 15 | gr.Dropdown(visible=False), 16 | gr.Textbox(visible=False), 17 | gr.Button(visible=False), 18 | gr.Audio(visible=False), 19 | gr.Audio(visible=False), 20 | gr.Audio(visible=False), 21 | gr.Slider(visible=False), 22 | gr.Slider(visible=False), 23 | gr.Audio(visible=False), 24 | gr.Button(visible=False), 25 | # TTS 26 | gr.Textbox(visible=False), 27 | gr.Dropdown(visible=False) 28 | ) 29 | elif vc_audio_mode == "Upload audio": 30 | return ( 31 | # Input & Upload 32 | gr.Textbox(visible=False), 33 | gr.Audio(visible=True), 34 | # Youtube 35 | gr.Dropdown(visible=False), 36 | gr.Textbox(visible=False), 37 | gr.Textbox(visible=False), 38 | gr.Button(visible=False), 39 | # Splitter 40 | gr.Dropdown(visible=False), 41 | gr.Textbox(visible=False), 42 | gr.Button(visible=False), 43 | gr.Audio(visible=False), 44 | gr.Audio(visible=False), 45 | gr.Audio(visible=False), 46 | gr.Slider(visible=False), 47 | gr.Slider(visible=False), 48 | gr.Audio(visible=False), 49 | gr.Button(visible=False), 50 | # TTS 51 | gr.Textbox(visible=False), 52 | gr.Dropdown(visible=False) 53 | ) 54 | elif vc_audio_mode == "Youtube": 55 | return ( 56 | # Input & Upload 57 | gr.Textbox(visible=False), 58 | gr.Audio(visible=False), 59 | # Youtube 60 | gr.Dropdown(visible=True), 61 | gr.Textbox(visible=True), 62 | gr.Textbox(visible=True), 63 | gr.Button(visible=True), 64 | # Splitter 65 | gr.Dropdown(visible=True), 66 | gr.Textbox(visible=True), 67 | gr.Button(visible=True), 68 | gr.Audio(visible=True), 69 | gr.Audio(visible=True), 70 | gr.Audio(visible=True), 71 | gr.Slider(visible=True), 72 | gr.Slider(visible=True), 73 | gr.Audio(visible=True), 74 
| gr.Button(visible=True), 75 | # TTS 76 | gr.Textbox(visible=False), 77 | gr.Dropdown(visible=False) 78 | ) 79 | elif vc_audio_mode == "TTS Audio": 80 | return ( 81 | # Input & Upload 82 | gr.Textbox(visible=False), 83 | gr.Audio(visible=False), 84 | # Youtube 85 | gr.Dropdown(visible=False), 86 | gr.Textbox(visible=False), 87 | gr.Textbox(visible=False), 88 | gr.Button(visible=False), 89 | # Splitter 90 | gr.Dropdown(visible=False), 91 | gr.Textbox(visible=False), 92 | gr.Button(visible=False), 93 | gr.Audio(visible=False), 94 | gr.Audio(visible=False), 95 | gr.Audio(visible=False), 96 | gr.Slider(visible=False), 97 | gr.Slider(visible=False), 98 | gr.Audio(visible=False), 99 | gr.Button(visible=False), 100 | # TTS 101 | gr.Textbox(visible=True), 102 | gr.Dropdown(visible=True) 103 | ) -------------------------------------------------------------------------------- /lib/vc/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import wave 3 | import subprocess 4 | import yt_dlp 5 | import ffmpeg 6 | import logging 7 | from fairseq import checkpoint_utils 8 | logger = logging.getLogger(__name__) 9 | 10 | def load_hubert(config): 11 | path_check = os.path.exists("assets/hubert/hubert_base.pt") 12 | if path_check is False: 13 | logger.warn("hubert_base.pt is missing. Please check the documentation for to get it.") 14 | else: 15 | logger.info("hubert_base.pt found.") 16 | models, _, _ = checkpoint_utils.load_model_ensemble_and_task( 17 | [os.path.join("assets", "hubert", "hubert_base.pt")], 18 | suffix="", 19 | ) 20 | hubert_model = models[0] 21 | hubert_model = hubert_model.to(config.device) 22 | if config.is_half: 23 | hubert_model = hubert_model.half() 24 | else: 25 | hubert_model = hubert_model.float() 26 | hubert_model.eval() 27 | return hubert_model 28 | 29 | def download_audio(url, audio_provider): 30 | logs = [] 31 | if url == "": 32 | logs.append("URL required!") 33 | yield None, "\n".join(logs) 34 | return None, "\n".join(logs) 35 | if not os.path.exists("yt"): 36 | os.mkdir("yt") 37 | if audio_provider == "Youtube": 38 | logs.append("Downloading the audio...") 39 | yield None, "\n".join(logs) 40 | ydl_opts = { 41 | 'noplaylist': True, 42 | 'format': 'bestaudio/best', 43 | 'postprocessors': [{ 44 | 'key': 'FFmpegExtractAudio', 45 | 'preferredcodec': 'wav', 46 | }], 47 | "outtmpl": 'yt/audio', 48 | } 49 | audio_path = "yt/audio.wav" 50 | with yt_dlp.YoutubeDL(ydl_opts) as ydl: 51 | ydl.download([url]) 52 | logs.append("Download Complete.") 53 | yield audio_path, "\n".join(logs) 54 | 55 | def cut_vocal_and_inst(split_model): 56 | logs = [] 57 | logs.append("Starting the audio splitting process...") 58 | yield "\n".join(logs), None, None, None 59 | command = f"demucs --two-stems=vocals -n {split_model} yt/audio.wav -o output" 60 | result = subprocess.Popen(command.split(), stdout=subprocess.PIPE, text=True) 61 | for line in result.stdout: 62 | logs.append(line) 63 | yield "\n".join(logs), None, None, None 64 | logger.info(result.stdout) 65 | vocal = f"output/{split_model}/audio/vocals.wav" 66 | inst = f"output/{split_model}/audio/no_vocals.wav" 67 | logs.append("Audio splitting complete.") 68 | yield "\n".join(logs), vocal, inst, vocal 69 | 70 | def combine_vocal_and_inst(audio_data, vocal_volume, inst_volume, split_model): 71 | if not os.path.exists("output/result"): 72 | os.mkdir("output/result") 73 | vocal_path = "output/result/output.wav" 74 | output_path = "output/result/combine.mp3" 75 | inst_path = 
f"output/{split_model}/audio/no_vocals.wav" 76 | with wave.open(vocal_path, "w") as wave_file: 77 | wave_file.setnchannels(1) 78 | wave_file.setsampwidth(2) 79 | wave_file.setframerate(audio_data[0]) 80 | wave_file.writeframes(audio_data[1].tobytes()) 81 | command = f'ffmpeg -y -i {inst_path} -i {vocal_path} -filter_complex [0:a]volume={inst_volume}[i];[1:a]volume={vocal_volume}[v];[i][v]amix=inputs=2:duration=longest[a] -map [a] -b:a 320k -c:a libmp3lame {output_path}' 82 | result = subprocess.run(command.split(), stdout=subprocess.PIPE) 83 | logger.info(result.stdout.decode()) 84 | return output_path -------------------------------------------------------------------------------- /lib/vc/vc_infer_pipeline.py: -------------------------------------------------------------------------------- 1 | import numpy as np, parselmouth, torch, pdb, sys, os 2 | from time import time as ttime 3 | import torch.nn.functional as F 4 | import scipy.signal as signal 5 | import pyworld, os, traceback, faiss, librosa, torchcrepe 6 | from scipy import signal 7 | from functools import lru_cache 8 | 9 | now_dir = os.getcwd() 10 | sys.path.append(now_dir) 11 | 12 | bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) 13 | 14 | input_audio_path2wav = {} 15 | 16 | 17 | @lru_cache 18 | def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period): 19 | audio = input_audio_path2wav[input_audio_path] 20 | f0, t = pyworld.harvest( 21 | audio, 22 | fs=fs, 23 | f0_ceil=f0max, 24 | f0_floor=f0min, 25 | frame_period=frame_period, 26 | ) 27 | f0 = pyworld.stonemask(audio, f0, t, fs) 28 | return f0 29 | 30 | 31 | def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate是2的占比 32 | # print(data1.max(),data2.max()) 33 | rms1 = librosa.feature.rms( 34 | y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2 35 | ) # 每半秒一个点 36 | rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2) 37 | rms1 = torch.from_numpy(rms1) 38 | rms1 = F.interpolate( 39 | rms1.unsqueeze(0), size=data2.shape[0], mode="linear" 40 | ).squeeze() 41 | rms2 = torch.from_numpy(rms2) 42 | rms2 = F.interpolate( 43 | rms2.unsqueeze(0), size=data2.shape[0], mode="linear" 44 | ).squeeze() 45 | rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6) 46 | data2 *= ( 47 | torch.pow(rms1, torch.tensor(1 - rate)) 48 | * torch.pow(rms2, torch.tensor(rate - 1)) 49 | ).numpy() 50 | return data2 51 | 52 | 53 | class VC(object): 54 | def __init__(self, tgt_sr, config): 55 | self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = ( 56 | config.x_pad, 57 | config.x_query, 58 | config.x_center, 59 | config.x_max, 60 | config.is_half, 61 | ) 62 | self.sr = 16000 # hubert输入采样率 63 | self.window = 160 # 每帧点数 64 | self.t_pad = self.sr * self.x_pad # 每条前后pad时间 65 | self.t_pad_tgt = tgt_sr * self.x_pad 66 | self.t_pad2 = self.t_pad * 2 67 | self.t_query = self.sr * self.x_query # 查询切点前后查询时间 68 | self.t_center = self.sr * self.x_center # 查询切点位置 69 | self.t_max = self.sr * self.x_max # 免查询时长阈值 70 | self.device = config.device 71 | 72 | def get_f0( 73 | self, 74 | input_audio_path, 75 | x, 76 | p_len, 77 | f0_up_key, 78 | f0_method, 79 | filter_radius, 80 | inp_f0=None, 81 | ): 82 | global input_audio_path2wav 83 | time_step = self.window / self.sr * 1000 84 | f0_min = 50 85 | f0_max = 1100 86 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 87 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 88 | if f0_method == "pm": 89 | f0 = ( 90 | parselmouth.Sound(x, self.sr) 91 | .to_pitch_ac( 92 | time_step=time_step / 1000, 93 | 
voicing_threshold=0.6, 94 | pitch_floor=f0_min, 95 | pitch_ceiling=f0_max, 96 | ) 97 | .selected_array["frequency"] 98 | ) 99 | pad_size = (p_len - len(f0) + 1) // 2 100 | if pad_size > 0 or p_len - len(f0) - pad_size > 0: 101 | f0 = np.pad( 102 | f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" 103 | ) 104 | elif f0_method == "harvest": 105 | input_audio_path2wav[input_audio_path] = x.astype(np.double) 106 | f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10) 107 | if filter_radius > 2: 108 | f0 = signal.medfilt(f0, 3) 109 | elif f0_method == "crepe": 110 | model = "full" 111 | # Pick a batch size that doesn't cause memory errors on your gpu 112 | batch_size = 512 113 | # Compute pitch using first gpu 114 | audio = torch.tensor(np.copy(x))[None].float() 115 | f0, pd = torchcrepe.predict( 116 | audio, 117 | self.sr, 118 | self.window, 119 | f0_min, 120 | f0_max, 121 | model, 122 | batch_size=batch_size, 123 | device=self.device, 124 | return_periodicity=True, 125 | ) 126 | pd = torchcrepe.filter.median(pd, 3) 127 | f0 = torchcrepe.filter.mean(f0, 3) 128 | f0[pd < 0.1] = 0 129 | f0 = f0[0].cpu().numpy() 130 | elif f0_method == "rmvpe": 131 | if hasattr(self, "model_rmvpe") == False: 132 | from rmvpe import RMVPE 133 | 134 | print("loading rmvpe model") 135 | self.model_rmvpe = RMVPE( 136 | os.path.join("assets", "rvmpe", "rmvpe.pt"), is_half=self.is_half, device=self.device 137 | ) 138 | f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) 139 | f0 *= pow(2, f0_up_key / 12) 140 | # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) 141 | tf0 = self.sr // self.window # 每秒f0点数 142 | if inp_f0 is not None: 143 | delta_t = np.round( 144 | (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 145 | ).astype("int16") 146 | replace_f0 = np.interp( 147 | list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1] 148 | ) 149 | shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0] 150 | f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[ 151 | :shape 152 | ] 153 | # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) 154 | f0bak = f0.copy() 155 | f0_mel = 1127 * np.log(1 + f0 / 700) 156 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( 157 | f0_mel_max - f0_mel_min 158 | ) + 1 159 | f0_mel[f0_mel <= 1] = 1 160 | f0_mel[f0_mel > 255] = 255 161 | f0_coarse = np.rint(f0_mel).astype(np.int) 162 | return f0_coarse, f0bak # 1-0 163 | 164 | def vc( 165 | self, 166 | model, 167 | net_g, 168 | sid, 169 | audio0, 170 | pitch, 171 | pitchf, 172 | times, 173 | index, 174 | big_npy, 175 | index_rate, 176 | version, 177 | protect, 178 | ): # ,file_index,file_big_npy 179 | feats = torch.from_numpy(audio0) 180 | if self.is_half: 181 | feats = feats.half() 182 | else: 183 | feats = feats.float() 184 | if feats.dim() == 2: # double channels 185 | feats = feats.mean(-1) 186 | assert feats.dim() == 1, feats.dim() 187 | feats = feats.view(1, -1) 188 | padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) 189 | 190 | inputs = { 191 | "source": feats.to(self.device), 192 | "padding_mask": padding_mask, 193 | "output_layer": 9 if version == "v1" else 12, 194 | } 195 | t0 = ttime() 196 | with torch.no_grad(): 197 | logits = model.extract_features(**inputs) 198 | feats = model.final_proj(logits[0]) if version == "v1" else logits[0] 199 | if protect < 0.5 and pitch != None and pitchf != None: 200 | feats0 = feats.clone() 201 | if ( 202 | isinstance(index, type(None)) 
== False 203 | and isinstance(big_npy, type(None)) == False 204 | and index_rate != 0 205 | ): 206 | npy = feats[0].cpu().numpy() 207 | if self.is_half: 208 | npy = npy.astype("float32") 209 | 210 | # _, I = index.search(npy, 1) 211 | # npy = big_npy[I.squeeze()] 212 | 213 | score, ix = index.search(npy, k=8) 214 | weight = np.square(1 / score) 215 | weight /= weight.sum(axis=1, keepdims=True) 216 | npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) 217 | 218 | if self.is_half: 219 | npy = npy.astype("float16") 220 | feats = ( 221 | torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate 222 | + (1 - index_rate) * feats 223 | ) 224 | 225 | feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) 226 | if protect < 0.5 and pitch != None and pitchf != None: 227 | feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute( 228 | 0, 2, 1 229 | ) 230 | t1 = ttime() 231 | p_len = audio0.shape[0] // self.window 232 | if feats.shape[1] < p_len: 233 | p_len = feats.shape[1] 234 | if pitch != None and pitchf != None: 235 | pitch = pitch[:, :p_len] 236 | pitchf = pitchf[:, :p_len] 237 | 238 | if protect < 0.5 and pitch != None and pitchf != None: 239 | pitchff = pitchf.clone() 240 | pitchff[pitchf > 0] = 1 241 | pitchff[pitchf < 1] = protect 242 | pitchff = pitchff.unsqueeze(-1) 243 | feats = feats * pitchff + feats0 * (1 - pitchff) 244 | feats = feats.to(feats0.dtype) 245 | p_len = torch.tensor([p_len], device=self.device).long() 246 | with torch.no_grad(): 247 | if pitch != None and pitchf != None: 248 | audio1 = ( 249 | (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]) 250 | .data.cpu() 251 | .float() 252 | .numpy() 253 | ) 254 | else: 255 | audio1 = ( 256 | (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy() 257 | ) 258 | del feats, p_len, padding_mask 259 | if torch.cuda.is_available(): 260 | torch.cuda.empty_cache() 261 | t2 = ttime() 262 | times[0] += t1 - t0 263 | times[2] += t2 - t1 264 | return audio1 265 | 266 | def pipeline( 267 | self, 268 | model, 269 | net_g, 270 | sid, 271 | audio, 272 | input_audio_path, 273 | times, 274 | f0_up_key, 275 | f0_method, 276 | file_index, 277 | # file_big_npy, 278 | index_rate, 279 | if_f0, 280 | filter_radius, 281 | tgt_sr, 282 | resample_sr, 283 | rms_mix_rate, 284 | version, 285 | protect, 286 | f0_file=None, 287 | ): 288 | if ( 289 | file_index != "" 290 | # and file_big_npy != "" 291 | # and os.path.exists(file_big_npy) == True 292 | and os.path.exists(file_index) == True 293 | and index_rate != 0 294 | ): 295 | try: 296 | index = faiss.read_index(file_index) 297 | # big_npy = np.load(file_big_npy) 298 | big_npy = index.reconstruct_n(0, index.ntotal) 299 | except: 300 | traceback.print_exc() 301 | index = big_npy = None 302 | else: 303 | index = big_npy = None 304 | audio = signal.filtfilt(bh, ah, audio) 305 | audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") 306 | opt_ts = [] 307 | if audio_pad.shape[0] > self.t_max: 308 | audio_sum = np.zeros_like(audio) 309 | for i in range(self.window): 310 | audio_sum += audio_pad[i : i - self.window] 311 | for t in range(self.t_center, audio.shape[0], self.t_center): 312 | opt_ts.append( 313 | t 314 | - self.t_query 315 | + np.where( 316 | np.abs(audio_sum[t - self.t_query : t + self.t_query]) 317 | == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min() 318 | )[0][0] 319 | ) 320 | s = 0 321 | audio_opt = [] 322 | t = None 323 | t1 = ttime() 324 | audio_pad = np.pad(audio, (self.t_pad, 
self.t_pad), mode="reflect") 325 | p_len = audio_pad.shape[0] // self.window 326 | inp_f0 = None 327 | if hasattr(f0_file, "name") == True: 328 | try: 329 | with open(f0_file.name, "r") as f: 330 | lines = f.read().strip("\n").split("\n") 331 | inp_f0 = [] 332 | for line in lines: 333 | inp_f0.append([float(i) for i in line.split(",")]) 334 | inp_f0 = np.array(inp_f0, dtype="float32") 335 | except: 336 | traceback.print_exc() 337 | sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() 338 | pitch, pitchf = None, None 339 | if if_f0 == 1: 340 | pitch, pitchf = self.get_f0( 341 | input_audio_path, 342 | audio_pad, 343 | p_len, 344 | f0_up_key, 345 | f0_method, 346 | filter_radius, 347 | inp_f0, 348 | ) 349 | pitch = pitch[:p_len] 350 | pitchf = pitchf[:p_len] 351 | if self.device == "mps": 352 | pitchf = pitchf.astype(np.float32) 353 | pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() 354 | pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() 355 | t2 = ttime() 356 | times[1] += t2 - t1 357 | for t in opt_ts: 358 | t = t // self.window * self.window 359 | if if_f0 == 1: 360 | audio_opt.append( 361 | self.vc( 362 | model, 363 | net_g, 364 | sid, 365 | audio_pad[s : t + self.t_pad2 + self.window], 366 | pitch[:, s // self.window : (t + self.t_pad2) // self.window], 367 | pitchf[:, s // self.window : (t + self.t_pad2) // self.window], 368 | times, 369 | index, 370 | big_npy, 371 | index_rate, 372 | version, 373 | protect, 374 | )[self.t_pad_tgt : -self.t_pad_tgt] 375 | ) 376 | else: 377 | audio_opt.append( 378 | self.vc( 379 | model, 380 | net_g, 381 | sid, 382 | audio_pad[s : t + self.t_pad2 + self.window], 383 | None, 384 | None, 385 | times, 386 | index, 387 | big_npy, 388 | index_rate, 389 | version, 390 | protect, 391 | )[self.t_pad_tgt : -self.t_pad_tgt] 392 | ) 393 | s = t 394 | if if_f0 == 1: 395 | audio_opt.append( 396 | self.vc( 397 | model, 398 | net_g, 399 | sid, 400 | audio_pad[t:], 401 | pitch[:, t // self.window :] if t is not None else pitch, 402 | pitchf[:, t // self.window :] if t is not None else pitchf, 403 | times, 404 | index, 405 | big_npy, 406 | index_rate, 407 | version, 408 | protect, 409 | )[self.t_pad_tgt : -self.t_pad_tgt] 410 | ) 411 | else: 412 | audio_opt.append( 413 | self.vc( 414 | model, 415 | net_g, 416 | sid, 417 | audio_pad[t:], 418 | None, 419 | None, 420 | times, 421 | index, 422 | big_npy, 423 | index_rate, 424 | version, 425 | protect, 426 | )[self.t_pad_tgt : -self.t_pad_tgt] 427 | ) 428 | audio_opt = np.concatenate(audio_opt) 429 | if rms_mix_rate != 1: 430 | audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate) 431 | if resample_sr >= 16000 and tgt_sr != resample_sr: 432 | audio_opt = librosa.resample( 433 | audio_opt, orig_sr=tgt_sr, target_sr=resample_sr 434 | ) 435 | audio_max = np.abs(audio_opt).max() / 0.99 436 | max_int16 = 32768 437 | if audio_max > 1: 438 | max_int16 /= audio_max 439 | audio_opt = (audio_opt * max_int16).astype(np.int16) 440 | del pitch, pitchf, sid 441 | if torch.cuda.is_available(): 442 | torch.cuda.empty_cache() 443 | return audio_opt 444 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | wheel 2 | setuptools 3 | ffmpeg 4 | numba==0.56.4 5 | numpy==1.23.5 6 | scipy 7 | librosa==0.9.1 8 | fairseq==0.12.2 9 | faiss-cpu==1.7.3 10 | gradio>=4.19.2 11 | pyworld==0.3.2 12 | soundfile>=0.12.1 13 | praat-parselmouth>=0.4.2 14 |
httpx==0.23.0 15 | tensorboard 16 | tensorboardX 17 | torchcrepe 18 | onnxruntime 19 | demucs 20 | edge-tts 21 | yt_dlp 22 | pytube 23 | av 24 | -------------------------------------------------------------------------------- /run.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Activate virtual environment 4 | call .venv\Scripts\activate 5 | 6 | REM Run the inference script 7 | python app.py -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | # Activate virtual environment 2 | source .venv/bin/activate 3 | 4 | # Run the inference script 5 | python app.py 6 | -------------------------------------------------------------------------------- /start.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | rem Check if Python exists 4 | python --version > NUL 2>&1 5 | IF ERRORLEVEL 1 ( 6 | ECHO Python is not installed. Please install Python before running this script. 7 | EXIT /B 1 8 | ) 9 | 10 | rem Create virtual environment (.venv) 11 | python -m venv .venv 12 | 13 | rem Activate virtual environment 14 | call .venv\Scripts\activate 15 | 16 | rem Check for Nvidia GPU using nvidia-smi 17 | nvidia-smi > NUL 2>&1 18 | IF ERRORLEVEL 1 ( 19 | rem Install CPU version 20 | pip install torch torchvision torchaudio -f https://download.pytorch.org/whl/cpu 21 | ) ELSE ( 22 | rem Install GPU version 23 | pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121 24 | ) 25 | 26 | rem Install dependencies from requirements.txt 27 | pip install -r requirements.txt 28 | 29 | rem Download required voice models 30 | powershell -Command "Invoke-WebRequest https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt?download=true -OutFile assets/hubert/hubert_base.pt" 31 | powershell -Command "Invoke-WebRequest https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt?download=true -OutFile assets/rvmpe/rmvpe.pt" 32 | 33 | rem Run the inference app 34 | python app.py 35 | 36 | ECHO Finished! -------------------------------------------------------------------------------- /start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if Python exists 4 | if ! command -v python &> /dev/null; then 5 | echo "Python is not installed. Please install Python before running this script." 6 | exit 1 7 | fi 8 | 9 | # Create virtual environment (.venv) 10 | python -m venv .venv 11 | 12 | # Activate virtual environment 13 | source .venv/bin/activate 14 | 15 | # Check for Nvidia GPU using nvidia-smi 16 | if nvidia-smi &> /dev/null; then 17 | # Install GPU version 18 | pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121 19 | else 20 | # Install CPU version 21 | pip install torch torchvision torchaudio -f https://download.pytorch.org/whl/cpu 22 | fi 23 | 24 | # Install dependencies from requirements.txt 25 | pip install -r requirements.txt 26 | 27 | # Download required voice models 28 | wget https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt?download=true -O assets/hubert/hubert_base.pt 29 | wget https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt?download=true -O assets/rvmpe/rmvpe.pt 30 | 31 | 32 | # Run the inference app 33 | python app.py 34 | 35 | echo "Finished!"
36 | -------------------------------------------------------------------------------- /weights/folder_info.json: -------------------------------------------------------------------------------- 1 | { 2 | "CATEGORY_TAB_NAME":{ 3 | "enable": true, 4 | "title": "CATEGORY_TITLE", 5 | "folder_path": "CATEGORY_FOLDER_PATH", 6 | "description": "CATEGORY_DESCRIPTION" 7 | } 8 | } -------------------------------------------------------------------------------- /weights/model_pack/model_info.json: -------------------------------------------------------------------------------- 1 | { 2 | "FOLDER_PATH": { 3 | "enable": true, 4 | "model_path": "CHARACTER_BASEMODEL", 5 | "title": "CHARACTER_NAME", 6 | "cover": "CHARACTER_IMAGE", 7 | "feature_retrieval_library": "CHARACTER_MODEL_INDEX", 8 | "author": "MODEL_AUTHOR" 9 | } 10 | } --------------------------------------------------------------------------------
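
Note: the two JSON files above are templates; every UPPER_CASE value is a placeholder that has to be replaced with your own model data before the app can list a model. A minimal, hypothetical filled-in pair might look like the sketch below (the category name, folder, and file names are purely illustrative and are not files shipped with this repository):

/weights/folder_info.json (illustrative example)
{
    "model_pack": {
        "enable": true,
        "title": "Model Pack",
        "folder_path": "model_pack",
        "description": "Example category containing one character model"
    }
}

/weights/model_pack/model_info.json (illustrative example)
{
    "example_character": {
        "enable": true,
        "model_path": "example_character.pth",
        "title": "Example Character",
        "cover": "cover.png",
        "feature_retrieval_library": "example_character.index",
        "author": "example_author"
    }
}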