├── .funcignore ├── .gitignore ├── .vscode ├── extensions.json ├── launch.json ├── settings.json └── tasks.json ├── README.md ├── api.py ├── attentions.py ├── clean ├── __init__.py └── function.json ├── cleankr ├── __init__.py └── function.json ├── commons.py ├── host.json ├── models.py ├── modules.py ├── monotonic_align ├── __init__.py └── core.py ├── requirements.txt ├── speak ├── __init__.py └── function.json ├── speak2 ├── __init__.py └── function.json ├── speakkr ├── __init__.py └── function.json ├── text ├── LICENSE ├── __init__.py └── cleaners.py ├── transforms.py └── utils.py /.funcignore: -------------------------------------------------------------------------------- 1 | .git* 2 | .vscode 3 | local.settings.json 4 | test 5 | .venv 6 | .DS_Store 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Ww][Ii][Nn]32/ 27 | [Aa][Rr][Mm]/ 28 | [Aa][Rr][Mm]64/ 29 | bld/ 30 | [Bb]in/ 31 | [Oo]bj/ 32 | [Oo]ut/ 33 | [Ll]og/ 34 | [Ll]ogs/ 35 | 36 | # Visual Studio 2015/2017 cache/options directory 37 | .vs/ 38 | # Uncomment if you have tasks that create the project's static files in wwwroot 39 | #wwwroot/ 40 | 41 | # Visual Studio 2017 auto generated files 42 | Generated\ Files/ 43 | 44 | # MSTest test Results 45 | [Tt]est[Rr]esult*/ 46 | [Bb]uild[Ll]og.* 47 | 48 | # NUnit 49 | *.VisualState.xml 50 | TestResult.xml 51 | nunit-*.xml 52 | 53 | # Build Results of an ATL Project 54 | [Dd]ebugPS/ 55 | [Rr]eleasePS/ 56 | dlldata.c 57 | 58 | # Benchmark Results 59 | BenchmarkDotNet.Artifacts/ 60 | 61 | # .NET Core 62 | project.lock.json 63 | project.fragment.lock.json 64 | artifacts/ 65 | 66 | # ASP.NET Scaffolding 67 | ScaffoldingReadMe.txt 68 | 69 | # StyleCop 70 | StyleCopReport.xml 71 | 72 | # Files built by Visual Studio 73 | *_i.c 74 | *_p.c 75 | *_h.h 76 | *.ilk 77 | *.meta 78 | *.obj 79 | *.iobj 80 | *.pch 81 | *.pdb 82 | *.ipdb 83 | *.pgc 84 | *.pgd 85 | *.rsp 86 | *.sbr 87 | *.tlb 88 | *.tli 89 | *.tlh 90 | *.tmp 91 | *.tmp_proj 92 | *_wpftmp.csproj 93 | *.log 94 | *.vspscc 95 | *.vssscc 96 | .builds 97 | *.pidb 98 | *.svclog 99 | *.scc 100 | 101 | # Chutzpah Test files 102 | _Chutzpah* 103 | 104 | # Visual C++ cache files 105 | ipch/ 106 | *.aps 107 | *.ncb 108 | *.opendb 109 | *.opensdf 110 | *.sdf 111 | *.cachefile 112 | *.VC.db 113 | *.VC.VC.opendb 114 | 115 | # Visual Studio profiler 116 | *.psess 117 | *.vsp 118 | *.vspx 119 | *.sap 120 | 121 | # Visual Studio Trace Files 122 | *.e2e 123 | 124 | # TFS 2012 Local Workspace 125 | $tf/ 126 | 127 | # Guidance Automation Toolkit 128 | *.gpState 129 | 130 | # ReSharper is a .NET coding add-in 131 | _ReSharper*/ 132 | *.[Rr]e[Ss]harper 133 | *.DotSettings.user 134 | 135 | # TeamCity is a build add-in 136 | _TeamCity* 137 | 138 | # DotCover is a Code Coverage Tool 139 | *.dotCover 140 | 141 | # AxoCover is a Code Coverage Tool 142 | .axoCover/* 
143 | !.axoCover/settings.json 144 | 145 | # Coverlet is a free, cross platform Code Coverage Tool 146 | coverage*.json 147 | coverage*.xml 148 | coverage*.info 149 | 150 | # Visual Studio code coverage results 151 | *.coverage 152 | *.coveragexml 153 | 154 | # NCrunch 155 | _NCrunch_* 156 | .*crunch*.local.xml 157 | nCrunchTemp_* 158 | 159 | # MightyMoose 160 | *.mm.* 161 | AutoTest.Net/ 162 | 163 | # Web workbench (sass) 164 | .sass-cache/ 165 | 166 | # Installshield output folder 167 | [Ee]xpress/ 168 | 169 | # DocProject is a documentation generator add-in 170 | DocProject/buildhelp/ 171 | DocProject/Help/*.HxT 172 | DocProject/Help/*.HxC 173 | DocProject/Help/*.hhc 174 | DocProject/Help/*.hhk 175 | DocProject/Help/*.hhp 176 | DocProject/Help/Html2 177 | DocProject/Help/html 178 | 179 | # Click-Once directory 180 | publish/ 181 | 182 | # Publish Web Output 183 | *.[Pp]ublish.xml 184 | *.azurePubxml 185 | # Note: Comment the next line if you want to checkin your web deploy settings, 186 | # but database connection strings (with potential passwords) will be unencrypted 187 | *.pubxml 188 | *.publishproj 189 | 190 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 191 | # checkin your Azure Web App publish settings, but sensitive information contained 192 | # in these scripts will be unencrypted 193 | PublishScripts/ 194 | 195 | # NuGet Packages 196 | *.nupkg 197 | # NuGet Symbol Packages 198 | *.snupkg 199 | # The packages folder can be ignored because of Package Restore 200 | **/[Pp]ackages/* 201 | # except build/, which is used as an MSBuild target. 202 | !**/[Pp]ackages/build/ 203 | # Uncomment if necessary however generally it will be regenerated when needed 204 | #!**/[Pp]ackages/repositories.config 205 | # NuGet v3's project.json files produces more ignorable files 206 | *.nuget.props 207 | *.nuget.targets 208 | 209 | # Microsoft Azure Build Output 210 | csx/ 211 | *.build.csdef 212 | 213 | # Microsoft Azure Emulator 214 | ecf/ 215 | rcf/ 216 | 217 | # Windows Store app package directories and files 218 | AppPackages/ 219 | BundleArtifacts/ 220 | Package.StoreAssociation.xml 221 | _pkginfo.txt 222 | *.appx 223 | *.appxbundle 224 | *.appxupload 225 | 226 | # Visual Studio cache files 227 | # files ending in .cache can be ignored 228 | *.[Cc]ache 229 | # but keep track of directories ending in .cache 230 | !?*.[Cc]ache/ 231 | 232 | # Others 233 | ClientBin/ 234 | ~$* 235 | *~ 236 | *.dbmdl 237 | *.dbproj.schemaview 238 | *.jfm 239 | *.pfx 240 | *.publishsettings 241 | orleans.codegen.cs 242 | 243 | # Including strong name files can present a security risk 244 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 245 | #*.snk 246 | 247 | # Since there are multiple workflows, uncomment next line to ignore bower_components 248 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 249 | #bower_components/ 250 | 251 | # RIA/Silverlight projects 252 | Generated_Code/ 253 | 254 | # Backup & report files from converting an old project file 255 | # to a newer Visual Studio version. 
Backup files are not needed, 256 | # because we have git ;-) 257 | _UpgradeReport_Files/ 258 | Backup*/ 259 | UpgradeLog*.XML 260 | UpgradeLog*.htm 261 | ServiceFabricBackup/ 262 | *.rptproj.bak 263 | 264 | # SQL Server files 265 | *.mdf 266 | *.ldf 267 | *.ndf 268 | 269 | # Business Intelligence projects 270 | *.rdl.data 271 | *.bim.layout 272 | *.bim_*.settings 273 | *.rptproj.rsuser 274 | *- [Bb]ackup.rdl 275 | *- [Bb]ackup ([0-9]).rdl 276 | *- [Bb]ackup ([0-9][0-9]).rdl 277 | 278 | # Microsoft Fakes 279 | FakesAssemblies/ 280 | 281 | # GhostDoc plugin setting file 282 | *.GhostDoc.xml 283 | 284 | # Node.js Tools for Visual Studio 285 | .ntvs_analysis.dat 286 | node_modules/ 287 | 288 | # Visual Studio 6 build log 289 | *.plg 290 | 291 | # Visual Studio 6 workspace options file 292 | *.opt 293 | 294 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 295 | *.vbw 296 | 297 | # Visual Studio LightSwitch build output 298 | **/*.HTMLClient/GeneratedArtifacts 299 | **/*.DesktopClient/GeneratedArtifacts 300 | **/*.DesktopClient/ModelManifest.xml 301 | **/*.Server/GeneratedArtifacts 302 | **/*.Server/ModelManifest.xml 303 | _Pvt_Extensions 304 | 305 | # Paket dependency manager 306 | .paket/paket.exe 307 | paket-files/ 308 | 309 | # FAKE - F# Make 310 | .fake/ 311 | 312 | # CodeRush personal settings 313 | .cr/personal 314 | 315 | # Python Tools for Visual Studio (PTVS) 316 | __pycache__/ 317 | *.pyc 318 | 319 | # Cake - Uncomment if you are using it 320 | # tools/** 321 | # !tools/packages.config 322 | 323 | # Tabs Studio 324 | *.tss 325 | 326 | # Telerik's JustMock configuration file 327 | *.jmconfig 328 | 329 | # BizTalk build output 330 | *.btp.cs 331 | *.btm.cs 332 | *.odx.cs 333 | *.xsd.cs 334 | 335 | # OpenCover UI analysis results 336 | OpenCover/ 337 | 338 | # Azure Stream Analytics local run output 339 | ASALocalRun/ 340 | 341 | # MSBuild Binary and Structured Log 342 | *.binlog 343 | 344 | # NVidia Nsight GPU debugger configuration file 345 | *.nvuser 346 | 347 | # MFractors (Xamarin productivity tool) working folder 348 | .mfractor/ 349 | 350 | # Local History for Visual Studio 351 | .localhistory/ 352 | 353 | # BeatPulse healthcheck temp database 354 | healthchecksdb 355 | 356 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 357 | MigrationBackup/ 358 | 359 | # Ionide (cross platform F# VS Code tools) working folder 360 | .ionide/ 361 | 362 | # Fody - auto-generated XML schema 363 | FodyWeavers.xsd 364 | 365 | # build 366 | build 367 | monotonic_align/core.c 368 | *.o 369 | *.so 370 | *.dll 371 | 372 | # Byte-compiled / optimized / DLL files 373 | __pycache__/ 374 | *.py[cod] 375 | *$py.class 376 | 377 | # C extensions 378 | *.so 379 | 380 | # Distribution / packaging 381 | .Python 382 | build/ 383 | develop-eggs/ 384 | dist/ 385 | downloads/ 386 | eggs/ 387 | .eggs/ 388 | lib/ 389 | lib64/ 390 | parts/ 391 | sdist/ 392 | var/ 393 | wheels/ 394 | pip-wheel-metadata/ 395 | share/python-wheels/ 396 | *.egg-info/ 397 | .installed.cfg 398 | *.egg 399 | MANIFEST 400 | 401 | # PyInstaller 402 | # Usually these files are written by a python script from a template 403 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
404 | *.manifest 405 | *.spec 406 | 407 | # Installer logs 408 | pip-log.txt 409 | pip-delete-this-directory.txt 410 | 411 | # Unit test / coverage reports 412 | htmlcov/ 413 | .tox/ 414 | .nox/ 415 | .coverage 416 | .coverage.* 417 | .cache 418 | nosetests.xml 419 | coverage.xml 420 | *.cover 421 | .hypothesis/ 422 | .pytest_cache/ 423 | 424 | # Translations 425 | *.mo 426 | *.pot 427 | 428 | # Django stuff: 429 | *.log 430 | local_settings.py 431 | db.sqlite3 432 | 433 | # Flask stuff: 434 | instance/ 435 | .webassets-cache 436 | 437 | # Scrapy stuff: 438 | .scrapy 439 | 440 | # Sphinx documentation 441 | docs/_build/ 442 | 443 | # PyBuilder 444 | target/ 445 | 446 | # Jupyter Notebook 447 | .ipynb_checkpoints 448 | 449 | # IPython 450 | profile_default/ 451 | ipython_config.py 452 | 453 | # pyenv 454 | .python-version 455 | 456 | # pipenv 457 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 458 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 459 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 460 | # install all needed dependencies. 461 | #Pipfile.lock 462 | 463 | # celery beat schedule file 464 | celerybeat-schedule 465 | 466 | # SageMath parsed files 467 | *.sage.py 468 | 469 | # Environments 470 | .env 471 | .venv 472 | env/ 473 | venv/ 474 | ENV/ 475 | env.bak/ 476 | venv.bak/ 477 | 478 | # Spyder project settings 479 | .spyderproject 480 | .spyproject 481 | 482 | # Rope project settings 483 | .ropeproject 484 | 485 | # mkdocs documentation 486 | /site 487 | 488 | # mypy 489 | .mypy_cache/ 490 | .dmypy.json 491 | dmypy.json 492 | 493 | # Pyre type checker 494 | .pyre/ 495 | 496 | # Azure Functions artifacts 497 | bin 498 | obj 499 | appsettings.json 500 | local.settings.json 501 | 502 | # Azurite artifacts 503 | __blobstorage__ 504 | __queuestorage__ 505 | __azurite_db*__.json 506 | .python_packages 507 | 508 | .DS_Store 509 | 510 | *.pth 511 | config.json 512 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "ms-azuretools.vscode-azurefunctions", 4 | "ms-python.python" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Attach to Python Functions", 6 | "type": "python", 7 | "request": "attach", 8 | "port": 9091, 9 | "preLaunchTask": "func: host start" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "azureFunctions.deploySubpath": ".", 3 | "azureFunctions.scmDoBuildDuringDeployment": true, 4 | "azureFunctions.pythonVenv": ".venv", 5 | "azureFunctions.projectLanguage": "Python", 6 | "azureFunctions.projectRuntime": "~4", 7 | "debug.internalConsoleOptions": "neverOpen" 8 | } -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0.0", 3 | "tasks": [ 4 | { 5 | "type": "func", 6 | "command": "host start", 7 | "problemMatcher": 
"$func-python-watch", 8 | "isBackground": true, 9 | "dependsOn": "pip install (functions)" 10 | }, 11 | { 12 | "label": "pip install (functions)", 13 | "type": "shell", 14 | "osx": { 15 | "command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple" 16 | }, 17 | "windows": { 18 | "command": "${config:azureFunctions.pythonVenv}\\Scripts\\python -m pip install -r requirements.txt" 19 | }, 20 | "linux": { 21 | "command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt" 22 | }, 23 | "problemMatcher": [] 24 | } 25 | ] 26 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MoeGoe Azure Cloud Function API 2 | See [MoeGoe](https://github.com/CjangCjengh/MoeGoe) 3 | 4 | ## Japanese 5 | 6 | > Nene + Meguru + Yoshino + Mako + Murasame + Koharu + Nanami 7 | 8 | - GET https://moegoe.azurewebsites.net/api/speak?text=これは一つ簡単なテストです&id=0 9 | 10 | return ogg file in body 11 | 12 | - GET https://moegoe.azurewebsites.net/api/clean?text=これは一つ簡単なテストです 13 | 14 | return cleaned text in body 15 | 16 | ``` 17 | ko↑rewa hI↑to↓tsU ka↑NtaNna te↓sUtodesU. 18 | ``` 19 | 20 | - GET https://moegoe.azurewebsites.net/api/speak?cleantext=ko↑rewahI↑totsUka↑NtaNnate↓sUtodesU.&id=1 21 | 22 | return ogg file in body 23 | 24 | | ID | Speaker | 25 | | ---- | ---- | 26 | | 0 | 綾地寧々 | 27 | | 1 | 因幡めぐる | 28 | | 2 | 朝武芳乃 | 29 | | 3 | 常陸茉子 | 30 | | 4 | ムラサメ | 31 | | 5 | 鞍馬小春 | 32 | | 6 | 在原七海 | 33 | 34 | > HamidashiCreative 35 | 36 | replace`speak`to`speak2` 37 | 38 | | ID | Speaker | 39 | | ---- | ---- | 40 | | 0 | 和泉妃愛 | 41 | | 1 | 常盤華乃 | 42 | | 2 | 錦あすみ | 43 | | 3 | 鎌倉詩桜 | 44 | | 4 | 竜閑天梨 | 45 | | 5 | 和泉里 | 46 | | 6 | 新川広夢 | 47 | | 7 | 聖莉々子 | 48 | 49 | 50 | ## Korean 51 | 52 | > Sua + Mimiru + Arin + Yeonhwa + Yuhwa + Seonbae 53 | 54 | - GET https://moegoe.azurewebsites.net/api/speakkr?text=이것은%20간단한%20테스트이다&id=0 55 | 56 | return ogg file in body 57 | 58 | - GET https://moegoe.azurewebsites.net/api/cleankr?text=이것은%20간단한%20테스트이다 59 | 60 | return cleaned text in body 61 | 62 | ``` 63 | ㅇㅣㄱㅓㅅㅇㅡㄴ ㄱㅏㄴㄷㅏㄴㅎㅏㄴ ㅌㅔㅅㅡㅌㅡㅇㅣㄷㅏ. 
64 | ``` 65 | 66 | - GET https://moegoe.azurewebsites.net/api/speakkr?cleantext=ㅇㅣㄱㅓㅅㅇㅡㄴ%20ㄱㅏㄴㄷㅏㄴㅎㅏㄴ%20ㅌㅔㅅㅡㅌㅡㅇㅣㄷㅏ.&id=1 67 | 68 | returns an OGG file in the response body 69 | 70 | | ID | Speaker | 71 | | ---- | ---- | 72 | | 0 | 수아 | 73 | | 1 | 미미르 | 74 | | 2 | 아린 | 75 | | 3 | 연화 | 76 | | 4 | 유화 | 77 | | 5 | 선배 | 78 | 79 | ## Optional Parameters 80 | 81 | ### speak 82 | - **format**: `ogg` (default), `mp3`, or `wav` 83 | -------------------------------------------------------------------------------- /api.py: -------------------------------------------------------------------------------- 1 | import azure.functions as func 2 | 3 | from io import BytesIO 4 | from pathlib import Path 5 | from torch import no_grad, LongTensor 6 | 7 | import commons 8 | from utils import load_checkpoint, get_hparams_from_file, wav2 9 | from models import SynthesizerTrn 10 | from text import text_to_sequence, _clean_text 11 | from urllib.parse import unquote 12 | 13 | from scipy.io.wavfile import write 14 | 15 | 16 | class Cleaner(): 17 | def __init__(self, configfile: str): 18 | self.cleanernames = get_hparams_from_file(str(Path(__file__).parent/configfile)).data.text_cleaners 19 | 20 | def main(self, req: func.HttpRequest) -> func.HttpResponse: 21 | text = req.params.get('text') 22 | if not text: 23 | return func.HttpResponse( 24 | "400 BAD REQUEST: null text", 25 | status_code=400 26 | ) 27 | try: 28 | return func.HttpResponse( 29 | _clean_text(unquote(text), self.cleanernames), 30 | status_code=200 31 | ) 32 | except Exception: 33 | return func.HttpResponse( 34 | "400 BAD REQUEST: invalid text", 35 | status_code=400 36 | ) 37 | 38 | 39 | class Speaker(): 40 | def __init__(self, configfile: str, pthfile: str): 41 | self.hps_ms = get_hparams_from_file(str(Path(__file__).parent/configfile)) 42 | self.net_g_ms = SynthesizerTrn( 43 | len(self.hps_ms.symbols), 44 | self.hps_ms.data.filter_length // 2 + 1, 45 | self.hps_ms.train.segment_size // self.hps_ms.data.hop_length, 46 | n_speakers=self.hps_ms.data.n_speakers, 47 | **self.hps_ms.model) 48 | _ = self.net_g_ms.eval() 49 | load_checkpoint(str(Path(__file__).parent/pthfile), self.net_g_ms) 50 | 51 | def get_text(self, text: str, cleaned=False): 52 | if cleaned: 53 | text_norm = text_to_sequence(text, self.hps_ms.symbols, []) 54 | else: 55 | text_norm = text_to_sequence(text, self.hps_ms.symbols, self.hps_ms.data.text_cleaners) 56 | if self.hps_ms.data.add_blank: 57 | text_norm = commons.intersperse(text_norm, 0) 58 | text_norm = LongTensor(text_norm) 59 | return text_norm 60 | 61 | def main(self, req: func.HttpRequest) -> func.HttpResponse: 62 | text = req.params.get('text') 63 | cleantext = req.params.get('cleantext') 64 | if not text and not cleantext: 65 | return func.HttpResponse( 66 | "400 BAD REQUEST: null text", 67 | status_code=400 68 | ) 69 | if text and cleantext: 70 | return func.HttpResponse( 71 | "400 BAD REQUEST: text and cleantext cannot both be set", 72 | status_code=400 73 | ) 74 | cleaned = False 75 | if cleantext: 76 | cleaned = True 77 | text = cleantext 78 | speaker_id = req.params.get('id') 79 | if not speaker_id: 80 | return func.HttpResponse( 81 | "400 BAD REQUEST: null speaker id", 82 | status_code=400 83 | ) 84 | try: 85 | speaker_id = int(speaker_id) 86 | except Exception: 87 | return func.HttpResponse( 88 | "400 BAD REQUEST: invalid speaker id", 89 | status_code=400 90 | ) 91 | if speaker_id not in range(self.hps_ms.data.n_speakers): 92 | return func.HttpResponse( 93 | "400 BAD REQUEST: speaker id out of range", 94 | status_code=400 95 | ) 96 | format =
req.params.get('format') 97 | if not format: format = "ogg" 98 | if format not in ("ogg", "mp3", "wav"): 99 | return func.HttpResponse( 100 | "400 BAD REQUEST: invalid format", 101 | status_code=400 102 | ) 103 | try: 104 | stn_tst = self.get_text(unquote(text), cleaned) 105 | except Exception: 106 | return func.HttpResponse( 107 | "400 BAD REQUEST: invalid text", 108 | status_code=400 109 | ) 110 | try: 111 | with no_grad(): 112 | x_tst = stn_tst.unsqueeze(0) 113 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 114 | sid = LongTensor([speaker_id]) 115 | audio = self.net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy() 116 | with BytesIO() as f: 117 | write(f, self.hps_ms.data.sampling_rate, audio) 118 | if format == "wav": 119 | return func.HttpResponse( 120 | f.getvalue(), 121 | status_code=200, 122 | mimetype="audio/wav", 123 | ) 124 | else: 125 | f.seek(0, 0) 126 | with BytesIO() as ofp: 127 | wav2(f, ofp, format) 128 | return func.HttpResponse( 129 | ofp.getvalue(), 130 | status_code=200, 131 | mimetype="audio/mpeg" if format == "mp3" else "audio/ogg", 132 | ) 133 | except Exception as e: 134 | return func.HttpResponse( 135 | "500 Internal Server Error\n"+str(e), 136 | status_code=500, 137 | ) 138 | -------------------------------------------------------------------------------- /attentions.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import commons 7 | from modules import LayerNorm 8 | 9 | 10 | class Encoder(nn.Module): 11 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs): 12 | super().__init__() 13 | self.hidden_channels = hidden_channels 14 | self.filter_channels = filter_channels 15 | self.n_heads = n_heads 16 | self.n_layers = n_layers 17 | self.kernel_size = kernel_size 18 | self.p_dropout = p_dropout 19 | self.window_size = window_size 20 | 21 | self.drop = nn.Dropout(p_dropout) 22 | self.attn_layers = nn.ModuleList() 23 | self.norm_layers_1 = nn.ModuleList() 24 | self.ffn_layers = nn.ModuleList() 25 | self.norm_layers_2 = nn.ModuleList() 26 | for i in range(self.n_layers): 27 | self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size)) 28 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 29 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)) 30 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 31 | 32 | def forward(self, x, x_mask): 33 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 34 | x = x * x_mask 35 | for i in range(self.n_layers): 36 | y = self.attn_layers[i](x, x, attn_mask) 37 | y = self.drop(y) 38 | x = self.norm_layers_1[i](x + y) 39 | 40 | y = self.ffn_layers[i](x, x_mask) 41 | y = self.drop(y) 42 | x = self.norm_layers_2[i](x + y) 43 | x = x * x_mask 44 | return x 45 | 46 | 47 | class Decoder(nn.Module): 48 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs): 49 | super().__init__() 50 | self.hidden_channels = hidden_channels 51 | self.filter_channels = filter_channels 52 | self.n_heads = n_heads 53 | self.n_layers = n_layers 54 | self.kernel_size = kernel_size 55 | self.p_dropout = p_dropout 56 | self.proximal_bias =
proximal_bias 57 | self.proximal_init = proximal_init 58 | 59 | self.drop = nn.Dropout(p_dropout) 60 | self.self_attn_layers = nn.ModuleList() 61 | self.norm_layers_0 = nn.ModuleList() 62 | self.encdec_attn_layers = nn.ModuleList() 63 | self.norm_layers_1 = nn.ModuleList() 64 | self.ffn_layers = nn.ModuleList() 65 | self.norm_layers_2 = nn.ModuleList() 66 | for i in range(self.n_layers): 67 | self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)) 68 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 69 | self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)) 70 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 71 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) 72 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 73 | 74 | def forward(self, x, x_mask, h, h_mask): 75 | """ 76 | x: decoder input 77 | h: encoder output 78 | """ 79 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) 80 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 81 | x = x * x_mask 82 | for i in range(self.n_layers): 83 | y = self.self_attn_layers[i](x, x, self_attn_mask) 84 | y = self.drop(y) 85 | x = self.norm_layers_0[i](x + y) 86 | 87 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) 88 | y = self.drop(y) 89 | x = self.norm_layers_1[i](x + y) 90 | 91 | y = self.ffn_layers[i](x, x_mask) 92 | y = self.drop(y) 93 | x = self.norm_layers_2[i](x + y) 94 | x = x * x_mask 95 | return x 96 | 97 | 98 | class MultiHeadAttention(nn.Module): 99 | def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False): 100 | super().__init__() 101 | assert channels % n_heads == 0 102 | 103 | self.channels = channels 104 | self.out_channels = out_channels 105 | self.n_heads = n_heads 106 | self.p_dropout = p_dropout 107 | self.window_size = window_size 108 | self.heads_share = heads_share 109 | self.block_length = block_length 110 | self.proximal_bias = proximal_bias 111 | self.proximal_init = proximal_init 112 | self.attn = None 113 | 114 | self.k_channels = channels // n_heads 115 | self.conv_q = nn.Conv1d(channels, channels, 1) 116 | self.conv_k = nn.Conv1d(channels, channels, 1) 117 | self.conv_v = nn.Conv1d(channels, channels, 1) 118 | self.conv_o = nn.Conv1d(channels, out_channels, 1) 119 | self.drop = nn.Dropout(p_dropout) 120 | 121 | if window_size is not None: 122 | n_heads_rel = 1 if heads_share else n_heads 123 | rel_stddev = self.k_channels**-0.5 124 | self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 125 | self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 126 | 127 | nn.init.xavier_uniform_(self.conv_q.weight) 128 | nn.init.xavier_uniform_(self.conv_k.weight) 129 | nn.init.xavier_uniform_(self.conv_v.weight) 130 | if proximal_init: 131 | with torch.no_grad(): 132 | self.conv_k.weight.copy_(self.conv_q.weight) 133 | self.conv_k.bias.copy_(self.conv_q.bias) 134 | 135 | def forward(self, x, c, attn_mask=None): 136 | q = self.conv_q(x) 137 | k = self.conv_k(c) 138 | v = self.conv_v(c) 139 | 140 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 141 | 142 | x = self.conv_o(x) 143 | return 
x 144 | 145 | def attention(self, query, key, value, mask=None): 146 | # reshape [b, d, t] -> [b, n_h, t, d_k] 147 | b, d, t_s, t_t = (*key.size(), query.size(2)) 148 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 149 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 150 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 151 | 152 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) 153 | if self.window_size is not None: 154 | assert t_s == t_t, "Relative attention is only available for self-attention." 155 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 156 | rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings) 157 | scores_local = self._relative_position_to_absolute_position(rel_logits) 158 | scores = scores + scores_local 159 | if self.proximal_bias: 160 | assert t_s == t_t, "Proximal bias is only available for self-attention." 161 | scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) 162 | if mask is not None: 163 | scores = scores.masked_fill(mask == 0, -1e4) 164 | if self.block_length is not None: 165 | assert t_s == t_t, "Local attention is only available for self-attention." 166 | block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) 167 | scores = scores.masked_fill(block_mask == 0, -1e4) 168 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 169 | p_attn = self.drop(p_attn) 170 | output = torch.matmul(p_attn, value) 171 | if self.window_size is not None: 172 | relative_weights = self._absolute_position_to_relative_position(p_attn) 173 | value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) 174 | output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) 175 | output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] 176 | return output, p_attn 177 | 178 | def _matmul_with_relative_values(self, x, y): 179 | """ 180 | x: [b, h, l, m] 181 | y: [h or 1, m, d] 182 | ret: [b, h, l, d] 183 | """ 184 | ret = torch.matmul(x, y.unsqueeze(0)) 185 | return ret 186 | 187 | def _matmul_with_relative_keys(self, x, y): 188 | """ 189 | x: [b, h, l, d] 190 | y: [h or 1, m, d] 191 | ret: [b, h, l, m] 192 | """ 193 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 194 | return ret 195 | 196 | def _get_relative_embeddings(self, relative_embeddings, length): 197 | max_relative_position = 2 * self.window_size + 1 198 | # Pad first before slice to avoid using cond ops. 199 | pad_length = max(length - (self.window_size + 1), 0) 200 | slice_start_position = max((self.window_size + 1) - length, 0) 201 | slice_end_position = slice_start_position + 2 * length - 1 202 | if pad_length > 0: 203 | padded_relative_embeddings = F.pad( 204 | relative_embeddings, 205 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) 206 | else: 207 | padded_relative_embeddings = relative_embeddings 208 | used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position] 209 | return used_relative_embeddings 210 | 211 | def _relative_position_to_absolute_position(self, x): 212 | """ 213 | x: [b, h, l, 2*l-1] 214 | ret: [b, h, l, l] 215 | """ 216 | batch, heads, length, _ = x.size() 217 | # Concat columns of pad to shift from relative to absolute indexing. 
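# Padding one extra column widens each row from 2*l-1 to 2*l entries; flattening
# then offsets successive rows by one slot, so the reshape-and-slice below reads
# relative index (j - i + l - 1) out at absolute column j without any gather ops.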
218 | x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]])) 219 | 220 | # Concat extra elements so as to add up to shape (len+1, 2*len-1). 221 | x_flat = x.view([batch, heads, length * 2 * length]) 222 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]])) 223 | 224 | # Reshape and slice out the padded elements. 225 | x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:] 226 | return x_final 227 | 228 | def _absolute_position_to_relative_position(self, x): 229 | """ 230 | x: [b, h, l, l] 231 | ret: [b, h, l, 2*l-1] 232 | """ 233 | batch, heads, length, _ = x.size() 234 | # pad along column 235 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]])) 236 | x_flat = x.view([batch, heads, length**2 + length*(length -1)]) 237 | # add 0's in the beginning that will skew the elements after reshape 238 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 239 | x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:] 240 | return x_final 241 | 242 | def _attention_bias_proximal(self, length): 243 | """Bias for self-attention to encourage attention to close positions. 244 | Args: 245 | length: an integer scalar. 246 | Returns: 247 | a Tensor with shape [1, 1, length, length] 248 | """ 249 | r = torch.arange(length, dtype=torch.float32) 250 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 251 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 252 | 253 | 254 | class FFN(nn.Module): 255 | def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False): 256 | super().__init__() 257 | self.in_channels = in_channels 258 | self.out_channels = out_channels 259 | self.filter_channels = filter_channels 260 | self.kernel_size = kernel_size 261 | self.p_dropout = p_dropout 262 | self.activation = activation 263 | self.causal = causal 264 | 265 | if causal: 266 | self.padding = self._causal_padding 267 | else: 268 | self.padding = self._same_padding 269 | 270 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) 271 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) 272 | self.drop = nn.Dropout(p_dropout) 273 | 274 | def forward(self, x, x_mask): 275 | x = self.conv_1(self.padding(x * x_mask)) 276 | if self.activation == "gelu": 277 | x = x * torch.sigmoid(1.702 * x) 278 | else: 279 | x = torch.relu(x) 280 | x = self.drop(x) 281 | x = self.conv_2(self.padding(x * x_mask)) 282 | return x * x_mask 283 | 284 | def _causal_padding(self, x): 285 | if self.kernel_size == 1: 286 | return x 287 | pad_l = self.kernel_size - 1 288 | pad_r = 0 289 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 290 | x = F.pad(x, commons.convert_pad_shape(padding)) 291 | return x 292 | 293 | def _same_padding(self, x): 294 | if self.kernel_size == 1: 295 | return x 296 | pad_l = (self.kernel_size - 1) // 2 297 | pad_r = self.kernel_size // 2 298 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 299 | x = F.pad(x, commons.convert_pad_shape(padding)) 300 | return x 301 | -------------------------------------------------------------------------------- /clean/__init__.py: -------------------------------------------------------------------------------- 1 | import azure.functions as func 2 | 3 | from api import Cleaner 4 | 5 | 6 | cleaner = Cleaner('Yuzu/config.json') 7 | 8 | 9 | def main(req: func.HttpRequest) -> func.HttpResponse: 10 | return cleaner.main(req) 11 | 
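Each HTTP-trigger folder in this app (`clean`, `cleankr`, `speak`, `speak2`, `speakkr`) follows the pattern of `clean/__init__.py` above: it builds a `Cleaner` or `Speaker` from `api.py` at import time, so the model checkpoint is loaded once per worker rather than once per request. A minimal client for the endpoints documented in the README might look like the sketch below; the host URL is taken from the README, while the function key, speaker id, and output filename are placeholders (the triggers use `"authLevel": "function"`, so a deployed app expects a `code` query parameter).

```python
# Hypothetical client sketch: HOST comes from the README; FUNC_KEY must be
# replaced with a real function key from your own deployment.
import requests

HOST = "https://moegoe.azurewebsites.net/api"
FUNC_KEY = "<your-function-key>"

def speak(text: str, speaker_id: int, fmt: str = "ogg") -> bytes:
    """Synthesize `text` with the given speaker id; returns raw audio bytes."""
    resp = requests.get(
        f"{HOST}/speak",
        params={"text": text, "id": speaker_id, "format": fmt, "code": FUNC_KEY},
    )
    resp.raise_for_status()  # non-200 responses carry a plain-text error message
    return resp.content

audio = speak("これは一つ簡単なテストです", 0)
with open("test.ogg", "wb") as f:
    f.write(audio)
```

Note that `requests` URL-encodes the query parameters itself, so the Japanese and Korean examples from the README can be passed as plain strings.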
-------------------------------------------------------------------------------- /clean/function.json: -------------------------------------------------------------------------------- 1 | { 2 | "scriptFile": "__init__.py", 3 | "bindings": [ 4 | { 5 | "authLevel": "function", 6 | "type": "httpTrigger", 7 | "direction": "in", 8 | "name": "req", 9 | "methods": ["get"] 10 | }, 11 | { 12 | "type": "http", 13 | "direction": "out", 14 | "name": "$return" 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /cleankr/__init__.py: -------------------------------------------------------------------------------- 1 | import azure.functions as func 2 | 3 | from api import Cleaner 4 | 5 | 6 | cleaner = Cleaner('TheFoxAwaitsMe/config.json') 7 | 8 | 9 | def main(req: func.HttpRequest) -> func.HttpResponse: 10 | return cleaner.main(req) 11 | -------------------------------------------------------------------------------- /cleankr/function.json: -------------------------------------------------------------------------------- 1 | { 2 | "scriptFile": "__init__.py", 3 | "bindings": [ 4 | { 5 | "authLevel": "function", 6 | "type": "httpTrigger", 7 | "direction": "in", 8 | "name": "req", 9 | "methods": ["get"] 10 | }, 11 | { 12 | "type": "http", 13 | "direction": "out", 14 | "name": "$return" 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /commons.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | 5 | def init_weights(m, mean=0.0, std=0.01): 6 | classname = m.__class__.__name__ 7 | if classname.find("Conv") != -1: 8 | m.weight.data.normal_(mean, std) 9 | 10 | 11 | def get_padding(kernel_size, dilation=1): 12 | return int((kernel_size*dilation - dilation)/2) 13 | 14 | 15 | def intersperse(lst, item): 16 | result = [item] * (len(lst) * 2 + 1) 17 | result[1::2] = lst 18 | return result 19 | 20 | 21 | def slice_segments(x, ids_str, segment_size=4): 22 | ret = torch.zeros_like(x[:, :, :segment_size]) 23 | for i in range(x.size(0)): 24 | idx_str = ids_str[i] 25 | idx_end = idx_str + segment_size 26 | ret[i] = x[i, :, idx_str:idx_end] 27 | return ret 28 | 29 | 30 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 31 | b, d, t = x.size() 32 | if x_lengths is None: 33 | x_lengths = t 34 | ids_str_max = x_lengths - segment_size + 1 35 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 36 | ret = slice_segments(x, ids_str, segment_size) 37 | return ret, ids_str 38 | 39 | 40 | def subsequent_mask(length): 41 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 42 | return mask 43 | 44 | 45 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 46 | n_channels_int = n_channels[0] 47 | in_act = input_a + input_b 48 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 49 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 50 | acts = t_act * s_act 51 | return acts 52 | 53 | 54 | def convert_pad_shape(pad_shape): 55 | l = pad_shape[::-1] 56 | pad_shape = [item for sublist in l for item in sublist] 57 | return pad_shape 58 | 59 | 60 | def sequence_mask(length, max_length=None): 61 | if max_length is None: 62 | max_length = length.max() 63 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 64 | return x.unsqueeze(0) < length.unsqueeze(1) 65 | 66 | 67 | def generate_path(duration, mask): 68 | """ 69 | duration: [b, 1, 
t_x] 70 | mask: [b, 1, t_y, t_x] 71 | """ 72 | 73 | b, _, t_y, t_x = mask.shape 74 | cum_duration = torch.cumsum(duration, -1) 75 | 76 | cum_duration_flat = cum_duration.view(b * t_x) 77 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 78 | path = path.view(b, t_x, t_y) 79 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 80 | path = path.unsqueeze(1).transpose(2,3) * mask 81 | return path 82 | -------------------------------------------------------------------------------- /host.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0", 3 | "logging": { 4 | "applicationInsights": { 5 | "samplingSettings": { 6 | "isEnabled": true, 7 | "excludedTypes": "Request" 8 | } 9 | } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import commons 7 | import modules 8 | import attentions 9 | import monotonic_align 10 | 11 | from torch.nn import Conv1d, ConvTranspose1d, Conv2d 12 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 13 | from commons import init_weights, get_padding 14 | 15 | 16 | class StochasticDurationPredictor(nn.Module): 17 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0): 18 | super().__init__() 19 | filter_channels = in_channels # it needs to be removed in a future version. 20 | self.in_channels = in_channels 21 | self.filter_channels = filter_channels 22 | self.kernel_size = kernel_size 23 | self.p_dropout = p_dropout 24 | self.n_flows = n_flows 25 | self.gin_channels = gin_channels 26 | 27 | self.log_flow = modules.Log() 28 | self.flows = nn.ModuleList() 29 | self.flows.append(modules.ElementwiseAffine(2)) 30 | for i in range(n_flows): 31 | self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) 32 | self.flows.append(modules.Flip()) 33 | 34 | self.post_pre = nn.Conv1d(1, filter_channels, 1) 35 | self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) 36 | self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) 37 | self.post_flows = nn.ModuleList() 38 | self.post_flows.append(modules.ElementwiseAffine(2)) 39 | for i in range(4): 40 | self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) 41 | self.post_flows.append(modules.Flip()) 42 | 43 | self.pre = nn.Conv1d(in_channels, filter_channels, 1) 44 | self.proj = nn.Conv1d(filter_channels, filter_channels, 1) 45 | self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) 46 | if gin_channels != 0: 47 | self.cond = nn.Conv1d(gin_channels, filter_channels, 1) 48 | 49 | def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0): 50 | x = torch.detach(x) 51 | x = self.pre(x) 52 | if g is not None: 53 | g = torch.detach(g) 54 | x = x + self.cond(g) 55 | x = self.convs(x, x_mask) 56 | x = self.proj(x) * x_mask 57 | 58 | if not reverse: 59 | flows = self.flows 60 | assert w is not None 61 | 62 | logdet_tot_q = 0 63 | h_w = self.post_pre(w) 64 | h_w = self.post_convs(h_w, x_mask) 65 | h_w = self.post_proj(h_w) * x_mask 66 | e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask 67 | z_q = e_q 68 | for flow in self.post_flows: 69 | 
z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w)) 70 | logdet_tot_q += logdet_q 71 | z_u, z1 = torch.split(z_q, [1, 1], 1) 72 | u = torch.sigmoid(z_u) * x_mask 73 | z0 = (w - u) * x_mask 74 | logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2]) 75 | logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q 76 | 77 | logdet_tot = 0 78 | z0, logdet = self.log_flow(z0, x_mask) 79 | logdet_tot += logdet 80 | z = torch.cat([z0, z1], 1) 81 | for flow in flows: 82 | z, logdet = flow(z, x_mask, g=x, reverse=reverse) 83 | logdet_tot = logdet_tot + logdet 84 | nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot 85 | return nll + logq # [b] 86 | else: 87 | flows = list(reversed(self.flows)) 88 | flows = flows[:-2] + [flows[-1]] # remove a useless vflow 89 | z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale 90 | for flow in flows: 91 | z = flow(z, x_mask, g=x, reverse=reverse) 92 | z0, z1 = torch.split(z, [1, 1], 1) 93 | logw = z0 94 | return logw 95 | 96 | 97 | class DurationPredictor(nn.Module): 98 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0): 99 | super().__init__() 100 | 101 | self.in_channels = in_channels 102 | self.filter_channels = filter_channels 103 | self.kernel_size = kernel_size 104 | self.p_dropout = p_dropout 105 | self.gin_channels = gin_channels 106 | 107 | self.drop = nn.Dropout(p_dropout) 108 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2) 109 | self.norm_1 = modules.LayerNorm(filter_channels) 110 | self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2) 111 | self.norm_2 = modules.LayerNorm(filter_channels) 112 | self.proj = nn.Conv1d(filter_channels, 1, 1) 113 | 114 | if gin_channels != 0: 115 | self.cond = nn.Conv1d(gin_channels, in_channels, 1) 116 | 117 | def forward(self, x, x_mask, g=None): 118 | x = torch.detach(x) 119 | if g is not None: 120 | g = torch.detach(g) 121 | x = x + self.cond(g) 122 | x = self.conv_1(x * x_mask) 123 | x = torch.relu(x) 124 | x = self.norm_1(x) 125 | x = self.drop(x) 126 | x = self.conv_2(x * x_mask) 127 | x = torch.relu(x) 128 | x = self.norm_2(x) 129 | x = self.drop(x) 130 | x = self.proj(x * x_mask) 131 | return x * x_mask 132 | 133 | 134 | class TextEncoder(nn.Module): 135 | def __init__(self, 136 | n_vocab, 137 | out_channels, 138 | hidden_channels, 139 | filter_channels, 140 | n_heads, 141 | n_layers, 142 | kernel_size, 143 | p_dropout): 144 | super().__init__() 145 | self.n_vocab = n_vocab 146 | self.out_channels = out_channels 147 | self.hidden_channels = hidden_channels 148 | self.filter_channels = filter_channels 149 | self.n_heads = n_heads 150 | self.n_layers = n_layers 151 | self.kernel_size = kernel_size 152 | self.p_dropout = p_dropout 153 | 154 | self.emb = nn.Embedding(n_vocab, hidden_channels) 155 | nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) 156 | 157 | self.encoder = attentions.Encoder( 158 | hidden_channels, 159 | filter_channels, 160 | n_heads, 161 | n_layers, 162 | kernel_size, 163 | p_dropout) 164 | self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1) 165 | 166 | def forward(self, x, x_lengths): 167 | x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h] 168 | x = torch.transpose(x, 1, -1) # [b, h, t] 169 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 170 | 171 | x = self.encoder(x * x_mask, x_mask) 
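# self.proj outputs 2 * out_channels channels; the split below yields the
# prior mean m and log-standard-deviation logs for each text position.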
172 | stats = self.proj(x) * x_mask 173 | 174 | m, logs = torch.split(stats, self.out_channels, dim=1) 175 | return x, m, logs, x_mask 176 | 177 | 178 | class ResidualCouplingBlock(nn.Module): 179 | def __init__(self, 180 | channels, 181 | hidden_channels, 182 | kernel_size, 183 | dilation_rate, 184 | n_layers, 185 | n_flows=4, 186 | gin_channels=0): 187 | super().__init__() 188 | self.channels = channels 189 | self.hidden_channels = hidden_channels 190 | self.kernel_size = kernel_size 191 | self.dilation_rate = dilation_rate 192 | self.n_layers = n_layers 193 | self.n_flows = n_flows 194 | self.gin_channels = gin_channels 195 | 196 | self.flows = nn.ModuleList() 197 | for i in range(n_flows): 198 | self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) 199 | self.flows.append(modules.Flip()) 200 | 201 | def forward(self, x, x_mask, g=None, reverse=False): 202 | if not reverse: 203 | for flow in self.flows: 204 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 205 | else: 206 | for flow in reversed(self.flows): 207 | x = flow(x, x_mask, g=g, reverse=reverse) 208 | return x 209 | 210 | 211 | class PosteriorEncoder(nn.Module): 212 | def __init__(self, 213 | in_channels, 214 | out_channels, 215 | hidden_channels, 216 | kernel_size, 217 | dilation_rate, 218 | n_layers, 219 | gin_channels=0): 220 | super().__init__() 221 | self.in_channels = in_channels 222 | self.out_channels = out_channels 223 | self.hidden_channels = hidden_channels 224 | self.kernel_size = kernel_size 225 | self.dilation_rate = dilation_rate 226 | self.n_layers = n_layers 227 | self.gin_channels = gin_channels 228 | 229 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 230 | self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) 231 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 232 | 233 | def forward(self, x, x_lengths, g=None): 234 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 235 | x = self.pre(x) * x_mask 236 | x = self.enc(x, x_mask, g=g) 237 | stats = self.proj(x) * x_mask 238 | m, logs = torch.split(stats, self.out_channels, dim=1) 239 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 240 | return z, m, logs, x_mask 241 | 242 | 243 | class Generator(torch.nn.Module): 244 | def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): 245 | super(Generator, self).__init__() 246 | self.num_kernels = len(resblock_kernel_sizes) 247 | self.num_upsamples = len(upsample_rates) 248 | self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) 249 | resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 250 | 251 | self.ups = nn.ModuleList() 252 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 253 | self.ups.append(weight_norm( 254 | ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)), 255 | k, u, padding=(k-u)//2))) 256 | 257 | self.resblocks = nn.ModuleList() 258 | for i in range(len(self.ups)): 259 | ch = upsample_initial_channel//(2**(i+1)) 260 | for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): 261 | self.resblocks.append(resblock(ch, k, d)) 262 | 263 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 264 | self.ups.apply(init_weights) 265 | 266 | if 
gin_channels != 0: 267 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 268 | 269 | def forward(self, x, g=None): 270 | x = self.conv_pre(x) 271 | if g is not None: 272 | x = x + self.cond(g) 273 | 274 | for i in range(self.num_upsamples): 275 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 276 | x = self.ups[i](x) 277 | xs = None 278 | for j in range(self.num_kernels): 279 | if xs is None: 280 | xs = self.resblocks[i*self.num_kernels+j](x) 281 | else: 282 | xs += self.resblocks[i*self.num_kernels+j](x) 283 | x = xs / self.num_kernels 284 | x = F.leaky_relu(x) 285 | x = self.conv_post(x) 286 | x = torch.tanh(x) 287 | 288 | return x 289 | 290 | def remove_weight_norm(self): 291 | print('Removing weight norm...') 292 | for l in self.ups: 293 | remove_weight_norm(l) 294 | for l in self.resblocks: 295 | l.remove_weight_norm() 296 | 297 | 298 | class DiscriminatorP(torch.nn.Module): 299 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 300 | super(DiscriminatorP, self).__init__() 301 | self.period = period 302 | self.use_spectral_norm = use_spectral_norm 303 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 304 | self.convs = nn.ModuleList([ 305 | norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 306 | norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 307 | norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 308 | norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 309 | norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))), 310 | ]) 311 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 312 | 313 | def forward(self, x): 314 | fmap = [] 315 | 316 | # 1d to 2d 317 | b, c, t = x.shape 318 | if t % self.period != 0: # pad first 319 | n_pad = self.period - (t % self.period) 320 | x = F.pad(x, (0, n_pad), "reflect") 321 | t = t + n_pad 322 | x = x.view(b, c, t // self.period, self.period) 323 | 324 | for l in self.convs: 325 | x = l(x) 326 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 327 | fmap.append(x) 328 | x = self.conv_post(x) 329 | fmap.append(x) 330 | x = torch.flatten(x, 1, -1) 331 | 332 | return x, fmap 333 | 334 | 335 | class DiscriminatorS(torch.nn.Module): 336 | def __init__(self, use_spectral_norm=False): 337 | super(DiscriminatorS, self).__init__() 338 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 339 | self.convs = nn.ModuleList([ 340 | norm_f(Conv1d(1, 16, 15, 1, padding=7)), 341 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), 342 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), 343 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), 344 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), 345 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 346 | ]) 347 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 348 | 349 | def forward(self, x): 350 | fmap = [] 351 | 352 | for l in self.convs: 353 | x = l(x) 354 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 355 | fmap.append(x) 356 | x = self.conv_post(x) 357 | fmap.append(x) 358 | x = torch.flatten(x, 1, -1) 359 | 360 | return x, fmap 361 | 362 | 363 | class MultiPeriodDiscriminator(torch.nn.Module): 364 | def __init__(self, use_spectral_norm=False): 365 | super(MultiPeriodDiscriminator, self).__init__() 366 | periods = [2,3,5,7,11] 367 | 368 | discs = 
[DiscriminatorS(use_spectral_norm=use_spectral_norm)] 369 | discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] 370 | self.discriminators = nn.ModuleList(discs) 371 | 372 | def forward(self, y, y_hat): 373 | y_d_rs = [] 374 | y_d_gs = [] 375 | fmap_rs = [] 376 | fmap_gs = [] 377 | for i, d in enumerate(self.discriminators): 378 | y_d_r, fmap_r = d(y) 379 | y_d_g, fmap_g = d(y_hat) 380 | y_d_rs.append(y_d_r) 381 | y_d_gs.append(y_d_g) 382 | fmap_rs.append(fmap_r) 383 | fmap_gs.append(fmap_g) 384 | 385 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 386 | 387 | 388 | 389 | class SynthesizerTrn(nn.Module): 390 | """ 391 | Synthesizer for Training 392 | """ 393 | 394 | def __init__(self, 395 | n_vocab, 396 | spec_channels, 397 | segment_size, 398 | inter_channels, 399 | hidden_channels, 400 | filter_channels, 401 | n_heads, 402 | n_layers, 403 | kernel_size, 404 | p_dropout, 405 | resblock, 406 | resblock_kernel_sizes, 407 | resblock_dilation_sizes, 408 | upsample_rates, 409 | upsample_initial_channel, 410 | upsample_kernel_sizes, 411 | n_speakers=0, 412 | gin_channels=0, 413 | use_sdp=True, 414 | **kwargs): 415 | 416 | super().__init__() 417 | self.n_vocab = n_vocab 418 | self.spec_channels = spec_channels 419 | self.inter_channels = inter_channels 420 | self.hidden_channels = hidden_channels 421 | self.filter_channels = filter_channels 422 | self.n_heads = n_heads 423 | self.n_layers = n_layers 424 | self.kernel_size = kernel_size 425 | self.p_dropout = p_dropout 426 | self.resblock = resblock 427 | self.resblock_kernel_sizes = resblock_kernel_sizes 428 | self.resblock_dilation_sizes = resblock_dilation_sizes 429 | self.upsample_rates = upsample_rates 430 | self.upsample_initial_channel = upsample_initial_channel 431 | self.upsample_kernel_sizes = upsample_kernel_sizes 432 | self.segment_size = segment_size 433 | self.n_speakers = n_speakers 434 | self.gin_channels = gin_channels 435 | 436 | self.use_sdp = use_sdp 437 | 438 | self.enc_p = TextEncoder(n_vocab, 439 | inter_channels, 440 | hidden_channels, 441 | filter_channels, 442 | n_heads, 443 | n_layers, 444 | kernel_size, 445 | p_dropout) 446 | self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) 447 | self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) 448 | self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) 449 | 450 | if use_sdp: 451 | self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels) 452 | else: 453 | self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels) 454 | 455 | if n_speakers > 1: 456 | self.emb_g = nn.Embedding(n_speakers, gin_channels) 457 | 458 | def forward(self, x, x_lengths, y, y_lengths, sid=None): 459 | 460 | x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths) 461 | if self.n_speakers > 0: 462 | g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] 463 | else: 464 | g = None 465 | 466 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) 467 | z_p = self.flow(z, y_mask, g=g) 468 | 469 | with torch.no_grad(): 470 | # negative cross-entropy 471 | s_p_sq_r = torch.exp(-2 * logs_p) # [b, d, t] 472 | neg_cent1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True) # [b, 1, t_s] 473 | neg_cent2 = torch.matmul(-0.5 * (z_p ** 2).transpose(1, 2), s_p_sq_r) # [b, t_t, d] 
x [b, d, t_s] = [b, t_t, t_s] 474 | neg_cent3 = torch.matmul(z_p.transpose(1, 2), (m_p * s_p_sq_r)) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s] 475 | neg_cent4 = torch.sum(-0.5 * (m_p ** 2) * s_p_sq_r, [1], keepdim=True) # [b, 1, t_s] 476 | neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4 477 | 478 | attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) 479 | attn = monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)).unsqueeze(1).detach() 480 | 481 | w = attn.sum(2) 482 | if self.use_sdp: 483 | l_length = self.dp(x, x_mask, w, g=g) 484 | l_length = l_length / torch.sum(x_mask) 485 | else: 486 | logw_ = torch.log(w + 1e-6) * x_mask 487 | logw = self.dp(x, x_mask, g=g) 488 | l_length = torch.sum((logw - logw_)**2, [1,2]) / torch.sum(x_mask) # for averaging 489 | 490 | # expand prior 491 | m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) 492 | logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) 493 | 494 | z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size) 495 | o = self.dec(z_slice, g=g) 496 | return o, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) 497 | 498 | def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None): 499 | x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths) 500 | if self.n_speakers > 0: 501 | g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] 502 | else: 503 | g = None 504 | 505 | if self.use_sdp: 506 | logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) 507 | else: 508 | logw = self.dp(x, x_mask, g=g) 509 | w = torch.exp(logw) * x_mask * length_scale 510 | w_ceil = torch.ceil(w) 511 | y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() 512 | y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype) 513 | attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) 514 | attn = commons.generate_path(w_ceil, attn_mask) 515 | 516 | m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] 517 | logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] 518 | 519 | z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale 520 | z = self.flow(z_p, y_mask, g=g, reverse=True) 521 | o = self.dec((z * y_mask)[:,:,:max_len], g=g) 522 | return o, attn, y_mask, (z, z_p, m_p, logs_p) 523 | 524 | def voice_conversion(self, y, y_lengths, sid_src, sid_tgt): 525 | assert self.n_speakers > 0, "n_speakers has to be larger than 0."
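# Voice conversion path: encode the audio y under the *source* speaker
# embedding, push it through the normalizing flow into the (approximately)
# speaker-independent prior space z_p, then invert the flow under the
# *target* speaker and decode. No text input is involved.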
526 | g_src = self.emb_g(sid_src).unsqueeze(-1)
527 | g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
528 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
529 | z_p = self.flow(z, y_mask, g=g_src)
530 | z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
531 | o_hat = self.dec(z_hat * y_mask, g=g_tgt)
532 | return o_hat, y_mask, (z, z_p, z_hat)
533 |
534 |
-------------------------------------------------------------------------------- /modules.py: --------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | from torch import nn
4 | from torch.nn import functional as F
5 |
6 | from torch.nn import Conv1d
7 | from torch.nn.utils import weight_norm, remove_weight_norm
8 |
9 | import commons
10 | from commons import init_weights, get_padding
11 | from transforms import piecewise_rational_quadratic_transform
12 |
13 |
14 | LRELU_SLOPE = 0.1
15 |
16 |
17 | class LayerNorm(nn.Module):
18 | def __init__(self, channels, eps=1e-5):
19 | super().__init__()
20 | self.channels = channels
21 | self.eps = eps
22 |
23 | self.gamma = nn.Parameter(torch.ones(channels))
24 | self.beta = nn.Parameter(torch.zeros(channels))
25 |
26 | def forward(self, x):
27 | x = x.transpose(1, -1)
28 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
29 | return x.transpose(1, -1)
30 |
31 |
32 | class ConvReluNorm(nn.Module):
33 | def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
34 | super().__init__()
35 | self.in_channels = in_channels
36 | self.hidden_channels = hidden_channels
37 | self.out_channels = out_channels
38 | self.kernel_size = kernel_size
39 | self.n_layers = n_layers
40 | self.p_dropout = p_dropout
41 | assert n_layers > 1, "Number of layers should be larger than 1."
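# Stack built below: one in->hidden conv plus (n_layers - 1) hidden->hidden
# convs, each run through LayerNorm, ReLU and dropout in forward(); the final
# 1x1 projection is zero-initialized, so the residual block starts out as an
# identity mapping.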
42 |
43 | self.conv_layers = nn.ModuleList()
44 | self.norm_layers = nn.ModuleList()
45 | self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
46 | self.norm_layers.append(LayerNorm(hidden_channels))
47 | self.relu_drop = nn.Sequential(
48 | nn.ReLU(),
49 | nn.Dropout(p_dropout))
50 | for _ in range(n_layers-1):
51 | self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
52 | self.norm_layers.append(LayerNorm(hidden_channels))
53 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
54 | self.proj.weight.data.zero_()
55 | self.proj.bias.data.zero_()
56 |
57 | def forward(self, x, x_mask):
58 | x_org = x
59 | for i in range(self.n_layers):
60 | x = self.conv_layers[i](x * x_mask)
61 | x = self.norm_layers[i](x)
62 | x = self.relu_drop(x)
63 | x = x_org + self.proj(x)
64 | return x * x_mask
65 |
66 |
67 | class DDSConv(nn.Module):
68 | """
69 | Dilated and Depth-Separable Convolution
70 | """
71 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
72 | super().__init__()
73 | self.channels = channels
74 | self.kernel_size = kernel_size
75 | self.n_layers = n_layers
76 | self.p_dropout = p_dropout
77 |
78 | self.drop = nn.Dropout(p_dropout)
79 | self.convs_sep = nn.ModuleList()
80 | self.convs_1x1 = nn.ModuleList()
81 | self.norms_1 = nn.ModuleList()
82 | self.norms_2 = nn.ModuleList()
83 | for i in range(n_layers):
84 | dilation = kernel_size ** i
85 | padding = (kernel_size * dilation - dilation) // 2
86 | self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
87 | groups=channels, dilation=dilation, padding=padding
88 | ))
89 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
90 | self.norms_1.append(LayerNorm(channels))
91 | self.norms_2.append(LayerNorm(channels))
92 |
93 | def forward(self, x, x_mask, g=None):
94 | if g is not None:
95 | x = x + g
96 | for i in range(self.n_layers):
97 | y = self.convs_sep[i](x * x_mask)
98 | y = self.norms_1[i](y)
99 | y = F.gelu(y)
100 | y = self.convs_1x1[i](y)
101 | y = self.norms_2[i](y)
102 | y = F.gelu(y)
103 | y = self.drop(y)
104 | x = x + y
105 | return x * x_mask
106 |
107 |
108 | class WN(torch.nn.Module):
109 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
110 | super(WN, self).__init__()
111 | assert(kernel_size % 2 == 1)
112 | self.hidden_channels = hidden_channels
113 | self.kernel_size = kernel_size
114 | self.dilation_rate = dilation_rate
115 | self.n_layers = n_layers
116 | self.gin_channels = gin_channels
117 | self.p_dropout = p_dropout
118 |
119 | self.in_layers = torch.nn.ModuleList()
120 | self.res_skip_layers = torch.nn.ModuleList()
121 | self.drop = nn.Dropout(p_dropout)
122 |
123 | if gin_channels != 0:
124 | cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
125 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
126 |
127 | for i in range(n_layers):
128 | dilation = dilation_rate ** i
129 | padding = int((kernel_size * dilation - dilation) / 2)
130 | in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
131 | dilation=dilation, padding=padding)
132 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
133 | self.in_layers.append(in_layer)
134 |
135 | # last one is not necessary
136 | if i < n_layers - 1:
137 | res_skip_channels = 2 * hidden_channels
138 | else:
139 | res_skip_channels = hidden_channels
140 |
141 | res_skip_layer =
torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 142 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 143 | self.res_skip_layers.append(res_skip_layer) 144 | 145 | def forward(self, x, x_mask, g=None, **kwargs): 146 | output = torch.zeros_like(x) 147 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 148 | 149 | if g is not None: 150 | g = self.cond_layer(g) 151 | 152 | for i in range(self.n_layers): 153 | x_in = self.in_layers[i](x) 154 | if g is not None: 155 | cond_offset = i * 2 * self.hidden_channels 156 | g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] 157 | else: 158 | g_l = torch.zeros_like(x_in) 159 | 160 | acts = commons.fused_add_tanh_sigmoid_multiply( 161 | x_in, 162 | g_l, 163 | n_channels_tensor) 164 | acts = self.drop(acts) 165 | 166 | res_skip_acts = self.res_skip_layers[i](acts) 167 | if i < self.n_layers - 1: 168 | res_acts = res_skip_acts[:,:self.hidden_channels,:] 169 | x = (x + res_acts) * x_mask 170 | output = output + res_skip_acts[:,self.hidden_channels:,:] 171 | else: 172 | output = output + res_skip_acts 173 | return output * x_mask 174 | 175 | def remove_weight_norm(self): 176 | if self.gin_channels != 0: 177 | torch.nn.utils.remove_weight_norm(self.cond_layer) 178 | for l in self.in_layers: 179 | torch.nn.utils.remove_weight_norm(l) 180 | for l in self.res_skip_layers: 181 | torch.nn.utils.remove_weight_norm(l) 182 | 183 | 184 | class ResBlock1(torch.nn.Module): 185 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 186 | super(ResBlock1, self).__init__() 187 | self.convs1 = nn.ModuleList([ 188 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 189 | padding=get_padding(kernel_size, dilation[0]))), 190 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 191 | padding=get_padding(kernel_size, dilation[1]))), 192 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 193 | padding=get_padding(kernel_size, dilation[2]))) 194 | ]) 195 | self.convs1.apply(init_weights) 196 | 197 | self.convs2 = nn.ModuleList([ 198 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 199 | padding=get_padding(kernel_size, 1))), 200 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 201 | padding=get_padding(kernel_size, 1))), 202 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 203 | padding=get_padding(kernel_size, 1))) 204 | ]) 205 | self.convs2.apply(init_weights) 206 | 207 | def forward(self, x, x_mask=None): 208 | for c1, c2 in zip(self.convs1, self.convs2): 209 | xt = F.leaky_relu(x, LRELU_SLOPE) 210 | if x_mask is not None: 211 | xt = xt * x_mask 212 | xt = c1(xt) 213 | xt = F.leaky_relu(xt, LRELU_SLOPE) 214 | if x_mask is not None: 215 | xt = xt * x_mask 216 | xt = c2(xt) 217 | x = xt + x 218 | if x_mask is not None: 219 | x = x * x_mask 220 | return x 221 | 222 | def remove_weight_norm(self): 223 | for l in self.convs1: 224 | remove_weight_norm(l) 225 | for l in self.convs2: 226 | remove_weight_norm(l) 227 | 228 | 229 | class ResBlock2(torch.nn.Module): 230 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 231 | super(ResBlock2, self).__init__() 232 | self.convs = nn.ModuleList([ 233 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 234 | padding=get_padding(kernel_size, dilation[0]))), 235 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 236 | padding=get_padding(kernel_size, dilation[1]))) 237 | ]) 
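# ResBlock2 is the lighter HiFi-GAN residual block: one dilated conv per step
# instead of ResBlock1's dilated+undilated conv pairs; the Generator chooses
# between the two variants via the `resblock` hyperparameter.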
238 | self.convs.apply(init_weights) 239 | 240 | def forward(self, x, x_mask=None): 241 | for c in self.convs: 242 | xt = F.leaky_relu(x, LRELU_SLOPE) 243 | if x_mask is not None: 244 | xt = xt * x_mask 245 | xt = c(xt) 246 | x = xt + x 247 | if x_mask is not None: 248 | x = x * x_mask 249 | return x 250 | 251 | def remove_weight_norm(self): 252 | for l in self.convs: 253 | remove_weight_norm(l) 254 | 255 | 256 | class Log(nn.Module): 257 | def forward(self, x, x_mask, reverse=False, **kwargs): 258 | if not reverse: 259 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask 260 | logdet = torch.sum(-y, [1, 2]) 261 | return y, logdet 262 | else: 263 | x = torch.exp(x) * x_mask 264 | return x 265 | 266 | 267 | class Flip(nn.Module): 268 | def forward(self, x, *args, reverse=False, **kwargs): 269 | x = torch.flip(x, [1]) 270 | if not reverse: 271 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 272 | return x, logdet 273 | else: 274 | return x 275 | 276 | 277 | class ElementwiseAffine(nn.Module): 278 | def __init__(self, channels): 279 | super().__init__() 280 | self.channels = channels 281 | self.m = nn.Parameter(torch.zeros(channels,1)) 282 | self.logs = nn.Parameter(torch.zeros(channels,1)) 283 | 284 | def forward(self, x, x_mask, reverse=False, **kwargs): 285 | if not reverse: 286 | y = self.m + torch.exp(self.logs) * x 287 | y = y * x_mask 288 | logdet = torch.sum(self.logs * x_mask, [1,2]) 289 | return y, logdet 290 | else: 291 | x = (x - self.m) * torch.exp(-self.logs) * x_mask 292 | return x 293 | 294 | 295 | class ResidualCouplingLayer(nn.Module): 296 | def __init__(self, 297 | channels, 298 | hidden_channels, 299 | kernel_size, 300 | dilation_rate, 301 | n_layers, 302 | p_dropout=0, 303 | gin_channels=0, 304 | mean_only=False): 305 | assert channels % 2 == 0, "channels should be divisible by 2" 306 | super().__init__() 307 | self.channels = channels 308 | self.hidden_channels = hidden_channels 309 | self.kernel_size = kernel_size 310 | self.dilation_rate = dilation_rate 311 | self.n_layers = n_layers 312 | self.half_channels = channels // 2 313 | self.mean_only = mean_only 314 | 315 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 316 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) 317 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) 318 | self.post.weight.data.zero_() 319 | self.post.bias.data.zero_() 320 | 321 | def forward(self, x, x_mask, g=None, reverse=False): 322 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 323 | h = self.pre(x0) * x_mask 324 | h = self.enc(h, x_mask, g=g) 325 | stats = self.post(h) * x_mask 326 | if not self.mean_only: 327 | m, logs = torch.split(stats, [self.half_channels]*2, 1) 328 | else: 329 | m = stats 330 | logs = torch.zeros_like(m) 331 | 332 | if not reverse: 333 | x1 = m + x1 * torch.exp(logs) * x_mask 334 | x = torch.cat([x0, x1], 1) 335 | logdet = torch.sum(logs, [1,2]) 336 | return x, logdet 337 | else: 338 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 339 | x = torch.cat([x0, x1], 1) 340 | return x 341 | 342 | 343 | class ConvFlow(nn.Module): 344 | def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0): 345 | super().__init__() 346 | self.in_channels = in_channels 347 | self.filter_channels = filter_channels 348 | self.kernel_size = kernel_size 349 | self.n_layers = n_layers 350 | self.num_bins = num_bins 351 | self.tail_bound = tail_bound 352 | 
self.half_channels = in_channels // 2 353 | 354 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) 355 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.) 356 | self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1) 357 | self.proj.weight.data.zero_() 358 | self.proj.bias.data.zero_() 359 | 360 | def forward(self, x, x_mask, g=None, reverse=False): 361 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 362 | h = self.pre(x0) 363 | h = self.convs(h, x_mask, g=g) 364 | h = self.proj(h) * x_mask 365 | 366 | b, c, t = x0.shape 367 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 368 | 369 | unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels) 370 | unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels) 371 | unnormalized_derivatives = h[..., 2 * self.num_bins:] 372 | 373 | x1, logabsdet = piecewise_rational_quadratic_transform(x1, 374 | unnormalized_widths, 375 | unnormalized_heights, 376 | unnormalized_derivatives, 377 | inverse=reverse, 378 | tails='linear', 379 | tail_bound=self.tail_bound 380 | ) 381 | 382 | x = torch.cat([x0, x1], 1) * x_mask 383 | logdet = torch.sum(logabsdet * x_mask, [1,2]) 384 | if not reverse: 385 | return x, logdet 386 | else: 387 | return x 388 | -------------------------------------------------------------------------------- /monotonic_align/__init__.py: -------------------------------------------------------------------------------- 1 | from numpy import zeros, int32, float32 2 | from torch import from_numpy 3 | 4 | from .core import maximum_path_jit 5 | 6 | def maximum_path(neg_cent, mask): 7 | device = neg_cent.device 8 | dtype = neg_cent.dtype 9 | neg_cent = neg_cent.data.cpu().numpy().astype(float32) 10 | path = zeros(neg_cent.shape, dtype=int32) 11 | 12 | t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(int32) 13 | t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(int32) 14 | maximum_path_jit(path, neg_cent, t_t_max, t_s_max) 15 | return from_numpy(path).to(device=device, dtype=dtype) 16 | -------------------------------------------------------------------------------- /monotonic_align/core.py: -------------------------------------------------------------------------------- 1 | import numba 2 | 3 | 4 | @numba.jit(numba.void(numba.int32[:,:,::1], numba.float32[:,:,::1], numba.int32[::1], numba.int32[::1]), nopython=True, nogil=True) 5 | def maximum_path_jit(paths, values, t_ys, t_xs): 6 | b = paths.shape[0] 7 | max_neg_val=-1e9 8 | for i in range(int(b)): 9 | path = paths[i] 10 | value = values[i] 11 | t_y = t_ys[i] 12 | t_x = t_xs[i] 13 | 14 | v_prev = v_cur = 0.0 15 | index = t_x - 1 16 | 17 | for y in range(t_y): 18 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 19 | if x == y: 20 | v_cur = max_neg_val 21 | else: 22 | v_cur = value[y-1, x] 23 | if x == 0: 24 | if y == 0: 25 | v_prev = 0. 
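# Forward pass of the monotonic-alignment DP: value[y, x] is accumulated in
# place with the best total score of any monotonic path reaching (y, x);
# v_cur comes from keeping the same text position, v_prev from advancing it
# by one. The backtracking loop below then recovers the argmax path into `path`.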
26 | else: 27 | v_prev = max_neg_val 28 | else: 29 | v_prev = value[y-1, x-1] 30 | value[y, x] += max(v_prev, v_cur) 31 | 32 | for y in range(t_y - 1, -1, -1): 33 | path[y, index] = 1 34 | if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]): 35 | index = index - 1 36 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # DO NOT include azure-functions-worker in this file 2 | # The Python Worker is managed by Azure Functions platform 3 | # Manually managing azure-functions-worker may cause unexpected issues 4 | 5 | azure-functions 6 | 7 | numpy==1.22.4 8 | numba 9 | scipy 10 | Unidecode 11 | openjtalk==0.3.0.dev2 12 | jamo 13 | av 14 | -f https://download.pytorch.org/whl/torch_stable.html 15 | torch==1.12.0+cpu 16 | -------------------------------------------------------------------------------- /speak/__init__.py: -------------------------------------------------------------------------------- 1 | import azure.functions as func 2 | 3 | from api import Speaker 4 | 5 | 6 | speaker = Speaker('Yuzu/config.json', 'Yuzu/365_epochs.pth') 7 | 8 | 9 | def main(req: func.HttpRequest) -> func.HttpResponse: 10 | return speaker.main(req) 11 | -------------------------------------------------------------------------------- /speak/function.json: -------------------------------------------------------------------------------- 1 | { 2 | "scriptFile": "__init__.py", 3 | "bindings": [ 4 | { 5 | "authLevel": "function", 6 | "type": "httpTrigger", 7 | "direction": "in", 8 | "name": "req", 9 | "methods": ["get"] 10 | }, 11 | { 12 | "type": "http", 13 | "direction": "out", 14 | "name": "$return" 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /speak2/__init__.py: -------------------------------------------------------------------------------- 1 | import azure.functions as func 2 | 3 | from api import Speaker 4 | 5 | 6 | speaker = Speaker('HamidashiCreative/config.json', 'HamidashiCreative/604_epochs.pth') 7 | 8 | 9 | def main(req: func.HttpRequest) -> func.HttpResponse: 10 | return speaker.main(req) 11 | -------------------------------------------------------------------------------- /speak2/function.json: -------------------------------------------------------------------------------- 1 | { 2 | "scriptFile": "__init__.py", 3 | "bindings": [ 4 | { 5 | "authLevel": "function", 6 | "type": "httpTrigger", 7 | "direction": "in", 8 | "name": "req", 9 | "methods": ["get"] 10 | }, 11 | { 12 | "type": "http", 13 | "direction": "out", 14 | "name": "$return" 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /speakkr/__init__.py: -------------------------------------------------------------------------------- 1 | import azure.functions as func 2 | 3 | from api import Speaker 4 | 5 | 6 | speaker = Speaker('TheFoxAwaitsMe/config.json', 'TheFoxAwaitsMe/1164_epochs.pth') 7 | 8 | 9 | def main(req: func.HttpRequest) -> func.HttpResponse: 10 | return speaker.main(req) 11 | -------------------------------------------------------------------------------- /speakkr/function.json: -------------------------------------------------------------------------------- 1 | { 2 | "scriptFile": "__init__.py", 3 | "bindings": [ 4 | { 5 | "authLevel": "function", 6 | "type": "httpTrigger", 7 | "direction": "in", 8 | "name": "req", 9 | "methods": ["get"] 10 | }, 11 | { 12 | "type": 
"http", 13 | "direction": "out", 14 | "name": "$return" 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /text/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | from text import cleaners 3 | 4 | 5 | def text_to_sequence(text, symbols, cleaner_names): 6 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 7 | Args: 8 | text: string to convert to a sequence 9 | cleaner_names: names of the cleaner functions to run the text through 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | ''' 13 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 14 | 15 | sequence = [] 16 | 17 | clean_text = _clean_text(text, cleaner_names) 18 | for symbol in clean_text: 19 | if symbol not in _symbol_to_id.keys(): 20 | continue 21 | symbol_id = _symbol_to_id[symbol] 22 | sequence += [symbol_id] 23 | return sequence 24 | 25 | 26 | def _clean_text(text, cleaner_names): 27 | for name in cleaner_names: 28 | cleaner = getattr(cleaners, name) 29 | if not cleaner: 30 | raise Exception('Unknown cleaner: %s' % name) 31 | text = cleaner(text) 32 | return text 33 | -------------------------------------------------------------------------------- /text/cleaners.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 
13 | ''' 14 | 15 | import re 16 | from unidecode import unidecode 17 | import pyopenjtalk 18 | from jamo import h2j, j2hcj 19 | 20 | 21 | # This is a list of Korean classifiers preceded by pure Korean numerals. 22 | _korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통' 23 | 24 | # Regular expression matching whitespace: 25 | _whitespace_re = re.compile(r'\s+') 26 | 27 | # Regular expression matching Japanese without punctuation marks: 28 | _japanese_characters = re.compile(r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 29 | 30 | # Regular expression matching non-Japanese characters or punctuation marks: 31 | _japanese_marks = re.compile(r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 32 | 33 | # List of (regular expression, replacement) pairs for abbreviations: 34 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ 35 | ('mrs', 'misess'), 36 | ('mr', 'mister'), 37 | ('dr', 'doctor'), 38 | ('st', 'saint'), 39 | ('co', 'company'), 40 | ('jr', 'junior'), 41 | ('maj', 'major'), 42 | ('gen', 'general'), 43 | ('drs', 'doctors'), 44 | ('rev', 'reverend'), 45 | ('lt', 'lieutenant'), 46 | ('hon', 'honorable'), 47 | ('sgt', 'sergeant'), 48 | ('capt', 'captain'), 49 | ('esq', 'esquire'), 50 | ('ltd', 'limited'), 51 | ('col', 'colonel'), 52 | ('ft', 'fort'), 53 | ]] 54 | 55 | # List of (hangul, hangul divided) pairs: 56 | _hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [ 57 | ('ㄳ', 'ㄱㅅ'), 58 | ('ㄵ', 'ㄴㅈ'), 59 | ('ㄶ', 'ㄴㅎ'), 60 | ('ㄺ', 'ㄹㄱ'), 61 | ('ㄻ', 'ㄹㅁ'), 62 | ('ㄼ', 'ㄹㅂ'), 63 | ('ㄽ', 'ㄹㅅ'), 64 | ('ㄾ', 'ㄹㅌ'), 65 | ('ㄿ', 'ㄹㅍ'), 66 | ('ㅀ', 'ㄹㅎ'), 67 | ('ㅄ', 'ㅂㅅ'), 68 | ('ㅘ', 'ㅗㅏ'), 69 | ('ㅙ', 'ㅗㅐ'), 70 | ('ㅚ', 'ㅗㅣ'), 71 | ('ㅝ', 'ㅜㅓ'), 72 | ('ㅞ', 'ㅜㅔ'), 73 | ('ㅟ', 'ㅜㅣ'), 74 | ('ㅢ', 'ㅡㅣ'), 75 | ('ㅑ', 'ㅣㅏ'), 76 | ('ㅒ', 'ㅣㅐ'), 77 | ('ㅕ', 'ㅣㅓ'), 78 | ('ㅖ', 'ㅣㅔ'), 79 | ('ㅛ', 'ㅣㅗ'), 80 | ('ㅠ', 'ㅣㅜ') 81 | ]] 82 | 83 | # List of (Latin alphabet, hangul) pairs: 84 | _latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 85 | ('a', '에이'), 86 | ('b', '비'), 87 | ('c', '시'), 88 | ('d', '디'), 89 | ('e', '이'), 90 | ('f', '에프'), 91 | ('g', '지'), 92 | ('h', '에이치'), 93 | ('i', '아이'), 94 | ('j', '제이'), 95 | ('k', '케이'), 96 | ('l', '엘'), 97 | ('m', '엠'), 98 | ('n', '엔'), 99 | ('o', '오'), 100 | ('p', '피'), 101 | ('q', '큐'), 102 | ('r', '아르'), 103 | ('s', '에스'), 104 | ('t', '티'), 105 | ('u', '유'), 106 | ('v', '브이'), 107 | ('w', '더블유'), 108 | ('x', '엑스'), 109 | ('y', '와이'), 110 | ('z', '제트') 111 | ]] 112 | 113 | 114 | def expand_abbreviations(text): 115 | for regex, replacement in _abbreviations: 116 | text = re.sub(regex, replacement, text) 117 | return text 118 | 119 | 120 | def lowercase(text): 121 | return text.lower() 122 | 123 | 124 | def collapse_whitespace(text): 125 | return re.sub(_whitespace_re, ' ', text) 126 | 127 | 128 | def convert_to_ascii(text): 129 | return unidecode(text) 130 | 131 | 132 | def latin_to_hangul(text): 133 | for regex, replacement in _latin_to_hangul: 134 | text = re.sub(regex, replacement, text) 135 | return text 136 | 137 | 138 | def divide_hangul(text): 139 | for regex, replacement in _hangul_divided: 140 | text = re.sub(regex, replacement, text) 141 | return text 142 | 143 | 144 | def hangul_number(num, sino=True): 145 | '''Reference https://github.com/Kyubyong/g2pK''' 146 | num = re.sub(',', '', num) 147 | 148 | if num == '0': 149 | return '영' 150 | if not sino and num == '20': 
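# sino=True spells digits with Sino-Korean readings (일, 이, 삼, ...), used
# with most counters; sino=False uses native Korean readings (한, 두, 세, ...),
# where 20 irregularly becomes 스무, as special-cased here.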
151 | return '스무' 152 | 153 | digits = '123456789' 154 | names = '일이삼사오육칠팔구' 155 | digit2name = {d: n for d, n in zip(digits, names)} 156 | 157 | modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉' 158 | decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔' 159 | digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())} 160 | digit2dec = {d: dec for d, dec in zip(digits, decimals.split())} 161 | 162 | spelledout = [] 163 | for i, digit in enumerate(num): 164 | i = len(num) - i - 1 165 | if sino: 166 | if i == 0: 167 | name = digit2name.get(digit, '') 168 | elif i == 1: 169 | name = digit2name.get(digit, '') + '십' 170 | name = name.replace('일십', '십') 171 | else: 172 | if i == 0: 173 | name = digit2mod.get(digit, '') 174 | elif i == 1: 175 | name = digit2dec.get(digit, '') 176 | if digit == '0': 177 | if i % 4 == 0: 178 | last_three = spelledout[-min(3, len(spelledout)):] 179 | if ''.join(last_three) == '': 180 | spelledout.append('') 181 | continue 182 | else: 183 | spelledout.append('') 184 | continue 185 | if i == 2: 186 | name = digit2name.get(digit, '') + '백' 187 | name = name.replace('일백', '백') 188 | elif i == 3: 189 | name = digit2name.get(digit, '') + '천' 190 | name = name.replace('일천', '천') 191 | elif i == 4: 192 | name = digit2name.get(digit, '') + '만' 193 | name = name.replace('일만', '만') 194 | elif i == 5: 195 | name = digit2name.get(digit, '') + '십' 196 | name = name.replace('일십', '십') 197 | elif i == 6: 198 | name = digit2name.get(digit, '') + '백' 199 | name = name.replace('일백', '백') 200 | elif i == 7: 201 | name = digit2name.get(digit, '') + '천' 202 | name = name.replace('일천', '천') 203 | elif i == 8: 204 | name = digit2name.get(digit, '') + '억' 205 | elif i == 9: 206 | name = digit2name.get(digit, '') + '십' 207 | elif i == 10: 208 | name = digit2name.get(digit, '') + '백' 209 | elif i == 11: 210 | name = digit2name.get(digit, '') + '천' 211 | elif i == 12: 212 | name = digit2name.get(digit, '') + '조' 213 | elif i == 13: 214 | name = digit2name.get(digit, '') + '십' 215 | elif i == 14: 216 | name = digit2name.get(digit, '') + '백' 217 | elif i == 15: 218 | name = digit2name.get(digit, '') + '천' 219 | spelledout.append(name) 220 | return ''.join(elem for elem in spelledout) 221 | 222 | 223 | def number_to_hangul(text): 224 | '''Reference https://github.com/Kyubyong/g2pK''' 225 | tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text)) 226 | for token in tokens: 227 | num, classifier = token 228 | if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers: 229 | spelledout = hangul_number(num, sino=False) 230 | else: 231 | spelledout = hangul_number(num, sino=True) 232 | text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}') 233 | # digit by digit for remaining digits 234 | digits = '0123456789' 235 | names = '영일이삼사오육칠팔구' 236 | for d, n in zip(digits, names): 237 | text = text.replace(d, n) 238 | return text 239 | 240 | 241 | def basic_cleaners(text): 242 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 243 | text = lowercase(text) 244 | text = collapse_whitespace(text) 245 | return text 246 | 247 | 248 | def transliteration_cleaners(text): 249 | '''Pipeline for non-English text that transliterates to ASCII.''' 250 | text = convert_to_ascii(text) 251 | text = lowercase(text) 252 | text = collapse_whitespace(text) 253 | return text 254 | 255 | 256 | def japanese_cleaners(text): 257 | '''Pipeline for notating accent in Japanese text. 
258 | Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html'''
259 | sentences = re.split(_japanese_marks, text)
260 | marks = re.findall(_japanese_marks, text)
261 | text = ''
262 | for i, sentence in enumerate(sentences):
263 | if re.match(_japanese_characters, sentence):
264 | if text!='':
265 | text+=' '
266 | labels = pyopenjtalk.extract_fullcontext(sentence)
267 | for n, label in enumerate(labels):
268 | phoneme = re.search(r'\-([^\+]*)\+', label).group(1)
269 | if phoneme not in ['sil','pau']:
270 | text += phoneme.replace('ch','ʧ').replace('sh','ʃ').replace('cl','Q')
271 | else:
272 | continue
273 | n_moras = int(re.search(r'/F:(\d+)_', label).group(1))
274 | a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
275 | a2 = int(re.search(r"\+(\d+)\+", label).group(1))
276 | a3 = int(re.search(r"\+(\d+)/", label).group(1))
277 | if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil','pau']:
278 | a2_next=-1
279 | else:
280 | a2_next = int(re.search(r"\+(\d+)\+", labels[n + 1]).group(1))
281 | # Accent phrase boundary
282 | if a3 == 1 and a2_next == 1:
283 | text += ' '
284 | # Falling
285 | elif a1 == 0 and a2_next == a2 + 1 and a2 != n_moras:
286 | text += '↓'
287 | # Rising
288 | elif a2 == 1 and a2_next == 2:
289 | text += '↑'
290 | if i<len(marks):
291 | text += unidecode(marks[i]).replace(' ','')
292 | if re.match('[A-Za-z]',text[-1]):
293 | text += '.'
294 | return text
295 |
296 |
297 | def japanese_cleaners2(text):
298 | return japanese_cleaners(text).replace('ts','ʦ').replace('...','…')
299 |
300 |
301 | def korean_cleaners(text):
302 | '''Pipeline for Korean text'''
303 | text = latin_to_hangul(text)
304 | text = number_to_hangul(text)
305 | text = j2hcj(h2j(text))
306 | text = divide_hangul(text)
307 | if re.match('[\u3131-\u3163]',text[-1]):
308 | text += '.'
309 | return text
310 |
-------------------------------------------------------------------------------- /transforms.py: --------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn import functional as F
3 |
4 | import numpy as np
5 |
6 |
7 | DEFAULT_MIN_BIN_WIDTH = 1e-3
8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3
9 | DEFAULT_MIN_DERIVATIVE = 1e-3
10 |
11 |
12 | def piecewise_rational_quadratic_transform(inputs,
13 | unnormalized_widths,
14 | unnormalized_heights,
15 | unnormalized_derivatives,
16 | inverse=False,
17 | tails=None,
18 | tail_bound=1.,
19 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
20 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
21 | min_derivative=DEFAULT_MIN_DERIVATIVE):
22 |
23 | if tails is None:
24 | spline_fn = rational_quadratic_spline
25 | spline_kwargs = {}
26 | else:
27 | spline_fn = unconstrained_rational_quadratic_spline
28 | spline_kwargs = {
29 | 'tails': tails,
30 | 'tail_bound': tail_bound
31 | }
32 |
33 | outputs, logabsdet = spline_fn(
34 | inputs=inputs,
35 | unnormalized_widths=unnormalized_widths,
36 | unnormalized_heights=unnormalized_heights,
37 | unnormalized_derivatives=unnormalized_derivatives,
38 | inverse=inverse,
39 | min_bin_width=min_bin_width,
40 | min_bin_height=min_bin_height,
41 | min_derivative=min_derivative,
42 | **spline_kwargs
43 | )
44 | return outputs, logabsdet
45 |
46 |
47 | def searchsorted(bin_locations, inputs, eps=1e-6):
48 | bin_locations[..., -1] += eps
49 | return torch.sum(
50 | inputs[..., None] >= bin_locations,
51 | dim=-1
52 | ) - 1
53 |
54 |
55 | def unconstrained_rational_quadratic_spline(inputs,
56 | unnormalized_widths,
57 | unnormalized_heights,
58 | unnormalized_derivatives,
59 | inverse=False,
60 | tails='linear',
61 | tail_bound=1.,
62 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
63 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
64 | min_derivative=DEFAULT_MIN_DERIVATIVE):
65 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
66 | outside_interval_mask = ~inside_interval_mask
67 |
68 | outputs = torch.zeros_like(inputs)
69 | logabsdet = torch.zeros_like(inputs)
70 |
71 | if tails == 'linear':
72 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
73 | constant = np.log(np.exp(1 - min_derivative) - 1)
74 | unnormalized_derivatives[..., 0] = constant
75 | unnormalized_derivatives[..., -1] = constant
76 |
77 | outputs[outside_interval_mask] = inputs[outside_interval_mask]
78 | logabsdet[outside_interval_mask] = 0
79 | else:
80 | raise RuntimeError('{} tails are not implemented.'.format(tails))
81 |
82 | outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline(
83 | inputs=inputs[inside_interval_mask],
84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
87 | inverse=inverse,
88 | left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound,
89 | min_bin_width=min_bin_width,
90 | min_bin_height=min_bin_height,
91 | min_derivative=min_derivative
92 | )
93 |
94 | return outputs, logabsdet
95 |
96 | def rational_quadratic_spline(inputs,
97 | unnormalized_widths,
98 | unnormalized_heights,
99 | unnormalized_derivatives,
100 | inverse=False,
101 | left=0., right=1., bottom=0., top=1.,
102 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
103 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
104 | min_derivative=DEFAULT_MIN_DERIVATIVE):
105 | if torch.min(inputs) < left or torch.max(inputs) > right:
106 | raise ValueError('Input to a transform is not within its domain')
107 |
108 | num_bins = unnormalized_widths.shape[-1]
109 |
110 | if min_bin_width * num_bins > 1.0:
111 |
raise ValueError('Minimal bin width too large for the number of bins') 112 | if min_bin_height * num_bins > 1.0: 113 | raise ValueError('Minimal bin height too large for the number of bins') 114 | 115 | widths = F.softmax(unnormalized_widths, dim=-1) 116 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths 117 | cumwidths = torch.cumsum(widths, dim=-1) 118 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0) 119 | cumwidths = (right - left) * cumwidths + left 120 | cumwidths[..., 0] = left 121 | cumwidths[..., -1] = right 122 | widths = cumwidths[..., 1:] - cumwidths[..., :-1] 123 | 124 | derivatives = min_derivative + F.softplus(unnormalized_derivatives) 125 | 126 | heights = F.softmax(unnormalized_heights, dim=-1) 127 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights 128 | cumheights = torch.cumsum(heights, dim=-1) 129 | cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0) 130 | cumheights = (top - bottom) * cumheights + bottom 131 | cumheights[..., 0] = bottom 132 | cumheights[..., -1] = top 133 | heights = cumheights[..., 1:] - cumheights[..., :-1] 134 | 135 | if inverse: 136 | bin_idx = searchsorted(cumheights, inputs)[..., None] 137 | else: 138 | bin_idx = searchsorted(cumwidths, inputs)[..., None] 139 | 140 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] 141 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0] 142 | 143 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] 144 | delta = heights / widths 145 | input_delta = delta.gather(-1, bin_idx)[..., 0] 146 | 147 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] 148 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] 149 | 150 | input_heights = heights.gather(-1, bin_idx)[..., 0] 151 | 152 | if inverse: 153 | a = (((inputs - input_cumheights) * (input_derivatives 154 | + input_derivatives_plus_one 155 | - 2 * input_delta) 156 | + input_heights * (input_delta - input_derivatives))) 157 | b = (input_heights * input_derivatives 158 | - (inputs - input_cumheights) * (input_derivatives 159 | + input_derivatives_plus_one 160 | - 2 * input_delta)) 161 | c = - input_delta * (inputs - input_cumheights) 162 | 163 | discriminant = b.pow(2) - 4 * a * c 164 | assert (discriminant >= 0).all() 165 | 166 | root = (2 * c) / (-b - torch.sqrt(discriminant)) 167 | outputs = root * input_bin_widths + input_cumwidths 168 | 169 | theta_one_minus_theta = root * (1 - root) 170 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 171 | * theta_one_minus_theta) 172 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2) 173 | + 2 * input_delta * theta_one_minus_theta 174 | + input_derivatives * (1 - root).pow(2)) 175 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 176 | 177 | return outputs, -logabsdet 178 | else: 179 | theta = (inputs - input_cumwidths) / input_bin_widths 180 | theta_one_minus_theta = theta * (1 - theta) 181 | 182 | numerator = input_heights * (input_delta * theta.pow(2) 183 | + input_derivatives * theta_one_minus_theta) 184 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 185 | * theta_one_minus_theta) 186 | outputs = input_cumheights + numerator / denominator 187 | 188 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2) 189 | + 2 * input_delta * theta_one_minus_theta 190 | + input_derivatives * (1 - theta).pow(2)) 191 | 
logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
192 |
193 | return outputs, logabsdet
194 |
-------------------------------------------------------------------------------- /utils.py: --------------------------------------------------------------------------------
1 | import logging
2 | from json import loads
3 | from torch import load
4 |
5 | from av import open as avopen
6 |
7 |
8 | class HParams():
9 | def __init__(self, **kwargs):
10 | for k, v in kwargs.items():
11 | if type(v) == dict:
12 | v = HParams(**v)
13 | self[k] = v
14 |
15 | def keys(self):
16 | return self.__dict__.keys()
17 |
18 | def items(self):
19 | return self.__dict__.items()
20 |
21 | def values(self):
22 | return self.__dict__.values()
23 |
24 | def __len__(self):
25 | return len(self.__dict__)
26 |
27 | def __getitem__(self, key):
28 | return getattr(self, key)
29 |
30 | def __setitem__(self, key, value):
31 | return setattr(self, key, value)
32 |
33 | def __contains__(self, key):
34 | return key in self.__dict__
35 |
36 | def __repr__(self):
37 | return self.__dict__.__repr__()
38 |
39 |
40 | def load_checkpoint(checkpoint_path, model):
41 | checkpoint_dict = load(checkpoint_path, map_location='cpu')
42 | iteration = checkpoint_dict['iteration']
43 | saved_state_dict = checkpoint_dict['model']
44 | if hasattr(model, 'module'):
45 | state_dict = model.module.state_dict()
46 | else:
47 | state_dict = model.state_dict()
48 | new_state_dict = {}
49 | for k, v in state_dict.items():
50 | try:
51 | new_state_dict[k] = saved_state_dict[k]
52 | except KeyError:
53 | logging.info("%s is not in the checkpoint" % k)
54 | new_state_dict[k] = v
55 | if hasattr(model, 'module'):
56 | model.module.load_state_dict(new_state_dict)
57 | else:
58 | model.load_state_dict(new_state_dict)
59 | logging.info("Loaded checkpoint '{}' (iteration {})".format(
60 | checkpoint_path, iteration))
61 | return
62 |
63 |
64 | def get_hparams_from_file(config_path):
65 | with open(config_path, "r") as f:
66 | data = f.read()
67 | config = loads(data)
68 |
69 | hparams = HParams(**config)
70 | return hparams
71 |
72 | def wav2(i, o, format):
73 | inp = avopen(i, 'rb')
74 | out = avopen(o, 'wb', format=format)
75 | if format == "ogg": format = "libvorbis"
76 |
77 | ostream = out.add_stream(format)
78 |
79 | for frame in inp.decode(audio=0):
80 | for p in ostream.encode(frame): out.mux(p)
81 |
82 | for p in ostream.encode(None): out.mux(p)
83 |
84 | out.close()
85 | inp.close()
86 |
--------------------------------------------------------------------------------
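# ----------------------------------------------------------------------------
# Usage sketch (illustration only, not a file in this repository): how the
# pieces above fit together for CPU inference, roughly what api.py's Speaker
# presumably does behind the HTTP triggers. The config layout (hps.symbols,
# hps.data.*, hps.train.*, hps.model.*) follows the usual VITS convention and
# is an assumption here; the 'Yuzu/...' paths are taken from speak/__init__.py.
import torch
import utils
from models import SynthesizerTrn
from text import text_to_sequence

hps = utils.get_hparams_from_file('Yuzu/config.json')
net_g = SynthesizerTrn(
    len(hps.symbols),                               # n_vocab
    hps.data.filter_length // 2 + 1,                # spec_channels
    hps.train.segment_size // hps.data.hop_length,  # segment_size
    n_speakers=hps.data.n_speakers,                 # assumed config field
    **hps.model)                                    # HParams supports ** via keys()/__getitem__
net_g.eval()
utils.load_checkpoint('Yuzu/365_epochs.pth', net_g)

seq = text_to_sequence('こんにちは。', hps.symbols, hps.data.text_cleaners)
x = torch.LongTensor(seq).unsqueeze(0)              # [1, t]
x_lengths = torch.LongTensor([x.size(1)])
with torch.no_grad():
    audio = net_g.infer(x, x_lengths, noise_scale=.667,
                        noise_scale_w=0.8, length_scale=1)[0][0, 0]
# `audio` is a 1-D float tensor at hps.data.sampling_rate; once written to a
# wav container, utils.wav2 can transcode it to e.g. ogg for the HTTP response.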