├── .funcignore ├── .gitignore ├── .vscode ├── extensions.json ├── launch.json ├── settings.json └── tasks.json ├── README.md ├── api.py ├── attentions.py ├── clean ├── __init__.py └── function.json ├── cleankr ├── __init__.py └── function.json ├── commons.py ├── host.json ├── models.py ├── modules.py ├── monotonic_align ├── __init__.py └── core.py ├── requirements.txt ├── speak ├── __init__.py └── function.json ├── speak2 ├── __init__.py └── function.json ├── speakkr ├── __init__.py └── function.json ├── text ├── LICENSE ├── __init__.py └── cleaners.py ├── transforms.py └── utils.py /.funcignore: -------------------------------------------------------------------------------- 1 | .git* 2 | .vscode 3 | local.settings.json 4 | test 5 | .venv 6 | .DS_Store 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Ww][Ii][Nn]32/ 27 | [Aa][Rr][Mm]/ 28 | [Aa][Rr][Mm]64/ 29 | bld/ 30 | [Bb]in/ 31 | [Oo]bj/ 32 | [Oo]ut/ 33 | [Ll]og/ 34 | [Ll]ogs/ 35 | 36 | # Visual Studio 2015/2017 cache/options directory 37 | .vs/ 38 | # Uncomment if you have tasks that create the project's static files in wwwroot 39 | #wwwroot/ 40 | 41 | # Visual Studio 2017 auto generated files 42 | Generated\ Files/ 43 | 44 | # MSTest test Results 45 | [Tt]est[Rr]esult*/ 46 | [Bb]uild[Ll]og.* 47 | 48 | # NUnit 49 | *.VisualState.xml 50 | TestResult.xml 51 | nunit-*.xml 52 | 53 | # Build Results of an ATL Project 54 | [Dd]ebugPS/ 55 | [Rr]eleasePS/ 56 | dlldata.c 57 | 58 | # Benchmark Results 59 | BenchmarkDotNet.Artifacts/ 60 | 61 | # .NET Core 62 | project.lock.json 63 | project.fragment.lock.json 64 | artifacts/ 65 | 66 | # ASP.NET Scaffolding 67 | ScaffoldingReadMe.txt 68 | 69 | # StyleCop 70 | StyleCopReport.xml 71 | 72 | # Files built by Visual Studio 73 | *_i.c 74 | *_p.c 75 | *_h.h 76 | *.ilk 77 | *.meta 78 | *.obj 79 | *.iobj 80 | *.pch 81 | *.pdb 82 | *.ipdb 83 | *.pgc 84 | *.pgd 85 | *.rsp 86 | *.sbr 87 | *.tlb 88 | *.tli 89 | *.tlh 90 | *.tmp 91 | *.tmp_proj 92 | *_wpftmp.csproj 93 | *.log 94 | *.vspscc 95 | *.vssscc 96 | .builds 97 | *.pidb 98 | *.svclog 99 | *.scc 100 | 101 | # Chutzpah Test files 102 | _Chutzpah* 103 | 104 | # Visual C++ cache files 105 | ipch/ 106 | *.aps 107 | *.ncb 108 | *.opendb 109 | *.opensdf 110 | *.sdf 111 | *.cachefile 112 | *.VC.db 113 | *.VC.VC.opendb 114 | 115 | # Visual Studio profiler 116 | *.psess 117 | *.vsp 118 | *.vspx 119 | *.sap 120 | 121 | # Visual Studio Trace Files 122 | *.e2e 123 | 124 | # TFS 2012 Local Workspace 125 | $tf/ 126 | 127 | # Guidance Automation Toolkit 128 | *.gpState 129 | 130 | # ReSharper is a .NET coding add-in 131 | _ReSharper*/ 132 | *.[Rr]e[Ss]harper 133 | *.DotSettings.user 134 | 135 | # TeamCity is a build add-in 136 | _TeamCity* 137 | 138 | # DotCover is a Code Coverage Tool 139 | *.dotCover 140 | 141 | # AxoCover is a Code Coverage Tool 142 | .axoCover/* 
143 | !.axoCover/settings.json 144 | 145 | # Coverlet is a free, cross platform Code Coverage Tool 146 | coverage*.json 147 | coverage*.xml 148 | coverage*.info 149 | 150 | # Visual Studio code coverage results 151 | *.coverage 152 | *.coveragexml 153 | 154 | # NCrunch 155 | _NCrunch_* 156 | .*crunch*.local.xml 157 | nCrunchTemp_* 158 | 159 | # MightyMoose 160 | *.mm.* 161 | AutoTest.Net/ 162 | 163 | # Web workbench (sass) 164 | .sass-cache/ 165 | 166 | # Installshield output folder 167 | [Ee]xpress/ 168 | 169 | # DocProject is a documentation generator add-in 170 | DocProject/buildhelp/ 171 | DocProject/Help/*.HxT 172 | DocProject/Help/*.HxC 173 | DocProject/Help/*.hhc 174 | DocProject/Help/*.hhk 175 | DocProject/Help/*.hhp 176 | DocProject/Help/Html2 177 | DocProject/Help/html 178 | 179 | # Click-Once directory 180 | publish/ 181 | 182 | # Publish Web Output 183 | *.[Pp]ublish.xml 184 | *.azurePubxml 185 | # Note: Comment the next line if you want to checkin your web deploy settings, 186 | # but database connection strings (with potential passwords) will be unencrypted 187 | *.pubxml 188 | *.publishproj 189 | 190 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 191 | # checkin your Azure Web App publish settings, but sensitive information contained 192 | # in these scripts will be unencrypted 193 | PublishScripts/ 194 | 195 | # NuGet Packages 196 | *.nupkg 197 | # NuGet Symbol Packages 198 | *.snupkg 199 | # The packages folder can be ignored because of Package Restore 200 | **/[Pp]ackages/* 201 | # except build/, which is used as an MSBuild target. 202 | !**/[Pp]ackages/build/ 203 | # Uncomment if necessary however generally it will be regenerated when needed 204 | #!**/[Pp]ackages/repositories.config 205 | # NuGet v3's project.json files produces more ignorable files 206 | *.nuget.props 207 | *.nuget.targets 208 | 209 | # Microsoft Azure Build Output 210 | csx/ 211 | *.build.csdef 212 | 213 | # Microsoft Azure Emulator 214 | ecf/ 215 | rcf/ 216 | 217 | # Windows Store app package directories and files 218 | AppPackages/ 219 | BundleArtifacts/ 220 | Package.StoreAssociation.xml 221 | _pkginfo.txt 222 | *.appx 223 | *.appxbundle 224 | *.appxupload 225 | 226 | # Visual Studio cache files 227 | # files ending in .cache can be ignored 228 | *.[Cc]ache 229 | # but keep track of directories ending in .cache 230 | !?*.[Cc]ache/ 231 | 232 | # Others 233 | ClientBin/ 234 | ~$* 235 | *~ 236 | *.dbmdl 237 | *.dbproj.schemaview 238 | *.jfm 239 | *.pfx 240 | *.publishsettings 241 | orleans.codegen.cs 242 | 243 | # Including strong name files can present a security risk 244 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 245 | #*.snk 246 | 247 | # Since there are multiple workflows, uncomment next line to ignore bower_components 248 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 249 | #bower_components/ 250 | 251 | # RIA/Silverlight projects 252 | Generated_Code/ 253 | 254 | # Backup & report files from converting an old project file 255 | # to a newer Visual Studio version. 
Backup files are not needed, 256 | # because we have git ;-) 257 | _UpgradeReport_Files/ 258 | Backup*/ 259 | UpgradeLog*.XML 260 | UpgradeLog*.htm 261 | ServiceFabricBackup/ 262 | *.rptproj.bak 263 | 264 | # SQL Server files 265 | *.mdf 266 | *.ldf 267 | *.ndf 268 | 269 | # Business Intelligence projects 270 | *.rdl.data 271 | *.bim.layout 272 | *.bim_*.settings 273 | *.rptproj.rsuser 274 | *- [Bb]ackup.rdl 275 | *- [Bb]ackup ([0-9]).rdl 276 | *- [Bb]ackup ([0-9][0-9]).rdl 277 | 278 | # Microsoft Fakes 279 | FakesAssemblies/ 280 | 281 | # GhostDoc plugin setting file 282 | *.GhostDoc.xml 283 | 284 | # Node.js Tools for Visual Studio 285 | .ntvs_analysis.dat 286 | node_modules/ 287 | 288 | # Visual Studio 6 build log 289 | *.plg 290 | 291 | # Visual Studio 6 workspace options file 292 | *.opt 293 | 294 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 295 | *.vbw 296 | 297 | # Visual Studio LightSwitch build output 298 | **/*.HTMLClient/GeneratedArtifacts 299 | **/*.DesktopClient/GeneratedArtifacts 300 | **/*.DesktopClient/ModelManifest.xml 301 | **/*.Server/GeneratedArtifacts 302 | **/*.Server/ModelManifest.xml 303 | _Pvt_Extensions 304 | 305 | # Paket dependency manager 306 | .paket/paket.exe 307 | paket-files/ 308 | 309 | # FAKE - F# Make 310 | .fake/ 311 | 312 | # CodeRush personal settings 313 | .cr/personal 314 | 315 | # Python Tools for Visual Studio (PTVS) 316 | __pycache__/ 317 | *.pyc 318 | 319 | # Cake - Uncomment if you are using it 320 | # tools/** 321 | # !tools/packages.config 322 | 323 | # Tabs Studio 324 | *.tss 325 | 326 | # Telerik's JustMock configuration file 327 | *.jmconfig 328 | 329 | # BizTalk build output 330 | *.btp.cs 331 | *.btm.cs 332 | *.odx.cs 333 | *.xsd.cs 334 | 335 | # OpenCover UI analysis results 336 | OpenCover/ 337 | 338 | # Azure Stream Analytics local run output 339 | ASALocalRun/ 340 | 341 | # MSBuild Binary and Structured Log 342 | *.binlog 343 | 344 | # NVidia Nsight GPU debugger configuration file 345 | *.nvuser 346 | 347 | # MFractors (Xamarin productivity tool) working folder 348 | .mfractor/ 349 | 350 | # Local History for Visual Studio 351 | .localhistory/ 352 | 353 | # BeatPulse healthcheck temp database 354 | healthchecksdb 355 | 356 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 357 | MigrationBackup/ 358 | 359 | # Ionide (cross platform F# VS Code tools) working folder 360 | .ionide/ 361 | 362 | # Fody - auto-generated XML schema 363 | FodyWeavers.xsd 364 | 365 | # build 366 | build 367 | monotonic_align/core.c 368 | *.o 369 | *.so 370 | *.dll 371 | 372 | # Byte-compiled / optimized / DLL files 373 | __pycache__/ 374 | *.py[cod] 375 | *$py.class 376 | 377 | # C extensions 378 | *.so 379 | 380 | # Distribution / packaging 381 | .Python 382 | build/ 383 | develop-eggs/ 384 | dist/ 385 | downloads/ 386 | eggs/ 387 | .eggs/ 388 | lib/ 389 | lib64/ 390 | parts/ 391 | sdist/ 392 | var/ 393 | wheels/ 394 | pip-wheel-metadata/ 395 | share/python-wheels/ 396 | *.egg-info/ 397 | .installed.cfg 398 | *.egg 399 | MANIFEST 400 | 401 | # PyInstaller 402 | # Usually these files are written by a python script from a template 403 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
404 | *.manifest 405 | *.spec 406 | 407 | # Installer logs 408 | pip-log.txt 409 | pip-delete-this-directory.txt 410 | 411 | # Unit test / coverage reports 412 | htmlcov/ 413 | .tox/ 414 | .nox/ 415 | .coverage 416 | .coverage.* 417 | .cache 418 | nosetests.xml 419 | coverage.xml 420 | *.cover 421 | .hypothesis/ 422 | .pytest_cache/ 423 | 424 | # Translations 425 | *.mo 426 | *.pot 427 | 428 | # Django stuff: 429 | *.log 430 | local_settings.py 431 | db.sqlite3 432 | 433 | # Flask stuff: 434 | instance/ 435 | .webassets-cache 436 | 437 | # Scrapy stuff: 438 | .scrapy 439 | 440 | # Sphinx documentation 441 | docs/_build/ 442 | 443 | # PyBuilder 444 | target/ 445 | 446 | # Jupyter Notebook 447 | .ipynb_checkpoints 448 | 449 | # IPython 450 | profile_default/ 451 | ipython_config.py 452 | 453 | # pyenv 454 | .python-version 455 | 456 | # pipenv 457 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 458 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 459 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 460 | # install all needed dependencies. 461 | #Pipfile.lock 462 | 463 | # celery beat schedule file 464 | celerybeat-schedule 465 | 466 | # SageMath parsed files 467 | *.sage.py 468 | 469 | # Environments 470 | .env 471 | .venv 472 | env/ 473 | venv/ 474 | ENV/ 475 | env.bak/ 476 | venv.bak/ 477 | 478 | # Spyder project settings 479 | .spyderproject 480 | .spyproject 481 | 482 | # Rope project settings 483 | .ropeproject 484 | 485 | # mkdocs documentation 486 | /site 487 | 488 | # mypy 489 | .mypy_cache/ 490 | .dmypy.json 491 | dmypy.json 492 | 493 | # Pyre type checker 494 | .pyre/ 495 | 496 | # Azure Functions artifacts 497 | bin 498 | obj 499 | appsettings.json 500 | local.settings.json 501 | 502 | # Azurite artifacts 503 | __blobstorage__ 504 | __queuestorage__ 505 | __azurite_db*__.json 506 | .python_packages 507 | 508 | .DS_Store 509 | 510 | *.pth 511 | config.json 512 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "ms-azuretools.vscode-azurefunctions", 4 | "ms-python.python" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Attach to Python Functions", 6 | "type": "python", 7 | "request": "attach", 8 | "port": 9091, 9 | "preLaunchTask": "func: host start" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "azureFunctions.deploySubpath": ".", 3 | "azureFunctions.scmDoBuildDuringDeployment": true, 4 | "azureFunctions.pythonVenv": ".venv", 5 | "azureFunctions.projectLanguage": "Python", 6 | "azureFunctions.projectRuntime": "~4", 7 | "debug.internalConsoleOptions": "neverOpen" 8 | } -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0.0", 3 | "tasks": [ 4 | { 5 | "type": "func", 6 | "command": "host start", 7 | "problemMatcher": 
"$func-python-watch", 8 | "isBackground": true, 9 | "dependsOn": "pip install (functions)" 10 | }, 11 | { 12 | "label": "pip install (functions)", 13 | "type": "shell", 14 | "osx": { 15 | "command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple" 16 | }, 17 | "windows": { 18 | "command": "${config:azureFunctions.pythonVenv}\\Scripts\\python -m pip install -r requirements.txt" 19 | }, 20 | "linux": { 21 | "command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt" 22 | }, 23 | "problemMatcher": [] 24 | } 25 | ] 26 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MoeGoe Azure Cloud Function API 2 | See [MoeGoe](https://github.com/CjangCjengh/MoeGoe) 3 | 4 | ## Japanese 5 | 6 | > Nene + Meguru + Yoshino + Mako + Murasame + Koharu + Nanami 7 | 8 | - GET https://moegoe.azurewebsites.net/api/speak?text=これは一つ簡単なテストです&id=0 9 | 10 | return ogg file in body 11 | 12 | - GET https://moegoe.azurewebsites.net/api/clean?text=これは一つ簡単なテストです 13 | 14 | return cleaned text in body 15 | 16 | ``` 17 | ko↑rewa hI↑to↓tsU ka↑NtaNna te↓sUtodesU. 18 | ``` 19 | 20 | - GET https://moegoe.azurewebsites.net/api/speak?cleantext=ko↑rewahI↑totsUka↑NtaNnate↓sUtodesU.&id=1 21 | 22 | return ogg file in body 23 | 24 | | ID | Speaker | 25 | | ---- | ---- | 26 | | 0 | 綾地寧々 | 27 | | 1 | 因幡めぐる | 28 | | 2 | 朝武芳乃 | 29 | | 3 | 常陸茉子 | 30 | | 4 | ムラサメ | 31 | | 5 | 鞍馬小春 | 32 | | 6 | 在原七海 | 33 | 34 | > HamidashiCreative 35 | 36 | replace`speak`to`speak2` 37 | 38 | | ID | Speaker | 39 | | ---- | ---- | 40 | | 0 | 和泉妃愛 | 41 | | 1 | 常盤華乃 | 42 | | 2 | 錦あすみ | 43 | | 3 | 鎌倉詩桜 | 44 | | 4 | 竜閑天梨 | 45 | | 5 | 和泉里 | 46 | | 6 | 新川広夢 | 47 | | 7 | 聖莉々子 | 48 | 49 | 50 | ## Korean 51 | 52 | > Sua + Mimiru + Arin + Yeonhwa + Yuhwa + Seonbae 53 | 54 | - GET https://moegoe.azurewebsites.net/api/speakkr?text=이것은%20간단한%20테스트이다&id=0 55 | 56 | return ogg file in body 57 | 58 | - GET https://moegoe.azurewebsites.net/api/cleankr?text=이것은%20간단한%20테스트이다 59 | 60 | return cleaned text in body 61 | 62 | ``` 63 | ㅇㅣㄱㅓㅅㅇㅡㄴ ㄱㅏㄴㄷㅏㄴㅎㅏㄴ ㅌㅔㅅㅡㅌㅡㅇㅣㄷㅏ. 
64 | ``` 65 | 66 | - GET https://moegoe.azurewebsites.net/api/speakkr?cleantext=ㅇㅣㄱㅓㅅㅇㅡㄴ%20ㄱㅏㄴㄷㅏㄴㅎㅏㄴ%20ㅌㅔㅅㅡㅌㅡㅇㅣㄷㅏ.&id=1 67 | 68 | returns an OGG file in the response body 69 | 70 | | ID | Speaker | 71 | | ---- | ---- | 72 | | 0 | 수아 | 73 | | 1 | 미미르 | 74 | | 2 | 아린 | 75 | | 3 | 연화 | 76 | | 4 | 유화 | 77 | | 5 | 선배 | 78 | 79 | ## Optional Parameters 80 | 81 | ### speak 82 | - **format**: `ogg` (default), `mp3`, or `wav` 83 | -------------------------------------------------------------------------------- /api.py: -------------------------------------------------------------------------------- 1 | import azure.functions as func 2 | 3 | from io import BytesIO 4 | from pathlib import Path 5 | from torch import no_grad, LongTensor 6 | 7 | import commons 8 | from utils import load_checkpoint, get_hparams_from_file, wav2 9 | from models import SynthesizerTrn 10 | from text import text_to_sequence, _clean_text 11 | from urllib.parse import unquote 12 | 13 | from scipy.io.wavfile import write 14 | 15 | 16 | class Cleaner(): 17 | def __init__(self, configfile: str): 18 | self.cleanernames = get_hparams_from_file(str(Path(__file__).parent/configfile)).data.text_cleaners 19 | 20 | def main(self, req: func.HttpRequest) -> func.HttpResponse: 21 | text = req.params.get('text') 22 | if not text: 23 | return func.HttpResponse( 24 | "400 BAD REQUEST: null text", 25 | status_code=400 26 | ) 27 | try: 28 | return func.HttpResponse( 29 | _clean_text(unquote(text), self.cleanernames), 30 | status_code=200 31 | ) 32 | except Exception: 33 | return func.HttpResponse( 34 | "400 BAD REQUEST: invalid text", 35 | status_code=400 36 | ) 37 | 38 | 39 | class Speaker(): 40 | def __init__(self, configfile: str, pthfile: str): 41 | self.hps_ms = get_hparams_from_file(str(Path(__file__).parent/configfile)) 42 | self.net_g_ms = SynthesizerTrn( 43 | len(self.hps_ms.symbols), 44 | self.hps_ms.data.filter_length // 2 + 1, 45 | self.hps_ms.train.segment_size // self.hps_ms.data.hop_length, 46 | n_speakers=self.hps_ms.data.n_speakers, 47 | **self.hps_ms.model) 48 | _ = self.net_g_ms.eval() 49 | load_checkpoint(str(Path(__file__).parent/pthfile), self.net_g_ms) 50 | 51 | def get_text(self, text: str, cleaned=False): 52 | if cleaned: 53 | text_norm = text_to_sequence(text, self.hps_ms.symbols, []) 54 | else: 55 | text_norm = text_to_sequence(text, self.hps_ms.symbols, self.hps_ms.data.text_cleaners) 56 | if self.hps_ms.data.add_blank: 57 | text_norm = commons.intersperse(text_norm, 0) 58 | text_norm = LongTensor(text_norm) 59 | return text_norm 60 | 61 | def main(self, req: func.HttpRequest) -> func.HttpResponse: 62 | text = req.params.get('text') 63 | cleantext = req.params.get('cleantext') 64 | if not text and not cleantext: 65 | return func.HttpResponse( 66 | "400 BAD REQUEST: null text", 67 | status_code=400 68 | ) 69 | if text and cleantext: 70 | return func.HttpResponse( 71 | "400 BAD REQUEST: text and cleantext cannot both be set", 72 | status_code=400 73 | ) 74 | cleaned = False 75 | if cleantext: 76 | cleaned = True 77 | text = cleantext 78 | speaker_id = req.params.get('id') 79 | if not speaker_id: 80 | return func.HttpResponse( 81 | "400 BAD REQUEST: null speaker id", 82 | status_code=400 83 | ) 84 | try: 85 | speaker_id = int(speaker_id) 86 | except Exception: 87 | return func.HttpResponse( 88 | "400 BAD REQUEST: invalid speaker id", 89 | status_code=400 90 | ) 91 | if speaker_id not in range(self.hps_ms.data.n_speakers): 92 | return func.HttpResponse( 93 | "400 BAD REQUEST: speaker id out of range", 94 | status_code=400 95 | ) 96 | format =
req.params.get('format') 97 | if not format: format = "ogg" 98 | if format not in ("ogg", "mp3", "wav"): 99 | return func.HttpResponse( 100 | "400 BAD REQUEST: invalid format", 101 | status_code=400 102 | ) 103 | try: 104 | stn_tst = self.get_text(unquote(text), cleaned) 105 | except Exception: 106 | return func.HttpResponse( 107 | "400 BAD REQUEST: invalid text", 108 | status_code=400 109 | ) 110 | try: 111 | with no_grad(): 112 | x_tst = stn_tst.unsqueeze(0) 113 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 114 | sid = LongTensor([speaker_id]) 115 | audio = self.net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy() 116 | with BytesIO() as f: 117 | write(f, self.hps_ms.data.sampling_rate, audio) 118 | if format == "wav": 119 | return func.HttpResponse( 120 | f.getvalue(), 121 | status_code=200, 122 | mimetype="audio/wav", 123 | ) 124 | else: 125 | f.seek(0, 0) 126 | with BytesIO() as ofp: 127 | wav2(f, ofp, format) 128 | return func.HttpResponse( 129 | ofp.getvalue(), 130 | status_code=200, 131 | mimetype="audio/mpeg" if format == "mp3" else "audio/ogg", 132 | ) 133 | except Exception as e: 134 | return func.HttpResponse( 135 | "500 Internal Server Error\n"+str(e), 136 | status_code=500, 137 | ) 138 | -------------------------------------------------------------------------------- /attentions.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import commons 7 | from modules import LayerNorm 8 | 9 | 10 | class Encoder(nn.Module): 11 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs): 12 | super().__init__() 13 | self.hidden_channels = hidden_channels 14 | self.filter_channels = filter_channels 15 | self.n_heads = n_heads 16 | self.n_layers = n_layers 17 | self.kernel_size = kernel_size 18 | self.p_dropout = p_dropout 19 | self.window_size = window_size 20 | 21 | self.drop = nn.Dropout(p_dropout) 22 | self.attn_layers = nn.ModuleList() 23 | self.norm_layers_1 = nn.ModuleList() 24 | self.ffn_layers = nn.ModuleList() 25 | self.norm_layers_2 = nn.ModuleList() 26 | for i in range(self.n_layers): 27 | self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size)) 28 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 29 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)) 30 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 31 | 32 | def forward(self, x, x_mask): 33 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 34 | x = x * x_mask 35 | for i in range(self.n_layers): 36 | y = self.attn_layers[i](x, x, attn_mask) 37 | y = self.drop(y) 38 | x = self.norm_layers_1[i](x + y) 39 | 40 | y = self.ffn_layers[i](x, x_mask) 41 | y = self.drop(y) 42 | x = self.norm_layers_2[i](x + y) 43 | x = x * x_mask 44 | return x 45 | 46 | 47 | class Decoder(nn.Module): 48 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs): 49 | super().__init__() 50 | self.hidden_channels = hidden_channels 51 | self.filter_channels = filter_channels 52 | self.n_heads = n_heads 53 | self.n_layers = n_layers 54 | self.kernel_size = kernel_size 55 | self.p_dropout = p_dropout 56 | self.proximal_bias =
proximal_bias 57 | self.proximal_init = proximal_init 58 | 59 | self.drop = nn.Dropout(p_dropout) 60 | self.self_attn_layers = nn.ModuleList() 61 | self.norm_layers_0 = nn.ModuleList() 62 | self.encdec_attn_layers = nn.ModuleList() 63 | self.norm_layers_1 = nn.ModuleList() 64 | self.ffn_layers = nn.ModuleList() 65 | self.norm_layers_2 = nn.ModuleList() 66 | for i in range(self.n_layers): 67 | self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)) 68 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 69 | self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)) 70 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 71 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) 72 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 73 | 74 | def forward(self, x, x_mask, h, h_mask): 75 | """ 76 | x: decoder input 77 | h: encoder output 78 | """ 79 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) 80 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 81 | x = x * x_mask 82 | for i in range(self.n_layers): 83 | y = self.self_attn_layers[i](x, x, self_attn_mask) 84 | y = self.drop(y) 85 | x = self.norm_layers_0[i](x + y) 86 | 87 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) 88 | y = self.drop(y) 89 | x = self.norm_layers_1[i](x + y) 90 | 91 | y = self.ffn_layers[i](x, x_mask) 92 | y = self.drop(y) 93 | x = self.norm_layers_2[i](x + y) 94 | x = x * x_mask 95 | return x 96 | 97 | 98 | class MultiHeadAttention(nn.Module): 99 | def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False): 100 | super().__init__() 101 | assert channels % n_heads == 0 102 | 103 | self.channels = channels 104 | self.out_channels = out_channels 105 | self.n_heads = n_heads 106 | self.p_dropout = p_dropout 107 | self.window_size = window_size 108 | self.heads_share = heads_share 109 | self.block_length = block_length 110 | self.proximal_bias = proximal_bias 111 | self.proximal_init = proximal_init 112 | self.attn = None 113 | 114 | self.k_channels = channels // n_heads 115 | self.conv_q = nn.Conv1d(channels, channels, 1) 116 | self.conv_k = nn.Conv1d(channels, channels, 1) 117 | self.conv_v = nn.Conv1d(channels, channels, 1) 118 | self.conv_o = nn.Conv1d(channels, out_channels, 1) 119 | self.drop = nn.Dropout(p_dropout) 120 | 121 | if window_size is not None: 122 | n_heads_rel = 1 if heads_share else n_heads 123 | rel_stddev = self.k_channels**-0.5 124 | self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 125 | self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 126 | 127 | nn.init.xavier_uniform_(self.conv_q.weight) 128 | nn.init.xavier_uniform_(self.conv_k.weight) 129 | nn.init.xavier_uniform_(self.conv_v.weight) 130 | if proximal_init: 131 | with torch.no_grad(): 132 | self.conv_k.weight.copy_(self.conv_q.weight) 133 | self.conv_k.bias.copy_(self.conv_q.bias) 134 | 135 | def forward(self, x, c, attn_mask=None): 136 | q = self.conv_q(x) 137 | k = self.conv_k(c) 138 | v = self.conv_v(c) 139 | 140 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 141 | 142 | x = self.conv_o(x) 143 | return 
x 144 | 145 | def attention(self, query, key, value, mask=None): 146 | # reshape [b, d, t] -> [b, n_h, t, d_k] 147 | b, d, t_s, t_t = (*key.size(), query.size(2)) 148 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 149 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 150 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 151 | 152 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) 153 | if self.window_size is not None: 154 | assert t_s == t_t, "Relative attention is only available for self-attention." 155 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 156 | rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings) 157 | scores_local = self._relative_position_to_absolute_position(rel_logits) 158 | scores = scores + scores_local 159 | if self.proximal_bias: 160 | assert t_s == t_t, "Proximal bias is only available for self-attention." 161 | scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) 162 | if mask is not None: 163 | scores = scores.masked_fill(mask == 0, -1e4) 164 | if self.block_length is not None: 165 | assert t_s == t_t, "Local attention is only available for self-attention." 166 | block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) 167 | scores = scores.masked_fill(block_mask == 0, -1e4) 168 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 169 | p_attn = self.drop(p_attn) 170 | output = torch.matmul(p_attn, value) 171 | if self.window_size is not None: 172 | relative_weights = self._absolute_position_to_relative_position(p_attn) 173 | value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) 174 | output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) 175 | output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] 176 | return output, p_attn 177 | 178 | def _matmul_with_relative_values(self, x, y): 179 | """ 180 | x: [b, h, l, m] 181 | y: [h or 1, m, d] 182 | ret: [b, h, l, d] 183 | """ 184 | ret = torch.matmul(x, y.unsqueeze(0)) 185 | return ret 186 | 187 | def _matmul_with_relative_keys(self, x, y): 188 | """ 189 | x: [b, h, l, d] 190 | y: [h or 1, m, d] 191 | ret: [b, h, l, m] 192 | """ 193 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 194 | return ret 195 | 196 | def _get_relative_embeddings(self, relative_embeddings, length): 197 | max_relative_position = 2 * self.window_size + 1 198 | # Pad first before slice to avoid using cond ops. 199 | pad_length = max(length - (self.window_size + 1), 0) 200 | slice_start_position = max((self.window_size + 1) - length, 0) 201 | slice_end_position = slice_start_position + 2 * length - 1 202 | if pad_length > 0: 203 | padded_relative_embeddings = F.pad( 204 | relative_embeddings, 205 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) 206 | else: 207 | padded_relative_embeddings = relative_embeddings 208 | used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position] 209 | return used_relative_embeddings 210 | 211 | def _relative_position_to_absolute_position(self, x): 212 | """ 213 | x: [b, h, l, 2*l-1] 214 | ret: [b, h, l, l] 215 | """ 216 | batch, heads, length, _ = x.size() 217 | # Concat columns of pad to shift from relative to absolute indexing. 
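# Padding one extra column widens each row from 2*l-1 to 2*l entries; flattening
# then offsets successive rows by one slot, so the reshape-and-slice below reads
# relative index (j - i + l - 1) out at absolute column j without any gather ops.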
218 | x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]])) 219 | 220 | # Concat extra elements so as to add up to shape (len+1, 2*len-1). 221 | x_flat = x.view([batch, heads, length * 2 * length]) 222 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]])) 223 | 224 | # Reshape and slice out the padded elements. 225 | x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:] 226 | return x_final 227 | 228 | def _absolute_position_to_relative_position(self, x): 229 | """ 230 | x: [b, h, l, l] 231 | ret: [b, h, l, 2*l-1] 232 | """ 233 | batch, heads, length, _ = x.size() 234 | # pad along column 235 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]])) 236 | x_flat = x.view([batch, heads, length**2 + length*(length -1)]) 237 | # add 0's in the beginning that will skew the elements after reshape 238 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 239 | x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:] 240 | return x_final 241 | 242 | def _attention_bias_proximal(self, length): 243 | """Bias for self-attention to encourage attention to close positions. 244 | Args: 245 | length: an integer scalar. 246 | Returns: 247 | a Tensor with shape [1, 1, length, length] 248 | """ 249 | r = torch.arange(length, dtype=torch.float32) 250 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 251 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 252 | 253 | 254 | class FFN(nn.Module): 255 | def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False): 256 | super().__init__() 257 | self.in_channels = in_channels 258 | self.out_channels = out_channels 259 | self.filter_channels = filter_channels 260 | self.kernel_size = kernel_size 261 | self.p_dropout = p_dropout 262 | self.activation = activation 263 | self.causal = causal 264 | 265 | if causal: 266 | self.padding = self._causal_padding 267 | else: 268 | self.padding = self._same_padding 269 | 270 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) 271 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) 272 | self.drop = nn.Dropout(p_dropout) 273 | 274 | def forward(self, x, x_mask): 275 | x = self.conv_1(self.padding(x * x_mask)) 276 | if self.activation == "gelu": 277 | x = x * torch.sigmoid(1.702 * x) 278 | else: 279 | x = torch.relu(x) 280 | x = self.drop(x) 281 | x = self.conv_2(self.padding(x * x_mask)) 282 | return x * x_mask 283 | 284 | def _causal_padding(self, x): 285 | if self.kernel_size == 1: 286 | return x 287 | pad_l = self.kernel_size - 1 288 | pad_r = 0 289 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 290 | x = F.pad(x, commons.convert_pad_shape(padding)) 291 | return x 292 | 293 | def _same_padding(self, x): 294 | if self.kernel_size == 1: 295 | return x 296 | pad_l = (self.kernel_size - 1) // 2 297 | pad_r = self.kernel_size // 2 298 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 299 | x = F.pad(x, commons.convert_pad_shape(padding)) 300 | return x 301 | -------------------------------------------------------------------------------- /clean/__init__.py: -------------------------------------------------------------------------------- 1 | import azure.functions as func 2 | 3 | from api import Cleaner 4 | 5 | 6 | cleaner = Cleaner('Yuzu/config.json') 7 | 8 | 9 | def main(req: func.HttpRequest) -> func.HttpResponse: 10 | return cleaner.main(req) 11 | 
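Each HTTP-trigger folder in this app (`clean`, `cleankr`, `speak`, `speak2`, `speakkr`) follows the pattern of `clean/__init__.py` above: it builds a `Cleaner` or `Speaker` from `api.py` at import time, so the model checkpoint is loaded once per worker rather than once per request. A minimal client for the endpoints documented in the README might look like the sketch below; the host URL is taken from the README, while the function key, speaker id, and output filename are placeholders (the triggers use `"authLevel": "function"`, so a deployed app expects a `code` query parameter).

```python
# Hypothetical client sketch: HOST comes from the README; FUNC_KEY must be
# replaced with a real function key from your own deployment.
import requests

HOST = "https://moegoe.azurewebsites.net/api"
FUNC_KEY = "<your-function-key>"

def speak(text: str, speaker_id: int, fmt: str = "ogg") -> bytes:
    """Synthesize `text` with the given speaker id; returns raw audio bytes."""
    resp = requests.get(
        f"{HOST}/speak",
        params={"text": text, "id": speaker_id, "format": fmt, "code": FUNC_KEY},
    )
    resp.raise_for_status()  # non-200 responses carry a plain-text error message
    return resp.content

audio = speak("これは一つ簡単なテストです", 0)
with open("test.ogg", "wb") as f:
    f.write(audio)
```

Note that `requests` URL-encodes the query parameters itself, so the Japanese and Korean examples from the README can be passed as plain strings.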
-------------------------------------------------------------------------------- /clean/function.json: -------------------------------------------------------------------------------- 1 | { 2 | "scriptFile": "__init__.py", 3 | "bindings": [ 4 | { 5 | "authLevel": "function", 6 | "type": "httpTrigger", 7 | "direction": "in", 8 | "name": "req", 9 | "methods": ["get"] 10 | }, 11 | { 12 | "type": "http", 13 | "direction": "out", 14 | "name": "$return" 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /cleankr/__init__.py: -------------------------------------------------------------------------------- 1 | import azure.functions as func 2 | 3 | from api import Cleaner 4 | 5 | 6 | cleaner = Cleaner('TheFoxAwaitsMe/config.json') 7 | 8 | 9 | def main(req: func.HttpRequest) -> func.HttpResponse: 10 | return cleaner.main(req) 11 | -------------------------------------------------------------------------------- /cleankr/function.json: -------------------------------------------------------------------------------- 1 | { 2 | "scriptFile": "__init__.py", 3 | "bindings": [ 4 | { 5 | "authLevel": "function", 6 | "type": "httpTrigger", 7 | "direction": "in", 8 | "name": "req", 9 | "methods": ["get"] 10 | }, 11 | { 12 | "type": "http", 13 | "direction": "out", 14 | "name": "$return" 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /commons.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | 5 | def init_weights(m, mean=0.0, std=0.01): 6 | classname = m.__class__.__name__ 7 | if classname.find("Conv") != -1: 8 | m.weight.data.normal_(mean, std) 9 | 10 | 11 | def get_padding(kernel_size, dilation=1): 12 | return int((kernel_size*dilation - dilation)/2) 13 | 14 | 15 | def intersperse(lst, item): 16 | result = [item] * (len(lst) * 2 + 1) 17 | result[1::2] = lst 18 | return result 19 | 20 | 21 | def slice_segments(x, ids_str, segment_size=4): 22 | ret = torch.zeros_like(x[:, :, :segment_size]) 23 | for i in range(x.size(0)): 24 | idx_str = ids_str[i] 25 | idx_end = idx_str + segment_size 26 | ret[i] = x[i, :, idx_str:idx_end] 27 | return ret 28 | 29 | 30 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 31 | b, d, t = x.size() 32 | if x_lengths is None: 33 | x_lengths = t 34 | ids_str_max = x_lengths - segment_size + 1 35 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 36 | ret = slice_segments(x, ids_str, segment_size) 37 | return ret, ids_str 38 | 39 | 40 | def subsequent_mask(length): 41 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 42 | return mask 43 | 44 | 45 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 46 | n_channels_int = n_channels[0] 47 | in_act = input_a + input_b 48 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 49 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 50 | acts = t_act * s_act 51 | return acts 52 | 53 | 54 | def convert_pad_shape(pad_shape): 55 | l = pad_shape[::-1] 56 | pad_shape = [item for sublist in l for item in sublist] 57 | return pad_shape 58 | 59 | 60 | def sequence_mask(length, max_length=None): 61 | if max_length is None: 62 | max_length = length.max() 63 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 64 | return x.unsqueeze(0) < length.unsqueeze(1) 65 | 66 | 67 | def generate_path(duration, mask): 68 | """ 69 | duration: [b, 1, 
t_x] 70 | mask: [b, 1, t_y, t_x] 71 | """ 72 | 73 | b, _, t_y, t_x = mask.shape 74 | cum_duration = torch.cumsum(duration, -1) 75 | 76 | cum_duration_flat = cum_duration.view(b * t_x) 77 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 78 | path = path.view(b, t_x, t_y) 79 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 80 | path = path.unsqueeze(1).transpose(2,3) * mask 81 | return path 82 | -------------------------------------------------------------------------------- /host.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0", 3 | "logging": { 4 | "applicationInsights": { 5 | "samplingSettings": { 6 | "isEnabled": true, 7 | "excludedTypes": "Request" 8 | } 9 | } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import commons 7 | import modules 8 | import attentions 9 | import monotonic_align 10 | 11 | from torch.nn import Conv1d, ConvTranspose1d, Conv2d 12 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 13 | from commons import init_weights, get_padding 14 | 15 | 16 | class StochasticDurationPredictor(nn.Module): 17 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0): 18 | super().__init__() 19 | filter_channels = in_channels # it needs to be removed in a future version. 20 | self.in_channels = in_channels 21 | self.filter_channels = filter_channels 22 | self.kernel_size = kernel_size 23 | self.p_dropout = p_dropout 24 | self.n_flows = n_flows 25 | self.gin_channels = gin_channels 26 | 27 | self.log_flow = modules.Log() 28 | self.flows = nn.ModuleList() 29 | self.flows.append(modules.ElementwiseAffine(2)) 30 | for i in range(n_flows): 31 | self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) 32 | self.flows.append(modules.Flip()) 33 | 34 | self.post_pre = nn.Conv1d(1, filter_channels, 1) 35 | self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) 36 | self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) 37 | self.post_flows = nn.ModuleList() 38 | self.post_flows.append(modules.ElementwiseAffine(2)) 39 | for i in range(4): 40 | self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) 41 | self.post_flows.append(modules.Flip()) 42 | 43 | self.pre = nn.Conv1d(in_channels, filter_channels, 1) 44 | self.proj = nn.Conv1d(filter_channels, filter_channels, 1) 45 | self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) 46 | if gin_channels != 0: 47 | self.cond = nn.Conv1d(gin_channels, filter_channels, 1) 48 | 49 | def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0): 50 | x = torch.detach(x) 51 | x = self.pre(x) 52 | if g is not None: 53 | g = torch.detach(g) 54 | x = x + self.cond(g) 55 | x = self.convs(x, x_mask) 56 | x = self.proj(x) * x_mask 57 | 58 | if not reverse: 59 | flows = self.flows 60 | assert w is not None 61 | 62 | logdet_tot_q = 0 63 | h_w = self.post_pre(w) 64 | h_w = self.post_convs(h_w, x_mask) 65 | h_w = self.post_proj(h_w) * x_mask 66 | e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask 67 | z_q = e_q 68 | for flow in self.post_flows: 69 | 
z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w)) 70 | logdet_tot_q += logdet_q 71 | z_u, z1 = torch.split(z_q, [1, 1], 1) 72 | u = torch.sigmoid(z_u) * x_mask 73 | z0 = (w - u) * x_mask 74 | logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2]) 75 | logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q 76 | 77 | logdet_tot = 0 78 | z0, logdet = self.log_flow(z0, x_mask) 79 | logdet_tot += logdet 80 | z = torch.cat([z0, z1], 1) 81 | for flow in flows: 82 | z, logdet = flow(z, x_mask, g=x, reverse=reverse) 83 | logdet_tot = logdet_tot + logdet 84 | nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot 85 | return nll + logq # [b] 86 | else: 87 | flows = list(reversed(self.flows)) 88 | flows = flows[:-2] + [flows[-1]] # remove a useless vflow 89 | z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale 90 | for flow in flows: 91 | z = flow(z, x_mask, g=x, reverse=reverse) 92 | z0, z1 = torch.split(z, [1, 1], 1) 93 | logw = z0 94 | return logw 95 | 96 | 97 | class DurationPredictor(nn.Module): 98 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0): 99 | super().__init__() 100 | 101 | self.in_channels = in_channels 102 | self.filter_channels = filter_channels 103 | self.kernel_size = kernel_size 104 | self.p_dropout = p_dropout 105 | self.gin_channels = gin_channels 106 | 107 | self.drop = nn.Dropout(p_dropout) 108 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2) 109 | self.norm_1 = modules.LayerNorm(filter_channels) 110 | self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2) 111 | self.norm_2 = modules.LayerNorm(filter_channels) 112 | self.proj = nn.Conv1d(filter_channels, 1, 1) 113 | 114 | if gin_channels != 0: 115 | self.cond = nn.Conv1d(gin_channels, in_channels, 1) 116 | 117 | def forward(self, x, x_mask, g=None): 118 | x = torch.detach(x) 119 | if g is not None: 120 | g = torch.detach(g) 121 | x = x + self.cond(g) 122 | x = self.conv_1(x * x_mask) 123 | x = torch.relu(x) 124 | x = self.norm_1(x) 125 | x = self.drop(x) 126 | x = self.conv_2(x * x_mask) 127 | x = torch.relu(x) 128 | x = self.norm_2(x) 129 | x = self.drop(x) 130 | x = self.proj(x * x_mask) 131 | return x * x_mask 132 | 133 | 134 | class TextEncoder(nn.Module): 135 | def __init__(self, 136 | n_vocab, 137 | out_channels, 138 | hidden_channels, 139 | filter_channels, 140 | n_heads, 141 | n_layers, 142 | kernel_size, 143 | p_dropout): 144 | super().__init__() 145 | self.n_vocab = n_vocab 146 | self.out_channels = out_channels 147 | self.hidden_channels = hidden_channels 148 | self.filter_channels = filter_channels 149 | self.n_heads = n_heads 150 | self.n_layers = n_layers 151 | self.kernel_size = kernel_size 152 | self.p_dropout = p_dropout 153 | 154 | self.emb = nn.Embedding(n_vocab, hidden_channels) 155 | nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) 156 | 157 | self.encoder = attentions.Encoder( 158 | hidden_channels, 159 | filter_channels, 160 | n_heads, 161 | n_layers, 162 | kernel_size, 163 | p_dropout) 164 | self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1) 165 | 166 | def forward(self, x, x_lengths): 167 | x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h] 168 | x = torch.transpose(x, 1, -1) # [b, h, t] 169 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 170 | 171 | x = self.encoder(x * x_mask, x_mask) 
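# self.proj outputs 2 * out_channels channels; the split below yields the
# prior mean m and log-standard-deviation logs for each text position.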
172 | stats = self.proj(x) * x_mask 173 | 174 | m, logs = torch.split(stats, self.out_channels, dim=1) 175 | return x, m, logs, x_mask 176 | 177 | 178 | class ResidualCouplingBlock(nn.Module): 179 | def __init__(self, 180 | channels, 181 | hidden_channels, 182 | kernel_size, 183 | dilation_rate, 184 | n_layers, 185 | n_flows=4, 186 | gin_channels=0): 187 | super().__init__() 188 | self.channels = channels 189 | self.hidden_channels = hidden_channels 190 | self.kernel_size = kernel_size 191 | self.dilation_rate = dilation_rate 192 | self.n_layers = n_layers 193 | self.n_flows = n_flows 194 | self.gin_channels = gin_channels 195 | 196 | self.flows = nn.ModuleList() 197 | for i in range(n_flows): 198 | self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) 199 | self.flows.append(modules.Flip()) 200 | 201 | def forward(self, x, x_mask, g=None, reverse=False): 202 | if not reverse: 203 | for flow in self.flows: 204 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 205 | else: 206 | for flow in reversed(self.flows): 207 | x = flow(x, x_mask, g=g, reverse=reverse) 208 | return x 209 | 210 | 211 | class PosteriorEncoder(nn.Module): 212 | def __init__(self, 213 | in_channels, 214 | out_channels, 215 | hidden_channels, 216 | kernel_size, 217 | dilation_rate, 218 | n_layers, 219 | gin_channels=0): 220 | super().__init__() 221 | self.in_channels = in_channels 222 | self.out_channels = out_channels 223 | self.hidden_channels = hidden_channels 224 | self.kernel_size = kernel_size 225 | self.dilation_rate = dilation_rate 226 | self.n_layers = n_layers 227 | self.gin_channels = gin_channels 228 | 229 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 230 | self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) 231 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 232 | 233 | def forward(self, x, x_lengths, g=None): 234 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 235 | x = self.pre(x) * x_mask 236 | x = self.enc(x, x_mask, g=g) 237 | stats = self.proj(x) * x_mask 238 | m, logs = torch.split(stats, self.out_channels, dim=1) 239 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 240 | return z, m, logs, x_mask 241 | 242 | 243 | class Generator(torch.nn.Module): 244 | def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): 245 | super(Generator, self).__init__() 246 | self.num_kernels = len(resblock_kernel_sizes) 247 | self.num_upsamples = len(upsample_rates) 248 | self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) 249 | resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 250 | 251 | self.ups = nn.ModuleList() 252 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 253 | self.ups.append(weight_norm( 254 | ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)), 255 | k, u, padding=(k-u)//2))) 256 | 257 | self.resblocks = nn.ModuleList() 258 | for i in range(len(self.ups)): 259 | ch = upsample_initial_channel//(2**(i+1)) 260 | for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): 261 | self.resblocks.append(resblock(ch, k, d)) 262 | 263 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 264 | self.ups.apply(init_weights) 265 | 266 | if 
gin_channels != 0: 267 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 268 | 269 | def forward(self, x, g=None): 270 | x = self.conv_pre(x) 271 | if g is not None: 272 | x = x + self.cond(g) 273 | 274 | for i in range(self.num_upsamples): 275 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 276 | x = self.ups[i](x) 277 | xs = None 278 | for j in range(self.num_kernels): 279 | if xs is None: 280 | xs = self.resblocks[i*self.num_kernels+j](x) 281 | else: 282 | xs += self.resblocks[i*self.num_kernels+j](x) 283 | x = xs / self.num_kernels 284 | x = F.leaky_relu(x) 285 | x = self.conv_post(x) 286 | x = torch.tanh(x) 287 | 288 | return x 289 | 290 | def remove_weight_norm(self): 291 | print('Removing weight norm...') 292 | for l in self.ups: 293 | remove_weight_norm(l) 294 | for l in self.resblocks: 295 | l.remove_weight_norm() 296 | 297 | 298 | class DiscriminatorP(torch.nn.Module): 299 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 300 | super(DiscriminatorP, self).__init__() 301 | self.period = period 302 | self.use_spectral_norm = use_spectral_norm 303 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 304 | self.convs = nn.ModuleList([ 305 | norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 306 | norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 307 | norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 308 | norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 309 | norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))), 310 | ]) 311 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 312 | 313 | def forward(self, x): 314 | fmap = [] 315 | 316 | # 1d to 2d 317 | b, c, t = x.shape 318 | if t % self.period != 0: # pad first 319 | n_pad = self.period - (t % self.period) 320 | x = F.pad(x, (0, n_pad), "reflect") 321 | t = t + n_pad 322 | x = x.view(b, c, t // self.period, self.period) 323 | 324 | for l in self.convs: 325 | x = l(x) 326 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 327 | fmap.append(x) 328 | x = self.conv_post(x) 329 | fmap.append(x) 330 | x = torch.flatten(x, 1, -1) 331 | 332 | return x, fmap 333 | 334 | 335 | class DiscriminatorS(torch.nn.Module): 336 | def __init__(self, use_spectral_norm=False): 337 | super(DiscriminatorS, self).__init__() 338 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 339 | self.convs = nn.ModuleList([ 340 | norm_f(Conv1d(1, 16, 15, 1, padding=7)), 341 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), 342 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), 343 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), 344 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), 345 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 346 | ]) 347 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 348 | 349 | def forward(self, x): 350 | fmap = [] 351 | 352 | for l in self.convs: 353 | x = l(x) 354 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 355 | fmap.append(x) 356 | x = self.conv_post(x) 357 | fmap.append(x) 358 | x = torch.flatten(x, 1, -1) 359 | 360 | return x, fmap 361 | 362 | 363 | class MultiPeriodDiscriminator(torch.nn.Module): 364 | def __init__(self, use_spectral_norm=False): 365 | super(MultiPeriodDiscriminator, self).__init__() 366 | periods = [2,3,5,7,11] 367 | 368 | discs = 
[DiscriminatorS(use_spectral_norm=use_spectral_norm)] 369 | discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] 370 | self.discriminators = nn.ModuleList(discs) 371 | 372 | def forward(self, y, y_hat): 373 | y_d_rs = [] 374 | y_d_gs = [] 375 | fmap_rs = [] 376 | fmap_gs = [] 377 | for i, d in enumerate(self.discriminators): 378 | y_d_r, fmap_r = d(y) 379 | y_d_g, fmap_g = d(y_hat) 380 | y_d_rs.append(y_d_r) 381 | y_d_gs.append(y_d_g) 382 | fmap_rs.append(fmap_r) 383 | fmap_gs.append(fmap_g) 384 | 385 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 386 | 387 | 388 | 389 | class SynthesizerTrn(nn.Module): 390 | """ 391 | Synthesizer for Training 392 | """ 393 | 394 | def __init__(self, 395 | n_vocab, 396 | spec_channels, 397 | segment_size, 398 | inter_channels, 399 | hidden_channels, 400 | filter_channels, 401 | n_heads, 402 | n_layers, 403 | kernel_size, 404 | p_dropout, 405 | resblock, 406 | resblock_kernel_sizes, 407 | resblock_dilation_sizes, 408 | upsample_rates, 409 | upsample_initial_channel, 410 | upsample_kernel_sizes, 411 | n_speakers=0, 412 | gin_channels=0, 413 | use_sdp=True, 414 | **kwargs): 415 | 416 | super().__init__() 417 | self.n_vocab = n_vocab 418 | self.spec_channels = spec_channels 419 | self.inter_channels = inter_channels 420 | self.hidden_channels = hidden_channels 421 | self.filter_channels = filter_channels 422 | self.n_heads = n_heads 423 | self.n_layers = n_layers 424 | self.kernel_size = kernel_size 425 | self.p_dropout = p_dropout 426 | self.resblock = resblock 427 | self.resblock_kernel_sizes = resblock_kernel_sizes 428 | self.resblock_dilation_sizes = resblock_dilation_sizes 429 | self.upsample_rates = upsample_rates 430 | self.upsample_initial_channel = upsample_initial_channel 431 | self.upsample_kernel_sizes = upsample_kernel_sizes 432 | self.segment_size = segment_size 433 | self.n_speakers = n_speakers 434 | self.gin_channels = gin_channels 435 | 436 | self.use_sdp = use_sdp 437 | 438 | self.enc_p = TextEncoder(n_vocab, 439 | inter_channels, 440 | hidden_channels, 441 | filter_channels, 442 | n_heads, 443 | n_layers, 444 | kernel_size, 445 | p_dropout) 446 | self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) 447 | self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) 448 | self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) 449 | 450 | if use_sdp: 451 | self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels) 452 | else: 453 | self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels) 454 | 455 | if n_speakers > 1: 456 | self.emb_g = nn.Embedding(n_speakers, gin_channels) 457 | 458 | def forward(self, x, x_lengths, y, y_lengths, sid=None): 459 | 460 | x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths) 461 | if self.n_speakers > 0: 462 | g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] 463 | else: 464 | g = None 465 | 466 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) 467 | z_p = self.flow(z, y_mask, g=g) 468 | 469 | with torch.no_grad(): 470 | # negative cross-entropy 471 | s_p_sq_r = torch.exp(-2 * logs_p) # [b, d, t] 472 | neg_cent1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True) # [b, 1, t_s] 473 | neg_cent2 = torch.matmul(-0.5 * (z_p ** 2).transpose(1, 2), s_p_sq_r) # [b, t_t, d] 
x [b, d, t_s] = [b, t_t, t_s] 474 | neg_cent3 = torch.matmul(z_p.transpose(1, 2), (m_p * s_p_sq_r)) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s] 475 | neg_cent4 = torch.sum(-0.5 * (m_p ** 2) * s_p_sq_r, [1], keepdim=True) # [b, 1, t_s] 476 | neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4 477 | 478 | attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) 479 | attn = monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)).unsqueeze(1).detach() 480 | 481 | w = attn.sum(2) 482 | if self.use_sdp: 483 | l_length = self.dp(x, x_mask, w, g=g) 484 | l_length = l_length / torch.sum(x_mask) 485 | else: 486 | logw_ = torch.log(w + 1e-6) * x_mask 487 | logw = self.dp(x, x_mask, g=g) 488 | l_length = torch.sum((logw - logw_)**2, [1,2]) / torch.sum(x_mask) # for averaging 489 | 490 | # expand prior 491 | m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) 492 | logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) 493 | 494 | z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size) 495 | o = self.dec(z_slice, g=g) 496 | return o, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) 497 | 498 | def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None): 499 | x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths) 500 | if self.n_speakers > 0: 501 | g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] 502 | else: 503 | g = None 504 | 505 | if self.use_sdp: 506 | logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) 507 | else: 508 | logw = self.dp(x, x_mask, g=g) 509 | w = torch.exp(logw) * x_mask * length_scale 510 | w_ceil = torch.ceil(w) 511 | y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() 512 | y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype) 513 | attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) 514 | attn = commons.generate_path(w_ceil, attn_mask) 515 | 516 | m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] 517 | logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] 518 | 519 | z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale 520 | z = self.flow(z_p, y_mask, g=g, reverse=True) 521 | o = self.dec((z * y_mask)[:,:,:max_len], g=g) 522 | return o, attn, y_mask, (z, z_p, m_p, logs_p) 523 | 524 | def voice_conversion(self, y, y_lengths, sid_src, sid_tgt): 525 | assert self.n_speakers > 0, "n_speakers has to be larger than 0."
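# Voice conversion path: encode the audio y under the *source* speaker
# embedding, push it through the normalizing flow into the (approximately)
# speaker-independent prior space z_p, then invert the flow under the
# *target* speaker and decode. No text input is involved.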
526 | g_src = self.emb_g(sid_src).unsqueeze(-1)
527 | g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
528 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
529 | z_p = self.flow(z, y_mask, g=g_src)
530 | z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
531 | o_hat = self.dec(z_hat * y_mask, g=g_tgt)
532 | return o_hat, y_mask, (z, z_p, z_hat)
533 |
534 |
-------------------------------------------------------------------------------- /modules.py: --------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | from torch import nn
4 | from torch.nn import functional as F
5 |
6 | from torch.nn import Conv1d
7 | from torch.nn.utils import weight_norm, remove_weight_norm
8 |
9 | import commons
10 | from commons import init_weights, get_padding
11 | from transforms import piecewise_rational_quadratic_transform
12 |
13 |
14 | LRELU_SLOPE = 0.1
15 |
16 |
17 | class LayerNorm(nn.Module):
18 | def __init__(self, channels, eps=1e-5):
19 | super().__init__()
20 | self.channels = channels
21 | self.eps = eps
22 |
23 | self.gamma = nn.Parameter(torch.ones(channels))
24 | self.beta = nn.Parameter(torch.zeros(channels))
25 |
26 | def forward(self, x):
27 | x = x.transpose(1, -1)
28 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
29 | return x.transpose(1, -1)
30 |
31 |
32 | class ConvReluNorm(nn.Module):
33 | def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
34 | super().__init__()
35 | self.in_channels = in_channels
36 | self.hidden_channels = hidden_channels
37 | self.out_channels = out_channels
38 | self.kernel_size = kernel_size
39 | self.n_layers = n_layers
40 | self.p_dropout = p_dropout
41 | assert n_layers > 1, "Number of layers should be larger than 1."
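# Stack built below: one in->hidden conv plus (n_layers - 1) hidden->hidden
# convs, each run through LayerNorm, ReLU and dropout in forward(); the final
# 1x1 projection is zero-initialized, so the residual block starts out as an
# identity mapping.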
42 |
43 | self.conv_layers = nn.ModuleList()
44 | self.norm_layers = nn.ModuleList()
45 | self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
46 | self.norm_layers.append(LayerNorm(hidden_channels))
47 | self.relu_drop = nn.Sequential(
48 | nn.ReLU(),
49 | nn.Dropout(p_dropout))
50 | for _ in range(n_layers-1):
51 | self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
52 | self.norm_layers.append(LayerNorm(hidden_channels))
53 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
54 | self.proj.weight.data.zero_()
55 | self.proj.bias.data.zero_()
56 |
57 | def forward(self, x, x_mask):
58 | x_org = x
59 | for i in range(self.n_layers):
60 | x = self.conv_layers[i](x * x_mask)
61 | x = self.norm_layers[i](x)
62 | x = self.relu_drop(x)
63 | x = x_org + self.proj(x)
64 | return x * x_mask
65 |
66 |
67 | class DDSConv(nn.Module):
68 | """
69 | Dilated and Depth-Separable Convolution
70 | """
71 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
72 | super().__init__()
73 | self.channels = channels
74 | self.kernel_size = kernel_size
75 | self.n_layers = n_layers
76 | self.p_dropout = p_dropout
77 |
78 | self.drop = nn.Dropout(p_dropout)
79 | self.convs_sep = nn.ModuleList()
80 | self.convs_1x1 = nn.ModuleList()
81 | self.norms_1 = nn.ModuleList()
82 | self.norms_2 = nn.ModuleList()
83 | for i in range(n_layers):
84 | dilation = kernel_size ** i
85 | padding = (kernel_size * dilation - dilation) // 2
86 | self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
87 | groups=channels, dilation=dilation, padding=padding
88 | ))
89 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
90 | self.norms_1.append(LayerNorm(channels))
91 | self.norms_2.append(LayerNorm(channels))
92 |
93 | def forward(self, x, x_mask, g=None):
94 | if g is not None:
95 | x = x + g
96 | for i in range(self.n_layers):
97 | y = self.convs_sep[i](x * x_mask)
98 | y = self.norms_1[i](y)
99 | y = F.gelu(y)
100 | y = self.convs_1x1[i](y)
101 | y = self.norms_2[i](y)
102 | y = F.gelu(y)
103 | y = self.drop(y)
104 | x = x + y
105 | return x * x_mask
106 |
107 |
108 | class WN(torch.nn.Module):
109 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
110 | super(WN, self).__init__()
111 | assert(kernel_size % 2 == 1)
112 | self.hidden_channels = hidden_channels
113 | self.kernel_size = kernel_size
114 | self.dilation_rate = dilation_rate
115 | self.n_layers = n_layers
116 | self.gin_channels = gin_channels
117 | self.p_dropout = p_dropout
118 |
119 | self.in_layers = torch.nn.ModuleList()
120 | self.res_skip_layers = torch.nn.ModuleList()
121 | self.drop = nn.Dropout(p_dropout)
122 |
123 | if gin_channels != 0:
124 | cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
125 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
126 |
127 | for i in range(n_layers):
128 | dilation = dilation_rate ** i
129 | padding = int((kernel_size * dilation - dilation) / 2)
130 | in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
131 | dilation=dilation, padding=padding)
132 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
133 | self.in_layers.append(in_layer)
134 |
135 | # last one is not necessary
136 | if i < n_layers - 1:
137 | res_skip_channels = 2 * hidden_channels
138 | else:
139 | res_skip_channels = hidden_channels
140 |
141 | res_skip_layer =
torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 142 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 143 | self.res_skip_layers.append(res_skip_layer) 144 | 145 | def forward(self, x, x_mask, g=None, **kwargs): 146 | output = torch.zeros_like(x) 147 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 148 | 149 | if g is not None: 150 | g = self.cond_layer(g) 151 | 152 | for i in range(self.n_layers): 153 | x_in = self.in_layers[i](x) 154 | if g is not None: 155 | cond_offset = i * 2 * self.hidden_channels 156 | g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] 157 | else: 158 | g_l = torch.zeros_like(x_in) 159 | 160 | acts = commons.fused_add_tanh_sigmoid_multiply( 161 | x_in, 162 | g_l, 163 | n_channels_tensor) 164 | acts = self.drop(acts) 165 | 166 | res_skip_acts = self.res_skip_layers[i](acts) 167 | if i < self.n_layers - 1: 168 | res_acts = res_skip_acts[:,:self.hidden_channels,:] 169 | x = (x + res_acts) * x_mask 170 | output = output + res_skip_acts[:,self.hidden_channels:,:] 171 | else: 172 | output = output + res_skip_acts 173 | return output * x_mask 174 | 175 | def remove_weight_norm(self): 176 | if self.gin_channels != 0: 177 | torch.nn.utils.remove_weight_norm(self.cond_layer) 178 | for l in self.in_layers: 179 | torch.nn.utils.remove_weight_norm(l) 180 | for l in self.res_skip_layers: 181 | torch.nn.utils.remove_weight_norm(l) 182 | 183 | 184 | class ResBlock1(torch.nn.Module): 185 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 186 | super(ResBlock1, self).__init__() 187 | self.convs1 = nn.ModuleList([ 188 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 189 | padding=get_padding(kernel_size, dilation[0]))), 190 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 191 | padding=get_padding(kernel_size, dilation[1]))), 192 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 193 | padding=get_padding(kernel_size, dilation[2]))) 194 | ]) 195 | self.convs1.apply(init_weights) 196 | 197 | self.convs2 = nn.ModuleList([ 198 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 199 | padding=get_padding(kernel_size, 1))), 200 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 201 | padding=get_padding(kernel_size, 1))), 202 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 203 | padding=get_padding(kernel_size, 1))) 204 | ]) 205 | self.convs2.apply(init_weights) 206 | 207 | def forward(self, x, x_mask=None): 208 | for c1, c2 in zip(self.convs1, self.convs2): 209 | xt = F.leaky_relu(x, LRELU_SLOPE) 210 | if x_mask is not None: 211 | xt = xt * x_mask 212 | xt = c1(xt) 213 | xt = F.leaky_relu(xt, LRELU_SLOPE) 214 | if x_mask is not None: 215 | xt = xt * x_mask 216 | xt = c2(xt) 217 | x = xt + x 218 | if x_mask is not None: 219 | x = x * x_mask 220 | return x 221 | 222 | def remove_weight_norm(self): 223 | for l in self.convs1: 224 | remove_weight_norm(l) 225 | for l in self.convs2: 226 | remove_weight_norm(l) 227 | 228 | 229 | class ResBlock2(torch.nn.Module): 230 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 231 | super(ResBlock2, self).__init__() 232 | self.convs = nn.ModuleList([ 233 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 234 | padding=get_padding(kernel_size, dilation[0]))), 235 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 236 | padding=get_padding(kernel_size, dilation[1]))) 237 | ]) 
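# ResBlock2 is the lighter HiFi-GAN residual block: one dilated conv per step
# instead of ResBlock1's dilated+undilated conv pairs; the Generator chooses
# between the two variants via the `resblock` hyperparameter.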
238 | self.convs.apply(init_weights) 239 | 240 | def forward(self, x, x_mask=None): 241 | for c in self.convs: 242 | xt = F.leaky_relu(x, LRELU_SLOPE) 243 | if x_mask is not None: 244 | xt = xt * x_mask 245 | xt = c(xt) 246 | x = xt + x 247 | if x_mask is not None: 248 | x = x * x_mask 249 | return x 250 | 251 | def remove_weight_norm(self): 252 | for l in self.convs: 253 | remove_weight_norm(l) 254 | 255 | 256 | class Log(nn.Module): 257 | def forward(self, x, x_mask, reverse=False, **kwargs): 258 | if not reverse: 259 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask 260 | logdet = torch.sum(-y, [1, 2]) 261 | return y, logdet 262 | else: 263 | x = torch.exp(x) * x_mask 264 | return x 265 | 266 | 267 | class Flip(nn.Module): 268 | def forward(self, x, *args, reverse=False, **kwargs): 269 | x = torch.flip(x, [1]) 270 | if not reverse: 271 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 272 | return x, logdet 273 | else: 274 | return x 275 | 276 | 277 | class ElementwiseAffine(nn.Module): 278 | def __init__(self, channels): 279 | super().__init__() 280 | self.channels = channels 281 | self.m = nn.Parameter(torch.zeros(channels,1)) 282 | self.logs = nn.Parameter(torch.zeros(channels,1)) 283 | 284 | def forward(self, x, x_mask, reverse=False, **kwargs): 285 | if not reverse: 286 | y = self.m + torch.exp(self.logs) * x 287 | y = y * x_mask 288 | logdet = torch.sum(self.logs * x_mask, [1,2]) 289 | return y, logdet 290 | else: 291 | x = (x - self.m) * torch.exp(-self.logs) * x_mask 292 | return x 293 | 294 | 295 | class ResidualCouplingLayer(nn.Module): 296 | def __init__(self, 297 | channels, 298 | hidden_channels, 299 | kernel_size, 300 | dilation_rate, 301 | n_layers, 302 | p_dropout=0, 303 | gin_channels=0, 304 | mean_only=False): 305 | assert channels % 2 == 0, "channels should be divisible by 2" 306 | super().__init__() 307 | self.channels = channels 308 | self.hidden_channels = hidden_channels 309 | self.kernel_size = kernel_size 310 | self.dilation_rate = dilation_rate 311 | self.n_layers = n_layers 312 | self.half_channels = channels // 2 313 | self.mean_only = mean_only 314 | 315 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 316 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) 317 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) 318 | self.post.weight.data.zero_() 319 | self.post.bias.data.zero_() 320 | 321 | def forward(self, x, x_mask, g=None, reverse=False): 322 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 323 | h = self.pre(x0) * x_mask 324 | h = self.enc(h, x_mask, g=g) 325 | stats = self.post(h) * x_mask 326 | if not self.mean_only: 327 | m, logs = torch.split(stats, [self.half_channels]*2, 1) 328 | else: 329 | m = stats 330 | logs = torch.zeros_like(m) 331 | 332 | if not reverse: 333 | x1 = m + x1 * torch.exp(logs) * x_mask 334 | x = torch.cat([x0, x1], 1) 335 | logdet = torch.sum(logs, [1,2]) 336 | return x, logdet 337 | else: 338 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 339 | x = torch.cat([x0, x1], 1) 340 | return x 341 | 342 | 343 | class ConvFlow(nn.Module): 344 | def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0): 345 | super().__init__() 346 | self.in_channels = in_channels 347 | self.filter_channels = filter_channels 348 | self.kernel_size = kernel_size 349 | self.n_layers = n_layers 350 | self.num_bins = num_bins 351 | self.tail_bound = tail_bound 352 | 
self.half_channels = in_channels // 2 353 | 354 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) 355 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.) 356 | self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1) 357 | self.proj.weight.data.zero_() 358 | self.proj.bias.data.zero_() 359 | 360 | def forward(self, x, x_mask, g=None, reverse=False): 361 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 362 | h = self.pre(x0) 363 | h = self.convs(h, x_mask, g=g) 364 | h = self.proj(h) * x_mask 365 | 366 | b, c, t = x0.shape 367 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 368 | 369 | unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels) 370 | unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels) 371 | unnormalized_derivatives = h[..., 2 * self.num_bins:] 372 | 373 | x1, logabsdet = piecewise_rational_quadratic_transform(x1, 374 | unnormalized_widths, 375 | unnormalized_heights, 376 | unnormalized_derivatives, 377 | inverse=reverse, 378 | tails='linear', 379 | tail_bound=self.tail_bound 380 | ) 381 | 382 | x = torch.cat([x0, x1], 1) * x_mask 383 | logdet = torch.sum(logabsdet * x_mask, [1,2]) 384 | if not reverse: 385 | return x, logdet 386 | else: 387 | return x 388 | -------------------------------------------------------------------------------- /monotonic_align/__init__.py: -------------------------------------------------------------------------------- 1 | from numpy import zeros, int32, float32 2 | from torch import from_numpy 3 | 4 | from .core import maximum_path_jit 5 | 6 | def maximum_path(neg_cent, mask): 7 | device = neg_cent.device 8 | dtype = neg_cent.dtype 9 | neg_cent = neg_cent.data.cpu().numpy().astype(float32) 10 | path = zeros(neg_cent.shape, dtype=int32) 11 | 12 | t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(int32) 13 | t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(int32) 14 | maximum_path_jit(path, neg_cent, t_t_max, t_s_max) 15 | return from_numpy(path).to(device=device, dtype=dtype) 16 | -------------------------------------------------------------------------------- /monotonic_align/core.py: -------------------------------------------------------------------------------- 1 | import numba 2 | 3 | 4 | @numba.jit(numba.void(numba.int32[:,:,::1], numba.float32[:,:,::1], numba.int32[::1], numba.int32[::1]), nopython=True, nogil=True) 5 | def maximum_path_jit(paths, values, t_ys, t_xs): 6 | b = paths.shape[0] 7 | max_neg_val=-1e9 8 | for i in range(int(b)): 9 | path = paths[i] 10 | value = values[i] 11 | t_y = t_ys[i] 12 | t_x = t_xs[i] 13 | 14 | v_prev = v_cur = 0.0 15 | index = t_x - 1 16 | 17 | for y in range(t_y): 18 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 19 | if x == y: 20 | v_cur = max_neg_val 21 | else: 22 | v_cur = value[y-1, x] 23 | if x == 0: 24 | if y == 0: 25 | v_prev = 0. 
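# Forward pass of the monotonic-alignment DP: value[y, x] is accumulated in
# place with the best total score of any monotonic path reaching (y, x);
# v_cur comes from keeping the same text position, v_prev from advancing it
# by one. The backtracking loop below then recovers the argmax path into `path`.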
26 | else: 27 | v_prev = max_neg_val 28 | else: 29 | v_prev = value[y-1, x-1] 30 | value[y, x] += max(v_prev, v_cur) 31 | 32 | for y in range(t_y - 1, -1, -1): 33 | path[y, index] = 1 34 | if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]): 35 | index = index - 1 36 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # DO NOT include azure-functions-worker in this file 2 | # The Python Worker is managed by Azure Functions platform 3 | # Manually managing azure-functions-worker may cause unexpected issues 4 | 5 | azure-functions 6 | 7 | numpy==1.22.4 8 | numba 9 | scipy 10 | Unidecode 11 | openjtalk==0.3.0.dev2 12 | jamo 13 | av 14 | -f https://download.pytorch.org/whl/torch_stable.html 15 | torch==1.12.0+cpu 16 | -------------------------------------------------------------------------------- /speak/__init__.py: -------------------------------------------------------------------------------- 1 | import azure.functions as func 2 | 3 | from api import Speaker 4 | 5 | 6 | speaker = Speaker('Yuzu/config.json', 'Yuzu/365_epochs.pth') 7 | 8 | 9 | def main(req: func.HttpRequest) -> func.HttpResponse: 10 | return speaker.main(req) 11 | -------------------------------------------------------------------------------- /speak/function.json: -------------------------------------------------------------------------------- 1 | { 2 | "scriptFile": "__init__.py", 3 | "bindings": [ 4 | { 5 | "authLevel": "function", 6 | "type": "httpTrigger", 7 | "direction": "in", 8 | "name": "req", 9 | "methods": ["get"] 10 | }, 11 | { 12 | "type": "http", 13 | "direction": "out", 14 | "name": "$return" 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /speak2/__init__.py: -------------------------------------------------------------------------------- 1 | import azure.functions as func 2 | 3 | from api import Speaker 4 | 5 | 6 | speaker = Speaker('HamidashiCreative/config.json', 'HamidashiCreative/604_epochs.pth') 7 | 8 | 9 | def main(req: func.HttpRequest) -> func.HttpResponse: 10 | return speaker.main(req) 11 | -------------------------------------------------------------------------------- /speak2/function.json: -------------------------------------------------------------------------------- 1 | { 2 | "scriptFile": "__init__.py", 3 | "bindings": [ 4 | { 5 | "authLevel": "function", 6 | "type": "httpTrigger", 7 | "direction": "in", 8 | "name": "req", 9 | "methods": ["get"] 10 | }, 11 | { 12 | "type": "http", 13 | "direction": "out", 14 | "name": "$return" 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /speakkr/__init__.py: -------------------------------------------------------------------------------- 1 | import azure.functions as func 2 | 3 | from api import Speaker 4 | 5 | 6 | speaker = Speaker('TheFoxAwaitsMe/config.json', 'TheFoxAwaitsMe/1164_epochs.pth') 7 | 8 | 9 | def main(req: func.HttpRequest) -> func.HttpResponse: 10 | return speaker.main(req) 11 | -------------------------------------------------------------------------------- /speakkr/function.json: -------------------------------------------------------------------------------- 1 | { 2 | "scriptFile": "__init__.py", 3 | "bindings": [ 4 | { 5 | "authLevel": "function", 6 | "type": "httpTrigger", 7 | "direction": "in", 8 | "name": "req", 9 | "methods": ["get"] 10 | }, 11 | { 12 | "type": 
"http", 13 | "direction": "out", 14 | "name": "$return" 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /text/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | from text import cleaners 3 | 4 | 5 | def text_to_sequence(text, symbols, cleaner_names): 6 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 7 | Args: 8 | text: string to convert to a sequence 9 | cleaner_names: names of the cleaner functions to run the text through 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | ''' 13 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 14 | 15 | sequence = [] 16 | 17 | clean_text = _clean_text(text, cleaner_names) 18 | for symbol in clean_text: 19 | if symbol not in _symbol_to_id.keys(): 20 | continue 21 | symbol_id = _symbol_to_id[symbol] 22 | sequence += [symbol_id] 23 | return sequence 24 | 25 | 26 | def _clean_text(text, cleaner_names): 27 | for name in cleaner_names: 28 | cleaner = getattr(cleaners, name) 29 | if not cleaner: 30 | raise Exception('Unknown cleaner: %s' % name) 31 | text = cleaner(text) 32 | return text 33 | -------------------------------------------------------------------------------- /text/cleaners.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 
13 | ''' 14 | 15 | import re 16 | from unidecode import unidecode 17 | import pyopenjtalk 18 | from jamo import h2j, j2hcj 19 | 20 | 21 | # This is a list of Korean classifiers preceded by pure Korean numerals. 22 | _korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통' 23 | 24 | # Regular expression matching whitespace: 25 | _whitespace_re = re.compile(r'\s+') 26 | 27 | # Regular expression matching Japanese without punctuation marks: 28 | _japanese_characters = re.compile(r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 29 | 30 | # Regular expression matching non-Japanese characters or punctuation marks: 31 | _japanese_marks = re.compile(r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 32 | 33 | # List of (regular expression, replacement) pairs for abbreviations: 34 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ 35 | ('mrs', 'misess'), 36 | ('mr', 'mister'), 37 | ('dr', 'doctor'), 38 | ('st', 'saint'), 39 | ('co', 'company'), 40 | ('jr', 'junior'), 41 | ('maj', 'major'), 42 | ('gen', 'general'), 43 | ('drs', 'doctors'), 44 | ('rev', 'reverend'), 45 | ('lt', 'lieutenant'), 46 | ('hon', 'honorable'), 47 | ('sgt', 'sergeant'), 48 | ('capt', 'captain'), 49 | ('esq', 'esquire'), 50 | ('ltd', 'limited'), 51 | ('col', 'colonel'), 52 | ('ft', 'fort'), 53 | ]] 54 | 55 | # List of (hangul, hangul divided) pairs: 56 | _hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [ 57 | ('ㄳ', 'ㄱㅅ'), 58 | ('ㄵ', 'ㄴㅈ'), 59 | ('ㄶ', 'ㄴㅎ'), 60 | ('ㄺ', 'ㄹㄱ'), 61 | ('ㄻ', 'ㄹㅁ'), 62 | ('ㄼ', 'ㄹㅂ'), 63 | ('ㄽ', 'ㄹㅅ'), 64 | ('ㄾ', 'ㄹㅌ'), 65 | ('ㄿ', 'ㄹㅍ'), 66 | ('ㅀ', 'ㄹㅎ'), 67 | ('ㅄ', 'ㅂㅅ'), 68 | ('ㅘ', 'ㅗㅏ'), 69 | ('ㅙ', 'ㅗㅐ'), 70 | ('ㅚ', 'ㅗㅣ'), 71 | ('ㅝ', 'ㅜㅓ'), 72 | ('ㅞ', 'ㅜㅔ'), 73 | ('ㅟ', 'ㅜㅣ'), 74 | ('ㅢ', 'ㅡㅣ'), 75 | ('ㅑ', 'ㅣㅏ'), 76 | ('ㅒ', 'ㅣㅐ'), 77 | ('ㅕ', 'ㅣㅓ'), 78 | ('ㅖ', 'ㅣㅔ'), 79 | ('ㅛ', 'ㅣㅗ'), 80 | ('ㅠ', 'ㅣㅜ') 81 | ]] 82 | 83 | # List of (Latin alphabet, hangul) pairs: 84 | _latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 85 | ('a', '에이'), 86 | ('b', '비'), 87 | ('c', '시'), 88 | ('d', '디'), 89 | ('e', '이'), 90 | ('f', '에프'), 91 | ('g', '지'), 92 | ('h', '에이치'), 93 | ('i', '아이'), 94 | ('j', '제이'), 95 | ('k', '케이'), 96 | ('l', '엘'), 97 | ('m', '엠'), 98 | ('n', '엔'), 99 | ('o', '오'), 100 | ('p', '피'), 101 | ('q', '큐'), 102 | ('r', '아르'), 103 | ('s', '에스'), 104 | ('t', '티'), 105 | ('u', '유'), 106 | ('v', '브이'), 107 | ('w', '더블유'), 108 | ('x', '엑스'), 109 | ('y', '와이'), 110 | ('z', '제트') 111 | ]] 112 | 113 | 114 | def expand_abbreviations(text): 115 | for regex, replacement in _abbreviations: 116 | text = re.sub(regex, replacement, text) 117 | return text 118 | 119 | 120 | def lowercase(text): 121 | return text.lower() 122 | 123 | 124 | def collapse_whitespace(text): 125 | return re.sub(_whitespace_re, ' ', text) 126 | 127 | 128 | def convert_to_ascii(text): 129 | return unidecode(text) 130 | 131 | 132 | def latin_to_hangul(text): 133 | for regex, replacement in _latin_to_hangul: 134 | text = re.sub(regex, replacement, text) 135 | return text 136 | 137 | 138 | def divide_hangul(text): 139 | for regex, replacement in _hangul_divided: 140 | text = re.sub(regex, replacement, text) 141 | return text 142 | 143 | 144 | def hangul_number(num, sino=True): 145 | '''Reference https://github.com/Kyubyong/g2pK''' 146 | num = re.sub(',', '', num) 147 | 148 | if num == '0': 149 | return '영' 150 | if not sino and num == '20': 
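# sino=True spells digits with Sino-Korean readings (일, 이, 삼, ...), used
# with most counters; sino=False uses native Korean readings (한, 두, 세, ...),
# where 20 irregularly becomes 스무, as special-cased here.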
151 | return '스무' 152 | 153 | digits = '123456789' 154 | names = '일이삼사오육칠팔구' 155 | digit2name = {d: n for d, n in zip(digits, names)} 156 | 157 | modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉' 158 | decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔' 159 | digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())} 160 | digit2dec = {d: dec for d, dec in zip(digits, decimals.split())} 161 | 162 | spelledout = [] 163 | for i, digit in enumerate(num): 164 | i = len(num) - i - 1 165 | if sino: 166 | if i == 0: 167 | name = digit2name.get(digit, '') 168 | elif i == 1: 169 | name = digit2name.get(digit, '') + '십' 170 | name = name.replace('일십', '십') 171 | else: 172 | if i == 0: 173 | name = digit2mod.get(digit, '') 174 | elif i == 1: 175 | name = digit2dec.get(digit, '') 176 | if digit == '0': 177 | if i % 4 == 0: 178 | last_three = spelledout[-min(3, len(spelledout)):] 179 | if ''.join(last_three) == '': 180 | spelledout.append('') 181 | continue 182 | else: 183 | spelledout.append('') 184 | continue 185 | if i == 2: 186 | name = digit2name.get(digit, '') + '백' 187 | name = name.replace('일백', '백') 188 | elif i == 3: 189 | name = digit2name.get(digit, '') + '천' 190 | name = name.replace('일천', '천') 191 | elif i == 4: 192 | name = digit2name.get(digit, '') + '만' 193 | name = name.replace('일만', '만') 194 | elif i == 5: 195 | name = digit2name.get(digit, '') + '십' 196 | name = name.replace('일십', '십') 197 | elif i == 6: 198 | name = digit2name.get(digit, '') + '백' 199 | name = name.replace('일백', '백') 200 | elif i == 7: 201 | name = digit2name.get(digit, '') + '천' 202 | name = name.replace('일천', '천') 203 | elif i == 8: 204 | name = digit2name.get(digit, '') + '억' 205 | elif i == 9: 206 | name = digit2name.get(digit, '') + '십' 207 | elif i == 10: 208 | name = digit2name.get(digit, '') + '백' 209 | elif i == 11: 210 | name = digit2name.get(digit, '') + '천' 211 | elif i == 12: 212 | name = digit2name.get(digit, '') + '조' 213 | elif i == 13: 214 | name = digit2name.get(digit, '') + '십' 215 | elif i == 14: 216 | name = digit2name.get(digit, '') + '백' 217 | elif i == 15: 218 | name = digit2name.get(digit, '') + '천' 219 | spelledout.append(name) 220 | return ''.join(elem for elem in spelledout) 221 | 222 | 223 | def number_to_hangul(text): 224 | '''Reference https://github.com/Kyubyong/g2pK''' 225 | tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text)) 226 | for token in tokens: 227 | num, classifier = token 228 | if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers: 229 | spelledout = hangul_number(num, sino=False) 230 | else: 231 | spelledout = hangul_number(num, sino=True) 232 | text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}') 233 | # digit by digit for remaining digits 234 | digits = '0123456789' 235 | names = '영일이삼사오육칠팔구' 236 | for d, n in zip(digits, names): 237 | text = text.replace(d, n) 238 | return text 239 | 240 | 241 | def basic_cleaners(text): 242 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 243 | text = lowercase(text) 244 | text = collapse_whitespace(text) 245 | return text 246 | 247 | 248 | def transliteration_cleaners(text): 249 | '''Pipeline for non-English text that transliterates to ASCII.''' 250 | text = convert_to_ascii(text) 251 | text = lowercase(text) 252 | text = collapse_whitespace(text) 253 | return text 254 | 255 | 256 | def japanese_cleaners(text): 257 | '''Pipeline for notating accent in Japanese text. 
258 | Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html'''
259 | sentences = re.split(_japanese_marks, text)
260 | marks = re.findall(_japanese_marks, text)
261 | text = ''
262 | for i, sentence in enumerate(sentences):
263 | if re.match(_japanese_characters, sentence):
264 | if text!='':
265 | text+=' '
266 | labels = pyopenjtalk.extract_fullcontext(sentence)
267 | for n, label in enumerate(labels):
268 | phoneme = re.search(r'\-([^\+]*)\+', label).group(1)
269 | if phoneme not in ['sil','pau']:
270 | text += phoneme.replace('ch','ʧ').replace('sh','ʃ').replace('cl','Q')
271 | else:
272 | continue
273 | n_moras = int(re.search(r'/F:(\d+)_', label).group(1))
274 | a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
275 | a2 = int(re.search(r"\+(\d+)\+", label).group(1))
276 | a3 = int(re.search(r"\+(\d+)/", label).group(1))
277 | if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil','pau']:
278 | a2_next=-1
279 | else:
280 | a2_next = int(re.search(r"\+(\d+)\+", labels[n + 1]).group(1))
281 | # Accent phrase boundary
282 | if a3 == 1 and a2_next == 1:
283 | text += ' '
284 | # Falling
285 | elif a1 == 0 and a2_next == a2 + 1 and a2 != n_moras:
286 | text += '↓'
287 | # Rising
288 | elif a2 == 1 and a2_next == 2:
289 | text += '↑'
290 | if i<len(marks):
291 | text += unidecode(marks[i]).replace(' ','')
292 | if re.match('[A-Za-z]',text[-1]):
293 | text += '.'
294 | return text
295 |
296 |
297 | def japanese_cleaners2(text):
298 | return japanese_cleaners(text).replace('ts','ʦ').replace('...','…')
299 |
300 |
301 | def korean_cleaners(text):
302 | '''Pipeline for Korean text'''
303 | text = latin_to_hangul(text)
304 | text = number_to_hangul(text)
305 | text = j2hcj(h2j(text))
306 | text = divide_hangul(text)
307 | if re.match('[\u3131-\u3163]',text[-1]):
308 | text += '.'
309 | return text
310 |
-------------------------------------------------------------------------------- /transforms.py: --------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn import functional as F
3 |
4 | import numpy as np
5 |
6 |
7 | DEFAULT_MIN_BIN_WIDTH = 1e-3
8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3
9 | DEFAULT_MIN_DERIVATIVE = 1e-3
10 |
11 |
12 | def piecewise_rational_quadratic_transform(inputs,
13 | unnormalized_widths,
14 | unnormalized_heights,
15 | unnormalized_derivatives,
16 | inverse=False,
17 | tails=None,
18 | tail_bound=1.,
19 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
20 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
21 | min_derivative=DEFAULT_MIN_DERIVATIVE):
22 |
23 | if tails is None:
24 | spline_fn = rational_quadratic_spline
25 | spline_kwargs = {}
26 | else:
27 | spline_fn = unconstrained_rational_quadratic_spline
28 | spline_kwargs = {
29 | 'tails': tails,
30 | 'tail_bound': tail_bound
31 | }
32 |
33 | outputs, logabsdet = spline_fn(
34 | inputs=inputs,
35 | unnormalized_widths=unnormalized_widths,
36 | unnormalized_heights=unnormalized_heights,
37 | unnormalized_derivatives=unnormalized_derivatives,
38 | inverse=inverse,
39 | min_bin_width=min_bin_width,
40 | min_bin_height=min_bin_height,
41 | min_derivative=min_derivative,
42 | **spline_kwargs
43 | )
44 | return outputs, logabsdet
45 |
46 |
47 | def searchsorted(bin_locations, inputs, eps=1e-6):
48 | bin_locations[..., -1] += eps
49 | return torch.sum(
50 | inputs[..., None] >= bin_locations,
51 | dim=-1
52 | ) - 1
53 |
54 |
55 | def unconstrained_rational_quadratic_spline(inputs,
56 | unnormalized_widths,
57 | unnormalized_heights,
58 | unnormalized_derivatives,
59 | inverse=False,
60 | tails='linear',
61 | tail_bound=1.,
62 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
63 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
64 | min_derivative=DEFAULT_MIN_DERIVATIVE):
65 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
66 | outside_interval_mask = ~inside_interval_mask
67 |
68 | outputs = torch.zeros_like(inputs)
69 | logabsdet = torch.zeros_like(inputs)
70 |
71 | if tails == 'linear':
72 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
73 | constant = np.log(np.exp(1 - min_derivative) - 1)
74 | unnormalized_derivatives[..., 0] = constant
75 | unnormalized_derivatives[..., -1] = constant
76 |
77 | outputs[outside_interval_mask] = inputs[outside_interval_mask]
78 | logabsdet[outside_interval_mask] = 0
79 | else:
80 | raise RuntimeError('{} tails are not implemented.'.format(tails))
81 |
82 | outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline(
83 | inputs=inputs[inside_interval_mask],
84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
87 | inverse=inverse,
88 | left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound,
89 | min_bin_width=min_bin_width,
90 | min_bin_height=min_bin_height,
91 | min_derivative=min_derivative
92 | )
93 |
94 | return outputs, logabsdet
95 |
96 | def rational_quadratic_spline(inputs,
97 | unnormalized_widths,
98 | unnormalized_heights,
99 | unnormalized_derivatives,
100 | inverse=False,
101 | left=0., right=1., bottom=0., top=1.,
102 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
103 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
104 | min_derivative=DEFAULT_MIN_DERIVATIVE):
105 | if torch.min(inputs) < left or torch.max(inputs) > right:
106 | raise ValueError('Input to a transform is not within its domain')
107 |
108 | num_bins = unnormalized_widths.shape[-1]
109 |
110 | if min_bin_width * num_bins > 1.0:
111 |
raise ValueError('Minimal bin width too large for the number of bins') 112 | if min_bin_height * num_bins > 1.0: 113 | raise ValueError('Minimal bin height too large for the number of bins') 114 | 115 | widths = F.softmax(unnormalized_widths, dim=-1) 116 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths 117 | cumwidths = torch.cumsum(widths, dim=-1) 118 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0) 119 | cumwidths = (right - left) * cumwidths + left 120 | cumwidths[..., 0] = left 121 | cumwidths[..., -1] = right 122 | widths = cumwidths[..., 1:] - cumwidths[..., :-1] 123 | 124 | derivatives = min_derivative + F.softplus(unnormalized_derivatives) 125 | 126 | heights = F.softmax(unnormalized_heights, dim=-1) 127 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights 128 | cumheights = torch.cumsum(heights, dim=-1) 129 | cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0) 130 | cumheights = (top - bottom) * cumheights + bottom 131 | cumheights[..., 0] = bottom 132 | cumheights[..., -1] = top 133 | heights = cumheights[..., 1:] - cumheights[..., :-1] 134 | 135 | if inverse: 136 | bin_idx = searchsorted(cumheights, inputs)[..., None] 137 | else: 138 | bin_idx = searchsorted(cumwidths, inputs)[..., None] 139 | 140 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] 141 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0] 142 | 143 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] 144 | delta = heights / widths 145 | input_delta = delta.gather(-1, bin_idx)[..., 0] 146 | 147 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] 148 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] 149 | 150 | input_heights = heights.gather(-1, bin_idx)[..., 0] 151 | 152 | if inverse: 153 | a = (((inputs - input_cumheights) * (input_derivatives 154 | + input_derivatives_plus_one 155 | - 2 * input_delta) 156 | + input_heights * (input_delta - input_derivatives))) 157 | b = (input_heights * input_derivatives 158 | - (inputs - input_cumheights) * (input_derivatives 159 | + input_derivatives_plus_one 160 | - 2 * input_delta)) 161 | c = - input_delta * (inputs - input_cumheights) 162 | 163 | discriminant = b.pow(2) - 4 * a * c 164 | assert (discriminant >= 0).all() 165 | 166 | root = (2 * c) / (-b - torch.sqrt(discriminant)) 167 | outputs = root * input_bin_widths + input_cumwidths 168 | 169 | theta_one_minus_theta = root * (1 - root) 170 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 171 | * theta_one_minus_theta) 172 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2) 173 | + 2 * input_delta * theta_one_minus_theta 174 | + input_derivatives * (1 - root).pow(2)) 175 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 176 | 177 | return outputs, -logabsdet 178 | else: 179 | theta = (inputs - input_cumwidths) / input_bin_widths 180 | theta_one_minus_theta = theta * (1 - theta) 181 | 182 | numerator = input_heights * (input_delta * theta.pow(2) 183 | + input_derivatives * theta_one_minus_theta) 184 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 185 | * theta_one_minus_theta) 186 | outputs = input_cumheights + numerator / denominator 187 | 188 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2) 189 | + 2 * input_delta * theta_one_minus_theta 190 | + input_derivatives * (1 - theta).pow(2)) 191 | 
logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
192 |
193 | return outputs, logabsdet
194 |
-------------------------------------------------------------------------------- /utils.py: --------------------------------------------------------------------------------
1 | import logging
2 | from json import loads
3 | from torch import load
4 |
5 | from av import open as avopen
6 |
7 |
8 | class HParams():
9 | def __init__(self, **kwargs):
10 | for k, v in kwargs.items():
11 | if type(v) == dict:
12 | v = HParams(**v)
13 | self[k] = v
14 |
15 | def keys(self):
16 | return self.__dict__.keys()
17 |
18 | def items(self):
19 | return self.__dict__.items()
20 |
21 | def values(self):
22 | return self.__dict__.values()
23 |
24 | def __len__(self):
25 | return len(self.__dict__)
26 |
27 | def __getitem__(self, key):
28 | return getattr(self, key)
29 |
30 | def __setitem__(self, key, value):
31 | return setattr(self, key, value)
32 |
33 | def __contains__(self, key):
34 | return key in self.__dict__
35 |
36 | def __repr__(self):
37 | return self.__dict__.__repr__()
38 |
39 |
40 | def load_checkpoint(checkpoint_path, model):
41 | checkpoint_dict = load(checkpoint_path, map_location='cpu')
42 | iteration = checkpoint_dict['iteration']
43 | saved_state_dict = checkpoint_dict['model']
44 | if hasattr(model, 'module'):
45 | state_dict = model.module.state_dict()
46 | else:
47 | state_dict = model.state_dict()
48 | new_state_dict = {}
49 | for k, v in state_dict.items():
50 | try:
51 | new_state_dict[k] = saved_state_dict[k]
52 | except KeyError:
53 | logging.info("%s is not in the checkpoint" % k)
54 | new_state_dict[k] = v
55 | if hasattr(model, 'module'):
56 | model.module.load_state_dict(new_state_dict)
57 | else:
58 | model.load_state_dict(new_state_dict)
59 | logging.info("Loaded checkpoint '{}' (iteration {})".format(
60 | checkpoint_path, iteration))
61 | return
62 |
63 |
64 | def get_hparams_from_file(config_path):
65 | with open(config_path, "r") as f:
66 | data = f.read()
67 | config = loads(data)
68 |
69 | hparams = HParams(**config)
70 | return hparams
71 |
72 | def wav2(i, o, format):
73 | inp = avopen(i, 'rb')
74 | out = avopen(o, 'wb', format=format)
75 | if format == "ogg": format = "libvorbis"
76 |
77 | ostream = out.add_stream(format)
78 |
79 | for frame in inp.decode(audio=0):
80 | for p in ostream.encode(frame): out.mux(p)
81 |
82 | for p in ostream.encode(None): out.mux(p)
83 |
84 | out.close()
85 | inp.close()
86 |
--------------------------------------------------------------------------------
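# ----------------------------------------------------------------------------
# Usage sketch (illustration only, not a file in this repository): how the
# pieces above fit together for CPU inference, roughly what api.py's Speaker
# presumably does behind the HTTP triggers. The config layout (hps.symbols,
# hps.data.*, hps.train.*, hps.model.*) follows the usual VITS convention and
# is an assumption here; the 'Yuzu/...' paths are taken from speak/__init__.py.
import torch
import utils
from models import SynthesizerTrn
from text import text_to_sequence

hps = utils.get_hparams_from_file('Yuzu/config.json')
net_g = SynthesizerTrn(
    len(hps.symbols),                               # n_vocab
    hps.data.filter_length // 2 + 1,                # spec_channels
    hps.train.segment_size // hps.data.hop_length,  # segment_size
    n_speakers=hps.data.n_speakers,                 # assumed config field
    **hps.model)                                    # HParams supports ** via keys()/__getitem__
net_g.eval()
utils.load_checkpoint('Yuzu/365_epochs.pth', net_g)

seq = text_to_sequence('こんにちは。', hps.symbols, hps.data.text_cleaners)
x = torch.LongTensor(seq).unsqueeze(0)              # [1, t]
x_lengths = torch.LongTensor([x.size(1)])
with torch.no_grad():
    audio = net_g.infer(x, x_lengths, noise_scale=.667,
                        noise_scale_w=0.8, length_scale=1)[0][0, 0]
# `audio` is a 1-D float tensor at hps.data.sampling_rate; once written to a
# wav container, utils.wav2 can transcode it to e.g. ogg for the HTTP response.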