├── .gitattributes
├── .gitignore
├── .idea
│   ├── PythonPlugins.iml
│   ├── inspectionProfiles
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── LICENSE
├── MoeGoe.py
├── README.md
├── __init__.py
├── attentions.py
├── bot.py
├── commons.py
├── hubert_model.py
├── jieba
│   └── dict.txt
├── mel_processing.py
├── models.py
├── modules.py
├── monotonic_align
│   ├── __init__.py
│   └── core.py
├── pictures
│   ├── __init__.py
│   ├── agakaUa$aNaGaka.jpg
│   ├── apauaraga5aqafa.jpg
│   ├── avabaaa%aZaxa6a.jpg
│   └── awa6aRakaka3a7a.jpg
├── plugins
│   ├── RandomStr
│   │   ├── RandomStr.py
│   │   └── __init__.py
│   ├── __init__.py
│   ├── picGet.py
│   ├── voicePart.py
│   └── voices
│       └── __init__.py
├── requirements.txt
├── text
│   ├── LICENSE
│   ├── __init__.py
│   ├── cantonese.py
│   ├── cleaners.py
│   ├── english.py
│   ├── japanese.py
│   ├── korean.py
│   ├── mandarin.py
│   ├── ngu_dialect.py
│   ├── sanskrit.py
│   ├── shanghainese.py
│   └── thai.py
├── trans.py
├── transforms.py
├── utils.py
└── voiceModel
    └── config.json
/.gitattributes:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | # Set default behavior to automatically normalize line endings.
3 | ###############################################################################
4 | * text=auto
5 |
6 | ###############################################################################
7 | # Set default behavior for command prompt diff.
8 | #
9 | # This is need for earlier builds of msysgit that does not have it on by
10 | # default for csharp files.
11 | # Note: This is only used by command line
12 | ###############################################################################
13 | #*.cs diff=csharp
14 |
15 | ###############################################################################
16 | # Set the merge driver for project and solution files
17 | #
18 | # Merging from the command prompt will add diff markers to the files if there
19 | # are conflicts (Merging from VS is not affected by the settings below, in VS
20 | # the diff markers are never inserted). Diff markers may cause the following
21 | # file extensions to fail to load in VS. An alternative would be to treat
22 | # these files as binary and thus will always conflict and require user
23 | # intervention with every merge. To do so, just uncomment the entries below
24 | ###############################################################################
25 | #*.sln merge=binary
26 | #*.csproj merge=binary
27 | #*.vbproj merge=binary
28 | #*.vcxproj merge=binary
29 | #*.vcproj merge=binary
30 | #*.dbproj merge=binary
31 | #*.fsproj merge=binary
32 | #*.lsproj merge=binary
33 | #*.wixproj merge=binary
34 | #*.modelproj merge=binary
35 | #*.sqlproj merge=binary
36 | #*.wwaproj merge=binary
37 |
38 | ###############################################################################
39 | # behavior for image files
40 | #
41 | # image files are treated as binary by default.
42 | ###############################################################################
43 | #*.jpg binary
44 | #*.png binary
45 | #*.gif binary
46 |
47 | ###############################################################################
48 | # diff behavior for common document formats
49 | #
50 | # Convert binary document formats to text before diffing them. This feature
51 | # is only available from the command line. Turn it on by uncommenting the
52 | # entries below.
53 | ###############################################################################
54 | #*.doc diff=astextplain
55 | #*.DOC diff=astextplain
56 | #*.docx diff=astextplain
57 | #*.DOCX diff=astextplain
58 | #*.dot diff=astextplain
59 | #*.DOT diff=astextplain
60 | #*.pdf diff=astextplain
61 | #*.PDF diff=astextplain
62 | #*.rtf diff=astextplain
63 | #*.RTF diff=astextplain
64 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 | ##
4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
5 |
6 | # User-specific files
7 | *.rsuser
8 | *.suo
9 | *.user
10 | *.userosscache
11 | *.sln.docstates
12 |
13 | # User-specific files (MonoDevelop/Xamarin Studio)
14 | *.userprefs
15 |
16 | # Mono auto generated files
17 | mono_crash.*
18 |
19 | # Build results
20 | [Dd]ebug/
21 | [Dd]ebugPublic/
22 | [Rr]elease/
23 | [Rr]eleases/
24 | x64/
25 | x86/
26 | [Ww][Ii][Nn]32/
27 | [Aa][Rr][Mm]/
28 | [Aa][Rr][Mm]64/
29 | bld/
30 | [Bb]in/
31 | [Oo]bj/
32 | [Oo]ut/
33 | [Ll]og/
34 | [Ll]ogs/
35 |
36 | # Visual Studio 2015/2017 cache/options directory
37 | .vs/
38 | # Uncomment if you have tasks that create the project's static files in wwwroot
39 | #wwwroot/
40 |
41 | # Visual Studio 2017 auto generated files
42 | Generated\ Files/
43 |
44 | # MSTest test Results
45 | [Tt]est[Rr]esult*/
46 | [Bb]uild[Ll]og.*
47 |
48 | # NUnit
49 | *.VisualState.xml
50 | TestResult.xml
51 | nunit-*.xml
52 |
53 | # Build Results of an ATL Project
54 | [Dd]ebugPS/
55 | [Rr]eleasePS/
56 | dlldata.c
57 |
58 | # Benchmark Results
59 | BenchmarkDotNet.Artifacts/
60 |
61 | # .NET Core
62 | project.lock.json
63 | project.fragment.lock.json
64 | artifacts/
65 |
66 | # ASP.NET Scaffolding
67 | ScaffoldingReadMe.txt
68 |
69 | # StyleCop
70 | StyleCopReport.xml
71 |
72 | # Files built by Visual Studio
73 | *_i.c
74 | *_p.c
75 | *_h.h
76 | *.ilk
77 | *.meta
78 | *.obj
79 | *.iobj
80 | *.pch
81 | *.pdb
82 | *.ipdb
83 | *.pgc
84 | *.pgd
85 | *.rsp
86 | *.sbr
87 | *.tlb
88 | *.tli
89 | *.tlh
90 | *.tmp
91 | *.tmp_proj
92 | *_wpftmp.csproj
93 | *.log
94 | *.vspscc
95 | *.vssscc
96 | .builds
97 | *.pidb
98 | *.svclog
99 | *.scc
100 |
101 | # Chutzpah Test files
102 | _Chutzpah*
103 |
104 | # Visual C++ cache files
105 | ipch/
106 | *.aps
107 | *.ncb
108 | *.opendb
109 | *.opensdf
110 | *.sdf
111 | *.cachefile
112 | *.VC.db
113 | *.VC.VC.opendb
114 |
115 | # Visual Studio profiler
116 | *.psess
117 | *.vsp
118 | *.vspx
119 | *.sap
120 |
121 | # Visual Studio Trace Files
122 | *.e2e
123 |
124 | # TFS 2012 Local Workspace
125 | $tf/
126 |
127 | # Guidance Automation Toolkit
128 | *.gpState
129 |
130 | # ReSharper is a .NET coding add-in
131 | _ReSharper*/
132 | *.[Rr]e[Ss]harper
133 | *.DotSettings.user
134 |
135 | # TeamCity is a build add-in
136 | _TeamCity*
137 |
138 | # DotCover is a Code Coverage Tool
139 | *.dotCover
140 |
141 | # AxoCover is a Code Coverage Tool
142 | .axoCover/*
143 | !.axoCover/settings.json
144 |
145 | # Coverlet is a free, cross platform Code Coverage Tool
146 | coverage*.json
147 | coverage*.xml
148 | coverage*.info
149 |
150 | # Visual Studio code coverage results
151 | *.coverage
152 | *.coveragexml
153 |
154 | # NCrunch
155 | _NCrunch_*
156 | .*crunch*.local.xml
157 | nCrunchTemp_*
158 |
159 | # MightyMoose
160 | *.mm.*
161 | AutoTest.Net/
162 |
163 | # Web workbench (sass)
164 | .sass-cache/
165 |
166 | # Installshield output folder
167 | [Ee]xpress/
168 |
169 | # DocProject is a documentation generator add-in
170 | DocProject/buildhelp/
171 | DocProject/Help/*.HxT
172 | DocProject/Help/*.HxC
173 | DocProject/Help/*.hhc
174 | DocProject/Help/*.hhk
175 | DocProject/Help/*.hhp
176 | DocProject/Help/Html2
177 | DocProject/Help/html
178 |
179 | # Click-Once directory
180 | publish/
181 |
182 | # Publish Web Output
183 | *.[Pp]ublish.xml
184 | *.azurePubxml
185 | # Note: Comment the next line if you want to checkin your web deploy settings,
186 | # but database connection strings (with potential passwords) will be unencrypted
187 | *.pubxml
188 | *.publishproj
189 |
190 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
191 | # checkin your Azure Web App publish settings, but sensitive information contained
192 | # in these scripts will be unencrypted
193 | PublishScripts/
194 |
195 | # NuGet Packages
196 | *.nupkg
197 | # NuGet Symbol Packages
198 | *.snupkg
199 | # The packages folder can be ignored because of Package Restore
200 | **/[Pp]ackages/*
201 | # except build/, which is used as an MSBuild target.
202 | !**/[Pp]ackages/build/
203 | # Uncomment if necessary however generally it will be regenerated when needed
204 | #!**/[Pp]ackages/repositories.config
205 | # NuGet v3's project.json files produces more ignorable files
206 | *.nuget.props
207 | *.nuget.targets
208 |
209 | # Microsoft Azure Build Output
210 | csx/
211 | *.build.csdef
212 |
213 | # Microsoft Azure Emulator
214 | ecf/
215 | rcf/
216 |
217 | # Windows Store app package directories and files
218 | AppPackages/
219 | BundleArtifacts/
220 | Package.StoreAssociation.xml
221 | _pkginfo.txt
222 | *.appx
223 | *.appxbundle
224 | *.appxupload
225 |
226 | # Visual Studio cache files
227 | # files ending in .cache can be ignored
228 | *.[Cc]ache
229 | # but keep track of directories ending in .cache
230 | !?*.[Cc]ache/
231 |
232 | # Others
233 | ClientBin/
234 | ~$*
235 | *~
236 | *.dbmdl
237 | *.dbproj.schemaview
238 | *.jfm
239 | *.pfx
240 | *.publishsettings
241 | orleans.codegen.cs
242 |
243 | # Including strong name files can present a security risk
244 | # (https://github.com/github/gitignore/pull/2483#issue-259490424)
245 | #*.snk
246 |
247 | # Since there are multiple workflows, uncomment next line to ignore bower_components
248 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
249 | #bower_components/
250 |
251 | # RIA/Silverlight projects
252 | Generated_Code/
253 |
254 | # Backup & report files from converting an old project file
255 | # to a newer Visual Studio version. Backup files are not needed,
256 | # because we have git ;-)
257 | _UpgradeReport_Files/
258 | Backup*/
259 | UpgradeLog*.XML
260 | UpgradeLog*.htm
261 | ServiceFabricBackup/
262 | *.rptproj.bak
263 |
264 | # SQL Server files
265 | *.mdf
266 | *.ldf
267 | *.ndf
268 |
269 | # Business Intelligence projects
270 | *.rdl.data
271 | *.bim.layout
272 | *.bim_*.settings
273 | *.rptproj.rsuser
274 | *- [Bb]ackup.rdl
275 | *- [Bb]ackup ([0-9]).rdl
276 | *- [Bb]ackup ([0-9][0-9]).rdl
277 |
278 | # Microsoft Fakes
279 | FakesAssemblies/
280 |
281 | # GhostDoc plugin setting file
282 | *.GhostDoc.xml
283 |
284 | # Node.js Tools for Visual Studio
285 | .ntvs_analysis.dat
286 | node_modules/
287 |
288 | # Visual Studio 6 build log
289 | *.plg
290 |
291 | # Visual Studio 6 workspace options file
292 | *.opt
293 |
294 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
295 | *.vbw
296 |
297 | # Visual Studio LightSwitch build output
298 | **/*.HTMLClient/GeneratedArtifacts
299 | **/*.DesktopClient/GeneratedArtifacts
300 | **/*.DesktopClient/ModelManifest.xml
301 | **/*.Server/GeneratedArtifacts
302 | **/*.Server/ModelManifest.xml
303 | _Pvt_Extensions
304 |
305 | # Paket dependency manager
306 | .paket/paket.exe
307 | paket-files/
308 |
309 | # FAKE - F# Make
310 | .fake/
311 |
312 | # CodeRush personal settings
313 | .cr/personal
314 |
315 | # Python Tools for Visual Studio (PTVS)
316 | __pycache__/
317 | *.pyc
318 |
319 | # Cake - Uncomment if you are using it
320 | # tools/**
321 | # !tools/packages.config
322 |
323 | # Tabs Studio
324 | *.tss
325 |
326 | # Telerik's JustMock configuration file
327 | *.jmconfig
328 |
329 | # BizTalk build output
330 | *.btp.cs
331 | *.btm.cs
332 | *.odx.cs
333 | *.xsd.cs
334 |
335 | # OpenCover UI analysis results
336 | OpenCover/
337 |
338 | # Azure Stream Analytics local run output
339 | ASALocalRun/
340 |
341 | # MSBuild Binary and Structured Log
342 | *.binlog
343 |
344 | # NVidia Nsight GPU debugger configuration file
345 | *.nvuser
346 |
347 | # MFractors (Xamarin productivity tool) working folder
348 | .mfractor/
349 |
350 | # Local History for Visual Studio
351 | .localhistory/
352 |
353 | # BeatPulse healthcheck temp database
354 | healthchecksdb
355 |
356 | # Backup folder for Package Reference Convert tool in Visual Studio 2017
357 | MigrationBackup/
358 |
359 | # Ionide (cross platform F# VS Code tools) working folder
360 | .ionide/
361 |
362 | # Fody - auto-generated XML schema
363 | FodyWeavers.xsd
364 |
365 | # build
366 | build
367 | monotonic_align/core.c
368 | *.o
369 | *.so
370 | *.dll
371 |
372 | # data
373 | /config.json
374 | /*.pth
375 | *.wav
376 | /monotonic_align/monotonic_align
377 | /resources
378 | /MoeGoe.spec
379 | /dist/MoeGoe
380 | /dist
381 |
382 | # MacOS
383 | .DS_Store
384 |
--------------------------------------------------------------------------------
/.idea/PythonPlugins.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 CjangCjengh
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MoeGoe.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | from scipy.io.wavfile import write
4 |
5 | from plugins.RandomStr.RandomStr import random_str
6 | from trans import translate
7 | from mel_processing import spectrogram_torch
8 | from text import text_to_sequence, _clean_text
9 | from models import SynthesizerTrn
10 | import utils
11 | import commons
12 | import sys
13 | import re
14 | from torch import no_grad, LongTensor
15 | import logging
16 |
17 | logging.getLogger('numba').setLevel(logging.WARNING)
18 |
19 |
20 | def ex_print(text, escape=False):
21 | if escape:
22 | print(text.encode('unicode_escape').decode())
23 | else:
24 | print(text)
25 |
26 |
27 | def get_text(text, hps, cleaned=False):
28 | if cleaned:
29 | text_norm = text_to_sequence(text, hps.symbols, [])
30 | else:
31 | text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
32 | if hps.data.add_blank:
33 | text_norm = commons.intersperse(text_norm, 0)
34 | text_norm = LongTensor(text_norm)
35 | return text_norm
36 |
37 |
38 | def ask_if_continue():
39 | while True:
40 | answer = input('Continue? (y/n): ')
41 | if answer == 'y':
42 | break
43 | elif answer == 'n':
44 | sys.exit(0)
45 |
46 |
47 | def print_speakers(speakers, escape=False):
48 | print('ID\tSpeaker')
49 | for id, name in enumerate(speakers):
50 | ex_print(str(id) + '\t' + name, escape)
51 |
52 |
53 | def get_speaker_id(message):
54 | '''speaker_id = input(message)
55 | try:
56 | speaker_id = int(speaker_id)
57 | except:
58 | print(str(speaker_id) + ' is not a valid ID!')
59 | sys.exit(1)
60 | return speaker_id'''
61 | return 0
62 |
63 |
64 | def get_label_value(text, label, default, warning_name='value'):
65 | value = re.search(rf'\[{label}=(.+?)\]', text)
66 | if value:
67 | try:
68 | text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1)
69 | value = float(value.group(1))
70 | except:
71 | print(f'Invalid {warning_name}!')
72 | sys.exit(1)
73 | else:
74 | value = default
75 | return value, text
76 |
77 |
78 | def get_label(text, label):
79 | if f'[{label}]' in text:
80 | return True, text.replace(f'[{label}]', '')
81 | else:
82 | return False, text
83 |
84 | def voiceGenerate(tex,out,spealerIDDD=0,modelSelect=0):
85 | Path = sys.argv[0][:-23]
86 | text=tex
87 | out_path=out
88 | speakeriddd=spealerIDDD
89 | if '--escape' in sys.argv:
90 | escape = True
91 | else:
92 | escape = False
93 |
94 | #model = 'voiceModel\\1374_epochs.pth'#input('Path of a VITS model: ')
95 | #config ='voiceModel\\config.json'#input('Path of a config file: ')
96 | if modelSelect==1:
97 | model = 'voiceModel/YUUKA/G.pth'
98 | config = 'voiceModel/YUUKA/config.json'
99 | speakeriddd=0
100 | else:
101 | model = 'voiceModel/1374_epochs.pth'
102 | config = 'voiceModel/config.json'
103 |
104 | hps_ms = utils.get_hparams_from_file(config)
105 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0
106 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0
107 | speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0']
108 | use_f0 = hps_ms.data.use_f0 if 'use_f0' in hps_ms.data.keys() else False
109 | emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False
110 |
111 | net_g_ms = SynthesizerTrn(
112 | n_symbols,
113 | hps_ms.data.filter_length // 2 + 1,
114 | hps_ms.train.segment_size // hps_ms.data.hop_length,
115 | n_speakers=n_speakers,
116 | emotion_embedding=emotion_embedding,
117 | **hps_ms.model)
118 | _ = net_g_ms.eval()
119 | utils.load_checkpoint(model, net_g_ms)
120 |
121 | while True:
122 | choice = 't' # input('TTS or VC? (t/v):')
123 | if choice == 't':
124 | #text = input('Text to read: ')
125 | if text == '[ADVANCED]':
126 | text = input('Raw text:')
127 | print('Cleaned text is:')
128 | ex_print(_clean_text(
129 | text, hps_ms.data.text_cleaners), escape)
130 | continue
131 |
132 | length_scale, text = get_label_value(
133 | text, 'LENGTH', 1, 'length scale')
134 | noise_scale, text = get_label_value(
135 | text, 'NOISE', 0.667, 'noise scale')
136 | noise_scale_w, text = get_label_value(
137 | text, 'NOISEW', 0.8, 'deviation of noise')
138 | cleaned, text = get_label(text, 'CLEANED')
139 |
140 | stn_tst = get_text(text, hps_ms, cleaned=cleaned)
141 |
142 | #print_speakers(speakers, escape)
143 | time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
144 | print(time + '| 正在使用语音模型:'+str(speakeriddd)+' ......生成中'+' | 文本:'+tex)
145 | speaker_id = speakeriddd
146 |
147 | with no_grad():
148 | x_tst = stn_tst.unsqueeze(0)
149 | x_tst_lengths = LongTensor([stn_tst.size(0)])
150 | sid = LongTensor([speaker_id])
151 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
152 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][
153 | 0, 0].data.cpu().float().numpy()
154 |
155 | elif choice == 'v':
156 | audio, out_path = voice_conversion()
157 |
158 | write(out_path, hps_ms.data.sampling_rate, audio)
159 | time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
160 | print(time + '| Successfully saved!')
161 | break
162 |
163 | def voice_conversion(sourcepath,speaker=0):
164 | if '--escape' in sys.argv:
165 | escape = True
166 | else:
167 | escape = False
168 |
169 | model = 'voiceModel\\1374_epochs.pth'#input('Path of a VITS model: ')
170 | config ='voiceModel\\config.json'#input('Path of a config file: ')
171 |
172 | hps_ms = utils.get_hparams_from_file(config)
173 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0
174 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0
175 | speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0']
176 | use_f0 = hps_ms.data.use_f0 if 'use_f0' in hps_ms.data.keys() else False
177 | emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False
178 |
179 | net_g_ms = SynthesizerTrn(
180 | n_symbols,
181 | hps_ms.data.filter_length // 2 + 1,
182 | hps_ms.train.segment_size // hps_ms.data.hop_length,
183 | n_speakers=n_speakers,
184 | emotion_embedding=emotion_embedding,
185 | **hps_ms.model)
186 | _ = net_g_ms.eval()
187 | utils.load_checkpoint(model, net_g_ms)
188 |
189 | audio_path = sourcepath
190 | audio = utils.load_audio_to_torch(
191 | audio_path, hps_ms.data.sampling_rate)
192 |
193 | originnal_id = speaker
194 | target_id = 3
195 | out_path = 'plugins\\voices\\sing\\out.wav'
196 |
197 | y = audio.unsqueeze(0)
198 |
199 | spec = spectrogram_torch(y, hps_ms.data.filter_length,
200 | hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length,
201 | center=False)
202 | spec_lengths = LongTensor([spec.size(-1)])
203 | sid_src = LongTensor([originnal_id])
204 |
205 | with no_grad():
206 | sid_tgt = LongTensor([target_id])
207 | audio = net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[
208 | 0][0, 0].data.cpu().float().numpy()
209 | write(out_path, hps_ms.data.sampling_rate, audio)
210 | print('Successfully saved!')
211 | return out_path
212 |
213 |
214 | if __name__ == '__main__':
215 | #voice_conversion("plugins/voices/sing/rest.wav")
216 | voiceGenerate('先生,ちょっとお時間..いただけますか?','voiceModel/YUUKA/1.wav',0,1)
217 | '''ranpath = random_str()
218 | Path=sys.argv[0][:-23]
219 | print(Path)
220 | out = Path+'PythonPlugins\\plugins\\voices\\' + ranpath + '.wav'
221 | tex = '[JA]' + translate('测试语音.....') + '[JA]'
222 | voiceGenerate(tex, out)'''
223 | '''if '--escape' in sys.argv:
224 | escape = True
225 | else:
226 | escape = False
227 |
228 | model = input('Path of a VITS model: ')
229 | config = input('Path of a config file: ')
230 |
231 | hps_ms = utils.get_hparams_from_file(config)
232 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0
233 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0
234 | speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0']
235 | use_f0 = hps_ms.data.use_f0 if 'use_f0' in hps_ms.data.keys() else False
236 | emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False
237 |
238 | net_g_ms = SynthesizerTrn(
239 | n_symbols,
240 | hps_ms.data.filter_length // 2 + 1,
241 | hps_ms.train.segment_size // hps_ms.data.hop_length,
242 | n_speakers=n_speakers,
243 | emotion_embedding=emotion_embedding,
244 | **hps_ms.model)
245 | _ = net_g_ms.eval()
246 | utils.load_checkpoint(model, net_g_ms)
247 |
248 | def voice_conversion():
249 | audio_path = input('Path of an audio file to convert:\n')
250 | print_speakers(speakers)
251 | audio = utils.load_audio_to_torch(
252 | audio_path, hps_ms.data.sampling_rate)
253 |
254 | originnal_id = get_speaker_id('Original speaker ID: ')
255 | target_id = get_speaker_id('Target speaker ID: ')
256 | out_path = input('Path to save: ')
257 |
258 | y = audio.unsqueeze(0)
259 |
260 | spec = spectrogram_torch(y, hps_ms.data.filter_length,
261 | hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length,
262 | center=False)
263 | spec_lengths = LongTensor([spec.size(-1)])
264 | sid_src = LongTensor([originnal_id])
265 |
266 | with no_grad():
267 | sid_tgt = LongTensor([target_id])
268 | audio = net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[
269 | 0][0, 0].data.cpu().float().numpy()
270 | return audio, out_path
271 |
272 | if n_symbols != 0:
273 | if not emotion_embedding:
274 | while True:
275 | choice = input('TTS or VC? (t/v):')
276 | if choice == 't':
277 | text = input('Text to read: ')
278 | if text == '[ADVANCED]':
279 | text = input('Raw text:')
280 | print('Cleaned text is:')
281 | ex_print(_clean_text(
282 | text, hps_ms.data.text_cleaners), escape)
283 | continue
284 |
285 | length_scale, text = get_label_value(
286 | text, 'LENGTH', 1, 'length scale')
287 | noise_scale, text = get_label_value(
288 | text, 'NOISE', 0.667, 'noise scale')
289 | noise_scale_w, text = get_label_value(
290 | text, 'NOISEW', 0.8, 'deviation of noise')
291 | cleaned, text = get_label(text, 'CLEANED')
292 |
293 | stn_tst = get_text(text, hps_ms, cleaned=cleaned)
294 |
295 | print_speakers(speakers, escape)
296 | speaker_id = get_speaker_id('Speaker ID: ')
297 | out_path = input('Path to save: ')
298 |
299 | with no_grad():
300 | x_tst = stn_tst.unsqueeze(0)
301 | x_tst_lengths = LongTensor([stn_tst.size(0)])
302 | sid = LongTensor([speaker_id])
303 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
304 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
305 |
306 | elif choice == 'v':
307 | audio, out_path = voice_conversion()
308 |
309 | write(out_path, hps_ms.data.sampling_rate, audio)
310 | print('Successfully saved!')
311 | ask_if_continue()
312 | else:
313 | import os
314 | import librosa
315 | import numpy as np
316 | from torch import FloatTensor
317 | import audonnx
318 | w2v2_folder = input('Path of a w2v2 dimensional emotion model: ')
319 | w2v2_model = audonnx.load(os.path.dirname(w2v2_folder))
320 | while True:
321 | choice = input('TTS or VC? (t/v):')
322 | if choice == 't':
323 | text = input('Text to read: ')
324 | if text == '[ADVANCED]':
325 | text = input('Raw text:')
326 | print('Cleaned text is:')
327 | ex_print(_clean_text(
328 | text, hps_ms.data.text_cleaners), escape)
329 | continue
330 |
331 | length_scale, text = get_label_value(
332 | text, 'LENGTH', 1, 'length scale')
333 | noise_scale, text = get_label_value(
334 | text, 'NOISE', 0.667, 'noise scale')
335 | noise_scale_w, text = get_label_value(
336 | text, 'NOISEW', 0.8, 'deviation of noise')
337 | cleaned, text = get_label(text, 'CLEANED')
338 |
339 | stn_tst = get_text(text, hps_ms, cleaned=cleaned)
340 |
341 | print_speakers(speakers, escape)
342 | speaker_id = get_speaker_id('Speaker ID: ')
343 |
344 | emotion_reference = input('Path of an emotion reference: ')
345 | if emotion_reference.endswith('.npy'):
346 | emotion = np.load(emotion_reference)
347 | emotion = FloatTensor(emotion).unsqueeze(0)
348 | else:
349 | audio16000, sampling_rate = librosa.load(
350 | emotion_reference, sr=16000, mono=True)
351 | emotion = w2v2_model(audio16000, sampling_rate)[
352 | 'hidden_states']
353 | emotion_reference = re.sub(
354 | r'\..*$', '', emotion_reference)
355 | np.save(emotion_reference, emotion.squeeze(0))
356 | emotion = FloatTensor(emotion)
357 |
358 | out_path = input('Path to save: ')
359 |
360 | with no_grad():
361 | x_tst = stn_tst.unsqueeze(0)
362 | x_tst_lengths = LongTensor([stn_tst.size(0)])
363 | sid = LongTensor([speaker_id])
364 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
365 | length_scale=length_scale, emotion_embedding=emotion)[0][0, 0].data.cpu().float().numpy()
366 |
367 | elif choice == 'v':
368 | audio, out_path = voice_conversion()
369 |
370 | write(out_path, hps_ms.data.sampling_rate, audio)
371 | print('Successfully saved!')
372 | ask_if_continue()
373 | else:
374 | model = input('Path of a hubert-soft model: ')
375 | from hubert_model import hubert_soft
376 | hubert = hubert_soft(model)
377 |
378 | while True:
379 | audio_path = input('Path of an audio file to convert:\n')
380 |
381 | if audio_path != '[VC]':
382 | import librosa
383 | if use_f0:
384 | audio, sampling_rate = librosa.load(
385 | audio_path, sr=hps_ms.data.sampling_rate, mono=True)
386 | audio16000 = librosa.resample(
387 | audio, orig_sr=sampling_rate, target_sr=16000)
388 | else:
389 | audio16000, sampling_rate = librosa.load(
390 | audio_path, sr=16000, mono=True)
391 |
392 | target_id = get_speaker_id('Target speaker ID: ')
393 | out_path = input('Path to save: ')
394 | length_scale, out_path = get_label_value(
395 | out_path, 'LENGTH', 1, 'length scale')
396 | noise_scale, out_path = get_label_value(
397 | out_path, 'NOISE', 0.1, 'noise scale')
398 | noise_scale_w, out_path = get_label_value(
399 | out_path, 'NOISEW', 0.1, 'deviation of noise')
400 |
401 | from torch import inference_mode, FloatTensor
402 | import numpy as np
403 | with inference_mode():
404 | units = hubert.units(FloatTensor(audio16000).unsqueeze(
405 | 0).unsqueeze(0)).squeeze(0).numpy()
406 | if use_f0:
407 | f0_scale, out_path = get_label_value(
408 | out_path, 'F0', 1, 'f0 scale')
409 | f0 = librosa.pyin(audio, sr=sampling_rate,
410 | fmin=librosa.note_to_hz('C0'),
411 | fmax=librosa.note_to_hz('C7'),
412 | frame_length=1780)[0]
413 | target_length = len(units[:, 0])
414 | f0 = np.nan_to_num(np.interp(np.arange(0, len(f0)*target_length, len(f0))/target_length,
415 | np.arange(0, len(f0)), f0)) * f0_scale
416 | units[:, 0] = f0 / 10
417 |
418 | stn_tst = FloatTensor(units)
419 | with no_grad():
420 | x_tst = stn_tst.unsqueeze(0)
421 | x_tst_lengths = LongTensor([stn_tst.size(0)])
422 | sid = LongTensor([target_id])
423 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
424 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy()
425 |
426 | else:
427 | audio, out_path = voice_conversion()
428 |
429 | write(out_path, hps_ms.data.sampling_rate, audio)
430 | print('Successfully saved!')
431 | ask_if_continue()'''
432 |
--------------------------------------------------------------------------------
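
Note on MoeGoe.py above: get_label_value() and get_label() strip inline [LENGTH=...], [NOISE=...], [NOISEW=...] and [CLEANED] tags from the input text before synthesis, and bot.py wraps text in [JA]...[JA] or [ZH]...[ZH] language tags. A minimal usage sketch, assuming the release's model files referenced inside voiceGenerate() are in place; the text and output path below are made-up example values:

```python
# Illustrative call into MoeGoe.py's voiceGenerate(); example values only.
from MoeGoe import voiceGenerate

# [JA]...[JA] marks Japanese text for the cleaners; [LENGTH=1.2] slows speech down,
# [NOISE=0.5] reduces sampling noise. The tags are parsed and removed before synthesis.
text = '[LENGTH=1.2][NOISE=0.5][JA]こんにちは、先生。[JA]'
voiceGenerate(text, 'plugins/voices/example.wav', spealerIDDD=0, modelSelect=1)
```
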
/README.md:
--------------------------------------------------------------------------------
1 | # This repository is no longer updated; all functionality has moved to [Manyana](https://github.com/avilliai/Manyana)
2 |
3 | # DO NOT CLONE THE REPOSITORY
4 | - A cloned copy will not work; nothing has been pushed for months. Use the release in the top-right corner instead.
5 | - If you see a yellow message recommending pip install uvicorn, ignore it; installing it will keep the program from running properly.
6 | # Updates
7 | - Support for importing models; "XX说" command
8 | - To set up voice replies, see [wReply](https://github.com/avilliai/wReply)
9 |
10 | # Links
11 | - The core of this project is CjangCjengh's [MoeGoe](https://github.com/CjangCjengh/MoeGoe)
12 | - Built on [Yiri-mirai](https://github.com/YiriMiraiProject/YiriMirai)
13 | - Python 3.9 is recommended; Python 3.10 is not recommended
14 | - Make sure [mirai-api-http](https://github.com/project-mirai/mirai-api-http) is installed
15 |
16 |
17 | # Possible problems
18 |
19 | 1
20 |
21 | FileNotFoundError: [Errno 2] No such............
22 |
23 | Fix: replace out = sys.argv[0][:-20] + 'PythonPlugins\\plugins\\voices\\' + ranpath + '.wav'
24 |
25 | with
26 |
27 | out = '<absolute path>\\PythonPlugins\\plugins\\voices\\' + ranpath + '.wav'
28 |
29 | 2
30 |
31 | ModuleNotFoundError: no module named 'XXX'
32 |
33 | Fix: a package is missing; run the following command
34 |
35 | pip install XXX
36 |
37 | 3
38 |
39 | TypeError: run() got an unexpected keyword argument 'debug'
40 |
41 | Fix: wrong Python version; 3.9 is recommended. Also try:
42 | pip uninstall uvicorn
43 |
44 | 4
45 |
46 | ConnectionRefusedError: [WinError 1225] The remote computer refused the network connection
47 |
48 | Fix: the port, key and botqq in bot.py do not match your mirai-api-http configuration; change them so they match
49 | 5
50 | AttributeError:........
51 | Presumably a conflict with an existing package; the exact cause is unknown. Try a different interpreter. If all else fails, download the provided site-packages and extract it over your local site-packages.
52 |
53 | # How to use
54 | - Download the release; do not clone the repository
55 |
56 | - Extract it and install the Python bundled in the archive (remember to check "add to path")
57 |
58 | - Open cmd in the directory containing bot.py and run the following command
59 |
60 | pip install -r requirements.txt
61 |
62 | - Edit config.json and run bot.py (botqq, port and key must match your mirai-api-http settings; botName and master are optional)
63 |
64 | Send voice to show the help menu
65 |
66 | Send sp to list all currently available characters
67 |
68 | # Importing more models (optional)
69 |
70 | Create a new folder under the voiceModel folder and put the .pth (model file) and config.json (configuration file) into it
71 |
72 | Download models:
73 |
74 |
75 |
76 | [Blue Archive themed models](https://www.bilibili.com/video/BV1wG4y1M7SL/?spm_id_from=333.999.0.0)
77 |
78 | [CjangCjengh's model repository](https://github.com/CjangCjengh/TTSModels)
79 |
80 | [**Model repository with pre-modified config files**](https://pan.baidu.com/s/1bEbDMv0Ysj0cRmwHi6WAyA?pwd=9rmj); after downloading, place the files in the corresponding project folder.
81 |
82 |
83 | Model naming rules (important):
84 | Multilingual models: add an m before the file extension, e.g. if yuuka.pth supports both Chinese and Japanese, rename it to yuukam.pth
85 | Monolingual models: no renaming needed
86 |
87 | Config file changes:
88 | Models from saya: open config.json and edit speakers, replacing the long speaker list with a single one (any name) | Chinese speaker names must be converted to unicode | https://www.xgjzx.cn/chinese
89 | Models from CjangCjengh: use as-is
90 |
91 | # Model names and config.json changes (importing models in detail)
92 |
93 | **Using the Blue Archive model library as an example**
94 |
95 | *Most models in the Blue Archive library are single-character models sharing one generic config file, but for easier use we need to edit config.json*
96 |
97 | **Convert the character's name to its unicode escape on [this site](https://www.xgjzx.cn/chinese)**
98 |
99 | 
100 |
101 |
102 | Open config.json and put the unicode escape of the character name from the previous step into the speakers entry.
103 |
104 | **Before: "speakers":["a long list of entries here; delete them all"]**
105 |
106 | 
107 |
108 |
109 | **After: "speakers": ["\u963f\u55b5\u55b5"]**
110 |
111 | 
112 |
113 |
114 | **Note**
115 |
116 | If the model supports both Chinese and Japanese, rename it from XXX.pth to XXXm.pth
117 |
--------------------------------------------------------------------------------
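
Related to the README's speaker-renaming steps above: the unicode escapes can also be produced locally instead of via the website. A convenience sketch, not part of the repository; the speaker name and config path below are examples only:

```python
# Rewrite a model's config.json so "speakers" holds a single escaped Chinese name.
import json

name = "阿喵喵"                             # example speaker name (matches the README's \u963f\u55b5\u55b5)
cfg_path = "voiceModel/YUUKA/config.json"  # example path; point this at your model folder

with open(cfg_path, encoding="utf-8") as f:
    cfg = json.load(f)

cfg["speakers"] = [name]  # replace the long default speaker list with one entry

# ensure_ascii=True writes the name as \uXXXX escapes, e.g. "\u963f\u55b5\u55b5"
with open(cfg_path, "w", encoding="utf-8") as f:
    json.dump(cfg, f, ensure_ascii=True, indent=2)
```
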
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avilliai/MoeGoe-Yirimirai/90273a09dc6a93947506e4d7782d10d8f86b5b9c/__init__.py
--------------------------------------------------------------------------------
/attentions.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | from torch import nn
4 | from torch.nn import functional as F
5 |
6 | import commons
7 | from modules import LayerNorm
8 |
9 |
10 | class Encoder(nn.Module):
11 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
12 | super().__init__()
13 | self.hidden_channels = hidden_channels
14 | self.filter_channels = filter_channels
15 | self.n_heads = n_heads
16 | self.n_layers = n_layers
17 | self.kernel_size = kernel_size
18 | self.p_dropout = p_dropout
19 | self.window_size = window_size
20 |
21 | self.drop = nn.Dropout(p_dropout)
22 | self.attn_layers = nn.ModuleList()
23 | self.norm_layers_1 = nn.ModuleList()
24 | self.ffn_layers = nn.ModuleList()
25 | self.norm_layers_2 = nn.ModuleList()
26 | for i in range(self.n_layers):
27 | self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
28 | self.norm_layers_1.append(LayerNorm(hidden_channels))
29 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
30 | self.norm_layers_2.append(LayerNorm(hidden_channels))
31 |
32 | def forward(self, x, x_mask):
33 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
34 | x = x * x_mask
35 | for i in range(self.n_layers):
36 | y = self.attn_layers[i](x, x, attn_mask)
37 | y = self.drop(y)
38 | x = self.norm_layers_1[i](x + y)
39 |
40 | y = self.ffn_layers[i](x, x_mask)
41 | y = self.drop(y)
42 | x = self.norm_layers_2[i](x + y)
43 | x = x * x_mask
44 | return x
45 |
46 |
47 | class Decoder(nn.Module):
48 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
49 | super().__init__()
50 | self.hidden_channels = hidden_channels
51 | self.filter_channels = filter_channels
52 | self.n_heads = n_heads
53 | self.n_layers = n_layers
54 | self.kernel_size = kernel_size
55 | self.p_dropout = p_dropout
56 | self.proximal_bias = proximal_bias
57 | self.proximal_init = proximal_init
58 |
59 | self.drop = nn.Dropout(p_dropout)
60 | self.self_attn_layers = nn.ModuleList()
61 | self.norm_layers_0 = nn.ModuleList()
62 | self.encdec_attn_layers = nn.ModuleList()
63 | self.norm_layers_1 = nn.ModuleList()
64 | self.ffn_layers = nn.ModuleList()
65 | self.norm_layers_2 = nn.ModuleList()
66 | for i in range(self.n_layers):
67 | self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
68 | self.norm_layers_0.append(LayerNorm(hidden_channels))
69 | self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
70 | self.norm_layers_1.append(LayerNorm(hidden_channels))
71 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
72 | self.norm_layers_2.append(LayerNorm(hidden_channels))
73 |
74 | def forward(self, x, x_mask, h, h_mask):
75 | """
76 | x: decoder input
77 | h: encoder output
78 | """
79 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
80 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
81 | x = x * x_mask
82 | for i in range(self.n_layers):
83 | y = self.self_attn_layers[i](x, x, self_attn_mask)
84 | y = self.drop(y)
85 | x = self.norm_layers_0[i](x + y)
86 |
87 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
88 | y = self.drop(y)
89 | x = self.norm_layers_1[i](x + y)
90 |
91 | y = self.ffn_layers[i](x, x_mask)
92 | y = self.drop(y)
93 | x = self.norm_layers_2[i](x + y)
94 | x = x * x_mask
95 | return x
96 |
97 |
98 | class MultiHeadAttention(nn.Module):
99 | def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
100 | super().__init__()
101 | assert channels % n_heads == 0
102 |
103 | self.channels = channels
104 | self.out_channels = out_channels
105 | self.n_heads = n_heads
106 | self.p_dropout = p_dropout
107 | self.window_size = window_size
108 | self.heads_share = heads_share
109 | self.block_length = block_length
110 | self.proximal_bias = proximal_bias
111 | self.proximal_init = proximal_init
112 | self.attn = None
113 |
114 | self.k_channels = channels // n_heads
115 | self.conv_q = nn.Conv1d(channels, channels, 1)
116 | self.conv_k = nn.Conv1d(channels, channels, 1)
117 | self.conv_v = nn.Conv1d(channels, channels, 1)
118 | self.conv_o = nn.Conv1d(channels, out_channels, 1)
119 | self.drop = nn.Dropout(p_dropout)
120 |
121 | if window_size is not None:
122 | n_heads_rel = 1 if heads_share else n_heads
123 | rel_stddev = self.k_channels**-0.5
124 | self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
125 | self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
126 |
127 | nn.init.xavier_uniform_(self.conv_q.weight)
128 | nn.init.xavier_uniform_(self.conv_k.weight)
129 | nn.init.xavier_uniform_(self.conv_v.weight)
130 | if proximal_init:
131 | with torch.no_grad():
132 | self.conv_k.weight.copy_(self.conv_q.weight)
133 | self.conv_k.bias.copy_(self.conv_q.bias)
134 |
135 | def forward(self, x, c, attn_mask=None):
136 | q = self.conv_q(x)
137 | k = self.conv_k(c)
138 | v = self.conv_v(c)
139 |
140 | x, self.attn = self.attention(q, k, v, mask=attn_mask)
141 |
142 | x = self.conv_o(x)
143 | return x
144 |
145 | def attention(self, query, key, value, mask=None):
146 | # reshape [b, d, t] -> [b, n_h, t, d_k]
147 | b, d, t_s, t_t = (*key.size(), query.size(2))
148 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
149 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
150 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
151 |
152 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
153 | if self.window_size is not None:
154 | assert t_s == t_t, "Relative attention is only available for self-attention."
155 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
156 | rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings)
157 | scores_local = self._relative_position_to_absolute_position(rel_logits)
158 | scores = scores + scores_local
159 | if self.proximal_bias:
160 | assert t_s == t_t, "Proximal bias is only available for self-attention."
161 | scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
162 | if mask is not None:
163 | scores = scores.masked_fill(mask == 0, -1e4)
164 | if self.block_length is not None:
165 | assert t_s == t_t, "Local attention is only available for self-attention."
166 | block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
167 | scores = scores.masked_fill(block_mask == 0, -1e4)
168 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
169 | p_attn = self.drop(p_attn)
170 | output = torch.matmul(p_attn, value)
171 | if self.window_size is not None:
172 | relative_weights = self._absolute_position_to_relative_position(p_attn)
173 | value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
174 | output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
175 | output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t]
176 | return output, p_attn
177 |
178 | def _matmul_with_relative_values(self, x, y):
179 | """
180 | x: [b, h, l, m]
181 | y: [h or 1, m, d]
182 | ret: [b, h, l, d]
183 | """
184 | ret = torch.matmul(x, y.unsqueeze(0))
185 | return ret
186 |
187 | def _matmul_with_relative_keys(self, x, y):
188 | """
189 | x: [b, h, l, d]
190 | y: [h or 1, m, d]
191 | ret: [b, h, l, m]
192 | """
193 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
194 | return ret
195 |
196 | def _get_relative_embeddings(self, relative_embeddings, length):
197 | max_relative_position = 2 * self.window_size + 1
198 | # Pad first before slice to avoid using cond ops.
199 | pad_length = max(length - (self.window_size + 1), 0)
200 | slice_start_position = max((self.window_size + 1) - length, 0)
201 | slice_end_position = slice_start_position + 2 * length - 1
202 | if pad_length > 0:
203 | padded_relative_embeddings = F.pad(
204 | relative_embeddings,
205 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
206 | else:
207 | padded_relative_embeddings = relative_embeddings
208 | used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position]
209 | return used_relative_embeddings
210 |
211 | def _relative_position_to_absolute_position(self, x):
212 | """
213 | x: [b, h, l, 2*l-1]
214 | ret: [b, h, l, l]
215 | """
216 | batch, heads, length, _ = x.size()
217 | # Concat columns of pad to shift from relative to absolute indexing.
218 | x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]]))
219 |
220 | # Concat extra elements so to add up to shape (len+1, 2*len-1).
221 | x_flat = x.view([batch, heads, length * 2 * length])
222 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]]))
223 |
224 | # Reshape and slice out the padded elements.
225 | x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:]
226 | return x_final
227 |
228 | def _absolute_position_to_relative_position(self, x):
229 | """
230 | x: [b, h, l, l]
231 | ret: [b, h, l, 2*l-1]
232 | """
233 | batch, heads, length, _ = x.size()
234 | # pad along column
235 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]]))
236 | x_flat = x.view([batch, heads, length**2 + length*(length -1)])
237 | # add 0's in the beginning that will skew the elements after reshape
238 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
239 | x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:]
240 | return x_final
241 |
242 | def _attention_bias_proximal(self, length):
243 | """Bias for self-attention to encourage attention to close positions.
244 | Args:
245 | length: an integer scalar.
246 | Returns:
247 | a Tensor with shape [1, 1, length, length]
248 | """
249 | r = torch.arange(length, dtype=torch.float32)
250 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
251 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
252 |
253 |
254 | class FFN(nn.Module):
255 | def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
256 | super().__init__()
257 | self.in_channels = in_channels
258 | self.out_channels = out_channels
259 | self.filter_channels = filter_channels
260 | self.kernel_size = kernel_size
261 | self.p_dropout = p_dropout
262 | self.activation = activation
263 | self.causal = causal
264 |
265 | if causal:
266 | self.padding = self._causal_padding
267 | else:
268 | self.padding = self._same_padding
269 |
270 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
271 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
272 | self.drop = nn.Dropout(p_dropout)
273 |
274 | def forward(self, x, x_mask):
275 | x = self.conv_1(self.padding(x * x_mask))
276 | if self.activation == "gelu":
277 | x = x * torch.sigmoid(1.702 * x)
278 | else:
279 | x = torch.relu(x)
280 | x = self.drop(x)
281 | x = self.conv_2(self.padding(x * x_mask))
282 | return x * x_mask
283 |
284 | def _causal_padding(self, x):
285 | if self.kernel_size == 1:
286 | return x
287 | pad_l = self.kernel_size - 1
288 | pad_r = 0
289 | padding = [[0, 0], [0, 0], [pad_l, pad_r]]
290 | x = F.pad(x, commons.convert_pad_shape(padding))
291 | return x
292 |
293 | def _same_padding(self, x):
294 | if self.kernel_size == 1:
295 | return x
296 | pad_l = (self.kernel_size - 1) // 2
297 | pad_r = self.kernel_size // 2
298 | padding = [[0, 0], [0, 0], [pad_l, pad_r]]
299 | x = F.pad(x, commons.convert_pad_shape(padding))
300 | return x
301 |
--------------------------------------------------------------------------------
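
attentions.py above is the VITS transformer encoder with windowed relative-position attention. A minimal shape check, assuming torch plus the repo's commons.py and modules.py are importable; the hyperparameters are arbitrary example values, not taken from any shipped config:

```python
# Shape sanity check for the Encoder in attentions.py (example hyperparameters only).
import torch
from attentions import Encoder

enc = Encoder(hidden_channels=192, filter_channels=768, n_heads=2, n_layers=4, kernel_size=3)
x = torch.randn(1, 192, 50)    # [batch, hidden_channels, frames]
x_mask = torch.ones(1, 1, 50)  # 1 for valid frames, 0 for padding
out = enc(x, x_mask)
print(out.shape)               # torch.Size([1, 192, 50])
```
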
/bot.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | from mirai import Voice,Image
4 | from mirai import Mirai, WebSocketAdapter, FriendMessage, GroupMessage, At, Plain
5 |
6 |
7 | import sys
8 |
9 | from MoeGoe import voiceGenerate
10 | from plugins import voicePart
11 | from plugins.RandomStr.RandomStr import random_str
12 | from plugins.picGet import pic
13 | from trans import translate
14 |
15 | if __name__ == '__main__':
16 |
17 | qq=114514  # fill in your bot's QQ number here
18 | bot = Mirai(qq, adapter=WebSocketAdapter(
19 | verify_key='1234567890', host='localhost', port=23456
20 | ))
21 | aimFriend = 1840094972
22 | aimGroup = 699455559
23 | statusPath = 1
24 | model = 0
25 | lang = '日语'
26 | @bot.on(FriendMessage)
27 | async def yuYinMode(event: FriendMessage):
28 | if str(event.message_chain).startswith('发送'):
29 | sa = str(event.message_chain)[2:]
30 | ranpath = random_str()
31 | out = sys.argv[0][:-20] + 'PythonPlugins\\plugins\\voices\\' + ranpath + '.wav'
32 |
33 | if int(statusPath)==0:
34 | if lang=='中文':
35 | tex = '[ZH]' + sa + '[ZH]'
36 | voiceGenerate(tex, out,model)
37 | await bot.send_friend_message(int(aimFriend),Voice(path=out))
38 | if lang=='日语':
39 | tex = '[JA]' + translate(sa) + '[JA]'
40 | voiceGenerate(tex, out,model)
41 | await bot.send_friend_message(int(aimFriend),Voice(path=out))
42 | elif int(statusPath)==1:
43 | if lang=='中文':
44 | tex = '[ZH]' + sa + '[ZH]'
45 | voiceGenerate(tex, out,model)
46 | await bot.send_group_message(int(aimGroup),Voice(path=out))
47 | if lang=='日语':
48 | tex = '[JA]' + translate(sa) + '[JA]'
49 | voiceGenerate(tex, out,model)
50 | await bot.send_group_message(int(aimGroup),Voice(path=out))
51 | # picture module
52 | @bot.on(GroupMessage)
53 | async def handle_group_message(event: GroupMessage):
54 | if '/pic' in str(event.message_chain):
55 | # parse the count after '/pic'; an empty suffix means a single picture
56 | numStr = str(event.message_chain).split('/pic', 1)[1].strip()
57 | picNum = int(numStr) if numStr else 1
58 | if -1 < picNum < 10:
59 | for i in range(picNum):
60 | a = pic()
61 | await bot.send(event, Image(path=a))
62 | else:
63 | await bot.send(event,"可以发点正常的数字吗")
64 |
65 |
66 |
67 |
68 |
69 |
70 | # no longer works
71 | '''@bot.on(FriendMessage)
72 | async def handle_group_message(event: FriendMessage):
73 | if str(event.message_chain).startswith('#说'):
74 | if len(str(event.message_chain)) < 280:
75 | ranpath = random_str()
76 | out = sys.argv[0][:-20] + 'PythonPlugins\\plugins\\voices\\' + ranpath + '.wav'
77 | tex = '[JA]' + translate((str(event.message_chain))[1:]) + '[JA]'
78 | voiceGenerate(tex, out)
79 | await bot.send(event, Voice(path=out))
80 | else:
81 | ranpath = random_str()
82 | out = sys.argv[0][:-20] + 'PythonPlugins\\plugins\\voices\\' + ranpath + '.wav'
83 | tex = '[JA]' + translate('不行,太长了哦.....') + '[JA]'
84 | voiceGenerate(tex, out)
85 | await bot(event, Voice(path=out))'''
86 |
87 | # connect to a group chat
88 | @bot.on(FriendMessage)
89 | async def on_friend_message(event: FriendMessage):
90 | if str(event.message_chain).startswith('连接群'):
91 | sa = str(event.message_chain).split('#')
92 | global aimGroup
93 | aimGroup=int(sa[1])
94 | global statusPath
95 | statusPath = 1
96 | await bot.send(event, '已切换为群聊'+sa[1])
97 |
98 | # connect to a private chat target
99 | @bot.on(FriendMessage)
100 | async def on_friend_message(event: FriendMessage):
101 | if str(event.message_chain).startswith('连接对象'):
102 | sa = str(event.message_chain).split('#')
103 | global aimFriend
104 | aimFriend=sa[1]
105 | global statusPath
106 | statusPath=0
107 | await bot.send(event, '已切换为私聊对象'+sa[1])
108 |
109 | # language switching
110 | @bot.on(FriendMessage)
111 | async def Lanconfig(event: FriendMessage):
112 | if str(event.message_chain).startswith('切换'):
113 | sa = str(event.message_chain)[2:]
114 | global lang
115 | if sa=='中文':
116 | lang=sa
117 | await bot.send(event, '已切换,当前使用语言'+sa)
118 | elif sa=='日语':
119 | lang=sa
120 | await bot.send(event, '已切换,当前使用语言' + sa)
121 | else:
122 | await bot.send(event, '数值不合法,语言选择:中文/日语')
123 | # model switching
124 | @bot.on(FriendMessage)
125 | async def on_friend_message(event: FriendMessage):
126 | if str(event.message_chain).startswith('M'):
127 | sa = str(event.message_chain).split('#')
128 | modelList = ['0', '1', '2', '3']
129 | if sa[1] in modelList:
130 | global model
131 | model=int(sa[1])
132 | await bot.send(event, '已切换,当前使用模型' + sa[1])
133 | else:
134 | await bot.send(event, '数值不合法,模型范围[0-3]')
135 |
136 | # model switching
137 | @bot.on(GroupMessage)
138 | async def on_group_message(event: GroupMessage):
139 | if str(event.message_chain).startswith('M'):
140 | sa = str(event.message_chain).split('#')
141 | modelList = ['0', '1', '2', '3']
142 | if sa[1] in modelList:
143 | global model
144 | model = int(sa[1])
145 | await bot.send(event, '已切换,当前使用模型' + sa[1])
146 | else:
147 | await bot.send(event, '数值不合法,模型范围[0-3]')
148 |
149 |
150 | voicePart.main(bot) # voice generation (active trigger)
151 | bot.run()
152 |
153 |
154 |
--------------------------------------------------------------------------------
/commons.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | from torch.nn import functional as F
4 | import torch.jit
5 |
6 |
7 | def script_method(fn, _rcb=None):
8 | return fn
9 |
10 |
11 | def script(obj, optimize=True, _frames_up=0, _rcb=None):
12 | return obj
13 |
14 |
15 | torch.jit.script_method = script_method
16 | torch.jit.script = script
17 |
18 |
19 | def init_weights(m, mean=0.0, std=0.01):
20 | classname = m.__class__.__name__
21 | if classname.find("Conv") != -1:
22 | m.weight.data.normal_(mean, std)
23 |
24 |
25 | def get_padding(kernel_size, dilation=1):
26 | return int((kernel_size*dilation - dilation)/2)
27 |
28 |
29 | def intersperse(lst, item):
30 | result = [item] * (len(lst) * 2 + 1)
31 | result[1::2] = lst
32 | return result
33 |
34 |
35 | def slice_segments(x, ids_str, segment_size=4):
36 | ret = torch.zeros_like(x[:, :, :segment_size])
37 | for i in range(x.size(0)):
38 | idx_str = ids_str[i]
39 | idx_end = idx_str + segment_size
40 | ret[i] = x[i, :, idx_str:idx_end]
41 | return ret
42 |
43 |
44 | def rand_slice_segments(x, x_lengths=None, segment_size=4):
45 | b, d, t = x.size()
46 | if x_lengths is None:
47 | x_lengths = t
48 | ids_str_max = x_lengths - segment_size + 1
49 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
50 | ret = slice_segments(x, ids_str, segment_size)
51 | return ret, ids_str
52 |
53 |
54 | def subsequent_mask(length):
55 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
56 | return mask
57 |
58 |
59 | @torch.jit.script
60 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
61 | n_channels_int = n_channels[0]
62 | in_act = input_a + input_b
63 | t_act = torch.tanh(in_act[:, :n_channels_int, :])
64 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
65 | acts = t_act * s_act
66 | return acts
67 |
68 |
69 | def convert_pad_shape(pad_shape):
70 | l = pad_shape[::-1]
71 | pad_shape = [item for sublist in l for item in sublist]
72 | return pad_shape
73 |
74 |
75 | def sequence_mask(length, max_length=None):
76 | if max_length is None:
77 | max_length = length.max()
78 | x = torch.arange(max_length, dtype=length.dtype, device=length.device)
79 | return x.unsqueeze(0) < length.unsqueeze(1)
80 |
81 |
82 | def generate_path(duration, mask):
83 | """
84 | duration: [b, 1, t_x]
85 | mask: [b, 1, t_y, t_x]
86 | """
87 | device = duration.device
88 |
89 | b, _, t_y, t_x = mask.shape
90 | cum_duration = torch.cumsum(duration, -1)
91 |
92 | cum_duration_flat = cum_duration.view(b * t_x)
93 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
94 | path = path.view(b, t_x, t_y)
95 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
96 | path = path.unsqueeze(1).transpose(2,3) * mask
97 | return path
98 |
--------------------------------------------------------------------------------
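
Two helpers in commons.py above are worth a quick illustration: intersperse() is what get_text() in MoeGoe.py uses to weave a blank symbol between token ids when add_blank is set, and sequence_mask() turns per-item lengths into a boolean padding mask. The inputs below are made-up example values:

```python
# Quick checks of commons.intersperse and commons.sequence_mask (example inputs only).
import torch
from commons import intersperse, sequence_mask

print(intersperse([5, 3, 7], 0))
# [0, 5, 0, 3, 0, 7, 0]

print(sequence_mask(torch.tensor([2, 4])))
# tensor([[ True,  True, False, False],
#         [ True,  True,  True,  True]])
```
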
/hubert_model.py:
--------------------------------------------------------------------------------
1 | import copy
2 | from typing import Optional, Tuple
3 | import random
4 |
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 | from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
9 |
10 | class Hubert(nn.Module):
11 | def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
12 | super().__init__()
13 | self._mask = mask
14 | self.feature_extractor = FeatureExtractor()
15 | self.feature_projection = FeatureProjection()
16 | self.positional_embedding = PositionalConvEmbedding()
17 | self.norm = nn.LayerNorm(768)
18 | self.dropout = nn.Dropout(0.1)
19 | self.encoder = TransformerEncoder(
20 | nn.TransformerEncoderLayer(
21 | 768, 12, 3072, activation="gelu", batch_first=True
22 | ),
23 | 12,
24 | )
25 | self.proj = nn.Linear(768, 256)
26 |
27 | self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
28 | self.label_embedding = nn.Embedding(num_label_embeddings, 256)
29 |
30 | def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
31 | mask = None
32 | if self.training and self._mask:
33 | mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
34 | x[mask] = self.masked_spec_embed.to(x.dtype)
35 | return x, mask
36 |
37 | def encode(
38 | self, x: torch.Tensor, layer: Optional[int] = None
39 | ) -> Tuple[torch.Tensor, torch.Tensor]:
40 | x = self.feature_extractor(x)
41 | x = self.feature_projection(x.transpose(1, 2))
42 | x, mask = self.mask(x)
43 | x = x + self.positional_embedding(x)
44 | x = self.dropout(self.norm(x))
45 | x = self.encoder(x, output_layer=layer)
46 | return x, mask
47 |
48 | def logits(self, x: torch.Tensor) -> torch.Tensor:
49 | logits = torch.cosine_similarity(
50 | x.unsqueeze(2),
51 | self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
52 | dim=-1,
53 | )
54 | return logits / 0.1
55 |
56 | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
57 | x, mask = self.encode(x)
58 | x = self.proj(x)
59 | logits = self.logits(x)
60 | return logits, mask
61 |
62 |
63 | class HubertSoft(Hubert):
64 | def __init__(self):
65 | super().__init__()
66 |
67 | @torch.inference_mode()
68 | def units(self, wav: torch.Tensor) -> torch.Tensor:
69 | wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
70 | x, _ = self.encode(wav)
71 | return self.proj(x)
72 |
73 |
74 | class FeatureExtractor(nn.Module):
75 | def __init__(self):
76 | super().__init__()
77 | self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
78 | self.norm0 = nn.GroupNorm(512, 512)
79 | self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
80 | self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
81 | self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
82 | self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
83 | self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
84 | self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
85 |
86 | def forward(self, x: torch.Tensor) -> torch.Tensor:
87 | x = F.gelu(self.norm0(self.conv0(x)))
88 | x = F.gelu(self.conv1(x))
89 | x = F.gelu(self.conv2(x))
90 | x = F.gelu(self.conv3(x))
91 | x = F.gelu(self.conv4(x))
92 | x = F.gelu(self.conv5(x))
93 | x = F.gelu(self.conv6(x))
94 | return x
95 |
96 |
97 | class FeatureProjection(nn.Module):
98 | def __init__(self):
99 | super().__init__()
100 | self.norm = nn.LayerNorm(512)
101 | self.projection = nn.Linear(512, 768)
102 | self.dropout = nn.Dropout(0.1)
103 |
104 | def forward(self, x: torch.Tensor) -> torch.Tensor:
105 | x = self.norm(x)
106 | x = self.projection(x)
107 | x = self.dropout(x)
108 | return x
109 |
110 |
111 | class PositionalConvEmbedding(nn.Module):
112 | def __init__(self):
113 | super().__init__()
114 | self.conv = nn.Conv1d(
115 | 768,
116 | 768,
117 | kernel_size=128,
118 | padding=128 // 2,
119 | groups=16,
120 | )
121 | self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
122 |
123 | def forward(self, x: torch.Tensor) -> torch.Tensor:
124 | x = self.conv(x.transpose(1, 2))
125 | x = F.gelu(x[:, :, :-1])
126 | return x.transpose(1, 2)
127 |
128 |
129 | class TransformerEncoder(nn.Module):
130 | def __init__(
131 | self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
132 | ) -> None:
133 | super(TransformerEncoder, self).__init__()
134 | self.layers = nn.ModuleList(
135 | [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
136 | )
137 | self.num_layers = num_layers
138 |
139 | def forward(
140 | self,
141 | src: torch.Tensor,
142 | mask: torch.Tensor = None,
143 | src_key_padding_mask: torch.Tensor = None,
144 | output_layer: Optional[int] = None,
145 | ) -> torch.Tensor:
146 | output = src
147 | for layer in self.layers[:output_layer]:
148 | output = layer(
149 | output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
150 | )
151 | return output
152 |
153 |
154 | def _compute_mask(
155 | shape: Tuple[int, int],
156 | mask_prob: float,
157 | mask_length: int,
158 | device: torch.device,
159 | min_masks: int = 0,
160 | ) -> torch.Tensor:
161 | batch_size, sequence_length = shape
162 |
163 | if mask_length < 1:
164 | raise ValueError("`mask_length` has to be bigger than 0.")
165 |
166 | if mask_length > sequence_length:
167 | raise ValueError(
168 |             f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}"
169 | )
170 |
171 | # compute number of masked spans in batch
172 | num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
173 | num_masked_spans = max(num_masked_spans, min_masks)
174 |
175 | # make sure num masked indices <= sequence_length
176 | if num_masked_spans * mask_length > sequence_length:
177 | num_masked_spans = sequence_length // mask_length
178 |
179 | # SpecAugment mask to fill
180 | mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)
181 |
182 | # uniform distribution to sample from, make sure that offset samples are < sequence_length
183 | uniform_dist = torch.ones(
184 | (batch_size, sequence_length - (mask_length - 1)), device=device
185 | )
186 |
187 | # get random indices to mask
188 | mask_indices = torch.multinomial(uniform_dist, num_masked_spans)
189 |
190 | # expand masked indices to masked spans
191 | mask_indices = (
192 | mask_indices.unsqueeze(dim=-1)
193 | .expand((batch_size, num_masked_spans, mask_length))
194 | .reshape(batch_size, num_masked_spans * mask_length)
195 | )
196 | offsets = (
197 | torch.arange(mask_length, device=device)[None, None, :]
198 | .expand((batch_size, num_masked_spans, mask_length))
199 | .reshape(batch_size, num_masked_spans * mask_length)
200 | )
201 | mask_idxs = mask_indices + offsets
202 |
203 | # scatter indices to mask
204 | mask = mask.scatter(1, mask_idxs, True)
205 |
206 | return mask
207 |
208 |
209 | def hubert_soft(
210 | path: str
211 | ) -> HubertSoft:
212 | r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
213 | Args:
214 | path (str): path of a pretrained model
215 | """
216 | hubert = HubertSoft()
217 | checkpoint = torch.load(path)
218 | consume_prefix_in_state_dict_if_present(checkpoint, "module.")
219 | hubert.load_state_dict(checkpoint)
220 | hubert.eval()
221 | return hubert
222 |
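# Editor's note: a minimal usage sketch (not part of the original file). It runs a
# randomly initialised HubertSoft on one second of silence just to show the expected
# shapes; real use would load a checkpoint via hubert_soft("path/to/checkpoint.pt"),
# where the path is a placeholder (no checkpoint ships with this repository).
if __name__ == "__main__":
    model = HubertSoft()
    model.eval()
    wav = torch.zeros(1, 1, 16000)  # [batch, 1, samples] mono waveform at 16 kHz
    units = model.units(wav)        # [batch, frames, 256] soft speech units
    print(units.shape)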
--------------------------------------------------------------------------------
/mel_processing.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.utils.data
3 | from librosa.filters import mel as librosa_mel_fn
4 |
5 | MAX_WAV_VALUE = 32768.0
6 |
7 |
8 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
9 | """
10 | PARAMS
11 | ------
12 | C: compression factor
13 | """
14 | return torch.log(torch.clamp(x, min=clip_val) * C)
15 |
16 |
17 | def dynamic_range_decompression_torch(x, C=1):
18 | """
19 | PARAMS
20 | ------
21 | C: compression factor used to compress
22 | """
23 | return torch.exp(x) / C
24 |
25 |
26 | def spectral_normalize_torch(magnitudes):
27 | output = dynamic_range_compression_torch(magnitudes)
28 | return output
29 |
30 |
31 | def spectral_de_normalize_torch(magnitudes):
32 | output = dynamic_range_decompression_torch(magnitudes)
33 | return output
34 |
35 |
36 | mel_basis = {}
37 | hann_window = {}
38 |
39 |
40 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
41 | if torch.min(y) < -1.:
42 | print('min value is ', torch.min(y))
43 | if torch.max(y) > 1.:
44 | print('max value is ', torch.max(y))
45 |
46 | global hann_window
47 | dtype_device = str(y.dtype) + '_' + str(y.device)
48 | wnsize_dtype_device = str(win_size) + '_' + dtype_device
49 | if wnsize_dtype_device not in hann_window:
50 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
51 |
52 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
53 | y = y.squeeze(1)
54 |
55 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
56 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
57 |
58 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
59 | return spec
60 |
61 |
62 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
63 | global mel_basis
64 | dtype_device = str(spec.dtype) + '_' + str(spec.device)
65 | fmax_dtype_device = str(fmax) + '_' + dtype_device
66 | if fmax_dtype_device not in mel_basis:
67 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
68 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
69 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
70 | spec = spectral_normalize_torch(spec)
71 | return spec
72 |
73 |
74 | def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
75 | if torch.min(y) < -1.:
76 | print('min value is ', torch.min(y))
77 | if torch.max(y) > 1.:
78 | print('max value is ', torch.max(y))
79 |
80 | global mel_basis, hann_window
81 | dtype_device = str(y.dtype) + '_' + str(y.device)
82 | fmax_dtype_device = str(fmax) + '_' + dtype_device
83 | wnsize_dtype_device = str(win_size) + '_' + dtype_device
84 | if fmax_dtype_device not in mel_basis:
85 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
86 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
87 | if wnsize_dtype_device not in hann_window:
88 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
89 |
90 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
91 | y = y.squeeze(1)
92 |
93 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
94 |                       center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
95 |
96 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
97 |
98 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
99 | spec = spectral_normalize_torch(spec)
100 |
101 | return spec
102 |
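# Editor's note: a minimal usage sketch (not part of the original file). The STFT/mel
# settings below are common VITS-style values used as placeholders; the real ones come
# from the loaded voice model's config.json.
if __name__ == "__main__":
    y = torch.randn(1, 22050).clamp(-1., 1.)  # one second of fake audio in [-1, 1] at 22.05 kHz
    mel = mel_spectrogram_torch(y, n_fft=1024, num_mels=80, sampling_rate=22050,
                                hop_size=256, win_size=1024, fmin=0, fmax=None)
    print(mel.shape)                           # [1, 80, frames] log-mel spectrogram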
--------------------------------------------------------------------------------
/models.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | from torch import nn
4 | from torch.nn import functional as F
5 |
6 | import commons
7 | import modules
8 | import attentions
9 | import monotonic_align
10 |
11 | from torch.nn import Conv1d, ConvTranspose1d, Conv2d
12 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
13 | from commons import init_weights, get_padding
14 |
15 |
16 | class StochasticDurationPredictor(nn.Module):
17 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
18 | super().__init__()
19 |     filter_channels = in_channels  # it needs to be removed in a future version.
20 | self.in_channels = in_channels
21 | self.filter_channels = filter_channels
22 | self.kernel_size = kernel_size
23 | self.p_dropout = p_dropout
24 | self.n_flows = n_flows
25 | self.gin_channels = gin_channels
26 |
27 | self.log_flow = modules.Log()
28 | self.flows = nn.ModuleList()
29 | self.flows.append(modules.ElementwiseAffine(2))
30 | for i in range(n_flows):
31 | self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
32 | self.flows.append(modules.Flip())
33 |
34 | self.post_pre = nn.Conv1d(1, filter_channels, 1)
35 | self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
36 | self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
37 | self.post_flows = nn.ModuleList()
38 | self.post_flows.append(modules.ElementwiseAffine(2))
39 | for i in range(4):
40 | self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
41 | self.post_flows.append(modules.Flip())
42 |
43 | self.pre = nn.Conv1d(in_channels, filter_channels, 1)
44 | self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
45 | self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
46 | if gin_channels != 0:
47 | self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
48 |
49 | def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
50 | x = torch.detach(x)
51 | x = self.pre(x)
52 | if g is not None:
53 | g = torch.detach(g)
54 | x = x + self.cond(g)
55 | x = self.convs(x, x_mask)
56 | x = self.proj(x) * x_mask
57 |
58 | if not reverse:
59 | flows = self.flows
60 | assert w is not None
61 |
62 | logdet_tot_q = 0
63 | h_w = self.post_pre(w)
64 | h_w = self.post_convs(h_w, x_mask)
65 | h_w = self.post_proj(h_w) * x_mask
66 | e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
67 | z_q = e_q
68 | for flow in self.post_flows:
69 | z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
70 | logdet_tot_q += logdet_q
71 | z_u, z1 = torch.split(z_q, [1, 1], 1)
72 | u = torch.sigmoid(z_u) * x_mask
73 | z0 = (w - u) * x_mask
74 | logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2])
75 | logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q
76 |
77 | logdet_tot = 0
78 | z0, logdet = self.log_flow(z0, x_mask)
79 | logdet_tot += logdet
80 | z = torch.cat([z0, z1], 1)
81 | for flow in flows:
82 | z, logdet = flow(z, x_mask, g=x, reverse=reverse)
83 | logdet_tot = logdet_tot + logdet
84 | nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot
85 | return nll + logq # [b]
86 | else:
87 | flows = list(reversed(self.flows))
88 | flows = flows[:-2] + [flows[-1]] # remove a useless vflow
89 | z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
90 | for flow in flows:
91 | z = flow(z, x_mask, g=x, reverse=reverse)
92 | z0, z1 = torch.split(z, [1, 1], 1)
93 | logw = z0
94 | return logw
95 |
96 |
97 | class DurationPredictor(nn.Module):
98 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
99 | super().__init__()
100 |
101 | self.in_channels = in_channels
102 | self.filter_channels = filter_channels
103 | self.kernel_size = kernel_size
104 | self.p_dropout = p_dropout
105 | self.gin_channels = gin_channels
106 |
107 | self.drop = nn.Dropout(p_dropout)
108 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2)
109 | self.norm_1 = modules.LayerNorm(filter_channels)
110 | self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2)
111 | self.norm_2 = modules.LayerNorm(filter_channels)
112 | self.proj = nn.Conv1d(filter_channels, 1, 1)
113 |
114 | if gin_channels != 0:
115 | self.cond = nn.Conv1d(gin_channels, in_channels, 1)
116 |
117 | def forward(self, x, x_mask, g=None):
118 | x = torch.detach(x)
119 | if g is not None:
120 | g = torch.detach(g)
121 | x = x + self.cond(g)
122 | x = self.conv_1(x * x_mask)
123 | x = torch.relu(x)
124 | x = self.norm_1(x)
125 | x = self.drop(x)
126 | x = self.conv_2(x * x_mask)
127 | x = torch.relu(x)
128 | x = self.norm_2(x)
129 | x = self.drop(x)
130 | x = self.proj(x * x_mask)
131 | return x * x_mask
132 |
133 |
134 | class TextEncoder(nn.Module):
135 | def __init__(self,
136 | n_vocab,
137 | out_channels,
138 | hidden_channels,
139 | filter_channels,
140 | n_heads,
141 | n_layers,
142 | kernel_size,
143 | p_dropout,
144 | emotion_embedding):
145 | super().__init__()
146 | self.n_vocab = n_vocab
147 | self.out_channels = out_channels
148 | self.hidden_channels = hidden_channels
149 | self.filter_channels = filter_channels
150 | self.n_heads = n_heads
151 | self.n_layers = n_layers
152 | self.kernel_size = kernel_size
153 | self.p_dropout = p_dropout
154 | self.emotion_embedding = emotion_embedding
155 |
156 | if self.n_vocab!=0:
157 | self.emb = nn.Embedding(n_vocab, hidden_channels)
158 | if emotion_embedding:
159 | self.emo_proj = nn.Linear(1024, hidden_channels)
160 | nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
161 |
162 | self.encoder = attentions.Encoder(
163 | hidden_channels,
164 | filter_channels,
165 | n_heads,
166 | n_layers,
167 | kernel_size,
168 | p_dropout)
169 | self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1)
170 |
171 | def forward(self, x, x_lengths, emotion_embedding=None):
172 | if self.n_vocab!=0:
173 | x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
174 | if emotion_embedding is not None:
175 | x = x + self.emo_proj(emotion_embedding.unsqueeze(1))
176 | x = torch.transpose(x, 1, -1) # [b, h, t]
177 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
178 |
179 | x = self.encoder(x * x_mask, x_mask)
180 | stats = self.proj(x) * x_mask
181 |
182 | m, logs = torch.split(stats, self.out_channels, dim=1)
183 | return x, m, logs, x_mask
184 |
185 |
186 | class ResidualCouplingBlock(nn.Module):
187 | def __init__(self,
188 | channels,
189 | hidden_channels,
190 | kernel_size,
191 | dilation_rate,
192 | n_layers,
193 | n_flows=4,
194 | gin_channels=0):
195 | super().__init__()
196 | self.channels = channels
197 | self.hidden_channels = hidden_channels
198 | self.kernel_size = kernel_size
199 | self.dilation_rate = dilation_rate
200 | self.n_layers = n_layers
201 | self.n_flows = n_flows
202 | self.gin_channels = gin_channels
203 |
204 | self.flows = nn.ModuleList()
205 | for i in range(n_flows):
206 | self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
207 | self.flows.append(modules.Flip())
208 |
209 | def forward(self, x, x_mask, g=None, reverse=False):
210 | if not reverse:
211 | for flow in self.flows:
212 | x, _ = flow(x, x_mask, g=g, reverse=reverse)
213 | else:
214 | for flow in reversed(self.flows):
215 | x = flow(x, x_mask, g=g, reverse=reverse)
216 | return x
217 |
218 |
219 | class PosteriorEncoder(nn.Module):
220 | def __init__(self,
221 | in_channels,
222 | out_channels,
223 | hidden_channels,
224 | kernel_size,
225 | dilation_rate,
226 | n_layers,
227 | gin_channels=0):
228 | super().__init__()
229 | self.in_channels = in_channels
230 | self.out_channels = out_channels
231 | self.hidden_channels = hidden_channels
232 | self.kernel_size = kernel_size
233 | self.dilation_rate = dilation_rate
234 | self.n_layers = n_layers
235 | self.gin_channels = gin_channels
236 |
237 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
238 | self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
239 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
240 |
241 | def forward(self, x, x_lengths, g=None):
242 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
243 | x = self.pre(x) * x_mask
244 | x = self.enc(x, x_mask, g=g)
245 | stats = self.proj(x) * x_mask
246 | m, logs = torch.split(stats, self.out_channels, dim=1)
247 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
248 | return z, m, logs, x_mask
249 |
250 |
251 | class Generator(torch.nn.Module):
252 | def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
253 | super(Generator, self).__init__()
254 | self.num_kernels = len(resblock_kernel_sizes)
255 | self.num_upsamples = len(upsample_rates)
256 | self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
257 | resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2
258 |
259 | self.ups = nn.ModuleList()
260 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
261 | self.ups.append(weight_norm(
262 | ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
263 | k, u, padding=(k-u)//2)))
264 |
265 | self.resblocks = nn.ModuleList()
266 | for i in range(len(self.ups)):
267 | ch = upsample_initial_channel//(2**(i+1))
268 | for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
269 | self.resblocks.append(resblock(ch, k, d))
270 |
271 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
272 | self.ups.apply(init_weights)
273 |
274 | if gin_channels != 0:
275 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
276 |
277 | def forward(self, x, g=None):
278 | x = self.conv_pre(x)
279 | if g is not None:
280 | x = x + self.cond(g)
281 |
282 | for i in range(self.num_upsamples):
283 | x = F.leaky_relu(x, modules.LRELU_SLOPE)
284 | x = self.ups[i](x)
285 | xs = None
286 | for j in range(self.num_kernels):
287 | if xs is None:
288 | xs = self.resblocks[i*self.num_kernels+j](x)
289 | else:
290 | xs += self.resblocks[i*self.num_kernels+j](x)
291 | x = xs / self.num_kernels
292 | x = F.leaky_relu(x)
293 | x = self.conv_post(x)
294 | x = torch.tanh(x)
295 |
296 | return x
297 |
298 | def remove_weight_norm(self):
299 | print('Removing weight norm...')
300 | for l in self.ups:
301 | remove_weight_norm(l)
302 | for l in self.resblocks:
303 | l.remove_weight_norm()
304 |
305 |
306 | class DiscriminatorP(torch.nn.Module):
307 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
308 | super(DiscriminatorP, self).__init__()
309 | self.period = period
310 | self.use_spectral_norm = use_spectral_norm
311 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm
312 | self.convs = nn.ModuleList([
313 | norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
314 | norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
315 | norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
316 | norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
317 | norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
318 | ])
319 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
320 |
321 | def forward(self, x):
322 | fmap = []
323 |
324 | # 1d to 2d
325 | b, c, t = x.shape
326 | if t % self.period != 0: # pad first
327 | n_pad = self.period - (t % self.period)
328 | x = F.pad(x, (0, n_pad), "reflect")
329 | t = t + n_pad
330 | x = x.view(b, c, t // self.period, self.period)
331 |
332 | for l in self.convs:
333 | x = l(x)
334 | x = F.leaky_relu(x, modules.LRELU_SLOPE)
335 | fmap.append(x)
336 | x = self.conv_post(x)
337 | fmap.append(x)
338 | x = torch.flatten(x, 1, -1)
339 |
340 | return x, fmap
341 |
342 |
343 | class DiscriminatorS(torch.nn.Module):
344 | def __init__(self, use_spectral_norm=False):
345 | super(DiscriminatorS, self).__init__()
346 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm
347 | self.convs = nn.ModuleList([
348 | norm_f(Conv1d(1, 16, 15, 1, padding=7)),
349 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
350 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
351 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
352 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
353 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
354 | ])
355 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
356 |
357 | def forward(self, x):
358 | fmap = []
359 |
360 | for l in self.convs:
361 | x = l(x)
362 | x = F.leaky_relu(x, modules.LRELU_SLOPE)
363 | fmap.append(x)
364 | x = self.conv_post(x)
365 | fmap.append(x)
366 | x = torch.flatten(x, 1, -1)
367 |
368 | return x, fmap
369 |
370 |
371 | class MultiPeriodDiscriminator(torch.nn.Module):
372 | def __init__(self, use_spectral_norm=False):
373 | super(MultiPeriodDiscriminator, self).__init__()
374 | periods = [2,3,5,7,11]
375 |
376 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
377 | discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
378 | self.discriminators = nn.ModuleList(discs)
379 |
380 | def forward(self, y, y_hat):
381 | y_d_rs = []
382 | y_d_gs = []
383 | fmap_rs = []
384 | fmap_gs = []
385 | for i, d in enumerate(self.discriminators):
386 | y_d_r, fmap_r = d(y)
387 | y_d_g, fmap_g = d(y_hat)
388 | y_d_rs.append(y_d_r)
389 | y_d_gs.append(y_d_g)
390 | fmap_rs.append(fmap_r)
391 | fmap_gs.append(fmap_g)
392 |
393 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs
394 |
395 |
396 |
397 | class SynthesizerTrn(nn.Module):
398 | """
399 | Synthesizer for Training
400 | """
401 |
402 | def __init__(self,
403 | n_vocab,
404 | spec_channels,
405 | segment_size,
406 | inter_channels,
407 | hidden_channels,
408 | filter_channels,
409 | n_heads,
410 | n_layers,
411 | kernel_size,
412 | p_dropout,
413 | resblock,
414 | resblock_kernel_sizes,
415 | resblock_dilation_sizes,
416 | upsample_rates,
417 | upsample_initial_channel,
418 | upsample_kernel_sizes,
419 | n_speakers=0,
420 | gin_channels=0,
421 | use_sdp=True,
422 | emotion_embedding=False,
423 | **kwargs):
424 |
425 | super().__init__()
426 | self.n_vocab = n_vocab
427 | self.spec_channels = spec_channels
428 | self.inter_channels = inter_channels
429 | self.hidden_channels = hidden_channels
430 | self.filter_channels = filter_channels
431 | self.n_heads = n_heads
432 | self.n_layers = n_layers
433 | self.kernel_size = kernel_size
434 | self.p_dropout = p_dropout
435 | self.resblock = resblock
436 | self.resblock_kernel_sizes = resblock_kernel_sizes
437 | self.resblock_dilation_sizes = resblock_dilation_sizes
438 | self.upsample_rates = upsample_rates
439 | self.upsample_initial_channel = upsample_initial_channel
440 | self.upsample_kernel_sizes = upsample_kernel_sizes
441 | self.segment_size = segment_size
442 | self.n_speakers = n_speakers
443 | self.gin_channels = gin_channels
444 |
445 | self.use_sdp = use_sdp
446 |
447 | self.enc_p = TextEncoder(n_vocab,
448 | inter_channels,
449 | hidden_channels,
450 | filter_channels,
451 | n_heads,
452 | n_layers,
453 | kernel_size,
454 | p_dropout,
455 | emotion_embedding)
456 | self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
457 | self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
458 | self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
459 |
460 | if use_sdp:
461 | self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
462 | else:
463 | self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
464 |
465 | if n_speakers > 1:
466 | self.emb_g = nn.Embedding(n_speakers, gin_channels)
467 |
468 | def forward(self, x, x_lengths, y, y_lengths, sid=None, emotion_embedding=None):
469 |
470 |         x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, emotion_embedding)
471 | if self.n_speakers > 0:
472 | g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
473 | else:
474 | g = None
475 |
476 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
477 | z_p = self.flow(z, y_mask, g=g)
478 |
479 | with torch.no_grad():
480 | # negative cross-entropy
481 | s_p_sq_r = torch.exp(-2 * logs_p) # [b, d, t]
482 | neg_cent1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True) # [b, 1, t_s]
483 | neg_cent2 = torch.matmul(-0.5 * (z_p ** 2).transpose(1, 2), s_p_sq_r) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
484 | neg_cent3 = torch.matmul(z_p.transpose(1, 2), (m_p * s_p_sq_r)) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
485 | neg_cent4 = torch.sum(-0.5 * (m_p ** 2) * s_p_sq_r, [1], keepdim=True) # [b, 1, t_s]
486 | neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4
487 |
488 | attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
489 | attn = monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)).unsqueeze(1).detach()
490 |
491 | w = attn.sum(2)
492 | if self.use_sdp:
493 | l_length = self.dp(x, x_mask, w, g=g)
494 | l_length = l_length / torch.sum(x_mask)
495 | else:
496 | logw_ = torch.log(w + 1e-6) * x_mask
497 | logw = self.dp(x, x_mask, g=g)
498 | l_length = torch.sum((logw - logw_)**2, [1,2]) / torch.sum(x_mask) # for averaging
499 |
500 | # expand prior
501 | m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
502 | logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)
503 |
504 | z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size)
505 | o = self.dec(z_slice, g=g)
506 | return o, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
507 |
508 | def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None, emotion_embedding=None):
509 | x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, emotion_embedding)
510 | if self.n_speakers > 0:
511 | g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
512 | else:
513 | g = None
514 |
515 | if self.use_sdp:
516 | logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
517 | else:
518 | logw = self.dp(x, x_mask, g=g)
519 | w = torch.exp(logw) * x_mask * length_scale
520 | w_ceil = torch.ceil(w)
521 | y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
522 | y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
523 | attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
524 | attn = commons.generate_path(w_ceil, attn_mask)
525 |
526 | m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
527 | logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
528 |
529 | z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
530 | z = self.flow(z_p, y_mask, g=g, reverse=True)
531 | o = self.dec((z * y_mask)[:,:,:max_len], g=g)
532 | return o, attn, y_mask, (z, z_p, m_p, logs_p)
533 |
534 | def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
535 |         assert self.n_speakers > 0, "n_speakers has to be larger than 0."
536 | g_src = self.emb_g(sid_src).unsqueeze(-1)
537 | g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
538 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
539 | z_p = self.flow(z, y_mask, g=g_src)
540 | z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
541 | o_hat = self.dec(z_hat * y_mask, g=g_tgt)
542 | return o_hat, y_mask, (z, z_p, z_hat)
543 |
544 |
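# Editor's note: a minimal, self-contained inference sketch (not part of the original
# file). Every hyperparameter below is an illustrative placeholder; in this project the
# real values are read from the voice model's config.json, and model construction and
# checkpoint loading are handled by the inference code in MoeGoe.py.
if __name__ == "__main__":
    net_g = SynthesizerTrn(
        n_vocab=100, spec_channels=513, segment_size=32,
        inter_channels=192, hidden_channels=192, filter_channels=768,
        n_heads=2, n_layers=6, kernel_size=3, p_dropout=0.1,
        resblock="1", resblock_kernel_sizes=[3, 7, 11],
        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
        upsample_rates=[8, 8, 2, 2], upsample_initial_channel=512,
        upsample_kernel_sizes=[16, 16, 4, 4])
    net_g.eval()
    x = torch.randint(0, 100, (1, 20))  # fake phoneme-id sequence
    x_lengths = torch.LongTensor([20])
    with torch.no_grad():
        audio, attn, y_mask, _ = net_g.infer(x, x_lengths, noise_scale=0.667,
                                             length_scale=1.0, noise_scale_w=0.8)
    print(audio.shape)                  # [1, 1, samples]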
--------------------------------------------------------------------------------
/modules.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | from torch import nn
4 | from torch.nn import functional as F
5 |
6 | from torch.nn import Conv1d
7 | from torch.nn.utils import weight_norm, remove_weight_norm
8 |
9 | import commons
10 | from commons import init_weights, get_padding
11 | from transforms import piecewise_rational_quadratic_transform
12 |
13 |
14 | LRELU_SLOPE = 0.1
15 |
16 |
17 | class LayerNorm(nn.Module):
18 | def __init__(self, channels, eps=1e-5):
19 | super().__init__()
20 | self.channels = channels
21 | self.eps = eps
22 |
23 | self.gamma = nn.Parameter(torch.ones(channels))
24 | self.beta = nn.Parameter(torch.zeros(channels))
25 |
26 | def forward(self, x):
27 | x = x.transpose(1, -1)
28 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
29 | return x.transpose(1, -1)
30 |
31 |
32 | class ConvReluNorm(nn.Module):
33 | def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
34 | super().__init__()
35 | self.in_channels = in_channels
36 | self.hidden_channels = hidden_channels
37 | self.out_channels = out_channels
38 | self.kernel_size = kernel_size
39 | self.n_layers = n_layers
40 | self.p_dropout = p_dropout
41 |         assert n_layers > 1, "Number of layers should be larger than 1."
42 |
43 | self.conv_layers = nn.ModuleList()
44 | self.norm_layers = nn.ModuleList()
45 | self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
46 | self.norm_layers.append(LayerNorm(hidden_channels))
47 | self.relu_drop = nn.Sequential(
48 | nn.ReLU(),
49 | nn.Dropout(p_dropout))
50 | for _ in range(n_layers-1):
51 | self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
52 | self.norm_layers.append(LayerNorm(hidden_channels))
53 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
54 | self.proj.weight.data.zero_()
55 | self.proj.bias.data.zero_()
56 |
57 | def forward(self, x, x_mask):
58 | x_org = x
59 | for i in range(self.n_layers):
60 | x = self.conv_layers[i](x * x_mask)
61 | x = self.norm_layers[i](x)
62 | x = self.relu_drop(x)
63 | x = x_org + self.proj(x)
64 | return x * x_mask
65 |
66 |
67 | class DDSConv(nn.Module):
68 | """
69 |     Dilated and Depth-Separable Convolution
70 | """
71 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
72 | super().__init__()
73 | self.channels = channels
74 | self.kernel_size = kernel_size
75 | self.n_layers = n_layers
76 | self.p_dropout = p_dropout
77 |
78 | self.drop = nn.Dropout(p_dropout)
79 | self.convs_sep = nn.ModuleList()
80 | self.convs_1x1 = nn.ModuleList()
81 | self.norms_1 = nn.ModuleList()
82 | self.norms_2 = nn.ModuleList()
83 | for i in range(n_layers):
84 | dilation = kernel_size ** i
85 | padding = (kernel_size * dilation - dilation) // 2
86 | self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
87 | groups=channels, dilation=dilation, padding=padding
88 | ))
89 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
90 | self.norms_1.append(LayerNorm(channels))
91 | self.norms_2.append(LayerNorm(channels))
92 |
93 | def forward(self, x, x_mask, g=None):
94 | if g is not None:
95 | x = x + g
96 | for i in range(self.n_layers):
97 | y = self.convs_sep[i](x * x_mask)
98 | y = self.norms_1[i](y)
99 | y = F.gelu(y)
100 | y = self.convs_1x1[i](y)
101 | y = self.norms_2[i](y)
102 | y = F.gelu(y)
103 | y = self.drop(y)
104 | x = x + y
105 | return x * x_mask
106 |
107 |
108 | class WN(torch.nn.Module):
109 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
110 | super(WN, self).__init__()
111 | assert(kernel_size % 2 == 1)
112 | self.hidden_channels =hidden_channels
113 |         self.kernel_size = kernel_size
114 | self.dilation_rate = dilation_rate
115 | self.n_layers = n_layers
116 | self.gin_channels = gin_channels
117 | self.p_dropout = p_dropout
118 |
119 | self.in_layers = torch.nn.ModuleList()
120 | self.res_skip_layers = torch.nn.ModuleList()
121 | self.drop = nn.Dropout(p_dropout)
122 |
123 | if gin_channels != 0:
124 | cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
125 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
126 |
127 | for i in range(n_layers):
128 | dilation = dilation_rate ** i
129 | padding = int((kernel_size * dilation - dilation) / 2)
130 | in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
131 | dilation=dilation, padding=padding)
132 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
133 | self.in_layers.append(in_layer)
134 |
135 | # last one is not necessary
136 | if i < n_layers - 1:
137 | res_skip_channels = 2 * hidden_channels
138 | else:
139 | res_skip_channels = hidden_channels
140 |
141 | res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
142 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
143 | self.res_skip_layers.append(res_skip_layer)
144 |
145 | def forward(self, x, x_mask, g=None, **kwargs):
146 | output = torch.zeros_like(x)
147 | n_channels_tensor = torch.IntTensor([self.hidden_channels])
148 |
149 | if g is not None:
150 | g = self.cond_layer(g)
151 |
152 | for i in range(self.n_layers):
153 | x_in = self.in_layers[i](x)
154 | if g is not None:
155 | cond_offset = i * 2 * self.hidden_channels
156 | g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:]
157 | else:
158 | g_l = torch.zeros_like(x_in)
159 |
160 | acts = commons.fused_add_tanh_sigmoid_multiply(
161 | x_in,
162 | g_l,
163 | n_channels_tensor)
164 | acts = self.drop(acts)
165 |
166 | res_skip_acts = self.res_skip_layers[i](acts)
167 | if i < self.n_layers - 1:
168 | res_acts = res_skip_acts[:,:self.hidden_channels,:]
169 | x = (x + res_acts) * x_mask
170 | output = output + res_skip_acts[:,self.hidden_channels:,:]
171 | else:
172 | output = output + res_skip_acts
173 | return output * x_mask
174 |
175 | def remove_weight_norm(self):
176 | if self.gin_channels != 0:
177 | torch.nn.utils.remove_weight_norm(self.cond_layer)
178 | for l in self.in_layers:
179 | torch.nn.utils.remove_weight_norm(l)
180 | for l in self.res_skip_layers:
181 | torch.nn.utils.remove_weight_norm(l)
182 |
183 |
184 | class ResBlock1(torch.nn.Module):
185 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
186 | super(ResBlock1, self).__init__()
187 | self.convs1 = nn.ModuleList([
188 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
189 | padding=get_padding(kernel_size, dilation[0]))),
190 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
191 | padding=get_padding(kernel_size, dilation[1]))),
192 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
193 | padding=get_padding(kernel_size, dilation[2])))
194 | ])
195 | self.convs1.apply(init_weights)
196 |
197 | self.convs2 = nn.ModuleList([
198 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
199 | padding=get_padding(kernel_size, 1))),
200 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
201 | padding=get_padding(kernel_size, 1))),
202 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
203 | padding=get_padding(kernel_size, 1)))
204 | ])
205 | self.convs2.apply(init_weights)
206 |
207 | def forward(self, x, x_mask=None):
208 | for c1, c2 in zip(self.convs1, self.convs2):
209 | xt = F.leaky_relu(x, LRELU_SLOPE)
210 | if x_mask is not None:
211 | xt = xt * x_mask
212 | xt = c1(xt)
213 | xt = F.leaky_relu(xt, LRELU_SLOPE)
214 | if x_mask is not None:
215 | xt = xt * x_mask
216 | xt = c2(xt)
217 | x = xt + x
218 | if x_mask is not None:
219 | x = x * x_mask
220 | return x
221 |
222 | def remove_weight_norm(self):
223 | for l in self.convs1:
224 | remove_weight_norm(l)
225 | for l in self.convs2:
226 | remove_weight_norm(l)
227 |
228 |
229 | class ResBlock2(torch.nn.Module):
230 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
231 | super(ResBlock2, self).__init__()
232 | self.convs = nn.ModuleList([
233 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
234 | padding=get_padding(kernel_size, dilation[0]))),
235 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
236 | padding=get_padding(kernel_size, dilation[1])))
237 | ])
238 | self.convs.apply(init_weights)
239 |
240 | def forward(self, x, x_mask=None):
241 | for c in self.convs:
242 | xt = F.leaky_relu(x, LRELU_SLOPE)
243 | if x_mask is not None:
244 | xt = xt * x_mask
245 | xt = c(xt)
246 | x = xt + x
247 | if x_mask is not None:
248 | x = x * x_mask
249 | return x
250 |
251 | def remove_weight_norm(self):
252 | for l in self.convs:
253 | remove_weight_norm(l)
254 |
255 |
256 | class Log(nn.Module):
257 | def forward(self, x, x_mask, reverse=False, **kwargs):
258 | if not reverse:
259 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
260 | logdet = torch.sum(-y, [1, 2])
261 | return y, logdet
262 | else:
263 | x = torch.exp(x) * x_mask
264 | return x
265 |
266 |
267 | class Flip(nn.Module):
268 | def forward(self, x, *args, reverse=False, **kwargs):
269 | x = torch.flip(x, [1])
270 | if not reverse:
271 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
272 | return x, logdet
273 | else:
274 | return x
275 |
276 |
277 | class ElementwiseAffine(nn.Module):
278 | def __init__(self, channels):
279 | super().__init__()
280 | self.channels = channels
281 | self.m = nn.Parameter(torch.zeros(channels,1))
282 | self.logs = nn.Parameter(torch.zeros(channels,1))
283 |
284 | def forward(self, x, x_mask, reverse=False, **kwargs):
285 | if not reverse:
286 | y = self.m + torch.exp(self.logs) * x
287 | y = y * x_mask
288 | logdet = torch.sum(self.logs * x_mask, [1,2])
289 | return y, logdet
290 | else:
291 | x = (x - self.m) * torch.exp(-self.logs) * x_mask
292 | return x
293 |
294 |
295 | class ResidualCouplingLayer(nn.Module):
296 | def __init__(self,
297 | channels,
298 | hidden_channels,
299 | kernel_size,
300 | dilation_rate,
301 | n_layers,
302 | p_dropout=0,
303 | gin_channels=0,
304 | mean_only=False):
305 | assert channels % 2 == 0, "channels should be divisible by 2"
306 | super().__init__()
307 | self.channels = channels
308 | self.hidden_channels = hidden_channels
309 | self.kernel_size = kernel_size
310 | self.dilation_rate = dilation_rate
311 | self.n_layers = n_layers
312 | self.half_channels = channels // 2
313 | self.mean_only = mean_only
314 |
315 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
316 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
317 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
318 | self.post.weight.data.zero_()
319 | self.post.bias.data.zero_()
320 |
321 | def forward(self, x, x_mask, g=None, reverse=False):
322 | x0, x1 = torch.split(x, [self.half_channels]*2, 1)
323 | h = self.pre(x0) * x_mask
324 | h = self.enc(h, x_mask, g=g)
325 | stats = self.post(h) * x_mask
326 | if not self.mean_only:
327 | m, logs = torch.split(stats, [self.half_channels]*2, 1)
328 | else:
329 | m = stats
330 | logs = torch.zeros_like(m)
331 |
332 | if not reverse:
333 | x1 = m + x1 * torch.exp(logs) * x_mask
334 | x = torch.cat([x0, x1], 1)
335 | logdet = torch.sum(logs, [1,2])
336 | return x, logdet
337 | else:
338 | x1 = (x1 - m) * torch.exp(-logs) * x_mask
339 | x = torch.cat([x0, x1], 1)
340 | return x
341 |
342 |
343 | class ConvFlow(nn.Module):
344 | def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
345 | super().__init__()
346 | self.in_channels = in_channels
347 | self.filter_channels = filter_channels
348 | self.kernel_size = kernel_size
349 | self.n_layers = n_layers
350 | self.num_bins = num_bins
351 | self.tail_bound = tail_bound
352 | self.half_channels = in_channels // 2
353 |
354 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
355 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
356 | self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
357 | self.proj.weight.data.zero_()
358 | self.proj.bias.data.zero_()
359 |
360 | def forward(self, x, x_mask, g=None, reverse=False):
361 | x0, x1 = torch.split(x, [self.half_channels]*2, 1)
362 | h = self.pre(x0)
363 | h = self.convs(h, x_mask, g=g)
364 | h = self.proj(h) * x_mask
365 |
366 | b, c, t = x0.shape
367 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
368 |
369 | unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
370 | unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels)
371 | unnormalized_derivatives = h[..., 2 * self.num_bins:]
372 |
373 | x1, logabsdet = piecewise_rational_quadratic_transform(x1,
374 | unnormalized_widths,
375 | unnormalized_heights,
376 | unnormalized_derivatives,
377 | inverse=reverse,
378 | tails='linear',
379 | tail_bound=self.tail_bound
380 | )
381 |
382 | x = torch.cat([x0, x1], 1) * x_mask
383 | logdet = torch.sum(logabsdet * x_mask, [1,2])
384 | if not reverse:
385 | return x, logdet
386 | else:
387 | return x
388 |
--------------------------------------------------------------------------------
/monotonic_align/__init__.py:
--------------------------------------------------------------------------------
1 | from numpy import zeros, int32, float32
2 | from torch import from_numpy
3 |
4 | from .core import maximum_path_jit
5 |
6 | def maximum_path(neg_cent, mask):
7 | """ numba optimized version.
8 | neg_cent: [b, t_t, t_s]
9 | mask: [b, t_t, t_s]
10 | """
11 | device = neg_cent.device
12 | dtype = neg_cent.dtype
13 | neg_cent = neg_cent.data.cpu().numpy().astype(float32)
14 | path = zeros(neg_cent.shape, dtype=int32)
15 |
16 | t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(int32)
17 | t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(int32)
18 | maximum_path_jit(path, neg_cent, t_t_max, t_s_max)
19 | return from_numpy(path).to(device=device, dtype=dtype)
20 |
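# Editor's note: a minimal usage sketch (not part of the original file), using random
# scores and an all-ones mask as placeholders.
if __name__ == "__main__":
    import torch
    neg_cent = torch.randn(2, 100, 40)   # [b, t_t, t_s] alignment scores
    mask = torch.ones(2, 100, 40)        # every position is valid here
    path = maximum_path(neg_cent, mask)  # hard monotonic alignment, same shape
    print(path.shape, int(path.sum()))   # sum equals b * t_t: one text index per output frame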
--------------------------------------------------------------------------------
/monotonic_align/core.py:
--------------------------------------------------------------------------------
1 | import numba
2 |
3 |
4 | @numba.jit(numba.void(numba.int32[:,:,::1], numba.float32[:,:,::1], numba.int32[::1], numba.int32[::1]), nopython=True, nogil=True)
5 | def maximum_path_jit(paths, values, t_ys, t_xs):
6 | b = paths.shape[0]
7 | max_neg_val=-1e9
8 | for i in range(int(b)):
9 | path = paths[i]
10 | value = values[i]
11 | t_y = t_ys[i]
12 | t_x = t_xs[i]
13 |
14 | v_prev = v_cur = 0.0
15 | index = t_x - 1
16 |
17 | for y in range(t_y):
18 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
19 | if x == y:
20 | v_cur = max_neg_val
21 | else:
22 | v_cur = value[y-1, x]
23 | if x == 0:
24 | if y == 0:
25 | v_prev = 0.
26 | else:
27 | v_prev = max_neg_val
28 | else:
29 | v_prev = value[y-1, x-1]
30 | value[y, x] += max(v_prev, v_cur)
31 |
32 | for y in range(t_y - 1, -1, -1):
33 | path[y, index] = 1
34 | if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]):
35 | index = index - 1
36 |
--------------------------------------------------------------------------------
/pictures/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avilliai/MoeGoe-Yirimirai/90273a09dc6a93947506e4d7782d10d8f86b5b9c/pictures/__init__.py
--------------------------------------------------------------------------------
/pictures/agakaUa$aNaGaka.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avilliai/MoeGoe-Yirimirai/90273a09dc6a93947506e4d7782d10d8f86b5b9c/pictures/agakaUa$aNaGaka.jpg
--------------------------------------------------------------------------------
/pictures/apauaraga5aqafa.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avilliai/MoeGoe-Yirimirai/90273a09dc6a93947506e4d7782d10d8f86b5b9c/pictures/apauaraga5aqafa.jpg
--------------------------------------------------------------------------------
/pictures/avabaaa%aZaxa6a.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avilliai/MoeGoe-Yirimirai/90273a09dc6a93947506e4d7782d10d8f86b5b9c/pictures/avabaaa%aZaxa6a.jpg
--------------------------------------------------------------------------------
/pictures/awa6aRakaka3a7a.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avilliai/MoeGoe-Yirimirai/90273a09dc6a93947506e4d7782d10d8f86b5b9c/pictures/awa6aRakaka3a7a.jpg
--------------------------------------------------------------------------------
/plugins/RandomStr/RandomStr.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 |
4 | def random_str(random_length=6):
5 | """
6 |     Generate a random string (used as a verification code / random file name)
7 |     :param random_length: length of the random part, 6 by default
8 |     :return: the random string
9 | """
10 | string = 'a'
11 | chars = 'AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz0123456789@$#_%'
12 | length = len(chars) - 1
13 | # random = Random()
14 |     # on each loop iteration pick one character at random and append it (interleaved with 'a')
15 |     for i in range(random_length):
16 | string += ((chars[random.randint(0, length)])+'a')
17 | return string
18 |
19 |
20 | if __name__ == '__main__':
21 | print(random_str())
22 | print(random_str(10))
23 |
--------------------------------------------------------------------------------
/plugins/RandomStr/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avilliai/MoeGoe-Yirimirai/90273a09dc6a93947506e4d7782d10d8f86b5b9c/plugins/RandomStr/__init__.py
--------------------------------------------------------------------------------
/plugins/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avilliai/MoeGoe-Yirimirai/90273a09dc6a93947506e4d7782d10d8f86b5b9c/plugins/__init__.py
--------------------------------------------------------------------------------
/plugins/picGet.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import requests
4 |
5 | from plugins.RandomStr.RandomStr import random_str
6 |
7 | url = 'https://iw233.cn/api.php?sort=yin'  # API endpoint
8 | headers ={
9 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
10 | }
11 | def pic():
12 | r = requests.get(url, headers=headers)
13 |     # download the picture
14 | ranpath=''
15 | while True:
16 | ranpath = random_str()
17 | exist = os.path.isfile("pictures\\" + ranpath + ".jpg")
18 | direxist =os.path.isdir("pictures")
19 | if direxist:
20 | if exist:
21 | continue
22 | else:
23 | break
24 | else:
25 | os.mkdir("pictures")
26 | continue
27 |
28 | with open("pictures\\" + ranpath + ".jpg", mode="wb") as f:
29 |         f.write(r.content)  # write the image bytes to the file
30 | return "pictures\\" + ranpath + ".jpg"
31 | if __name__ == '__main__':
32 | s=input("输入1开始执行")
33 | i=0
34 | if s=="1":
35 | while i<=10:
36 | pic()
37 | i+=1
--------------------------------------------------------------------------------
/plugins/voicePart.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import json
3 | import os
4 | import datetime
5 | import random
6 | import time
7 | import sys
8 |
9 | from mirai import Image, Voice
10 | from mirai import Mirai, WebSocketAdapter, FriendMessage, GroupMessage, At, Plain
11 |
12 | from MoeGoe import voiceGenerate
13 | from plugins.RandomStr.RandomStr import random_str
14 | from trans import translate
15 | def main(bot):
16 | time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
17 | print(time + '| voiceGenerate module loaded successfully 已加载--- 语音生成 ---模块')
18 |     # Chinese speech generation 1
19 | global voiceSender
20 | voiceSender = 0
21 | global voiceTrans
22 | voiceTrans = 0
23 | global modelSelect
24 | modelSelect=0
25 | global yuukaSaid
26 | yuukaSaid=0
27 |
28 | @bot.on(GroupMessage)
29 | async def handle_group_message(event: GroupMessage):
30 | if str(event.message_chain).startswith('中文'):
31 | modelList = ['0', '1', '2', '3']
32 | if len(str(event.message_chain)) < 60:
33 | if '#' in str(event.message_chain):
34 | textt = str(event.message_chain).split("#")
35 | if textt[1] in modelList:
36 | model = int(textt[1])
37 | tex = '[ZH]' + ((textt[0])[2:]) + '[ZH]'
38 | else:
39 | model = 0
40 | tex = '[ZH]' + (str(event.message_chain)[2:]) + '[ZH]'
41 | else:
42 | tex = '[ZH]' + (str(event.message_chain)[2:]) + '[ZH]'
43 | model = 0
44 | ranpath = random_str()
45 | out ='plugins\\voices\\' + ranpath + '.wav'
46 | time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
47 | print(time + '| 中文语音生成-----> ' +tex)
48 | voiceGenerate(tex, out, model)
49 | await bot.send(event, Voice(path=out))
50 | else:
51 | ranpath = random_str()
52 | out ='plugins\\voices\\' + ranpath + '.wav'
53 |                 tex = '[ZH]太长了哦......[ZH]'
54 | time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
55 | print(time + '| 中文语音生成-----> ' + tex)
56 | voiceGenerate(tex, out)
57 | await bot.send(event, Voice(path=out))
58 |
59 |     # Japanese speech generation
60 | @bot.on(GroupMessage)
61 | async def handle_group_message(event: GroupMessage):
62 | if str(event.message_chain).startswith('说'):
63 |             global modelSelect, yuukaSaid  # yuukaSaid is also assigned below
64 | modelList = ['0', '1', '2', '3']
65 | if len(str(event.message_chain)) < 70:
66 | if '#' in str(event.message_chain):
67 | textt = str(event.message_chain).split("#")
68 | if textt[1] in modelList:
69 | model = int(textt[1])
70 | tex = '[JA]' + translate((textt[0])[1:]) + '[JA]'
71 | else:
72 | model = 0
73 | tex = '[JA]' + translate(str(event.message_chain)[1:]) + '[JA]'
74 | else:
75 | tex = '[JA]' + translate(str(event.message_chain)[1:]) + '[JA]'
76 | model = 0
77 | ranpath = random_str()
78 | out ='plugins\\voices\\' + ranpath + '.wav'
79 | time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
80 | print(time + '| 日语语音生成-----> ' + tex)
81 | if modelSelect==1:
82 | tex=tex.replace('[JA]','')
83 | else:
84 | pass
85 | voiceGenerate(tex, out, model,modelSelect)
86 |
87 | modelSelect = 0
88 | await bot.send(event, Voice(path=out))
89 | else:
90 | ranpath = random_str()
91 | out = 'plugins\\voices\\' + ranpath + '.wav'
92 | tex = '[JA]' + translate('不行,太长了哦.....') + '[JA]'
93 | time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
94 | print(time + '| 日语语音生成-----> ' + tex)
95 | if modelSelect==1:
96 | tex=tex.replace('[JA]','')
97 | yuukaSaid+=1
98 | else:
99 | pass
100 | voiceGenerate(tex, out,0,modelSelect)
101 | if yuukaSaid==3:
102 | modelSelect = 0
103 | else:
104 | pass
105 | await bot.send(event, Voice(path=out))
106 | @bot.on(GroupMessage)
107 | async def yuukaVoiceModelSelecter(event: GroupMessage):
108 | if str(event.message_chain)=='modelSet=1':
109 | global modelSelect
110 | modelSelect=1
111 | await bot.send(event,'已切换至ユウカ(优香)语音模型\n接下来三次语音生成任务默认使用优香语音模型')
112 |
113 | @bot.on(GroupMessage)
114 | async def yuukaVoiceModelSelecter(event: GroupMessage):
115 | if str(event.message_chain).startswith('优香说'):
116 | tex=str(event.message_chain)[3:]
117 | tex=translate(tex)
118 | ranpath = random_str()
119 | out = 'plugins\\voices\\' + ranpath + '.wav'
120 | voiceGenerate(tex, out, 0, 1)
121 | await bot.send(event, Voice(path=out))
122 |
123 | if str(event.message_chain).startswith('邮箱说'):
124 | tex=str(event.message_chain)[3:]
125 | ranpath = random_str()
126 | out = 'plugins\\voices\\' + ranpath + '.wav'
127 | voiceGenerate(tex, out, 0, 1)
128 | await bot.send(event, Voice(path=out))
129 |
130 |
131 |     # voice conversion (currently disabled)
132 | '''@bot.on(GroupMessage)
133 | async def voiceTan(event: GroupMessage):
134 | if str(event.message_chain) == '语音转换':
135 | global voiceSender
136 | voiceSender = event.sender.id
137 | global voiceTrans
138 | voiceTrans = 2
139 | await bot.send(event, '请发送语音')
140 |
141 |     # voice-conversion attachment handler
142 | @bot.on(GroupMessage)
143 | async def voicetransa(event: GroupMessage):
144 | global voiceSender
145 | global voiceTrans
146 | if event.message_chain.count(Voice):
147 | if voiceTrans == 2:
148 | if voiceSender == event.sender.id:
149 | s = event.message_chain.get(Voice)
150 | await Voice.download(s[0], 'plugins/voices/sing/rest.silk')
151 | silkcoder.decode("plugins/voices/sing/rest.silk", "plugins/voices/sing/rest.wav",
152 | ffmpeg_para=["-ar", "44100"])
153 | print('over')
154 | paths = voice_conversion("plugins/voices/sing/rest.wav")
155 | await bot.send(event, Voice(path=paths))
156 | voiceSender = 0
157 | voiceTrans = 0'''
158 |
159 |     # Japanese generation in friend chats; no longer usable after a Tencent client update
160 | '''@bot.on(FriendMessage)
161 | async def handle_group_message(event: FriendMessage):
162 | if str(event.message_chain).startswith('说'):
163 | modelList = ['0', '1', '2', '3']
164 | if len(str(event.message_chain)) < 280:
165 | if '#' in str(event.message_chain):
166 | textt = str(event.message_chain).split("#")
167 | if textt[1] in modelList:
168 | model = int(textt[1])
169 | tex = '[JA]' + translate((textt[0])[1:]) + '[JA]'
170 | else:
171 | model = 0
172 | tex = '[JA]' + translate(str(event.message_chain)[1:]) + '[JA]'
173 | else:
174 | tex = '[JA]' + translate(str(event.message_chain)[1:]) + '[JA]'
175 | model = 0
176 | ranpath = random_str()
177 | out ='PythonPlugins\\plugins\\voices\\' + ranpath + '.wav'
178 | voiceGenerate(tex, out, model)
179 | await bot.send(event, Voice(path=out))
180 | else:
181 | ranpath = random_str()
182 | out = 'PythonPlugins\\plugins\\voices\\' + ranpath + '.wav'
183 | tex = '[JA]' + translate('不行,太长了哦.....') + '[JA]'
184 | voiceGenerate(tex, out)
185 | await bot.send(event, Voice(path=out))'''
186 |
187 |
--------------------------------------------------------------------------------
/plugins/voices/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avilliai/MoeGoe-Yirimirai/90273a09dc6a93947506e4d7782d10d8f86b5b9c/plugins/voices/__init__.py
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiofiles==0.7.0
2 | altgraph==0.17.3
3 | anyio==3.6.2
4 | appdirs==1.4.4
5 | audioread==3.0.0
6 | backports.functools-lru-cache==1.6.4
7 | certifi==2022.9.24
8 | cffi==1.15.1
9 | charset-normalizer==2.1.1
10 | click==8.1.3
11 | cn2an==0.5.19
12 | colorama==0.4.6
13 | Cython==0.29.32
14 | decorator==5.1.1
15 | eng-to-ipa==0.0.2
16 | Flask==2.2.2
17 | future==0.18.2
18 | h11==0.14.0
19 | httpcore==0.16.2
20 | httpx==0.23.1
21 | idna==3.4
22 | importlib-metadata==5.1.0
23 | indic-transliteration==2.3.40
24 | inflect==6.0.2
25 | itsdangerous==2.1.2
26 | jamo==0.4.1
27 | jieba==0.42.1
28 | Jinja2==3.1.2
29 | joblib==1.2.0
30 | ko-pron==1.3
31 | librosa==0.9.2
32 | llvmlite==0.39.1
33 | MarkupSafe==2.1.1
34 | num-thai==0.0.5
35 | numba==0.56.4
36 | numpy==1.22.0
37 | OpenCC==1.1.1
38 | openjtalk==0.3.0.dev2
39 | packaging==21.3
40 | pefile==2022.5.30
41 | pooch==1.6.0
42 | proces==0.1.3
43 | protobuf==4.21.9
44 | pycparser==2.21
45 | pydantic==1.10.2
46 | pyinstaller==5.6.2
47 | pyinstaller-hooks-contrib==2022.13
48 | pyparsing==3.0.9
49 | pypinyin==0.47.1
50 | pywin32-ctypes==0.2.0
51 | regex==2022.10.31
52 | requests==2.28.1
53 | resampy==0.4.2
54 | rfc3986==1.5.0
55 | roman==3.3
56 | scikit-learn==1.1.3
57 | scipy==1.9.3
58 | six==1.16.0
59 | sniffio==1.3.0
60 | soundfile==0.11.0
61 | starlette==0.22.0
62 | threadpoolctl==3.1.0
63 | toml==0.10.2
64 | torch==1.13.0
65 | tqdm==4.64.1
66 | typer==0.7.0
67 | typing_extensions==4.4.0
68 | Unidecode==1.3.6
69 | urllib3==1.26.13
70 | websockets==10.4
71 | Werkzeug==2.2.2
72 | yiri-mirai==0.2.7
73 | zipp==3.11.0
74 |
--------------------------------------------------------------------------------
/text/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017 Keith Ito
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 |
--------------------------------------------------------------------------------
/text/__init__.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 | from text import cleaners
3 |
4 |
5 | def text_to_sequence(text, symbols, cleaner_names):
6 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
7 | Args:
8 | text: string to convert to a sequence
9 | cleaner_names: names of the cleaner functions to run the text through
10 | Returns:
11 | List of integers corresponding to the symbols in the text
12 | '''
13 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
14 |
15 | sequence = []
16 |
17 | clean_text = _clean_text(text, cleaner_names)
18 | for symbol in clean_text:
19 | if symbol not in _symbol_to_id.keys():
20 | continue
21 | symbol_id = _symbol_to_id[symbol]
22 | sequence += [symbol_id]
23 | return sequence
24 |
25 |
26 | def _clean_text(text, cleaner_names):
27 | for name in cleaner_names:
28 | cleaner = getattr(cleaners, name)
29 | if not cleaner:
30 | raise Exception('Unknown cleaner: %s' % name)
31 | text = cleaner(text)
32 | return text
33 |
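A minimal usage sketch: the symbols argument is the model's symbol table (in this repository it is read from the "symbols" field of voiceModel/config.json), and any character the cleaner produces that is not in that table is silently skipped rather than raising an error. The short symbol list below is illustrative only.

from text import text_to_sequence

# Illustrative symbol table; the real one comes from the model config.
symbols = ['_', ',', '.', '!', '?', ' ', 'a', 'e', 'i', 'k', 'n', 'o', 'w', 'ʦ', '↑', '↓']

seq = text_to_sequence('こんにちは。', symbols, ['japanese_cleaners'])
print(seq)  # a list of integer IDs, one per cleaned symbol that appears in `symbols`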
--------------------------------------------------------------------------------
/text/cantonese.py:
--------------------------------------------------------------------------------
1 | import re
2 | import cn2an
3 | import opencc
4 |
5 |
6 | converter = opencc.OpenCC('jyutjyu')
7 |
8 | # List of (Latin alphabet, ipa) pairs:
9 | _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
10 | ('A', 'ei˥'),
11 | ('B', 'biː˥'),
12 | ('C', 'siː˥'),
13 | ('D', 'tiː˥'),
14 | ('E', 'iː˥'),
15 | ('F', 'e˥fuː˨˩'),
16 | ('G', 'tsiː˥'),
17 | ('H', 'ɪk̚˥tsʰyː˨˩'),
18 | ('I', 'ɐi˥'),
19 | ('J', 'tsei˥'),
20 | ('K', 'kʰei˥'),
21 | ('L', 'e˥llou˨˩'),
22 | ('M', 'ɛːm˥'),
23 | ('N', 'ɛːn˥'),
24 | ('O', 'ou˥'),
25 | ('P', 'pʰiː˥'),
26 | ('Q', 'kʰiːu˥'),
27 | ('R', 'aː˥lou˨˩'),
28 | ('S', 'ɛː˥siː˨˩'),
29 | ('T', 'tʰiː˥'),
30 | ('U', 'juː˥'),
31 | ('V', 'wiː˥'),
32 | ('W', 'tʊk̚˥piː˥juː˥'),
33 | ('X', 'ɪk̚˥siː˨˩'),
34 | ('Y', 'waːi˥'),
35 | ('Z', 'iː˨sɛːt̚˥')
36 | ]]
37 |
38 |
39 | def number_to_cantonese(text):
40 | return re.sub(r'\d+(?:\.?\d+)?', lambda x: cn2an.an2cn(x.group()), text)
41 |
42 |
43 | def latin_to_ipa(text):
44 | for regex, replacement in _latin_to_ipa:
45 | text = re.sub(regex, replacement, text)
46 | return text
47 |
48 |
49 | def cantonese_to_ipa(text):
50 | text = number_to_cantonese(text.upper())
51 | text = converter.convert(text).replace('-','').replace('$',' ')
52 | text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text)
53 | text = re.sub(r'[、;:]', ',', text)
54 | text = re.sub(r'\s*,\s*', ', ', text)
55 | text = re.sub(r'\s*。\s*', '. ', text)
56 | text = re.sub(r'\s*？\s*', '? ', text)
57 | text = re.sub(r'\s*!\s*', '! ', text)
58 | text = re.sub(r'\s*$', '', text)
59 | return text
60 |
--------------------------------------------------------------------------------
/text/cleaners.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 |
4 | def japanese_cleaners(text):
5 | from text.japanese import japanese_to_romaji_with_accent
6 | text = japanese_to_romaji_with_accent(text)
7 | text = re.sub(r'([A-Za-z])$', r'\1.', text)
8 | return text
9 |
10 |
11 | def japanese_cleaners2(text):
12 | return japanese_cleaners(text).replace('ts', 'ʦ').replace('...', '…')
13 |
14 |
15 | def korean_cleaners(text):
16 | '''Pipeline for Korean text'''
17 | from text.korean import latin_to_hangul, number_to_hangul, divide_hangul
18 | text = latin_to_hangul(text)
19 | text = number_to_hangul(text)
20 | text = divide_hangul(text)
21 | text = re.sub(r'([\u3131-\u3163])$', r'\1.', text)
22 | return text
23 |
24 |
25 | def chinese_cleaners(text):
26 | '''Pipeline for Chinese text'''
27 | from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo
28 | text = number_to_chinese(text)
29 | text = chinese_to_bopomofo(text)
30 | text = latin_to_bopomofo(text)
31 | text = re.sub(r'([ˉˊˇˋ˙])$', r'\1。', text)
32 | return text
33 |
34 |
35 | def zh_ja_mixture_cleaners(text):
36 | from text.mandarin import chinese_to_romaji
37 | from text.japanese import japanese_to_romaji_with_accent
38 | text = re.sub(r'\[ZH\](.*?)\[ZH\]',
39 | lambda x: chinese_to_romaji(x.group(1))+' ', text)
40 | text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_romaji_with_accent(
41 | x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')+' ', text)
42 | text = re.sub(r'\s+$', '', text)
43 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
44 | return text
45 |
46 |
47 | def sanskrit_cleaners(text):
48 | text = text.replace('॥', '।').replace('ॐ', 'ओम्')
49 | if text[-1] != '।':
50 | text += ' ।'
51 | return text
52 |
53 |
54 | def cjks_cleaners(text):
55 | from text.mandarin import chinese_to_lazy_ipa
56 | from text.japanese import japanese_to_ipa
57 | from text.korean import korean_to_lazy_ipa
58 | from text.sanskrit import devanagari_to_ipa
59 | from text.english import english_to_lazy_ipa
60 | text = re.sub(r'\[ZH\](.*?)\[ZH\]',
61 | lambda x: chinese_to_lazy_ipa(x.group(1))+' ', text)
62 | text = re.sub(r'\[JA\](.*?)\[JA\]',
63 | lambda x: japanese_to_ipa(x.group(1))+' ', text)
64 | text = re.sub(r'\[KO\](.*?)\[KO\]',
65 | lambda x: korean_to_lazy_ipa(x.group(1))+' ', text)
66 | text = re.sub(r'\[SA\](.*?)\[SA\]',
67 | lambda x: devanagari_to_ipa(x.group(1))+' ', text)
68 | text = re.sub(r'\[EN\](.*?)\[EN\]',
69 | lambda x: english_to_lazy_ipa(x.group(1))+' ', text)
70 | text = re.sub(r'\s+$', '', text)
71 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
72 | return text
73 |
74 |
75 | def cjke_cleaners(text):
76 | from text.mandarin import chinese_to_lazy_ipa
77 | from text.japanese import japanese_to_ipa
78 | from text.korean import korean_to_ipa
79 | from text.english import english_to_ipa2
80 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace(
81 | 'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')+' ', text)
82 | text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace(
83 | 'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')+' ', text)
84 | text = re.sub(r'\[KO\](.*?)\[KO\]',
85 | lambda x: korean_to_ipa(x.group(1))+' ', text)
86 | text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace(
87 | 'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')+' ', text)
88 | text = re.sub(r'\s+$', '', text)
89 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
90 | return text
91 |
92 |
93 | def cjke_cleaners2(text):
94 | from text.mandarin import chinese_to_ipa
95 | from text.japanese import japanese_to_ipa2
96 | from text.korean import korean_to_ipa
97 | from text.english import english_to_ipa2
98 | text = re.sub(r'\[ZH\](.*?)\[ZH\]',
99 | lambda x: chinese_to_ipa(x.group(1))+' ', text)
100 | text = re.sub(r'\[JA\](.*?)\[JA\]',
101 | lambda x: japanese_to_ipa2(x.group(1))+' ', text)
102 | text = re.sub(r'\[KO\](.*?)\[KO\]',
103 | lambda x: korean_to_ipa(x.group(1))+' ', text)
104 | text = re.sub(r'\[EN\](.*?)\[EN\]',
105 | lambda x: english_to_ipa2(x.group(1))+' ', text)
106 | text = re.sub(r'\s+$', '', text)
107 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
108 | return text
109 |
110 |
111 | def thai_cleaners(text):
112 | from text.thai import num_to_thai, latin_to_thai
113 | text = num_to_thai(text)
114 | text = latin_to_thai(text)
115 | return text
116 |
117 |
118 | def shanghainese_cleaners(text):
119 | from text.shanghainese import shanghainese_to_ipa
120 | text = shanghainese_to_ipa(text)
121 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
122 | return text
123 |
124 |
125 | def chinese_dialect_cleaners(text):
126 | from text.mandarin import chinese_to_ipa2
127 | from text.japanese import japanese_to_ipa3
128 | from text.shanghainese import shanghainese_to_ipa
129 | from text.cantonese import cantonese_to_ipa
130 | from text.english import english_to_lazy_ipa2
131 | from text.ngu_dialect import ngu_dialect_to_ipa
132 | text = re.sub(r'\[ZH\](.*?)\[ZH\]',
133 | lambda x: chinese_to_ipa2(x.group(1))+' ', text)
134 | text = re.sub(r'\[JA\](.*?)\[JA\]',
135 | lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text)
136 | text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5',
137 | '˧˧˦').replace('6', '˩˩˧').replace('7', '˥').replace('8', '˩˨').replace('ᴀ', 'ɐ').replace('ᴇ', 'e')+' ', text)
138 | text = re.sub(r'\[GD\](.*?)\[GD\]',
139 | lambda x: cantonese_to_ipa(x.group(1))+' ', text)
140 | text = re.sub(r'\[EN\](.*?)\[EN\]',
141 | lambda x: english_to_lazy_ipa2(x.group(1))+' ', text)
142 | text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', lambda x: ngu_dialect_to_ipa(x.group(2), x.group(
143 | 1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ')+' ', text)
144 | text = re.sub(r'\s+$', '', text)
145 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
146 | return text
147 |
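The multilingual cleaners above all rely on the same convention: each language segment is wrapped in paired tags such as [ZH]...[ZH], [JA]...[JA], [KO]...[KO] or [EN]...[EN], which is exactly how voicePart.py builds its input ('[JA]' + translated text + '[JA]'). A small illustrative call follows; the output is described only loosely, since it depends on the installed pypinyin/pyopenjtalk versions.

from text.cleaners import zh_ja_mixture_cleaners

mixed = '[ZH]你好[ZH][JA]こんにちは[JA]'
print(zh_ja_mixture_cleaners(mixed))
# The [ZH] segment becomes tone-annotated romaji, the [JA] segment becomes
# accent-annotated romaji; a final '.' is appended if the result does not
# already end in punctuation.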
--------------------------------------------------------------------------------
/text/english.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 |
3 | '''
4 | Cleaners are transformations that run over the input text at both training and eval time.
5 |
6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use:
8 | 1. "english_cleaners" for English text
9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode)
11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
12 | the symbols in symbols.py to match your data).
13 | '''
14 |
15 |
16 | # Regular expression matching whitespace:
17 |
18 |
19 | import re
20 | import inflect
21 | from unidecode import unidecode
22 | import eng_to_ipa as ipa
23 | _inflect = inflect.engine()
24 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
25 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
26 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
27 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
28 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
29 | _number_re = re.compile(r'[0-9]+')
30 |
31 | # List of (regular expression, replacement) pairs for abbreviations:
32 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
33 | ('mrs', 'misess'),
34 | ('mr', 'mister'),
35 | ('dr', 'doctor'),
36 | ('st', 'saint'),
37 | ('co', 'company'),
38 | ('jr', 'junior'),
39 | ('maj', 'major'),
40 | ('gen', 'general'),
41 | ('drs', 'doctors'),
42 | ('rev', 'reverend'),
43 | ('lt', 'lieutenant'),
44 | ('hon', 'honorable'),
45 | ('sgt', 'sergeant'),
46 | ('capt', 'captain'),
47 | ('esq', 'esquire'),
48 | ('ltd', 'limited'),
49 | ('col', 'colonel'),
50 | ('ft', 'fort'),
51 | ]]
52 |
53 |
54 | # List of (ipa, lazy ipa) pairs:
55 | _lazy_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
56 | ('r', 'ɹ'),
57 | ('æ', 'e'),
58 | ('ɑ', 'a'),
59 | ('ɔ', 'o'),
60 | ('ð', 'z'),
61 | ('θ', 's'),
62 | ('ɛ', 'e'),
63 | ('ɪ', 'i'),
64 | ('ʊ', 'u'),
65 | ('ʒ', 'ʥ'),
66 | ('ʤ', 'ʥ'),
67 | ('ˈ', '↓'),
68 | ]]
69 |
70 | # List of (ipa, lazy ipa2) pairs:
71 | _lazy_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
72 | ('r', 'ɹ'),
73 | ('ð', 'z'),
74 | ('θ', 's'),
75 | ('ʒ', 'ʑ'),
76 | ('ʤ', 'dʑ'),
77 | ('ˈ', '↓'),
78 | ]]
79 |
80 | # List of (ipa, ipa2) pairs
81 | _ipa_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
82 | ('r', 'ɹ'),
83 | ('ʤ', 'dʒ'),
84 | ('ʧ', 'tʃ')
85 | ]]
86 |
87 |
88 | def expand_abbreviations(text):
89 | for regex, replacement in _abbreviations:
90 | text = re.sub(regex, replacement, text)
91 | return text
92 |
93 |
94 | def collapse_whitespace(text):
95 | return re.sub(r'\s+', ' ', text)
96 |
97 |
98 | def _remove_commas(m):
99 | return m.group(1).replace(',', '')
100 |
101 |
102 | def _expand_decimal_point(m):
103 | return m.group(1).replace('.', ' point ')
104 |
105 |
106 | def _expand_dollars(m):
107 | match = m.group(1)
108 | parts = match.split('.')
109 | if len(parts) > 2:
110 | return match + ' dollars' # Unexpected format
111 | dollars = int(parts[0]) if parts[0] else 0
112 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
113 | if dollars and cents:
114 | dollar_unit = 'dollar' if dollars == 1 else 'dollars'
115 | cent_unit = 'cent' if cents == 1 else 'cents'
116 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
117 | elif dollars:
118 | dollar_unit = 'dollar' if dollars == 1 else 'dollars'
119 | return '%s %s' % (dollars, dollar_unit)
120 | elif cents:
121 | cent_unit = 'cent' if cents == 1 else 'cents'
122 | return '%s %s' % (cents, cent_unit)
123 | else:
124 | return 'zero dollars'
125 |
126 |
127 | def _expand_ordinal(m):
128 | return _inflect.number_to_words(m.group(0))
129 |
130 |
131 | def _expand_number(m):
132 | num = int(m.group(0))
133 | if num > 1000 and num < 3000:
134 | if num == 2000:
135 | return 'two thousand'
136 | elif num > 2000 and num < 2010:
137 | return 'two thousand ' + _inflect.number_to_words(num % 100)
138 | elif num % 100 == 0:
139 | return _inflect.number_to_words(num // 100) + ' hundred'
140 | else:
141 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
142 | else:
143 | return _inflect.number_to_words(num, andword='')
144 |
145 |
146 | def normalize_numbers(text):
147 | text = re.sub(_comma_number_re, _remove_commas, text)
148 | text = re.sub(_pounds_re, r'\1 pounds', text)
149 | text = re.sub(_dollars_re, _expand_dollars, text)
150 | text = re.sub(_decimal_number_re, _expand_decimal_point, text)
151 | text = re.sub(_ordinal_re, _expand_ordinal, text)
152 | text = re.sub(_number_re, _expand_number, text)
153 | return text
154 |
155 |
156 | def mark_dark_l(text):
157 | return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ'+x.group(1), text)
158 |
159 |
160 | def english_to_ipa(text):
161 | text = unidecode(text).lower()
162 | text = expand_abbreviations(text)
163 | text = normalize_numbers(text)
164 | phonemes = ipa.convert(text)
165 | phonemes = collapse_whitespace(phonemes)
166 | return phonemes
167 |
168 |
169 | def english_to_lazy_ipa(text):
170 | text = english_to_ipa(text)
171 | for regex, replacement in _lazy_ipa:
172 | text = re.sub(regex, replacement, text)
173 | return text
174 |
175 |
176 | def english_to_ipa2(text):
177 | text = english_to_ipa(text)
178 | text = mark_dark_l(text)
179 | for regex, replacement in _ipa_to_ipa2:
180 | text = re.sub(regex, replacement, text)
181 | return text.replace('...', '…')
182 |
183 |
184 | def english_to_lazy_ipa2(text):
185 | text = english_to_ipa(text)
186 | for regex, replacement in _lazy_ipa2:
187 | text = re.sub(regex, replacement, text)
188 | return text
189 |
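A quick sketch of the normalization helpers: the exact IPA string returned by english_to_ipa depends on the eng_to_ipa dictionary, so only the number expansion is shown with a concrete expected result.

from text.english import normalize_numbers, english_to_ipa

print(normalize_numbers('paid $2,500 on March 3rd'))
# -> 'paid twenty-five hundred dollars on March third'
print(english_to_ipa('Hello world'))  # lower-cased, abbreviations expanded, then converted to IPA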
--------------------------------------------------------------------------------
/text/japanese.py:
--------------------------------------------------------------------------------
1 | import re
2 | from unidecode import unidecode
3 | import pyopenjtalk
4 |
5 |
6 | # Regular expression matching Japanese without punctuation marks:
7 | _japanese_characters = re.compile(
8 | r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
9 |
10 | # Regular expression matching non-Japanese characters or punctuation marks:
11 | _japanese_marks = re.compile(
12 | r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
13 |
14 | # List of (symbol, Japanese) pairs for marks:
15 | _symbols_to_japanese = [(re.compile('%s' % x[0]), x[1]) for x in [
16 | ('%', 'パーセント')
17 | ]]
18 |
19 | # List of (romaji, ipa) pairs for marks:
20 | _romaji_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
21 | ('ts', 'ʦ'),
22 | ('u', 'ɯ'),
23 | ('j', 'ʥ'),
24 | ('y', 'j'),
25 | ('ni', 'n^i'),
26 | ('nj', 'n^'),
27 | ('hi', 'çi'),
28 | ('hj', 'ç'),
29 | ('f', 'ɸ'),
30 | ('I', 'i*'),
31 | ('U', 'ɯ*'),
32 | ('r', 'ɾ')
33 | ]]
34 |
35 | # List of (romaji, ipa2) pairs for marks:
36 | _romaji_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
37 | ('u', 'ɯ'),
38 | ('ʧ', 'tʃ'),
39 | ('j', 'dʑ'),
40 | ('y', 'j'),
41 | ('ni', 'n^i'),
42 | ('nj', 'n^'),
43 | ('hi', 'çi'),
44 | ('hj', 'ç'),
45 | ('f', 'ɸ'),
46 | ('I', 'i*'),
47 | ('U', 'ɯ*'),
48 | ('r', 'ɾ')
49 | ]]
50 |
51 | # List of (consonant, sokuon) pairs:
52 | _real_sokuon = [(re.compile('%s' % x[0]), x[1]) for x in [
53 | (r'Q([↑↓]*[kg])', r'k#\1'),
54 | (r'Q([↑↓]*[tdjʧ])', r't#\1'),
55 | (r'Q([↑↓]*[sʃ])', r's\1'),
56 | (r'Q([↑↓]*[pb])', r'p#\1')
57 | ]]
58 |
59 | # List of (consonant, hatsuon) pairs:
60 | _real_hatsuon = [(re.compile('%s' % x[0]), x[1]) for x in [
61 | (r'N([↑↓]*[pbm])', r'm\1'),
62 | (r'N([↑↓]*[ʧʥj])', r'n^\1'),
63 | (r'N([↑↓]*[tdn])', r'n\1'),
64 | (r'N([↑↓]*[kg])', r'ŋ\1')
65 | ]]
66 |
67 |
68 | def symbols_to_japanese(text):
69 | for regex, replacement in _symbols_to_japanese:
70 | text = re.sub(regex, replacement, text)
71 | return text
72 |
73 |
74 | def japanese_to_romaji_with_accent(text):
75 | '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html'''
76 | text = symbols_to_japanese(text)
77 | sentences = re.split(_japanese_marks, text)
78 | marks = re.findall(_japanese_marks, text)
79 | text = ''
80 | for i, sentence in enumerate(sentences):
81 | if re.match(_japanese_characters, sentence):
82 | if text != '':
83 | text += ' '
84 | labels = pyopenjtalk.extract_fullcontext(sentence)
85 | for n, label in enumerate(labels):
86 | phoneme = re.search(r'\-([^\+]*)\+', label).group(1)
87 | if phoneme not in ['sil', 'pau']:
88 | text += phoneme.replace('ch', 'ʧ').replace('sh',
89 | 'ʃ').replace('cl', 'Q')
90 | else:
91 | continue
92 | # n_moras = int(re.search(r'/F:(\d+)_', label).group(1))
93 | a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
94 | a2 = int(re.search(r"\+(\d+)\+", label).group(1))
95 | a3 = int(re.search(r"\+(\d+)/", label).group(1))
96 | if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil', 'pau']:
97 | a2_next = -1
98 | else:
99 | a2_next = int(
100 | re.search(r"\+(\d+)\+", labels[n + 1]).group(1))
101 | # Accent phrase boundary
102 | if a3 == 1 and a2_next == 1:
103 | text += ' '
104 | # Falling
105 | elif a1 == 0 and a2_next == a2 + 1:
106 | text += '↓'
107 | # Rising
108 | elif a2 == 1 and a2_next == 2:
109 | text += '↑'
110 | if i < len(marks):
111 | text += unidecode(marks[i]).replace(' ', '')
112 | return text
113 |
114 |
115 | def get_real_sokuon(text):
116 | for regex, replacement in _real_sokuon:
117 | text = re.sub(regex, replacement, text)
118 | return text
119 |
120 |
121 | def get_real_hatsuon(text):
122 | for regex, replacement in _real_hatsuon:
123 | text = re.sub(regex, replacement, text)
124 | return text
125 |
126 |
127 | def japanese_to_ipa(text):
128 | text = japanese_to_romaji_with_accent(text).replace('...', '…')
129 | text = re.sub(
130 | r'([aiueo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text)
131 | text = get_real_sokuon(text)
132 | text = get_real_hatsuon(text)
133 | for regex, replacement in _romaji_to_ipa:
134 | text = re.sub(regex, replacement, text)
135 | return text
136 |
137 |
138 | def japanese_to_ipa2(text):
139 | text = japanese_to_romaji_with_accent(text).replace('...', '…')
140 | text = get_real_sokuon(text)
141 | text = get_real_hatsuon(text)
142 | for regex, replacement in _romaji_to_ipa2:
143 | text = re.sub(regex, replacement, text)
144 | return text
145 |
146 |
147 | def japanese_to_ipa3(text):
148 | text = japanese_to_ipa2(text).replace('n^', 'ȵ').replace(
149 | 'ʃ', 'ɕ').replace('*', '\u0325').replace('#', '\u031a')
150 | text = re.sub(
151 | r'([aiɯeo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text)
152 | text = re.sub(r'((?:^|\s)(?:ts|tɕ|[kpt]))', r'\1ʰ', text)
153 | return text
154 |
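japanese_to_romaji_with_accent produces romaji annotated with pitch-accent arrows (↑ rising, ↓ falling) and spaces at accent-phrase boundaries, which the *_to_ipa helpers then map further. A sketch; pyopenjtalk and its dictionary must be installed, and the exact output can differ between versions.

from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa

print(japanese_to_romaji_with_accent('ありがとうございます。'))
# romaji with ↑/↓ pitch marks and a trailing '.' carried over from the input
print(japanese_to_ipa('ちょっと待って'))
# the same romaji, then long vowels collapsed to 'ː' and consonants mapped to IPA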
--------------------------------------------------------------------------------
/text/korean.py:
--------------------------------------------------------------------------------
1 | import re
2 | from jamo import h2j, j2hcj
3 | import ko_pron
4 |
5 |
6 | # This is a list of Korean classifiers preceded by pure Korean numerals.
7 | _korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'
8 |
9 | # List of (hangul, hangul divided) pairs:
10 | _hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [
11 | ('ㄳ', 'ㄱㅅ'),
12 | ('ㄵ', 'ㄴㅈ'),
13 | ('ㄶ', 'ㄴㅎ'),
14 | ('ㄺ', 'ㄹㄱ'),
15 | ('ㄻ', 'ㄹㅁ'),
16 | ('ㄼ', 'ㄹㅂ'),
17 | ('ㄽ', 'ㄹㅅ'),
18 | ('ㄾ', 'ㄹㅌ'),
19 | ('ㄿ', 'ㄹㅍ'),
20 | ('ㅀ', 'ㄹㅎ'),
21 | ('ㅄ', 'ㅂㅅ'),
22 | ('ㅘ', 'ㅗㅏ'),
23 | ('ㅙ', 'ㅗㅐ'),
24 | ('ㅚ', 'ㅗㅣ'),
25 | ('ㅝ', 'ㅜㅓ'),
26 | ('ㅞ', 'ㅜㅔ'),
27 | ('ㅟ', 'ㅜㅣ'),
28 | ('ㅢ', 'ㅡㅣ'),
29 | ('ㅑ', 'ㅣㅏ'),
30 | ('ㅒ', 'ㅣㅐ'),
31 | ('ㅕ', 'ㅣㅓ'),
32 | ('ㅖ', 'ㅣㅔ'),
33 | ('ㅛ', 'ㅣㅗ'),
34 | ('ㅠ', 'ㅣㅜ')
35 | ]]
36 |
37 | # List of (Latin alphabet, hangul) pairs:
38 | _latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
39 | ('a', '에이'),
40 | ('b', '비'),
41 | ('c', '시'),
42 | ('d', '디'),
43 | ('e', '이'),
44 | ('f', '에프'),
45 | ('g', '지'),
46 | ('h', '에이치'),
47 | ('i', '아이'),
48 | ('j', '제이'),
49 | ('k', '케이'),
50 | ('l', '엘'),
51 | ('m', '엠'),
52 | ('n', '엔'),
53 | ('o', '오'),
54 | ('p', '피'),
55 | ('q', '큐'),
56 | ('r', '아르'),
57 | ('s', '에스'),
58 | ('t', '티'),
59 | ('u', '유'),
60 | ('v', '브이'),
61 | ('w', '더블유'),
62 | ('x', '엑스'),
63 | ('y', '와이'),
64 | ('z', '제트')
65 | ]]
66 |
67 | # List of (ipa, lazy ipa) pairs:
68 | _ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
69 | ('t͡ɕ','ʧ'),
70 | ('d͡ʑ','ʥ'),
71 | ('ɲ','n^'),
72 | ('ɕ','ʃ'),
73 | ('ʷ','w'),
74 | ('ɭ','l`'),
75 | ('ʎ','ɾ'),
76 | ('ɣ','ŋ'),
77 | ('ɰ','ɯ'),
78 | ('ʝ','j'),
79 | ('ʌ','ə'),
80 | ('ɡ','g'),
81 | ('\u031a','#'),
82 | ('\u0348','='),
83 | ('\u031e',''),
84 | ('\u0320',''),
85 | ('\u0339','')
86 | ]]
87 |
88 |
89 | def latin_to_hangul(text):
90 | for regex, replacement in _latin_to_hangul:
91 | text = re.sub(regex, replacement, text)
92 | return text
93 |
94 |
95 | def divide_hangul(text):
96 | text = j2hcj(h2j(text))
97 | for regex, replacement in _hangul_divided:
98 | text = re.sub(regex, replacement, text)
99 | return text
100 |
101 |
102 | def hangul_number(num, sino=True):
103 | '''Reference https://github.com/Kyubyong/g2pK'''
104 | num = re.sub(',', '', num)
105 |
106 | if num == '0':
107 | return '영'
108 | if not sino and num == '20':
109 | return '스무'
110 |
111 | digits = '123456789'
112 | names = '일이삼사오육칠팔구'
113 | digit2name = {d: n for d, n in zip(digits, names)}
114 |
115 | modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉'
116 | decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'
117 | digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())}
118 | digit2dec = {d: dec for d, dec in zip(digits, decimals.split())}
119 |
120 | spelledout = []
121 | for i, digit in enumerate(num):
122 | i = len(num) - i - 1
123 | if sino:
124 | if i == 0:
125 | name = digit2name.get(digit, '')
126 | elif i == 1:
127 | name = digit2name.get(digit, '') + '십'
128 | name = name.replace('일십', '십')
129 | else:
130 | if i == 0:
131 | name = digit2mod.get(digit, '')
132 | elif i == 1:
133 | name = digit2dec.get(digit, '')
134 | if digit == '0':
135 | if i % 4 == 0:
136 | last_three = spelledout[-min(3, len(spelledout)):]
137 | if ''.join(last_three) == '':
138 | spelledout.append('')
139 | continue
140 | else:
141 | spelledout.append('')
142 | continue
143 | if i == 2:
144 | name = digit2name.get(digit, '') + '백'
145 | name = name.replace('일백', '백')
146 | elif i == 3:
147 | name = digit2name.get(digit, '') + '천'
148 | name = name.replace('일천', '천')
149 | elif i == 4:
150 | name = digit2name.get(digit, '') + '만'
151 | name = name.replace('일만', '만')
152 | elif i == 5:
153 | name = digit2name.get(digit, '') + '십'
154 | name = name.replace('일십', '십')
155 | elif i == 6:
156 | name = digit2name.get(digit, '') + '백'
157 | name = name.replace('일백', '백')
158 | elif i == 7:
159 | name = digit2name.get(digit, '') + '천'
160 | name = name.replace('일천', '천')
161 | elif i == 8:
162 | name = digit2name.get(digit, '') + '억'
163 | elif i == 9:
164 | name = digit2name.get(digit, '') + '십'
165 | elif i == 10:
166 | name = digit2name.get(digit, '') + '백'
167 | elif i == 11:
168 | name = digit2name.get(digit, '') + '천'
169 | elif i == 12:
170 | name = digit2name.get(digit, '') + '조'
171 | elif i == 13:
172 | name = digit2name.get(digit, '') + '십'
173 | elif i == 14:
174 | name = digit2name.get(digit, '') + '백'
175 | elif i == 15:
176 | name = digit2name.get(digit, '') + '천'
177 | spelledout.append(name)
178 | return ''.join(elem for elem in spelledout)
179 |
180 |
181 | def number_to_hangul(text):
182 | '''Reference https://github.com/Kyubyong/g2pK'''
183 | tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text))
184 | for token in tokens:
185 | num, classifier = token
186 | if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers:
187 | spelledout = hangul_number(num, sino=False)
188 | else:
189 | spelledout = hangul_number(num, sino=True)
190 | text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}')
191 | # digit by digit for remaining digits
192 | digits = '0123456789'
193 | names = '영일이삼사오육칠팔구'
194 | for d, n in zip(digits, names):
195 | text = text.replace(d, n)
196 | return text
197 |
198 |
199 | def korean_to_lazy_ipa(text):
200 | text = latin_to_hangul(text)
201 | text = number_to_hangul(text)
202 | text=re.sub('[\uac00-\ud7af]+',lambda x:ko_pron.romanise(x.group(0),'ipa').split('] ~ [')[0],text)
203 | for regex, replacement in _ipa_to_lazy_ipa:
204 | text = re.sub(regex, replacement, text)
205 | return text
206 |
207 |
208 | def korean_to_ipa(text):
209 | text = korean_to_lazy_ipa(text)
210 | return text.replace('ʧ','tʃ').replace('ʥ','dʑ')
211 |
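hangul_number spells a digit string in either the Sino-Korean reading (sino=True) or the native reading used before counters, and number_to_hangul picks between them by checking whether the following word starts with one of the classifiers listed above. A small illustrative check:

from text.korean import hangul_number, number_to_hangul

print(hangul_number('2', sino=True))    # '이'  (Sino-Korean reading)
print(hangul_number('2', sino=False))   # '두'  (native form used before a classifier)
print(number_to_hangul('고양이 3마리'))  # '마리' is a classifier, so '3' becomes the native '세'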
--------------------------------------------------------------------------------
/text/mandarin.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import re
4 | from pypinyin import lazy_pinyin, BOPOMOFO
5 | import jieba
6 | import cn2an
7 | import logging
8 |
9 | logging.getLogger('jieba').setLevel(logging.WARNING)
10 | jieba.set_dictionary(os.path.dirname(sys.argv[0])+'/jieba/dict.txt')
11 | jieba.initialize()
12 |
13 |
14 | # List of (Latin alphabet, bopomofo) pairs:
15 | _latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
16 | ('a', 'ㄟˉ'),
17 | ('b', 'ㄅㄧˋ'),
18 | ('c', 'ㄙㄧˉ'),
19 | ('d', 'ㄉㄧˋ'),
20 | ('e', 'ㄧˋ'),
21 | ('f', 'ㄝˊㄈㄨˋ'),
22 | ('g', 'ㄐㄧˋ'),
23 | ('h', 'ㄝˇㄑㄩˋ'),
24 | ('i', 'ㄞˋ'),
25 | ('j', 'ㄐㄟˋ'),
26 | ('k', 'ㄎㄟˋ'),
27 | ('l', 'ㄝˊㄛˋ'),
28 | ('m', 'ㄝˊㄇㄨˋ'),
29 | ('n', 'ㄣˉ'),
30 | ('o', 'ㄡˉ'),
31 | ('p', 'ㄆㄧˉ'),
32 | ('q', 'ㄎㄧㄡˉ'),
33 | ('r', 'ㄚˋ'),
34 | ('s', 'ㄝˊㄙˋ'),
35 | ('t', 'ㄊㄧˋ'),
36 | ('u', 'ㄧㄡˉ'),
37 | ('v', 'ㄨㄧˉ'),
38 | ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
39 | ('x', 'ㄝˉㄎㄨˋㄙˋ'),
40 | ('y', 'ㄨㄞˋ'),
41 | ('z', 'ㄗㄟˋ')
42 | ]]
43 |
44 | # List of (bopomofo, romaji) pairs:
45 | _bopomofo_to_romaji = [(re.compile('%s' % x[0]), x[1]) for x in [
46 | ('ㄅㄛ', 'p⁼wo'),
47 | ('ㄆㄛ', 'pʰwo'),
48 | ('ㄇㄛ', 'mwo'),
49 | ('ㄈㄛ', 'fwo'),
50 | ('ㄅ', 'p⁼'),
51 | ('ㄆ', 'pʰ'),
52 | ('ㄇ', 'm'),
53 | ('ㄈ', 'f'),
54 | ('ㄉ', 't⁼'),
55 | ('ㄊ', 'tʰ'),
56 | ('ㄋ', 'n'),
57 | ('ㄌ', 'l'),
58 | ('ㄍ', 'k⁼'),
59 | ('ㄎ', 'kʰ'),
60 | ('ㄏ', 'h'),
61 | ('ㄐ', 'ʧ⁼'),
62 | ('ㄑ', 'ʧʰ'),
63 | ('ㄒ', 'ʃ'),
64 | ('ㄓ', 'ʦ`⁼'),
65 | ('ㄔ', 'ʦ`ʰ'),
66 | ('ㄕ', 's`'),
67 | ('ㄖ', 'ɹ`'),
68 | ('ㄗ', 'ʦ⁼'),
69 | ('ㄘ', 'ʦʰ'),
70 | ('ㄙ', 's'),
71 | ('ㄚ', 'a'),
72 | ('ㄛ', 'o'),
73 | ('ㄜ', 'ə'),
74 | ('ㄝ', 'e'),
75 | ('ㄞ', 'ai'),
76 | ('ㄟ', 'ei'),
77 | ('ㄠ', 'au'),
78 | ('ㄡ', 'ou'),
79 | ('ㄧㄢ', 'yeNN'),
80 | ('ㄢ', 'aNN'),
81 | ('ㄧㄣ', 'iNN'),
82 | ('ㄣ', 'əNN'),
83 | ('ㄤ', 'aNg'),
84 | ('ㄧㄥ', 'iNg'),
85 | ('ㄨㄥ', 'uNg'),
86 | ('ㄩㄥ', 'yuNg'),
87 | ('ㄥ', 'əNg'),
88 | ('ㄦ', 'əɻ'),
89 | ('ㄧ', 'i'),
90 | ('ㄨ', 'u'),
91 | ('ㄩ', 'ɥ'),
92 | ('ˉ', '→'),
93 | ('ˊ', '↑'),
94 | ('ˇ', '↓↑'),
95 | ('ˋ', '↓'),
96 | ('˙', ''),
97 | (',', ','),
98 | ('。', '.'),
99 | ('!', '!'),
100 | ('?', '?'),
101 | ('—', '-')
102 | ]]
103 |
104 | # List of (romaji, ipa) pairs:
105 | _romaji_to_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
106 | ('ʃy', 'ʃ'),
107 | ('ʧʰy', 'ʧʰ'),
108 | ('ʧ⁼y', 'ʧ⁼'),
109 | ('NN', 'n'),
110 | ('Ng', 'ŋ'),
111 | ('y', 'j'),
112 | ('h', 'x')
113 | ]]
114 |
115 | # List of (bopomofo, ipa) pairs:
116 | _bopomofo_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
117 | ('ㄅㄛ', 'p⁼wo'),
118 | ('ㄆㄛ', 'pʰwo'),
119 | ('ㄇㄛ', 'mwo'),
120 | ('ㄈㄛ', 'fwo'),
121 | ('ㄅ', 'p⁼'),
122 | ('ㄆ', 'pʰ'),
123 | ('ㄇ', 'm'),
124 | ('ㄈ', 'f'),
125 | ('ㄉ', 't⁼'),
126 | ('ㄊ', 'tʰ'),
127 | ('ㄋ', 'n'),
128 | ('ㄌ', 'l'),
129 | ('ㄍ', 'k⁼'),
130 | ('ㄎ', 'kʰ'),
131 | ('ㄏ', 'x'),
132 | ('ㄐ', 'tʃ⁼'),
133 | ('ㄑ', 'tʃʰ'),
134 | ('ㄒ', 'ʃ'),
135 | ('ㄓ', 'ts`⁼'),
136 | ('ㄔ', 'ts`ʰ'),
137 | ('ㄕ', 's`'),
138 | ('ㄖ', 'ɹ`'),
139 | ('ㄗ', 'ts⁼'),
140 | ('ㄘ', 'tsʰ'),
141 | ('ㄙ', 's'),
142 | ('ㄚ', 'a'),
143 | ('ㄛ', 'o'),
144 | ('ㄜ', 'ə'),
145 | ('ㄝ', 'ɛ'),
146 | ('ㄞ', 'aɪ'),
147 | ('ㄟ', 'eɪ'),
148 | ('ㄠ', 'ɑʊ'),
149 | ('ㄡ', 'oʊ'),
150 | ('ㄧㄢ', 'jɛn'),
151 | ('ㄩㄢ', 'ɥæn'),
152 | ('ㄢ', 'an'),
153 | ('ㄧㄣ', 'in'),
154 | ('ㄩㄣ', 'ɥn'),
155 | ('ㄣ', 'ən'),
156 | ('ㄤ', 'ɑŋ'),
157 | ('ㄧㄥ', 'iŋ'),
158 | ('ㄨㄥ', 'ʊŋ'),
159 | ('ㄩㄥ', 'jʊŋ'),
160 | ('ㄥ', 'əŋ'),
161 | ('ㄦ', 'əɻ'),
162 | ('ㄧ', 'i'),
163 | ('ㄨ', 'u'),
164 | ('ㄩ', 'ɥ'),
165 | ('ˉ', '→'),
166 | ('ˊ', '↑'),
167 | ('ˇ', '↓↑'),
168 | ('ˋ', '↓'),
169 | ('˙', ''),
170 | (',', ','),
171 | ('。', '.'),
172 | ('!', '!'),
173 | ('?', '?'),
174 | ('—', '-')
175 | ]]
176 |
177 | # List of (bopomofo, ipa2) pairs:
178 | _bopomofo_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
179 | ('ㄅㄛ', 'pwo'),
180 | ('ㄆㄛ', 'pʰwo'),
181 | ('ㄇㄛ', 'mwo'),
182 | ('ㄈㄛ', 'fwo'),
183 | ('ㄅ', 'p'),
184 | ('ㄆ', 'pʰ'),
185 | ('ㄇ', 'm'),
186 | ('ㄈ', 'f'),
187 | ('ㄉ', 't'),
188 | ('ㄊ', 'tʰ'),
189 | ('ㄋ', 'n'),
190 | ('ㄌ', 'l'),
191 | ('ㄍ', 'k'),
192 | ('ㄎ', 'kʰ'),
193 | ('ㄏ', 'h'),
194 | ('ㄐ', 'tɕ'),
195 | ('ㄑ', 'tɕʰ'),
196 | ('ㄒ', 'ɕ'),
197 | ('ㄓ', 'tʂ'),
198 | ('ㄔ', 'tʂʰ'),
199 | ('ㄕ', 'ʂ'),
200 | ('ㄖ', 'ɻ'),
201 | ('ㄗ', 'ts'),
202 | ('ㄘ', 'tsʰ'),
203 | ('ㄙ', 's'),
204 | ('ㄚ', 'a'),
205 | ('ㄛ', 'o'),
206 | ('ㄜ', 'ɤ'),
207 | ('ㄝ', 'ɛ'),
208 | ('ㄞ', 'aɪ'),
209 | ('ㄟ', 'eɪ'),
210 | ('ㄠ', 'ɑʊ'),
211 | ('ㄡ', 'oʊ'),
212 | ('ㄧㄢ', 'jɛn'),
213 | ('ㄩㄢ', 'yæn'),
214 | ('ㄢ', 'an'),
215 | ('ㄧㄣ', 'in'),
216 | ('ㄩㄣ', 'yn'),
217 | ('ㄣ', 'ən'),
218 | ('ㄤ', 'ɑŋ'),
219 | ('ㄧㄥ', 'iŋ'),
220 | ('ㄨㄥ', 'ʊŋ'),
221 | ('ㄩㄥ', 'jʊŋ'),
222 | ('ㄥ', 'ɤŋ'),
223 | ('ㄦ', 'əɻ'),
224 | ('ㄧ', 'i'),
225 | ('ㄨ', 'u'),
226 | ('ㄩ', 'y'),
227 | ('ˉ', '˥'),
228 | ('ˊ', '˧˥'),
229 | ('ˇ', '˨˩˦'),
230 | ('ˋ', '˥˩'),
231 | ('˙', ''),
232 | (',', ','),
233 | ('。', '.'),
234 | ('!', '!'),
235 | ('?', '?'),
236 | ('—', '-')
237 | ]]
238 |
239 |
240 | def number_to_chinese(text):
241 | numbers = re.findall(r'\d+(?:\.?\d+)?', text)
242 | for number in numbers:
243 | text = text.replace(number, cn2an.an2cn(number), 1)
244 | return text
245 |
246 |
247 | def chinese_to_bopomofo(text):
248 | text = text.replace('、', ',').replace(';', ',').replace(':', ',')
249 | words = jieba.lcut(text, cut_all=False)
250 | text = ''
251 | for word in words:
252 | bopomofos = lazy_pinyin(word, BOPOMOFO)
253 | if not re.search('[\u4e00-\u9fff]', word):
254 | text += word
255 | continue
256 | for i in range(len(bopomofos)):
257 | bopomofos[i] = re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i])
258 | if text != '':
259 | text += ' '
260 | text += ''.join(bopomofos)
261 | return text
262 |
263 |
264 | def latin_to_bopomofo(text):
265 | for regex, replacement in _latin_to_bopomofo:
266 | text = re.sub(regex, replacement, text)
267 | return text
268 |
269 |
270 | def bopomofo_to_romaji(text):
271 | for regex, replacement in _bopomofo_to_romaji:
272 | text = re.sub(regex, replacement, text)
273 | return text
274 |
275 |
276 | def bopomofo_to_ipa(text):
277 | for regex, replacement in _bopomofo_to_ipa:
278 | text = re.sub(regex, replacement, text)
279 | return text
280 |
281 |
282 | def bopomofo_to_ipa2(text):
283 | for regex, replacement in _bopomofo_to_ipa2:
284 | text = re.sub(regex, replacement, text)
285 | return text
286 |
287 |
288 | def chinese_to_romaji(text):
289 | text = number_to_chinese(text)
290 | text = chinese_to_bopomofo(text)
291 | text = latin_to_bopomofo(text)
292 | text = bopomofo_to_romaji(text)
293 | text = re.sub('i([aoe])', r'y\1', text)
294 | text = re.sub('u([aoəe])', r'w\1', text)
295 | text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)',
296 | r'\1ɹ`\2', text).replace('ɻ', 'ɹ`')
297 | text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
298 | return text
299 |
300 |
301 | def chinese_to_lazy_ipa(text):
302 | text = chinese_to_romaji(text)
303 | for regex, replacement in _romaji_to_ipa:
304 | text = re.sub(regex, replacement, text)
305 | return text
306 |
307 |
308 | def chinese_to_ipa(text):
309 | text = number_to_chinese(text)
310 | text = chinese_to_bopomofo(text)
311 | text = latin_to_bopomofo(text)
312 | text = bopomofo_to_ipa(text)
313 | text = re.sub('i([aoe])', r'j\1', text)
314 | text = re.sub('u([aoəe])', r'w\1', text)
315 | text = re.sub('([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)',
316 | r'\1ɹ`\2', text).replace('ɻ', 'ɹ`')
317 | text = re.sub('([s][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
318 | return text
319 |
320 |
321 | def chinese_to_ipa2(text):
322 | text = number_to_chinese(text)
323 | text = chinese_to_bopomofo(text)
324 | text = latin_to_bopomofo(text)
325 | text = bopomofo_to_ipa2(text)
326 | text = re.sub(r'i([aoe])', r'j\1', text)
327 | text = re.sub(r'u([aoəe])', r'w\1', text)
328 | text = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2', text)
329 | text = re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', text)
330 | return text
331 |
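The Mandarin pipeline is: digits rewritten as Chinese numerals (cn2an), jieba word segmentation, pypinyin bopomofo with tone marks, then one of the bopomofo-to-romaji/IPA tables above. Note that jieba.set_dictionary() resolves jieba/dict.txt relative to sys.argv[0], so the sketch below assumes it is run from the repository root.

from text.mandarin import chinese_to_bopomofo, chinese_to_ipa

print(chinese_to_bopomofo('明天见'))  # bopomofo with tone marks, one space between segmented words
print(chinese_to_ipa('现在是3点'))    # the digit is first rewritten as '三', then converted to IPA with tone arrows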
--------------------------------------------------------------------------------
/text/ngu_dialect.py:
--------------------------------------------------------------------------------
1 | import re
2 | import opencc
3 |
4 |
5 | dialects = {'SZ': 'suzhou', 'WX': 'wuxi', 'CZ': 'changzhou', 'HZ': 'hangzhou',
6 | 'SX': 'shaoxing', 'NB': 'ningbo', 'JJ': 'jingjiang', 'YX': 'yixing',
7 | 'JD': 'jiading', 'ZR': 'zhenru', 'PH': 'pinghu', 'TX': 'tongxiang',
8 | 'JS': 'jiashan', 'HN': 'xiashi', 'LP': 'linping', 'XS': 'xiaoshan',
9 | 'FY': 'fuyang', 'RA': 'ruao', 'CX': 'cixi', 'SM': 'sanmen',
10 | 'TT': 'tiantai', 'WZ': 'wenzhou', 'SC': 'suichang', 'YB': 'youbu'}
11 |
12 | converters = {}
13 |
14 | for dialect in dialects.values():
15 | try:
16 | converters[dialect] = opencc.OpenCC(dialect)
17 | except:
18 | pass
19 |
20 |
21 | def ngu_dialect_to_ipa(text, dialect):
22 | dialect = dialects[dialect]
23 | text = converters[dialect].convert(text).replace('-','').replace('$',' ')
24 | text = re.sub(r'[、;:]', ',', text)
25 | text = re.sub(r'\s*,\s*', ', ', text)
26 | text = re.sub(r'\s*。\s*', '. ', text)
27 | text = re.sub(r'\s*？\s*', '? ', text)
28 | text = re.sub(r'\s*!\s*', '! ', text)
29 | text = re.sub(r'\s*$', '', text)
30 | return text
31 |
--------------------------------------------------------------------------------
/text/sanskrit.py:
--------------------------------------------------------------------------------
1 | import re
2 | from indic_transliteration import sanscript
3 |
4 |
5 | # List of (iast, ipa) pairs:
6 | _iast_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
7 | ('a', 'ə'),
8 | ('ā', 'aː'),
9 | ('ī', 'iː'),
10 | ('ū', 'uː'),
11 | ('ṛ', 'ɹ`'),
12 | ('ṝ', 'ɹ`ː'),
13 | ('ḷ', 'l`'),
14 | ('ḹ', 'l`ː'),
15 | ('e', 'eː'),
16 | ('o', 'oː'),
17 | ('k', 'k⁼'),
18 | ('k⁼h', 'kʰ'),
19 | ('g', 'g⁼'),
20 | ('g⁼h', 'gʰ'),
21 | ('ṅ', 'ŋ'),
22 | ('c', 'ʧ⁼'),
23 | ('ʧ⁼h', 'ʧʰ'),
24 | ('j', 'ʥ⁼'),
25 | ('ʥ⁼h', 'ʥʰ'),
26 | ('ñ', 'n^'),
27 | ('ṭ', 't`⁼'),
28 | ('t`⁼h', 't`ʰ'),
29 | ('ḍ', 'd`⁼'),
30 | ('d`⁼h', 'd`ʰ'),
31 | ('ṇ', 'n`'),
32 | ('t', 't⁼'),
33 | ('t⁼h', 'tʰ'),
34 | ('d', 'd⁼'),
35 | ('d⁼h', 'dʰ'),
36 | ('p', 'p⁼'),
37 | ('p⁼h', 'pʰ'),
38 | ('b', 'b⁼'),
39 | ('b⁼h', 'bʰ'),
40 | ('y', 'j'),
41 | ('ś', 'ʃ'),
42 | ('ṣ', 's`'),
43 | ('r', 'ɾ'),
44 | ('l̤', 'l`'),
45 | ('h', 'ɦ'),
46 | ("'", ''),
47 | ('~', '^'),
48 | ('ṃ', '^')
49 | ]]
50 |
51 |
52 | def devanagari_to_ipa(text):
53 | text = text.replace('ॐ', 'ओम्')
54 | text = re.sub(r'\s*।\s*$', '.', text)
55 | text = re.sub(r'\s*।\s*', ', ', text)
56 | text = re.sub(r'\s*॥', '.', text)
57 | text = sanscript.transliterate(text, sanscript.DEVANAGARI, sanscript.IAST)
58 | for regex, replacement in _iast_to_ipa:
59 | text = re.sub(regex, replacement, text)
60 | text = re.sub('(.)[`ː]*ḥ', lambda x: x.group(0)
61 | [:-1]+'h'+x.group(1)+'*', text)
62 | return text
63 |
--------------------------------------------------------------------------------
/text/shanghainese.py:
--------------------------------------------------------------------------------
1 | import re
2 | import cn2an
3 | import opencc
4 |
5 |
6 | converter = opencc.OpenCC('zaonhe')
7 |
8 | # List of (Latin alphabet, ipa) pairs:
9 | _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
10 | ('A', 'ᴇ'),
11 | ('B', 'bi'),
12 | ('C', 'si'),
13 | ('D', 'di'),
14 | ('E', 'i'),
15 | ('F', 'ᴇf'),
16 | ('G', 'dʑi'),
17 | ('H', 'ᴇtɕʰ'),
18 | ('I', 'ᴀi'),
19 | ('J', 'dʑᴇ'),
20 | ('K', 'kʰᴇ'),
21 | ('L', 'ᴇl'),
22 | ('M', 'ᴇm'),
23 | ('N', 'ᴇn'),
24 | ('O', 'o'),
25 | ('P', 'pʰi'),
26 | ('Q', 'kʰiu'),
27 | ('R', 'ᴀl'),
28 | ('S', 'ᴇs'),
29 | ('T', 'tʰi'),
30 | ('U', 'ɦiu'),
31 | ('V', 'vi'),
32 | ('W', 'dᴀbɤliu'),
33 | ('X', 'ᴇks'),
34 | ('Y', 'uᴀi'),
35 | ('Z', 'zᴇ')
36 | ]]
37 |
38 |
39 | def _number_to_shanghainese(num):
40 | num = cn2an.an2cn(num).replace('一十','十').replace('二十', '廿').replace('二', '两')
41 | return re.sub(r'((?:^|[^三四五六七八九])十|廿)两', r'\1二', num)
42 |
43 |
44 | def number_to_shanghainese(text):
45 | return re.sub(r'\d+(?:\.?\d+)?', lambda x: _number_to_shanghainese(x.group()), text)
46 |
47 |
48 | def latin_to_ipa(text):
49 | for regex, replacement in _latin_to_ipa:
50 | text = re.sub(regex, replacement, text)
51 | return text
52 |
53 |
54 | def shanghainese_to_ipa(text):
55 | text = number_to_shanghainese(text.upper())
56 | text = converter.convert(text).replace('-','').replace('$',' ')
57 | text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text)
58 | text = re.sub(r'[、;:]', ',', text)
59 | text = re.sub(r'\s*,\s*', ', ', text)
60 | text = re.sub(r'\s*。\s*', '. ', text)
61 | text = re.sub(r'\s*？\s*', '? ', text)
62 | text = re.sub(r'\s*!\s*', '! ', text)
63 | text = re.sub(r'\s*$', '', text)
64 | return text
65 |
--------------------------------------------------------------------------------
/text/thai.py:
--------------------------------------------------------------------------------
1 | import re
2 | from num_thai.thainumbers import NumThai
3 |
4 |
5 | num = NumThai()
6 |
7 | # List of (Latin alphabet, Thai) pairs:
8 | _latin_to_thai = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
9 | ('a', 'เอ'),
10 | ('b','บี'),
11 | ('c','ซี'),
12 | ('d','ดี'),
13 | ('e','อี'),
14 | ('f','เอฟ'),
15 | ('g','จี'),
16 | ('h','เอช'),
17 | ('i','ไอ'),
18 | ('j','เจ'),
19 | ('k','เค'),
20 | ('l','แอล'),
21 | ('m','เอ็ม'),
22 | ('n','เอ็น'),
23 | ('o','โอ'),
24 | ('p','พี'),
25 | ('q','คิว'),
26 | ('r','แอร์'),
27 | ('s','เอส'),
28 | ('t','ที'),
29 | ('u','ยู'),
30 | ('v','วี'),
31 | ('w','ดับเบิลยู'),
32 | ('x','เอ็กซ์'),
33 | ('y','วาย'),
34 | ('z','ซี')
35 | ]]
36 |
37 |
38 | def num_to_thai(text):
39 | return re.sub(r'(?:\d+(?:,?\d+)?)+(?:\.\d+(?:,?\d+)?)?', lambda x: ''.join(num.NumberToTextThai(float(x.group(0).replace(',', '')))), text)
40 |
41 | def latin_to_thai(text):
42 | for regex, replacement in _latin_to_thai:
43 | text = re.sub(regex, replacement, text)
44 | return text
45 |
--------------------------------------------------------------------------------
/trans.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import time
3 | import hashlib
4 | import uuid
5 |
6 | youdao_url = 'https://openapi.youdao.com/api' # 有道api地址
7 |
8 | # 需要翻译的文本'
9 | def translate(txt):
10 | translate_text = txt
11 |
12 | # 翻译文本生成sign前进行的处理
13 | input_text = ""
14 |
15 | # 当文本长度小于等于20时,取文本
16 | if (len(translate_text) <= 20):
17 | input_text = translate_text
18 |
19 | # 当文本长度大于20时,进行特殊处理
20 | elif (len(translate_text) > 20):
21 | input_text = translate_text[:10] + str(len(translate_text)) + translate_text[-10:]
22 |
23 | time_curtime = int(time.time()) # 秒级时间戳获取
24 | app_id = '67edf72f6213c548' # 应用id
25 | uu_id = uuid.uuid4() # 随机生成的uuid数,为了每次都生成一个不重复的数。
26 | app_key = "GIUcbzYlyLq1yKD2VVjV24OZXkzDpota" # 应用密钥
27 |
28 | sign = hashlib.sha256(
29 | (app_id + input_text + str(uu_id) + str(time_curtime) + app_key).encode('utf-8')).hexdigest() # sign生成
30 |
31 | data = {
32 | 'q': translate_text, # 翻译文本
33 | 'from': "zh-CHS", # 源语言
34 | 'to': "ja", # 翻译语言
35 | 'appKey': app_id, # 应用id
36 | 'salt': uu_id, # 随机生产的uuid码
37 | 'sign': sign, # 签名
38 | 'signType': "v3", # 签名类型,固定值
39 | 'curtime': time_curtime, # 秒级时间戳
40 | }
41 |
42 | r = requests.get(youdao_url, params=data).json() # 获取返回的json()内容
43 | return r["translation"][0]
44 | #print("翻译后的结果:" + r["translation"][0]) # 获取翻译内容
45 |
46 | if __name__ == '__main__':
47 | s=translate('早上好')
48 | print(s)
--------------------------------------------------------------------------------
/transforms.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn import functional as F
3 |
4 | import numpy as np
5 |
6 |
7 | DEFAULT_MIN_BIN_WIDTH = 1e-3
8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3
9 | DEFAULT_MIN_DERIVATIVE = 1e-3
10 |
11 |
12 | def piecewise_rational_quadratic_transform(inputs,
13 | unnormalized_widths,
14 | unnormalized_heights,
15 | unnormalized_derivatives,
16 | inverse=False,
17 | tails=None,
18 | tail_bound=1.,
19 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
20 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
21 | min_derivative=DEFAULT_MIN_DERIVATIVE):
22 |
23 | if tails is None:
24 | spline_fn = rational_quadratic_spline
25 | spline_kwargs = {}
26 | else:
27 | spline_fn = unconstrained_rational_quadratic_spline
28 | spline_kwargs = {
29 | 'tails': tails,
30 | 'tail_bound': tail_bound
31 | }
32 |
33 | outputs, logabsdet = spline_fn(
34 | inputs=inputs,
35 | unnormalized_widths=unnormalized_widths,
36 | unnormalized_heights=unnormalized_heights,
37 | unnormalized_derivatives=unnormalized_derivatives,
38 | inverse=inverse,
39 | min_bin_width=min_bin_width,
40 | min_bin_height=min_bin_height,
41 | min_derivative=min_derivative,
42 | **spline_kwargs
43 | )
44 | return outputs, logabsdet
45 |
46 |
47 | def searchsorted(bin_locations, inputs, eps=1e-6):
48 | bin_locations[..., -1] += eps
49 | return torch.sum(
50 | inputs[..., None] >= bin_locations,
51 | dim=-1
52 | ) - 1
53 |
54 |
55 | def unconstrained_rational_quadratic_spline(inputs,
56 | unnormalized_widths,
57 | unnormalized_heights,
58 | unnormalized_derivatives,
59 | inverse=False,
60 | tails='linear',
61 | tail_bound=1.,
62 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
63 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
64 | min_derivative=DEFAULT_MIN_DERIVATIVE):
65 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
66 | outside_interval_mask = ~inside_interval_mask
67 |
68 | outputs = torch.zeros_like(inputs)
69 | logabsdet = torch.zeros_like(inputs)
70 |
71 | if tails == 'linear':
72 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
73 | constant = np.log(np.exp(1 - min_derivative) - 1)
74 | unnormalized_derivatives[..., 0] = constant
75 | unnormalized_derivatives[..., -1] = constant
76 |
77 | outputs[outside_interval_mask] = inputs[outside_interval_mask]
78 | logabsdet[outside_interval_mask] = 0
79 | else:
80 | raise RuntimeError('{} tails are not implemented.'.format(tails))
81 |
82 | outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline(
83 | inputs=inputs[inside_interval_mask],
84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
87 | inverse=inverse,
88 | left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound,
89 | min_bin_width=min_bin_width,
90 | min_bin_height=min_bin_height,
91 | min_derivative=min_derivative
92 | )
93 |
94 | return outputs, logabsdet
95 |
96 | def rational_quadratic_spline(inputs,
97 | unnormalized_widths,
98 | unnormalized_heights,
99 | unnormalized_derivatives,
100 | inverse=False,
101 | left=0., right=1., bottom=0., top=1.,
102 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
103 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
104 | min_derivative=DEFAULT_MIN_DERIVATIVE):
105 | if torch.min(inputs) < left or torch.max(inputs) > right:
106 | raise ValueError('Input to a transform is not within its domain')
107 |
108 | num_bins = unnormalized_widths.shape[-1]
109 |
110 | if min_bin_width * num_bins > 1.0:
111 | raise ValueError('Minimal bin width too large for the number of bins')
112 | if min_bin_height * num_bins > 1.0:
113 | raise ValueError('Minimal bin height too large for the number of bins')
114 |
115 | widths = F.softmax(unnormalized_widths, dim=-1)
116 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
117 | cumwidths = torch.cumsum(widths, dim=-1)
118 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0)
119 | cumwidths = (right - left) * cumwidths + left
120 | cumwidths[..., 0] = left
121 | cumwidths[..., -1] = right
122 | widths = cumwidths[..., 1:] - cumwidths[..., :-1]
123 |
124 | derivatives = min_derivative + F.softplus(unnormalized_derivatives)
125 |
126 | heights = F.softmax(unnormalized_heights, dim=-1)
127 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
128 | cumheights = torch.cumsum(heights, dim=-1)
129 | cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0)
130 | cumheights = (top - bottom) * cumheights + bottom
131 | cumheights[..., 0] = bottom
132 | cumheights[..., -1] = top
133 | heights = cumheights[..., 1:] - cumheights[..., :-1]
134 |
135 | if inverse:
136 | bin_idx = searchsorted(cumheights, inputs)[..., None]
137 | else:
138 | bin_idx = searchsorted(cumwidths, inputs)[..., None]
139 |
140 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
141 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
142 |
143 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
144 | delta = heights / widths
145 | input_delta = delta.gather(-1, bin_idx)[..., 0]
146 |
147 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
148 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
149 |
150 | input_heights = heights.gather(-1, bin_idx)[..., 0]
151 |
152 | if inverse:
153 | a = (((inputs - input_cumheights) * (input_derivatives
154 | + input_derivatives_plus_one
155 | - 2 * input_delta)
156 | + input_heights * (input_delta - input_derivatives)))
157 | b = (input_heights * input_derivatives
158 | - (inputs - input_cumheights) * (input_derivatives
159 | + input_derivatives_plus_one
160 | - 2 * input_delta))
161 | c = - input_delta * (inputs - input_cumheights)
162 |
163 | discriminant = b.pow(2) - 4 * a * c
164 | assert (discriminant >= 0).all()
165 |
166 | root = (2 * c) / (-b - torch.sqrt(discriminant))
167 | outputs = root * input_bin_widths + input_cumwidths
168 |
169 | theta_one_minus_theta = root * (1 - root)
170 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
171 | * theta_one_minus_theta)
172 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2)
173 | + 2 * input_delta * theta_one_minus_theta
174 | + input_derivatives * (1 - root).pow(2))
175 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
176 |
177 | return outputs, -logabsdet
178 | else:
179 | theta = (inputs - input_cumwidths) / input_bin_widths
180 | theta_one_minus_theta = theta * (1 - theta)
181 |
182 | numerator = input_heights * (input_delta * theta.pow(2)
183 | + input_derivatives * theta_one_minus_theta)
184 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
185 | * theta_one_minus_theta)
186 | outputs = input_cumheights + numerator / denominator
187 |
188 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2)
189 | + 2 * input_delta * theta_one_minus_theta
190 | + input_derivatives * (1 - theta).pow(2))
191 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
192 |
193 | return outputs, logabsdet
194 |
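piecewise_rational_quadratic_transform implements the invertible monotone spline used inside the flow layers: the same parameter tensors define both directions, and the returned logabsdet changes sign between them. A minimal self-check sketch follows; the shapes, the 'linear' tails and the tail_bound are illustrative choices, not values taken from this repository's models.

import torch
from transforms import piecewise_rational_quadratic_transform

num_bins = 8
x = torch.rand(4, 10) * 2 - 1                 # inputs inside (-tail_bound, tail_bound)
widths = torch.randn(4, 10, num_bins)
heights = torch.randn(4, 10, num_bins)
derivs = torch.randn(4, 10, num_bins - 1)     # padded to num_bins + 1 internally for linear tails

y, logdet = piecewise_rational_quadratic_transform(
    x, widths, heights, derivs, inverse=False, tails='linear', tail_bound=1.0)
x_rec, inv_logdet = piecewise_rational_quadratic_transform(
    y, widths, heights, derivs, inverse=True, tails='linear', tail_bound=1.0)

print((x - x_rec).abs().max())            # ~0: the inverse pass recovers the input
print((logdet + inv_logdet).abs().max())  # ~0: the two log-determinants are negatives of each other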
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from json import loads
3 | from torch import load, FloatTensor
4 | from numpy import float32
5 | import librosa
6 |
7 |
8 | class HParams():
9 | def __init__(self, **kwargs):
10 | for k, v in kwargs.items():
11 | if type(v) == dict:
12 | v = HParams(**v)
13 | self[k] = v
14 |
15 | def keys(self):
16 | return self.__dict__.keys()
17 |
18 | def items(self):
19 | return self.__dict__.items()
20 |
21 | def values(self):
22 | return self.__dict__.values()
23 |
24 | def __len__(self):
25 | return len(self.__dict__)
26 |
27 | def __getitem__(self, key):
28 | return getattr(self, key)
29 |
30 | def __setitem__(self, key, value):
31 | return setattr(self, key, value)
32 |
33 | def __contains__(self, key):
34 | return key in self.__dict__
35 |
36 | def __repr__(self):
37 | return self.__dict__.__repr__()
38 |
39 |
40 | def load_checkpoint(checkpoint_path, model):
41 | checkpoint_dict = load(checkpoint_path, map_location='cpu')
42 | iteration = checkpoint_dict['iteration']
43 | saved_state_dict = checkpoint_dict['model']
44 | if hasattr(model, 'module'):
45 | state_dict = model.module.state_dict()
46 | else:
47 | state_dict = model.state_dict()
48 | new_state_dict= {}
49 | for k, v in state_dict.items():
50 | try:
51 | new_state_dict[k] = saved_state_dict[k]
52 | except:
53 | logging.info("%s is not in the checkpoint" % k)
54 | new_state_dict[k] = v
55 | if hasattr(model, 'module'):
56 | model.module.load_state_dict(new_state_dict)
57 | else:
58 | model.load_state_dict(new_state_dict)
59 | logging.info("Loaded checkpoint '{}' (iteration {})" .format(
60 | checkpoint_path, iteration))
61 | return
62 |
63 |
64 | def get_hparams_from_file(config_path):
65 | with open(config_path, "r") as f:
66 | data = f.read()
67 | config = loads(data)
68 |
69 | hparams = HParams(**config)
70 | return hparams
71 |
72 |
73 | def load_audio_to_torch(full_path, target_sampling_rate):
74 | audio, sampling_rate = librosa.load(full_path, sr=target_sampling_rate, mono=True)
75 | return FloatTensor(audio.astype(float32))
76 |
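HParams simply wraps the parsed JSON so nested sections become attribute access, which is how the rest of the code reads the model configuration. A sketch against the config shipped in this repository, assuming it is run from the repository root:

from utils import get_hparams_from_file

hps = get_hparams_from_file('voiceModel/config.json')
print(hps.data.sampling_rate)    # 22050
print(hps.data.text_cleaners)    # ['zh_ja_mixture_cleaners']
print(hps.model.n_layers)        # 6
print(len(hps.symbols))          # size of the symbol table passed to text_to_sequence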
--------------------------------------------------------------------------------
/voiceModel/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "segment_size": 8192
4 | },
5 | "data": {
6 | "text_cleaners":["zh_ja_mixture_cleaners"],
7 | "max_wav_value": 32768.0,
8 | "sampling_rate": 22050,
9 | "filter_length": 1024,
10 | "hop_length": 256,
11 | "win_length": 1024,
12 | "add_blank": true,
13 | "n_speakers": 5
14 | },
15 | "model": {
16 | "inter_channels": 192,
17 | "hidden_channels": 192,
18 | "filter_channels": 768,
19 | "n_heads": 2,
20 | "n_layers": 6,
21 | "kernel_size": 3,
22 | "p_dropout": 0.1,
23 | "resblock": "1",
24 | "resblock_kernel_sizes": [3,7,11],
25 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
26 | "upsample_rates": [8,8,2,2],
27 | "upsample_initial_channel": 512,
28 | "upsample_kernel_sizes": [16,16,4,4],
29 | "n_layers_q": 3,
30 | "use_spectral_norm": false,
31 | "gin_channels": 256
32 | },
33 | "speakers": ["\u7dbe\u5730\u5be7\u3005", "\u5728\u539f\u4e03\u6d77", "\u5c0f\u8338", "\u5510\u4e50\u541f"],
34 | "symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u02a6", "\u026f", "\u0279", "\u0259", "\u0265", "\u207c", "\u02b0", "`", "\u2192", "\u2193", "\u2191", " "]
35 | }
36 |
--------------------------------------------------------------------------------