├── .gitattributes ├── .gitignore ├── .idea ├── PythonPlugins.iml ├── inspectionProfiles │ ├── Project_Default.xml │ └── profiles_settings.xml ├── misc.xml ├── modules.xml ├── vcs.xml └── workspace.xml ├── LICENSE ├── MoeGoe.py ├── README.md ├── __init__.py ├── attentions.py ├── bot.py ├── commons.py ├── hubert_model.py ├── jieba └── dict.txt ├── mel_processing.py ├── models.py ├── modules.py ├── monotonic_align ├── __init__.py └── core.py ├── pictures ├── __init__.py ├── agakaUa$aNaGaka.jpg ├── apauaraga5aqafa.jpg ├── avabaaa%aZaxa6a.jpg └── awa6aRakaka3a7a.jpg ├── plugins ├── RandomStr │ ├── RandomStr.py │ └── __init__.py ├── __init__.py ├── picGet.py ├── voicePart.py └── voices │ └── __init__.py ├── requirements.txt ├── text ├── LICENSE ├── __init__.py ├── cantonese.py ├── cleaners.py ├── english.py ├── japanese.py ├── korean.py ├── mandarin.py ├── ngu_dialect.py ├── sanskrit.py ├── shanghainese.py └── thai.py ├── trans.py ├── transforms.py ├── utils.py └── voiceModel └── config.json /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Ww][Ii][Nn]32/ 27 | [Aa][Rr][Mm]/ 28 | [Aa][Rr][Mm]64/ 29 | bld/ 30 | [Bb]in/ 31 | [Oo]bj/ 32 | [Oo]ut/ 33 | [Ll]og/ 34 | [Ll]ogs/ 35 | 36 | # Visual Studio 2015/2017 cache/options directory 37 | .vs/ 38 | # Uncomment if you have tasks that create the project's static files in wwwroot 39 | #wwwroot/ 40 | 41 | # Visual Studio 2017 auto generated files 42 | Generated\ Files/ 43 | 44 | # MSTest test Results 45 | [Tt]est[Rr]esult*/ 46 | [Bb]uild[Ll]og.* 47 | 48 | # NUnit 49 | *.VisualState.xml 50 | TestResult.xml 51 | nunit-*.xml 52 | 53 | # Build Results of an ATL Project 54 | [Dd]ebugPS/ 55 | [Rr]eleasePS/ 56 | dlldata.c 57 | 58 | # Benchmark Results 59 | BenchmarkDotNet.Artifacts/ 60 | 61 | # .NET Core 62 | project.lock.json 63 | project.fragment.lock.json 64 | artifacts/ 65 | 66 | # ASP.NET Scaffolding 67 | ScaffoldingReadMe.txt 68 | 69 | # StyleCop 70 | StyleCopReport.xml 71 | 72 | # Files built by Visual Studio 73 | *_i.c 74 | *_p.c 75 | *_h.h 76 | *.ilk 77 | *.meta 78 | *.obj 79 | *.iobj 80 | *.pch 81 | *.pdb 82 | *.ipdb 83 | *.pgc 84 | *.pgd 85 | *.rsp 86 | *.sbr 87 | *.tlb 88 | *.tli 89 | *.tlh 90 | *.tmp 91 | *.tmp_proj 92 | *_wpftmp.csproj 93 | *.log 94 | *.vspscc 95 | *.vssscc 96 | .builds 97 | *.pidb 98 | *.svclog 99 | *.scc 100 | 101 | # Chutzpah Test files 102 | _Chutzpah* 103 | 104 | # Visual C++ cache files 105 | ipch/ 106 | *.aps 107 | *.ncb 108 | *.opendb 109 | *.opensdf 110 | *.sdf 111 | *.cachefile 112 | *.VC.db 113 | *.VC.VC.opendb 114 | 115 | # Visual Studio profiler 116 | *.psess 117 | *.vsp 118 | *.vspx 119 | *.sap 120 | 121 | # Visual Studio Trace Files 122 | *.e2e 123 | 124 | # TFS 2012 Local Workspace 125 | $tf/ 126 | 127 | # Guidance Automation Toolkit 128 | *.gpState 129 | 130 | # ReSharper is a .NET coding add-in 131 | _ReSharper*/ 132 | *.[Rr]e[Ss]harper 133 | *.DotSettings.user 134 | 135 | # TeamCity is a build add-in 136 | _TeamCity* 137 | 138 | # DotCover is a Code Coverage Tool 139 | *.dotCover 140 | 141 | # AxoCover is a Code Coverage Tool 142 | .axoCover/* 143 | !.axoCover/settings.json 144 | 145 | # Coverlet is a free, cross platform Code Coverage Tool 146 | coverage*.json 147 | coverage*.xml 148 | coverage*.info 149 | 150 | # Visual Studio code coverage results 151 | *.coverage 152 | *.coveragexml 153 | 154 | # NCrunch 155 | _NCrunch_* 156 | .*crunch*.local.xml 157 | nCrunchTemp_* 158 | 159 | # MightyMoose 160 | *.mm.* 161 | AutoTest.Net/ 162 | 163 | # Web workbench (sass) 164 | .sass-cache/ 165 | 166 | # Installshield output folder 167 | [Ee]xpress/ 168 | 169 | # DocProject is a documentation generator add-in 170 | DocProject/buildhelp/ 171 | DocProject/Help/*.HxT 172 | DocProject/Help/*.HxC 173 | DocProject/Help/*.hhc 174 | DocProject/Help/*.hhk 175 | DocProject/Help/*.hhp 176 | DocProject/Help/Html2 177 | DocProject/Help/html 178 | 179 | # Click-Once directory 180 | publish/ 181 | 182 | # Publish Web Output 183 | *.[Pp]ublish.xml 184 | *.azurePubxml 185 | # Note: Comment the next line if you want to checkin your web deploy settings, 186 | # but database connection strings (with potential passwords) will be unencrypted 187 | *.pubxml 188 | *.publishproj 189 | 190 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 191 | # checkin your Azure Web App publish settings, but sensitive information contained 192 | # in these scripts will be unencrypted 193 | PublishScripts/ 194 | 195 | # NuGet Packages 196 | *.nupkg 197 | # NuGet Symbol Packages 198 | *.snupkg 199 | # The packages folder can be ignored because of Package Restore 200 | **/[Pp]ackages/* 201 | # except build/, which is used as an MSBuild target. 202 | !**/[Pp]ackages/build/ 203 | # Uncomment if necessary however generally it will be regenerated when needed 204 | #!**/[Pp]ackages/repositories.config 205 | # NuGet v3's project.json files produces more ignorable files 206 | *.nuget.props 207 | *.nuget.targets 208 | 209 | # Microsoft Azure Build Output 210 | csx/ 211 | *.build.csdef 212 | 213 | # Microsoft Azure Emulator 214 | ecf/ 215 | rcf/ 216 | 217 | # Windows Store app package directories and files 218 | AppPackages/ 219 | BundleArtifacts/ 220 | Package.StoreAssociation.xml 221 | _pkginfo.txt 222 | *.appx 223 | *.appxbundle 224 | *.appxupload 225 | 226 | # Visual Studio cache files 227 | # files ending in .cache can be ignored 228 | *.[Cc]ache 229 | # but keep track of directories ending in .cache 230 | !?*.[Cc]ache/ 231 | 232 | # Others 233 | ClientBin/ 234 | ~$* 235 | *~ 236 | *.dbmdl 237 | *.dbproj.schemaview 238 | *.jfm 239 | *.pfx 240 | *.publishsettings 241 | orleans.codegen.cs 242 | 243 | # Including strong name files can present a security risk 244 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 245 | #*.snk 246 | 247 | # Since there are multiple workflows, uncomment next line to ignore bower_components 248 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 249 | #bower_components/ 250 | 251 | # RIA/Silverlight projects 252 | Generated_Code/ 253 | 254 | # Backup & report files from converting an old project file 255 | # to a newer Visual Studio version. Backup files are not needed, 256 | # because we have git ;-) 257 | _UpgradeReport_Files/ 258 | Backup*/ 259 | UpgradeLog*.XML 260 | UpgradeLog*.htm 261 | ServiceFabricBackup/ 262 | *.rptproj.bak 263 | 264 | # SQL Server files 265 | *.mdf 266 | *.ldf 267 | *.ndf 268 | 269 | # Business Intelligence projects 270 | *.rdl.data 271 | *.bim.layout 272 | *.bim_*.settings 273 | *.rptproj.rsuser 274 | *- [Bb]ackup.rdl 275 | *- [Bb]ackup ([0-9]).rdl 276 | *- [Bb]ackup ([0-9][0-9]).rdl 277 | 278 | # Microsoft Fakes 279 | FakesAssemblies/ 280 | 281 | # GhostDoc plugin setting file 282 | *.GhostDoc.xml 283 | 284 | # Node.js Tools for Visual Studio 285 | .ntvs_analysis.dat 286 | node_modules/ 287 | 288 | # Visual Studio 6 build log 289 | *.plg 290 | 291 | # Visual Studio 6 workspace options file 292 | *.opt 293 | 294 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 295 | *.vbw 296 | 297 | # Visual Studio LightSwitch build output 298 | **/*.HTMLClient/GeneratedArtifacts 299 | **/*.DesktopClient/GeneratedArtifacts 300 | **/*.DesktopClient/ModelManifest.xml 301 | **/*.Server/GeneratedArtifacts 302 | **/*.Server/ModelManifest.xml 303 | _Pvt_Extensions 304 | 305 | # Paket dependency manager 306 | .paket/paket.exe 307 | paket-files/ 308 | 309 | # FAKE - F# Make 310 | .fake/ 311 | 312 | # CodeRush personal settings 313 | .cr/personal 314 | 315 | # Python Tools for Visual Studio (PTVS) 316 | __pycache__/ 317 | *.pyc 318 | 319 | # Cake - Uncomment if you are using it 320 | # tools/** 321 | # !tools/packages.config 322 | 323 | # Tabs Studio 324 | *.tss 325 | 326 | # Telerik's JustMock configuration file 327 | *.jmconfig 328 | 329 | # BizTalk build output 330 | *.btp.cs 331 | *.btm.cs 332 | *.odx.cs 333 | *.xsd.cs 334 | 335 | # OpenCover UI analysis results 336 | OpenCover/ 337 | 338 | # Azure Stream Analytics local run output 339 | ASALocalRun/ 340 | 341 | # MSBuild Binary and Structured Log 342 | *.binlog 343 | 344 | # NVidia Nsight GPU debugger configuration file 345 | *.nvuser 346 | 347 | # MFractors (Xamarin productivity tool) working folder 348 | .mfractor/ 349 | 350 | # Local History for Visual Studio 351 | .localhistory/ 352 | 353 | # BeatPulse healthcheck temp database 354 | healthchecksdb 355 | 356 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 357 | MigrationBackup/ 358 | 359 | # Ionide (cross platform F# VS Code tools) working folder 360 | .ionide/ 361 | 362 | # Fody - auto-generated XML schema 363 | FodyWeavers.xsd 364 | 365 | # build 366 | build 367 | monotonic_align/core.c 368 | *.o 369 | *.so 370 | *.dll 371 | 372 | # data 373 | /config.json 374 | /*.pth 375 | *.wav 376 | /monotonic_align/monotonic_align 377 | /resources 378 | /MoeGoe.spec 379 | /dist/MoeGoe 380 | /dist 381 | 382 | # MacOS 383 | .DS_Store 384 | -------------------------------------------------------------------------------- /.idea/PythonPlugins.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 14 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 65 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 8 | 13 | 14 | 18 | 19 | 21 | 22 | 23 | 24 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 1669552567504 79 | 92 | 93 | 1675307452793 94 | 99 | 100 | 1675307818997 101 | 106 | 109 | 110 | 112 | 113 | 134 | 135 | 136 | 137 | 138 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 CjangCjengh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MoeGoe.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from scipy.io.wavfile import write 4 | 5 | from plugins.RandomStr.RandomStr import random_str 6 | from trans import translate 7 | from mel_processing import spectrogram_torch 8 | from text import text_to_sequence, _clean_text 9 | from models import SynthesizerTrn 10 | import utils 11 | import commons 12 | import sys 13 | import re 14 | from torch import no_grad, LongTensor 15 | import logging 16 | 17 | logging.getLogger('numba').setLevel(logging.WARNING) 18 | 19 | 20 | def ex_print(text, escape=False): 21 | if escape: 22 | print(text.encode('unicode_escape').decode()) 23 | else: 24 | print(text) 25 | 26 | 27 | def get_text(text, hps, cleaned=False): 28 | if cleaned: 29 | text_norm = text_to_sequence(text, hps.symbols, []) 30 | else: 31 | text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners) 32 | if hps.data.add_blank: 33 | text_norm = commons.intersperse(text_norm, 0) 34 | text_norm = LongTensor(text_norm) 35 | return text_norm 36 | 37 | 38 | def ask_if_continue(): 39 | while True: 40 | answer = input('Continue? (y/n): ') 41 | if answer == 'y': 42 | break 43 | elif answer == 'n': 44 | sys.exit(0) 45 | 46 | 47 | def print_speakers(speakers, escape=False): 48 | print('ID\tSpeaker') 49 | for id, name in enumerate(speakers): 50 | ex_print(str(id) + '\t' + name, escape) 51 | 52 | 53 | def get_speaker_id(message): 54 | '''speaker_id = input(message) 55 | try: 56 | speaker_id = int(speaker_id) 57 | except: 58 | print(str(speaker_id) + ' is not a valid ID!') 59 | sys.exit(1) 60 | return speaker_id''' 61 | return 0 62 | 63 | 64 | def get_label_value(text, label, default, warning_name='value'): 65 | value = re.search(rf'\[{label}=(.+?)\]', text) 66 | if value: 67 | try: 68 | text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1) 69 | value = float(value.group(1)) 70 | except: 71 | print(f'Invalid {warning_name}!') 72 | sys.exit(1) 73 | else: 74 | value = default 75 | return value, text 76 | 77 | 78 | def get_label(text, label): 79 | if f'[{label}]' in text: 80 | return True, text.replace(f'[{label}]', '') 81 | else: 82 | return False, text 83 | 84 | def voiceGenerate(tex,out,spealerIDDD=0,modelSelect=0): 85 | Path = sys.argv[0][:-23] 86 | text=tex 87 | out_path=out 88 | speakeriddd=spealerIDDD 89 | if '--escape' in sys.argv: 90 | escape = True 91 | else: 92 | escape = False 93 | 94 | #model = 'voiceModel\\1374_epochs.pth'#input('Path of a VITS model: ') 95 | #config ='voiceModel\\config.json'#input('Path of a config file: ') 96 | if modelSelect==1: 97 | model = 'voiceModel/YUUKA/G.pth' 98 | config = 'voiceModel/YUUKA/config.json' 99 | speakeriddd=0 100 | else: 101 | model = 'voiceModel/1374_epochs.pth' 102 | config = 'voiceModel/config.json' 103 | 104 | hps_ms = utils.get_hparams_from_file(config) 105 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0 106 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0 107 | speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0'] 108 | use_f0 = hps_ms.data.use_f0 if 'use_f0' in hps_ms.data.keys() else False 109 | emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False 110 | 111 | net_g_ms = SynthesizerTrn( 112 | n_symbols, 113 | hps_ms.data.filter_length // 2 + 1, 114 | hps_ms.train.segment_size // hps_ms.data.hop_length, 115 | n_speakers=n_speakers, 116 | emotion_embedding=emotion_embedding, 117 | **hps_ms.model) 118 | _ = net_g_ms.eval() 119 | utils.load_checkpoint(model, net_g_ms) 120 | 121 | while True: 122 | choice = 't' # input('TTS or VC? (t/v):') 123 | if choice == 't': 124 | #text = input('Text to read: ') 125 | if text == '[ADVANCED]': 126 | text = input('Raw text:') 127 | print('Cleaned text is:') 128 | ex_print(_clean_text( 129 | text, hps_ms.data.text_cleaners), escape) 130 | continue 131 | 132 | length_scale, text = get_label_value( 133 | text, 'LENGTH', 1, 'length scale') 134 | noise_scale, text = get_label_value( 135 | text, 'NOISE', 0.667, 'noise scale') 136 | noise_scale_w, text = get_label_value( 137 | text, 'NOISEW', 0.8, 'deviation of noise') 138 | cleaned, text = get_label(text, 'CLEANED') 139 | 140 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 141 | 142 | #print_speakers(speakers, escape) 143 | time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 144 | print(time + '| 正在使用语音模型:'+str(speakeriddd)+' ......生成中'+' | 文本:'+tex) 145 | speaker_id = speakeriddd 146 | 147 | with no_grad(): 148 | x_tst = stn_tst.unsqueeze(0) 149 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 150 | sid = LongTensor([speaker_id]) 151 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 152 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][ 153 | 0, 0].data.cpu().float().numpy() 154 | 155 | elif choice == 'v': 156 | audio, out_path = voice_conversion() 157 | 158 | write(out_path, hps_ms.data.sampling_rate, audio) 159 | time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 160 | print(time + '| Successfully saved!') 161 | break 162 | 163 | def voice_conversion(sourcepath,speaker=0): 164 | if '--escape' in sys.argv: 165 | escape = True 166 | else: 167 | escape = False 168 | 169 | model = 'voiceModel\\1374_epochs.pth'#input('Path of a VITS model: ') 170 | config ='voiceModel\\config.json'#input('Path of a config file: ') 171 | 172 | hps_ms = utils.get_hparams_from_file(config) 173 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0 174 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0 175 | speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0'] 176 | use_f0 = hps_ms.data.use_f0 if 'use_f0' in hps_ms.data.keys() else False 177 | emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False 178 | 179 | net_g_ms = SynthesizerTrn( 180 | n_symbols, 181 | hps_ms.data.filter_length // 2 + 1, 182 | hps_ms.train.segment_size // hps_ms.data.hop_length, 183 | n_speakers=n_speakers, 184 | emotion_embedding=emotion_embedding, 185 | **hps_ms.model) 186 | _ = net_g_ms.eval() 187 | utils.load_checkpoint(model, net_g_ms) 188 | 189 | audio_path = sourcepath 190 | audio = utils.load_audio_to_torch( 191 | audio_path, hps_ms.data.sampling_rate) 192 | 193 | originnal_id = speaker 194 | target_id = 3 195 | out_path = 'plugins\\voices\\sing\\out.wav' 196 | 197 | y = audio.unsqueeze(0) 198 | 199 | spec = spectrogram_torch(y, hps_ms.data.filter_length, 200 | hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length, 201 | center=False) 202 | spec_lengths = LongTensor([spec.size(-1)]) 203 | sid_src = LongTensor([originnal_id]) 204 | 205 | with no_grad(): 206 | sid_tgt = LongTensor([target_id]) 207 | audio = net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[ 208 | 0][0, 0].data.cpu().float().numpy() 209 | write(out_path, hps_ms.data.sampling_rate, audio) 210 | print('Successfully saved!') 211 | return out_path 212 | 213 | 214 | if __name__ == '__main__': 215 | #voice_conversion("plugins/voices/sing/rest.wav") 216 | voiceGenerate('先生,ちょっとお時間..いただけますか?','voiceModel/YUUKA/1.wav',0,1) 217 | '''ranpath = random_str() 218 | Path=sys.argv[0][:-23] 219 | print(Path) 220 | out = Path+'PythonPlugins\\plugins\\voices\\' + ranpath + '.wav' 221 | tex = '[JA]' + translate('测试语音.....') + '[JA]' 222 | voiceGenerate(tex, out)''' 223 | '''if '--escape' in sys.argv: 224 | escape = True 225 | else: 226 | escape = False 227 | 228 | model = input('Path of a VITS model: ') 229 | config = input('Path of a config file: ') 230 | 231 | hps_ms = utils.get_hparams_from_file(config) 232 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0 233 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0 234 | speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0'] 235 | use_f0 = hps_ms.data.use_f0 if 'use_f0' in hps_ms.data.keys() else False 236 | emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False 237 | 238 | net_g_ms = SynthesizerTrn( 239 | n_symbols, 240 | hps_ms.data.filter_length // 2 + 1, 241 | hps_ms.train.segment_size // hps_ms.data.hop_length, 242 | n_speakers=n_speakers, 243 | emotion_embedding=emotion_embedding, 244 | **hps_ms.model) 245 | _ = net_g_ms.eval() 246 | utils.load_checkpoint(model, net_g_ms) 247 | 248 | def voice_conversion(): 249 | audio_path = input('Path of an audio file to convert:\n') 250 | print_speakers(speakers) 251 | audio = utils.load_audio_to_torch( 252 | audio_path, hps_ms.data.sampling_rate) 253 | 254 | originnal_id = get_speaker_id('Original speaker ID: ') 255 | target_id = get_speaker_id('Target speaker ID: ') 256 | out_path = input('Path to save: ') 257 | 258 | y = audio.unsqueeze(0) 259 | 260 | spec = spectrogram_torch(y, hps_ms.data.filter_length, 261 | hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length, 262 | center=False) 263 | spec_lengths = LongTensor([spec.size(-1)]) 264 | sid_src = LongTensor([originnal_id]) 265 | 266 | with no_grad(): 267 | sid_tgt = LongTensor([target_id]) 268 | audio = net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[ 269 | 0][0, 0].data.cpu().float().numpy() 270 | return audio, out_path 271 | 272 | if n_symbols != 0: 273 | if not emotion_embedding: 274 | while True: 275 | choice = input('TTS or VC? (t/v):') 276 | if choice == 't': 277 | text = input('Text to read: ') 278 | if text == '[ADVANCED]': 279 | text = input('Raw text:') 280 | print('Cleaned text is:') 281 | ex_print(_clean_text( 282 | text, hps_ms.data.text_cleaners), escape) 283 | continue 284 | 285 | length_scale, text = get_label_value( 286 | text, 'LENGTH', 1, 'length scale') 287 | noise_scale, text = get_label_value( 288 | text, 'NOISE', 0.667, 'noise scale') 289 | noise_scale_w, text = get_label_value( 290 | text, 'NOISEW', 0.8, 'deviation of noise') 291 | cleaned, text = get_label(text, 'CLEANED') 292 | 293 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 294 | 295 | print_speakers(speakers, escape) 296 | speaker_id = get_speaker_id('Speaker ID: ') 297 | out_path = input('Path to save: ') 298 | 299 | with no_grad(): 300 | x_tst = stn_tst.unsqueeze(0) 301 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 302 | sid = LongTensor([speaker_id]) 303 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 304 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy() 305 | 306 | elif choice == 'v': 307 | audio, out_path = voice_conversion() 308 | 309 | write(out_path, hps_ms.data.sampling_rate, audio) 310 | print('Successfully saved!') 311 | ask_if_continue() 312 | else: 313 | import os 314 | import librosa 315 | import numpy as np 316 | from torch import FloatTensor 317 | import audonnx 318 | w2v2_folder = input('Path of a w2v2 dimensional emotion model: ') 319 | w2v2_model = audonnx.load(os.path.dirname(w2v2_folder)) 320 | while True: 321 | choice = input('TTS or VC? (t/v):') 322 | if choice == 't': 323 | text = input('Text to read: ') 324 | if text == '[ADVANCED]': 325 | text = input('Raw text:') 326 | print('Cleaned text is:') 327 | ex_print(_clean_text( 328 | text, hps_ms.data.text_cleaners), escape) 329 | continue 330 | 331 | length_scale, text = get_label_value( 332 | text, 'LENGTH', 1, 'length scale') 333 | noise_scale, text = get_label_value( 334 | text, 'NOISE', 0.667, 'noise scale') 335 | noise_scale_w, text = get_label_value( 336 | text, 'NOISEW', 0.8, 'deviation of noise') 337 | cleaned, text = get_label(text, 'CLEANED') 338 | 339 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 340 | 341 | print_speakers(speakers, escape) 342 | speaker_id = get_speaker_id('Speaker ID: ') 343 | 344 | emotion_reference = input('Path of an emotion reference: ') 345 | if emotion_reference.endswith('.npy'): 346 | emotion = np.load(emotion_reference) 347 | emotion = FloatTensor(emotion).unsqueeze(0) 348 | else: 349 | audio16000, sampling_rate = librosa.load( 350 | emotion_reference, sr=16000, mono=True) 351 | emotion = w2v2_model(audio16000, sampling_rate)[ 352 | 'hidden_states'] 353 | emotion_reference = re.sub( 354 | r'\..*$', '', emotion_reference) 355 | np.save(emotion_reference, emotion.squeeze(0)) 356 | emotion = FloatTensor(emotion) 357 | 358 | out_path = input('Path to save: ') 359 | 360 | with no_grad(): 361 | x_tst = stn_tst.unsqueeze(0) 362 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 363 | sid = LongTensor([speaker_id]) 364 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, 365 | length_scale=length_scale, emotion_embedding=emotion)[0][0, 0].data.cpu().float().numpy() 366 | 367 | elif choice == 'v': 368 | audio, out_path = voice_conversion() 369 | 370 | write(out_path, hps_ms.data.sampling_rate, audio) 371 | print('Successfully saved!') 372 | ask_if_continue() 373 | else: 374 | model = input('Path of a hubert-soft model: ') 375 | from hubert_model import hubert_soft 376 | hubert = hubert_soft(model) 377 | 378 | while True: 379 | audio_path = input('Path of an audio file to convert:\n') 380 | 381 | if audio_path != '[VC]': 382 | import librosa 383 | if use_f0: 384 | audio, sampling_rate = librosa.load( 385 | audio_path, sr=hps_ms.data.sampling_rate, mono=True) 386 | audio16000 = librosa.resample( 387 | audio, orig_sr=sampling_rate, target_sr=16000) 388 | else: 389 | audio16000, sampling_rate = librosa.load( 390 | audio_path, sr=16000, mono=True) 391 | 392 | target_id = get_speaker_id('Target speaker ID: ') 393 | out_path = input('Path to save: ') 394 | length_scale, out_path = get_label_value( 395 | out_path, 'LENGTH', 1, 'length scale') 396 | noise_scale, out_path = get_label_value( 397 | out_path, 'NOISE', 0.1, 'noise scale') 398 | noise_scale_w, out_path = get_label_value( 399 | out_path, 'NOISEW', 0.1, 'deviation of noise') 400 | 401 | from torch import inference_mode, FloatTensor 402 | import numpy as np 403 | with inference_mode(): 404 | units = hubert.units(FloatTensor(audio16000).unsqueeze( 405 | 0).unsqueeze(0)).squeeze(0).numpy() 406 | if use_f0: 407 | f0_scale, out_path = get_label_value( 408 | out_path, 'F0', 1, 'f0 scale') 409 | f0 = librosa.pyin(audio, sr=sampling_rate, 410 | fmin=librosa.note_to_hz('C0'), 411 | fmax=librosa.note_to_hz('C7'), 412 | frame_length=1780)[0] 413 | target_length = len(units[:, 0]) 414 | f0 = np.nan_to_num(np.interp(np.arange(0, len(f0)*target_length, len(f0))/target_length, 415 | np.arange(0, len(f0)), f0)) * f0_scale 416 | units[:, 0] = f0 / 10 417 | 418 | stn_tst = FloatTensor(units) 419 | with no_grad(): 420 | x_tst = stn_tst.unsqueeze(0) 421 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 422 | sid = LongTensor([target_id]) 423 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 424 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy() 425 | 426 | else: 427 | audio, out_path = voice_conversion() 428 | 429 | write(out_path, hps_ms.data.sampling_rate, audio) 430 | print('Successfully saved!') 431 | ask_if_continue()''' 432 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 此仓库不再更新,全部功能已转移至[Manyana](https://github.com/avilliai/Manyana) 2 | 3 | # 不 要 克 隆 仓 库 4 | - 克隆仓库用不了,几个月没push过。用右上角release。 5 | - 看到黄字推荐 pip install uvicorn时,忽略即可,安装将导致无法正常运行程序。 6 | # 更新 7 | - 支持导入模型,XX说 8 | - 设置语音回复,可以参考[wReply](https://github.com/avilliai/wReply) 9 | 10 | # Links 11 | - 项目最核心的部分是CjangCjengh佬的[MoeGoe](https://github.com/CjangCjengh/MoeGoe) 12 | - 基于[Yiri-mirai](https://github.com/YiriMiraiProject/YiriMirai)实现 13 | - python版本推荐3.9 不推荐python3.10 14 | - 请确保已安装[mirai-api-http](https://github.com/project-mirai/mirai-api-http) 15 | 16 | 17 | # 可能的问题 18 | 19 | 1 20 | 21 | FileNotFoundError[Errno 2]: No such............ 22 | 23 | 解决:out = sys.argv[0][:-20] + 'PythonPlugins\\plugins\\voices\\' + ranpath + '.wav' 24 | 25 | 替换成 26 | 27 | out='绝对路径\\PythonPlugins\\plugins\\voices\\' + ranpath + '.wav' 28 | 29 | 2 30 | 31 | ModuleNotFoundError: no module named 'XXX' 32 | 33 | 解决:缺包,执行如下命令 34 | 35 | pip install XXX 36 | 37 | 3 38 | 39 | TyreError: run() got an unexpected keyword argument 'debug' 40 | 41 | 解决:python版本不对,推荐换3.9 42 | pip uninstall uvicorn 43 | 44 | 4 45 | 46 | ConnectionRefusedError: [WinError 1225] 远程网络计算机拒绝连接 47 | 48 | 解决:bot.py的port,key,botqq与mirai-api-http配置不一致,修改对应即可 49 | 5 50 | AttributeError:........ 51 | 推测是和现有的包有冲突,不知道。换个解释器试试。实在不行下载site-package并解压替换本地的site-package。 52 | 53 | # How to use 54 | - 下载Release,不要克隆仓库 55 | 56 | - 解压,安装压缩包里的python(记得勾选add to path) 57 | 58 | - 进入bot.py所在目录,打开cmd运行如下命令 59 | 60 | pip install -r requirements.txt 61 | 62 | - 修改config.json并运行bot.py(修改botqq、port、key与你的mirai-api-http需要保持一致,botName和master是可选内容) 63 | 64 | 发送 voice 查看帮助菜单 65 | 66 | 发送 sp 查看当前可用的所有角色 67 | 68 | # 导入更多模型(可选) 69 | 70 | 在voiceModel文件夹下新建文件夹,把.pth(模型文件)和config.json(配置文件)放进去 71 | 72 | 下载模型 73 | 74 | 下载模型 75 | 76 | [碧蓝档案主题模型](https://www.bilibili.com/video/BV1wG4y1M7SL/?spm_id_from=333.999.0.0) 77 | 78 | [CjangCjengh的模型仓库](https://github.com/CjangCjengh/TTSModels) 79 | 80 | [**已修改配置文件的模型仓库**](https://pan.baidu.com/s/1bEbDMv0Ysj0cRmwHi6WAyA?pwd=9rmj),下载后放在项目对应文件夹下即可。 81 | 82 | 83 | 模型命名规则(重要): 84 | 多语种模型:后缀名前面加一个m,如yuuka.pth支持中日双语,则改成yuukam.pth 85 | 单语种模型:不用改名 86 | 87 | 配置文件修改: 88 | 模型来自saya佬:打开config.json修改speakers,把一大串speakers修改为一个(名称随意) | 中文名的speaker需要转unicode | https://www.xgjzx.cn/chinese 89 | 模型来自CjangCjengh:直接用 90 | 91 | # 模型名称与config.json文件的修改(导入模型的详解) 92 | 93 | **以碧蓝档案的模型库为例** 94 | 95 | *碧蓝档案模型库的模型大都是单角色,它的配置文件是多模型通用的,但为了更方便地使用,我们需要修改config.json* 96 | 97 | **在[这个网站](https://www.xgjzx.cn/chinese)把角色的名称转成对应的unicode编码** 98 | 99 | ![image](https://user-images.githubusercontent.com/99066610/223444528-6095f225-f9f6-4154-af3b-ecfd120fd563.png) 100 | 101 | 102 | 打开config.json文件,把上一步得到的角色名称的unicode码填入speakers项。 103 | 104 | **修改前:"speakers":["这里是一大堆东西,全删掉"]** 105 | 106 | ![image](https://user-images.githubusercontent.com/99066610/223444630-8c5e2a02-df4d-488a-954d-a68c92d3e491.png) 107 | 108 | 109 | **修改后:"speakers": ["\u963f\u55b5\u55b5"]** 110 | 111 | ![image](https://user-images.githubusercontent.com/99066610/223444725-4a6fe6f6-9225-4cd4-aa1b-7277b92d89f9.png) 112 | 113 | 114 | **需要注意** 115 | 116 | 如果模型支持中日双语则需要把模型名从XXX.pth改成XXXm.pth 117 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avilliai/MoeGoe-Yirimirai/90273a09dc6a93947506e4d7782d10d8f86b5b9c/__init__.py -------------------------------------------------------------------------------- /attentions.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import commons 7 | from modules import LayerNorm 8 | 9 | 10 | class Encoder(nn.Module): 11 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs): 12 | super().__init__() 13 | self.hidden_channels = hidden_channels 14 | self.filter_channels = filter_channels 15 | self.n_heads = n_heads 16 | self.n_layers = n_layers 17 | self.kernel_size = kernel_size 18 | self.p_dropout = p_dropout 19 | self.window_size = window_size 20 | 21 | self.drop = nn.Dropout(p_dropout) 22 | self.attn_layers = nn.ModuleList() 23 | self.norm_layers_1 = nn.ModuleList() 24 | self.ffn_layers = nn.ModuleList() 25 | self.norm_layers_2 = nn.ModuleList() 26 | for i in range(self.n_layers): 27 | self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size)) 28 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 29 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)) 30 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 31 | 32 | def forward(self, x, x_mask): 33 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 34 | x = x * x_mask 35 | for i in range(self.n_layers): 36 | y = self.attn_layers[i](x, x, attn_mask) 37 | y = self.drop(y) 38 | x = self.norm_layers_1[i](x + y) 39 | 40 | y = self.ffn_layers[i](x, x_mask) 41 | y = self.drop(y) 42 | x = self.norm_layers_2[i](x + y) 43 | x = x * x_mask 44 | return x 45 | 46 | 47 | class Decoder(nn.Module): 48 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs): 49 | super().__init__() 50 | self.hidden_channels = hidden_channels 51 | self.filter_channels = filter_channels 52 | self.n_heads = n_heads 53 | self.n_layers = n_layers 54 | self.kernel_size = kernel_size 55 | self.p_dropout = p_dropout 56 | self.proximal_bias = proximal_bias 57 | self.proximal_init = proximal_init 58 | 59 | self.drop = nn.Dropout(p_dropout) 60 | self.self_attn_layers = nn.ModuleList() 61 | self.norm_layers_0 = nn.ModuleList() 62 | self.encdec_attn_layers = nn.ModuleList() 63 | self.norm_layers_1 = nn.ModuleList() 64 | self.ffn_layers = nn.ModuleList() 65 | self.norm_layers_2 = nn.ModuleList() 66 | for i in range(self.n_layers): 67 | self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)) 68 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 69 | self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)) 70 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 71 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) 72 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 73 | 74 | def forward(self, x, x_mask, h, h_mask): 75 | """ 76 | x: decoder input 77 | h: encoder output 78 | """ 79 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) 80 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 81 | x = x * x_mask 82 | for i in range(self.n_layers): 83 | y = self.self_attn_layers[i](x, x, self_attn_mask) 84 | y = self.drop(y) 85 | x = self.norm_layers_0[i](x + y) 86 | 87 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) 88 | y = self.drop(y) 89 | x = self.norm_layers_1[i](x + y) 90 | 91 | y = self.ffn_layers[i](x, x_mask) 92 | y = self.drop(y) 93 | x = self.norm_layers_2[i](x + y) 94 | x = x * x_mask 95 | return x 96 | 97 | 98 | class MultiHeadAttention(nn.Module): 99 | def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False): 100 | super().__init__() 101 | assert channels % n_heads == 0 102 | 103 | self.channels = channels 104 | self.out_channels = out_channels 105 | self.n_heads = n_heads 106 | self.p_dropout = p_dropout 107 | self.window_size = window_size 108 | self.heads_share = heads_share 109 | self.block_length = block_length 110 | self.proximal_bias = proximal_bias 111 | self.proximal_init = proximal_init 112 | self.attn = None 113 | 114 | self.k_channels = channels // n_heads 115 | self.conv_q = nn.Conv1d(channels, channels, 1) 116 | self.conv_k = nn.Conv1d(channels, channels, 1) 117 | self.conv_v = nn.Conv1d(channels, channels, 1) 118 | self.conv_o = nn.Conv1d(channels, out_channels, 1) 119 | self.drop = nn.Dropout(p_dropout) 120 | 121 | if window_size is not None: 122 | n_heads_rel = 1 if heads_share else n_heads 123 | rel_stddev = self.k_channels**-0.5 124 | self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 125 | self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 126 | 127 | nn.init.xavier_uniform_(self.conv_q.weight) 128 | nn.init.xavier_uniform_(self.conv_k.weight) 129 | nn.init.xavier_uniform_(self.conv_v.weight) 130 | if proximal_init: 131 | with torch.no_grad(): 132 | self.conv_k.weight.copy_(self.conv_q.weight) 133 | self.conv_k.bias.copy_(self.conv_q.bias) 134 | 135 | def forward(self, x, c, attn_mask=None): 136 | q = self.conv_q(x) 137 | k = self.conv_k(c) 138 | v = self.conv_v(c) 139 | 140 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 141 | 142 | x = self.conv_o(x) 143 | return x 144 | 145 | def attention(self, query, key, value, mask=None): 146 | # reshape [b, d, t] -> [b, n_h, t, d_k] 147 | b, d, t_s, t_t = (*key.size(), query.size(2)) 148 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 149 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 150 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 151 | 152 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) 153 | if self.window_size is not None: 154 | assert t_s == t_t, "Relative attention is only available for self-attention." 155 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 156 | rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings) 157 | scores_local = self._relative_position_to_absolute_position(rel_logits) 158 | scores = scores + scores_local 159 | if self.proximal_bias: 160 | assert t_s == t_t, "Proximal bias is only available for self-attention." 161 | scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) 162 | if mask is not None: 163 | scores = scores.masked_fill(mask == 0, -1e4) 164 | if self.block_length is not None: 165 | assert t_s == t_t, "Local attention is only available for self-attention." 166 | block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) 167 | scores = scores.masked_fill(block_mask == 0, -1e4) 168 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 169 | p_attn = self.drop(p_attn) 170 | output = torch.matmul(p_attn, value) 171 | if self.window_size is not None: 172 | relative_weights = self._absolute_position_to_relative_position(p_attn) 173 | value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) 174 | output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) 175 | output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] 176 | return output, p_attn 177 | 178 | def _matmul_with_relative_values(self, x, y): 179 | """ 180 | x: [b, h, l, m] 181 | y: [h or 1, m, d] 182 | ret: [b, h, l, d] 183 | """ 184 | ret = torch.matmul(x, y.unsqueeze(0)) 185 | return ret 186 | 187 | def _matmul_with_relative_keys(self, x, y): 188 | """ 189 | x: [b, h, l, d] 190 | y: [h or 1, m, d] 191 | ret: [b, h, l, m] 192 | """ 193 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 194 | return ret 195 | 196 | def _get_relative_embeddings(self, relative_embeddings, length): 197 | max_relative_position = 2 * self.window_size + 1 198 | # Pad first before slice to avoid using cond ops. 199 | pad_length = max(length - (self.window_size + 1), 0) 200 | slice_start_position = max((self.window_size + 1) - length, 0) 201 | slice_end_position = slice_start_position + 2 * length - 1 202 | if pad_length > 0: 203 | padded_relative_embeddings = F.pad( 204 | relative_embeddings, 205 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) 206 | else: 207 | padded_relative_embeddings = relative_embeddings 208 | used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position] 209 | return used_relative_embeddings 210 | 211 | def _relative_position_to_absolute_position(self, x): 212 | """ 213 | x: [b, h, l, 2*l-1] 214 | ret: [b, h, l, l] 215 | """ 216 | batch, heads, length, _ = x.size() 217 | # Concat columns of pad to shift from relative to absolute indexing. 218 | x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]])) 219 | 220 | # Concat extra elements so to add up to shape (len+1, 2*len-1). 221 | x_flat = x.view([batch, heads, length * 2 * length]) 222 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]])) 223 | 224 | # Reshape and slice out the padded elements. 225 | x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:] 226 | return x_final 227 | 228 | def _absolute_position_to_relative_position(self, x): 229 | """ 230 | x: [b, h, l, l] 231 | ret: [b, h, l, 2*l-1] 232 | """ 233 | batch, heads, length, _ = x.size() 234 | # padd along column 235 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]])) 236 | x_flat = x.view([batch, heads, length**2 + length*(length -1)]) 237 | # add 0's in the beginning that will skew the elements after reshape 238 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 239 | x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:] 240 | return x_final 241 | 242 | def _attention_bias_proximal(self, length): 243 | """Bias for self-attention to encourage attention to close positions. 244 | Args: 245 | length: an integer scalar. 246 | Returns: 247 | a Tensor with shape [1, 1, length, length] 248 | """ 249 | r = torch.arange(length, dtype=torch.float32) 250 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 251 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 252 | 253 | 254 | class FFN(nn.Module): 255 | def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False): 256 | super().__init__() 257 | self.in_channels = in_channels 258 | self.out_channels = out_channels 259 | self.filter_channels = filter_channels 260 | self.kernel_size = kernel_size 261 | self.p_dropout = p_dropout 262 | self.activation = activation 263 | self.causal = causal 264 | 265 | if causal: 266 | self.padding = self._causal_padding 267 | else: 268 | self.padding = self._same_padding 269 | 270 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) 271 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) 272 | self.drop = nn.Dropout(p_dropout) 273 | 274 | def forward(self, x, x_mask): 275 | x = self.conv_1(self.padding(x * x_mask)) 276 | if self.activation == "gelu": 277 | x = x * torch.sigmoid(1.702 * x) 278 | else: 279 | x = torch.relu(x) 280 | x = self.drop(x) 281 | x = self.conv_2(self.padding(x * x_mask)) 282 | return x * x_mask 283 | 284 | def _causal_padding(self, x): 285 | if self.kernel_size == 1: 286 | return x 287 | pad_l = self.kernel_size - 1 288 | pad_r = 0 289 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 290 | x = F.pad(x, commons.convert_pad_shape(padding)) 291 | return x 292 | 293 | def _same_padding(self, x): 294 | if self.kernel_size == 1: 295 | return x 296 | pad_l = (self.kernel_size - 1) // 2 297 | pad_r = self.kernel_size // 2 298 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 299 | x = F.pad(x, commons.convert_pad_shape(padding)) 300 | return x 301 | -------------------------------------------------------------------------------- /bot.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | from mirai import Voice,Image 4 | from mirai import Mirai, WebSocketAdapter, FriendMessage, GroupMessage, At, Plain 5 | 6 | 7 | import sys 8 | 9 | from MoeGoe import voiceGenerate 10 | from plugins import voicePart 11 | from plugins.RandomStr.RandomStr import random_str 12 | from plugins.picGet import pic 13 | from trans import translate 14 | 15 | if __name__ == '__main__': 16 | 17 | qq=114514#这一行填写你机器人的QQ 18 | bot = Mirai(qq, adapter=WebSocketAdapter( 19 | verify_key='1234567890', host='localhost', port=23456 20 | )) 21 | aimFriend = 1840094972 22 | aimGroup = 699455559 23 | statusPath = 1 24 | model = 0 25 | lang = '日语' 26 | @bot.on(FriendMessage) 27 | async def yuYinMode(event: FriendMessage): 28 | if str(event.message_chain).startswith('发送'): 29 | sa = str(event.message_chain)[2:] 30 | ranpath = random_str() 31 | out = sys.argv[0][:-20] + 'PythonPlugins\\plugins\\voices\\' + ranpath + '.wav' 32 | 33 | if int(statusPath)==0: 34 | if lang=='中文': 35 | tex = '[ZH]' + sa + '[ZH]' 36 | voiceGenerate(tex, out,model) 37 | await bot.send_friend_message(int(aimFriend),Voice(path=out)) 38 | if lang=='日语': 39 | tex = '[JA]' + translate(sa) + '[JA]' 40 | voiceGenerate(tex, out,model) 41 | await bot.send_friend_message(int(aimFriend),Voice(path=out)) 42 | elif int(statusPath)==1: 43 | if lang=='中文': 44 | tex = '[ZH]' + sa + '[ZH]' 45 | voiceGenerate(tex, out,model) 46 | await bot.send_group_message(int(aimGroup),Voice(path=out)) 47 | if lang=='日语': 48 | tex = '[JA]' + translate(sa) + '[JA]' 49 | voiceGenerate(tex, out,model) 50 | await bot.send_group_message(int(aimGroup),Voice(path=out)) 51 | #图片模块 52 | @bot.on(GroupMessage) 53 | async def handle_group_message(event: GroupMessage): 54 | if '/pic' in str(event.message_chain): 55 | picNum=int((str(event.message_chain))[4:]) 56 | if picNum<10 and picNum>-1: 57 | for i in range(picNum): 58 | a = pic() 59 | await bot.send(event, Image(path=a)) 60 | elif picNum=='': 61 | a = pic() 62 | await bot.send(event, Image(path=a)) 63 | else: 64 | await bot.send(event,"可以发点正常的数字吗") 65 | 66 | 67 | 68 | 69 | 70 | #失效 71 | '''@bot.on(FriendMessage) 72 | async def handle_group_message(event: FriendMessage): 73 | if str(event.message_chain).startswith('#说'): 74 | if len(str(event.message_chain)) < 280: 75 | ranpath = random_str() 76 | out = sys.argv[0][:-20] + 'PythonPlugins\\plugins\\voices\\' + ranpath + '.wav' 77 | tex = '[JA]' + translate((str(event.message_chain))[1:]) + '[JA]' 78 | voiceGenerate(tex, out) 79 | await bot.send(event, Voice(path=out)) 80 | else: 81 | ranpath = random_str() 82 | out = sys.argv[0][:-20] + 'PythonPlugins\\plugins\\voices\\' + ranpath + '.wav' 83 | tex = '[JA]' + translate('不行,太长了哦.....') + '[JA]' 84 | voiceGenerate(tex, out) 85 | await bot(event, Voice(path=out))''' 86 | 87 | #连接群 88 | @bot.on(FriendMessage) 89 | async def on_friend_message(event: FriendMessage): 90 | if str(event.message_chain).startswith('连接群'): 91 | sa = str(event.message_chain).split('#') 92 | global aimGroup 93 | aimGroup=int(sa[1]) 94 | global statusPath 95 | statusPath = 1 96 | await bot.send(event, '已切换为群聊'+sa[1]) 97 | 98 | #连接人 99 | @bot.on(FriendMessage) 100 | async def on_friend_message(event: FriendMessage): 101 | if str(event.message_chain).startswith('连接对象'): 102 | sa = str(event.message_chain).split('#') 103 | global aimFriend 104 | aimFriend=sa[1] 105 | global statusPath 106 | statusPath=0 107 | await bot.send(event, '已切换为私聊对象'+sa[1]) 108 | 109 | #语言切换 110 | @bot.on(FriendMessage) 111 | async def Lanconfig(event: FriendMessage): 112 | if str(event.message_chain).startswith('切换'): 113 | sa = str(event.message_chain)[2:] 114 | global lang 115 | if sa=='中文': 116 | lang=sa 117 | await bot.send(event, '已切换,当前使用语言'+sa) 118 | elif sa=='日语': 119 | lang=sa 120 | await bot.send(event, '已切换,当前使用语言' + sa) 121 | else: 122 | await bot.send(event, '数值不合法,语言选择:中文/日语') 123 | #模型切换 124 | @bot.on(FriendMessage) 125 | async def on_friend_message(event: FriendMessage): 126 | if str(event.message_chain).startswith('M'): 127 | sa = str(event.message_chain).split('#') 128 | modelList = ['0', '1', '2', '3'] 129 | if sa[1] in modelList: 130 | global model 131 | model=int(sa[1]) 132 | await bot.send(event, '已切换,当前使用模型' + sa[1]) 133 | else: 134 | await bot.send(event, '数值不合法,模型范围[0-3]') 135 | 136 | # 模型切换 137 | @bot.on(GroupMessage) 138 | async def on_group_message(event: GroupMessage): 139 | if str(event.message_chain).startswith('M'): 140 | sa = str(event.message_chain).split('#') 141 | modelList = ['0', '1', '2', '3'] 142 | if sa[1] in modelList: 143 | global model 144 | model = int(sa[1]) 145 | await bot.send(event, '已切换,当前使用模型' + sa[1]) 146 | else: 147 | await bot.send(event, '数值不合法,模型范围[0-3]') 148 | 149 | 150 | voicePart.main(bot) # 语音生成(主动) 151 | bot.run() 152 | 153 | 154 | -------------------------------------------------------------------------------- /commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.nn import functional as F 4 | import torch.jit 5 | 6 | 7 | def script_method(fn, _rcb=None): 8 | return fn 9 | 10 | 11 | def script(obj, optimize=True, _frames_up=0, _rcb=None): 12 | return obj 13 | 14 | 15 | torch.jit.script_method = script_method 16 | torch.jit.script = script 17 | 18 | 19 | def init_weights(m, mean=0.0, std=0.01): 20 | classname = m.__class__.__name__ 21 | if classname.find("Conv") != -1: 22 | m.weight.data.normal_(mean, std) 23 | 24 | 25 | def get_padding(kernel_size, dilation=1): 26 | return int((kernel_size*dilation - dilation)/2) 27 | 28 | 29 | def intersperse(lst, item): 30 | result = [item] * (len(lst) * 2 + 1) 31 | result[1::2] = lst 32 | return result 33 | 34 | 35 | def slice_segments(x, ids_str, segment_size=4): 36 | ret = torch.zeros_like(x[:, :, :segment_size]) 37 | for i in range(x.size(0)): 38 | idx_str = ids_str[i] 39 | idx_end = idx_str + segment_size 40 | ret[i] = x[i, :, idx_str:idx_end] 41 | return ret 42 | 43 | 44 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 45 | b, d, t = x.size() 46 | if x_lengths is None: 47 | x_lengths = t 48 | ids_str_max = x_lengths - segment_size + 1 49 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 50 | ret = slice_segments(x, ids_str, segment_size) 51 | return ret, ids_str 52 | 53 | 54 | def subsequent_mask(length): 55 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 56 | return mask 57 | 58 | 59 | @torch.jit.script 60 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 61 | n_channels_int = n_channels[0] 62 | in_act = input_a + input_b 63 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 64 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 65 | acts = t_act * s_act 66 | return acts 67 | 68 | 69 | def convert_pad_shape(pad_shape): 70 | l = pad_shape[::-1] 71 | pad_shape = [item for sublist in l for item in sublist] 72 | return pad_shape 73 | 74 | 75 | def sequence_mask(length, max_length=None): 76 | if max_length is None: 77 | max_length = length.max() 78 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 79 | return x.unsqueeze(0) < length.unsqueeze(1) 80 | 81 | 82 | def generate_path(duration, mask): 83 | """ 84 | duration: [b, 1, t_x] 85 | mask: [b, 1, t_y, t_x] 86 | """ 87 | device = duration.device 88 | 89 | b, _, t_y, t_x = mask.shape 90 | cum_duration = torch.cumsum(duration, -1) 91 | 92 | cum_duration_flat = cum_duration.view(b * t_x) 93 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 94 | path = path.view(b, t_x, t_y) 95 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 96 | path = path.unsqueeze(1).transpose(2,3) * mask 97 | return path 98 | -------------------------------------------------------------------------------- /hubert_model.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Optional, Tuple 3 | import random 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present 9 | 10 | class Hubert(nn.Module): 11 | def __init__(self, num_label_embeddings: int = 100, mask: bool = True): 12 | super().__init__() 13 | self._mask = mask 14 | self.feature_extractor = FeatureExtractor() 15 | self.feature_projection = FeatureProjection() 16 | self.positional_embedding = PositionalConvEmbedding() 17 | self.norm = nn.LayerNorm(768) 18 | self.dropout = nn.Dropout(0.1) 19 | self.encoder = TransformerEncoder( 20 | nn.TransformerEncoderLayer( 21 | 768, 12, 3072, activation="gelu", batch_first=True 22 | ), 23 | 12, 24 | ) 25 | self.proj = nn.Linear(768, 256) 26 | 27 | self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_()) 28 | self.label_embedding = nn.Embedding(num_label_embeddings, 256) 29 | 30 | def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 31 | mask = None 32 | if self.training and self._mask: 33 | mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2) 34 | x[mask] = self.masked_spec_embed.to(x.dtype) 35 | return x, mask 36 | 37 | def encode( 38 | self, x: torch.Tensor, layer: Optional[int] = None 39 | ) -> Tuple[torch.Tensor, torch.Tensor]: 40 | x = self.feature_extractor(x) 41 | x = self.feature_projection(x.transpose(1, 2)) 42 | x, mask = self.mask(x) 43 | x = x + self.positional_embedding(x) 44 | x = self.dropout(self.norm(x)) 45 | x = self.encoder(x, output_layer=layer) 46 | return x, mask 47 | 48 | def logits(self, x: torch.Tensor) -> torch.Tensor: 49 | logits = torch.cosine_similarity( 50 | x.unsqueeze(2), 51 | self.label_embedding.weight.unsqueeze(0).unsqueeze(0), 52 | dim=-1, 53 | ) 54 | return logits / 0.1 55 | 56 | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 57 | x, mask = self.encode(x) 58 | x = self.proj(x) 59 | logits = self.logits(x) 60 | return logits, mask 61 | 62 | 63 | class HubertSoft(Hubert): 64 | def __init__(self): 65 | super().__init__() 66 | 67 | @torch.inference_mode() 68 | def units(self, wav: torch.Tensor) -> torch.Tensor: 69 | wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) 70 | x, _ = self.encode(wav) 71 | return self.proj(x) 72 | 73 | 74 | class FeatureExtractor(nn.Module): 75 | def __init__(self): 76 | super().__init__() 77 | self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False) 78 | self.norm0 = nn.GroupNorm(512, 512) 79 | self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False) 80 | self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False) 81 | self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False) 82 | self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False) 83 | self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False) 84 | self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False) 85 | 86 | def forward(self, x: torch.Tensor) -> torch.Tensor: 87 | x = F.gelu(self.norm0(self.conv0(x))) 88 | x = F.gelu(self.conv1(x)) 89 | x = F.gelu(self.conv2(x)) 90 | x = F.gelu(self.conv3(x)) 91 | x = F.gelu(self.conv4(x)) 92 | x = F.gelu(self.conv5(x)) 93 | x = F.gelu(self.conv6(x)) 94 | return x 95 | 96 | 97 | class FeatureProjection(nn.Module): 98 | def __init__(self): 99 | super().__init__() 100 | self.norm = nn.LayerNorm(512) 101 | self.projection = nn.Linear(512, 768) 102 | self.dropout = nn.Dropout(0.1) 103 | 104 | def forward(self, x: torch.Tensor) -> torch.Tensor: 105 | x = self.norm(x) 106 | x = self.projection(x) 107 | x = self.dropout(x) 108 | return x 109 | 110 | 111 | class PositionalConvEmbedding(nn.Module): 112 | def __init__(self): 113 | super().__init__() 114 | self.conv = nn.Conv1d( 115 | 768, 116 | 768, 117 | kernel_size=128, 118 | padding=128 // 2, 119 | groups=16, 120 | ) 121 | self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2) 122 | 123 | def forward(self, x: torch.Tensor) -> torch.Tensor: 124 | x = self.conv(x.transpose(1, 2)) 125 | x = F.gelu(x[:, :, :-1]) 126 | return x.transpose(1, 2) 127 | 128 | 129 | class TransformerEncoder(nn.Module): 130 | def __init__( 131 | self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int 132 | ) -> None: 133 | super(TransformerEncoder, self).__init__() 134 | self.layers = nn.ModuleList( 135 | [copy.deepcopy(encoder_layer) for _ in range(num_layers)] 136 | ) 137 | self.num_layers = num_layers 138 | 139 | def forward( 140 | self, 141 | src: torch.Tensor, 142 | mask: torch.Tensor = None, 143 | src_key_padding_mask: torch.Tensor = None, 144 | output_layer: Optional[int] = None, 145 | ) -> torch.Tensor: 146 | output = src 147 | for layer in self.layers[:output_layer]: 148 | output = layer( 149 | output, src_mask=mask, src_key_padding_mask=src_key_padding_mask 150 | ) 151 | return output 152 | 153 | 154 | def _compute_mask( 155 | shape: Tuple[int, int], 156 | mask_prob: float, 157 | mask_length: int, 158 | device: torch.device, 159 | min_masks: int = 0, 160 | ) -> torch.Tensor: 161 | batch_size, sequence_length = shape 162 | 163 | if mask_length < 1: 164 | raise ValueError("`mask_length` has to be bigger than 0.") 165 | 166 | if mask_length > sequence_length: 167 | raise ValueError( 168 | f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" 169 | ) 170 | 171 | # compute number of masked spans in batch 172 | num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random()) 173 | num_masked_spans = max(num_masked_spans, min_masks) 174 | 175 | # make sure num masked indices <= sequence_length 176 | if num_masked_spans * mask_length > sequence_length: 177 | num_masked_spans = sequence_length // mask_length 178 | 179 | # SpecAugment mask to fill 180 | mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool) 181 | 182 | # uniform distribution to sample from, make sure that offset samples are < sequence_length 183 | uniform_dist = torch.ones( 184 | (batch_size, sequence_length - (mask_length - 1)), device=device 185 | ) 186 | 187 | # get random indices to mask 188 | mask_indices = torch.multinomial(uniform_dist, num_masked_spans) 189 | 190 | # expand masked indices to masked spans 191 | mask_indices = ( 192 | mask_indices.unsqueeze(dim=-1) 193 | .expand((batch_size, num_masked_spans, mask_length)) 194 | .reshape(batch_size, num_masked_spans * mask_length) 195 | ) 196 | offsets = ( 197 | torch.arange(mask_length, device=device)[None, None, :] 198 | .expand((batch_size, num_masked_spans, mask_length)) 199 | .reshape(batch_size, num_masked_spans * mask_length) 200 | ) 201 | mask_idxs = mask_indices + offsets 202 | 203 | # scatter indices to mask 204 | mask = mask.scatter(1, mask_idxs, True) 205 | 206 | return mask 207 | 208 | 209 | def hubert_soft( 210 | path: str 211 | ) -> HubertSoft: 212 | r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`. 213 | Args: 214 | path (str): path of a pretrained model 215 | """ 216 | hubert = HubertSoft() 217 | checkpoint = torch.load(path) 218 | consume_prefix_in_state_dict_if_present(checkpoint, "module.") 219 | hubert.load_state_dict(checkpoint) 220 | hubert.eval() 221 | return hubert 222 | -------------------------------------------------------------------------------- /mel_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data 3 | from librosa.filters import mel as librosa_mel_fn 4 | 5 | MAX_WAV_VALUE = 32768.0 6 | 7 | 8 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 9 | """ 10 | PARAMS 11 | ------ 12 | C: compression factor 13 | """ 14 | return torch.log(torch.clamp(x, min=clip_val) * C) 15 | 16 | 17 | def dynamic_range_decompression_torch(x, C=1): 18 | """ 19 | PARAMS 20 | ------ 21 | C: compression factor used to compress 22 | """ 23 | return torch.exp(x) / C 24 | 25 | 26 | def spectral_normalize_torch(magnitudes): 27 | output = dynamic_range_compression_torch(magnitudes) 28 | return output 29 | 30 | 31 | def spectral_de_normalize_torch(magnitudes): 32 | output = dynamic_range_decompression_torch(magnitudes) 33 | return output 34 | 35 | 36 | mel_basis = {} 37 | hann_window = {} 38 | 39 | 40 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 41 | if torch.min(y) < -1.: 42 | print('min value is ', torch.min(y)) 43 | if torch.max(y) > 1.: 44 | print('max value is ', torch.max(y)) 45 | 46 | global hann_window 47 | dtype_device = str(y.dtype) + '_' + str(y.device) 48 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 49 | if wnsize_dtype_device not in hann_window: 50 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 51 | 52 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 53 | y = y.squeeze(1) 54 | 55 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 56 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) 57 | 58 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 59 | return spec 60 | 61 | 62 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 63 | global mel_basis 64 | dtype_device = str(spec.dtype) + '_' + str(spec.device) 65 | fmax_dtype_device = str(fmax) + '_' + dtype_device 66 | if fmax_dtype_device not in mel_basis: 67 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 68 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) 69 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 70 | spec = spectral_normalize_torch(spec) 71 | return spec 72 | 73 | 74 | def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 75 | if torch.min(y) < -1.: 76 | print('min value is ', torch.min(y)) 77 | if torch.max(y) > 1.: 78 | print('max value is ', torch.max(y)) 79 | 80 | global mel_basis, hann_window 81 | dtype_device = str(y.dtype) + '_' + str(y.device) 82 | fmax_dtype_device = str(fmax) + '_' + dtype_device 83 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 84 | if fmax_dtype_device not in mel_basis: 85 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 86 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) 87 | if wnsize_dtype_device not in hann_window: 88 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 89 | 90 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 91 | y = y.squeeze(1) 92 | 93 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 94 | center=center, pad_mode='reflect', normalized=False, onesided=True) 95 | 96 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 97 | 98 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 99 | spec = spectral_normalize_torch(spec) 100 | 101 | return spec 102 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import commons 7 | import modules 8 | import attentions 9 | import monotonic_align 10 | 11 | from torch.nn import Conv1d, ConvTranspose1d, Conv2d 12 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 13 | from commons import init_weights, get_padding 14 | 15 | 16 | class StochasticDurationPredictor(nn.Module): 17 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0): 18 | super().__init__() 19 | filter_channels = in_channels # it needs to be removed from future version. 20 | self.in_channels = in_channels 21 | self.filter_channels = filter_channels 22 | self.kernel_size = kernel_size 23 | self.p_dropout = p_dropout 24 | self.n_flows = n_flows 25 | self.gin_channels = gin_channels 26 | 27 | self.log_flow = modules.Log() 28 | self.flows = nn.ModuleList() 29 | self.flows.append(modules.ElementwiseAffine(2)) 30 | for i in range(n_flows): 31 | self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) 32 | self.flows.append(modules.Flip()) 33 | 34 | self.post_pre = nn.Conv1d(1, filter_channels, 1) 35 | self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) 36 | self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) 37 | self.post_flows = nn.ModuleList() 38 | self.post_flows.append(modules.ElementwiseAffine(2)) 39 | for i in range(4): 40 | self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) 41 | self.post_flows.append(modules.Flip()) 42 | 43 | self.pre = nn.Conv1d(in_channels, filter_channels, 1) 44 | self.proj = nn.Conv1d(filter_channels, filter_channels, 1) 45 | self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) 46 | if gin_channels != 0: 47 | self.cond = nn.Conv1d(gin_channels, filter_channels, 1) 48 | 49 | def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0): 50 | x = torch.detach(x) 51 | x = self.pre(x) 52 | if g is not None: 53 | g = torch.detach(g) 54 | x = x + self.cond(g) 55 | x = self.convs(x, x_mask) 56 | x = self.proj(x) * x_mask 57 | 58 | if not reverse: 59 | flows = self.flows 60 | assert w is not None 61 | 62 | logdet_tot_q = 0 63 | h_w = self.post_pre(w) 64 | h_w = self.post_convs(h_w, x_mask) 65 | h_w = self.post_proj(h_w) * x_mask 66 | e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask 67 | z_q = e_q 68 | for flow in self.post_flows: 69 | z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w)) 70 | logdet_tot_q += logdet_q 71 | z_u, z1 = torch.split(z_q, [1, 1], 1) 72 | u = torch.sigmoid(z_u) * x_mask 73 | z0 = (w - u) * x_mask 74 | logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2]) 75 | logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q 76 | 77 | logdet_tot = 0 78 | z0, logdet = self.log_flow(z0, x_mask) 79 | logdet_tot += logdet 80 | z = torch.cat([z0, z1], 1) 81 | for flow in flows: 82 | z, logdet = flow(z, x_mask, g=x, reverse=reverse) 83 | logdet_tot = logdet_tot + logdet 84 | nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot 85 | return nll + logq # [b] 86 | else: 87 | flows = list(reversed(self.flows)) 88 | flows = flows[:-2] + [flows[-1]] # remove a useless vflow 89 | z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale 90 | for flow in flows: 91 | z = flow(z, x_mask, g=x, reverse=reverse) 92 | z0, z1 = torch.split(z, [1, 1], 1) 93 | logw = z0 94 | return logw 95 | 96 | 97 | class DurationPredictor(nn.Module): 98 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0): 99 | super().__init__() 100 | 101 | self.in_channels = in_channels 102 | self.filter_channels = filter_channels 103 | self.kernel_size = kernel_size 104 | self.p_dropout = p_dropout 105 | self.gin_channels = gin_channels 106 | 107 | self.drop = nn.Dropout(p_dropout) 108 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2) 109 | self.norm_1 = modules.LayerNorm(filter_channels) 110 | self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2) 111 | self.norm_2 = modules.LayerNorm(filter_channels) 112 | self.proj = nn.Conv1d(filter_channels, 1, 1) 113 | 114 | if gin_channels != 0: 115 | self.cond = nn.Conv1d(gin_channels, in_channels, 1) 116 | 117 | def forward(self, x, x_mask, g=None): 118 | x = torch.detach(x) 119 | if g is not None: 120 | g = torch.detach(g) 121 | x = x + self.cond(g) 122 | x = self.conv_1(x * x_mask) 123 | x = torch.relu(x) 124 | x = self.norm_1(x) 125 | x = self.drop(x) 126 | x = self.conv_2(x * x_mask) 127 | x = torch.relu(x) 128 | x = self.norm_2(x) 129 | x = self.drop(x) 130 | x = self.proj(x * x_mask) 131 | return x * x_mask 132 | 133 | 134 | class TextEncoder(nn.Module): 135 | def __init__(self, 136 | n_vocab, 137 | out_channels, 138 | hidden_channels, 139 | filter_channels, 140 | n_heads, 141 | n_layers, 142 | kernel_size, 143 | p_dropout, 144 | emotion_embedding): 145 | super().__init__() 146 | self.n_vocab = n_vocab 147 | self.out_channels = out_channels 148 | self.hidden_channels = hidden_channels 149 | self.filter_channels = filter_channels 150 | self.n_heads = n_heads 151 | self.n_layers = n_layers 152 | self.kernel_size = kernel_size 153 | self.p_dropout = p_dropout 154 | self.emotion_embedding = emotion_embedding 155 | 156 | if self.n_vocab!=0: 157 | self.emb = nn.Embedding(n_vocab, hidden_channels) 158 | if emotion_embedding: 159 | self.emo_proj = nn.Linear(1024, hidden_channels) 160 | nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) 161 | 162 | self.encoder = attentions.Encoder( 163 | hidden_channels, 164 | filter_channels, 165 | n_heads, 166 | n_layers, 167 | kernel_size, 168 | p_dropout) 169 | self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1) 170 | 171 | def forward(self, x, x_lengths, emotion_embedding=None): 172 | if self.n_vocab!=0: 173 | x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h] 174 | if emotion_embedding is not None: 175 | x = x + self.emo_proj(emotion_embedding.unsqueeze(1)) 176 | x = torch.transpose(x, 1, -1) # [b, h, t] 177 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 178 | 179 | x = self.encoder(x * x_mask, x_mask) 180 | stats = self.proj(x) * x_mask 181 | 182 | m, logs = torch.split(stats, self.out_channels, dim=1) 183 | return x, m, logs, x_mask 184 | 185 | 186 | class ResidualCouplingBlock(nn.Module): 187 | def __init__(self, 188 | channels, 189 | hidden_channels, 190 | kernel_size, 191 | dilation_rate, 192 | n_layers, 193 | n_flows=4, 194 | gin_channels=0): 195 | super().__init__() 196 | self.channels = channels 197 | self.hidden_channels = hidden_channels 198 | self.kernel_size = kernel_size 199 | self.dilation_rate = dilation_rate 200 | self.n_layers = n_layers 201 | self.n_flows = n_flows 202 | self.gin_channels = gin_channels 203 | 204 | self.flows = nn.ModuleList() 205 | for i in range(n_flows): 206 | self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) 207 | self.flows.append(modules.Flip()) 208 | 209 | def forward(self, x, x_mask, g=None, reverse=False): 210 | if not reverse: 211 | for flow in self.flows: 212 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 213 | else: 214 | for flow in reversed(self.flows): 215 | x = flow(x, x_mask, g=g, reverse=reverse) 216 | return x 217 | 218 | 219 | class PosteriorEncoder(nn.Module): 220 | def __init__(self, 221 | in_channels, 222 | out_channels, 223 | hidden_channels, 224 | kernel_size, 225 | dilation_rate, 226 | n_layers, 227 | gin_channels=0): 228 | super().__init__() 229 | self.in_channels = in_channels 230 | self.out_channels = out_channels 231 | self.hidden_channels = hidden_channels 232 | self.kernel_size = kernel_size 233 | self.dilation_rate = dilation_rate 234 | self.n_layers = n_layers 235 | self.gin_channels = gin_channels 236 | 237 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 238 | self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) 239 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 240 | 241 | def forward(self, x, x_lengths, g=None): 242 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 243 | x = self.pre(x) * x_mask 244 | x = self.enc(x, x_mask, g=g) 245 | stats = self.proj(x) * x_mask 246 | m, logs = torch.split(stats, self.out_channels, dim=1) 247 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 248 | return z, m, logs, x_mask 249 | 250 | 251 | class Generator(torch.nn.Module): 252 | def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): 253 | super(Generator, self).__init__() 254 | self.num_kernels = len(resblock_kernel_sizes) 255 | self.num_upsamples = len(upsample_rates) 256 | self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) 257 | resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 258 | 259 | self.ups = nn.ModuleList() 260 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 261 | self.ups.append(weight_norm( 262 | ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)), 263 | k, u, padding=(k-u)//2))) 264 | 265 | self.resblocks = nn.ModuleList() 266 | for i in range(len(self.ups)): 267 | ch = upsample_initial_channel//(2**(i+1)) 268 | for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): 269 | self.resblocks.append(resblock(ch, k, d)) 270 | 271 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 272 | self.ups.apply(init_weights) 273 | 274 | if gin_channels != 0: 275 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 276 | 277 | def forward(self, x, g=None): 278 | x = self.conv_pre(x) 279 | if g is not None: 280 | x = x + self.cond(g) 281 | 282 | for i in range(self.num_upsamples): 283 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 284 | x = self.ups[i](x) 285 | xs = None 286 | for j in range(self.num_kernels): 287 | if xs is None: 288 | xs = self.resblocks[i*self.num_kernels+j](x) 289 | else: 290 | xs += self.resblocks[i*self.num_kernels+j](x) 291 | x = xs / self.num_kernels 292 | x = F.leaky_relu(x) 293 | x = self.conv_post(x) 294 | x = torch.tanh(x) 295 | 296 | return x 297 | 298 | def remove_weight_norm(self): 299 | print('Removing weight norm...') 300 | for l in self.ups: 301 | remove_weight_norm(l) 302 | for l in self.resblocks: 303 | l.remove_weight_norm() 304 | 305 | 306 | class DiscriminatorP(torch.nn.Module): 307 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 308 | super(DiscriminatorP, self).__init__() 309 | self.period = period 310 | self.use_spectral_norm = use_spectral_norm 311 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 312 | self.convs = nn.ModuleList([ 313 | norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 314 | norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 315 | norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 316 | norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 317 | norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))), 318 | ]) 319 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 320 | 321 | def forward(self, x): 322 | fmap = [] 323 | 324 | # 1d to 2d 325 | b, c, t = x.shape 326 | if t % self.period != 0: # pad first 327 | n_pad = self.period - (t % self.period) 328 | x = F.pad(x, (0, n_pad), "reflect") 329 | t = t + n_pad 330 | x = x.view(b, c, t // self.period, self.period) 331 | 332 | for l in self.convs: 333 | x = l(x) 334 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 335 | fmap.append(x) 336 | x = self.conv_post(x) 337 | fmap.append(x) 338 | x = torch.flatten(x, 1, -1) 339 | 340 | return x, fmap 341 | 342 | 343 | class DiscriminatorS(torch.nn.Module): 344 | def __init__(self, use_spectral_norm=False): 345 | super(DiscriminatorS, self).__init__() 346 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 347 | self.convs = nn.ModuleList([ 348 | norm_f(Conv1d(1, 16, 15, 1, padding=7)), 349 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), 350 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), 351 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), 352 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), 353 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 354 | ]) 355 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 356 | 357 | def forward(self, x): 358 | fmap = [] 359 | 360 | for l in self.convs: 361 | x = l(x) 362 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 363 | fmap.append(x) 364 | x = self.conv_post(x) 365 | fmap.append(x) 366 | x = torch.flatten(x, 1, -1) 367 | 368 | return x, fmap 369 | 370 | 371 | class MultiPeriodDiscriminator(torch.nn.Module): 372 | def __init__(self, use_spectral_norm=False): 373 | super(MultiPeriodDiscriminator, self).__init__() 374 | periods = [2,3,5,7,11] 375 | 376 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 377 | discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] 378 | self.discriminators = nn.ModuleList(discs) 379 | 380 | def forward(self, y, y_hat): 381 | y_d_rs = [] 382 | y_d_gs = [] 383 | fmap_rs = [] 384 | fmap_gs = [] 385 | for i, d in enumerate(self.discriminators): 386 | y_d_r, fmap_r = d(y) 387 | y_d_g, fmap_g = d(y_hat) 388 | y_d_rs.append(y_d_r) 389 | y_d_gs.append(y_d_g) 390 | fmap_rs.append(fmap_r) 391 | fmap_gs.append(fmap_g) 392 | 393 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 394 | 395 | 396 | 397 | class SynthesizerTrn(nn.Module): 398 | """ 399 | Synthesizer for Training 400 | """ 401 | 402 | def __init__(self, 403 | n_vocab, 404 | spec_channels, 405 | segment_size, 406 | inter_channels, 407 | hidden_channels, 408 | filter_channels, 409 | n_heads, 410 | n_layers, 411 | kernel_size, 412 | p_dropout, 413 | resblock, 414 | resblock_kernel_sizes, 415 | resblock_dilation_sizes, 416 | upsample_rates, 417 | upsample_initial_channel, 418 | upsample_kernel_sizes, 419 | n_speakers=0, 420 | gin_channels=0, 421 | use_sdp=True, 422 | emotion_embedding=False, 423 | **kwargs): 424 | 425 | super().__init__() 426 | self.n_vocab = n_vocab 427 | self.spec_channels = spec_channels 428 | self.inter_channels = inter_channels 429 | self.hidden_channels = hidden_channels 430 | self.filter_channels = filter_channels 431 | self.n_heads = n_heads 432 | self.n_layers = n_layers 433 | self.kernel_size = kernel_size 434 | self.p_dropout = p_dropout 435 | self.resblock = resblock 436 | self.resblock_kernel_sizes = resblock_kernel_sizes 437 | self.resblock_dilation_sizes = resblock_dilation_sizes 438 | self.upsample_rates = upsample_rates 439 | self.upsample_initial_channel = upsample_initial_channel 440 | self.upsample_kernel_sizes = upsample_kernel_sizes 441 | self.segment_size = segment_size 442 | self.n_speakers = n_speakers 443 | self.gin_channels = gin_channels 444 | 445 | self.use_sdp = use_sdp 446 | 447 | self.enc_p = TextEncoder(n_vocab, 448 | inter_channels, 449 | hidden_channels, 450 | filter_channels, 451 | n_heads, 452 | n_layers, 453 | kernel_size, 454 | p_dropout, 455 | emotion_embedding) 456 | self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) 457 | self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) 458 | self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) 459 | 460 | if use_sdp: 461 | self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels) 462 | else: 463 | self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels) 464 | 465 | if n_speakers > 1: 466 | self.emb_g = nn.Embedding(n_speakers, gin_channels) 467 | 468 | def forward(self, x, x_lengths, y, y_lengths, sid=None, emotion_embedding=None): 469 | 470 | x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths ,emotion_embedding) 471 | if self.n_speakers > 0: 472 | g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] 473 | else: 474 | g = None 475 | 476 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) 477 | z_p = self.flow(z, y_mask, g=g) 478 | 479 | with torch.no_grad(): 480 | # negative cross-entropy 481 | s_p_sq_r = torch.exp(-2 * logs_p) # [b, d, t] 482 | neg_cent1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True) # [b, 1, t_s] 483 | neg_cent2 = torch.matmul(-0.5 * (z_p ** 2).transpose(1, 2), s_p_sq_r) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s] 484 | neg_cent3 = torch.matmul(z_p.transpose(1, 2), (m_p * s_p_sq_r)) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s] 485 | neg_cent4 = torch.sum(-0.5 * (m_p ** 2) * s_p_sq_r, [1], keepdim=True) # [b, 1, t_s] 486 | neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4 487 | 488 | attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) 489 | attn = monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)).unsqueeze(1).detach() 490 | 491 | w = attn.sum(2) 492 | if self.use_sdp: 493 | l_length = self.dp(x, x_mask, w, g=g) 494 | l_length = l_length / torch.sum(x_mask) 495 | else: 496 | logw_ = torch.log(w + 1e-6) * x_mask 497 | logw = self.dp(x, x_mask, g=g) 498 | l_length = torch.sum((logw - logw_)**2, [1,2]) / torch.sum(x_mask) # for averaging 499 | 500 | # expand prior 501 | m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) 502 | logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) 503 | 504 | z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size) 505 | o = self.dec(z_slice, g=g) 506 | return o, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) 507 | 508 | def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None, emotion_embedding=None): 509 | x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, emotion_embedding) 510 | if self.n_speakers > 0: 511 | g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] 512 | else: 513 | g = None 514 | 515 | if self.use_sdp: 516 | logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) 517 | else: 518 | logw = self.dp(x, x_mask, g=g) 519 | w = torch.exp(logw) * x_mask * length_scale 520 | w_ceil = torch.ceil(w) 521 | y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() 522 | y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype) 523 | attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) 524 | attn = commons.generate_path(w_ceil, attn_mask) 525 | 526 | m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] 527 | logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] 528 | 529 | z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale 530 | z = self.flow(z_p, y_mask, g=g, reverse=True) 531 | o = self.dec((z * y_mask)[:,:,:max_len], g=g) 532 | return o, attn, y_mask, (z, z_p, m_p, logs_p) 533 | 534 | def voice_conversion(self, y, y_lengths, sid_src, sid_tgt): 535 | assert self.n_speakers > 0, "n_speakers have to be larger than 0." 536 | g_src = self.emb_g(sid_src).unsqueeze(-1) 537 | g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) 538 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) 539 | z_p = self.flow(z, y_mask, g=g_src) 540 | z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) 541 | o_hat = self.dec(z_hat * y_mask, g=g_tgt) 542 | return o_hat, y_mask, (z, z_p, z_hat) 543 | 544 | -------------------------------------------------------------------------------- /modules.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | from torch.nn import Conv1d 7 | from torch.nn.utils import weight_norm, remove_weight_norm 8 | 9 | import commons 10 | from commons import init_weights, get_padding 11 | from transforms import piecewise_rational_quadratic_transform 12 | 13 | 14 | LRELU_SLOPE = 0.1 15 | 16 | 17 | class LayerNorm(nn.Module): 18 | def __init__(self, channels, eps=1e-5): 19 | super().__init__() 20 | self.channels = channels 21 | self.eps = eps 22 | 23 | self.gamma = nn.Parameter(torch.ones(channels)) 24 | self.beta = nn.Parameter(torch.zeros(channels)) 25 | 26 | def forward(self, x): 27 | x = x.transpose(1, -1) 28 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 29 | return x.transpose(1, -1) 30 | 31 | 32 | class ConvReluNorm(nn.Module): 33 | def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): 34 | super().__init__() 35 | self.in_channels = in_channels 36 | self.hidden_channels = hidden_channels 37 | self.out_channels = out_channels 38 | self.kernel_size = kernel_size 39 | self.n_layers = n_layers 40 | self.p_dropout = p_dropout 41 | assert n_layers > 1, "Number of layers should be larger than 0." 42 | 43 | self.conv_layers = nn.ModuleList() 44 | self.norm_layers = nn.ModuleList() 45 | self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 46 | self.norm_layers.append(LayerNorm(hidden_channels)) 47 | self.relu_drop = nn.Sequential( 48 | nn.ReLU(), 49 | nn.Dropout(p_dropout)) 50 | for _ in range(n_layers-1): 51 | self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 52 | self.norm_layers.append(LayerNorm(hidden_channels)) 53 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 54 | self.proj.weight.data.zero_() 55 | self.proj.bias.data.zero_() 56 | 57 | def forward(self, x, x_mask): 58 | x_org = x 59 | for i in range(self.n_layers): 60 | x = self.conv_layers[i](x * x_mask) 61 | x = self.norm_layers[i](x) 62 | x = self.relu_drop(x) 63 | x = x_org + self.proj(x) 64 | return x * x_mask 65 | 66 | 67 | class DDSConv(nn.Module): 68 | """ 69 | Dialted and Depth-Separable Convolution 70 | """ 71 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): 72 | super().__init__() 73 | self.channels = channels 74 | self.kernel_size = kernel_size 75 | self.n_layers = n_layers 76 | self.p_dropout = p_dropout 77 | 78 | self.drop = nn.Dropout(p_dropout) 79 | self.convs_sep = nn.ModuleList() 80 | self.convs_1x1 = nn.ModuleList() 81 | self.norms_1 = nn.ModuleList() 82 | self.norms_2 = nn.ModuleList() 83 | for i in range(n_layers): 84 | dilation = kernel_size ** i 85 | padding = (kernel_size * dilation - dilation) // 2 86 | self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, 87 | groups=channels, dilation=dilation, padding=padding 88 | )) 89 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) 90 | self.norms_1.append(LayerNorm(channels)) 91 | self.norms_2.append(LayerNorm(channels)) 92 | 93 | def forward(self, x, x_mask, g=None): 94 | if g is not None: 95 | x = x + g 96 | for i in range(self.n_layers): 97 | y = self.convs_sep[i](x * x_mask) 98 | y = self.norms_1[i](y) 99 | y = F.gelu(y) 100 | y = self.convs_1x1[i](y) 101 | y = self.norms_2[i](y) 102 | y = F.gelu(y) 103 | y = self.drop(y) 104 | x = x + y 105 | return x * x_mask 106 | 107 | 108 | class WN(torch.nn.Module): 109 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): 110 | super(WN, self).__init__() 111 | assert(kernel_size % 2 == 1) 112 | self.hidden_channels =hidden_channels 113 | self.kernel_size = kernel_size, 114 | self.dilation_rate = dilation_rate 115 | self.n_layers = n_layers 116 | self.gin_channels = gin_channels 117 | self.p_dropout = p_dropout 118 | 119 | self.in_layers = torch.nn.ModuleList() 120 | self.res_skip_layers = torch.nn.ModuleList() 121 | self.drop = nn.Dropout(p_dropout) 122 | 123 | if gin_channels != 0: 124 | cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) 125 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 126 | 127 | for i in range(n_layers): 128 | dilation = dilation_rate ** i 129 | padding = int((kernel_size * dilation - dilation) / 2) 130 | in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, 131 | dilation=dilation, padding=padding) 132 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 133 | self.in_layers.append(in_layer) 134 | 135 | # last one is not necessary 136 | if i < n_layers - 1: 137 | res_skip_channels = 2 * hidden_channels 138 | else: 139 | res_skip_channels = hidden_channels 140 | 141 | res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 142 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 143 | self.res_skip_layers.append(res_skip_layer) 144 | 145 | def forward(self, x, x_mask, g=None, **kwargs): 146 | output = torch.zeros_like(x) 147 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 148 | 149 | if g is not None: 150 | g = self.cond_layer(g) 151 | 152 | for i in range(self.n_layers): 153 | x_in = self.in_layers[i](x) 154 | if g is not None: 155 | cond_offset = i * 2 * self.hidden_channels 156 | g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] 157 | else: 158 | g_l = torch.zeros_like(x_in) 159 | 160 | acts = commons.fused_add_tanh_sigmoid_multiply( 161 | x_in, 162 | g_l, 163 | n_channels_tensor) 164 | acts = self.drop(acts) 165 | 166 | res_skip_acts = self.res_skip_layers[i](acts) 167 | if i < self.n_layers - 1: 168 | res_acts = res_skip_acts[:,:self.hidden_channels,:] 169 | x = (x + res_acts) * x_mask 170 | output = output + res_skip_acts[:,self.hidden_channels:,:] 171 | else: 172 | output = output + res_skip_acts 173 | return output * x_mask 174 | 175 | def remove_weight_norm(self): 176 | if self.gin_channels != 0: 177 | torch.nn.utils.remove_weight_norm(self.cond_layer) 178 | for l in self.in_layers: 179 | torch.nn.utils.remove_weight_norm(l) 180 | for l in self.res_skip_layers: 181 | torch.nn.utils.remove_weight_norm(l) 182 | 183 | 184 | class ResBlock1(torch.nn.Module): 185 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 186 | super(ResBlock1, self).__init__() 187 | self.convs1 = nn.ModuleList([ 188 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 189 | padding=get_padding(kernel_size, dilation[0]))), 190 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 191 | padding=get_padding(kernel_size, dilation[1]))), 192 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 193 | padding=get_padding(kernel_size, dilation[2]))) 194 | ]) 195 | self.convs1.apply(init_weights) 196 | 197 | self.convs2 = nn.ModuleList([ 198 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 199 | padding=get_padding(kernel_size, 1))), 200 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 201 | padding=get_padding(kernel_size, 1))), 202 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 203 | padding=get_padding(kernel_size, 1))) 204 | ]) 205 | self.convs2.apply(init_weights) 206 | 207 | def forward(self, x, x_mask=None): 208 | for c1, c2 in zip(self.convs1, self.convs2): 209 | xt = F.leaky_relu(x, LRELU_SLOPE) 210 | if x_mask is not None: 211 | xt = xt * x_mask 212 | xt = c1(xt) 213 | xt = F.leaky_relu(xt, LRELU_SLOPE) 214 | if x_mask is not None: 215 | xt = xt * x_mask 216 | xt = c2(xt) 217 | x = xt + x 218 | if x_mask is not None: 219 | x = x * x_mask 220 | return x 221 | 222 | def remove_weight_norm(self): 223 | for l in self.convs1: 224 | remove_weight_norm(l) 225 | for l in self.convs2: 226 | remove_weight_norm(l) 227 | 228 | 229 | class ResBlock2(torch.nn.Module): 230 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 231 | super(ResBlock2, self).__init__() 232 | self.convs = nn.ModuleList([ 233 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 234 | padding=get_padding(kernel_size, dilation[0]))), 235 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 236 | padding=get_padding(kernel_size, dilation[1]))) 237 | ]) 238 | self.convs.apply(init_weights) 239 | 240 | def forward(self, x, x_mask=None): 241 | for c in self.convs: 242 | xt = F.leaky_relu(x, LRELU_SLOPE) 243 | if x_mask is not None: 244 | xt = xt * x_mask 245 | xt = c(xt) 246 | x = xt + x 247 | if x_mask is not None: 248 | x = x * x_mask 249 | return x 250 | 251 | def remove_weight_norm(self): 252 | for l in self.convs: 253 | remove_weight_norm(l) 254 | 255 | 256 | class Log(nn.Module): 257 | def forward(self, x, x_mask, reverse=False, **kwargs): 258 | if not reverse: 259 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask 260 | logdet = torch.sum(-y, [1, 2]) 261 | return y, logdet 262 | else: 263 | x = torch.exp(x) * x_mask 264 | return x 265 | 266 | 267 | class Flip(nn.Module): 268 | def forward(self, x, *args, reverse=False, **kwargs): 269 | x = torch.flip(x, [1]) 270 | if not reverse: 271 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 272 | return x, logdet 273 | else: 274 | return x 275 | 276 | 277 | class ElementwiseAffine(nn.Module): 278 | def __init__(self, channels): 279 | super().__init__() 280 | self.channels = channels 281 | self.m = nn.Parameter(torch.zeros(channels,1)) 282 | self.logs = nn.Parameter(torch.zeros(channels,1)) 283 | 284 | def forward(self, x, x_mask, reverse=False, **kwargs): 285 | if not reverse: 286 | y = self.m + torch.exp(self.logs) * x 287 | y = y * x_mask 288 | logdet = torch.sum(self.logs * x_mask, [1,2]) 289 | return y, logdet 290 | else: 291 | x = (x - self.m) * torch.exp(-self.logs) * x_mask 292 | return x 293 | 294 | 295 | class ResidualCouplingLayer(nn.Module): 296 | def __init__(self, 297 | channels, 298 | hidden_channels, 299 | kernel_size, 300 | dilation_rate, 301 | n_layers, 302 | p_dropout=0, 303 | gin_channels=0, 304 | mean_only=False): 305 | assert channels % 2 == 0, "channels should be divisible by 2" 306 | super().__init__() 307 | self.channels = channels 308 | self.hidden_channels = hidden_channels 309 | self.kernel_size = kernel_size 310 | self.dilation_rate = dilation_rate 311 | self.n_layers = n_layers 312 | self.half_channels = channels // 2 313 | self.mean_only = mean_only 314 | 315 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 316 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) 317 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) 318 | self.post.weight.data.zero_() 319 | self.post.bias.data.zero_() 320 | 321 | def forward(self, x, x_mask, g=None, reverse=False): 322 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 323 | h = self.pre(x0) * x_mask 324 | h = self.enc(h, x_mask, g=g) 325 | stats = self.post(h) * x_mask 326 | if not self.mean_only: 327 | m, logs = torch.split(stats, [self.half_channels]*2, 1) 328 | else: 329 | m = stats 330 | logs = torch.zeros_like(m) 331 | 332 | if not reverse: 333 | x1 = m + x1 * torch.exp(logs) * x_mask 334 | x = torch.cat([x0, x1], 1) 335 | logdet = torch.sum(logs, [1,2]) 336 | return x, logdet 337 | else: 338 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 339 | x = torch.cat([x0, x1], 1) 340 | return x 341 | 342 | 343 | class ConvFlow(nn.Module): 344 | def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0): 345 | super().__init__() 346 | self.in_channels = in_channels 347 | self.filter_channels = filter_channels 348 | self.kernel_size = kernel_size 349 | self.n_layers = n_layers 350 | self.num_bins = num_bins 351 | self.tail_bound = tail_bound 352 | self.half_channels = in_channels // 2 353 | 354 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) 355 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.) 356 | self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1) 357 | self.proj.weight.data.zero_() 358 | self.proj.bias.data.zero_() 359 | 360 | def forward(self, x, x_mask, g=None, reverse=False): 361 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 362 | h = self.pre(x0) 363 | h = self.convs(h, x_mask, g=g) 364 | h = self.proj(h) * x_mask 365 | 366 | b, c, t = x0.shape 367 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 368 | 369 | unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels) 370 | unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels) 371 | unnormalized_derivatives = h[..., 2 * self.num_bins:] 372 | 373 | x1, logabsdet = piecewise_rational_quadratic_transform(x1, 374 | unnormalized_widths, 375 | unnormalized_heights, 376 | unnormalized_derivatives, 377 | inverse=reverse, 378 | tails='linear', 379 | tail_bound=self.tail_bound 380 | ) 381 | 382 | x = torch.cat([x0, x1], 1) * x_mask 383 | logdet = torch.sum(logabsdet * x_mask, [1,2]) 384 | if not reverse: 385 | return x, logdet 386 | else: 387 | return x 388 | -------------------------------------------------------------------------------- /monotonic_align/__init__.py: -------------------------------------------------------------------------------- 1 | from numpy import zeros, int32, float32 2 | from torch import from_numpy 3 | 4 | from .core import maximum_path_jit 5 | 6 | def maximum_path(neg_cent, mask): 7 | """ numba optimized version. 8 | neg_cent: [b, t_t, t_s] 9 | mask: [b, t_t, t_s] 10 | """ 11 | device = neg_cent.device 12 | dtype = neg_cent.dtype 13 | neg_cent = neg_cent.data.cpu().numpy().astype(float32) 14 | path = zeros(neg_cent.shape, dtype=int32) 15 | 16 | t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(int32) 17 | t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(int32) 18 | maximum_path_jit(path, neg_cent, t_t_max, t_s_max) 19 | return from_numpy(path).to(device=device, dtype=dtype) 20 | -------------------------------------------------------------------------------- /monotonic_align/core.py: -------------------------------------------------------------------------------- 1 | import numba 2 | 3 | 4 | @numba.jit(numba.void(numba.int32[:,:,::1], numba.float32[:,:,::1], numba.int32[::1], numba.int32[::1]), nopython=True, nogil=True) 5 | def maximum_path_jit(paths, values, t_ys, t_xs): 6 | b = paths.shape[0] 7 | max_neg_val=-1e9 8 | for i in range(int(b)): 9 | path = paths[i] 10 | value = values[i] 11 | t_y = t_ys[i] 12 | t_x = t_xs[i] 13 | 14 | v_prev = v_cur = 0.0 15 | index = t_x - 1 16 | 17 | for y in range(t_y): 18 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 19 | if x == y: 20 | v_cur = max_neg_val 21 | else: 22 | v_cur = value[y-1, x] 23 | if x == 0: 24 | if y == 0: 25 | v_prev = 0. 26 | else: 27 | v_prev = max_neg_val 28 | else: 29 | v_prev = value[y-1, x-1] 30 | value[y, x] += max(v_prev, v_cur) 31 | 32 | for y in range(t_y - 1, -1, -1): 33 | path[y, index] = 1 34 | if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]): 35 | index = index - 1 36 | -------------------------------------------------------------------------------- /pictures/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avilliai/MoeGoe-Yirimirai/90273a09dc6a93947506e4d7782d10d8f86b5b9c/pictures/__init__.py -------------------------------------------------------------------------------- /pictures/agakaUa$aNaGaka.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avilliai/MoeGoe-Yirimirai/90273a09dc6a93947506e4d7782d10d8f86b5b9c/pictures/agakaUa$aNaGaka.jpg -------------------------------------------------------------------------------- /pictures/apauaraga5aqafa.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avilliai/MoeGoe-Yirimirai/90273a09dc6a93947506e4d7782d10d8f86b5b9c/pictures/apauaraga5aqafa.jpg -------------------------------------------------------------------------------- /pictures/avabaaa%aZaxa6a.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avilliai/MoeGoe-Yirimirai/90273a09dc6a93947506e4d7782d10d8f86b5b9c/pictures/avabaaa%aZaxa6a.jpg -------------------------------------------------------------------------------- /pictures/awa6aRakaka3a7a.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avilliai/MoeGoe-Yirimirai/90273a09dc6a93947506e4d7782d10d8f86b5b9c/pictures/awa6aRakaka3a7a.jpg -------------------------------------------------------------------------------- /plugins/RandomStr/RandomStr.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def random_str(random_length=6): 5 | """ 6 | 生成随机字符串作为验证码 7 | :param random_length: 字符串长度,默认为6 8 | :return: 随机字符串 9 | """ 10 | string = 'a' 11 | chars = 'AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz0123456789@$#_%' 12 | length = len(chars) - 1 13 | # random = Random() 14 | # 设置循环每次取一个字符用来生成随机数 15 | for i in range(7): 16 | string += ((chars[random.randint(0, length)])+'a') 17 | return string 18 | 19 | 20 | if __name__ == '__main__': 21 | print(random_str()) 22 | print(random_str(10)) 23 | -------------------------------------------------------------------------------- /plugins/RandomStr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avilliai/MoeGoe-Yirimirai/90273a09dc6a93947506e4d7782d10d8f86b5b9c/plugins/RandomStr/__init__.py -------------------------------------------------------------------------------- /plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avilliai/MoeGoe-Yirimirai/90273a09dc6a93947506e4d7782d10d8f86b5b9c/plugins/__init__.py -------------------------------------------------------------------------------- /plugins/picGet.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import requests 4 | 5 | from plugins.RandomStr.RandomStr import random_str 6 | 7 | url = 'https://iw233.cn/api.php?sort=yin' # 接口地址 8 | headers ={ 9 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36' 10 | } 11 | def pic(): 12 | r = requests.get(url, headers=headers) 13 | # 下载图片 14 | ranpath='' 15 | while True: 16 | ranpath = random_str() 17 | exist = os.path.isfile("pictures\\" + ranpath + ".jpg") 18 | direxist =os.path.isdir("pictures") 19 | if direxist: 20 | if exist: 21 | continue 22 | else: 23 | break 24 | else: 25 | os.mkdir("pictures") 26 | continue 27 | 28 | with open("pictures\\" + ranpath + ".jpg", mode="wb") as f: 29 | f.write(r.content) # 图片内容写入文件 30 | return "pictures\\" + ranpath + ".jpg" 31 | if __name__ == '__main__': 32 | s=input("输入1开始执行") 33 | i=0 34 | if s=="1": 35 | while i<=10: 36 | pic() 37 | i+=1 -------------------------------------------------------------------------------- /plugins/voicePart.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import os 4 | import datetime 5 | import random 6 | import time 7 | import sys 8 | 9 | from mirai import Image, Voice 10 | from mirai import Mirai, WebSocketAdapter, FriendMessage, GroupMessage, At, Plain 11 | 12 | from MoeGoe import voiceGenerate 13 | from plugins.RandomStr.RandomStr import random_str 14 | from trans import translate 15 | def main(bot): 16 | time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 17 | print(time + '| voiceGenerate module loaded successfully 已加载--- 语音生成 ---模块') 18 | # 中文生成1 19 | global voiceSender 20 | voiceSender = 0 21 | global voiceTrans 22 | voiceTrans = 0 23 | global modelSelect 24 | modelSelect=0 25 | global yuukaSaid 26 | yuukaSaid=0 27 | 28 | @bot.on(GroupMessage) 29 | async def handle_group_message(event: GroupMessage): 30 | if str(event.message_chain).startswith('中文'): 31 | modelList = ['0', '1', '2', '3'] 32 | if len(str(event.message_chain)) < 60: 33 | if '#' in str(event.message_chain): 34 | textt = str(event.message_chain).split("#") 35 | if textt[1] in modelList: 36 | model = int(textt[1]) 37 | tex = '[ZH]' + ((textt[0])[2:]) + '[ZH]' 38 | else: 39 | model = 0 40 | tex = '[ZH]' + (str(event.message_chain)[2:]) + '[ZH]' 41 | else: 42 | tex = '[ZH]' + (str(event.message_chain)[2:]) + '[ZH]' 43 | model = 0 44 | ranpath = random_str() 45 | out ='plugins\\voices\\' + ranpath + '.wav' 46 | time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 47 | print(time + '| 中文语音生成-----> ' +tex) 48 | voiceGenerate(tex, out, model) 49 | await bot.send(event, Voice(path=out)) 50 | else: 51 | ranpath = random_str() 52 | out ='plugins\\voices\\' + ranpath + '.wav' 53 | tex = '[ZH]太常了哦......[ZH]' 54 | time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 55 | print(time + '| 中文语音生成-----> ' + tex) 56 | voiceGenerate(tex, out) 57 | await bot.send(event, Voice(path=out)) 58 | 59 | # 日语生成 60 | @bot.on(GroupMessage) 61 | async def handle_group_message(event: GroupMessage): 62 | if str(event.message_chain).startswith('说'): 63 | global modelSelect 64 | modelList = ['0', '1', '2', '3'] 65 | if len(str(event.message_chain)) < 70: 66 | if '#' in str(event.message_chain): 67 | textt = str(event.message_chain).split("#") 68 | if textt[1] in modelList: 69 | model = int(textt[1]) 70 | tex = '[JA]' + translate((textt[0])[1:]) + '[JA]' 71 | else: 72 | model = 0 73 | tex = '[JA]' + translate(str(event.message_chain)[1:]) + '[JA]' 74 | else: 75 | tex = '[JA]' + translate(str(event.message_chain)[1:]) + '[JA]' 76 | model = 0 77 | ranpath = random_str() 78 | out ='plugins\\voices\\' + ranpath + '.wav' 79 | time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 80 | print(time + '| 日语语音生成-----> ' + tex) 81 | if modelSelect==1: 82 | tex=tex.replace('[JA]','') 83 | else: 84 | pass 85 | voiceGenerate(tex, out, model,modelSelect) 86 | 87 | modelSelect = 0 88 | await bot.send(event, Voice(path=out)) 89 | else: 90 | ranpath = random_str() 91 | out = 'plugins\\voices\\' + ranpath + '.wav' 92 | tex = '[JA]' + translate('不行,太长了哦.....') + '[JA]' 93 | time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 94 | print(time + '| 日语语音生成-----> ' + tex) 95 | if modelSelect==1: 96 | tex=tex.replace('[JA]','') 97 | yuukaSaid+=1 98 | else: 99 | pass 100 | voiceGenerate(tex, out,0,modelSelect) 101 | if yuukaSaid==3: 102 | modelSelect = 0 103 | else: 104 | pass 105 | await bot.send(event, Voice(path=out)) 106 | @bot.on(GroupMessage) 107 | async def yuukaVoiceModelSelecter(event: GroupMessage): 108 | if str(event.message_chain)=='modelSet=1': 109 | global modelSelect 110 | modelSelect=1 111 | await bot.send(event,'已切换至ユウカ(优香)语音模型\n接下来三次语音生成任务默认使用优香语音模型') 112 | 113 | @bot.on(GroupMessage) 114 | async def yuukaVoiceModelSelecter(event: GroupMessage): 115 | if str(event.message_chain).startswith('优香说'): 116 | tex=str(event.message_chain)[3:] 117 | tex=translate(tex) 118 | ranpath = random_str() 119 | out = 'plugins\\voices\\' + ranpath + '.wav' 120 | voiceGenerate(tex, out, 0, 1) 121 | await bot.send(event, Voice(path=out)) 122 | 123 | if str(event.message_chain).startswith('邮箱说'): 124 | tex=str(event.message_chain)[3:] 125 | ranpath = random_str() 126 | out = 'plugins\\voices\\' + ranpath + '.wav' 127 | voiceGenerate(tex, out, 0, 1) 128 | await bot.send(event, Voice(path=out)) 129 | 130 | 131 | # 语音转换 132 | '''@bot.on(GroupMessage) 133 | async def voiceTan(event: GroupMessage): 134 | if str(event.message_chain) == '语音转换': 135 | global voiceSender 136 | voiceSender = event.sender.id 137 | global voiceTrans 138 | voiceTrans = 2 139 | await bot.send(event, '请发送语音') 140 | 141 | # 语音转化附件 142 | @bot.on(GroupMessage) 143 | async def voicetransa(event: GroupMessage): 144 | global voiceSender 145 | global voiceTrans 146 | if event.message_chain.count(Voice): 147 | if voiceTrans == 2: 148 | if voiceSender == event.sender.id: 149 | s = event.message_chain.get(Voice) 150 | await Voice.download(s[0], 'plugins/voices/sing/rest.silk') 151 | silkcoder.decode("plugins/voices/sing/rest.silk", "plugins/voices/sing/rest.wav", 152 | ffmpeg_para=["-ar", "44100"]) 153 | print('over') 154 | paths = voice_conversion("plugins/voices/sing/rest.wav") 155 | await bot.send(event, Voice(path=paths)) 156 | voiceSender = 0 157 | voiceTrans = 0''' 158 | 159 | # 好友日语生成,因腾讯版本更新再不可用 160 | '''@bot.on(FriendMessage) 161 | async def handle_group_message(event: FriendMessage): 162 | if str(event.message_chain).startswith('说'): 163 | modelList = ['0', '1', '2', '3'] 164 | if len(str(event.message_chain)) < 280: 165 | if '#' in str(event.message_chain): 166 | textt = str(event.message_chain).split("#") 167 | if textt[1] in modelList: 168 | model = int(textt[1]) 169 | tex = '[JA]' + translate((textt[0])[1:]) + '[JA]' 170 | else: 171 | model = 0 172 | tex = '[JA]' + translate(str(event.message_chain)[1:]) + '[JA]' 173 | else: 174 | tex = '[JA]' + translate(str(event.message_chain)[1:]) + '[JA]' 175 | model = 0 176 | ranpath = random_str() 177 | out ='PythonPlugins\\plugins\\voices\\' + ranpath + '.wav' 178 | voiceGenerate(tex, out, model) 179 | await bot.send(event, Voice(path=out)) 180 | else: 181 | ranpath = random_str() 182 | out = 'PythonPlugins\\plugins\\voices\\' + ranpath + '.wav' 183 | tex = '[JA]' + translate('不行,太长了哦.....') + '[JA]' 184 | voiceGenerate(tex, out) 185 | await bot.send(event, Voice(path=out))''' 186 | 187 | -------------------------------------------------------------------------------- /plugins/voices/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/avilliai/MoeGoe-Yirimirai/90273a09dc6a93947506e4d7782d10d8f86b5b9c/plugins/voices/__init__.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiofiles==0.7.0 2 | altgraph==0.17.3 3 | anyio==3.6.2 4 | appdirs==1.4.4 5 | audioread==3.0.0 6 | backports.functools-lru-cache==1.6.4 7 | certifi==2022.9.24 8 | cffi==1.15.1 9 | charset-normalizer==2.1.1 10 | click==8.1.3 11 | cn2an==0.5.19 12 | colorama==0.4.6 13 | Cython==0.29.32 14 | decorator==5.1.1 15 | eng-to-ipa==0.0.2 16 | Flask==2.2.2 17 | future==0.18.2 18 | h11==0.14.0 19 | httpcore==0.16.2 20 | httpx==0.23.1 21 | idna==3.4 22 | importlib-metadata==5.1.0 23 | indic-transliteration==2.3.40 24 | inflect==6.0.2 25 | itsdangerous==2.1.2 26 | jamo==0.4.1 27 | jieba==0.42.1 28 | Jinja2==3.1.2 29 | joblib==1.2.0 30 | ko-pron==1.3 31 | librosa==0.9.2 32 | llvmlite==0.39.1 33 | MarkupSafe==2.1.1 34 | num-thai==0.0.5 35 | numba==0.56.4 36 | numpy==1.22.0 37 | OpenCC==1.1.1 38 | openjtalk==0.3.0.dev2 39 | packaging==21.3 40 | pefile==2022.5.30 41 | pooch==1.6.0 42 | proces==0.1.3 43 | protobuf==4.21.9 44 | pycparser==2.21 45 | pydantic==1.10.2 46 | pyinstaller==5.6.2 47 | pyinstaller-hooks-contrib==2022.13 48 | pyparsing==3.0.9 49 | pypinyin==0.47.1 50 | pywin32-ctypes==0.2.0 51 | regex==2022.10.31 52 | requests==2.28.1 53 | resampy==0.4.2 54 | rfc3986==1.5.0 55 | roman==3.3 56 | scikit-learn==1.1.3 57 | scipy==1.9.3 58 | six==1.16.0 59 | sniffio==1.3.0 60 | soundfile==0.11.0 61 | starlette==0.22.0 62 | threadpoolctl==3.1.0 63 | toml==0.10.2 64 | torch==1.13.0 65 | tqdm==4.64.1 66 | typer==0.7.0 67 | typing_extensions==4.4.0 68 | Unidecode==1.3.6 69 | urllib3==1.26.13 70 | websockets==10.4 71 | Werkzeug==2.2.2 72 | yiri-mirai==0.2.7 73 | zipp==3.11.0 74 | -------------------------------------------------------------------------------- /text/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | from text import cleaners 3 | 4 | 5 | def text_to_sequence(text, symbols, cleaner_names): 6 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 7 | Args: 8 | text: string to convert to a sequence 9 | cleaner_names: names of the cleaner functions to run the text through 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | ''' 13 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 14 | 15 | sequence = [] 16 | 17 | clean_text = _clean_text(text, cleaner_names) 18 | for symbol in clean_text: 19 | if symbol not in _symbol_to_id.keys(): 20 | continue 21 | symbol_id = _symbol_to_id[symbol] 22 | sequence += [symbol_id] 23 | return sequence 24 | 25 | 26 | def _clean_text(text, cleaner_names): 27 | for name in cleaner_names: 28 | cleaner = getattr(cleaners, name) 29 | if not cleaner: 30 | raise Exception('Unknown cleaner: %s' % name) 31 | text = cleaner(text) 32 | return text 33 | -------------------------------------------------------------------------------- /text/cantonese.py: -------------------------------------------------------------------------------- 1 | import re 2 | import cn2an 3 | import opencc 4 | 5 | 6 | converter = opencc.OpenCC('jyutjyu') 7 | 8 | # List of (Latin alphabet, ipa) pairs: 9 | _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 10 | ('A', 'ei˥'), 11 | ('B', 'biː˥'), 12 | ('C', 'siː˥'), 13 | ('D', 'tiː˥'), 14 | ('E', 'iː˥'), 15 | ('F', 'e˥fuː˨˩'), 16 | ('G', 'tsiː˥'), 17 | ('H', 'ɪk̚˥tsʰyː˨˩'), 18 | ('I', 'ɐi˥'), 19 | ('J', 'tsei˥'), 20 | ('K', 'kʰei˥'), 21 | ('L', 'e˥llou˨˩'), 22 | ('M', 'ɛːm˥'), 23 | ('N', 'ɛːn˥'), 24 | ('O', 'ou˥'), 25 | ('P', 'pʰiː˥'), 26 | ('Q', 'kʰiːu˥'), 27 | ('R', 'aː˥lou˨˩'), 28 | ('S', 'ɛː˥siː˨˩'), 29 | ('T', 'tʰiː˥'), 30 | ('U', 'juː˥'), 31 | ('V', 'wiː˥'), 32 | ('W', 'tʊk̚˥piː˥juː˥'), 33 | ('X', 'ɪk̚˥siː˨˩'), 34 | ('Y', 'waːi˥'), 35 | ('Z', 'iː˨sɛːt̚˥') 36 | ]] 37 | 38 | 39 | def number_to_cantonese(text): 40 | return re.sub(r'\d+(?:\.?\d+)?', lambda x: cn2an.an2cn(x.group()), text) 41 | 42 | 43 | def latin_to_ipa(text): 44 | for regex, replacement in _latin_to_ipa: 45 | text = re.sub(regex, replacement, text) 46 | return text 47 | 48 | 49 | def cantonese_to_ipa(text): 50 | text = number_to_cantonese(text.upper()) 51 | text = converter.convert(text).replace('-','').replace('$',' ') 52 | text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text) 53 | text = re.sub(r'[、;:]', ',', text) 54 | text = re.sub(r'\s*,\s*', ', ', text) 55 | text = re.sub(r'\s*。\s*', '. ', text) 56 | text = re.sub(r'\s*?\s*', '? ', text) 57 | text = re.sub(r'\s*!\s*', '! ', text) 58 | text = re.sub(r'\s*$', '', text) 59 | return text 60 | -------------------------------------------------------------------------------- /text/cleaners.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def japanese_cleaners(text): 5 | from text.japanese import japanese_to_romaji_with_accent 6 | text = japanese_to_romaji_with_accent(text) 7 | text = re.sub(r'([A-Za-z])$', r'\1.', text) 8 | return text 9 | 10 | 11 | def japanese_cleaners2(text): 12 | return japanese_cleaners(text).replace('ts', 'ʦ').replace('...', '…') 13 | 14 | 15 | def korean_cleaners(text): 16 | '''Pipeline for Korean text''' 17 | from text.korean import latin_to_hangul, number_to_hangul, divide_hangul 18 | text = latin_to_hangul(text) 19 | text = number_to_hangul(text) 20 | text = divide_hangul(text) 21 | text = re.sub(r'([\u3131-\u3163])$', r'\1.', text) 22 | return text 23 | 24 | 25 | def chinese_cleaners(text): 26 | '''Pipeline for Chinese text''' 27 | from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo 28 | text = number_to_chinese(text) 29 | text = chinese_to_bopomofo(text) 30 | text = latin_to_bopomofo(text) 31 | text = re.sub(r'([ˉˊˇˋ˙])$', r'\1。', text) 32 | return text 33 | 34 | 35 | def zh_ja_mixture_cleaners(text): 36 | from text.mandarin import chinese_to_romaji 37 | from text.japanese import japanese_to_romaji_with_accent 38 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 39 | lambda x: chinese_to_romaji(x.group(1))+' ', text) 40 | text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_romaji_with_accent( 41 | x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')+' ', text) 42 | text = re.sub(r'\s+$', '', text) 43 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 44 | return text 45 | 46 | 47 | def sanskrit_cleaners(text): 48 | text = text.replace('॥', '।').replace('ॐ', 'ओम्') 49 | if text[-1] != '।': 50 | text += ' ।' 51 | return text 52 | 53 | 54 | def cjks_cleaners(text): 55 | from text.mandarin import chinese_to_lazy_ipa 56 | from text.japanese import japanese_to_ipa 57 | from text.korean import korean_to_lazy_ipa 58 | from text.sanskrit import devanagari_to_ipa 59 | from text.english import english_to_lazy_ipa 60 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 61 | lambda x: chinese_to_lazy_ipa(x.group(1))+' ', text) 62 | text = re.sub(r'\[JA\](.*?)\[JA\]', 63 | lambda x: japanese_to_ipa(x.group(1))+' ', text) 64 | text = re.sub(r'\[KO\](.*?)\[KO\]', 65 | lambda x: korean_to_lazy_ipa(x.group(1))+' ', text) 66 | text = re.sub(r'\[SA\](.*?)\[SA\]', 67 | lambda x: devanagari_to_ipa(x.group(1))+' ', text) 68 | text = re.sub(r'\[EN\](.*?)\[EN\]', 69 | lambda x: english_to_lazy_ipa(x.group(1))+' ', text) 70 | text = re.sub(r'\s+$', '', text) 71 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 72 | return text 73 | 74 | 75 | def cjke_cleaners(text): 76 | from text.mandarin import chinese_to_lazy_ipa 77 | from text.japanese import japanese_to_ipa 78 | from text.korean import korean_to_ipa 79 | from text.english import english_to_ipa2 80 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace( 81 | 'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')+' ', text) 82 | text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace( 83 | 'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')+' ', text) 84 | text = re.sub(r'\[KO\](.*?)\[KO\]', 85 | lambda x: korean_to_ipa(x.group(1))+' ', text) 86 | text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace( 87 | 'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')+' ', text) 88 | text = re.sub(r'\s+$', '', text) 89 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 90 | return text 91 | 92 | 93 | def cjke_cleaners2(text): 94 | from text.mandarin import chinese_to_ipa 95 | from text.japanese import japanese_to_ipa2 96 | from text.korean import korean_to_ipa 97 | from text.english import english_to_ipa2 98 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 99 | lambda x: chinese_to_ipa(x.group(1))+' ', text) 100 | text = re.sub(r'\[JA\](.*?)\[JA\]', 101 | lambda x: japanese_to_ipa2(x.group(1))+' ', text) 102 | text = re.sub(r'\[KO\](.*?)\[KO\]', 103 | lambda x: korean_to_ipa(x.group(1))+' ', text) 104 | text = re.sub(r'\[EN\](.*?)\[EN\]', 105 | lambda x: english_to_ipa2(x.group(1))+' ', text) 106 | text = re.sub(r'\s+$', '', text) 107 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 108 | return text 109 | 110 | 111 | def thai_cleaners(text): 112 | from text.thai import num_to_thai, latin_to_thai 113 | text = num_to_thai(text) 114 | text = latin_to_thai(text) 115 | return text 116 | 117 | 118 | def shanghainese_cleaners(text): 119 | from text.shanghainese import shanghainese_to_ipa 120 | text = shanghainese_to_ipa(text) 121 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 122 | return text 123 | 124 | 125 | def chinese_dialect_cleaners(text): 126 | from text.mandarin import chinese_to_ipa2 127 | from text.japanese import japanese_to_ipa3 128 | from text.shanghainese import shanghainese_to_ipa 129 | from text.cantonese import cantonese_to_ipa 130 | from text.english import english_to_lazy_ipa2 131 | from text.ngu_dialect import ngu_dialect_to_ipa 132 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 133 | lambda x: chinese_to_ipa2(x.group(1))+' ', text) 134 | text = re.sub(r'\[JA\](.*?)\[JA\]', 135 | lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text) 136 | text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5', 137 | '˧˧˦').replace('6', '˩˩˧').replace('7', '˥').replace('8', '˩˨').replace('ᴀ', 'ɐ').replace('ᴇ', 'e')+' ', text) 138 | text = re.sub(r'\[GD\](.*?)\[GD\]', 139 | lambda x: cantonese_to_ipa(x.group(1))+' ', text) 140 | text = re.sub(r'\[EN\](.*?)\[EN\]', 141 | lambda x: english_to_lazy_ipa2(x.group(1))+' ', text) 142 | text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', lambda x: ngu_dialect_to_ipa(x.group(2), x.group( 143 | 1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ')+' ', text) 144 | text = re.sub(r'\s+$', '', text) 145 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 146 | return text 147 | -------------------------------------------------------------------------------- /text/english.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 13 | ''' 14 | 15 | 16 | # Regular expression matching whitespace: 17 | 18 | 19 | import re 20 | import inflect 21 | from unidecode import unidecode 22 | import eng_to_ipa as ipa 23 | _inflect = inflect.engine() 24 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 25 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 26 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 27 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 28 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 29 | _number_re = re.compile(r'[0-9]+') 30 | 31 | # List of (regular expression, replacement) pairs for abbreviations: 32 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ 33 | ('mrs', 'misess'), 34 | ('mr', 'mister'), 35 | ('dr', 'doctor'), 36 | ('st', 'saint'), 37 | ('co', 'company'), 38 | ('jr', 'junior'), 39 | ('maj', 'major'), 40 | ('gen', 'general'), 41 | ('drs', 'doctors'), 42 | ('rev', 'reverend'), 43 | ('lt', 'lieutenant'), 44 | ('hon', 'honorable'), 45 | ('sgt', 'sergeant'), 46 | ('capt', 'captain'), 47 | ('esq', 'esquire'), 48 | ('ltd', 'limited'), 49 | ('col', 'colonel'), 50 | ('ft', 'fort'), 51 | ]] 52 | 53 | 54 | # List of (ipa, lazy ipa) pairs: 55 | _lazy_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 56 | ('r', 'ɹ'), 57 | ('æ', 'e'), 58 | ('ɑ', 'a'), 59 | ('ɔ', 'o'), 60 | ('ð', 'z'), 61 | ('θ', 's'), 62 | ('ɛ', 'e'), 63 | ('ɪ', 'i'), 64 | ('ʊ', 'u'), 65 | ('ʒ', 'ʥ'), 66 | ('ʤ', 'ʥ'), 67 | ('ˈ', '↓'), 68 | ]] 69 | 70 | # List of (ipa, lazy ipa2) pairs: 71 | _lazy_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 72 | ('r', 'ɹ'), 73 | ('ð', 'z'), 74 | ('θ', 's'), 75 | ('ʒ', 'ʑ'), 76 | ('ʤ', 'dʑ'), 77 | ('ˈ', '↓'), 78 | ]] 79 | 80 | # List of (ipa, ipa2) pairs 81 | _ipa_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 82 | ('r', 'ɹ'), 83 | ('ʤ', 'dʒ'), 84 | ('ʧ', 'tʃ') 85 | ]] 86 | 87 | 88 | def expand_abbreviations(text): 89 | for regex, replacement in _abbreviations: 90 | text = re.sub(regex, replacement, text) 91 | return text 92 | 93 | 94 | def collapse_whitespace(text): 95 | return re.sub(r'\s+', ' ', text) 96 | 97 | 98 | def _remove_commas(m): 99 | return m.group(1).replace(',', '') 100 | 101 | 102 | def _expand_decimal_point(m): 103 | return m.group(1).replace('.', ' point ') 104 | 105 | 106 | def _expand_dollars(m): 107 | match = m.group(1) 108 | parts = match.split('.') 109 | if len(parts) > 2: 110 | return match + ' dollars' # Unexpected format 111 | dollars = int(parts[0]) if parts[0] else 0 112 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 113 | if dollars and cents: 114 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 115 | cent_unit = 'cent' if cents == 1 else 'cents' 116 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 117 | elif dollars: 118 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 119 | return '%s %s' % (dollars, dollar_unit) 120 | elif cents: 121 | cent_unit = 'cent' if cents == 1 else 'cents' 122 | return '%s %s' % (cents, cent_unit) 123 | else: 124 | return 'zero dollars' 125 | 126 | 127 | def _expand_ordinal(m): 128 | return _inflect.number_to_words(m.group(0)) 129 | 130 | 131 | def _expand_number(m): 132 | num = int(m.group(0)) 133 | if num > 1000 and num < 3000: 134 | if num == 2000: 135 | return 'two thousand' 136 | elif num > 2000 and num < 2010: 137 | return 'two thousand ' + _inflect.number_to_words(num % 100) 138 | elif num % 100 == 0: 139 | return _inflect.number_to_words(num // 100) + ' hundred' 140 | else: 141 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 142 | else: 143 | return _inflect.number_to_words(num, andword='') 144 | 145 | 146 | def normalize_numbers(text): 147 | text = re.sub(_comma_number_re, _remove_commas, text) 148 | text = re.sub(_pounds_re, r'\1 pounds', text) 149 | text = re.sub(_dollars_re, _expand_dollars, text) 150 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 151 | text = re.sub(_ordinal_re, _expand_ordinal, text) 152 | text = re.sub(_number_re, _expand_number, text) 153 | return text 154 | 155 | 156 | def mark_dark_l(text): 157 | return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ'+x.group(1), text) 158 | 159 | 160 | def english_to_ipa(text): 161 | text = unidecode(text).lower() 162 | text = expand_abbreviations(text) 163 | text = normalize_numbers(text) 164 | phonemes = ipa.convert(text) 165 | phonemes = collapse_whitespace(phonemes) 166 | return phonemes 167 | 168 | 169 | def english_to_lazy_ipa(text): 170 | text = english_to_ipa(text) 171 | for regex, replacement in _lazy_ipa: 172 | text = re.sub(regex, replacement, text) 173 | return text 174 | 175 | 176 | def english_to_ipa2(text): 177 | text = english_to_ipa(text) 178 | text = mark_dark_l(text) 179 | for regex, replacement in _ipa_to_ipa2: 180 | text = re.sub(regex, replacement, text) 181 | return text.replace('...', '…') 182 | 183 | 184 | def english_to_lazy_ipa2(text): 185 | text = english_to_ipa(text) 186 | for regex, replacement in _lazy_ipa2: 187 | text = re.sub(regex, replacement, text) 188 | return text 189 | -------------------------------------------------------------------------------- /text/japanese.py: -------------------------------------------------------------------------------- 1 | import re 2 | from unidecode import unidecode 3 | import pyopenjtalk 4 | 5 | 6 | # Regular expression matching Japanese without punctuation marks: 7 | _japanese_characters = re.compile( 8 | r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 9 | 10 | # Regular expression matching non-Japanese characters or punctuation marks: 11 | _japanese_marks = re.compile( 12 | r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 13 | 14 | # List of (symbol, Japanese) pairs for marks: 15 | _symbols_to_japanese = [(re.compile('%s' % x[0]), x[1]) for x in [ 16 | ('%', 'パーセント') 17 | ]] 18 | 19 | # List of (romaji, ipa) pairs for marks: 20 | _romaji_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 21 | ('ts', 'ʦ'), 22 | ('u', 'ɯ'), 23 | ('j', 'ʥ'), 24 | ('y', 'j'), 25 | ('ni', 'n^i'), 26 | ('nj', 'n^'), 27 | ('hi', 'çi'), 28 | ('hj', 'ç'), 29 | ('f', 'ɸ'), 30 | ('I', 'i*'), 31 | ('U', 'ɯ*'), 32 | ('r', 'ɾ') 33 | ]] 34 | 35 | # List of (romaji, ipa2) pairs for marks: 36 | _romaji_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 37 | ('u', 'ɯ'), 38 | ('ʧ', 'tʃ'), 39 | ('j', 'dʑ'), 40 | ('y', 'j'), 41 | ('ni', 'n^i'), 42 | ('nj', 'n^'), 43 | ('hi', 'çi'), 44 | ('hj', 'ç'), 45 | ('f', 'ɸ'), 46 | ('I', 'i*'), 47 | ('U', 'ɯ*'), 48 | ('r', 'ɾ') 49 | ]] 50 | 51 | # List of (consonant, sokuon) pairs: 52 | _real_sokuon = [(re.compile('%s' % x[0]), x[1]) for x in [ 53 | (r'Q([↑↓]*[kg])', r'k#\1'), 54 | (r'Q([↑↓]*[tdjʧ])', r't#\1'), 55 | (r'Q([↑↓]*[sʃ])', r's\1'), 56 | (r'Q([↑↓]*[pb])', r'p#\1') 57 | ]] 58 | 59 | # List of (consonant, hatsuon) pairs: 60 | _real_hatsuon = [(re.compile('%s' % x[0]), x[1]) for x in [ 61 | (r'N([↑↓]*[pbm])', r'm\1'), 62 | (r'N([↑↓]*[ʧʥj])', r'n^\1'), 63 | (r'N([↑↓]*[tdn])', r'n\1'), 64 | (r'N([↑↓]*[kg])', r'ŋ\1') 65 | ]] 66 | 67 | 68 | def symbols_to_japanese(text): 69 | for regex, replacement in _symbols_to_japanese: 70 | text = re.sub(regex, replacement, text) 71 | return text 72 | 73 | 74 | def japanese_to_romaji_with_accent(text): 75 | '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html''' 76 | text = symbols_to_japanese(text) 77 | sentences = re.split(_japanese_marks, text) 78 | marks = re.findall(_japanese_marks, text) 79 | text = '' 80 | for i, sentence in enumerate(sentences): 81 | if re.match(_japanese_characters, sentence): 82 | if text != '': 83 | text += ' ' 84 | labels = pyopenjtalk.extract_fullcontext(sentence) 85 | for n, label in enumerate(labels): 86 | phoneme = re.search(r'\-([^\+]*)\+', label).group(1) 87 | if phoneme not in ['sil', 'pau']: 88 | text += phoneme.replace('ch', 'ʧ').replace('sh', 89 | 'ʃ').replace('cl', 'Q') 90 | else: 91 | continue 92 | # n_moras = int(re.search(r'/F:(\d+)_', label).group(1)) 93 | a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1)) 94 | a2 = int(re.search(r"\+(\d+)\+", label).group(1)) 95 | a3 = int(re.search(r"\+(\d+)/", label).group(1)) 96 | if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil', 'pau']: 97 | a2_next = -1 98 | else: 99 | a2_next = int( 100 | re.search(r"\+(\d+)\+", labels[n + 1]).group(1)) 101 | # Accent phrase boundary 102 | if a3 == 1 and a2_next == 1: 103 | text += ' ' 104 | # Falling 105 | elif a1 == 0 and a2_next == a2 + 1: 106 | text += '↓' 107 | # Rising 108 | elif a2 == 1 and a2_next == 2: 109 | text += '↑' 110 | if i < len(marks): 111 | text += unidecode(marks[i]).replace(' ', '') 112 | return text 113 | 114 | 115 | def get_real_sokuon(text): 116 | for regex, replacement in _real_sokuon: 117 | text = re.sub(regex, replacement, text) 118 | return text 119 | 120 | 121 | def get_real_hatsuon(text): 122 | for regex, replacement in _real_hatsuon: 123 | text = re.sub(regex, replacement, text) 124 | return text 125 | 126 | 127 | def japanese_to_ipa(text): 128 | text = japanese_to_romaji_with_accent(text).replace('...', '…') 129 | text = re.sub( 130 | r'([aiueo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text) 131 | text = get_real_sokuon(text) 132 | text = get_real_hatsuon(text) 133 | for regex, replacement in _romaji_to_ipa: 134 | text = re.sub(regex, replacement, text) 135 | return text 136 | 137 | 138 | def japanese_to_ipa2(text): 139 | text = japanese_to_romaji_with_accent(text).replace('...', '…') 140 | text = get_real_sokuon(text) 141 | text = get_real_hatsuon(text) 142 | for regex, replacement in _romaji_to_ipa2: 143 | text = re.sub(regex, replacement, text) 144 | return text 145 | 146 | 147 | def japanese_to_ipa3(text): 148 | text = japanese_to_ipa2(text).replace('n^', 'ȵ').replace( 149 | 'ʃ', 'ɕ').replace('*', '\u0325').replace('#', '\u031a') 150 | text = re.sub( 151 | r'([aiɯeo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text) 152 | text = re.sub(r'((?:^|\s)(?:ts|tɕ|[kpt]))', r'\1ʰ', text) 153 | return text 154 | -------------------------------------------------------------------------------- /text/korean.py: -------------------------------------------------------------------------------- 1 | import re 2 | from jamo import h2j, j2hcj 3 | import ko_pron 4 | 5 | 6 | # This is a list of Korean classifiers preceded by pure Korean numerals. 7 | _korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통' 8 | 9 | # List of (hangul, hangul divided) pairs: 10 | _hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [ 11 | ('ㄳ', 'ㄱㅅ'), 12 | ('ㄵ', 'ㄴㅈ'), 13 | ('ㄶ', 'ㄴㅎ'), 14 | ('ㄺ', 'ㄹㄱ'), 15 | ('ㄻ', 'ㄹㅁ'), 16 | ('ㄼ', 'ㄹㅂ'), 17 | ('ㄽ', 'ㄹㅅ'), 18 | ('ㄾ', 'ㄹㅌ'), 19 | ('ㄿ', 'ㄹㅍ'), 20 | ('ㅀ', 'ㄹㅎ'), 21 | ('ㅄ', 'ㅂㅅ'), 22 | ('ㅘ', 'ㅗㅏ'), 23 | ('ㅙ', 'ㅗㅐ'), 24 | ('ㅚ', 'ㅗㅣ'), 25 | ('ㅝ', 'ㅜㅓ'), 26 | ('ㅞ', 'ㅜㅔ'), 27 | ('ㅟ', 'ㅜㅣ'), 28 | ('ㅢ', 'ㅡㅣ'), 29 | ('ㅑ', 'ㅣㅏ'), 30 | ('ㅒ', 'ㅣㅐ'), 31 | ('ㅕ', 'ㅣㅓ'), 32 | ('ㅖ', 'ㅣㅔ'), 33 | ('ㅛ', 'ㅣㅗ'), 34 | ('ㅠ', 'ㅣㅜ') 35 | ]] 36 | 37 | # List of (Latin alphabet, hangul) pairs: 38 | _latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 39 | ('a', '에이'), 40 | ('b', '비'), 41 | ('c', '시'), 42 | ('d', '디'), 43 | ('e', '이'), 44 | ('f', '에프'), 45 | ('g', '지'), 46 | ('h', '에이치'), 47 | ('i', '아이'), 48 | ('j', '제이'), 49 | ('k', '케이'), 50 | ('l', '엘'), 51 | ('m', '엠'), 52 | ('n', '엔'), 53 | ('o', '오'), 54 | ('p', '피'), 55 | ('q', '큐'), 56 | ('r', '아르'), 57 | ('s', '에스'), 58 | ('t', '티'), 59 | ('u', '유'), 60 | ('v', '브이'), 61 | ('w', '더블유'), 62 | ('x', '엑스'), 63 | ('y', '와이'), 64 | ('z', '제트') 65 | ]] 66 | 67 | # List of (ipa, lazy ipa) pairs: 68 | _ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 69 | ('t͡ɕ','ʧ'), 70 | ('d͡ʑ','ʥ'), 71 | ('ɲ','n^'), 72 | ('ɕ','ʃ'), 73 | ('ʷ','w'), 74 | ('ɭ','l`'), 75 | ('ʎ','ɾ'), 76 | ('ɣ','ŋ'), 77 | ('ɰ','ɯ'), 78 | ('ʝ','j'), 79 | ('ʌ','ə'), 80 | ('ɡ','g'), 81 | ('\u031a','#'), 82 | ('\u0348','='), 83 | ('\u031e',''), 84 | ('\u0320',''), 85 | ('\u0339','') 86 | ]] 87 | 88 | 89 | def latin_to_hangul(text): 90 | for regex, replacement in _latin_to_hangul: 91 | text = re.sub(regex, replacement, text) 92 | return text 93 | 94 | 95 | def divide_hangul(text): 96 | text = j2hcj(h2j(text)) 97 | for regex, replacement in _hangul_divided: 98 | text = re.sub(regex, replacement, text) 99 | return text 100 | 101 | 102 | def hangul_number(num, sino=True): 103 | '''Reference https://github.com/Kyubyong/g2pK''' 104 | num = re.sub(',', '', num) 105 | 106 | if num == '0': 107 | return '영' 108 | if not sino and num == '20': 109 | return '스무' 110 | 111 | digits = '123456789' 112 | names = '일이삼사오육칠팔구' 113 | digit2name = {d: n for d, n in zip(digits, names)} 114 | 115 | modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉' 116 | decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔' 117 | digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())} 118 | digit2dec = {d: dec for d, dec in zip(digits, decimals.split())} 119 | 120 | spelledout = [] 121 | for i, digit in enumerate(num): 122 | i = len(num) - i - 1 123 | if sino: 124 | if i == 0: 125 | name = digit2name.get(digit, '') 126 | elif i == 1: 127 | name = digit2name.get(digit, '') + '십' 128 | name = name.replace('일십', '십') 129 | else: 130 | if i == 0: 131 | name = digit2mod.get(digit, '') 132 | elif i == 1: 133 | name = digit2dec.get(digit, '') 134 | if digit == '0': 135 | if i % 4 == 0: 136 | last_three = spelledout[-min(3, len(spelledout)):] 137 | if ''.join(last_three) == '': 138 | spelledout.append('') 139 | continue 140 | else: 141 | spelledout.append('') 142 | continue 143 | if i == 2: 144 | name = digit2name.get(digit, '') + '백' 145 | name = name.replace('일백', '백') 146 | elif i == 3: 147 | name = digit2name.get(digit, '') + '천' 148 | name = name.replace('일천', '천') 149 | elif i == 4: 150 | name = digit2name.get(digit, '') + '만' 151 | name = name.replace('일만', '만') 152 | elif i == 5: 153 | name = digit2name.get(digit, '') + '십' 154 | name = name.replace('일십', '십') 155 | elif i == 6: 156 | name = digit2name.get(digit, '') + '백' 157 | name = name.replace('일백', '백') 158 | elif i == 7: 159 | name = digit2name.get(digit, '') + '천' 160 | name = name.replace('일천', '천') 161 | elif i == 8: 162 | name = digit2name.get(digit, '') + '억' 163 | elif i == 9: 164 | name = digit2name.get(digit, '') + '십' 165 | elif i == 10: 166 | name = digit2name.get(digit, '') + '백' 167 | elif i == 11: 168 | name = digit2name.get(digit, '') + '천' 169 | elif i == 12: 170 | name = digit2name.get(digit, '') + '조' 171 | elif i == 13: 172 | name = digit2name.get(digit, '') + '십' 173 | elif i == 14: 174 | name = digit2name.get(digit, '') + '백' 175 | elif i == 15: 176 | name = digit2name.get(digit, '') + '천' 177 | spelledout.append(name) 178 | return ''.join(elem for elem in spelledout) 179 | 180 | 181 | def number_to_hangul(text): 182 | '''Reference https://github.com/Kyubyong/g2pK''' 183 | tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text)) 184 | for token in tokens: 185 | num, classifier = token 186 | if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers: 187 | spelledout = hangul_number(num, sino=False) 188 | else: 189 | spelledout = hangul_number(num, sino=True) 190 | text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}') 191 | # digit by digit for remaining digits 192 | digits = '0123456789' 193 | names = '영일이삼사오육칠팔구' 194 | for d, n in zip(digits, names): 195 | text = text.replace(d, n) 196 | return text 197 | 198 | 199 | def korean_to_lazy_ipa(text): 200 | text = latin_to_hangul(text) 201 | text = number_to_hangul(text) 202 | text=re.sub('[\uac00-\ud7af]+',lambda x:ko_pron.romanise(x.group(0),'ipa').split('] ~ [')[0],text) 203 | for regex, replacement in _ipa_to_lazy_ipa: 204 | text = re.sub(regex, replacement, text) 205 | return text 206 | 207 | 208 | def korean_to_ipa(text): 209 | text = korean_to_lazy_ipa(text) 210 | return text.replace('ʧ','tʃ').replace('ʥ','dʑ') 211 | -------------------------------------------------------------------------------- /text/mandarin.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import re 4 | from pypinyin import lazy_pinyin, BOPOMOFO 5 | import jieba 6 | import cn2an 7 | import logging 8 | 9 | logging.getLogger('jieba').setLevel(logging.WARNING) 10 | jieba.set_dictionary(os.path.dirname(sys.argv[0])+'/jieba/dict.txt') 11 | jieba.initialize() 12 | 13 | 14 | # List of (Latin alphabet, bopomofo) pairs: 15 | _latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 16 | ('a', 'ㄟˉ'), 17 | ('b', 'ㄅㄧˋ'), 18 | ('c', 'ㄙㄧˉ'), 19 | ('d', 'ㄉㄧˋ'), 20 | ('e', 'ㄧˋ'), 21 | ('f', 'ㄝˊㄈㄨˋ'), 22 | ('g', 'ㄐㄧˋ'), 23 | ('h', 'ㄝˇㄑㄩˋ'), 24 | ('i', 'ㄞˋ'), 25 | ('j', 'ㄐㄟˋ'), 26 | ('k', 'ㄎㄟˋ'), 27 | ('l', 'ㄝˊㄛˋ'), 28 | ('m', 'ㄝˊㄇㄨˋ'), 29 | ('n', 'ㄣˉ'), 30 | ('o', 'ㄡˉ'), 31 | ('p', 'ㄆㄧˉ'), 32 | ('q', 'ㄎㄧㄡˉ'), 33 | ('r', 'ㄚˋ'), 34 | ('s', 'ㄝˊㄙˋ'), 35 | ('t', 'ㄊㄧˋ'), 36 | ('u', 'ㄧㄡˉ'), 37 | ('v', 'ㄨㄧˉ'), 38 | ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'), 39 | ('x', 'ㄝˉㄎㄨˋㄙˋ'), 40 | ('y', 'ㄨㄞˋ'), 41 | ('z', 'ㄗㄟˋ') 42 | ]] 43 | 44 | # List of (bopomofo, romaji) pairs: 45 | _bopomofo_to_romaji = [(re.compile('%s' % x[0]), x[1]) for x in [ 46 | ('ㄅㄛ', 'p⁼wo'), 47 | ('ㄆㄛ', 'pʰwo'), 48 | ('ㄇㄛ', 'mwo'), 49 | ('ㄈㄛ', 'fwo'), 50 | ('ㄅ', 'p⁼'), 51 | ('ㄆ', 'pʰ'), 52 | ('ㄇ', 'm'), 53 | ('ㄈ', 'f'), 54 | ('ㄉ', 't⁼'), 55 | ('ㄊ', 'tʰ'), 56 | ('ㄋ', 'n'), 57 | ('ㄌ', 'l'), 58 | ('ㄍ', 'k⁼'), 59 | ('ㄎ', 'kʰ'), 60 | ('ㄏ', 'h'), 61 | ('ㄐ', 'ʧ⁼'), 62 | ('ㄑ', 'ʧʰ'), 63 | ('ㄒ', 'ʃ'), 64 | ('ㄓ', 'ʦ`⁼'), 65 | ('ㄔ', 'ʦ`ʰ'), 66 | ('ㄕ', 's`'), 67 | ('ㄖ', 'ɹ`'), 68 | ('ㄗ', 'ʦ⁼'), 69 | ('ㄘ', 'ʦʰ'), 70 | ('ㄙ', 's'), 71 | ('ㄚ', 'a'), 72 | ('ㄛ', 'o'), 73 | ('ㄜ', 'ə'), 74 | ('ㄝ', 'e'), 75 | ('ㄞ', 'ai'), 76 | ('ㄟ', 'ei'), 77 | ('ㄠ', 'au'), 78 | ('ㄡ', 'ou'), 79 | ('ㄧㄢ', 'yeNN'), 80 | ('ㄢ', 'aNN'), 81 | ('ㄧㄣ', 'iNN'), 82 | ('ㄣ', 'əNN'), 83 | ('ㄤ', 'aNg'), 84 | ('ㄧㄥ', 'iNg'), 85 | ('ㄨㄥ', 'uNg'), 86 | ('ㄩㄥ', 'yuNg'), 87 | ('ㄥ', 'əNg'), 88 | ('ㄦ', 'əɻ'), 89 | ('ㄧ', 'i'), 90 | ('ㄨ', 'u'), 91 | ('ㄩ', 'ɥ'), 92 | ('ˉ', '→'), 93 | ('ˊ', '↑'), 94 | ('ˇ', '↓↑'), 95 | ('ˋ', '↓'), 96 | ('˙', ''), 97 | (',', ','), 98 | ('。', '.'), 99 | ('!', '!'), 100 | ('?', '?'), 101 | ('—', '-') 102 | ]] 103 | 104 | # List of (romaji, ipa) pairs: 105 | _romaji_to_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 106 | ('ʃy', 'ʃ'), 107 | ('ʧʰy', 'ʧʰ'), 108 | ('ʧ⁼y', 'ʧ⁼'), 109 | ('NN', 'n'), 110 | ('Ng', 'ŋ'), 111 | ('y', 'j'), 112 | ('h', 'x') 113 | ]] 114 | 115 | # List of (bopomofo, ipa) pairs: 116 | _bopomofo_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 117 | ('ㄅㄛ', 'p⁼wo'), 118 | ('ㄆㄛ', 'pʰwo'), 119 | ('ㄇㄛ', 'mwo'), 120 | ('ㄈㄛ', 'fwo'), 121 | ('ㄅ', 'p⁼'), 122 | ('ㄆ', 'pʰ'), 123 | ('ㄇ', 'm'), 124 | ('ㄈ', 'f'), 125 | ('ㄉ', 't⁼'), 126 | ('ㄊ', 'tʰ'), 127 | ('ㄋ', 'n'), 128 | ('ㄌ', 'l'), 129 | ('ㄍ', 'k⁼'), 130 | ('ㄎ', 'kʰ'), 131 | ('ㄏ', 'x'), 132 | ('ㄐ', 'tʃ⁼'), 133 | ('ㄑ', 'tʃʰ'), 134 | ('ㄒ', 'ʃ'), 135 | ('ㄓ', 'ts`⁼'), 136 | ('ㄔ', 'ts`ʰ'), 137 | ('ㄕ', 's`'), 138 | ('ㄖ', 'ɹ`'), 139 | ('ㄗ', 'ts⁼'), 140 | ('ㄘ', 'tsʰ'), 141 | ('ㄙ', 's'), 142 | ('ㄚ', 'a'), 143 | ('ㄛ', 'o'), 144 | ('ㄜ', 'ə'), 145 | ('ㄝ', 'ɛ'), 146 | ('ㄞ', 'aɪ'), 147 | ('ㄟ', 'eɪ'), 148 | ('ㄠ', 'ɑʊ'), 149 | ('ㄡ', 'oʊ'), 150 | ('ㄧㄢ', 'jɛn'), 151 | ('ㄩㄢ', 'ɥæn'), 152 | ('ㄢ', 'an'), 153 | ('ㄧㄣ', 'in'), 154 | ('ㄩㄣ', 'ɥn'), 155 | ('ㄣ', 'ən'), 156 | ('ㄤ', 'ɑŋ'), 157 | ('ㄧㄥ', 'iŋ'), 158 | ('ㄨㄥ', 'ʊŋ'), 159 | ('ㄩㄥ', 'jʊŋ'), 160 | ('ㄥ', 'əŋ'), 161 | ('ㄦ', 'əɻ'), 162 | ('ㄧ', 'i'), 163 | ('ㄨ', 'u'), 164 | ('ㄩ', 'ɥ'), 165 | ('ˉ', '→'), 166 | ('ˊ', '↑'), 167 | ('ˇ', '↓↑'), 168 | ('ˋ', '↓'), 169 | ('˙', ''), 170 | (',', ','), 171 | ('。', '.'), 172 | ('!', '!'), 173 | ('?', '?'), 174 | ('—', '-') 175 | ]] 176 | 177 | # List of (bopomofo, ipa2) pairs: 178 | _bopomofo_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 179 | ('ㄅㄛ', 'pwo'), 180 | ('ㄆㄛ', 'pʰwo'), 181 | ('ㄇㄛ', 'mwo'), 182 | ('ㄈㄛ', 'fwo'), 183 | ('ㄅ', 'p'), 184 | ('ㄆ', 'pʰ'), 185 | ('ㄇ', 'm'), 186 | ('ㄈ', 'f'), 187 | ('ㄉ', 't'), 188 | ('ㄊ', 'tʰ'), 189 | ('ㄋ', 'n'), 190 | ('ㄌ', 'l'), 191 | ('ㄍ', 'k'), 192 | ('ㄎ', 'kʰ'), 193 | ('ㄏ', 'h'), 194 | ('ㄐ', 'tɕ'), 195 | ('ㄑ', 'tɕʰ'), 196 | ('ㄒ', 'ɕ'), 197 | ('ㄓ', 'tʂ'), 198 | ('ㄔ', 'tʂʰ'), 199 | ('ㄕ', 'ʂ'), 200 | ('ㄖ', 'ɻ'), 201 | ('ㄗ', 'ts'), 202 | ('ㄘ', 'tsʰ'), 203 | ('ㄙ', 's'), 204 | ('ㄚ', 'a'), 205 | ('ㄛ', 'o'), 206 | ('ㄜ', 'ɤ'), 207 | ('ㄝ', 'ɛ'), 208 | ('ㄞ', 'aɪ'), 209 | ('ㄟ', 'eɪ'), 210 | ('ㄠ', 'ɑʊ'), 211 | ('ㄡ', 'oʊ'), 212 | ('ㄧㄢ', 'jɛn'), 213 | ('ㄩㄢ', 'yæn'), 214 | ('ㄢ', 'an'), 215 | ('ㄧㄣ', 'in'), 216 | ('ㄩㄣ', 'yn'), 217 | ('ㄣ', 'ən'), 218 | ('ㄤ', 'ɑŋ'), 219 | ('ㄧㄥ', 'iŋ'), 220 | ('ㄨㄥ', 'ʊŋ'), 221 | ('ㄩㄥ', 'jʊŋ'), 222 | ('ㄥ', 'ɤŋ'), 223 | ('ㄦ', 'əɻ'), 224 | ('ㄧ', 'i'), 225 | ('ㄨ', 'u'), 226 | ('ㄩ', 'y'), 227 | ('ˉ', '˥'), 228 | ('ˊ', '˧˥'), 229 | ('ˇ', '˨˩˦'), 230 | ('ˋ', '˥˩'), 231 | ('˙', ''), 232 | (',', ','), 233 | ('。', '.'), 234 | ('!', '!'), 235 | ('?', '?'), 236 | ('—', '-') 237 | ]] 238 | 239 | 240 | def number_to_chinese(text): 241 | numbers = re.findall(r'\d+(?:\.?\d+)?', text) 242 | for number in numbers: 243 | text = text.replace(number, cn2an.an2cn(number), 1) 244 | return text 245 | 246 | 247 | def chinese_to_bopomofo(text): 248 | text = text.replace('、', ',').replace(';', ',').replace(':', ',') 249 | words = jieba.lcut(text, cut_all=False) 250 | text = '' 251 | for word in words: 252 | bopomofos = lazy_pinyin(word, BOPOMOFO) 253 | if not re.search('[\u4e00-\u9fff]', word): 254 | text += word 255 | continue 256 | for i in range(len(bopomofos)): 257 | bopomofos[i] = re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i]) 258 | if text != '': 259 | text += ' ' 260 | text += ''.join(bopomofos) 261 | return text 262 | 263 | 264 | def latin_to_bopomofo(text): 265 | for regex, replacement in _latin_to_bopomofo: 266 | text = re.sub(regex, replacement, text) 267 | return text 268 | 269 | 270 | def bopomofo_to_romaji(text): 271 | for regex, replacement in _bopomofo_to_romaji: 272 | text = re.sub(regex, replacement, text) 273 | return text 274 | 275 | 276 | def bopomofo_to_ipa(text): 277 | for regex, replacement in _bopomofo_to_ipa: 278 | text = re.sub(regex, replacement, text) 279 | return text 280 | 281 | 282 | def bopomofo_to_ipa2(text): 283 | for regex, replacement in _bopomofo_to_ipa2: 284 | text = re.sub(regex, replacement, text) 285 | return text 286 | 287 | 288 | def chinese_to_romaji(text): 289 | text = number_to_chinese(text) 290 | text = chinese_to_bopomofo(text) 291 | text = latin_to_bopomofo(text) 292 | text = bopomofo_to_romaji(text) 293 | text = re.sub('i([aoe])', r'y\1', text) 294 | text = re.sub('u([aoəe])', r'w\1', text) 295 | text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', 296 | r'\1ɹ`\2', text).replace('ɻ', 'ɹ`') 297 | text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text) 298 | return text 299 | 300 | 301 | def chinese_to_lazy_ipa(text): 302 | text = chinese_to_romaji(text) 303 | for regex, replacement in _romaji_to_ipa: 304 | text = re.sub(regex, replacement, text) 305 | return text 306 | 307 | 308 | def chinese_to_ipa(text): 309 | text = number_to_chinese(text) 310 | text = chinese_to_bopomofo(text) 311 | text = latin_to_bopomofo(text) 312 | text = bopomofo_to_ipa(text) 313 | text = re.sub('i([aoe])', r'j\1', text) 314 | text = re.sub('u([aoəe])', r'w\1', text) 315 | text = re.sub('([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', 316 | r'\1ɹ`\2', text).replace('ɻ', 'ɹ`') 317 | text = re.sub('([s][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text) 318 | return text 319 | 320 | 321 | def chinese_to_ipa2(text): 322 | text = number_to_chinese(text) 323 | text = chinese_to_bopomofo(text) 324 | text = latin_to_bopomofo(text) 325 | text = bopomofo_to_ipa2(text) 326 | text = re.sub(r'i([aoe])', r'j\1', text) 327 | text = re.sub(r'u([aoəe])', r'w\1', text) 328 | text = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2', text) 329 | text = re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', text) 330 | return text 331 | -------------------------------------------------------------------------------- /text/ngu_dialect.py: -------------------------------------------------------------------------------- 1 | import re 2 | import opencc 3 | 4 | 5 | dialects = {'SZ': 'suzhou', 'WX': 'wuxi', 'CZ': 'changzhou', 'HZ': 'hangzhou', 6 | 'SX': 'shaoxing', 'NB': 'ningbo', 'JJ': 'jingjiang', 'YX': 'yixing', 7 | 'JD': 'jiading', 'ZR': 'zhenru', 'PH': 'pinghu', 'TX': 'tongxiang', 8 | 'JS': 'jiashan', 'HN': 'xiashi', 'LP': 'linping', 'XS': 'xiaoshan', 9 | 'FY': 'fuyang', 'RA': 'ruao', 'CX': 'cixi', 'SM': 'sanmen', 10 | 'TT': 'tiantai', 'WZ': 'wenzhou', 'SC': 'suichang', 'YB': 'youbu'} 11 | 12 | converters = {} 13 | 14 | for dialect in dialects.values(): 15 | try: 16 | converters[dialect] = opencc.OpenCC(dialect) 17 | except: 18 | pass 19 | 20 | 21 | def ngu_dialect_to_ipa(text, dialect): 22 | dialect = dialects[dialect] 23 | text = converters[dialect].convert(text).replace('-','').replace('$',' ') 24 | text = re.sub(r'[、;:]', ',', text) 25 | text = re.sub(r'\s*,\s*', ', ', text) 26 | text = re.sub(r'\s*。\s*', '. ', text) 27 | text = re.sub(r'\s*?\s*', '? ', text) 28 | text = re.sub(r'\s*!\s*', '! ', text) 29 | text = re.sub(r'\s*$', '', text) 30 | return text 31 | -------------------------------------------------------------------------------- /text/sanskrit.py: -------------------------------------------------------------------------------- 1 | import re 2 | from indic_transliteration import sanscript 3 | 4 | 5 | # List of (iast, ipa) pairs: 6 | _iast_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 7 | ('a', 'ə'), 8 | ('ā', 'aː'), 9 | ('ī', 'iː'), 10 | ('ū', 'uː'), 11 | ('ṛ', 'ɹ`'), 12 | ('ṝ', 'ɹ`ː'), 13 | ('ḷ', 'l`'), 14 | ('ḹ', 'l`ː'), 15 | ('e', 'eː'), 16 | ('o', 'oː'), 17 | ('k', 'k⁼'), 18 | ('k⁼h', 'kʰ'), 19 | ('g', 'g⁼'), 20 | ('g⁼h', 'gʰ'), 21 | ('ṅ', 'ŋ'), 22 | ('c', 'ʧ⁼'), 23 | ('ʧ⁼h', 'ʧʰ'), 24 | ('j', 'ʥ⁼'), 25 | ('ʥ⁼h', 'ʥʰ'), 26 | ('ñ', 'n^'), 27 | ('ṭ', 't`⁼'), 28 | ('t`⁼h', 't`ʰ'), 29 | ('ḍ', 'd`⁼'), 30 | ('d`⁼h', 'd`ʰ'), 31 | ('ṇ', 'n`'), 32 | ('t', 't⁼'), 33 | ('t⁼h', 'tʰ'), 34 | ('d', 'd⁼'), 35 | ('d⁼h', 'dʰ'), 36 | ('p', 'p⁼'), 37 | ('p⁼h', 'pʰ'), 38 | ('b', 'b⁼'), 39 | ('b⁼h', 'bʰ'), 40 | ('y', 'j'), 41 | ('ś', 'ʃ'), 42 | ('ṣ', 's`'), 43 | ('r', 'ɾ'), 44 | ('l̤', 'l`'), 45 | ('h', 'ɦ'), 46 | ("'", ''), 47 | ('~', '^'), 48 | ('ṃ', '^') 49 | ]] 50 | 51 | 52 | def devanagari_to_ipa(text): 53 | text = text.replace('ॐ', 'ओम्') 54 | text = re.sub(r'\s*।\s*$', '.', text) 55 | text = re.sub(r'\s*।\s*', ', ', text) 56 | text = re.sub(r'\s*॥', '.', text) 57 | text = sanscript.transliterate(text, sanscript.DEVANAGARI, sanscript.IAST) 58 | for regex, replacement in _iast_to_ipa: 59 | text = re.sub(regex, replacement, text) 60 | text = re.sub('(.)[`ː]*ḥ', lambda x: x.group(0) 61 | [:-1]+'h'+x.group(1)+'*', text) 62 | return text 63 | -------------------------------------------------------------------------------- /text/shanghainese.py: -------------------------------------------------------------------------------- 1 | import re 2 | import cn2an 3 | import opencc 4 | 5 | 6 | converter = opencc.OpenCC('zaonhe') 7 | 8 | # List of (Latin alphabet, ipa) pairs: 9 | _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 10 | ('A', 'ᴇ'), 11 | ('B', 'bi'), 12 | ('C', 'si'), 13 | ('D', 'di'), 14 | ('E', 'i'), 15 | ('F', 'ᴇf'), 16 | ('G', 'dʑi'), 17 | ('H', 'ᴇtɕʰ'), 18 | ('I', 'ᴀi'), 19 | ('J', 'dʑᴇ'), 20 | ('K', 'kʰᴇ'), 21 | ('L', 'ᴇl'), 22 | ('M', 'ᴇm'), 23 | ('N', 'ᴇn'), 24 | ('O', 'o'), 25 | ('P', 'pʰi'), 26 | ('Q', 'kʰiu'), 27 | ('R', 'ᴀl'), 28 | ('S', 'ᴇs'), 29 | ('T', 'tʰi'), 30 | ('U', 'ɦiu'), 31 | ('V', 'vi'), 32 | ('W', 'dᴀbɤliu'), 33 | ('X', 'ᴇks'), 34 | ('Y', 'uᴀi'), 35 | ('Z', 'zᴇ') 36 | ]] 37 | 38 | 39 | def _number_to_shanghainese(num): 40 | num = cn2an.an2cn(num).replace('一十','十').replace('二十', '廿').replace('二', '两') 41 | return re.sub(r'((?:^|[^三四五六七八九])十|廿)两', r'\1二', num) 42 | 43 | 44 | def number_to_shanghainese(text): 45 | return re.sub(r'\d+(?:\.?\d+)?', lambda x: _number_to_shanghainese(x.group()), text) 46 | 47 | 48 | def latin_to_ipa(text): 49 | for regex, replacement in _latin_to_ipa: 50 | text = re.sub(regex, replacement, text) 51 | return text 52 | 53 | 54 | def shanghainese_to_ipa(text): 55 | text = number_to_shanghainese(text.upper()) 56 | text = converter.convert(text).replace('-','').replace('$',' ') 57 | text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text) 58 | text = re.sub(r'[、;:]', ',', text) 59 | text = re.sub(r'\s*,\s*', ', ', text) 60 | text = re.sub(r'\s*。\s*', '. ', text) 61 | text = re.sub(r'\s*?\s*', '? ', text) 62 | text = re.sub(r'\s*!\s*', '! ', text) 63 | text = re.sub(r'\s*$', '', text) 64 | return text 65 | -------------------------------------------------------------------------------- /text/thai.py: -------------------------------------------------------------------------------- 1 | import re 2 | from num_thai.thainumbers import NumThai 3 | 4 | 5 | num = NumThai() 6 | 7 | # List of (Latin alphabet, Thai) pairs: 8 | _latin_to_thai = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 9 | ('a', 'เอ'), 10 | ('b','บี'), 11 | ('c','ซี'), 12 | ('d','ดี'), 13 | ('e','อี'), 14 | ('f','เอฟ'), 15 | ('g','จี'), 16 | ('h','เอช'), 17 | ('i','ไอ'), 18 | ('j','เจ'), 19 | ('k','เค'), 20 | ('l','แอล'), 21 | ('m','เอ็ม'), 22 | ('n','เอ็น'), 23 | ('o','โอ'), 24 | ('p','พี'), 25 | ('q','คิว'), 26 | ('r','แอร์'), 27 | ('s','เอส'), 28 | ('t','ที'), 29 | ('u','ยู'), 30 | ('v','วี'), 31 | ('w','ดับเบิลยู'), 32 | ('x','เอ็กซ์'), 33 | ('y','วาย'), 34 | ('z','ซี') 35 | ]] 36 | 37 | 38 | def num_to_thai(text): 39 | return re.sub(r'(?:\d+(?:,?\d+)?)+(?:\.\d+(?:,?\d+)?)?', lambda x: ''.join(num.NumberToTextThai(float(x.group(0).replace(',', '')))), text) 40 | 41 | def latin_to_thai(text): 42 | for regex, replacement in _latin_to_thai: 43 | text = re.sub(regex, replacement, text) 44 | return text 45 | -------------------------------------------------------------------------------- /trans.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import time 3 | import hashlib 4 | import uuid 5 | 6 | youdao_url = 'https://openapi.youdao.com/api' # 有道api地址 7 | 8 | # 需要翻译的文本' 9 | def translate(txt): 10 | translate_text = txt 11 | 12 | # 翻译文本生成sign前进行的处理 13 | input_text = "" 14 | 15 | # 当文本长度小于等于20时,取文本 16 | if (len(translate_text) <= 20): 17 | input_text = translate_text 18 | 19 | # 当文本长度大于20时,进行特殊处理 20 | elif (len(translate_text) > 20): 21 | input_text = translate_text[:10] + str(len(translate_text)) + translate_text[-10:] 22 | 23 | time_curtime = int(time.time()) # 秒级时间戳获取 24 | app_id = '67edf72f6213c548' # 应用id 25 | uu_id = uuid.uuid4() # 随机生成的uuid数,为了每次都生成一个不重复的数。 26 | app_key = "GIUcbzYlyLq1yKD2VVjV24OZXkzDpota" # 应用密钥 27 | 28 | sign = hashlib.sha256( 29 | (app_id + input_text + str(uu_id) + str(time_curtime) + app_key).encode('utf-8')).hexdigest() # sign生成 30 | 31 | data = { 32 | 'q': translate_text, # 翻译文本 33 | 'from': "zh-CHS", # 源语言 34 | 'to': "ja", # 翻译语言 35 | 'appKey': app_id, # 应用id 36 | 'salt': uu_id, # 随机生产的uuid码 37 | 'sign': sign, # 签名 38 | 'signType': "v3", # 签名类型,固定值 39 | 'curtime': time_curtime, # 秒级时间戳 40 | } 41 | 42 | r = requests.get(youdao_url, params=data).json() # 获取返回的json()内容 43 | return r["translation"][0] 44 | #print("翻译后的结果:" + r["translation"][0]) # 获取翻译内容 45 | 46 | if __name__ == '__main__': 47 | s=translate('早上好') 48 | print(s) -------------------------------------------------------------------------------- /transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | import numpy as np 5 | 6 | 7 | DEFAULT_MIN_BIN_WIDTH = 1e-3 8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3 9 | DEFAULT_MIN_DERIVATIVE = 1e-3 10 | 11 | 12 | def piecewise_rational_quadratic_transform(inputs, 13 | unnormalized_widths, 14 | unnormalized_heights, 15 | unnormalized_derivatives, 16 | inverse=False, 17 | tails=None, 18 | tail_bound=1., 19 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 20 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 21 | min_derivative=DEFAULT_MIN_DERIVATIVE): 22 | 23 | if tails is None: 24 | spline_fn = rational_quadratic_spline 25 | spline_kwargs = {} 26 | else: 27 | spline_fn = unconstrained_rational_quadratic_spline 28 | spline_kwargs = { 29 | 'tails': tails, 30 | 'tail_bound': tail_bound 31 | } 32 | 33 | outputs, logabsdet = spline_fn( 34 | inputs=inputs, 35 | unnormalized_widths=unnormalized_widths, 36 | unnormalized_heights=unnormalized_heights, 37 | unnormalized_derivatives=unnormalized_derivatives, 38 | inverse=inverse, 39 | min_bin_width=min_bin_width, 40 | min_bin_height=min_bin_height, 41 | min_derivative=min_derivative, 42 | **spline_kwargs 43 | ) 44 | return outputs, logabsdet 45 | 46 | 47 | def searchsorted(bin_locations, inputs, eps=1e-6): 48 | bin_locations[..., -1] += eps 49 | return torch.sum( 50 | inputs[..., None] >= bin_locations, 51 | dim=-1 52 | ) - 1 53 | 54 | 55 | def unconstrained_rational_quadratic_spline(inputs, 56 | unnormalized_widths, 57 | unnormalized_heights, 58 | unnormalized_derivatives, 59 | inverse=False, 60 | tails='linear', 61 | tail_bound=1., 62 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 63 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 64 | min_derivative=DEFAULT_MIN_DERIVATIVE): 65 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) 66 | outside_interval_mask = ~inside_interval_mask 67 | 68 | outputs = torch.zeros_like(inputs) 69 | logabsdet = torch.zeros_like(inputs) 70 | 71 | if tails == 'linear': 72 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) 73 | constant = np.log(np.exp(1 - min_derivative) - 1) 74 | unnormalized_derivatives[..., 0] = constant 75 | unnormalized_derivatives[..., -1] = constant 76 | 77 | outputs[outside_interval_mask] = inputs[outside_interval_mask] 78 | logabsdet[outside_interval_mask] = 0 79 | else: 80 | raise RuntimeError('{} tails are not implemented.'.format(tails)) 81 | 82 | outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline( 83 | inputs=inputs[inside_interval_mask], 84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :], 85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :], 86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], 87 | inverse=inverse, 88 | left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound, 89 | min_bin_width=min_bin_width, 90 | min_bin_height=min_bin_height, 91 | min_derivative=min_derivative 92 | ) 93 | 94 | return outputs, logabsdet 95 | 96 | def rational_quadratic_spline(inputs, 97 | unnormalized_widths, 98 | unnormalized_heights, 99 | unnormalized_derivatives, 100 | inverse=False, 101 | left=0., right=1., bottom=0., top=1., 102 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 103 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 104 | min_derivative=DEFAULT_MIN_DERIVATIVE): 105 | if torch.min(inputs) < left or torch.max(inputs) > right: 106 | raise ValueError('Input to a transform is not within its domain') 107 | 108 | num_bins = unnormalized_widths.shape[-1] 109 | 110 | if min_bin_width * num_bins > 1.0: 111 | raise ValueError('Minimal bin width too large for the number of bins') 112 | if min_bin_height * num_bins > 1.0: 113 | raise ValueError('Minimal bin height too large for the number of bins') 114 | 115 | widths = F.softmax(unnormalized_widths, dim=-1) 116 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths 117 | cumwidths = torch.cumsum(widths, dim=-1) 118 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0) 119 | cumwidths = (right - left) * cumwidths + left 120 | cumwidths[..., 0] = left 121 | cumwidths[..., -1] = right 122 | widths = cumwidths[..., 1:] - cumwidths[..., :-1] 123 | 124 | derivatives = min_derivative + F.softplus(unnormalized_derivatives) 125 | 126 | heights = F.softmax(unnormalized_heights, dim=-1) 127 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights 128 | cumheights = torch.cumsum(heights, dim=-1) 129 | cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0) 130 | cumheights = (top - bottom) * cumheights + bottom 131 | cumheights[..., 0] = bottom 132 | cumheights[..., -1] = top 133 | heights = cumheights[..., 1:] - cumheights[..., :-1] 134 | 135 | if inverse: 136 | bin_idx = searchsorted(cumheights, inputs)[..., None] 137 | else: 138 | bin_idx = searchsorted(cumwidths, inputs)[..., None] 139 | 140 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] 141 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0] 142 | 143 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] 144 | delta = heights / widths 145 | input_delta = delta.gather(-1, bin_idx)[..., 0] 146 | 147 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] 148 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] 149 | 150 | input_heights = heights.gather(-1, bin_idx)[..., 0] 151 | 152 | if inverse: 153 | a = (((inputs - input_cumheights) * (input_derivatives 154 | + input_derivatives_plus_one 155 | - 2 * input_delta) 156 | + input_heights * (input_delta - input_derivatives))) 157 | b = (input_heights * input_derivatives 158 | - (inputs - input_cumheights) * (input_derivatives 159 | + input_derivatives_plus_one 160 | - 2 * input_delta)) 161 | c = - input_delta * (inputs - input_cumheights) 162 | 163 | discriminant = b.pow(2) - 4 * a * c 164 | assert (discriminant >= 0).all() 165 | 166 | root = (2 * c) / (-b - torch.sqrt(discriminant)) 167 | outputs = root * input_bin_widths + input_cumwidths 168 | 169 | theta_one_minus_theta = root * (1 - root) 170 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 171 | * theta_one_minus_theta) 172 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2) 173 | + 2 * input_delta * theta_one_minus_theta 174 | + input_derivatives * (1 - root).pow(2)) 175 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 176 | 177 | return outputs, -logabsdet 178 | else: 179 | theta = (inputs - input_cumwidths) / input_bin_widths 180 | theta_one_minus_theta = theta * (1 - theta) 181 | 182 | numerator = input_heights * (input_delta * theta.pow(2) 183 | + input_derivatives * theta_one_minus_theta) 184 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 185 | * theta_one_minus_theta) 186 | outputs = input_cumheights + numerator / denominator 187 | 188 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2) 189 | + 2 * input_delta * theta_one_minus_theta 190 | + input_derivatives * (1 - theta).pow(2)) 191 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 192 | 193 | return outputs, logabsdet 194 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from json import loads 3 | from torch import load, FloatTensor 4 | from numpy import float32 5 | import librosa 6 | 7 | 8 | class HParams(): 9 | def __init__(self, **kwargs): 10 | for k, v in kwargs.items(): 11 | if type(v) == dict: 12 | v = HParams(**v) 13 | self[k] = v 14 | 15 | def keys(self): 16 | return self.__dict__.keys() 17 | 18 | def items(self): 19 | return self.__dict__.items() 20 | 21 | def values(self): 22 | return self.__dict__.values() 23 | 24 | def __len__(self): 25 | return len(self.__dict__) 26 | 27 | def __getitem__(self, key): 28 | return getattr(self, key) 29 | 30 | def __setitem__(self, key, value): 31 | return setattr(self, key, value) 32 | 33 | def __contains__(self, key): 34 | return key in self.__dict__ 35 | 36 | def __repr__(self): 37 | return self.__dict__.__repr__() 38 | 39 | 40 | def load_checkpoint(checkpoint_path, model): 41 | checkpoint_dict = load(checkpoint_path, map_location='cpu') 42 | iteration = checkpoint_dict['iteration'] 43 | saved_state_dict = checkpoint_dict['model'] 44 | if hasattr(model, 'module'): 45 | state_dict = model.module.state_dict() 46 | else: 47 | state_dict = model.state_dict() 48 | new_state_dict= {} 49 | for k, v in state_dict.items(): 50 | try: 51 | new_state_dict[k] = saved_state_dict[k] 52 | except: 53 | logging.info("%s is not in the checkpoint" % k) 54 | new_state_dict[k] = v 55 | if hasattr(model, 'module'): 56 | model.module.load_state_dict(new_state_dict) 57 | else: 58 | model.load_state_dict(new_state_dict) 59 | logging.info("Loaded checkpoint '{}' (iteration {})" .format( 60 | checkpoint_path, iteration)) 61 | return 62 | 63 | 64 | def get_hparams_from_file(config_path): 65 | with open(config_path, "r") as f: 66 | data = f.read() 67 | config = loads(data) 68 | 69 | hparams = HParams(**config) 70 | return hparams 71 | 72 | 73 | def load_audio_to_torch(full_path, target_sampling_rate): 74 | audio, sampling_rate = librosa.load(full_path, sr=target_sampling_rate, mono=True) 75 | return FloatTensor(audio.astype(float32)) 76 | -------------------------------------------------------------------------------- /voiceModel/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "segment_size": 8192 4 | }, 5 | "data": { 6 | "text_cleaners":["zh_ja_mixture_cleaners"], 7 | "max_wav_value": 32768.0, 8 | "sampling_rate": 22050, 9 | "filter_length": 1024, 10 | "hop_length": 256, 11 | "win_length": 1024, 12 | "add_blank": true, 13 | "n_speakers": 5 14 | }, 15 | "model": { 16 | "inter_channels": 192, 17 | "hidden_channels": 192, 18 | "filter_channels": 768, 19 | "n_heads": 2, 20 | "n_layers": 6, 21 | "kernel_size": 3, 22 | "p_dropout": 0.1, 23 | "resblock": "1", 24 | "resblock_kernel_sizes": [3,7,11], 25 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 26 | "upsample_rates": [8,8,2,2], 27 | "upsample_initial_channel": 512, 28 | "upsample_kernel_sizes": [16,16,4,4], 29 | "n_layers_q": 3, 30 | "use_spectral_norm": false, 31 | "gin_channels": 256 32 | }, 33 | "speakers": ["\u7dbe\u5730\u5be7\u3005", "\u5728\u539f\u4e03\u6d77", "\u5c0f\u8338", "\u5510\u4e50\u541f"], 34 | "symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u02a6", "\u026f", "\u0279", "\u0259", "\u0265", "\u207c", "\u02b0", "`", "\u2192", "\u2193", "\u2191", " "] 35 | } 36 | --------------------------------------------------------------------------------