├── .gitattributes
├── .gitignore
├── ChatWaifu.py
├── ChatWaifuCN.py
├── ChatWaifuCNVoice.py
├── ChatWaifuJP.py
├── ChatWaifuJPVoice.py
├── ChatWaifuJPVoiceEN.py
├── ChatWaifuJPVoiceJP.py
├── ChatWaifuVoice.py
├── LICENSE
├── README.md
├── attentions.py
├── commons.py
├── eng-README.md
├── hubert_model.py
├── jieba
│   └── dict.txt
├── mel_processing.py
├── models.py
├── modules.py
├── readme
│   ├── 1.png
│   ├── 2.png
│   ├── 3.png
│   ├── 4.png
│   ├── 5.png
│   ├── 6.png
│   ├── 7.png
│   ├── cyberchat.png
│   └── token.png
├── requirements.txt
├── text
│   ├── LICENSE
│   ├── __init__.py
│   ├── cantonese.py
│   ├── cleaners.py
│   ├── english.py
│   ├── japanese.py
│   ├── korean.py
│   ├── mandarin.py
│   ├── ngu_dialect.py
│   ├── sanskrit.py
│   ├── shanghainese.py
│   └── thai.py
├── transforms.py
└── utils.py
/.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below.
53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Ww][Ii][Nn]32/ 27 | [Aa][Rr][Mm]/ 28 | [Aa][Rr][Mm]64/ 29 | bld/ 30 | [Bb]in/ 31 | [Oo]bj/ 32 | [Oo]ut/ 33 | [Ll]og/ 34 | [Ll]ogs/ 35 | 36 | # Visual Studio 2015/2017 cache/options directory 37 | .vs/ 38 | # Uncomment if you have tasks that create the project's static files in wwwroot 39 | #wwwroot/ 40 | 41 | # Visual Studio 2017 auto generated files 42 | Generated\ Files/ 43 | 44 | # MSTest test Results 45 | [Tt]est[Rr]esult*/ 46 | [Bb]uild[Ll]og.* 47 | 48 | # NUnit 49 | *.VisualState.xml 50 | TestResult.xml 51 | nunit-*.xml 52 | 53 | # Build Results of an ATL Project 54 | [Dd]ebugPS/ 55 | [Rr]eleasePS/ 56 | dlldata.c 57 | 58 | # Benchmark Results 59 | BenchmarkDotNet.Artifacts/ 60 | 61 | # .NET Core 62 | project.lock.json 63 | project.fragment.lock.json 64 | artifacts/ 65 | 66 | # ASP.NET Scaffolding 67 | ScaffoldingReadMe.txt 68 | 69 | # StyleCop 70 | StyleCopReport.xml 71 | 72 | # Files built by Visual Studio 73 | *_i.c 74 | *_p.c 75 | *_h.h 76 | *.ilk 77 | *.meta 78 | *.obj 79 | *.iobj 80 | *.pch 81 | *.pdb 82 | *.ipdb 83 | *.pgc 84 | *.pgd 85 | *.rsp 86 | *.sbr 87 | *.tlb 88 | *.tli 89 | *.tlh 90 | *.tmp 91 | *.tmp_proj 92 | *_wpftmp.csproj 93 | *.log 94 | *.vspscc 95 | *.vssscc 96 | .builds 97 | *.pidb 98 | *.svclog 99 | *.scc 100 | 101 | # Chutzpah Test files 102 | _Chutzpah* 103 | 104 | # Visual C++ cache files 105 | ipch/ 106 | *.aps 107 | *.ncb 108 | *.opendb 109 | *.opensdf 110 | *.sdf 111 | *.cachefile 112 | *.VC.db 113 | *.VC.VC.opendb 114 | 115 | # Visual Studio profiler 116 | *.psess 117 | *.vsp 118 | *.vspx 119 | *.sap 120 | 121 | # Visual Studio Trace Files 122 | *.e2e 123 | 124 | # TFS 2012 Local Workspace 125 | $tf/ 126 | 127 | # Guidance Automation Toolkit 128 | *.gpState 129 | 130 | # ReSharper is a .NET coding add-in 131 | _ReSharper*/ 132 | *.[Rr]e[Ss]harper 133 | *.DotSettings.user 134 | 135 | # TeamCity is a build add-in 136 | _TeamCity* 137 | 138 | # DotCover is a Code Coverage Tool 139 | *.dotCover 140 | 141 | # AxoCover is a Code Coverage Tool 142 | .axoCover/* 143 | !.axoCover/settings.json 144 | 145 | # Coverlet is a free, cross platform Code Coverage Tool 146 | coverage*.json 147 | coverage*.xml 148 | coverage*.info 149 | 150 | # Visual Studio code coverage results 151 | *.coverage 152 | *.coveragexml 153 | 154 | # NCrunch 155 | _NCrunch_* 156 | .*crunch*.local.xml 157 | nCrunchTemp_* 158 | 159 | # MightyMoose 160 | *.mm.* 161 | 
AutoTest.Net/ 162 | 163 | # Web workbench (sass) 164 | .sass-cache/ 165 | 166 | # Installshield output folder 167 | [Ee]xpress/ 168 | 169 | # DocProject is a documentation generator add-in 170 | DocProject/buildhelp/ 171 | DocProject/Help/*.HxT 172 | DocProject/Help/*.HxC 173 | DocProject/Help/*.hhc 174 | DocProject/Help/*.hhk 175 | DocProject/Help/*.hhp 176 | DocProject/Help/Html2 177 | DocProject/Help/html 178 | 179 | # Click-Once directory 180 | publish/ 181 | 182 | # Publish Web Output 183 | *.[Pp]ublish.xml 184 | *.azurePubxml 185 | # Note: Comment the next line if you want to checkin your web deploy settings, 186 | # but database connection strings (with potential passwords) will be unencrypted 187 | *.pubxml 188 | *.publishproj 189 | 190 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 191 | # checkin your Azure Web App publish settings, but sensitive information contained 192 | # in these scripts will be unencrypted 193 | PublishScripts/ 194 | 195 | # NuGet Packages 196 | *.nupkg 197 | # NuGet Symbol Packages 198 | *.snupkg 199 | # The packages folder can be ignored because of Package Restore 200 | **/[Pp]ackages/* 201 | # except build/, which is used as an MSBuild target. 202 | !**/[Pp]ackages/build/ 203 | # Uncomment if necessary however generally it will be regenerated when needed 204 | #!**/[Pp]ackages/repositories.config 205 | # NuGet v3's project.json files produces more ignorable files 206 | *.nuget.props 207 | *.nuget.targets 208 | 209 | # Microsoft Azure Build Output 210 | csx/ 211 | *.build.csdef 212 | 213 | # Microsoft Azure Emulator 214 | ecf/ 215 | rcf/ 216 | 217 | # Windows Store app package directories and files 218 | AppPackages/ 219 | BundleArtifacts/ 220 | Package.StoreAssociation.xml 221 | _pkginfo.txt 222 | *.appx 223 | *.appxbundle 224 | *.appxupload 225 | 226 | # Visual Studio cache files 227 | # files ending in .cache can be ignored 228 | *.[Cc]ache 229 | # but keep track of directories ending in .cache 230 | !?*.[Cc]ache/ 231 | 232 | # Others 233 | ClientBin/ 234 | ~$* 235 | *~ 236 | *.dbmdl 237 | *.dbproj.schemaview 238 | *.jfm 239 | *.pfx 240 | *.publishsettings 241 | orleans.codegen.cs 242 | 243 | # Including strong name files can present a security risk 244 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 245 | #*.snk 246 | 247 | # Since there are multiple workflows, uncomment next line to ignore bower_components 248 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 249 | #bower_components/ 250 | 251 | # RIA/Silverlight projects 252 | Generated_Code/ 253 | 254 | # Backup & report files from converting an old project file 255 | # to a newer Visual Studio version. 
Backup files are not needed, 256 | # because we have git ;-) 257 | _UpgradeReport_Files/ 258 | Backup*/ 259 | UpgradeLog*.XML 260 | UpgradeLog*.htm 261 | ServiceFabricBackup/ 262 | *.rptproj.bak 263 | 264 | # SQL Server files 265 | *.mdf 266 | *.ldf 267 | *.ndf 268 | 269 | # Business Intelligence projects 270 | *.rdl.data 271 | *.bim.layout 272 | *.bim_*.settings 273 | *.rptproj.rsuser 274 | *- [Bb]ackup.rdl 275 | *- [Bb]ackup ([0-9]).rdl 276 | *- [Bb]ackup ([0-9][0-9]).rdl 277 | 278 | # Microsoft Fakes 279 | FakesAssemblies/ 280 | 281 | # GhostDoc plugin setting file 282 | *.GhostDoc.xml 283 | 284 | # Node.js Tools for Visual Studio 285 | .ntvs_analysis.dat 286 | node_modules/ 287 | 288 | # Visual Studio 6 build log 289 | *.plg 290 | 291 | # Visual Studio 6 workspace options file 292 | *.opt 293 | 294 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 295 | *.vbw 296 | 297 | # Visual Studio LightSwitch build output 298 | **/*.HTMLClient/GeneratedArtifacts 299 | **/*.DesktopClient/GeneratedArtifacts 300 | **/*.DesktopClient/ModelManifest.xml 301 | **/*.Server/GeneratedArtifacts 302 | **/*.Server/ModelManifest.xml 303 | _Pvt_Extensions 304 | 305 | # Paket dependency manager 306 | .paket/paket.exe 307 | paket-files/ 308 | 309 | # FAKE - F# Make 310 | .fake/ 311 | 312 | # CodeRush personal settings 313 | .cr/personal 314 | 315 | # Python Tools for Visual Studio (PTVS) 316 | __pycache__/ 317 | *.pyc 318 | 319 | # Cake - Uncomment if you are using it 320 | # tools/** 321 | # !tools/packages.config 322 | 323 | # Tabs Studio 324 | *.tss 325 | 326 | # Telerik's JustMock configuration file 327 | *.jmconfig 328 | 329 | # BizTalk build output 330 | *.btp.cs 331 | *.btm.cs 332 | *.odx.cs 333 | *.xsd.cs 334 | 335 | # OpenCover UI analysis results 336 | OpenCover/ 337 | 338 | # Azure Stream Analytics local run output 339 | ASALocalRun/ 340 | 341 | # MSBuild Binary and Structured Log 342 | *.binlog 343 | 344 | # NVidia Nsight GPU debugger configuration file 345 | *.nvuser 346 | 347 | # MFractors (Xamarin productivity tool) working folder 348 | .mfractor/ 349 | 350 | # Local History for Visual Studio 351 | .localhistory/ 352 | 353 | # BeatPulse healthcheck temp database 354 | healthchecksdb 355 | 356 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 357 | MigrationBackup/ 358 | 359 | # Ionide (cross platform F# VS Code tools) working folder 360 | .ionide/ 361 | 362 | # Fody - auto-generated XML schema 363 | FodyWeavers.xsd 364 | 365 | # build 366 | build 367 | monotonic_align/core.c 368 | *.o 369 | *.so 370 | *.dll 371 | 372 | # data 373 | /config.json 374 | /*.pth 375 | *.wav 376 | /monotonic_align/monotonic_align 377 | /resources 378 | /MoeGoe.spec 379 | /dist/MoeGoe 380 | /dist 381 | 382 | # MacOS 383 | .DS_Store 384 | -------------------------------------------------------------------------------- /ChatWaifu.py: -------------------------------------------------------------------------------- 1 | from scipy.io.wavfile import write 2 | from mel_processing import spectrogram_torch 3 | from text import text_to_sequence, _clean_text 4 | from models import SynthesizerTrn 5 | import utils 6 | import commons 7 | import sys 8 | import re 9 | from torch import no_grad, LongTensor 10 | import logging 11 | from winsound import PlaySound 12 | from openai import OpenAI 13 | 14 | chinese_model_path = ".\model\CN\model.pth" 15 | chinese_config_path = ".\model\CN\config.json" 16 | japanese_model_path = ".\model\H_excluded.pth" 17 | japanese_config_path = 
".\model\config.json" 18 | 19 | #################################### 20 | #CHATGPT INITIALIZE 21 | from pyChatGPT import ChatGPT 22 | import json 23 | 24 | modelmessage = """ID Output Language 25 | 0 Chinese 26 | 1 Japanese 27 | """ 28 | 29 | idmessage_cn = """ID Speaker 30 | 0 綾地寧々 31 | 1 在原七海 32 | 2 小茸 33 | 3 唐乐吟 34 | """ 35 | 36 | idmessage_jp = """ID Speaker 37 | 0 綾地寧々 38 | 1 因幡めぐる 39 | 2 朝武芳乃 40 | 3 常陸茉子 41 | 4 ムラサメ 42 | 5 鞍馬小春 43 | 6 在原七海 44 | """ 45 | 46 | def get_input(): 47 | # prompt for input 48 | print("You:") 49 | user_input = input() 50 | return user_input 51 | 52 | def get_input_jp(): 53 | # prompt for input 54 | print("You:") 55 | usr_in = input() 56 | if usr_in == 'quit()': 57 | return usr_in 58 | else: 59 | user_input = usr_in +" 使用日本语" 60 | return user_input 61 | 62 | def get_token(): 63 | token = input("Your API Key: \n") 64 | return token 65 | 66 | 67 | ################################################ 68 | 69 | 70 | logging.getLogger('numba').setLevel(logging.WARNING) 71 | 72 | def ex_print(text, escape=False): 73 | if escape: 74 | print(text.encode('unicode_escape').decode()) 75 | else: 76 | print(text) 77 | 78 | 79 | def get_text(text, hps, cleaned=False): 80 | if cleaned: 81 | text_norm = text_to_sequence(text, hps.symbols, []) 82 | else: 83 | text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners) 84 | if hps.data.add_blank: 85 | text_norm = commons.intersperse(text_norm, 0) 86 | text_norm = LongTensor(text_norm) 87 | return text_norm 88 | 89 | 90 | def ask_if_continue(): 91 | while True: 92 | answer = input('Continue? (y/n): ') 93 | if answer == 'y': 94 | break 95 | elif answer == 'n': 96 | sys.exit(0) 97 | 98 | 99 | def print_speakers(speakers, escape=False): 100 | if len(speakers) > 100: 101 | return 102 | print('ID\tSpeaker') 103 | for id, name in enumerate(speakers): 104 | ex_print(str(id) + '\t' + name, escape) 105 | 106 | 107 | def get_speaker_id(message): 108 | speaker_id = input(message) 109 | if speaker_id == '': 110 | print(str(speaker_id) + ' is not a valid ID!') 111 | sys.exit(1) 112 | else: 113 | try: 114 | speaker_id = int(speaker_id) 115 | except: 116 | print(str(speaker_id) + ' is not a valid ID!') 117 | sys.exit(1) 118 | return speaker_id 119 | 120 | def get_model_id(message): 121 | model_id = input(message) 122 | if model_id == '': 123 | print(str(model_id) + ' is not a valid ID!') 124 | sys.exit(1) 125 | else: 126 | try: 127 | model_id = int(model_id) 128 | except: 129 | print(str(model_id) + ' is not a valid ID!') 130 | sys.exit(1) 131 | return model_id 132 | 133 | def get_label_value(text, label, default, warning_name='value'): 134 | value = re.search(rf'\[{label}=(.+?)\]', text) 135 | if value: 136 | try: 137 | text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1) 138 | value = float(value.group(1)) 139 | except: 140 | print(f'Invalid {warning_name}!') 141 | sys.exit(1) 142 | else: 143 | value = default 144 | return value, text 145 | 146 | 147 | def get_label(text, label): 148 | if f'[{label}]' in text: 149 | return True, text.replace(f'[{label}]', '') 150 | else: 151 | return False, text 152 | 153 | def get_reponse(input): 154 | msg = [ 155 | {"role": "user", "content": input} 156 | ] 157 | 158 | # Call the OpenAI API with the prompt 159 | response = client.chat.completions.create( 160 | model="gpt-3.5-turbo", # Adjust based on available engine versions 161 | messages=msg, 162 | temperature=0 163 | ) 164 | # Extract and return the text from the API response 165 | return response.choices[0].message.content 166 | 167 | 168 | def 
generateSound(inputString, id, model_id): 169 | if '--escape' in sys.argv: 170 | escape = True 171 | else: 172 | escape = False 173 | 174 | #model = input('0: Chinese') 175 | #config = input('Path of a config file: ') 176 | if model_id == 0: 177 | model = chinese_model_path 178 | config = chinese_config_path 179 | elif model_id == 1: 180 | model = japanese_model_path 181 | config = japanese_config_path 182 | 183 | 184 | hps_ms = utils.get_hparams_from_file(config) 185 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0 186 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0 187 | emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False 188 | 189 | net_g_ms = SynthesizerTrn( 190 | n_symbols, 191 | hps_ms.data.filter_length // 2 + 1, 192 | hps_ms.train.segment_size // hps_ms.data.hop_length, 193 | n_speakers=n_speakers, 194 | emotion_embedding=emotion_embedding, 195 | **hps_ms.model) 196 | _ = net_g_ms.eval() 197 | utils.load_checkpoint(model, net_g_ms) 198 | 199 | if n_symbols != 0: 200 | if not emotion_embedding: 201 | #while True: 202 | if(1 == 1): 203 | choice = 't' 204 | if choice == 't': 205 | text = inputString 206 | if text == '[ADVANCED]': 207 | text = "我不会说" 208 | 209 | length_scale, text = get_label_value( 210 | text, 'LENGTH', 1, 'length scale') 211 | noise_scale, text = get_label_value( 212 | text, 'NOISE', 0.667, 'noise scale') 213 | noise_scale_w, text = get_label_value( 214 | text, 'NOISEW', 0.8, 'deviation of noise') 215 | cleaned, text = get_label(text, 'CLEANED') 216 | 217 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 218 | 219 | speaker_id = id 220 | out_path = "output.wav" 221 | 222 | with no_grad(): 223 | x_tst = stn_tst.unsqueeze(0) 224 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 225 | sid = LongTensor([speaker_id]) 226 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 227 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy() 228 | 229 | write(out_path, hps_ms.data.sampling_rate, audio) 230 | if __name__ == "__main__": 231 | # Set OpenAI API key 232 | api_key = get_token() 233 | print() 234 | client = OpenAI(api_key=api_key, timeout=600) 235 | model_id = -1 236 | while True: 237 | print(modelmessage) 238 | model_id = int(get_model_id('选择回复语言: ')) 239 | if model_id == 0 or model_id == 1: 240 | break 241 | else: 242 | print(str(model_id) + ' is not a valid ID!\n') 243 | print() 244 | 245 | speaker_id = -1 246 | while True: 247 | if model_id == 0: 248 | print("\n" + idmessage_cn) 249 | elif model_id == 1: 250 | print("\n" + idmessage_jp) 251 | 252 | speaker_id = get_speaker_id('选择角色: ') 253 | if (model_id == 0 and speaker_id in list(range(4))) or (model_id == 1 and speaker_id in list(range(7))): 254 | break 255 | else: 256 | print(str(speaker_id) + ' is not a valid ID!\n') 257 | print() 258 | 259 | while True: 260 | if model_id == 0: 261 | usr_in = get_input() 262 | 263 | if(usr_in == "quit()"): 264 | break 265 | resp = get_reponse(usr_in) 266 | print("ChatGPT:") 267 | answer = resp.replace('\n','') 268 | generateSound("[ZH]"+answer+"[ZH]", speaker_id, model_id) 269 | print(answer) 270 | PlaySound(r'./output.wav', flags=1) 271 | elif model_id == 1: 272 | usr_in = get_input_jp() 273 | if(usr_in == "quit()"): 274 | break 275 | resp = get_reponse(usr_in) 276 | print("ChatGPT:") 277 | answer = resp.replace('\n','') 278 | generateSound(answer, speaker_id, model_id) 279 | print(answer) 280 | 
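# Play the synthesized reply without blocking; flags=1 corresponds to winsound.SND_ASYNC.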
PlaySound(r'./output.wav', flags=1) -------------------------------------------------------------------------------- /ChatWaifuCN.py: -------------------------------------------------------------------------------- 1 | from scipy.io.wavfile import write 2 | from mel_processing import spectrogram_torch 3 | from text import text_to_sequence, _clean_text 4 | from models import SynthesizerTrn 5 | import utils 6 | import commons 7 | import sys 8 | import re 9 | from torch import no_grad, LongTensor 10 | import logging 11 | from winsound import PlaySound 12 | 13 | #################################### 14 | #CHATGPT INITIALIZE 15 | from pyChatGPT import ChatGPT 16 | import json 17 | idmessage = """ID Speaker 18 | 0 綾地寧々 19 | 1 在原七海 20 | 2 小茸 21 | 3 唐乐吟 22 | """ 23 | speakerID = 0 24 | 25 | def get_input(): 26 | # prompt for input 27 | print("You:") 28 | user_input = input() 29 | return user_input 30 | 31 | def get_token(): 32 | token = input("Copy your token from ChatGPT and press Enter \n") 33 | return token; 34 | 35 | 36 | ################################################ 37 | 38 | 39 | logging.getLogger('numba').setLevel(logging.WARNING) 40 | 41 | 42 | def ex_print(text, escape=False): 43 | if escape: 44 | print(text.encode('unicode_escape').decode()) 45 | else: 46 | print(text) 47 | 48 | 49 | def get_text(text, hps, cleaned=False): 50 | if cleaned: 51 | text_norm = text_to_sequence(text, hps.symbols, []) 52 | else: 53 | text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners) 54 | if hps.data.add_blank: 55 | text_norm = commons.intersperse(text_norm, 0) 56 | text_norm = LongTensor(text_norm) 57 | return text_norm 58 | 59 | 60 | def ask_if_continue(): 61 | while True: 62 | answer = input('Continue? (y/n): ') 63 | if answer == 'y': 64 | break 65 | elif answer == 'n': 66 | sys.exit(0) 67 | 68 | 69 | def print_speakers(speakers, escape=False): 70 | if len(speakers) > 100: 71 | return 72 | print('ID\tSpeaker') 73 | for id, name in enumerate(speakers): 74 | ex_print(str(id) + '\t' + name, escape) 75 | 76 | 77 | def get_speaker_id(message): 78 | speaker_id = input(message) 79 | try: 80 | speaker_id = int(speaker_id) 81 | except: 82 | print(str(speaker_id) + ' is not a valid ID!') 83 | sys.exit(1) 84 | return speaker_id 85 | 86 | 87 | def get_label_value(text, label, default, warning_name='value'): 88 | value = re.search(rf'\[{label}=(.+?)\]', text) 89 | if value: 90 | try: 91 | text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1) 92 | value = float(value.group(1)) 93 | except: 94 | print(f'Invalid {warning_name}!') 95 | sys.exit(1) 96 | else: 97 | value = default 98 | return value, text 99 | 100 | 101 | def get_label(text, label): 102 | if f'[{label}]' in text: 103 | return True, text.replace(f'[{label}]', '') 104 | else: 105 | return False, text 106 | 107 | 108 | 109 | def generateSound(inputString): 110 | if '--escape' in sys.argv: 111 | escape = True 112 | else: 113 | escape = False 114 | 115 | #model = input('Path of a VITS model: ') 116 | #config = input('Path of a config file: ') 117 | model = r".\model\CN\model.pth" 118 | config = r".\model\CN\config.json" 119 | 120 | 121 | hps_ms = utils.get_hparams_from_file(config) 122 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0 123 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0 124 | speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0'] 125 | use_f0 = hps_ms.data.use_f0 if 'use_f0' in hps_ms.data.keys() else False 126 | emotion_embedding = hps_ms.data.emotion_embedding 
if 'emotion_embedding' in hps_ms.data.keys() else False 127 | 128 | net_g_ms = SynthesizerTrn( 129 | n_symbols, 130 | hps_ms.data.filter_length // 2 + 1, 131 | hps_ms.train.segment_size // hps_ms.data.hop_length, 132 | n_speakers=n_speakers, 133 | emotion_embedding=emotion_embedding, 134 | **hps_ms.model) 135 | _ = net_g_ms.eval() 136 | utils.load_checkpoint(model, net_g_ms) 137 | 138 | def voice_conversion(): 139 | audio_path = input('Path of an audio file to convert:\n') 140 | print_speakers(speakers) 141 | audio = utils.load_audio_to_torch( 142 | audio_path, hps_ms.data.sampling_rate) 143 | 144 | originnal_id = get_speaker_id('Original speaker ID: ') 145 | target_id = get_speaker_id('Target speaker ID: ') 146 | out_path = input('Path to save: ') 147 | 148 | y = audio.unsqueeze(0) 149 | 150 | spec = spectrogram_torch(y, hps_ms.data.filter_length, 151 | hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length, 152 | center=False) 153 | spec_lengths = LongTensor([spec.size(-1)]) 154 | sid_src = LongTensor([originnal_id]) 155 | 156 | with no_grad(): 157 | sid_tgt = LongTensor([target_id]) 158 | audio = net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[ 159 | 0][0, 0].data.cpu().float().numpy() 160 | return audio, out_path 161 | 162 | if n_symbols != 0: 163 | if not emotion_embedding: 164 | #while True: 165 | if(1==1): 166 | #choice = input('TTS or VC? (t/v):') 167 | choice = 't' 168 | if choice == 't': 169 | #text = input('Text to read: ') 170 | text = inputString 171 | if text == '[ADVANCED]': 172 | #text = input('Raw text:') 173 | text = "我不会说" 174 | #print('Cleaned text is:') 175 | #ex_print(_clean_text( 176 | # text, hps_ms.data.text_cleaners), escape) 177 | #continue 178 | 179 | length_scale, text = get_label_value( 180 | text, 'LENGTH', 1, 'length scale') 181 | noise_scale, text = get_label_value( 182 | text, 'NOISE', 0.667, 'noise scale') 183 | noise_scale_w, text = get_label_value( 184 | text, 'NOISEW', 0.8, 'deviation of noise') 185 | cleaned, text = get_label(text, 'CLEANED') 186 | 187 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 188 | 189 | #print_speakers(speakers, escape) 190 | #speaker_id = get_speaker_id('Speaker ID: ') 191 | speaker_id = speakerID 192 | #out_path = input('Path to save: ') 193 | out_path = "output.wav" 194 | 195 | with no_grad(): 196 | x_tst = stn_tst.unsqueeze(0) 197 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 198 | sid = LongTensor([speaker_id]) 199 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 200 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy() 201 | 202 | elif choice == 'v': 203 | audio, out_path = voice_conversion() 204 | 205 | write(out_path, hps_ms.data.sampling_rate, audio) 206 | print('Successfully saved!') 207 | #ask_if_continue() 208 | else: 209 | import os 210 | import librosa 211 | import numpy as np 212 | from torch import FloatTensor 213 | import audonnx 214 | w2v2_folder = input('Path of a w2v2 dimensional emotion model: ') 215 | w2v2_model = audonnx.load(os.path.dirname(w2v2_folder)) 216 | #while True: 217 | if(1==1): 218 | #choice = input('TTS or VC? 
(t/v):') 219 | choice = 't' 220 | if choice == 't': 221 | #text = input('Text to read: ') 222 | text = inputString 223 | if text == '[ADVANCED]': 224 | #text = input('Raw text:') 225 | text = "我不会说" 226 | #print('Cleaned text is:') 227 | #ex_print(_clean_text( 228 | # text, hps_ms.data.text_cleaners), escape) 229 | #continue 230 | 231 | length_scale, text = get_label_value( 232 | text, 'LENGTH', 1, 'length scale') 233 | noise_scale, text = get_label_value( 234 | text, 'NOISE', 0.667, 'noise scale') 235 | noise_scale_w, text = get_label_value( 236 | text, 'NOISEW', 0.8, 'deviation of noise') 237 | cleaned, text = get_label(text, 'CLEANED') 238 | 239 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 240 | 241 | #print_speakers(speakers, escape) 242 | #speaker_id = get_speaker_id('Speaker ID: ') 243 | speaker_id = speakerID 244 | 245 | emotion_reference = input('Path of an emotion reference: ') 246 | if emotion_reference.endswith('.npy'): 247 | emotion = np.load(emotion_reference) 248 | emotion = FloatTensor(emotion).unsqueeze(0) 249 | else: 250 | audio16000, sampling_rate = librosa.load( 251 | emotion_reference, sr=16000, mono=True) 252 | emotion = w2v2_model(audio16000, sampling_rate)[ 253 | 'hidden_states'] 254 | emotion_reference = re.sub( 255 | r'\..*$', '', emotion_reference) 256 | np.save(emotion_reference, emotion.squeeze(0)) 257 | emotion = FloatTensor(emotion) 258 | 259 | #out_path = input('Path to save: ') 260 | out_path = "output.wav" 261 | 262 | with no_grad(): 263 | x_tst = stn_tst.unsqueeze(0) 264 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 265 | sid = LongTensor([speaker_id]) 266 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, 267 | length_scale=length_scale, emotion_embedding=emotion)[0][0, 0].data.cpu().float().numpy() 268 | 269 | elif choice == 'v': 270 | audio, out_path = voice_conversion() 271 | 272 | write(out_path, hps_ms.data.sampling_rate, audio) 273 | print('Successfully saved!') 274 | #ask_if_continue() 275 | else: 276 | model = input('Path of a hubert-soft model: ') 277 | from hubert_model import hubert_soft 278 | hubert = hubert_soft(model) 279 | 280 | while True: 281 | audio_path = input('Path of an audio file to convert:\n') 282 | 283 | if audio_path != '[VC]': 284 | import librosa 285 | if use_f0: 286 | audio, sampling_rate = librosa.load( 287 | audio_path, sr=hps_ms.data.sampling_rate, mono=True) 288 | audio16000 = librosa.resample( 289 | audio, orig_sr=sampling_rate, target_sr=16000) 290 | else: 291 | audio16000, sampling_rate = librosa.load( 292 | audio_path, sr=16000, mono=True) 293 | 294 | #print_speakers(speakers, escape) 295 | target_id = get_speaker_id('Target speaker ID: ') 296 | out_path = input('Path to save: ') 297 | length_scale, out_path = get_label_value( 298 | out_path, 'LENGTH', 1, 'length scale') 299 | noise_scale, out_path = get_label_value( 300 | out_path, 'NOISE', 0.1, 'noise scale') 301 | noise_scale_w, out_path = get_label_value( 302 | out_path, 'NOISEW', 0.1, 'deviation of noise') 303 | 304 | from torch import inference_mode, FloatTensor 305 | import numpy as np 306 | with inference_mode(): 307 | units = hubert.units(FloatTensor(audio16000).unsqueeze( 308 | 0).unsqueeze(0)).squeeze(0).numpy() 309 | if use_f0: 310 | f0_scale, out_path = get_label_value( 311 | out_path, 'F0', 1, 'f0 scale') 312 | f0 = librosa.pyin(audio, sr=sampling_rate, 313 | fmin=librosa.note_to_hz('C0'), 314 | fmax=librosa.note_to_hz('C7'), 315 | frame_length=1780)[0] 316 | target_length = len(units[:, 
0]) 317 | f0 = np.nan_to_num(np.interp(np.arange(0, len(f0)*target_length, len(f0))/target_length, 318 | np.arange(0, len(f0)), f0)) * f0_scale 319 | units[:, 0] = f0 / 10 320 | 321 | stn_tst = FloatTensor(units) 322 | with no_grad(): 323 | x_tst = stn_tst.unsqueeze(0) 324 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 325 | sid = LongTensor([target_id]) 326 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 327 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy() 328 | 329 | else: 330 | audio, out_path = voice_conversion() 331 | 332 | write(out_path, hps_ms.data.sampling_rate, audio) 333 | print('Successfully saved!') 334 | #ask_if_continue() 335 | 336 | if __name__ == "__main__": 337 | session_token = get_token() 338 | api = ChatGPT(session_token) 339 | print(idmessage) 340 | peaker_id = input() 341 | while True: 342 | resp = api.send_message(get_input()) 343 | answer = resp["message"].replace('\n','') 344 | print("ChatGPT:") 345 | print(answer) 346 | generateSound("[ZH]"+answer+"[ZH]") 347 | PlaySound(r'.\output.wav', flags=1) 348 | 349 | -------------------------------------------------------------------------------- /ChatWaifuJP.py: -------------------------------------------------------------------------------- 1 | from scipy.io.wavfile import write 2 | from mel_processing import spectrogram_torch 3 | from text import text_to_sequence, _clean_text 4 | from models import SynthesizerTrn 5 | import utils 6 | import commons 7 | import sys 8 | import re 9 | from torch import no_grad, LongTensor 10 | import logging 11 | from winsound import PlaySound 12 | 13 | #################################### 14 | #CHATGPT INITIALIZE 15 | from pyChatGPT import ChatGPT 16 | import json 17 | idmessage = """ID Speaker 18 | 0 綾地寧々 19 | 1 因幡めぐる 20 | 2 朝武芳乃 21 | 3 常陸茉子 22 | 4 ムラサメ 23 | 5 鞍馬小春 24 | 6 在原七海 25 | """ 26 | speakerID = 0 27 | 28 | def get_input(): 29 | # prompt for input 30 | print("You:") 31 | user_input = input() +" 使用日本语" 32 | return user_input 33 | 34 | def get_token(): 35 | token = input("Copy your token from ChatGPT and press Enter \n") 36 | return token; 37 | 38 | 39 | ################################################ 40 | 41 | 42 | logging.getLogger('numba').setLevel(logging.WARNING) 43 | 44 | 45 | def ex_print(text, escape=False): 46 | if escape: 47 | print(text.encode('unicode_escape').decode()) 48 | else: 49 | print(text) 50 | 51 | 52 | def get_text(text, hps, cleaned=False): 53 | if cleaned: 54 | text_norm = text_to_sequence(text, hps.symbols, []) 55 | else: 56 | text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners) 57 | if hps.data.add_blank: 58 | text_norm = commons.intersperse(text_norm, 0) 59 | text_norm = LongTensor(text_norm) 60 | return text_norm 61 | 62 | 63 | def ask_if_continue(): 64 | while True: 65 | answer = input('Continue? 
(y/n): ') 66 | if answer == 'y': 67 | break 68 | elif answer == 'n': 69 | sys.exit(0) 70 | 71 | 72 | def print_speakers(speakers, escape=False): 73 | if len(speakers) > 100: 74 | return 75 | print('ID\tSpeaker') 76 | for id, name in enumerate(speakers): 77 | ex_print(str(id) + '\t' + name, escape) 78 | 79 | 80 | def get_speaker_id(message): 81 | speaker_id = input(message) 82 | try: 83 | speaker_id = int(speaker_id) 84 | except: 85 | print(str(speaker_id) + ' is not a valid ID!') 86 | sys.exit(1) 87 | return speaker_id 88 | 89 | 90 | def get_label_value(text, label, default, warning_name='value'): 91 | value = re.search(rf'\[{label}=(.+?)\]', text) 92 | if value: 93 | try: 94 | text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1) 95 | value = float(value.group(1)) 96 | except: 97 | print(f'Invalid {warning_name}!') 98 | sys.exit(1) 99 | else: 100 | value = default 101 | return value, text 102 | 103 | 104 | def get_label(text, label): 105 | if f'[{label}]' in text: 106 | return True, text.replace(f'[{label}]', '') 107 | else: 108 | return False, text 109 | 110 | 111 | 112 | def generateSound(inputString): 113 | if '--escape' in sys.argv: 114 | escape = True 115 | else: 116 | escape = False 117 | 118 | #model = input('Path of a VITS model: ') 119 | #config = input('Path of a config file: ') 120 | model = r".\model\H_excluded.pth" 121 | config = r".\model\config.json" 122 | 123 | 124 | hps_ms = utils.get_hparams_from_file(config) 125 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0 126 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0 127 | speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0'] 128 | use_f0 = hps_ms.data.use_f0 if 'use_f0' in hps_ms.data.keys() else False 129 | emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False 130 | 131 | net_g_ms = SynthesizerTrn( 132 | n_symbols, 133 | hps_ms.data.filter_length // 2 + 1, 134 | hps_ms.train.segment_size // hps_ms.data.hop_length, 135 | n_speakers=n_speakers, 136 | emotion_embedding=emotion_embedding, 137 | **hps_ms.model) 138 | _ = net_g_ms.eval() 139 | utils.load_checkpoint(model, net_g_ms) 140 | 141 | def voice_conversion(): 142 | audio_path = input('Path of an audio file to convert:\n') 143 | print_speakers(speakers) 144 | audio = utils.load_audio_to_torch( 145 | audio_path, hps_ms.data.sampling_rate) 146 | 147 | originnal_id = get_speaker_id('Original speaker ID: ') 148 | target_id = get_speaker_id('Target speaker ID: ') 149 | out_path = input('Path to save: ') 150 | 151 | y = audio.unsqueeze(0) 152 | 153 | spec = spectrogram_torch(y, hps_ms.data.filter_length, 154 | hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length, 155 | center=False) 156 | spec_lengths = LongTensor([spec.size(-1)]) 157 | sid_src = LongTensor([originnal_id]) 158 | 159 | with no_grad(): 160 | sid_tgt = LongTensor([target_id]) 161 | audio = net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[ 162 | 0][0, 0].data.cpu().float().numpy() 163 | return audio, out_path 164 | 165 | if n_symbols != 0: 166 | if not emotion_embedding: 167 | #while True: 168 | if(1==1): 169 | #choice = input('TTS or VC? 
(t/v):') 170 | choice = 't' 171 | if choice == 't': 172 | #text = input('Text to read: ') 173 | text = inputString 174 | if text == '[ADVANCED]': 175 | #text = input('Raw text:') 176 | text = "我不会说" 177 | #print('Cleaned text is:') 178 | #ex_print(_clean_text( 179 | # text, hps_ms.data.text_cleaners), escape) 180 | #continue 181 | 182 | length_scale, text = get_label_value( 183 | text, 'LENGTH', 1, 'length scale') 184 | noise_scale, text = get_label_value( 185 | text, 'NOISE', 0.667, 'noise scale') 186 | noise_scale_w, text = get_label_value( 187 | text, 'NOISEW', 0.8, 'deviation of noise') 188 | cleaned, text = get_label(text, 'CLEANED') 189 | 190 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 191 | 192 | #print_speakers(speakers, escape) 193 | #speaker_id = get_speaker_id('Speaker ID: ') 194 | speaker_id = speakerID 195 | #out_path = input('Path to save: ') 196 | out_path = "output.wav" 197 | 198 | with no_grad(): 199 | x_tst = stn_tst.unsqueeze(0) 200 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 201 | sid = LongTensor([speaker_id]) 202 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 203 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy() 204 | 205 | elif choice == 'v': 206 | audio, out_path = voice_conversion() 207 | 208 | write(out_path, hps_ms.data.sampling_rate, audio) 209 | print('Successfully saved!') 210 | #ask_if_continue() 211 | else: 212 | import os 213 | import librosa 214 | import numpy as np 215 | from torch import FloatTensor 216 | import audonnx 217 | w2v2_folder = input('Path of a w2v2 dimensional emotion model: ') 218 | w2v2_model = audonnx.load(os.path.dirname(w2v2_folder)) 219 | #while True: 220 | if(1==1): 221 | #choice = input('TTS or VC? (t/v):') 222 | choice = 't' 223 | if choice == 't': 224 | #text = input('Text to read: ') 225 | text = inputString 226 | if text == '[ADVANCED]': 227 | #text = input('Raw text:') 228 | text = "我不会说" 229 | #print('Cleaned text is:') 230 | #ex_print(_clean_text( 231 | # text, hps_ms.data.text_cleaners), escape) 232 | #continue 233 | 234 | length_scale, text = get_label_value( 235 | text, 'LENGTH', 1, 'length scale') 236 | noise_scale, text = get_label_value( 237 | text, 'NOISE', 0.667, 'noise scale') 238 | noise_scale_w, text = get_label_value( 239 | text, 'NOISEW', 0.8, 'deviation of noise') 240 | cleaned, text = get_label(text, 'CLEANED') 241 | 242 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 243 | 244 | #print_speakers(speakers, escape) 245 | #speaker_id = get_speaker_id('Speaker ID: ') 246 | speaker_id = speakerID 247 | 248 | emotion_reference = input('Path of an emotion reference: ') 249 | if emotion_reference.endswith('.npy'): 250 | emotion = np.load(emotion_reference) 251 | emotion = FloatTensor(emotion).unsqueeze(0) 252 | else: 253 | audio16000, sampling_rate = librosa.load( 254 | emotion_reference, sr=16000, mono=True) 255 | emotion = w2v2_model(audio16000, sampling_rate)[ 256 | 'hidden_states'] 257 | emotion_reference = re.sub( 258 | r'\..*$', '', emotion_reference) 259 | np.save(emotion_reference, emotion.squeeze(0)) 260 | emotion = FloatTensor(emotion) 261 | 262 | #out_path = input('Path to save: ') 263 | out_path = "output.wav" 264 | 265 | with no_grad(): 266 | x_tst = stn_tst.unsqueeze(0) 267 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 268 | sid = LongTensor([speaker_id]) 269 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, 270 | length_scale=length_scale, 
emotion_embedding=emotion)[0][0, 0].data.cpu().float().numpy() 271 | 272 | elif choice == 'v': 273 | audio, out_path = voice_conversion() 274 | 275 | write(out_path, hps_ms.data.sampling_rate, audio) 276 | print('Successfully saved!') 277 | #ask_if_continue() 278 | else: 279 | model = input('Path of a hubert-soft model: ') 280 | from hubert_model import hubert_soft 281 | hubert = hubert_soft(model) 282 | 283 | while True: 284 | audio_path = input('Path of an audio file to convert:\n') 285 | 286 | if audio_path != '[VC]': 287 | import librosa 288 | if use_f0: 289 | audio, sampling_rate = librosa.load( 290 | audio_path, sr=hps_ms.data.sampling_rate, mono=True) 291 | audio16000 = librosa.resample( 292 | audio, orig_sr=sampling_rate, target_sr=16000) 293 | else: 294 | audio16000, sampling_rate = librosa.load( 295 | audio_path, sr=16000, mono=True) 296 | 297 | #print_speakers(speakers, escape) 298 | target_id = get_speaker_id('Target speaker ID: ') 299 | out_path = input('Path to save: ') 300 | length_scale, out_path = get_label_value( 301 | out_path, 'LENGTH', 1, 'length scale') 302 | noise_scale, out_path = get_label_value( 303 | out_path, 'NOISE', 0.1, 'noise scale') 304 | noise_scale_w, out_path = get_label_value( 305 | out_path, 'NOISEW', 0.1, 'deviation of noise') 306 | 307 | from torch import inference_mode, FloatTensor 308 | import numpy as np 309 | with inference_mode(): 310 | units = hubert.units(FloatTensor(audio16000).unsqueeze( 311 | 0).unsqueeze(0)).squeeze(0).numpy() 312 | if use_f0: 313 | f0_scale, out_path = get_label_value( 314 | out_path, 'F0', 1, 'f0 scale') 315 | f0 = librosa.pyin(audio, sr=sampling_rate, 316 | fmin=librosa.note_to_hz('C0'), 317 | fmax=librosa.note_to_hz('C7'), 318 | frame_length=1780)[0] 319 | target_length = len(units[:, 0]) 320 | f0 = np.nan_to_num(np.interp(np.arange(0, len(f0)*target_length, len(f0))/target_length, 321 | np.arange(0, len(f0)), f0)) * f0_scale 322 | units[:, 0] = f0 / 10 323 | 324 | stn_tst = FloatTensor(units) 325 | with no_grad(): 326 | x_tst = stn_tst.unsqueeze(0) 327 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 328 | sid = LongTensor([target_id]) 329 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 330 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy() 331 | 332 | else: 333 | audio, out_path = voice_conversion() 334 | 335 | write(out_path, hps_ms.data.sampling_rate, audio) 336 | print('Successfully saved!') 337 | #ask_if_continue() 338 | 339 | if __name__ == "__main__": 340 | session_token = get_token() 341 | api = ChatGPT(session_token) 342 | print(idmessage) 343 | peaker_id = input() 344 | while True: 345 | resp = api.send_message(get_input()) 346 | answer = resp["message"].replace('\n','') 347 | print("ChatGPT:") 348 | print(answer) 349 | generateSound(answer) 350 | PlaySound(r'.\output.wav', flags=1) 351 | 352 | -------------------------------------------------------------------------------- /ChatWaifuJPVoiceEN.py: -------------------------------------------------------------------------------- 1 | from scipy.io.wavfile import write 2 | from mel_processing import spectrogram_torch 3 | from text import text_to_sequence, _clean_text 4 | from models import SynthesizerTrn 5 | import utils 6 | import commons 7 | import sys 8 | import re 9 | from torch import no_grad, LongTensor 10 | import logging 11 | from winsound import PlaySound 12 | import argparse 13 | import queue 14 | import sounddevice as sd 15 | from vosk import Model, KaldiRecognizer 16 | 17 | q = 
queue.Queue() 18 | def int_or_str(text): 19 | """Helper function for argument parsing.""" 20 | try: 21 | return int(text) 22 | except ValueError: 23 | return text 24 | 25 | 26 | def callback(indata, frames, time, status): 27 | """This is called (from a separate thread) for each audio block.""" 28 | if status: 29 | print(status, file=sys.stderr) 30 | q.put(bytes(indata)) 31 | 32 | 33 | parser = argparse.ArgumentParser(add_help=False) 34 | parser.add_argument( 35 | "-l", "--list-devices", action="store_true", 36 | help="show list of audio devices and exit") 37 | args, remaining = parser.parse_known_args() 38 | if args.list_devices: 39 | parser.exit(0) 40 | parser = argparse.ArgumentParser( 41 | description=__doc__, 42 | formatter_class=argparse.RawDescriptionHelpFormatter, 43 | parents=[parser]) 44 | parser.add_argument( 45 | "-f", "--filename", type=str, metavar="FILENAME", 46 | help="audio file to store recording to") 47 | parser.add_argument( 48 | "-d", "--device", type=int_or_str, 49 | help="input device (numeric ID or substring)") 50 | parser.add_argument( 51 | "-r", "--samplerate", type=int, help="sampling rate") 52 | parser.add_argument( 53 | "-m", "--model", type=str, help="language model; e.g. en-us, fr, nl; default is en-us") 54 | args = parser.parse_args(remaining) 55 | try: 56 | if args.samplerate is None: 57 | device_info = sd.query_devices(args.device, "input") 58 | # soundfile expects an int, sounddevice provides a float: 59 | args.samplerate = int(device_info["default_samplerate"]) 60 | 61 | if args.model is None: 62 | model = Model(lang="en-us") 63 | else: 64 | model = Model(lang=args.model) 65 | 66 | if args.filename: 67 | dump_fn = open(args.filename, "wb") 68 | else: 69 | dump_fn = None 70 | 71 | 72 | 73 | except KeyboardInterrupt: 74 | print("\nDone") 75 | parser.exit(0) 76 | 77 | #################################### 78 | #CHATGPT INITIALIZE 79 | from pyChatGPT import ChatGPT 80 | import json 81 | idmessage = """ID Speaker 82 | 0 綾地寧々 83 | 1 因幡めぐる 84 | 2 朝武芳乃 85 | 3 常陸茉子 86 | 4 ムラサメ 87 | 5 鞍馬小春 88 | 6 在原七海 89 | """ 90 | speakerID = 0 91 | 92 | def voice_input(): 93 | print("You:") 94 | with sd.RawInputStream(samplerate=args.samplerate, blocksize=8000, device=args.device, 95 | dtype="int16", channels=1, callback=callback): 96 | 97 | rec = KaldiRecognizer(model, args.samplerate) 98 | while True: 99 | data = q.get() 100 | if rec.AcceptWaveform(data): 101 | a = json.loads(rec.Result()) 102 | a = str(a['text']) 103 | a = ''.join(a.split()) 104 | if(len(a) > 0): 105 | print(a) 106 | user_input = a + " 使用日本语" 107 | return user_input 108 | if dump_fn is not None: 109 | dump_fn.write(data) 110 | 111 | 112 | def get_token(): 113 | token = input("Copy your token from ChatGPT and press Enter \n") 114 | return token; 115 | 116 | 117 | ################################################ 118 | 119 | 120 | logging.getLogger('numba').setLevel(logging.WARNING) 121 | 122 | 123 | def ex_print(text, escape=False): 124 | if escape: 125 | print(text.encode('unicode_escape').decode()) 126 | else: 127 | print(text) 128 | 129 | 130 | def get_text(text, hps, cleaned=False): 131 | if cleaned: 132 | text_norm = text_to_sequence(text, hps.symbols, []) 133 | else: 134 | text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners) 135 | if hps.data.add_blank: 136 | text_norm = commons.intersperse(text_norm, 0) 137 | text_norm = LongTensor(text_norm) 138 | return text_norm 139 | 140 | 141 | def ask_if_continue(): 142 | while True: 143 | answer = input('Continue? 
(y/n): ') 144 | if answer == 'y': 145 | break 146 | elif answer == 'n': 147 | sys.exit(0) 148 | 149 | 150 | def print_speakers(speakers, escape=False): 151 | if len(speakers) > 100: 152 | return 153 | print('ID\tSpeaker') 154 | for id, name in enumerate(speakers): 155 | ex_print(str(id) + '\t' + name, escape) 156 | 157 | 158 | def get_speaker_id(message): 159 | speaker_id = input(message) 160 | try: 161 | speaker_id = int(speaker_id) 162 | except: 163 | print(str(speaker_id) + ' is not a valid ID!') 164 | sys.exit(1) 165 | return speaker_id 166 | 167 | 168 | def get_label_value(text, label, default, warning_name='value'): 169 | value = re.search(rf'\[{label}=(.+?)\]', text) 170 | if value: 171 | try: 172 | text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1) 173 | value = float(value.group(1)) 174 | except: 175 | print(f'Invalid {warning_name}!') 176 | sys.exit(1) 177 | else: 178 | value = default 179 | return value, text 180 | 181 | 182 | def get_label(text, label): 183 | if f'[{label}]' in text: 184 | return True, text.replace(f'[{label}]', '') 185 | else: 186 | return False, text 187 | 188 | 189 | 190 | def generateSound(inputString): 191 | if '--escape' in sys.argv: 192 | escape = True 193 | else: 194 | escape = False 195 | 196 | #model = input('Path of a VITS model: ') 197 | #config = input('Path of a config file: ') 198 | model = r".\model\H_excluded.pth" 199 | config = r".\model\config.json" 200 | 201 | 202 | hps_ms = utils.get_hparams_from_file(config) 203 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0 204 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0 205 | speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0'] 206 | use_f0 = hps_ms.data.use_f0 if 'use_f0' in hps_ms.data.keys() else False 207 | emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False 208 | 209 | net_g_ms = SynthesizerTrn( 210 | n_symbols, 211 | hps_ms.data.filter_length // 2 + 1, 212 | hps_ms.train.segment_size // hps_ms.data.hop_length, 213 | n_speakers=n_speakers, 214 | emotion_embedding=emotion_embedding, 215 | **hps_ms.model) 216 | _ = net_g_ms.eval() 217 | utils.load_checkpoint(model, net_g_ms) 218 | 219 | def voice_conversion(): 220 | audio_path = input('Path of an audio file to convert:\n') 221 | print_speakers(speakers) 222 | audio = utils.load_audio_to_torch( 223 | audio_path, hps_ms.data.sampling_rate) 224 | 225 | originnal_id = get_speaker_id('Original speaker ID: ') 226 | target_id = get_speaker_id('Target speaker ID: ') 227 | out_path = input('Path to save: ') 228 | 229 | y = audio.unsqueeze(0) 230 | 231 | spec = spectrogram_torch(y, hps_ms.data.filter_length, 232 | hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length, 233 | center=False) 234 | spec_lengths = LongTensor([spec.size(-1)]) 235 | sid_src = LongTensor([originnal_id]) 236 | 237 | with no_grad(): 238 | sid_tgt = LongTensor([target_id]) 239 | audio = net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[ 240 | 0][0, 0].data.cpu().float().numpy() 241 | return audio, out_path 242 | 243 | if n_symbols != 0: 244 | if not emotion_embedding: 245 | #while True: 246 | if(1==1): 247 | #choice = input('TTS or VC? 
(t/v):') 248 | choice = 't' 249 | if choice == 't': 250 | #text = input('Text to read: ') 251 | text = inputString 252 | if text == '[ADVANCED]': 253 | #text = input('Raw text:') 254 | text = "我不会说" 255 | #print('Cleaned text is:') 256 | #ex_print(_clean_text( 257 | # text, hps_ms.data.text_cleaners), escape) 258 | #continue 259 | 260 | length_scale, text = get_label_value( 261 | text, 'LENGTH', 1, 'length scale') 262 | noise_scale, text = get_label_value( 263 | text, 'NOISE', 0.667, 'noise scale') 264 | noise_scale_w, text = get_label_value( 265 | text, 'NOISEW', 0.8, 'deviation of noise') 266 | cleaned, text = get_label(text, 'CLEANED') 267 | 268 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 269 | 270 | #print_speakers(speakers, escape) 271 | #speaker_id = get_speaker_id('Speaker ID: ') 272 | speaker_id = speakerID 273 | #out_path = input('Path to save: ') 274 | out_path = "output.wav" 275 | 276 | with no_grad(): 277 | x_tst = stn_tst.unsqueeze(0) 278 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 279 | sid = LongTensor([speaker_id]) 280 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 281 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy() 282 | 283 | elif choice == 'v': 284 | audio, out_path = voice_conversion() 285 | 286 | write(out_path, hps_ms.data.sampling_rate, audio) 287 | #print('Successfully saved!') 288 | #ask_if_continue() 289 | else: 290 | import os 291 | import librosa 292 | import numpy as np 293 | from torch import FloatTensor 294 | import audonnx 295 | w2v2_folder = input('Path of a w2v2 dimensional emotion model: ') 296 | w2v2_model = audonnx.load(os.path.dirname(w2v2_folder)) 297 | #while True: 298 | if(1==1): 299 | #choice = input('TTS or VC? (t/v):') 300 | choice = 't' 301 | if choice == 't': 302 | #text = input('Text to read: ') 303 | text = inputString 304 | if text == '[ADVANCED]': 305 | #text = input('Raw text:') 306 | text = "我不会说" 307 | #print('Cleaned text is:') 308 | #ex_print(_clean_text( 309 | # text, hps_ms.data.text_cleaners), escape) 310 | #continue 311 | 312 | length_scale, text = get_label_value( 313 | text, 'LENGTH', 1, 'length scale') 314 | noise_scale, text = get_label_value( 315 | text, 'NOISE', 0.667, 'noise scale') 316 | noise_scale_w, text = get_label_value( 317 | text, 'NOISEW', 0.8, 'deviation of noise') 318 | cleaned, text = get_label(text, 'CLEANED') 319 | 320 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 321 | 322 | #print_speakers(speakers, escape) 323 | #speaker_id = get_speaker_id('Speaker ID: ') 324 | speaker_id = speakerID 325 | 326 | emotion_reference = input('Path of an emotion reference: ') 327 | if emotion_reference.endswith('.npy'): 328 | emotion = np.load(emotion_reference) 329 | emotion = FloatTensor(emotion).unsqueeze(0) 330 | else: 331 | audio16000, sampling_rate = librosa.load( 332 | emotion_reference, sr=16000, mono=True) 333 | emotion = w2v2_model(audio16000, sampling_rate)[ 334 | 'hidden_states'] 335 | emotion_reference = re.sub( 336 | r'\..*$', '', emotion_reference) 337 | np.save(emotion_reference, emotion.squeeze(0)) 338 | emotion = FloatTensor(emotion) 339 | 340 | #out_path = input('Path to save: ') 341 | out_path = "output.wav" 342 | 343 | with no_grad(): 344 | x_tst = stn_tst.unsqueeze(0) 345 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 346 | sid = LongTensor([speaker_id]) 347 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, 348 | length_scale=length_scale, 
emotion_embedding=emotion)[0][0, 0].data.cpu().float().numpy() 349 | 350 | elif choice == 'v': 351 | audio, out_path = voice_conversion() 352 | 353 | write(out_path, hps_ms.data.sampling_rate, audio) 354 | #print('Successfully saved!') 355 | #ask_if_continue() 356 | else: 357 | model = input('Path of a hubert-soft model: ') 358 | from hubert_model import hubert_soft 359 | hubert = hubert_soft(model) 360 | 361 | while True: 362 | audio_path = input('Path of an audio file to convert:\n') 363 | 364 | if audio_path != '[VC]': 365 | import librosa 366 | if use_f0: 367 | audio, sampling_rate = librosa.load( 368 | audio_path, sr=hps_ms.data.sampling_rate, mono=True) 369 | audio16000 = librosa.resample( 370 | audio, orig_sr=sampling_rate, target_sr=16000) 371 | else: 372 | audio16000, sampling_rate = librosa.load( 373 | audio_path, sr=16000, mono=True) 374 | 375 | #print_speakers(speakers, escape) 376 | target_id = get_speaker_id('Target speaker ID: ') 377 | out_path = input('Path to save: ') 378 | length_scale, out_path = get_label_value( 379 | out_path, 'LENGTH', 1, 'length scale') 380 | noise_scale, out_path = get_label_value( 381 | out_path, 'NOISE', 0.1, 'noise scale') 382 | noise_scale_w, out_path = get_label_value( 383 | out_path, 'NOISEW', 0.1, 'deviation of noise') 384 | 385 | from torch import inference_mode, FloatTensor 386 | import numpy as np 387 | with inference_mode(): 388 | units = hubert.units(FloatTensor(audio16000).unsqueeze( 389 | 0).unsqueeze(0)).squeeze(0).numpy() 390 | if use_f0: 391 | f0_scale, out_path = get_label_value( 392 | out_path, 'F0', 1, 'f0 scale') 393 | f0 = librosa.pyin(audio, sr=sampling_rate, 394 | fmin=librosa.note_to_hz('C0'), 395 | fmax=librosa.note_to_hz('C7'), 396 | frame_length=1780)[0] 397 | target_length = len(units[:, 0]) 398 | f0 = np.nan_to_num(np.interp(np.arange(0, len(f0)*target_length, len(f0))/target_length, 399 | np.arange(0, len(f0)), f0)) * f0_scale 400 | units[:, 0] = f0 / 10 401 | 402 | stn_tst = FloatTensor(units) 403 | with no_grad(): 404 | x_tst = stn_tst.unsqueeze(0) 405 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 406 | sid = LongTensor([target_id]) 407 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 408 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy() 409 | 410 | else: 411 | audio, out_path = voice_conversion() 412 | 413 | write(out_path, hps_ms.data.sampling_rate, audio) 414 | #print('Successfully saved!') 415 | #ask_if_continue() 416 | 417 | if __name__ == "__main__": 418 | session_token = get_token() 419 | api = ChatGPT(session_token) 420 | print(idmessage) 421 | peaker_id = input() 422 | while True: 423 | resp = api.send_message(voice_input()) 424 | answer = resp["message"].replace('\n','') 425 | print("ChatGPT:") 426 | print(answer) 427 | generateSound(answer) 428 | PlaySound(r'.\output.wav', flags=0) 429 | 430 | -------------------------------------------------------------------------------- /ChatWaifuJPVoiceJP.py: -------------------------------------------------------------------------------- 1 | from scipy.io.wavfile import write 2 | from mel_processing import spectrogram_torch 3 | from text import text_to_sequence, _clean_text 4 | from models import SynthesizerTrn 5 | import utils 6 | import commons 7 | import sys 8 | import re 9 | from torch import no_grad, LongTensor 10 | import logging 11 | from winsound import PlaySound 12 | import argparse 13 | import queue 14 | import sounddevice as sd 15 | from vosk import Model, KaldiRecognizer 16 | 17 | q = 
queue.Queue() 18 | def int_or_str(text): 19 | """Helper function for argument parsing.""" 20 | try: 21 | return int(text) 22 | except ValueError: 23 | return text 24 | 25 | 26 | def callback(indata, frames, time, status): 27 | """This is called (from a separate thread) for each audio block.""" 28 | if status: 29 | print(status, file=sys.stderr) 30 | q.put(bytes(indata)) 31 | 32 | 33 | parser = argparse.ArgumentParser(add_help=False) 34 | parser.add_argument( 35 | "-l", "--list-devices", action="store_true", 36 | help="show list of audio devices and exit") 37 | args, remaining = parser.parse_known_args() 38 | if args.list_devices: 39 | parser.exit(0) 40 | parser = argparse.ArgumentParser( 41 | description=__doc__, 42 | formatter_class=argparse.RawDescriptionHelpFormatter, 43 | parents=[parser]) 44 | parser.add_argument( 45 | "-f", "--filename", type=str, metavar="FILENAME", 46 | help="audio file to store recording to") 47 | parser.add_argument( 48 | "-d", "--device", type=int_or_str, 49 | help="input device (numeric ID or substring)") 50 | parser.add_argument( 51 | "-r", "--samplerate", type=int, help="sampling rate") 52 | parser.add_argument( 53 | "-m", "--model", type=str, help="language model; e.g. en-us, fr, nl; default is en-us") 54 | args = parser.parse_args(remaining) 55 | try: 56 | if args.samplerate is None: 57 | device_info = sd.query_devices(args.device, "input") 58 | # soundfile expects an int, sounddevice provides a float: 59 | args.samplerate = int(device_info["default_samplerate"]) 60 | 61 | if args.model is None: 62 | model = Model(lang="ja") 63 | else: 64 | model = Model(lang=args.model) 65 | 66 | if args.filename: 67 | dump_fn = open(args.filename, "wb") 68 | else: 69 | dump_fn = None 70 | 71 | 72 | 73 | except KeyboardInterrupt: 74 | print("\nDone") 75 | parser.exit(0) 76 | 77 | #################################### 78 | #CHATGPT INITIALIZE 79 | from pyChatGPT import ChatGPT 80 | import json 81 | idmessage = """ID Speaker 82 | 0 綾地寧々 83 | 1 因幡めぐる 84 | 2 朝武芳乃 85 | 3 常陸茉子 86 | 4 ムラサメ 87 | 5 鞍馬小春 88 | 6 在原七海 89 | """ 90 | speakerID = 0 91 | 92 | def voice_input(): 93 | print("You:") 94 | with sd.RawInputStream(samplerate=args.samplerate, blocksize=8000, device=args.device, 95 | dtype="int16", channels=1, callback=callback): 96 | 97 | rec = KaldiRecognizer(model, args.samplerate) 98 | while True: 99 | data = q.get() 100 | if rec.AcceptWaveform(data): 101 | a = json.loads(rec.Result()) 102 | a = str(a['text']) 103 | a = ''.join(a.split()) 104 | if(len(a) > 0): 105 | print(a) 106 | user_input = a + " 使用日本语" 107 | return user_input 108 | if dump_fn is not None: 109 | dump_fn.write(data) 110 | 111 | 112 | def get_token(): 113 | token = input("Copy your token from ChatGPT and press Enter \n") 114 | return token; 115 | 116 | 117 | ################################################ 118 | 119 | 120 | logging.getLogger('numba').setLevel(logging.WARNING) 121 | 122 | 123 | def ex_print(text, escape=False): 124 | if escape: 125 | print(text.encode('unicode_escape').decode()) 126 | else: 127 | print(text) 128 | 129 | 130 | def get_text(text, hps, cleaned=False): 131 | if cleaned: 132 | text_norm = text_to_sequence(text, hps.symbols, []) 133 | else: 134 | text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners) 135 | if hps.data.add_blank: 136 | text_norm = commons.intersperse(text_norm, 0) 137 | text_norm = LongTensor(text_norm) 138 | return text_norm 139 | 140 | 141 | def ask_if_continue(): 142 | while True: 143 | answer = input('Continue? 
(y/n): ') 144 | if answer == 'y': 145 | break 146 | elif answer == 'n': 147 | sys.exit(0) 148 | 149 | 150 | def print_speakers(speakers, escape=False): 151 | if len(speakers) > 100: 152 | return 153 | print('ID\tSpeaker') 154 | for id, name in enumerate(speakers): 155 | ex_print(str(id) + '\t' + name, escape) 156 | 157 | 158 | def get_speaker_id(message): 159 | speaker_id = input(message) 160 | try: 161 | speaker_id = int(speaker_id) 162 | except: 163 | print(str(speaker_id) + ' is not a valid ID!') 164 | sys.exit(1) 165 | return speaker_id 166 | 167 | 168 | def get_label_value(text, label, default, warning_name='value'): 169 | value = re.search(rf'\[{label}=(.+?)\]', text) 170 | if value: 171 | try: 172 | text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1) 173 | value = float(value.group(1)) 174 | except: 175 | print(f'Invalid {warning_name}!') 176 | sys.exit(1) 177 | else: 178 | value = default 179 | return value, text 180 | 181 | 182 | def get_label(text, label): 183 | if f'[{label}]' in text: 184 | return True, text.replace(f'[{label}]', '') 185 | else: 186 | return False, text 187 | 188 | 189 | 190 | def generateSound(inputString): 191 | if '--escape' in sys.argv: 192 | escape = True 193 | else: 194 | escape = False 195 | 196 | #model = input('Path of a VITS model: ') 197 | #config = input('Path of a config file: ') 198 | model = r".\model\H_excluded.pth" 199 | config = r".\model\config.json" 200 | 201 | 202 | hps_ms = utils.get_hparams_from_file(config) 203 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0 204 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0 205 | speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0'] 206 | use_f0 = hps_ms.data.use_f0 if 'use_f0' in hps_ms.data.keys() else False 207 | emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False 208 | 209 | net_g_ms = SynthesizerTrn( 210 | n_symbols, 211 | hps_ms.data.filter_length // 2 + 1, 212 | hps_ms.train.segment_size // hps_ms.data.hop_length, 213 | n_speakers=n_speakers, 214 | emotion_embedding=emotion_embedding, 215 | **hps_ms.model) 216 | _ = net_g_ms.eval() 217 | utils.load_checkpoint(model, net_g_ms) 218 | 219 | def voice_conversion(): 220 | audio_path = input('Path of an audio file to convert:\n') 221 | print_speakers(speakers) 222 | audio = utils.load_audio_to_torch( 223 | audio_path, hps_ms.data.sampling_rate) 224 | 225 | originnal_id = get_speaker_id('Original speaker ID: ') 226 | target_id = get_speaker_id('Target speaker ID: ') 227 | out_path = input('Path to save: ') 228 | 229 | y = audio.unsqueeze(0) 230 | 231 | spec = spectrogram_torch(y, hps_ms.data.filter_length, 232 | hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length, 233 | center=False) 234 | spec_lengths = LongTensor([spec.size(-1)]) 235 | sid_src = LongTensor([originnal_id]) 236 | 237 | with no_grad(): 238 | sid_tgt = LongTensor([target_id]) 239 | audio = net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[ 240 | 0][0, 0].data.cpu().float().numpy() 241 | return audio, out_path 242 | 243 | if n_symbols != 0: 244 | if not emotion_embedding: 245 | #while True: 246 | if(1==1): 247 | #choice = input('TTS or VC? 
(t/v):') 248 | choice = 't' 249 | if choice == 't': 250 | #text = input('Text to read: ') 251 | text = inputString 252 | if text == '[ADVANCED]': 253 | #text = input('Raw text:') 254 | text = "我不会说" 255 | #print('Cleaned text is:') 256 | #ex_print(_clean_text( 257 | # text, hps_ms.data.text_cleaners), escape) 258 | #continue 259 | 260 | length_scale, text = get_label_value( 261 | text, 'LENGTH', 1, 'length scale') 262 | noise_scale, text = get_label_value( 263 | text, 'NOISE', 0.667, 'noise scale') 264 | noise_scale_w, text = get_label_value( 265 | text, 'NOISEW', 0.8, 'deviation of noise') 266 | cleaned, text = get_label(text, 'CLEANED') 267 | 268 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 269 | 270 | #print_speakers(speakers, escape) 271 | #speaker_id = get_speaker_id('Speaker ID: ') 272 | speaker_id = speakerID 273 | #out_path = input('Path to save: ') 274 | out_path = "output.wav" 275 | 276 | with no_grad(): 277 | x_tst = stn_tst.unsqueeze(0) 278 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 279 | sid = LongTensor([speaker_id]) 280 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 281 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy() 282 | 283 | elif choice == 'v': 284 | audio, out_path = voice_conversion() 285 | 286 | write(out_path, hps_ms.data.sampling_rate, audio) 287 | #print('Successfully saved!') 288 | #ask_if_continue() 289 | else: 290 | import os 291 | import librosa 292 | import numpy as np 293 | from torch import FloatTensor 294 | import audonnx 295 | w2v2_folder = input('Path of a w2v2 dimensional emotion model: ') 296 | w2v2_model = audonnx.load(os.path.dirname(w2v2_folder)) 297 | #while True: 298 | if(1==1): 299 | #choice = input('TTS or VC? (t/v):') 300 | choice = 't' 301 | if choice == 't': 302 | #text = input('Text to read: ') 303 | text = inputString 304 | if text == '[ADVANCED]': 305 | #text = input('Raw text:') 306 | text = "我不会说" 307 | #print('Cleaned text is:') 308 | #ex_print(_clean_text( 309 | # text, hps_ms.data.text_cleaners), escape) 310 | #continue 311 | 312 | length_scale, text = get_label_value( 313 | text, 'LENGTH', 1, 'length scale') 314 | noise_scale, text = get_label_value( 315 | text, 'NOISE', 0.667, 'noise scale') 316 | noise_scale_w, text = get_label_value( 317 | text, 'NOISEW', 0.8, 'deviation of noise') 318 | cleaned, text = get_label(text, 'CLEANED') 319 | 320 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 321 | 322 | #print_speakers(speakers, escape) 323 | #speaker_id = get_speaker_id('Speaker ID: ') 324 | speaker_id = speakerID 325 | 326 | emotion_reference = input('Path of an emotion reference: ') 327 | if emotion_reference.endswith('.npy'): 328 | emotion = np.load(emotion_reference) 329 | emotion = FloatTensor(emotion).unsqueeze(0) 330 | else: 331 | audio16000, sampling_rate = librosa.load( 332 | emotion_reference, sr=16000, mono=True) 333 | emotion = w2v2_model(audio16000, sampling_rate)[ 334 | 'hidden_states'] 335 | emotion_reference = re.sub( 336 | r'\..*$', '', emotion_reference) 337 | np.save(emotion_reference, emotion.squeeze(0)) 338 | emotion = FloatTensor(emotion) 339 | 340 | #out_path = input('Path to save: ') 341 | out_path = "output.wav" 342 | 343 | with no_grad(): 344 | x_tst = stn_tst.unsqueeze(0) 345 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 346 | sid = LongTensor([speaker_id]) 347 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, 348 | length_scale=length_scale, 
emotion_embedding=emotion)[0][0, 0].data.cpu().float().numpy() 349 | 350 | elif choice == 'v': 351 | audio, out_path = voice_conversion() 352 | 353 | write(out_path, hps_ms.data.sampling_rate, audio) 354 | #print('Successfully saved!') 355 | #ask_if_continue() 356 | else: 357 | model = input('Path of a hubert-soft model: ') 358 | from hubert_model import hubert_soft 359 | hubert = hubert_soft(model) 360 | 361 | while True: 362 | audio_path = input('Path of an audio file to convert:\n') 363 | 364 | if audio_path != '[VC]': 365 | import librosa 366 | if use_f0: 367 | audio, sampling_rate = librosa.load( 368 | audio_path, sr=hps_ms.data.sampling_rate, mono=True) 369 | audio16000 = librosa.resample( 370 | audio, orig_sr=sampling_rate, target_sr=16000) 371 | else: 372 | audio16000, sampling_rate = librosa.load( 373 | audio_path, sr=16000, mono=True) 374 | 375 | #print_speakers(speakers, escape) 376 | target_id = get_speaker_id('Target speaker ID: ') 377 | out_path = input('Path to save: ') 378 | length_scale, out_path = get_label_value( 379 | out_path, 'LENGTH', 1, 'length scale') 380 | noise_scale, out_path = get_label_value( 381 | out_path, 'NOISE', 0.1, 'noise scale') 382 | noise_scale_w, out_path = get_label_value( 383 | out_path, 'NOISEW', 0.1, 'deviation of noise') 384 | 385 | from torch import inference_mode, FloatTensor 386 | import numpy as np 387 | with inference_mode(): 388 | units = hubert.units(FloatTensor(audio16000).unsqueeze( 389 | 0).unsqueeze(0)).squeeze(0).numpy() 390 | if use_f0: 391 | f0_scale, out_path = get_label_value( 392 | out_path, 'F0', 1, 'f0 scale') 393 | f0 = librosa.pyin(audio, sr=sampling_rate, 394 | fmin=librosa.note_to_hz('C0'), 395 | fmax=librosa.note_to_hz('C7'), 396 | frame_length=1780)[0] 397 | target_length = len(units[:, 0]) 398 | f0 = np.nan_to_num(np.interp(np.arange(0, len(f0)*target_length, len(f0))/target_length, 399 | np.arange(0, len(f0)), f0)) * f0_scale 400 | units[:, 0] = f0 / 10 401 | 402 | stn_tst = FloatTensor(units) 403 | with no_grad(): 404 | x_tst = stn_tst.unsqueeze(0) 405 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 406 | sid = LongTensor([target_id]) 407 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 408 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy() 409 | 410 | else: 411 | audio, out_path = voice_conversion() 412 | 413 | write(out_path, hps_ms.data.sampling_rate, audio) 414 | #print('Successfully saved!') 415 | #ask_if_continue() 416 | 417 | if __name__ == "__main__": 418 | session_token = get_token() 419 | api = ChatGPT(session_token) 420 | print(idmessage) 421 | peaker_id = input() 422 | while True: 423 | resp = api.send_message(voice_input()) 424 | answer = resp["message"].replace('\n','') 425 | print("ChatGPT:") 426 | print(answer) 427 | generateSound(answer) 428 | PlaySound(r'.\output.wav', flags=0) 429 | 430 | -------------------------------------------------------------------------------- /ChatWaifuVoice.py: -------------------------------------------------------------------------------- 1 | from scipy.io.wavfile import write 2 | from mel_processing import spectrogram_torch 3 | from text import text_to_sequence, _clean_text 4 | from models import SynthesizerTrn 5 | import utils 6 | import commons 7 | import sys 8 | import re 9 | from torch import no_grad, LongTensor 10 | import logging 11 | from winsound import PlaySound 12 | import argparse 13 | import queue 14 | import sounddevice as sd 15 | from vosk import Model, KaldiRecognizer 16 | 17 | 
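# Default locations of the pretrained VITS checkpoints and configs; the downloaded
# models are expected under .\model as described in step 2 of the README.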
chinese_model_path = ".\model\CN\model.pth" 18 | chinese_config_path = ".\model\CN\config.json" 19 | japanese_model_path = ".\model\H_excluded.pth" 20 | japanese_config_path = ".\model\config.json" 21 | 22 | q = queue.Queue() 23 | def int_or_str(text): 24 | """Helper function for argument parsing.""" 25 | try: 26 | return int(text) 27 | except ValueError: 28 | return text 29 | 30 | 31 | def callback(indata, frames, time, status): 32 | """This is called (from a separate thread) for each audio block.""" 33 | if status: 34 | print(status, file=sys.stderr) 35 | q.put(bytes(indata)) 36 | 37 | 38 | parser = argparse.ArgumentParser(add_help=False) 39 | parser.add_argument( 40 | "-l", "--list-devices", action="store_true", 41 | help="show list of audio devices and exit") 42 | args, remaining = parser.parse_known_args() 43 | if args.list_devices: 44 | parser.exit(0) 45 | parser = argparse.ArgumentParser( 46 | description=__doc__, 47 | formatter_class=argparse.RawDescriptionHelpFormatter, 48 | parents=[parser]) 49 | parser.add_argument( 50 | "-f", "--filename", type=str, metavar="FILENAME", 51 | help="audio file to store recording to") 52 | parser.add_argument( 53 | "-d", "--device", type=int_or_str, 54 | help="input device (numeric ID or substring)") 55 | parser.add_argument( 56 | "-r", "--samplerate", type=int, help="sampling rate") 57 | parser.add_argument( 58 | "-m", "--model", type=str, help="language model; e.g. en-us, fr, nl; default is en-us") 59 | args = parser.parse_args(remaining) 60 | try: 61 | if args.samplerate is None: 62 | device_info = sd.query_devices(args.device, "input") 63 | # soundfile expects an int, sounddevice provides a float: 64 | args.samplerate = int(device_info["default_samplerate"]) 65 | 66 | if args.model is None: 67 | model = Model(lang="en-us") 68 | else: 69 | model = Model(lang=args.model) 70 | 71 | if args.filename: 72 | dump_fn = open(args.filename, "wb") 73 | else: 74 | dump_fn = None 75 | 76 | 77 | 78 | except KeyboardInterrupt: 79 | print("\nDone") 80 | parser.exit(0) 81 | 82 | #################################### 83 | #CHATGPT INITIALIZE 84 | from pyChatGPT import ChatGPT 85 | import json 86 | 87 | modelmessage = """ID Output Language 88 | 0 Chinese 89 | 1 Japanese 90 | """ 91 | 92 | idmessage_cn = """ID Speaker 93 | 0 綾地寧々 94 | 1 在原七海 95 | 2 小茸 96 | 3 唐乐吟 97 | """ 98 | 99 | idmessage_jp = """ID Speaker 100 | 0 綾地寧々 101 | 1 因幡めぐる 102 | 2 朝武芳乃 103 | 3 常陸茉子 104 | 4 ムラサメ 105 | 5 鞍馬小春 106 | 6 在原七海 107 | """ 108 | 109 | inputLanguage = """ID Input Language 110 | 0 Chinese 111 | 1 Japanese 112 | 2 English 113 | """ 114 | 115 | def voice_input_jp(): 116 | model = Model(lang="cn") 117 | print("You:") 118 | with sd.RawInputStream(samplerate=args.samplerate, blocksize=8000, device=args.device, 119 | dtype="int16", channels=1, callback=callback): 120 | 121 | rec = KaldiRecognizer(model, args.samplerate) 122 | while True: 123 | data = q.get() 124 | if rec.AcceptWaveform(data): 125 | a = json.loads(rec.Result()) 126 | a = str(a['text']) 127 | a = ''.join(a.split()) 128 | if(len(a) > 0): 129 | print(a) 130 | user_input = a + " 使用日本语" 131 | return user_input 132 | if dump_fn is not None: 133 | dump_fn.write(data) 134 | 135 | def voice_input_cn(): 136 | model = Model(lang="cn") 137 | print("You:") 138 | with sd.RawInputStream(samplerate=args.samplerate, blocksize=8000, device=args.device, 139 | dtype="int16", channels=1, callback=callback): 140 | 141 | rec = KaldiRecognizer(model, args.samplerate) 142 | while True: 143 | data = q.get() 144 | if rec.AcceptWaveform(data): 145 
| a = json.loads(rec.Result()) 146 | a = str(a['text']) 147 | a = ''.join(a.split()) 148 | if(len(a) > 0): 149 | print(a) 150 | user_input = a 151 | return user_input 152 | if dump_fn is not None: 153 | dump_fn.write(data) 154 | 155 | def voice_input_jpjp(): 156 | model = Model(lang="ja") 157 | print("You:") 158 | with sd.RawInputStream(samplerate=args.samplerate, blocksize=8000, device=args.device, 159 | dtype="int16", channels=1, callback=callback): 160 | 161 | rec = KaldiRecognizer(model, args.samplerate) 162 | while True: 163 | data = q.get() 164 | if rec.AcceptWaveform(data): 165 | a = json.loads(rec.Result()) 166 | a = str(a['text']) 167 | a = ''.join(a.split()) 168 | if(len(a) > 0): 169 | print(a) 170 | user_input = a + " 使用日本语" 171 | return user_input 172 | if dump_fn is not None: 173 | dump_fn.write(data) 174 | 175 | def voice_input_enjp(): 176 | model = Model(lang="en-us") 177 | print("You:") 178 | with sd.RawInputStream(samplerate=args.samplerate, blocksize=8000, device=args.device, 179 | dtype="int16", channels=1, callback=callback): 180 | 181 | rec = KaldiRecognizer(model, args.samplerate) 182 | while True: 183 | data = q.get() 184 | if rec.AcceptWaveform(data): 185 | a = json.loads(rec.Result()) 186 | a = str(a['text']) 187 | a = ''.join(a.split()) 188 | if(len(a) > 0): 189 | print(a) 190 | user_input = a + " 使用日本语" 191 | return user_input 192 | if dump_fn is not None: 193 | dump_fn.write(data) 194 | 195 | 196 | def get_token(): 197 | token = input("Copy your token from ChatGPT and press Enter \n") 198 | return token 199 | 200 | 201 | ################################################ 202 | logging.getLogger('numba').setLevel(logging.WARNING) 203 | 204 | def get_text(text, hps, cleaned=False): 205 | if cleaned: 206 | text_norm = text_to_sequence(text, hps.symbols, []) 207 | else: 208 | text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners) 209 | if hps.data.add_blank: 210 | text_norm = commons.intersperse(text_norm, 0) 211 | text_norm = LongTensor(text_norm) 212 | return text_norm 213 | 214 | def get_speaker_id(message): 215 | speaker_id = input(message) 216 | try: 217 | speaker_id = int(speaker_id) 218 | except: 219 | print(str(speaker_id) + ' is not a valid ID!') 220 | sys.exit(1) 221 | return speaker_id 222 | 223 | def get_model_id(message): 224 | speaker_id = input(message) 225 | try: 226 | speaker_id = int(speaker_id) 227 | except: 228 | print(str(speaker_id) + ' is not a valid ID!') 229 | sys.exit(1) 230 | return speaker_id 231 | 232 | def get_language_id(message): 233 | speaker_id = input(message) 234 | try: 235 | speaker_id = int(speaker_id) 236 | except: 237 | print(str(speaker_id) + ' is not a valid ID!') 238 | sys.exit(1) 239 | return speaker_id 240 | 241 | 242 | def get_label_value(text, label, default, warning_name='value'): 243 | value = re.search(rf'\[{label}=(.+?)\]', text) 244 | if value: 245 | try: 246 | text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1) 247 | value = float(value.group(1)) 248 | except: 249 | print(f'Invalid {warning_name}!') 250 | sys.exit(1) 251 | else: 252 | value = default 253 | return value, text 254 | 255 | 256 | def get_label(text, label): 257 | if f'[{label}]' in text: 258 | return True, text.replace(f'[{label}]', '') 259 | else: 260 | return False, text 261 | 262 | 263 | def generateSound(inputString, id, model_id): 264 | if '--escape' in sys.argv: 265 | escape = True 266 | else: 267 | escape = False 268 | 269 | #model = input('0: Chinese') 270 | #config = input('Path of a config file: ') 271 | if model_id == 0: 272 | 
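# model_id 0 selects the Chinese VITS model, model_id 1 the Japanese one
# (see modelmessage above).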
model = chinese_model_path 273 | config = chinese_config_path 274 | elif model_id == 1: 275 | model = japanese_model_path 276 | config = japanese_config_path 277 | 278 | 279 | hps_ms = utils.get_hparams_from_file(config) 280 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0 281 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0 282 | emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False 283 | 284 | net_g_ms = SynthesizerTrn( 285 | n_symbols, 286 | hps_ms.data.filter_length // 2 + 1, 287 | hps_ms.train.segment_size // hps_ms.data.hop_length, 288 | n_speakers=n_speakers, 289 | emotion_embedding=emotion_embedding, 290 | **hps_ms.model) 291 | _ = net_g_ms.eval() 292 | utils.load_checkpoint(model, net_g_ms) 293 | 294 | if n_symbols != 0: 295 | if not emotion_embedding: 296 | #while True: 297 | if(1 == 1): 298 | choice = 't' 299 | if choice == 't': 300 | text = inputString 301 | if text == '[ADVANCED]': 302 | text = "我不会说" 303 | 304 | length_scale, text = get_label_value( 305 | text, 'LENGTH', 1, 'length scale') 306 | noise_scale, text = get_label_value( 307 | text, 'NOISE', 0.667, 'noise scale') 308 | noise_scale_w, text = get_label_value( 309 | text, 'NOISEW', 0.8, 'deviation of noise') 310 | cleaned, text = get_label(text, 'CLEANED') 311 | 312 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 313 | 314 | speaker_id = id 315 | out_path = "output.wav" 316 | 317 | with no_grad(): 318 | x_tst = stn_tst.unsqueeze(0) 319 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 320 | sid = LongTensor([speaker_id]) 321 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 322 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy() 323 | 324 | write(out_path, hps_ms.data.sampling_rate, audio) 325 | print('Successfully saved!') 326 | 327 | if __name__ == "__main__": 328 | session_token = get_token() 329 | api = ChatGPT(session_token) 330 | 331 | print(inputLanguage) 332 | language_id = get_language_id("选择输入语言:") 333 | if language_id == 0: #cn 334 | print(modelmessage) 335 | model_id = get_model_id('选择回复语言: ') 336 | if model_id == 0: 337 | print("\n" + idmessage_cn) 338 | id = get_speaker_id('选择角色: ') 339 | elif model_id == 1: 340 | print("\n" + idmessage_jp) 341 | id = get_speaker_id('选择角色: ') 342 | elif language_id == 1: #jp 343 | model_id = 1 344 | print("\n" + idmessage_jp) 345 | id = get_speaker_id('选择角色: ') 346 | elif language_id == 2: #en 347 | model_id = 1 348 | print("\n" + idmessage_cn) 349 | id = get_speaker_id('选择角色: ') 350 | 351 | print() 352 | while True: 353 | 354 | if language_id == 0 and model_id == 0: #input=cn output=cn 355 | resp = api.send_message(voice_input_cn()) 356 | if(resp == "quit()"): 357 | break 358 | answer = resp["message"].replace('\n','') 359 | print("ChatGPT:") 360 | print(answer) 361 | generateSound("[ZH]"+answer+"[ZH]", id, model_id) 362 | PlaySound(r'.\output.wav', flags=1) 363 | elif language_id == 0 and model_id == 1: #input=cn output=jp 364 | resp = api.send_message(voice_input_jp()) 365 | if(resp == "quit()"): 366 | break 367 | answer = resp["message"].replace('\n','') 368 | print("ChatGPT:") 369 | print(answer) 370 | generateSound(answer, id, model_id) 371 | PlaySound(r'.\output.wav', flags=1) 372 | elif language_id == 1: #input=jp output=jp 373 | resp = api.send_message(voice_input_jpjp()) 374 | answer = resp["message"].replace('\n','') 375 | print("ChatGPT:") 376 | print(answer) 377 | 
generateSound(answer, id, model_id) 378 | PlaySound(r'.\output.wav', flags=0) 379 | elif language_id == 2: #input=en output=jp 380 | resp = api.send_message(voice_input_enjp()) 381 | answer = resp["message"].replace('\n','') 382 | print("ChatGPT:") 383 | print(answer) 384 | generateSound(answer, id, model_id) 385 | PlaySound(r'.\output.wav', flags=0) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 CjangCjengh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |  2 | 3 | [中文](README.md "中文") [English](eng-README.md "English") [日本語](jp-README.md "日本語") 4 | 5 |
10 | 11 | # 12 | 13 | ### 这是一个使用TTS+VITS的ChatGPT语音对话程序! 14 | 15 | 效果演示BiliBIli:[《青春猪头少年不会梦见赛博女友》](https://www.bilibili.com/video/BV1rv4y1Q7eT "BiliBili") 16 | 17 | **当前支持功能:** 18 | * [x] ChatGPT的对话聊天 19 | * [x] 回答转语音 20 | * [x] 多角色语音 21 | * [x] 语音识别对话 (研发了一款真正人性化的智能语音Q宝 22 | * [x] [对接Marai机器人](https://github.com/MuBai-He/ChatWaifu-marai) 23 | * [x] [对接Live2D的UI版本](https://github.com/cjyaddone/ChatWaifuL2D) 24 | * [x] [使用gpt3官方api,并支持cuda加速的版本(当前仅源码](https://github.com/cjyaddone/ChatWaifu-API) 25 | 26 | 27 | 28 | # 目录 29 | ### 本项目均默认使用Chrome浏览器 30 | * [1.安装环境:](#1.) 31 | * 1.1 [使用cd命令进入项目文件夹](#cd) 32 | * 1.2 [创建Python虚拟环境:](#99) 33 | * 1.3 [进入创建好的虚拟环境:](#venv) 34 | * 1.4 [pip安装项目所需要的库文件:](#pip) 35 | * [2.导入模型到根目录model文件夹(如果没有自行创建):](#.model) 36 | * 2.1 [双击导入model](#cd1) 37 | * [3.运行(快和我的老婆们对话吧:](#22) 38 | * 3.1 [获取ChatGPT Token](#333) 39 | * 3.2 [开始和CyberWaifu聊天](#444) 40 | * [4.鸣谢](#915) 41 | ## 1.安装环境: 42 | > **安装anaconda环境或Python>=3.7** 43 | > 44 | > **本例使用的环境名称是:chatWaifu** 45 | 46 | ### 1.1 使用cd命令进入项目文件夹 47 | `cd 你的项目路径` 48 |  49 | ### 1.2 创建Python虚拟环境: 50 | 51 | Conda:`conda create --name chatWaifu python=3.10` 52 |  53 |  54 | Python:`python -m venv chatWaifu` 55 |  56 | 57 | ### 1.3 进入创建好的虚拟环境: 58 | Conda:`conda activate chatWaifu` 59 |  60 | 61 | Python:`.\chatWaifu\Scripts\activate.bat` 62 |  63 | 64 | ### 1.4 pip安装项目所需要的库文件: 65 | `pip install -r requirements.txt` 66 |  67 | 68 | ## 2.导入模型到根目录model文件夹: 69 | Google Drive:https://drive.google.com/file/d/1tMCafhnUoL7FbevVQ44VQi-WznDjt23_/view?usp=sharing 70 | 71 | 阿里云盘: https://www.aliyundrive.com/s/9JEj1mp1ZRv 提取码: m2y3 72 | 73 | ### 2.1移动到项目根目录下双击导入model 74 | 75 | ## 3.运行(快和老婆们对话吧: 76 | 77 | 打字输入版:`python ChatWaifu.py` 78 | 79 | 语音对话版(日语和英语输入默认日语输出):`python ChatWaifuVoice.py` 80 | 81 | ### 以下也可以使用,每个文件只对应一种语音输出输入模式 82 | 83 | 打字日语版:`python ChatWaifuJP.py` 84 | 85 | 打字中文版:`python ChatWaifuCN.py` 86 | 87 | 日语语音对话版(使用中文):`python ChatWaifuJPVoice.py` 88 | 89 | 中文语音对话版(使用中文):`python ChatWaifuCNVoice.py` 90 | 91 | 日文语音对话版(使用英文):`python ChatWaifuJPVoiceEN.py` 92 | 93 | 日语语音对话版(使用日文):`python ChatWaifuJPVoiceJP.py` 94 | 95 | ### 3.1 获取ChatGPT Token 96 | #### 在浏览器登入https://chat.openai.com 97 | #### 按F12进入开发控制台 98 | #### 找到 应用程序 -> cookie -> __Secure-next-auth.session-token 99 |  100 | #### 将值复制进入终端并回车 101 | 102 | ### 3.2 开始和CyberWaifu聊天!!! 
103 | 104 | **语音对话版:** 当控制台提示"You:"时开始说话,说完并出现句子录音结束并发送到ChatGPT对话。 105 | 106 | 附赠:[ChatGPT 中文调教指南](https://github.com/PlexPt/awesome-chatgpt-prompts-zh) 107 | 108 | ## 4.鸣谢: 109 | - [MoeGoe_GUI]https://github.com/CjangCjengh/MoeGoe_GUI 110 | - [Pretrained models]https://github.com/CjangCjengh/TTSModels 111 | - [PyChatGPT]https://github.com/terry3041/pyChatGPT 112 | -------------------------------------------------------------------------------- /attentions.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import commons 7 | from modules import LayerNorm 8 | 9 | 10 | class Encoder(nn.Module): 11 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs): 12 | super().__init__() 13 | self.hidden_channels = hidden_channels 14 | self.filter_channels = filter_channels 15 | self.n_heads = n_heads 16 | self.n_layers = n_layers 17 | self.kernel_size = kernel_size 18 | self.p_dropout = p_dropout 19 | self.window_size = window_size 20 | 21 | self.drop = nn.Dropout(p_dropout) 22 | self.attn_layers = nn.ModuleList() 23 | self.norm_layers_1 = nn.ModuleList() 24 | self.ffn_layers = nn.ModuleList() 25 | self.norm_layers_2 = nn.ModuleList() 26 | for i in range(self.n_layers): 27 | self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size)) 28 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 29 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)) 30 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 31 | 32 | def forward(self, x, x_mask): 33 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 34 | x = x * x_mask 35 | for i in range(self.n_layers): 36 | y = self.attn_layers[i](x, x, attn_mask) 37 | y = self.drop(y) 38 | x = self.norm_layers_1[i](x + y) 39 | 40 | y = self.ffn_layers[i](x, x_mask) 41 | y = self.drop(y) 42 | x = self.norm_layers_2[i](x + y) 43 | x = x * x_mask 44 | return x 45 | 46 | 47 | class Decoder(nn.Module): 48 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs): 49 | super().__init__() 50 | self.hidden_channels = hidden_channels 51 | self.filter_channels = filter_channels 52 | self.n_heads = n_heads 53 | self.n_layers = n_layers 54 | self.kernel_size = kernel_size 55 | self.p_dropout = p_dropout 56 | self.proximal_bias = proximal_bias 57 | self.proximal_init = proximal_init 58 | 59 | self.drop = nn.Dropout(p_dropout) 60 | self.self_attn_layers = nn.ModuleList() 61 | self.norm_layers_0 = nn.ModuleList() 62 | self.encdec_attn_layers = nn.ModuleList() 63 | self.norm_layers_1 = nn.ModuleList() 64 | self.ffn_layers = nn.ModuleList() 65 | self.norm_layers_2 = nn.ModuleList() 66 | for i in range(self.n_layers): 67 | self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)) 68 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 69 | self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)) 70 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 71 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, 
causal=True)) 72 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 73 | 74 | def forward(self, x, x_mask, h, h_mask): 75 | """ 76 | x: decoder input 77 | h: encoder output 78 | """ 79 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) 80 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 81 | x = x * x_mask 82 | for i in range(self.n_layers): 83 | y = self.self_attn_layers[i](x, x, self_attn_mask) 84 | y = self.drop(y) 85 | x = self.norm_layers_0[i](x + y) 86 | 87 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) 88 | y = self.drop(y) 89 | x = self.norm_layers_1[i](x + y) 90 | 91 | y = self.ffn_layers[i](x, x_mask) 92 | y = self.drop(y) 93 | x = self.norm_layers_2[i](x + y) 94 | x = x * x_mask 95 | return x 96 | 97 | 98 | class MultiHeadAttention(nn.Module): 99 | def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False): 100 | super().__init__() 101 | assert channels % n_heads == 0 102 | 103 | self.channels = channels 104 | self.out_channels = out_channels 105 | self.n_heads = n_heads 106 | self.p_dropout = p_dropout 107 | self.window_size = window_size 108 | self.heads_share = heads_share 109 | self.block_length = block_length 110 | self.proximal_bias = proximal_bias 111 | self.proximal_init = proximal_init 112 | self.attn = None 113 | 114 | self.k_channels = channels // n_heads 115 | self.conv_q = nn.Conv1d(channels, channels, 1) 116 | self.conv_k = nn.Conv1d(channels, channels, 1) 117 | self.conv_v = nn.Conv1d(channels, channels, 1) 118 | self.conv_o = nn.Conv1d(channels, out_channels, 1) 119 | self.drop = nn.Dropout(p_dropout) 120 | 121 | if window_size is not None: 122 | n_heads_rel = 1 if heads_share else n_heads 123 | rel_stddev = self.k_channels**-0.5 124 | self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 125 | self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 126 | 127 | nn.init.xavier_uniform_(self.conv_q.weight) 128 | nn.init.xavier_uniform_(self.conv_k.weight) 129 | nn.init.xavier_uniform_(self.conv_v.weight) 130 | if proximal_init: 131 | with torch.no_grad(): 132 | self.conv_k.weight.copy_(self.conv_q.weight) 133 | self.conv_k.bias.copy_(self.conv_q.bias) 134 | 135 | def forward(self, x, c, attn_mask=None): 136 | q = self.conv_q(x) 137 | k = self.conv_k(c) 138 | v = self.conv_v(c) 139 | 140 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 141 | 142 | x = self.conv_o(x) 143 | return x 144 | 145 | def attention(self, query, key, value, mask=None): 146 | # reshape [b, d, t] -> [b, n_h, t, d_k] 147 | b, d, t_s, t_t = (*key.size(), query.size(2)) 148 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 149 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 150 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 151 | 152 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) 153 | if self.window_size is not None: 154 | assert t_s == t_t, "Relative attention is only available for self-attention." 
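# Windowed relative-position attention: look up the learned relative-key embeddings
# for offsets in [-window_size, window_size], score the queries against them, convert
# the resulting [b, n_h, t, 2*t-1] relative logits to absolute positions [b, n_h, t, t],
# and add them to the content-based scores.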
155 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 156 | rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings) 157 | scores_local = self._relative_position_to_absolute_position(rel_logits) 158 | scores = scores + scores_local 159 | if self.proximal_bias: 160 | assert t_s == t_t, "Proximal bias is only available for self-attention." 161 | scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) 162 | if mask is not None: 163 | scores = scores.masked_fill(mask == 0, -1e4) 164 | if self.block_length is not None: 165 | assert t_s == t_t, "Local attention is only available for self-attention." 166 | block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) 167 | scores = scores.masked_fill(block_mask == 0, -1e4) 168 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 169 | p_attn = self.drop(p_attn) 170 | output = torch.matmul(p_attn, value) 171 | if self.window_size is not None: 172 | relative_weights = self._absolute_position_to_relative_position(p_attn) 173 | value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) 174 | output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) 175 | output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] 176 | return output, p_attn 177 | 178 | def _matmul_with_relative_values(self, x, y): 179 | """ 180 | x: [b, h, l, m] 181 | y: [h or 1, m, d] 182 | ret: [b, h, l, d] 183 | """ 184 | ret = torch.matmul(x, y.unsqueeze(0)) 185 | return ret 186 | 187 | def _matmul_with_relative_keys(self, x, y): 188 | """ 189 | x: [b, h, l, d] 190 | y: [h or 1, m, d] 191 | ret: [b, h, l, m] 192 | """ 193 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 194 | return ret 195 | 196 | def _get_relative_embeddings(self, relative_embeddings, length): 197 | max_relative_position = 2 * self.window_size + 1 198 | # Pad first before slice to avoid using cond ops. 199 | pad_length = max(length - (self.window_size + 1), 0) 200 | slice_start_position = max((self.window_size + 1) - length, 0) 201 | slice_end_position = slice_start_position + 2 * length - 1 202 | if pad_length > 0: 203 | padded_relative_embeddings = F.pad( 204 | relative_embeddings, 205 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) 206 | else: 207 | padded_relative_embeddings = relative_embeddings 208 | used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position] 209 | return used_relative_embeddings 210 | 211 | def _relative_position_to_absolute_position(self, x): 212 | """ 213 | x: [b, h, l, 2*l-1] 214 | ret: [b, h, l, l] 215 | """ 216 | batch, heads, length, _ = x.size() 217 | # Concat columns of pad to shift from relative to absolute indexing. 218 | x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]])) 219 | 220 | # Concat extra elements so to add up to shape (len+1, 2*len-1). 221 | x_flat = x.view([batch, heads, length * 2 * length]) 222 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]])) 223 | 224 | # Reshape and slice out the padded elements. 
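# After the two pads above, viewing the flat tensor as [b, h, l+1, 2*l-1] lines the
# relative scores up so that keeping the first l rows and the columns from offset l-1
# onward yields the absolute-position matrix [b, h, l, l].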
225 | x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:] 226 | return x_final 227 | 228 | def _absolute_position_to_relative_position(self, x): 229 | """ 230 | x: [b, h, l, l] 231 | ret: [b, h, l, 2*l-1] 232 | """ 233 | batch, heads, length, _ = x.size() 234 | # padd along column 235 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]])) 236 | x_flat = x.view([batch, heads, length**2 + length*(length -1)]) 237 | # add 0's in the beginning that will skew the elements after reshape 238 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 239 | x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:] 240 | return x_final 241 | 242 | def _attention_bias_proximal(self, length): 243 | """Bias for self-attention to encourage attention to close positions. 244 | Args: 245 | length: an integer scalar. 246 | Returns: 247 | a Tensor with shape [1, 1, length, length] 248 | """ 249 | r = torch.arange(length, dtype=torch.float32) 250 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 251 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 252 | 253 | 254 | class FFN(nn.Module): 255 | def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False): 256 | super().__init__() 257 | self.in_channels = in_channels 258 | self.out_channels = out_channels 259 | self.filter_channels = filter_channels 260 | self.kernel_size = kernel_size 261 | self.p_dropout = p_dropout 262 | self.activation = activation 263 | self.causal = causal 264 | 265 | if causal: 266 | self.padding = self._causal_padding 267 | else: 268 | self.padding = self._same_padding 269 | 270 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) 271 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) 272 | self.drop = nn.Dropout(p_dropout) 273 | 274 | def forward(self, x, x_mask): 275 | x = self.conv_1(self.padding(x * x_mask)) 276 | if self.activation == "gelu": 277 | x = x * torch.sigmoid(1.702 * x) 278 | else: 279 | x = torch.relu(x) 280 | x = self.drop(x) 281 | x = self.conv_2(self.padding(x * x_mask)) 282 | return x * x_mask 283 | 284 | def _causal_padding(self, x): 285 | if self.kernel_size == 1: 286 | return x 287 | pad_l = self.kernel_size - 1 288 | pad_r = 0 289 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 290 | x = F.pad(x, commons.convert_pad_shape(padding)) 291 | return x 292 | 293 | def _same_padding(self, x): 294 | if self.kernel_size == 1: 295 | return x 296 | pad_l = (self.kernel_size - 1) // 2 297 | pad_r = self.kernel_size // 2 298 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 299 | x = F.pad(x, commons.convert_pad_shape(padding)) 300 | return x 301 | -------------------------------------------------------------------------------- /commons.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | import torch.jit 4 | 5 | 6 | def script_method(fn, _rcb=None): 7 | return fn 8 | 9 | 10 | def script(obj, optimize=True, _frames_up=0, _rcb=None): 11 | return obj 12 | 13 | 14 | torch.jit.script_method = script_method 15 | torch.jit.script = script 16 | 17 | 18 | def init_weights(m, mean=0.0, std=0.01): 19 | classname = m.__class__.__name__ 20 | if classname.find("Conv") != -1: 21 | m.weight.data.normal_(mean, std) 22 | 23 | 24 | def get_padding(kernel_size, dilation=1): 25 | return int((kernel_size*dilation - dilation)/2) 26 | 27 | 28 | 
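# Interleave `item` between the elements of `lst` (and at both ends); the TTS code
# uses this to insert blank tokens into the symbol sequence when hps.data.add_blank is set.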
def intersperse(lst, item): 29 | result = [item] * (len(lst) * 2 + 1) 30 | result[1::2] = lst 31 | return result 32 | 33 | 34 | def slice_segments(x, ids_str, segment_size=4): 35 | ret = torch.zeros_like(x[:, :, :segment_size]) 36 | for i in range(x.size(0)): 37 | idx_str = ids_str[i] 38 | idx_end = idx_str + segment_size 39 | ret[i] = x[i, :, idx_str:idx_end] 40 | return ret 41 | 42 | 43 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 44 | b, d, t = x.size() 45 | if x_lengths is None: 46 | x_lengths = t 47 | ids_str_max = x_lengths - segment_size + 1 48 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 49 | ret = slice_segments(x, ids_str, segment_size) 50 | return ret, ids_str 51 | 52 | 53 | def subsequent_mask(length): 54 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 55 | return mask 56 | 57 | 58 | @torch.jit.script 59 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 60 | n_channels_int = n_channels[0] 61 | in_act = input_a + input_b 62 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 63 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 64 | acts = t_act * s_act 65 | return acts 66 | 67 | 68 | def convert_pad_shape(pad_shape): 69 | l = pad_shape[::-1] 70 | pad_shape = [item for sublist in l for item in sublist] 71 | return pad_shape 72 | 73 | 74 | def sequence_mask(length, max_length=None): 75 | if max_length is None: 76 | max_length = length.max() 77 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 78 | return x.unsqueeze(0) < length.unsqueeze(1) 79 | 80 | 81 | def generate_path(duration, mask): 82 | """ 83 | duration: [b, 1, t_x] 84 | mask: [b, 1, t_y, t_x] 85 | """ 86 | device = duration.device 87 | 88 | b, _, t_y, t_x = mask.shape 89 | cum_duration = torch.cumsum(duration, -1) 90 | 91 | cum_duration_flat = cum_duration.view(b * t_x) 92 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 93 | path = path.view(b, t_x, t_y) 94 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 95 | path = path.unsqueeze(1).transpose(2,3) * mask 96 | return path 97 | -------------------------------------------------------------------------------- /eng-README.md: -------------------------------------------------------------------------------- 1 |  2 | 3 | [中文](README.md "中文") [English](eng-README.md "English") [日本語](jp-README.md "日本語") 4 | 5 | 10 | 11 | # 12 | 13 | > ### This is a chatting Waifu program based on VITS & ChatGPT! 14 | 15 | Effect demonstration on BiliBIli:[《青春猪头少年不会梦见赛博女友》](https://www.bilibili.com/video/BV1rv4y1Q7eT "BiliBili") 16 | 17 | **Functioning Now:** 18 | * [x] Talking with ChatGPT 19 | * [x] Convert AI's Response to wav file 20 | * [x] Multi-Character voice generator 21 | * [x] Voice Recognition 22 | * [x] [Connect to Marai Robort](https://github.com/MuBai-He/ChatWaifu-marai) 23 | * [x] [Connect to Live2D](https://github.com/cjyaddone/ChatWaifuL2D) 24 | 25 | # Catalogue 26 | ### This project assumes that you are using chrome explorer 27 | * [1.Install Python venv:](#1.) 
28 | * 1.1 [Enter directory with cd commend](#cd) 29 | * 1.2 [Create Python Venv:](#99) 30 | * 1.3 [Enter Python Venv:](#venv) 31 | * 1.4 [Install required library with Pip:](#pip) 32 | * [2.Import pre-trained models to "model" folder(create a new one if doesn't exist):](#.model) 33 | * 2.1 [Double click model.exe to import Models](#cd1) 34 | * [3.Run(Talk to your Waifu:](#22) 35 | * 3.1 [Get ChatGPT Token](#333) 36 | * 3.2 [Start chatting with CyberWaifu](#444) 37 | * [4.Contributions](#915) 38 | ## 1.Install Python Venv: 39 | > **Install Anaconda or Python>=3.7** 40 | > 41 | > **This example name the venv:chatWaifu** 42 | 43 | ### 1.1 Enter project directory with cd command 44 | `cd YOUR_PROJECT_RESPORY` 45 |  46 | ### 1.2 Create Python Venv: 47 | 48 | Conda:`conda create --name chatWaifu python=3.10` 49 |  50 |  51 | 52 | Python:`python -m venv chatWaifu` 53 |  54 | 55 | ### 1.3 Activate created venv: 56 | Conda:`conda activate chatWaifu` 57 | 58 |  59 | 60 | Python:`.\chatWaifu\Scripts\activate.bat` 61 |  62 | 63 | ### 1.4 Install required library with Pip: 64 | `pip install -r requirements.txt` 65 |  66 | 67 | ## 2.import pre-trained models to root directory: 68 | Google Drive:https://drive.google.com/file/d/1tMCafhnUoL7FbevVQ44VQi-WznDjt23_/view?usp=sharing 69 | 70 | Ali Drive: https://www.aliyundrive.com/s/9JEj1mp1ZRv 提取码: m2y3 71 | 72 | ### 2.1Double click model.exe to import Models 73 | 74 | ## 3.RUN(Start chatting with CyberWaifu: 75 | Japanese Ver:`python ChatWaifuJP.py` 76 | 77 | Chinese Ver:`python ChatWaifuCN.py` 78 | 79 | Japanese voice conversation Ver(use Chinese):`python ChatWaifuJPVoice.py` 80 | 81 | Chinese voice conversation Ver(use Chinese):`python ChatWaifuCNVoice.py` 82 | 83 | Japanese voice conversation Ver(use English):`python ChatWaifuJPVoiceEN.py` 84 | 85 | Japanese voice conversation Ver(use Japanese):`python ChatWaifuJPVoiceJP.py` 86 | 87 | 88 | ### 3.1 Get ChatGPT Token 89 | #### Log in to ChatGPT whith link:https://chat.openai.com 90 | #### Press F12 to enter command center 91 | #### Find Application -> cookie -> __Secure-next-auth.session-token 92 |  93 | #### Copy the value into cmd and press ENTER 94 | 95 | ### 3.2 Start chatting with CyberWaifu 96 | 97 | **voice conversation Ver:** Start talking when the console prompts "You:" and then the sentence is recorded and sent to the ChatGPT conversation. 
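For reference, the voice scripts boil down to the loop below. This is a condensed sketch of the `__main__` block in `ChatWaifuJPVoiceJP.py`; `get_token`, `voice_input` and `generateSound` are helpers defined in that script, not external APIs.

```python
from pyChatGPT import ChatGPT
from winsound import PlaySound

session_token = get_token()    # the __Secure-next-auth.session-token value from step 3.1
api = ChatGPT(session_token)

while True:
    prompt = voice_input()                     # record speech with vosk, return recognized text
    answer = api.send_message(prompt)["message"].replace("\n", "")
    print("ChatGPT:")
    print(answer)
    generateSound(answer)                      # synthesize the reply to output.wav with the VITS model
    PlaySound(r".\output.wav", flags=0)        # play the generated audio
```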
98 | 99 | ## 4.Contribution: 100 | - [MoeGoe_GUI]https://github.com/CjangCjengh/MoeGoe_GUI 101 | - [Pretrained models]https://github.com/CjangCjengh/TTSModels 102 | - [PyChatGPT]https://github.com/terry3041/pyChatGPT 103 | -------------------------------------------------------------------------------- /hubert_model.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Optional, Tuple 3 | import random 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present 9 | 10 | class Hubert(nn.Module): 11 | def __init__(self, num_label_embeddings: int = 100, mask: bool = True): 12 | super().__init__() 13 | self._mask = mask 14 | self.feature_extractor = FeatureExtractor() 15 | self.feature_projection = FeatureProjection() 16 | self.positional_embedding = PositionalConvEmbedding() 17 | self.norm = nn.LayerNorm(768) 18 | self.dropout = nn.Dropout(0.1) 19 | self.encoder = TransformerEncoder( 20 | nn.TransformerEncoderLayer( 21 | 768, 12, 3072, activation="gelu", batch_first=True 22 | ), 23 | 12, 24 | ) 25 | self.proj = nn.Linear(768, 256) 26 | 27 | self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_()) 28 | self.label_embedding = nn.Embedding(num_label_embeddings, 256) 29 | 30 | def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 31 | mask = None 32 | if self.training and self._mask: 33 | mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2) 34 | x[mask] = self.masked_spec_embed.to(x.dtype) 35 | return x, mask 36 | 37 | def encode( 38 | self, x: torch.Tensor, layer: Optional[int] = None 39 | ) -> Tuple[torch.Tensor, torch.Tensor]: 40 | x = self.feature_extractor(x) 41 | x = self.feature_projection(x.transpose(1, 2)) 42 | x, mask = self.mask(x) 43 | x = x + self.positional_embedding(x) 44 | x = self.dropout(self.norm(x)) 45 | x = self.encoder(x, output_layer=layer) 46 | return x, mask 47 | 48 | def logits(self, x: torch.Tensor) -> torch.Tensor: 49 | logits = torch.cosine_similarity( 50 | x.unsqueeze(2), 51 | self.label_embedding.weight.unsqueeze(0).unsqueeze(0), 52 | dim=-1, 53 | ) 54 | return logits / 0.1 55 | 56 | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 57 | x, mask = self.encode(x) 58 | x = self.proj(x) 59 | logits = self.logits(x) 60 | return logits, mask 61 | 62 | 63 | class HubertSoft(Hubert): 64 | def __init__(self): 65 | super().__init__() 66 | 67 | @torch.inference_mode() 68 | def units(self, wav: torch.Tensor) -> torch.Tensor: 69 | wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) 70 | x, _ = self.encode(wav) 71 | return self.proj(x) 72 | 73 | 74 | class FeatureExtractor(nn.Module): 75 | def __init__(self): 76 | super().__init__() 77 | self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False) 78 | self.norm0 = nn.GroupNorm(512, 512) 79 | self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False) 80 | self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False) 81 | self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False) 82 | self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False) 83 | self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False) 84 | self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False) 85 | 86 | def forward(self, x: torch.Tensor) -> torch.Tensor: 87 | x = F.gelu(self.norm0(self.conv0(x))) 88 | x = F.gelu(self.conv1(x)) 89 | x = F.gelu(self.conv2(x)) 90 | x = F.gelu(self.conv3(x)) 91 | x = F.gelu(self.conv4(x)) 92 | x = F.gelu(self.conv5(x)) 93 | x = 
F.gelu(self.conv6(x)) 94 | return x 95 | 96 | 97 | class FeatureProjection(nn.Module): 98 | def __init__(self): 99 | super().__init__() 100 | self.norm = nn.LayerNorm(512) 101 | self.projection = nn.Linear(512, 768) 102 | self.dropout = nn.Dropout(0.1) 103 | 104 | def forward(self, x: torch.Tensor) -> torch.Tensor: 105 | x = self.norm(x) 106 | x = self.projection(x) 107 | x = self.dropout(x) 108 | return x 109 | 110 | 111 | class PositionalConvEmbedding(nn.Module): 112 | def __init__(self): 113 | super().__init__() 114 | self.conv = nn.Conv1d( 115 | 768, 116 | 768, 117 | kernel_size=128, 118 | padding=128 // 2, 119 | groups=16, 120 | ) 121 | self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2) 122 | 123 | def forward(self, x: torch.Tensor) -> torch.Tensor: 124 | x = self.conv(x.transpose(1, 2)) 125 | x = F.gelu(x[:, :, :-1]) 126 | return x.transpose(1, 2) 127 | 128 | 129 | class TransformerEncoder(nn.Module): 130 | def __init__( 131 | self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int 132 | ) -> None: 133 | super(TransformerEncoder, self).__init__() 134 | self.layers = nn.ModuleList( 135 | [copy.deepcopy(encoder_layer) for _ in range(num_layers)] 136 | ) 137 | self.num_layers = num_layers 138 | 139 | def forward( 140 | self, 141 | src: torch.Tensor, 142 | mask: torch.Tensor = None, 143 | src_key_padding_mask: torch.Tensor = None, 144 | output_layer: Optional[int] = None, 145 | ) -> torch.Tensor: 146 | output = src 147 | for layer in self.layers[:output_layer]: 148 | output = layer( 149 | output, src_mask=mask, src_key_padding_mask=src_key_padding_mask 150 | ) 151 | return output 152 | 153 | 154 | def _compute_mask( 155 | shape: Tuple[int, int], 156 | mask_prob: float, 157 | mask_length: int, 158 | device: torch.device, 159 | min_masks: int = 0, 160 | ) -> torch.Tensor: 161 | batch_size, sequence_length = shape 162 | 163 | if mask_length < 1: 164 | raise ValueError("`mask_length` has to be bigger than 0.") 165 | 166 | if mask_length > sequence_length: 167 | raise ValueError( 168 | f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" 169 | ) 170 | 171 | # compute number of masked spans in batch 172 | num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random()) 173 | num_masked_spans = max(num_masked_spans, min_masks) 174 | 175 | # make sure num masked indices <= sequence_length 176 | if num_masked_spans * mask_length > sequence_length: 177 | num_masked_spans = sequence_length // mask_length 178 | 179 | # SpecAugment mask to fill 180 | mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool) 181 | 182 | # uniform distribution to sample from, make sure that offset samples are < sequence_length 183 | uniform_dist = torch.ones( 184 | (batch_size, sequence_length - (mask_length - 1)), device=device 185 | ) 186 | 187 | # get random indices to mask 188 | mask_indices = torch.multinomial(uniform_dist, num_masked_spans) 189 | 190 | # expand masked indices to masked spans 191 | mask_indices = ( 192 | mask_indices.unsqueeze(dim=-1) 193 | .expand((batch_size, num_masked_spans, mask_length)) 194 | .reshape(batch_size, num_masked_spans * mask_length) 195 | ) 196 | offsets = ( 197 | torch.arange(mask_length, device=device)[None, None, :] 198 | .expand((batch_size, num_masked_spans, mask_length)) 199 | .reshape(batch_size, num_masked_spans * mask_length) 200 | ) 201 | mask_idxs = mask_indices + offsets 202 | 203 | # scatter indices to mask 204 | 
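# mask_idxs holds the start index of every masked span expanded by the in-span
# offsets ([batch, num_masked_spans * mask_length]); scattering True at those
# positions produces the boolean SpecAugment-style span mask.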
mask = mask.scatter(1, mask_idxs, True) 205 | 206 | return mask 207 | 208 | 209 | def hubert_soft( 210 | path: str 211 | ) -> HubertSoft: 212 | r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`. 213 | Args: 214 | path (str): path of a pretrained model 215 | """ 216 | hubert = HubertSoft() 217 | checkpoint = torch.load(path) 218 | consume_prefix_in_state_dict_if_present(checkpoint, "module.") 219 | hubert.load_state_dict(checkpoint) 220 | hubert.eval() 221 | return hubert 222 | -------------------------------------------------------------------------------- /mel_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data 3 | from librosa.filters import mel as librosa_mel_fn 4 | 5 | MAX_WAV_VALUE = 32768.0 6 | 7 | 8 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 9 | """ 10 | PARAMS 11 | ------ 12 | C: compression factor 13 | """ 14 | return torch.log(torch.clamp(x, min=clip_val) * C) 15 | 16 | 17 | def dynamic_range_decompression_torch(x, C=1): 18 | """ 19 | PARAMS 20 | ------ 21 | C: compression factor used to compress 22 | """ 23 | return torch.exp(x) / C 24 | 25 | 26 | def spectral_normalize_torch(magnitudes): 27 | output = dynamic_range_compression_torch(magnitudes) 28 | return output 29 | 30 | 31 | def spectral_de_normalize_torch(magnitudes): 32 | output = dynamic_range_decompression_torch(magnitudes) 33 | return output 34 | 35 | 36 | mel_basis = {} 37 | hann_window = {} 38 | 39 | 40 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 41 | if torch.min(y) < -1.: 42 | print('min value is ', torch.min(y)) 43 | if torch.max(y) > 1.: 44 | print('max value is ', torch.max(y)) 45 | 46 | global hann_window 47 | dtype_device = str(y.dtype) + '_' + str(y.device) 48 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 49 | if wnsize_dtype_device not in hann_window: 50 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 51 | 52 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 53 | y = y.squeeze(1) 54 | 55 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 56 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) 57 | 58 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 59 | return spec 60 | 61 | 62 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 63 | global mel_basis 64 | dtype_device = str(spec.dtype) + '_' + str(spec.device) 65 | fmax_dtype_device = str(fmax) + '_' + dtype_device 66 | if fmax_dtype_device not in mel_basis: 67 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 68 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) 69 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 70 | spec = spectral_normalize_torch(spec) 71 | return spec 72 | 73 | 74 | def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 75 | if torch.min(y) < -1.: 76 | print('min value is ', torch.min(y)) 77 | if torch.max(y) > 1.: 78 | print('max value is ', torch.max(y)) 79 | 80 | global mel_basis, hann_window 81 | dtype_device = str(y.dtype) + '_' + str(y.device) 82 | fmax_dtype_device = str(fmax) + '_' + dtype_device 83 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 84 | if 
fmax_dtype_device not in mel_basis: 85 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 86 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) 87 | if wnsize_dtype_device not in hann_window: 88 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 89 | 90 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 91 | y = y.squeeze(1) 92 | 93 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 94 | center=center, pad_mode='reflect', normalized=False, onesided=True) 95 | 96 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 97 | 98 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 99 | spec = spectral_normalize_torch(spec) 100 | 101 | return spec 102 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import commons 7 | import modules 8 | import attentions 9 | 10 | from torch.nn import Conv1d, ConvTranspose1d 11 | from torch.nn.utils import weight_norm 12 | from commons import init_weights 13 | 14 | 15 | class StochasticDurationPredictor(nn.Module): 16 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0): 17 | super().__init__() 18 | filter_channels = in_channels # it needs to be removed from future version. 19 | self.in_channels = in_channels 20 | self.filter_channels = filter_channels 21 | self.kernel_size = kernel_size 22 | self.p_dropout = p_dropout 23 | self.n_flows = n_flows 24 | self.gin_channels = gin_channels 25 | 26 | self.log_flow = modules.Log() 27 | self.flows = nn.ModuleList() 28 | self.flows.append(modules.ElementwiseAffine(2)) 29 | for i in range(n_flows): 30 | self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) 31 | self.flows.append(modules.Flip()) 32 | 33 | self.post_pre = nn.Conv1d(1, filter_channels, 1) 34 | self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) 35 | self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) 36 | self.post_flows = nn.ModuleList() 37 | self.post_flows.append(modules.ElementwiseAffine(2)) 38 | for i in range(4): 39 | self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) 40 | self.post_flows.append(modules.Flip()) 41 | 42 | self.pre = nn.Conv1d(in_channels, filter_channels, 1) 43 | self.proj = nn.Conv1d(filter_channels, filter_channels, 1) 44 | self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) 45 | if gin_channels != 0: 46 | self.cond = nn.Conv1d(gin_channels, filter_channels, 1) 47 | 48 | def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0): 49 | x = torch.detach(x) 50 | x = self.pre(x) 51 | if g is not None: 52 | g = torch.detach(g) 53 | x = x + self.cond(g) 54 | x = self.convs(x, x_mask) 55 | x = self.proj(x) * x_mask 56 | 57 | if not reverse: 58 | flows = self.flows 59 | assert w is not None 60 | 61 | logdet_tot_q = 0 62 | h_w = self.post_pre(w) 63 | h_w = self.post_convs(h_w, x_mask) 64 | h_w = self.post_proj(h_w) * x_mask 65 | e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask 66 | z_q = e_q 67 | for flow in self.post_flows: 68 | z_q, 
logdet_q = flow(z_q, x_mask, g=(x + h_w)) 69 | logdet_tot_q += logdet_q 70 | z_u, z1 = torch.split(z_q, [1, 1], 1) 71 | u = torch.sigmoid(z_u) * x_mask 72 | z0 = (w - u) * x_mask 73 | logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2]) 74 | logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q 75 | 76 | logdet_tot = 0 77 | z0, logdet = self.log_flow(z0, x_mask) 78 | logdet_tot += logdet 79 | z = torch.cat([z0, z1], 1) 80 | for flow in flows: 81 | z, logdet = flow(z, x_mask, g=x, reverse=reverse) 82 | logdet_tot = logdet_tot + logdet 83 | nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot 84 | return nll + logq # [b] 85 | else: 86 | flows = list(reversed(self.flows)) 87 | flows = flows[:-2] + [flows[-1]] # remove a useless vflow 88 | z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale 89 | for flow in flows: 90 | z = flow(z, x_mask, g=x, reverse=reverse) 91 | z0, z1 = torch.split(z, [1, 1], 1) 92 | logw = z0 93 | return logw 94 | 95 | 96 | class DurationPredictor(nn.Module): 97 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0): 98 | super().__init__() 99 | 100 | self.in_channels = in_channels 101 | self.filter_channels = filter_channels 102 | self.kernel_size = kernel_size 103 | self.p_dropout = p_dropout 104 | self.gin_channels = gin_channels 105 | 106 | self.drop = nn.Dropout(p_dropout) 107 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2) 108 | self.norm_1 = modules.LayerNorm(filter_channels) 109 | self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2) 110 | self.norm_2 = modules.LayerNorm(filter_channels) 111 | self.proj = nn.Conv1d(filter_channels, 1, 1) 112 | 113 | if gin_channels != 0: 114 | self.cond = nn.Conv1d(gin_channels, in_channels, 1) 115 | 116 | def forward(self, x, x_mask, g=None): 117 | x = torch.detach(x) 118 | if g is not None: 119 | g = torch.detach(g) 120 | x = x + self.cond(g) 121 | x = self.conv_1(x * x_mask) 122 | x = torch.relu(x) 123 | x = self.norm_1(x) 124 | x = self.drop(x) 125 | x = self.conv_2(x * x_mask) 126 | x = torch.relu(x) 127 | x = self.norm_2(x) 128 | x = self.drop(x) 129 | x = self.proj(x * x_mask) 130 | return x * x_mask 131 | 132 | 133 | class TextEncoder(nn.Module): 134 | def __init__(self, 135 | n_vocab, 136 | out_channels, 137 | hidden_channels, 138 | filter_channels, 139 | n_heads, 140 | n_layers, 141 | kernel_size, 142 | p_dropout, 143 | emotion_embedding): 144 | super().__init__() 145 | self.n_vocab = n_vocab 146 | self.out_channels = out_channels 147 | self.hidden_channels = hidden_channels 148 | self.filter_channels = filter_channels 149 | self.n_heads = n_heads 150 | self.n_layers = n_layers 151 | self.kernel_size = kernel_size 152 | self.p_dropout = p_dropout 153 | self.emotion_embedding = emotion_embedding 154 | 155 | if self.n_vocab!=0: 156 | self.emb = nn.Embedding(n_vocab, hidden_channels) 157 | if emotion_embedding: 158 | self.emo_proj = nn.Linear(1024, hidden_channels) 159 | nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) 160 | 161 | self.encoder = attentions.Encoder( 162 | hidden_channels, 163 | filter_channels, 164 | n_heads, 165 | n_layers, 166 | kernel_size, 167 | p_dropout) 168 | self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1) 169 | 170 | def forward(self, x, x_lengths, emotion_embedding=None): 171 | if self.n_vocab!=0: 172 | x = self.emb(x) * 
math.sqrt(self.hidden_channels) # [b, t, h] 173 | if emotion_embedding is not None: 174 | x = x + self.emo_proj(emotion_embedding.unsqueeze(1)) 175 | x = torch.transpose(x, 1, -1) # [b, h, t] 176 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 177 | 178 | x = self.encoder(x * x_mask, x_mask) 179 | stats = self.proj(x) * x_mask 180 | 181 | m, logs = torch.split(stats, self.out_channels, dim=1) 182 | return x, m, logs, x_mask 183 | 184 | 185 | class ResidualCouplingBlock(nn.Module): 186 | def __init__(self, 187 | channels, 188 | hidden_channels, 189 | kernel_size, 190 | dilation_rate, 191 | n_layers, 192 | n_flows=4, 193 | gin_channels=0): 194 | super().__init__() 195 | self.channels = channels 196 | self.hidden_channels = hidden_channels 197 | self.kernel_size = kernel_size 198 | self.dilation_rate = dilation_rate 199 | self.n_layers = n_layers 200 | self.n_flows = n_flows 201 | self.gin_channels = gin_channels 202 | 203 | self.flows = nn.ModuleList() 204 | for i in range(n_flows): 205 | self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) 206 | self.flows.append(modules.Flip()) 207 | 208 | def forward(self, x, x_mask, g=None, reverse=False): 209 | if not reverse: 210 | for flow in self.flows: 211 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 212 | else: 213 | for flow in reversed(self.flows): 214 | x = flow(x, x_mask, g=g, reverse=reverse) 215 | return x 216 | 217 | 218 | class PosteriorEncoder(nn.Module): 219 | def __init__(self, 220 | in_channels, 221 | out_channels, 222 | hidden_channels, 223 | kernel_size, 224 | dilation_rate, 225 | n_layers, 226 | gin_channels=0): 227 | super().__init__() 228 | self.in_channels = in_channels 229 | self.out_channels = out_channels 230 | self.hidden_channels = hidden_channels 231 | self.kernel_size = kernel_size 232 | self.dilation_rate = dilation_rate 233 | self.n_layers = n_layers 234 | self.gin_channels = gin_channels 235 | 236 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 237 | self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) 238 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 239 | 240 | def forward(self, x, x_lengths, g=None): 241 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 242 | x = self.pre(x) * x_mask 243 | x = self.enc(x, x_mask, g=g) 244 | stats = self.proj(x) * x_mask 245 | m, logs = torch.split(stats, self.out_channels, dim=1) 246 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 247 | return z, m, logs, x_mask 248 | 249 | 250 | class Generator(torch.nn.Module): 251 | def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): 252 | super(Generator, self).__init__() 253 | self.num_kernels = len(resblock_kernel_sizes) 254 | self.num_upsamples = len(upsample_rates) 255 | self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) 256 | resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 257 | 258 | self.ups = nn.ModuleList() 259 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 260 | self.ups.append(weight_norm( 261 | ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)), 262 | k, u, padding=(k-u)//2))) 263 | 264 | self.resblocks = nn.ModuleList() 265 | for i in 
range(len(self.ups)): 266 | ch = upsample_initial_channel//(2**(i+1)) 267 | for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): 268 | self.resblocks.append(resblock(ch, k, d)) 269 | 270 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 271 | self.ups.apply(init_weights) 272 | 273 | if gin_channels != 0: 274 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 275 | 276 | def forward(self, x, g=None): 277 | x = self.conv_pre(x) 278 | if g is not None: 279 | x = x + self.cond(g) 280 | 281 | for i in range(self.num_upsamples): 282 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 283 | x = self.ups[i](x) 284 | xs = None 285 | for j in range(self.num_kernels): 286 | if xs is None: 287 | xs = self.resblocks[i*self.num_kernels+j](x) 288 | else: 289 | xs += self.resblocks[i*self.num_kernels+j](x) 290 | x = xs / self.num_kernels 291 | x = F.leaky_relu(x) 292 | x = self.conv_post(x) 293 | x = torch.tanh(x) 294 | 295 | return x 296 | 297 | 298 | class SynthesizerTrn(nn.Module): 299 | """ 300 | Synthesizer for Training 301 | """ 302 | 303 | def __init__(self, 304 | n_vocab, 305 | spec_channels, 306 | segment_size, 307 | inter_channels, 308 | hidden_channels, 309 | filter_channels, 310 | n_heads, 311 | n_layers, 312 | kernel_size, 313 | p_dropout, 314 | resblock, 315 | resblock_kernel_sizes, 316 | resblock_dilation_sizes, 317 | upsample_rates, 318 | upsample_initial_channel, 319 | upsample_kernel_sizes, 320 | n_speakers=0, 321 | gin_channels=0, 322 | use_sdp=True, 323 | emotion_embedding=False, 324 | **kwargs): 325 | 326 | super().__init__() 327 | self.n_vocab = n_vocab 328 | self.spec_channels = spec_channels 329 | self.inter_channels = inter_channels 330 | self.hidden_channels = hidden_channels 331 | self.filter_channels = filter_channels 332 | self.n_heads = n_heads 333 | self.n_layers = n_layers 334 | self.kernel_size = kernel_size 335 | self.p_dropout = p_dropout 336 | self.resblock = resblock 337 | self.resblock_kernel_sizes = resblock_kernel_sizes 338 | self.resblock_dilation_sizes = resblock_dilation_sizes 339 | self.upsample_rates = upsample_rates 340 | self.upsample_initial_channel = upsample_initial_channel 341 | self.upsample_kernel_sizes = upsample_kernel_sizes 342 | self.segment_size = segment_size 343 | self.n_speakers = n_speakers 344 | self.gin_channels = gin_channels 345 | 346 | self.use_sdp = use_sdp 347 | 348 | self.enc_p = TextEncoder(n_vocab, 349 | inter_channels, 350 | hidden_channels, 351 | filter_channels, 352 | n_heads, 353 | n_layers, 354 | kernel_size, 355 | p_dropout, 356 | emotion_embedding) 357 | self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) 358 | self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) 359 | self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) 360 | 361 | if use_sdp: 362 | self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels) 363 | else: 364 | self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels) 365 | 366 | if n_speakers > 1: 367 | self.emb_g = nn.Embedding(n_speakers, gin_channels) 368 | 369 | def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None, emotion_embedding=None): 370 | x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, 
emotion_embedding) 371 | if self.n_speakers > 0: 372 | g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] 373 | else: 374 | g = None 375 | 376 | if self.use_sdp: 377 | logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) 378 | else: 379 | logw = self.dp(x, x_mask, g=g) 380 | w = torch.exp(logw) * x_mask * length_scale 381 | w_ceil = torch.ceil(w) 382 | y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() 383 | y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype) 384 | attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) 385 | attn = commons.generate_path(w_ceil, attn_mask) 386 | 387 | m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] 388 | logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] 389 | 390 | z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale 391 | z = self.flow(z_p, y_mask, g=g, reverse=True) 392 | o = self.dec((z * y_mask)[:,:,:max_len], g=g) 393 | return o, attn, y_mask, (z, z_p, m_p, logs_p) 394 | 395 | def voice_conversion(self, y, y_lengths, sid_src, sid_tgt): 396 | assert self.n_speakers > 0, "n_speakers have to be larger than 0." 397 | g_src = self.emb_g(sid_src).unsqueeze(-1) 398 | g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) 399 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) 400 | z_p = self.flow(z, y_mask, g=g_src) 401 | z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) 402 | o_hat = self.dec(z_hat * y_mask, g=g_tgt) 403 | return o_hat, y_mask, (z, z_p, z_hat) 404 | 405 | -------------------------------------------------------------------------------- /modules.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | from torch.nn import Conv1d 7 | from torch.nn.utils import weight_norm, remove_weight_norm 8 | 9 | import commons 10 | from commons import init_weights, get_padding 11 | from transforms import piecewise_rational_quadratic_transform 12 | 13 | 14 | LRELU_SLOPE = 0.1 15 | 16 | 17 | class LayerNorm(nn.Module): 18 | def __init__(self, channels, eps=1e-5): 19 | super().__init__() 20 | self.channels = channels 21 | self.eps = eps 22 | 23 | self.gamma = nn.Parameter(torch.ones(channels)) 24 | self.beta = nn.Parameter(torch.zeros(channels)) 25 | 26 | def forward(self, x): 27 | x = x.transpose(1, -1) 28 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 29 | return x.transpose(1, -1) 30 | 31 | 32 | class ConvReluNorm(nn.Module): 33 | def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): 34 | super().__init__() 35 | self.in_channels = in_channels 36 | self.hidden_channels = hidden_channels 37 | self.out_channels = out_channels 38 | self.kernel_size = kernel_size 39 | self.n_layers = n_layers 40 | self.p_dropout = p_dropout 41 | assert n_layers > 1, "Number of layers should be larger than 0." 
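        # Descriptive sketch of what follows (inferred from this class's own code): the layers
        # built below form a Conv1d -> LayerNorm -> ReLU -> Dropout pre-net repeated n_layers
        # times, ending in a zero-initialized 1x1 projection that forward() adds back to the
        # input as a residual; note the assert above actually requires at least two layers,
        # despite its message.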
42 | 43 | self.conv_layers = nn.ModuleList() 44 | self.norm_layers = nn.ModuleList() 45 | self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 46 | self.norm_layers.append(LayerNorm(hidden_channels)) 47 | self.relu_drop = nn.Sequential( 48 | nn.ReLU(), 49 | nn.Dropout(p_dropout)) 50 | for _ in range(n_layers-1): 51 | self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 52 | self.norm_layers.append(LayerNorm(hidden_channels)) 53 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 54 | self.proj.weight.data.zero_() 55 | self.proj.bias.data.zero_() 56 | 57 | def forward(self, x, x_mask): 58 | x_org = x 59 | for i in range(self.n_layers): 60 | x = self.conv_layers[i](x * x_mask) 61 | x = self.norm_layers[i](x) 62 | x = self.relu_drop(x) 63 | x = x_org + self.proj(x) 64 | return x * x_mask 65 | 66 | 67 | class DDSConv(nn.Module): 68 | """ 69 | Dilated and Depth-Separable Convolution 70 | """ 71 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): 72 | super().__init__() 73 | self.channels = channels 74 | self.kernel_size = kernel_size 75 | self.n_layers = n_layers 76 | self.p_dropout = p_dropout 77 | 78 | self.drop = nn.Dropout(p_dropout) 79 | self.convs_sep = nn.ModuleList() 80 | self.convs_1x1 = nn.ModuleList() 81 | self.norms_1 = nn.ModuleList() 82 | self.norms_2 = nn.ModuleList() 83 | for i in range(n_layers): 84 | dilation = kernel_size ** i 85 | padding = (kernel_size * dilation - dilation) // 2 86 | self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, 87 | groups=channels, dilation=dilation, padding=padding 88 | )) 89 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) 90 | self.norms_1.append(LayerNorm(channels)) 91 | self.norms_2.append(LayerNorm(channels)) 92 | 93 | def forward(self, x, x_mask, g=None): 94 | if g is not None: 95 | x = x + g 96 | for i in range(self.n_layers): 97 | y = self.convs_sep[i](x * x_mask) 98 | y = self.norms_1[i](y) 99 | y = F.gelu(y) 100 | y = self.convs_1x1[i](y) 101 | y = self.norms_2[i](y) 102 | y = F.gelu(y) 103 | y = self.drop(y) 104 | x = x + y 105 | return x * x_mask 106 | 107 | 108 | class WN(torch.nn.Module): 109 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): 110 | super(WN, self).__init__() 111 | assert(kernel_size % 2 == 1) 112 | self.hidden_channels =hidden_channels 113 | self.kernel_size = kernel_size, 114 | self.dilation_rate = dilation_rate 115 | self.n_layers = n_layers 116 | self.gin_channels = gin_channels 117 | self.p_dropout = p_dropout 118 | 119 | self.in_layers = torch.nn.ModuleList() 120 | self.res_skip_layers = torch.nn.ModuleList() 121 | self.drop = nn.Dropout(p_dropout) 122 | 123 | if gin_channels != 0: 124 | cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) 125 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 126 | 127 | for i in range(n_layers): 128 | dilation = dilation_rate ** i 129 | padding = int((kernel_size * dilation - dilation) / 2) 130 | in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, 131 | dilation=dilation, padding=padding) 132 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 133 | self.in_layers.append(in_layer) 134 | 135 | # last one is not necessary 136 | if i < n_layers - 1: 137 | res_skip_channels = 2 * hidden_channels 138 | else: 139 | res_skip_channels = hidden_channels 140 | 141 | res_skip_layer = 
torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 142 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 143 | self.res_skip_layers.append(res_skip_layer) 144 | 145 | def forward(self, x, x_mask, g=None, **kwargs): 146 | output = torch.zeros_like(x) 147 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 148 | 149 | if g is not None: 150 | g = self.cond_layer(g) 151 | 152 | for i in range(self.n_layers): 153 | x_in = self.in_layers[i](x) 154 | if g is not None: 155 | cond_offset = i * 2 * self.hidden_channels 156 | g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] 157 | else: 158 | g_l = torch.zeros_like(x_in) 159 | 160 | acts = commons.fused_add_tanh_sigmoid_multiply( 161 | x_in, 162 | g_l, 163 | n_channels_tensor) 164 | acts = self.drop(acts) 165 | 166 | res_skip_acts = self.res_skip_layers[i](acts) 167 | if i < self.n_layers - 1: 168 | res_acts = res_skip_acts[:,:self.hidden_channels,:] 169 | x = (x + res_acts) * x_mask 170 | output = output + res_skip_acts[:,self.hidden_channels:,:] 171 | else: 172 | output = output + res_skip_acts 173 | return output * x_mask 174 | 175 | def remove_weight_norm(self): 176 | if self.gin_channels != 0: 177 | torch.nn.utils.remove_weight_norm(self.cond_layer) 178 | for l in self.in_layers: 179 | torch.nn.utils.remove_weight_norm(l) 180 | for l in self.res_skip_layers: 181 | torch.nn.utils.remove_weight_norm(l) 182 | 183 | 184 | class ResBlock1(torch.nn.Module): 185 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 186 | super(ResBlock1, self).__init__() 187 | self.convs1 = nn.ModuleList([ 188 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 189 | padding=get_padding(kernel_size, dilation[0]))), 190 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 191 | padding=get_padding(kernel_size, dilation[1]))), 192 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 193 | padding=get_padding(kernel_size, dilation[2]))) 194 | ]) 195 | self.convs1.apply(init_weights) 196 | 197 | self.convs2 = nn.ModuleList([ 198 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 199 | padding=get_padding(kernel_size, 1))), 200 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 201 | padding=get_padding(kernel_size, 1))), 202 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 203 | padding=get_padding(kernel_size, 1))) 204 | ]) 205 | self.convs2.apply(init_weights) 206 | 207 | def forward(self, x, x_mask=None): 208 | for c1, c2 in zip(self.convs1, self.convs2): 209 | xt = F.leaky_relu(x, LRELU_SLOPE) 210 | if x_mask is not None: 211 | xt = xt * x_mask 212 | xt = c1(xt) 213 | xt = F.leaky_relu(xt, LRELU_SLOPE) 214 | if x_mask is not None: 215 | xt = xt * x_mask 216 | xt = c2(xt) 217 | x = xt + x 218 | if x_mask is not None: 219 | x = x * x_mask 220 | return x 221 | 222 | def remove_weight_norm(self): 223 | for l in self.convs1: 224 | remove_weight_norm(l) 225 | for l in self.convs2: 226 | remove_weight_norm(l) 227 | 228 | 229 | class ResBlock2(torch.nn.Module): 230 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 231 | super(ResBlock2, self).__init__() 232 | self.convs = nn.ModuleList([ 233 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 234 | padding=get_padding(kernel_size, dilation[0]))), 235 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 236 | padding=get_padding(kernel_size, dilation[1]))) 237 | ]) 
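        # Descriptive note (inferred from the code in this file): ResBlock2 is the lighter
        # residual-block variant, using a single stack of two dilated convolutions per block
        # instead of the paired conv stacks of ResBlock1; Generator selects it whenever its
        # `resblock` argument is not '1'.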
238 | self.convs.apply(init_weights) 239 | 240 | def forward(self, x, x_mask=None): 241 | for c in self.convs: 242 | xt = F.leaky_relu(x, LRELU_SLOPE) 243 | if x_mask is not None: 244 | xt = xt * x_mask 245 | xt = c(xt) 246 | x = xt + x 247 | if x_mask is not None: 248 | x = x * x_mask 249 | return x 250 | 251 | def remove_weight_norm(self): 252 | for l in self.convs: 253 | remove_weight_norm(l) 254 | 255 | 256 | class Log(nn.Module): 257 | def forward(self, x, x_mask, reverse=False, **kwargs): 258 | if not reverse: 259 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask 260 | logdet = torch.sum(-y, [1, 2]) 261 | return y, logdet 262 | else: 263 | x = torch.exp(x) * x_mask 264 | return x 265 | 266 | 267 | class Flip(nn.Module): 268 | def forward(self, x, *args, reverse=False, **kwargs): 269 | x = torch.flip(x, [1]) 270 | if not reverse: 271 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 272 | return x, logdet 273 | else: 274 | return x 275 | 276 | 277 | class ElementwiseAffine(nn.Module): 278 | def __init__(self, channels): 279 | super().__init__() 280 | self.channels = channels 281 | self.m = nn.Parameter(torch.zeros(channels,1)) 282 | self.logs = nn.Parameter(torch.zeros(channels,1)) 283 | 284 | def forward(self, x, x_mask, reverse=False, **kwargs): 285 | if not reverse: 286 | y = self.m + torch.exp(self.logs) * x 287 | y = y * x_mask 288 | logdet = torch.sum(self.logs * x_mask, [1,2]) 289 | return y, logdet 290 | else: 291 | x = (x - self.m) * torch.exp(-self.logs) * x_mask 292 | return x 293 | 294 | 295 | class ResidualCouplingLayer(nn.Module): 296 | def __init__(self, 297 | channels, 298 | hidden_channels, 299 | kernel_size, 300 | dilation_rate, 301 | n_layers, 302 | p_dropout=0, 303 | gin_channels=0, 304 | mean_only=False): 305 | assert channels % 2 == 0, "channels should be divisible by 2" 306 | super().__init__() 307 | self.channels = channels 308 | self.hidden_channels = hidden_channels 309 | self.kernel_size = kernel_size 310 | self.dilation_rate = dilation_rate 311 | self.n_layers = n_layers 312 | self.half_channels = channels // 2 313 | self.mean_only = mean_only 314 | 315 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 316 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) 317 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) 318 | self.post.weight.data.zero_() 319 | self.post.bias.data.zero_() 320 | 321 | def forward(self, x, x_mask, g=None, reverse=False): 322 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 323 | h = self.pre(x0) * x_mask 324 | h = self.enc(h, x_mask, g=g) 325 | stats = self.post(h) * x_mask 326 | if not self.mean_only: 327 | m, logs = torch.split(stats, [self.half_channels]*2, 1) 328 | else: 329 | m = stats 330 | logs = torch.zeros_like(m) 331 | 332 | if not reverse: 333 | x1 = m + x1 * torch.exp(logs) * x_mask 334 | x = torch.cat([x0, x1], 1) 335 | logdet = torch.sum(logs, [1,2]) 336 | return x, logdet 337 | else: 338 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 339 | x = torch.cat([x0, x1], 1) 340 | return x 341 | 342 | 343 | class ConvFlow(nn.Module): 344 | def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0): 345 | super().__init__() 346 | self.in_channels = in_channels 347 | self.filter_channels = filter_channels 348 | self.kernel_size = kernel_size 349 | self.n_layers = n_layers 350 | self.num_bins = num_bins 351 | self.tail_bound = tail_bound 352 | 
self.half_channels = in_channels // 2 353 | 354 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) 355 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.) 356 | self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1) 357 | self.proj.weight.data.zero_() 358 | self.proj.bias.data.zero_() 359 | 360 | def forward(self, x, x_mask, g=None, reverse=False): 361 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 362 | h = self.pre(x0) 363 | h = self.convs(h, x_mask, g=g) 364 | h = self.proj(h) * x_mask 365 | 366 | b, c, t = x0.shape 367 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 368 | 369 | unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels) 370 | unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels) 371 | unnormalized_derivatives = h[..., 2 * self.num_bins:] 372 | 373 | x1, logabsdet = piecewise_rational_quadratic_transform(x1, 374 | unnormalized_widths, 375 | unnormalized_heights, 376 | unnormalized_derivatives, 377 | inverse=reverse, 378 | tails='linear', 379 | tail_bound=self.tail_bound 380 | ) 381 | 382 | x = torch.cat([x0, x1], 1) * x_mask 383 | logdet = torch.sum(logabsdet * x_mask, [1,2]) 384 | if not reverse: 385 | return x, logdet 386 | else: 387 | return x 388 | -------------------------------------------------------------------------------- /readme/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjyaddone/ChatWaifu/eb9c697be020a34aa09acce488948bf62ee33e58/readme/1.png -------------------------------------------------------------------------------- /readme/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjyaddone/ChatWaifu/eb9c697be020a34aa09acce488948bf62ee33e58/readme/2.png -------------------------------------------------------------------------------- /readme/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjyaddone/ChatWaifu/eb9c697be020a34aa09acce488948bf62ee33e58/readme/3.png -------------------------------------------------------------------------------- /readme/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjyaddone/ChatWaifu/eb9c697be020a34aa09acce488948bf62ee33e58/readme/4.png -------------------------------------------------------------------------------- /readme/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjyaddone/ChatWaifu/eb9c697be020a34aa09acce488948bf62ee33e58/readme/5.png -------------------------------------------------------------------------------- /readme/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjyaddone/ChatWaifu/eb9c697be020a34aa09acce488948bf62ee33e58/readme/6.png -------------------------------------------------------------------------------- /readme/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjyaddone/ChatWaifu/eb9c697be020a34aa09acce488948bf62ee33e58/readme/7.png -------------------------------------------------------------------------------- /readme/cyberchat.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cjyaddone/ChatWaifu/eb9c697be020a34aa09acce488948bf62ee33e58/readme/cyberchat.png -------------------------------------------------------------------------------- /readme/token.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjyaddone/ChatWaifu/eb9c697be020a34aa09acce488948bf62ee33e58/readme/token.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numba 2 | librosa 3 | numpy 4 | scipy 5 | torch 6 | unidecode 7 | openjtalk>=0.3.0.dev2 8 | jamo 9 | pypinyin 10 | jieba 11 | protobuf 12 | cn2an 13 | inflect 14 | eng_to_ipa 15 | ko_pron 16 | indic_transliteration 17 | num_thai 18 | opencc 19 | pyChatGPT 20 | vosk 21 | sounddevice 22 | -------------------------------------------------------------------------------- /text/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | from text import cleaners 3 | 4 | 5 | def text_to_sequence(text, symbols, cleaner_names): 6 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
7 | Args: 8 | text: string to convert to a sequence 9 | cleaner_names: names of the cleaner functions to run the text through 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | ''' 13 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 14 | 15 | sequence = [] 16 | 17 | clean_text = _clean_text(text, cleaner_names) 18 | for symbol in clean_text: 19 | if symbol not in _symbol_to_id.keys(): 20 | continue 21 | symbol_id = _symbol_to_id[symbol] 22 | sequence += [symbol_id] 23 | return sequence 24 | 25 | 26 | def _clean_text(text, cleaner_names): 27 | for name in cleaner_names: 28 | cleaner = getattr(cleaners, name) 29 | if not cleaner: 30 | raise Exception('Unknown cleaner: %s' % name) 31 | text = cleaner(text) 32 | return text 33 | -------------------------------------------------------------------------------- /text/cantonese.py: -------------------------------------------------------------------------------- 1 | import re 2 | import cn2an 3 | import opencc 4 | 5 | 6 | converter = opencc.OpenCC('jyutjyu') 7 | 8 | # List of (Latin alphabet, ipa) pairs: 9 | _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 10 | ('A', 'ei˥'), 11 | ('B', 'biː˥'), 12 | ('C', 'siː˥'), 13 | ('D', 'tiː˥'), 14 | ('E', 'iː˥'), 15 | ('F', 'e˥fuː˨˩'), 16 | ('G', 'tsiː˥'), 17 | ('H', 'ɪk̚˥tsʰyː˨˩'), 18 | ('I', 'ɐi˥'), 19 | ('J', 'tsei˥'), 20 | ('K', 'kʰei˥'), 21 | ('L', 'e˥llou˨˩'), 22 | ('M', 'ɛːm˥'), 23 | ('N', 'ɛːn˥'), 24 | ('O', 'ou˥'), 25 | ('P', 'pʰiː˥'), 26 | ('Q', 'kʰiːu˥'), 27 | ('R', 'aː˥lou˨˩'), 28 | ('S', 'ɛː˥siː˨˩'), 29 | ('T', 'tʰiː˥'), 30 | ('U', 'juː˥'), 31 | ('V', 'wiː˥'), 32 | ('W', 'tʊk̚˥piː˥juː˥'), 33 | ('X', 'ɪk̚˥siː˨˩'), 34 | ('Y', 'waːi˥'), 35 | ('Z', 'iː˨sɛːt̚˥') 36 | ]] 37 | 38 | 39 | def number_to_cantonese(text): 40 | return re.sub(r'\d+(?:\.?\d+)?', lambda x: cn2an.an2cn(x.group()), text) 41 | 42 | 43 | def latin_to_ipa(text): 44 | for regex, replacement in _latin_to_ipa: 45 | text = re.sub(regex, replacement, text) 46 | return text 47 | 48 | 49 | def cantonese_to_ipa(text): 50 | text = number_to_cantonese(text.upper()) 51 | text = converter.convert(text).replace('-','').replace('$',' ') 52 | text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text) 53 | text = re.sub(r'[、;:]', ',', text) 54 | text = re.sub(r'\s*,\s*', ', ', text) 55 | text = re.sub(r'\s*。\s*', '. ', text) 56 | text = re.sub(r'\s*?\s*', '? ', text) 57 | text = re.sub(r'\s*!\s*', '! 
', text) 58 | text = re.sub(r'\s*$', '', text) 59 | return text 60 | -------------------------------------------------------------------------------- /text/cleaners.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def japanese_cleaners(text): 5 | from text.japanese import japanese_to_romaji_with_accent 6 | text = japanese_to_romaji_with_accent(text) 7 | text = re.sub(r'([A-Za-z])$', r'\1.', text) 8 | return text 9 | 10 | 11 | def japanese_cleaners2(text): 12 | return japanese_cleaners(text).replace('ts', 'ʦ').replace('...', '…') 13 | 14 | 15 | def korean_cleaners(text): 16 | '''Pipeline for Korean text''' 17 | from text.korean import latin_to_hangul, number_to_hangul, divide_hangul 18 | text = latin_to_hangul(text) 19 | text = number_to_hangul(text) 20 | text = divide_hangul(text) 21 | text = re.sub(r'([\u3131-\u3163])$', r'\1.', text) 22 | return text 23 | 24 | 25 | def chinese_cleaners(text): 26 | '''Pipeline for Chinese text''' 27 | from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo 28 | text = number_to_chinese(text) 29 | text = chinese_to_bopomofo(text) 30 | text = latin_to_bopomofo(text) 31 | text = re.sub(r'([ˉˊˇˋ˙])$', r'\1。', text) 32 | return text 33 | 34 | 35 | def zh_ja_mixture_cleaners(text): 36 | from text.mandarin import chinese_to_romaji 37 | from text.japanese import japanese_to_romaji_with_accent 38 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 39 | lambda x: chinese_to_romaji(x.group(1))+' ', text) 40 | text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_romaji_with_accent( 41 | x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')+' ', text) 42 | text = re.sub(r'\s+$', '', text) 43 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 44 | return text 45 | 46 | 47 | def sanskrit_cleaners(text): 48 | text = text.replace('॥', '।').replace('ॐ', 'ओम्') 49 | text = re.sub(r'([^।])$', r'\1।', text) 50 | return text 51 | 52 | 53 | def cjks_cleaners(text): 54 | from text.mandarin import chinese_to_lazy_ipa 55 | from text.japanese import japanese_to_ipa 56 | from text.korean import korean_to_lazy_ipa 57 | from text.sanskrit import devanagari_to_ipa 58 | from text.english import english_to_lazy_ipa 59 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 60 | lambda x: chinese_to_lazy_ipa(x.group(1))+' ', text) 61 | text = re.sub(r'\[JA\](.*?)\[JA\]', 62 | lambda x: japanese_to_ipa(x.group(1))+' ', text) 63 | text = re.sub(r'\[KO\](.*?)\[KO\]', 64 | lambda x: korean_to_lazy_ipa(x.group(1))+' ', text) 65 | text = re.sub(r'\[SA\](.*?)\[SA\]', 66 | lambda x: devanagari_to_ipa(x.group(1))+' ', text) 67 | text = re.sub(r'\[EN\](.*?)\[EN\]', 68 | lambda x: english_to_lazy_ipa(x.group(1))+' ', text) 69 | text = re.sub(r'\s+$', '', text) 70 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 71 | return text 72 | 73 | 74 | def cjke_cleaners(text): 75 | from text.mandarin import chinese_to_lazy_ipa 76 | from text.japanese import japanese_to_ipa 77 | from text.korean import korean_to_ipa 78 | from text.english import english_to_ipa2 79 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace( 80 | 'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')+' ', text) 81 | text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace( 82 | 'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')+' ', text) 83 | text = re.sub(r'\[KO\](.*?)\[KO\]', 84 | lambda x: korean_to_ipa(x.group(1))+' ', text) 85 | text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: 
english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace( 86 | 'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')+' ', text) 87 | text = re.sub(r'\s+$', '', text) 88 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 89 | return text 90 | 91 | 92 | def cjke_cleaners2(text): 93 | from text.mandarin import chinese_to_ipa 94 | from text.japanese import japanese_to_ipa2 95 | from text.korean import korean_to_ipa 96 | from text.english import english_to_ipa2 97 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 98 | lambda x: chinese_to_ipa(x.group(1))+' ', text) 99 | text = re.sub(r'\[JA\](.*?)\[JA\]', 100 | lambda x: japanese_to_ipa2(x.group(1))+' ', text) 101 | text = re.sub(r'\[KO\](.*?)\[KO\]', 102 | lambda x: korean_to_ipa(x.group(1))+' ', text) 103 | text = re.sub(r'\[EN\](.*?)\[EN\]', 104 | lambda x: english_to_ipa2(x.group(1))+' ', text) 105 | text = re.sub(r'\s+$', '', text) 106 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 107 | return text 108 | 109 | 110 | def thai_cleaners(text): 111 | from text.thai import num_to_thai, latin_to_thai 112 | text = num_to_thai(text) 113 | text = latin_to_thai(text) 114 | return text 115 | 116 | 117 | def shanghainese_cleaners(text): 118 | from text.shanghainese import shanghainese_to_ipa 119 | text = shanghainese_to_ipa(text) 120 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 121 | return text 122 | 123 | 124 | def chinese_dialect_cleaners(text): 125 | from text.mandarin import chinese_to_ipa2 126 | from text.japanese import japanese_to_ipa3 127 | from text.shanghainese import shanghainese_to_ipa 128 | from text.cantonese import cantonese_to_ipa 129 | from text.english import english_to_lazy_ipa2 130 | from text.ngu_dialect import ngu_dialect_to_ipa 131 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 132 | lambda x: chinese_to_ipa2(x.group(1))+' ', text) 133 | text = re.sub(r'\[JA\](.*?)\[JA\]', 134 | lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text) 135 | text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5', 136 | '˧˧˦').replace('6', '˩˩˧').replace('7', '˥').replace('8', '˩˨').replace('ᴀ', 'ɐ').replace('ᴇ', 'e')+' ', text) 137 | text = re.sub(r'\[GD\](.*?)\[GD\]', 138 | lambda x: cantonese_to_ipa(x.group(1))+' ', text) 139 | text = re.sub(r'\[EN\](.*?)\[EN\]', 140 | lambda x: english_to_lazy_ipa2(x.group(1))+' ', text) 141 | text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', lambda x: ngu_dialect_to_ipa(x.group(2), x.group( 142 | 1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ')+' ', text) 143 | text = re.sub(r'\s+$', '', text) 144 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 145 | return text 146 | -------------------------------------------------------------------------------- /text/english.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. 
"basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 13 | ''' 14 | 15 | 16 | # Regular expression matching whitespace: 17 | 18 | 19 | import re 20 | import inflect 21 | from unidecode import unidecode 22 | import eng_to_ipa as ipa 23 | _inflect = inflect.engine() 24 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 25 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 26 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 27 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 28 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 29 | _number_re = re.compile(r'[0-9]+') 30 | 31 | # List of (regular expression, replacement) pairs for abbreviations: 32 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ 33 | ('mrs', 'misess'), 34 | ('mr', 'mister'), 35 | ('dr', 'doctor'), 36 | ('st', 'saint'), 37 | ('co', 'company'), 38 | ('jr', 'junior'), 39 | ('maj', 'major'), 40 | ('gen', 'general'), 41 | ('drs', 'doctors'), 42 | ('rev', 'reverend'), 43 | ('lt', 'lieutenant'), 44 | ('hon', 'honorable'), 45 | ('sgt', 'sergeant'), 46 | ('capt', 'captain'), 47 | ('esq', 'esquire'), 48 | ('ltd', 'limited'), 49 | ('col', 'colonel'), 50 | ('ft', 'fort'), 51 | ]] 52 | 53 | 54 | # List of (ipa, lazy ipa) pairs: 55 | _lazy_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 56 | ('r', 'ɹ'), 57 | ('æ', 'e'), 58 | ('ɑ', 'a'), 59 | ('ɔ', 'o'), 60 | ('ð', 'z'), 61 | ('θ', 's'), 62 | ('ɛ', 'e'), 63 | ('ɪ', 'i'), 64 | ('ʊ', 'u'), 65 | ('ʒ', 'ʥ'), 66 | ('ʤ', 'ʥ'), 67 | ('ˈ', '↓'), 68 | ]] 69 | 70 | # List of (ipa, lazy ipa2) pairs: 71 | _lazy_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 72 | ('r', 'ɹ'), 73 | ('ð', 'z'), 74 | ('θ', 's'), 75 | ('ʒ', 'ʑ'), 76 | ('ʤ', 'dʑ'), 77 | ('ˈ', '↓'), 78 | ]] 79 | 80 | # List of (ipa, ipa2) pairs 81 | _ipa_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 82 | ('r', 'ɹ'), 83 | ('ʤ', 'dʒ'), 84 | ('ʧ', 'tʃ') 85 | ]] 86 | 87 | 88 | def expand_abbreviations(text): 89 | for regex, replacement in _abbreviations: 90 | text = re.sub(regex, replacement, text) 91 | return text 92 | 93 | 94 | def collapse_whitespace(text): 95 | return re.sub(r'\s+', ' ', text) 96 | 97 | 98 | def _remove_commas(m): 99 | return m.group(1).replace(',', '') 100 | 101 | 102 | def _expand_decimal_point(m): 103 | return m.group(1).replace('.', ' point ') 104 | 105 | 106 | def _expand_dollars(m): 107 | match = m.group(1) 108 | parts = match.split('.') 109 | if len(parts) > 2: 110 | return match + ' dollars' # Unexpected format 111 | dollars = int(parts[0]) if parts[0] else 0 112 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 113 | if dollars and cents: 114 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 115 | cent_unit = 'cent' if cents == 1 else 'cents' 116 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 117 | elif dollars: 118 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 119 | return '%s %s' % (dollars, dollar_unit) 120 | elif cents: 121 | cent_unit = 'cent' if cents == 1 else 'cents' 122 | return '%s %s' % (cents, cent_unit) 123 | else: 124 | return 'zero dollars' 125 | 126 | 127 | def _expand_ordinal(m): 128 | return _inflect.number_to_words(m.group(0)) 129 | 130 | 131 | def _expand_number(m): 132 | num = int(m.group(0)) 133 | if num > 1000 and num < 3000: 134 | if num == 2000: 135 | return 'two thousand' 136 | elif num > 2000 and num < 2010: 137 | return 'two thousand ' + _inflect.number_to_words(num % 100) 138 | elif num % 100 == 
0: 139 | return _inflect.number_to_words(num // 100) + ' hundred' 140 | else: 141 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 142 | else: 143 | return _inflect.number_to_words(num, andword='') 144 | 145 | 146 | def normalize_numbers(text): 147 | text = re.sub(_comma_number_re, _remove_commas, text) 148 | text = re.sub(_pounds_re, r'\1 pounds', text) 149 | text = re.sub(_dollars_re, _expand_dollars, text) 150 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 151 | text = re.sub(_ordinal_re, _expand_ordinal, text) 152 | text = re.sub(_number_re, _expand_number, text) 153 | return text 154 | 155 | 156 | def mark_dark_l(text): 157 | return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ'+x.group(1), text) 158 | 159 | 160 | def english_to_ipa(text): 161 | text = unidecode(text).lower() 162 | text = expand_abbreviations(text) 163 | text = normalize_numbers(text) 164 | phonemes = ipa.convert(text) 165 | phonemes = collapse_whitespace(phonemes) 166 | return phonemes 167 | 168 | 169 | def english_to_lazy_ipa(text): 170 | text = english_to_ipa(text) 171 | for regex, replacement in _lazy_ipa: 172 | text = re.sub(regex, replacement, text) 173 | return text 174 | 175 | 176 | def english_to_ipa2(text): 177 | text = english_to_ipa(text) 178 | text = mark_dark_l(text) 179 | for regex, replacement in _ipa_to_ipa2: 180 | text = re.sub(regex, replacement, text) 181 | return text.replace('...', '…') 182 | 183 | 184 | def english_to_lazy_ipa2(text): 185 | text = english_to_ipa(text) 186 | for regex, replacement in _lazy_ipa2: 187 | text = re.sub(regex, replacement, text) 188 | return text 189 | -------------------------------------------------------------------------------- /text/japanese.py: -------------------------------------------------------------------------------- 1 | import re 2 | from unidecode import unidecode 3 | import pyopenjtalk 4 | 5 | 6 | # Regular expression matching Japanese without punctuation marks: 7 | _japanese_characters = re.compile( 8 | r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 9 | 10 | # Regular expression matching non-Japanese characters or punctuation marks: 11 | _japanese_marks = re.compile( 12 | r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 13 | 14 | # List of (symbol, Japanese) pairs for marks: 15 | _symbols_to_japanese = [(re.compile('%s' % x[0]), x[1]) for x in [ 16 | ('%', 'パーセント') 17 | ]] 18 | 19 | # List of (romaji, ipa) pairs for marks: 20 | _romaji_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 21 | ('ts', 'ʦ'), 22 | ('u', 'ɯ'), 23 | ('j', 'ʥ'), 24 | ('y', 'j'), 25 | ('ni', 'n^i'), 26 | ('nj', 'n^'), 27 | ('hi', 'çi'), 28 | ('hj', 'ç'), 29 | ('f', 'ɸ'), 30 | ('I', 'i*'), 31 | ('U', 'ɯ*'), 32 | ('r', 'ɾ') 33 | ]] 34 | 35 | # List of (romaji, ipa2) pairs for marks: 36 | _romaji_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 37 | ('u', 'ɯ'), 38 | ('ʧ', 'tʃ'), 39 | ('j', 'dʑ'), 40 | ('y', 'j'), 41 | ('ni', 'n^i'), 42 | ('nj', 'n^'), 43 | ('hi', 'çi'), 44 | ('hj', 'ç'), 45 | ('f', 'ɸ'), 46 | ('I', 'i*'), 47 | ('U', 'ɯ*'), 48 | ('r', 'ɾ') 49 | ]] 50 | 51 | # List of (consonant, sokuon) pairs: 52 | _real_sokuon = [(re.compile('%s' % x[0]), x[1]) for x in [ 53 | (r'Q([↑↓]*[kg])', r'k#\1'), 54 | (r'Q([↑↓]*[tdjʧ])', r't#\1'), 55 | (r'Q([↑↓]*[sʃ])', r's\1'), 56 | (r'Q([↑↓]*[pb])', r'p#\1') 57 | ]] 58 | 59 | # List of (consonant, hatsuon) pairs: 60 | _real_hatsuon = [(re.compile('%s' % x[0]), x[1]) 
for x in [ 61 | (r'N([↑↓]*[pbm])', r'm\1'), 62 | (r'N([↑↓]*[ʧʥj])', r'n^\1'), 63 | (r'N([↑↓]*[tdn])', r'n\1'), 64 | (r'N([↑↓]*[kg])', r'ŋ\1') 65 | ]] 66 | 67 | 68 | def symbols_to_japanese(text): 69 | for regex, replacement in _symbols_to_japanese: 70 | text = re.sub(regex, replacement, text) 71 | return text 72 | 73 | 74 | def japanese_to_romaji_with_accent(text): 75 | '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html''' 76 | text = symbols_to_japanese(text) 77 | sentences = re.split(_japanese_marks, text) 78 | marks = re.findall(_japanese_marks, text) 79 | text = '' 80 | for i, sentence in enumerate(sentences): 81 | if re.match(_japanese_characters, sentence): 82 | if text != '': 83 | text += ' ' 84 | labels = pyopenjtalk.extract_fullcontext(sentence) 85 | for n, label in enumerate(labels): 86 | phoneme = re.search(r'\-([^\+]*)\+', label).group(1) 87 | if phoneme not in ['sil', 'pau']: 88 | text += phoneme.replace('ch', 'ʧ').replace('sh', 89 | 'ʃ').replace('cl', 'Q') 90 | else: 91 | continue 92 | # n_moras = int(re.search(r'/F:(\d+)_', label).group(1)) 93 | a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1)) 94 | a2 = int(re.search(r"\+(\d+)\+", label).group(1)) 95 | a3 = int(re.search(r"\+(\d+)/", label).group(1)) 96 | if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil', 'pau']: 97 | a2_next = -1 98 | else: 99 | a2_next = int( 100 | re.search(r"\+(\d+)\+", labels[n + 1]).group(1)) 101 | # Accent phrase boundary 102 | if a3 == 1 and a2_next == 1: 103 | text += ' ' 104 | # Falling 105 | elif a1 == 0 and a2_next == a2 + 1: 106 | text += '↓' 107 | # Rising 108 | elif a2 == 1 and a2_next == 2: 109 | text += '↑' 110 | if i < len(marks): 111 | text += unidecode(marks[i]).replace(' ', '') 112 | return text 113 | 114 | 115 | def get_real_sokuon(text): 116 | for regex, replacement in _real_sokuon: 117 | text = re.sub(regex, replacement, text) 118 | return text 119 | 120 | 121 | def get_real_hatsuon(text): 122 | for regex, replacement in _real_hatsuon: 123 | text = re.sub(regex, replacement, text) 124 | return text 125 | 126 | 127 | def japanese_to_ipa(text): 128 | text = japanese_to_romaji_with_accent(text).replace('...', '…') 129 | text = re.sub( 130 | r'([aiueo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text) 131 | text = get_real_sokuon(text) 132 | text = get_real_hatsuon(text) 133 | for regex, replacement in _romaji_to_ipa: 134 | text = re.sub(regex, replacement, text) 135 | return text 136 | 137 | 138 | def japanese_to_ipa2(text): 139 | text = japanese_to_romaji_with_accent(text).replace('...', '…') 140 | text = get_real_sokuon(text) 141 | text = get_real_hatsuon(text) 142 | for regex, replacement in _romaji_to_ipa2: 143 | text = re.sub(regex, replacement, text) 144 | return text 145 | 146 | 147 | def japanese_to_ipa3(text): 148 | text = japanese_to_ipa2(text).replace('n^', 'ȵ').replace( 149 | 'ʃ', 'ɕ').replace('*', '\u0325').replace('#', '\u031a') 150 | text = re.sub( 151 | r'([aiɯeo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text) 152 | text = re.sub(r'((?:^|\s)(?:ts|tɕ|[kpt]))', r'\1ʰ', text) 153 | return text 154 | -------------------------------------------------------------------------------- /text/korean.py: -------------------------------------------------------------------------------- 1 | import re 2 | from jamo import h2j, j2hcj 3 | import ko_pron 4 | 5 | 6 | # This is a list of Korean classifiers preceded by pure Korean numerals. 
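# Illustrative examples (based on number_to_hangul() below): numbers followed by one of these
# classifiers are read with native-Korean numerals, e.g. '3마리' -> '세마리', while other nouns
# fall back to sino-Korean readings, e.g. '3층' -> '삼층'.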
7 | _korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통' 8 | 9 | # List of (hangul, hangul divided) pairs: 10 | _hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [ 11 | ('ㄳ', 'ㄱㅅ'), 12 | ('ㄵ', 'ㄴㅈ'), 13 | ('ㄶ', 'ㄴㅎ'), 14 | ('ㄺ', 'ㄹㄱ'), 15 | ('ㄻ', 'ㄹㅁ'), 16 | ('ㄼ', 'ㄹㅂ'), 17 | ('ㄽ', 'ㄹㅅ'), 18 | ('ㄾ', 'ㄹㅌ'), 19 | ('ㄿ', 'ㄹㅍ'), 20 | ('ㅀ', 'ㄹㅎ'), 21 | ('ㅄ', 'ㅂㅅ'), 22 | ('ㅘ', 'ㅗㅏ'), 23 | ('ㅙ', 'ㅗㅐ'), 24 | ('ㅚ', 'ㅗㅣ'), 25 | ('ㅝ', 'ㅜㅓ'), 26 | ('ㅞ', 'ㅜㅔ'), 27 | ('ㅟ', 'ㅜㅣ'), 28 | ('ㅢ', 'ㅡㅣ'), 29 | ('ㅑ', 'ㅣㅏ'), 30 | ('ㅒ', 'ㅣㅐ'), 31 | ('ㅕ', 'ㅣㅓ'), 32 | ('ㅖ', 'ㅣㅔ'), 33 | ('ㅛ', 'ㅣㅗ'), 34 | ('ㅠ', 'ㅣㅜ') 35 | ]] 36 | 37 | # List of (Latin alphabet, hangul) pairs: 38 | _latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 39 | ('a', '에이'), 40 | ('b', '비'), 41 | ('c', '시'), 42 | ('d', '디'), 43 | ('e', '이'), 44 | ('f', '에프'), 45 | ('g', '지'), 46 | ('h', '에이치'), 47 | ('i', '아이'), 48 | ('j', '제이'), 49 | ('k', '케이'), 50 | ('l', '엘'), 51 | ('m', '엠'), 52 | ('n', '엔'), 53 | ('o', '오'), 54 | ('p', '피'), 55 | ('q', '큐'), 56 | ('r', '아르'), 57 | ('s', '에스'), 58 | ('t', '티'), 59 | ('u', '유'), 60 | ('v', '브이'), 61 | ('w', '더블유'), 62 | ('x', '엑스'), 63 | ('y', '와이'), 64 | ('z', '제트') 65 | ]] 66 | 67 | # List of (ipa, lazy ipa) pairs: 68 | _ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 69 | ('t͡ɕ','ʧ'), 70 | ('d͡ʑ','ʥ'), 71 | ('ɲ','n^'), 72 | ('ɕ','ʃ'), 73 | ('ʷ','w'), 74 | ('ɭ','l`'), 75 | ('ʎ','ɾ'), 76 | ('ɣ','ŋ'), 77 | ('ɰ','ɯ'), 78 | ('ʝ','j'), 79 | ('ʌ','ə'), 80 | ('ɡ','g'), 81 | ('\u031a','#'), 82 | ('\u0348','='), 83 | ('\u031e',''), 84 | ('\u0320',''), 85 | ('\u0339','') 86 | ]] 87 | 88 | 89 | def latin_to_hangul(text): 90 | for regex, replacement in _latin_to_hangul: 91 | text = re.sub(regex, replacement, text) 92 | return text 93 | 94 | 95 | def divide_hangul(text): 96 | text = j2hcj(h2j(text)) 97 | for regex, replacement in _hangul_divided: 98 | text = re.sub(regex, replacement, text) 99 | return text 100 | 101 | 102 | def hangul_number(num, sino=True): 103 | '''Reference https://github.com/Kyubyong/g2pK''' 104 | num = re.sub(',', '', num) 105 | 106 | if num == '0': 107 | return '영' 108 | if not sino and num == '20': 109 | return '스무' 110 | 111 | digits = '123456789' 112 | names = '일이삼사오육칠팔구' 113 | digit2name = {d: n for d, n in zip(digits, names)} 114 | 115 | modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉' 116 | decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔' 117 | digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())} 118 | digit2dec = {d: dec for d, dec in zip(digits, decimals.split())} 119 | 120 | spelledout = [] 121 | for i, digit in enumerate(num): 122 | i = len(num) - i - 1 123 | if sino: 124 | if i == 0: 125 | name = digit2name.get(digit, '') 126 | elif i == 1: 127 | name = digit2name.get(digit, '') + '십' 128 | name = name.replace('일십', '십') 129 | else: 130 | if i == 0: 131 | name = digit2mod.get(digit, '') 132 | elif i == 1: 133 | name = digit2dec.get(digit, '') 134 | if digit == '0': 135 | if i % 4 == 0: 136 | last_three = spelledout[-min(3, len(spelledout)):] 137 | if ''.join(last_three) == '': 138 | spelledout.append('') 139 | continue 140 | else: 141 | spelledout.append('') 142 | continue 143 | if i == 2: 144 | name = digit2name.get(digit, '') + '백' 145 | name = name.replace('일백', '백') 146 | elif i == 3: 147 | name = digit2name.get(digit, '') + '천' 148 | name = name.replace('일천', '천') 149 | elif i == 4: 150 | name = digit2name.get(digit, '') + '만' 151 | name = name.replace('일만', '만') 152 | elif i == 5: 
153 | name = digit2name.get(digit, '') + '십' 154 | name = name.replace('일십', '십') 155 | elif i == 6: 156 | name = digit2name.get(digit, '') + '백' 157 | name = name.replace('일백', '백') 158 | elif i == 7: 159 | name = digit2name.get(digit, '') + '천' 160 | name = name.replace('일천', '천') 161 | elif i == 8: 162 | name = digit2name.get(digit, '') + '억' 163 | elif i == 9: 164 | name = digit2name.get(digit, '') + '십' 165 | elif i == 10: 166 | name = digit2name.get(digit, '') + '백' 167 | elif i == 11: 168 | name = digit2name.get(digit, '') + '천' 169 | elif i == 12: 170 | name = digit2name.get(digit, '') + '조' 171 | elif i == 13: 172 | name = digit2name.get(digit, '') + '십' 173 | elif i == 14: 174 | name = digit2name.get(digit, '') + '백' 175 | elif i == 15: 176 | name = digit2name.get(digit, '') + '천' 177 | spelledout.append(name) 178 | return ''.join(elem for elem in spelledout) 179 | 180 | 181 | def number_to_hangul(text): 182 | '''Reference https://github.com/Kyubyong/g2pK''' 183 | tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text)) 184 | for token in tokens: 185 | num, classifier = token 186 | if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers: 187 | spelledout = hangul_number(num, sino=False) 188 | else: 189 | spelledout = hangul_number(num, sino=True) 190 | text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}') 191 | # digit by digit for remaining digits 192 | digits = '0123456789' 193 | names = '영일이삼사오육칠팔구' 194 | for d, n in zip(digits, names): 195 | text = text.replace(d, n) 196 | return text 197 | 198 | 199 | def korean_to_lazy_ipa(text): 200 | text = latin_to_hangul(text) 201 | text = number_to_hangul(text) 202 | text=re.sub('[\uac00-\ud7af]+',lambda x:ko_pron.romanise(x.group(0),'ipa').split('] ~ [')[0],text) 203 | for regex, replacement in _ipa_to_lazy_ipa: 204 | text = re.sub(regex, replacement, text) 205 | return text 206 | 207 | 208 | def korean_to_ipa(text): 209 | text = korean_to_lazy_ipa(text) 210 | return text.replace('ʧ','tʃ').replace('ʥ','dʑ') 211 | -------------------------------------------------------------------------------- /text/mandarin.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import re 4 | from pypinyin import lazy_pinyin, BOPOMOFO 5 | import jieba 6 | import cn2an 7 | import logging 8 | 9 | logging.getLogger('jieba').setLevel(logging.WARNING) 10 | jieba.set_dictionary(r'./jieba/dict.txt') 11 | jieba.initialize() 12 | 13 | 14 | # List of (Latin alphabet, bopomofo) pairs: 15 | _latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 16 | ('a', 'ㄟˉ'), 17 | ('b', 'ㄅㄧˋ'), 18 | ('c', 'ㄙㄧˉ'), 19 | ('d', 'ㄉㄧˋ'), 20 | ('e', 'ㄧˋ'), 21 | ('f', 'ㄝˊㄈㄨˋ'), 22 | ('g', 'ㄐㄧˋ'), 23 | ('h', 'ㄝˇㄑㄩˋ'), 24 | ('i', 'ㄞˋ'), 25 | ('j', 'ㄐㄟˋ'), 26 | ('k', 'ㄎㄟˋ'), 27 | ('l', 'ㄝˊㄛˋ'), 28 | ('m', 'ㄝˊㄇㄨˋ'), 29 | ('n', 'ㄣˉ'), 30 | ('o', 'ㄡˉ'), 31 | ('p', 'ㄆㄧˉ'), 32 | ('q', 'ㄎㄧㄡˉ'), 33 | ('r', 'ㄚˋ'), 34 | ('s', 'ㄝˊㄙˋ'), 35 | ('t', 'ㄊㄧˋ'), 36 | ('u', 'ㄧㄡˉ'), 37 | ('v', 'ㄨㄧˉ'), 38 | ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'), 39 | ('x', 'ㄝˉㄎㄨˋㄙˋ'), 40 | ('y', 'ㄨㄞˋ'), 41 | ('z', 'ㄗㄟˋ') 42 | ]] 43 | 44 | # List of (bopomofo, romaji) pairs: 45 | _bopomofo_to_romaji = [(re.compile('%s' % x[0]), x[1]) for x in [ 46 | ('ㄅㄛ', 'p⁼wo'), 47 | ('ㄆㄛ', 'pʰwo'), 48 | ('ㄇㄛ', 'mwo'), 49 | ('ㄈㄛ', 'fwo'), 50 | ('ㄅ', 'p⁼'), 51 | ('ㄆ', 'pʰ'), 52 | ('ㄇ', 'm'), 53 | ('ㄈ', 'f'), 54 | ('ㄉ', 't⁼'), 55 | ('ㄊ', 'tʰ'), 56 | ('ㄋ', 'n'), 57 | ('ㄌ', 'l'), 58 | ('ㄍ', 'k⁼'), 59 | ('ㄎ', 'kʰ'), 60 | ('ㄏ', 
'h'), 61 | ('ㄐ', 'ʧ⁼'), 62 | ('ㄑ', 'ʧʰ'), 63 | ('ㄒ', 'ʃ'), 64 | ('ㄓ', 'ʦ`⁼'), 65 | ('ㄔ', 'ʦ`ʰ'), 66 | ('ㄕ', 's`'), 67 | ('ㄖ', 'ɹ`'), 68 | ('ㄗ', 'ʦ⁼'), 69 | ('ㄘ', 'ʦʰ'), 70 | ('ㄙ', 's'), 71 | ('ㄚ', 'a'), 72 | ('ㄛ', 'o'), 73 | ('ㄜ', 'ə'), 74 | ('ㄝ', 'e'), 75 | ('ㄞ', 'ai'), 76 | ('ㄟ', 'ei'), 77 | ('ㄠ', 'au'), 78 | ('ㄡ', 'ou'), 79 | ('ㄧㄢ', 'yeNN'), 80 | ('ㄢ', 'aNN'), 81 | ('ㄧㄣ', 'iNN'), 82 | ('ㄣ', 'əNN'), 83 | ('ㄤ', 'aNg'), 84 | ('ㄧㄥ', 'iNg'), 85 | ('ㄨㄥ', 'uNg'), 86 | ('ㄩㄥ', 'yuNg'), 87 | ('ㄥ', 'əNg'), 88 | ('ㄦ', 'əɻ'), 89 | ('ㄧ', 'i'), 90 | ('ㄨ', 'u'), 91 | ('ㄩ', 'ɥ'), 92 | ('ˉ', '→'), 93 | ('ˊ', '↑'), 94 | ('ˇ', '↓↑'), 95 | ('ˋ', '↓'), 96 | ('˙', ''), 97 | (',', ','), 98 | ('。', '.'), 99 | ('!', '!'), 100 | ('?', '?'), 101 | ('—', '-') 102 | ]] 103 | 104 | # List of (romaji, ipa) pairs: 105 | _romaji_to_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 106 | ('ʃy', 'ʃ'), 107 | ('ʧʰy', 'ʧʰ'), 108 | ('ʧ⁼y', 'ʧ⁼'), 109 | ('NN', 'n'), 110 | ('Ng', 'ŋ'), 111 | ('y', 'j'), 112 | ('h', 'x') 113 | ]] 114 | 115 | # List of (bopomofo, ipa) pairs: 116 | _bopomofo_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 117 | ('ㄅㄛ', 'p⁼wo'), 118 | ('ㄆㄛ', 'pʰwo'), 119 | ('ㄇㄛ', 'mwo'), 120 | ('ㄈㄛ', 'fwo'), 121 | ('ㄅ', 'p⁼'), 122 | ('ㄆ', 'pʰ'), 123 | ('ㄇ', 'm'), 124 | ('ㄈ', 'f'), 125 | ('ㄉ', 't⁼'), 126 | ('ㄊ', 'tʰ'), 127 | ('ㄋ', 'n'), 128 | ('ㄌ', 'l'), 129 | ('ㄍ', 'k⁼'), 130 | ('ㄎ', 'kʰ'), 131 | ('ㄏ', 'x'), 132 | ('ㄐ', 'tʃ⁼'), 133 | ('ㄑ', 'tʃʰ'), 134 | ('ㄒ', 'ʃ'), 135 | ('ㄓ', 'ts`⁼'), 136 | ('ㄔ', 'ts`ʰ'), 137 | ('ㄕ', 's`'), 138 | ('ㄖ', 'ɹ`'), 139 | ('ㄗ', 'ts⁼'), 140 | ('ㄘ', 'tsʰ'), 141 | ('ㄙ', 's'), 142 | ('ㄚ', 'a'), 143 | ('ㄛ', 'o'), 144 | ('ㄜ', 'ə'), 145 | ('ㄝ', 'ɛ'), 146 | ('ㄞ', 'aɪ'), 147 | ('ㄟ', 'eɪ'), 148 | ('ㄠ', 'ɑʊ'), 149 | ('ㄡ', 'oʊ'), 150 | ('ㄧㄢ', 'jɛn'), 151 | ('ㄩㄢ', 'ɥæn'), 152 | ('ㄢ', 'an'), 153 | ('ㄧㄣ', 'in'), 154 | ('ㄩㄣ', 'ɥn'), 155 | ('ㄣ', 'ən'), 156 | ('ㄤ', 'ɑŋ'), 157 | ('ㄧㄥ', 'iŋ'), 158 | ('ㄨㄥ', 'ʊŋ'), 159 | ('ㄩㄥ', 'jʊŋ'), 160 | ('ㄥ', 'əŋ'), 161 | ('ㄦ', 'əɻ'), 162 | ('ㄧ', 'i'), 163 | ('ㄨ', 'u'), 164 | ('ㄩ', 'ɥ'), 165 | ('ˉ', '→'), 166 | ('ˊ', '↑'), 167 | ('ˇ', '↓↑'), 168 | ('ˋ', '↓'), 169 | ('˙', ''), 170 | (',', ','), 171 | ('。', '.'), 172 | ('!', '!'), 173 | ('?', '?'), 174 | ('—', '-') 175 | ]] 176 | 177 | # List of (bopomofo, ipa2) pairs: 178 | _bopomofo_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 179 | ('ㄅㄛ', 'pwo'), 180 | ('ㄆㄛ', 'pʰwo'), 181 | ('ㄇㄛ', 'mwo'), 182 | ('ㄈㄛ', 'fwo'), 183 | ('ㄅ', 'p'), 184 | ('ㄆ', 'pʰ'), 185 | ('ㄇ', 'm'), 186 | ('ㄈ', 'f'), 187 | ('ㄉ', 't'), 188 | ('ㄊ', 'tʰ'), 189 | ('ㄋ', 'n'), 190 | ('ㄌ', 'l'), 191 | ('ㄍ', 'k'), 192 | ('ㄎ', 'kʰ'), 193 | ('ㄏ', 'h'), 194 | ('ㄐ', 'tɕ'), 195 | ('ㄑ', 'tɕʰ'), 196 | ('ㄒ', 'ɕ'), 197 | ('ㄓ', 'tʂ'), 198 | ('ㄔ', 'tʂʰ'), 199 | ('ㄕ', 'ʂ'), 200 | ('ㄖ', 'ɻ'), 201 | ('ㄗ', 'ts'), 202 | ('ㄘ', 'tsʰ'), 203 | ('ㄙ', 's'), 204 | ('ㄚ', 'a'), 205 | ('ㄛ', 'o'), 206 | ('ㄜ', 'ɤ'), 207 | ('ㄝ', 'ɛ'), 208 | ('ㄞ', 'aɪ'), 209 | ('ㄟ', 'eɪ'), 210 | ('ㄠ', 'ɑʊ'), 211 | ('ㄡ', 'oʊ'), 212 | ('ㄧㄢ', 'jɛn'), 213 | ('ㄩㄢ', 'yæn'), 214 | ('ㄢ', 'an'), 215 | ('ㄧㄣ', 'in'), 216 | ('ㄩㄣ', 'yn'), 217 | ('ㄣ', 'ən'), 218 | ('ㄤ', 'ɑŋ'), 219 | ('ㄧㄥ', 'iŋ'), 220 | ('ㄨㄥ', 'ʊŋ'), 221 | ('ㄩㄥ', 'jʊŋ'), 222 | ('ㄥ', 'ɤŋ'), 223 | ('ㄦ', 'əɻ'), 224 | ('ㄧ', 'i'), 225 | ('ㄨ', 'u'), 226 | ('ㄩ', 'y'), 227 | ('ˉ', '˥'), 228 | ('ˊ', '˧˥'), 229 | ('ˇ', '˨˩˦'), 230 | ('ˋ', '˥˩'), 231 | ('˙', ''), 232 | (',', ','), 233 | ('。', '.'), 234 | ('!', '!'), 235 | ('?', '?'), 236 | ('—', '-') 237 | ]] 238 | 239 | 240 | def number_to_chinese(text): 241 | numbers = 
re.findall(r'\d+(?:\.?\d+)?', text) 242 | for number in numbers: 243 | text = text.replace(number, cn2an.an2cn(number), 1) 244 | return text 245 | 246 | 247 | def chinese_to_bopomofo(text): 248 | text = text.replace('、', ',').replace(';', ',').replace(':', ',') 249 | words = jieba.lcut(text, cut_all=False) 250 | text = '' 251 | for word in words: 252 | bopomofos = lazy_pinyin(word, BOPOMOFO) 253 | if not re.search('[\u4e00-\u9fff]', word): 254 | text += word 255 | continue 256 | for i in range(len(bopomofos)): 257 | bopomofos[i] = re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i]) 258 | if text != '': 259 | text += ' ' 260 | text += ''.join(bopomofos) 261 | return text 262 | 263 | 264 | def latin_to_bopomofo(text): 265 | for regex, replacement in _latin_to_bopomofo: 266 | text = re.sub(regex, replacement, text) 267 | return text 268 | 269 | 270 | def bopomofo_to_romaji(text): 271 | for regex, replacement in _bopomofo_to_romaji: 272 | text = re.sub(regex, replacement, text) 273 | return text 274 | 275 | 276 | def bopomofo_to_ipa(text): 277 | for regex, replacement in _bopomofo_to_ipa: 278 | text = re.sub(regex, replacement, text) 279 | return text 280 | 281 | 282 | def bopomofo_to_ipa2(text): 283 | for regex, replacement in _bopomofo_to_ipa2: 284 | text = re.sub(regex, replacement, text) 285 | return text 286 | 287 | 288 | def chinese_to_romaji(text): 289 | text = number_to_chinese(text) 290 | text = chinese_to_bopomofo(text) 291 | text = latin_to_bopomofo(text) 292 | text = bopomofo_to_romaji(text) 293 | text = re.sub('i([aoe])', r'y\1', text) 294 | text = re.sub('u([aoəe])', r'w\1', text) 295 | text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', 296 | r'\1ɹ`\2', text).replace('ɻ', 'ɹ`') 297 | text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text) 298 | return text 299 | 300 | 301 | def chinese_to_lazy_ipa(text): 302 | text = chinese_to_romaji(text) 303 | for regex, replacement in _romaji_to_ipa: 304 | text = re.sub(regex, replacement, text) 305 | return text 306 | 307 | 308 | def chinese_to_ipa(text): 309 | text = number_to_chinese(text) 310 | text = chinese_to_bopomofo(text) 311 | text = latin_to_bopomofo(text) 312 | text = bopomofo_to_ipa(text) 313 | text = re.sub('i([aoe])', r'j\1', text) 314 | text = re.sub('u([aoəe])', r'w\1', text) 315 | text = re.sub('([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', 316 | r'\1ɹ`\2', text).replace('ɻ', 'ɹ`') 317 | text = re.sub('([s][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text) 318 | return text 319 | 320 | 321 | def chinese_to_ipa2(text): 322 | text = number_to_chinese(text) 323 | text = chinese_to_bopomofo(text) 324 | text = latin_to_bopomofo(text) 325 | text = bopomofo_to_ipa2(text) 326 | text = re.sub(r'i([aoe])', r'j\1', text) 327 | text = re.sub(r'u([aoəe])', r'w\1', text) 328 | text = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2', text) 329 | text = re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', text) 330 | return text 331 | -------------------------------------------------------------------------------- /text/ngu_dialect.py: -------------------------------------------------------------------------------- 1 | import re 2 | import opencc 3 | 4 | 5 | dialects = {'SZ': 'suzhou', 'WX': 'wuxi', 'CZ': 'changzhou', 'HZ': 'hangzhou', 6 | 'SX': 'shaoxing', 'NB': 'ningbo', 'JJ': 'jingjiang', 'YX': 'yixing', 7 | 'JD': 'jiading', 'ZR': 'zhenru', 'PH': 'pinghu', 'TX': 'tongxiang', 8 | 'JS': 'jiashan', 'HN': 'xiashi', 'LP': 'linping', 'XS': 'xiaoshan', 9 | 'FY': 'fuyang', 'RA': 'ruao', 'CX': 'cixi', 'SM': 'sanmen', 10 | 'TT': 'tiantai', 'WZ': 'wenzhou', 'SC': 'suichang', 'YB': 'youbu'} 11 | 12 | converters 
= {} 13 | 14 | for dialect in dialects.values(): 15 | try: 16 | converters[dialect] = opencc.OpenCC(dialect) 17 | except: 18 | pass 19 | 20 | 21 | def ngu_dialect_to_ipa(text, dialect): 22 | dialect = dialects[dialect] 23 | text = converters[dialect].convert(text).replace('-','').replace('$',' ') 24 | text = re.sub(r'[、;:]', ',', text) 25 | text = re.sub(r'\s*,\s*', ', ', text) 26 | text = re.sub(r'\s*。\s*', '. ', text) 27 | text = re.sub(r'\s*?\s*', '? ', text) 28 | text = re.sub(r'\s*!\s*', '! ', text) 29 | text = re.sub(r'\s*$', '', text) 30 | return text 31 | -------------------------------------------------------------------------------- /text/sanskrit.py: -------------------------------------------------------------------------------- 1 | import re 2 | from indic_transliteration import sanscript 3 | 4 | 5 | # List of (iast, ipa) pairs: 6 | _iast_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 7 | ('a', 'ə'), 8 | ('ā', 'aː'), 9 | ('ī', 'iː'), 10 | ('ū', 'uː'), 11 | ('ṛ', 'ɹ`'), 12 | ('ṝ', 'ɹ`ː'), 13 | ('ḷ', 'l`'), 14 | ('ḹ', 'l`ː'), 15 | ('e', 'eː'), 16 | ('o', 'oː'), 17 | ('k', 'k⁼'), 18 | ('k⁼h', 'kʰ'), 19 | ('g', 'g⁼'), 20 | ('g⁼h', 'gʰ'), 21 | ('ṅ', 'ŋ'), 22 | ('c', 'ʧ⁼'), 23 | ('ʧ⁼h', 'ʧʰ'), 24 | ('j', 'ʥ⁼'), 25 | ('ʥ⁼h', 'ʥʰ'), 26 | ('ñ', 'n^'), 27 | ('ṭ', 't`⁼'), 28 | ('t`⁼h', 't`ʰ'), 29 | ('ḍ', 'd`⁼'), 30 | ('d`⁼h', 'd`ʰ'), 31 | ('ṇ', 'n`'), 32 | ('t', 't⁼'), 33 | ('t⁼h', 'tʰ'), 34 | ('d', 'd⁼'), 35 | ('d⁼h', 'dʰ'), 36 | ('p', 'p⁼'), 37 | ('p⁼h', 'pʰ'), 38 | ('b', 'b⁼'), 39 | ('b⁼h', 'bʰ'), 40 | ('y', 'j'), 41 | ('ś', 'ʃ'), 42 | ('ṣ', 's`'), 43 | ('r', 'ɾ'), 44 | ('l̤', 'l`'), 45 | ('h', 'ɦ'), 46 | ("'", ''), 47 | ('~', '^'), 48 | ('ṃ', '^') 49 | ]] 50 | 51 | 52 | def devanagari_to_ipa(text): 53 | text = text.replace('ॐ', 'ओम्') 54 | text = re.sub(r'\s*।\s*$', '.', text) 55 | text = re.sub(r'\s*।\s*', ', ', text) 56 | text = re.sub(r'\s*॥', '.', text) 57 | text = sanscript.transliterate(text, sanscript.DEVANAGARI, sanscript.IAST) 58 | for regex, replacement in _iast_to_ipa: 59 | text = re.sub(regex, replacement, text) 60 | text = re.sub('(.)[`ː]*ḥ', lambda x: x.group(0) 61 | [:-1]+'h'+x.group(1)+'*', text) 62 | return text 63 | -------------------------------------------------------------------------------- /text/shanghainese.py: -------------------------------------------------------------------------------- 1 | import re 2 | import cn2an 3 | import opencc 4 | 5 | 6 | converter = opencc.OpenCC('zaonhe') 7 | 8 | # List of (Latin alphabet, ipa) pairs: 9 | _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 10 | ('A', 'ᴇ'), 11 | ('B', 'bi'), 12 | ('C', 'si'), 13 | ('D', 'di'), 14 | ('E', 'i'), 15 | ('F', 'ᴇf'), 16 | ('G', 'dʑi'), 17 | ('H', 'ᴇtɕʰ'), 18 | ('I', 'ᴀi'), 19 | ('J', 'dʑᴇ'), 20 | ('K', 'kʰᴇ'), 21 | ('L', 'ᴇl'), 22 | ('M', 'ᴇm'), 23 | ('N', 'ᴇn'), 24 | ('O', 'o'), 25 | ('P', 'pʰi'), 26 | ('Q', 'kʰiu'), 27 | ('R', 'ᴀl'), 28 | ('S', 'ᴇs'), 29 | ('T', 'tʰi'), 30 | ('U', 'ɦiu'), 31 | ('V', 'vi'), 32 | ('W', 'dᴀbɤliu'), 33 | ('X', 'ᴇks'), 34 | ('Y', 'uᴀi'), 35 | ('Z', 'zᴇ') 36 | ]] 37 | 38 | 39 | def _number_to_shanghainese(num): 40 | num = cn2an.an2cn(num).replace('一十','十').replace('二十', '廿').replace('二', '两') 41 | return re.sub(r'((?:^|[^三四五六七八九])十|廿)两', r'\1二', num) 42 | 43 | 44 | def number_to_shanghainese(text): 45 | return re.sub(r'\d+(?:\.?\d+)?', lambda x: _number_to_shanghainese(x.group()), text) 46 | 47 | 48 | def latin_to_ipa(text): 49 | for regex, replacement in _latin_to_ipa: 50 | text = re.sub(regex, replacement, text) 51 | return text 52 | 53 | 
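# shanghainese_to_ipa (below) chains the helpers defined above: numbers are
# spelled out in Shanghainese, the text is converted through the OpenCC
# 'zaonhe' scheme, any remaining Latin letters are mapped to IPA, and CJK
# punctuation is normalised to its ASCII equivalents.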
54 | def shanghainese_to_ipa(text): 55 | text = number_to_shanghainese(text.upper()) 56 | text = converter.convert(text).replace('-','').replace('$',' ') 57 | text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text) 58 | text = re.sub(r'[、;:]', ',', text) 59 | text = re.sub(r'\s*,\s*', ', ', text) 60 | text = re.sub(r'\s*。\s*', '. ', text) 61 | text = re.sub(r'\s*?\s*', '? ', text) 62 | text = re.sub(r'\s*!\s*', '! ', text) 63 | text = re.sub(r'\s*$', '', text) 64 | return text 65 | -------------------------------------------------------------------------------- /text/thai.py: -------------------------------------------------------------------------------- 1 | import re 2 | from num_thai.thainumbers import NumThai 3 | 4 | 5 | num = NumThai() 6 | 7 | # List of (Latin alphabet, Thai) pairs: 8 | _latin_to_thai = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 9 | ('a', 'เอ'), 10 | ('b','บี'), 11 | ('c','ซี'), 12 | ('d','ดี'), 13 | ('e','อี'), 14 | ('f','เอฟ'), 15 | ('g','จี'), 16 | ('h','เอช'), 17 | ('i','ไอ'), 18 | ('j','เจ'), 19 | ('k','เค'), 20 | ('l','แอล'), 21 | ('m','เอ็ม'), 22 | ('n','เอ็น'), 23 | ('o','โอ'), 24 | ('p','พี'), 25 | ('q','คิว'), 26 | ('r','แอร์'), 27 | ('s','เอส'), 28 | ('t','ที'), 29 | ('u','ยู'), 30 | ('v','วี'), 31 | ('w','ดับเบิลยู'), 32 | ('x','เอ็กซ์'), 33 | ('y','วาย'), 34 | ('z','ซี') 35 | ]] 36 | 37 | 38 | def num_to_thai(text): 39 | return re.sub(r'(?:\d+(?:,?\d+)?)+(?:\.\d+(?:,?\d+)?)?', lambda x: ''.join(num.NumberToTextThai(float(x.group(0).replace(',', '')))), text) 40 | 41 | def latin_to_thai(text): 42 | for regex, replacement in _latin_to_thai: 43 | text = re.sub(regex, replacement, text) 44 | return text 45 | -------------------------------------------------------------------------------- /transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | import numpy as np 5 | 6 | 7 | DEFAULT_MIN_BIN_WIDTH = 1e-3 8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3 9 | DEFAULT_MIN_DERIVATIVE = 1e-3 10 | 11 | 12 | def piecewise_rational_quadratic_transform(inputs, 13 | unnormalized_widths, 14 | unnormalized_heights, 15 | unnormalized_derivatives, 16 | inverse=False, 17 | tails=None, 18 | tail_bound=1., 19 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 20 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 21 | min_derivative=DEFAULT_MIN_DERIVATIVE): 22 | 23 | if tails is None: 24 | spline_fn = rational_quadratic_spline 25 | spline_kwargs = {} 26 | else: 27 | spline_fn = unconstrained_rational_quadratic_spline 28 | spline_kwargs = { 29 | 'tails': tails, 30 | 'tail_bound': tail_bound 31 | } 32 | 33 | outputs, logabsdet = spline_fn( 34 | inputs=inputs, 35 | unnormalized_widths=unnormalized_widths, 36 | unnormalized_heights=unnormalized_heights, 37 | unnormalized_derivatives=unnormalized_derivatives, 38 | inverse=inverse, 39 | min_bin_width=min_bin_width, 40 | min_bin_height=min_bin_height, 41 | min_derivative=min_derivative, 42 | **spline_kwargs 43 | ) 44 | return outputs, logabsdet 45 | 46 | 47 | def searchsorted(bin_locations, inputs, eps=1e-6): 48 | bin_locations[..., -1] += eps 49 | return torch.sum( 50 | inputs[..., None] >= bin_locations, 51 | dim=-1 52 | ) - 1 53 | 54 | 55 | def unconstrained_rational_quadratic_spline(inputs, 56 | unnormalized_widths, 57 | unnormalized_heights, 58 | unnormalized_derivatives, 59 | inverse=False, 60 | tails='linear', 61 | tail_bound=1., 62 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 63 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 64 | 
min_derivative=DEFAULT_MIN_DERIVATIVE): 65 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) 66 | outside_interval_mask = ~inside_interval_mask 67 | 68 | outputs = torch.zeros_like(inputs) 69 | logabsdet = torch.zeros_like(inputs) 70 | 71 | if tails == 'linear': 72 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) 73 | constant = np.log(np.exp(1 - min_derivative) - 1) 74 | unnormalized_derivatives[..., 0] = constant 75 | unnormalized_derivatives[..., -1] = constant 76 | 77 | outputs[outside_interval_mask] = inputs[outside_interval_mask] 78 | logabsdet[outside_interval_mask] = 0 79 | else: 80 | raise RuntimeError('{} tails are not implemented.'.format(tails)) 81 | 82 | outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline( 83 | inputs=inputs[inside_interval_mask], 84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :], 85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :], 86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], 87 | inverse=inverse, 88 | left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound, 89 | min_bin_width=min_bin_width, 90 | min_bin_height=min_bin_height, 91 | min_derivative=min_derivative 92 | ) 93 | 94 | return outputs, logabsdet 95 | 96 | def rational_quadratic_spline(inputs, 97 | unnormalized_widths, 98 | unnormalized_heights, 99 | unnormalized_derivatives, 100 | inverse=False, 101 | left=0., right=1., bottom=0., top=1., 102 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 103 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 104 | min_derivative=DEFAULT_MIN_DERIVATIVE): 105 | if torch.min(inputs) < left or torch.max(inputs) > right: 106 | raise ValueError('Input to a transform is not within its domain') 107 | 108 | num_bins = unnormalized_widths.shape[-1] 109 | 110 | if min_bin_width * num_bins > 1.0: 111 | raise ValueError('Minimal bin width too large for the number of bins') 112 | if min_bin_height * num_bins > 1.0: 113 | raise ValueError('Minimal bin height too large for the number of bins') 114 | 115 | widths = F.softmax(unnormalized_widths, dim=-1) 116 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths 117 | cumwidths = torch.cumsum(widths, dim=-1) 118 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0) 119 | cumwidths = (right - left) * cumwidths + left 120 | cumwidths[..., 0] = left 121 | cumwidths[..., -1] = right 122 | widths = cumwidths[..., 1:] - cumwidths[..., :-1] 123 | 124 | derivatives = min_derivative + F.softplus(unnormalized_derivatives) 125 | 126 | heights = F.softmax(unnormalized_heights, dim=-1) 127 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights 128 | cumheights = torch.cumsum(heights, dim=-1) 129 | cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0) 130 | cumheights = (top - bottom) * cumheights + bottom 131 | cumheights[..., 0] = bottom 132 | cumheights[..., -1] = top 133 | heights = cumheights[..., 1:] - cumheights[..., :-1] 134 | 135 | if inverse: 136 | bin_idx = searchsorted(cumheights, inputs)[..., None] 137 | else: 138 | bin_idx = searchsorted(cumwidths, inputs)[..., None] 139 | 140 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] 141 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0] 142 | 143 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] 144 | delta = heights / widths 145 | input_delta = delta.gather(-1, bin_idx)[..., 0] 146 | 147 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] 
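    # Gather the derivative at the upper knot of each selected bin; together with
    # `input_derivatives` these provide the d_k and d_{k+1} terms of the monotonic
    # rational-quadratic spline (Durkan et al., "Neural Spline Flows").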
148 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] 149 | 150 | input_heights = heights.gather(-1, bin_idx)[..., 0] 151 | 152 | if inverse: 153 | a = (((inputs - input_cumheights) * (input_derivatives 154 | + input_derivatives_plus_one 155 | - 2 * input_delta) 156 | + input_heights * (input_delta - input_derivatives))) 157 | b = (input_heights * input_derivatives 158 | - (inputs - input_cumheights) * (input_derivatives 159 | + input_derivatives_plus_one 160 | - 2 * input_delta)) 161 | c = - input_delta * (inputs - input_cumheights) 162 | 163 | discriminant = b.pow(2) - 4 * a * c 164 | assert (discriminant >= 0).all() 165 | 166 | root = (2 * c) / (-b - torch.sqrt(discriminant)) 167 | outputs = root * input_bin_widths + input_cumwidths 168 | 169 | theta_one_minus_theta = root * (1 - root) 170 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 171 | * theta_one_minus_theta) 172 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2) 173 | + 2 * input_delta * theta_one_minus_theta 174 | + input_derivatives * (1 - root).pow(2)) 175 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 176 | 177 | return outputs, -logabsdet 178 | else: 179 | theta = (inputs - input_cumwidths) / input_bin_widths 180 | theta_one_minus_theta = theta * (1 - theta) 181 | 182 | numerator = input_heights * (input_delta * theta.pow(2) 183 | + input_derivatives * theta_one_minus_theta) 184 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 185 | * theta_one_minus_theta) 186 | outputs = input_cumheights + numerator / denominator 187 | 188 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2) 189 | + 2 * input_delta * theta_one_minus_theta 190 | + input_derivatives * (1 - theta).pow(2)) 191 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 192 | 193 | return outputs, logabsdet 194 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from json import loads 3 | from torch import load, FloatTensor 4 | from numpy import float32 5 | import librosa 6 | 7 | 8 | class HParams(): 9 | def __init__(self, **kwargs): 10 | for k, v in kwargs.items(): 11 | if type(v) == dict: 12 | v = HParams(**v) 13 | self[k] = v 14 | 15 | def keys(self): 16 | return self.__dict__.keys() 17 | 18 | def items(self): 19 | return self.__dict__.items() 20 | 21 | def values(self): 22 | return self.__dict__.values() 23 | 24 | def __len__(self): 25 | return len(self.__dict__) 26 | 27 | def __getitem__(self, key): 28 | return getattr(self, key) 29 | 30 | def __setitem__(self, key, value): 31 | return setattr(self, key, value) 32 | 33 | def __contains__(self, key): 34 | return key in self.__dict__ 35 | 36 | def __repr__(self): 37 | return self.__dict__.__repr__() 38 | 39 | 40 | def load_checkpoint(checkpoint_path, model): 41 | checkpoint_dict = load(checkpoint_path, map_location='cpu') 42 | iteration = checkpoint_dict['iteration'] 43 | saved_state_dict = checkpoint_dict['model'] 44 | if hasattr(model, 'module'): 45 | state_dict = model.module.state_dict() 46 | else: 47 | state_dict = model.state_dict() 48 | new_state_dict= {} 49 | for k, v in state_dict.items(): 50 | try: 51 | new_state_dict[k] = saved_state_dict[k] 52 | except: 53 | logging.info("%s is not in the checkpoint" % k) 54 | 
new_state_dict[k] = v 55 | if hasattr(model, 'module'): 56 | model.module.load_state_dict(new_state_dict) 57 | else: 58 | model.load_state_dict(new_state_dict) 59 | logging.info("Loaded checkpoint '{}' (iteration {})" .format( 60 | checkpoint_path, iteration)) 61 | return 62 | 63 | 64 | def get_hparams_from_file(config_path): 65 | with open(config_path, "r") as f: 66 | data = f.read() 67 | config = loads(data) 68 | 69 | hparams = HParams(**config) 70 | return hparams 71 | 72 | 73 | def load_audio_to_torch(full_path, target_sampling_rate): 74 | audio, sampling_rate = librosa.load(full_path, sr=target_sampling_rate, mono=True) 75 | return FloatTensor(audio.astype(float32)) 76 | --------------------------------------------------------------------------------
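# A minimal usage sketch for the helpers in utils.py. It assumes that models.py
# defines the VITS generator `SynthesizerTrn` and that the config follows the
# usual VITS layout (hps.symbols, hps.data, hps.train, hps.model); the paths
# below are placeholders rather than files shipped with the repository.
import utils
from models import SynthesizerTrn  # assumed: VITS generator defined in models.py

hps = utils.get_hparams_from_file('./model/config.json')    # placeholder path
net_g = SynthesizerTrn(
    len(hps.symbols),                                       # assumed config layout
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model)
net_g.eval()
utils.load_checkpoint('./model/model.pth', net_g)           # placeholder path

# Resample a wav to the model's sampling rate and wrap it in a FloatTensor.
audio = utils.load_audio_to_torch('./sample.wav', hps.data.sampling_rate)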