├── .gitattributes
├── .gitignore
├── ChatWaifu.py
├── ChatWaifuCN.py
├── ChatWaifuCNVoice.py
├── ChatWaifuJP.py
├── ChatWaifuJPVoice.py
├── ChatWaifuJPVoiceEN.py
├── ChatWaifuJPVoiceJP.py
├── ChatWaifuVoice.py
├── LICENSE
├── README.md
├── attentions.py
├── commons.py
├── eng-README.md
├── hubert_model.py
├── jieba
│   └── dict.txt
├── mel_processing.py
├── models.py
├── modules.py
├── readme
│   ├── 1.png
│   ├── 2.png
│   ├── 3.png
│   ├── 4.png
│   ├── 5.png
│   ├── 6.png
│   ├── 7.png
│   ├── cyberchat.png
│   └── token.png
├── requirements.txt
├── text
│   ├── LICENSE
│   ├── __init__.py
│   ├── cantonese.py
│   ├── cleaners.py
│   ├── english.py
│   ├── japanese.py
│   ├── korean.py
│   ├── mandarin.py
│   ├── ngu_dialect.py
│   ├── sanskrit.py
│   ├── shanghainese.py
│   └── thai.py
├── transforms.py
└── utils.py
/.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below.
53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Ww][Ii][Nn]32/ 27 | [Aa][Rr][Mm]/ 28 | [Aa][Rr][Mm]64/ 29 | bld/ 30 | [Bb]in/ 31 | [Oo]bj/ 32 | [Oo]ut/ 33 | [Ll]og/ 34 | [Ll]ogs/ 35 | 36 | # Visual Studio 2015/2017 cache/options directory 37 | .vs/ 38 | # Uncomment if you have tasks that create the project's static files in wwwroot 39 | #wwwroot/ 40 | 41 | # Visual Studio 2017 auto generated files 42 | Generated\ Files/ 43 | 44 | # MSTest test Results 45 | [Tt]est[Rr]esult*/ 46 | [Bb]uild[Ll]og.* 47 | 48 | # NUnit 49 | *.VisualState.xml 50 | TestResult.xml 51 | nunit-*.xml 52 | 53 | # Build Results of an ATL Project 54 | [Dd]ebugPS/ 55 | [Rr]eleasePS/ 56 | dlldata.c 57 | 58 | # Benchmark Results 59 | BenchmarkDotNet.Artifacts/ 60 | 61 | # .NET Core 62 | project.lock.json 63 | project.fragment.lock.json 64 | artifacts/ 65 | 66 | # ASP.NET Scaffolding 67 | ScaffoldingReadMe.txt 68 | 69 | # StyleCop 70 | StyleCopReport.xml 71 | 72 | # Files built by Visual Studio 73 | *_i.c 74 | *_p.c 75 | *_h.h 76 | *.ilk 77 | *.meta 78 | *.obj 79 | *.iobj 80 | *.pch 81 | *.pdb 82 | *.ipdb 83 | *.pgc 84 | *.pgd 85 | *.rsp 86 | *.sbr 87 | *.tlb 88 | *.tli 89 | *.tlh 90 | *.tmp 91 | *.tmp_proj 92 | *_wpftmp.csproj 93 | *.log 94 | *.vspscc 95 | *.vssscc 96 | .builds 97 | *.pidb 98 | *.svclog 99 | *.scc 100 | 101 | # Chutzpah Test files 102 | _Chutzpah* 103 | 104 | # Visual C++ cache files 105 | ipch/ 106 | *.aps 107 | *.ncb 108 | *.opendb 109 | *.opensdf 110 | *.sdf 111 | *.cachefile 112 | *.VC.db 113 | *.VC.VC.opendb 114 | 115 | # Visual Studio profiler 116 | *.psess 117 | *.vsp 118 | *.vspx 119 | *.sap 120 | 121 | # Visual Studio Trace Files 122 | *.e2e 123 | 124 | # TFS 2012 Local Workspace 125 | $tf/ 126 | 127 | # Guidance Automation Toolkit 128 | *.gpState 129 | 130 | # ReSharper is a .NET coding add-in 131 | _ReSharper*/ 132 | *.[Rr]e[Ss]harper 133 | *.DotSettings.user 134 | 135 | # TeamCity is a build add-in 136 | _TeamCity* 137 | 138 | # DotCover is a Code Coverage Tool 139 | *.dotCover 140 | 141 | # AxoCover is a Code Coverage Tool 142 | .axoCover/* 143 | !.axoCover/settings.json 144 | 145 | # Coverlet is a free, cross platform Code Coverage Tool 146 | coverage*.json 147 | coverage*.xml 148 | coverage*.info 149 | 150 | # Visual Studio code coverage results 151 | *.coverage 152 | *.coveragexml 153 | 154 | # NCrunch 155 | _NCrunch_* 156 | .*crunch*.local.xml 157 | nCrunchTemp_* 158 | 159 | # MightyMoose 160 | *.mm.* 161 | 
AutoTest.Net/ 162 | 163 | # Web workbench (sass) 164 | .sass-cache/ 165 | 166 | # Installshield output folder 167 | [Ee]xpress/ 168 | 169 | # DocProject is a documentation generator add-in 170 | DocProject/buildhelp/ 171 | DocProject/Help/*.HxT 172 | DocProject/Help/*.HxC 173 | DocProject/Help/*.hhc 174 | DocProject/Help/*.hhk 175 | DocProject/Help/*.hhp 176 | DocProject/Help/Html2 177 | DocProject/Help/html 178 | 179 | # Click-Once directory 180 | publish/ 181 | 182 | # Publish Web Output 183 | *.[Pp]ublish.xml 184 | *.azurePubxml 185 | # Note: Comment the next line if you want to checkin your web deploy settings, 186 | # but database connection strings (with potential passwords) will be unencrypted 187 | *.pubxml 188 | *.publishproj 189 | 190 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 191 | # checkin your Azure Web App publish settings, but sensitive information contained 192 | # in these scripts will be unencrypted 193 | PublishScripts/ 194 | 195 | # NuGet Packages 196 | *.nupkg 197 | # NuGet Symbol Packages 198 | *.snupkg 199 | # The packages folder can be ignored because of Package Restore 200 | **/[Pp]ackages/* 201 | # except build/, which is used as an MSBuild target. 202 | !**/[Pp]ackages/build/ 203 | # Uncomment if necessary however generally it will be regenerated when needed 204 | #!**/[Pp]ackages/repositories.config 205 | # NuGet v3's project.json files produces more ignorable files 206 | *.nuget.props 207 | *.nuget.targets 208 | 209 | # Microsoft Azure Build Output 210 | csx/ 211 | *.build.csdef 212 | 213 | # Microsoft Azure Emulator 214 | ecf/ 215 | rcf/ 216 | 217 | # Windows Store app package directories and files 218 | AppPackages/ 219 | BundleArtifacts/ 220 | Package.StoreAssociation.xml 221 | _pkginfo.txt 222 | *.appx 223 | *.appxbundle 224 | *.appxupload 225 | 226 | # Visual Studio cache files 227 | # files ending in .cache can be ignored 228 | *.[Cc]ache 229 | # but keep track of directories ending in .cache 230 | !?*.[Cc]ache/ 231 | 232 | # Others 233 | ClientBin/ 234 | ~$* 235 | *~ 236 | *.dbmdl 237 | *.dbproj.schemaview 238 | *.jfm 239 | *.pfx 240 | *.publishsettings 241 | orleans.codegen.cs 242 | 243 | # Including strong name files can present a security risk 244 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 245 | #*.snk 246 | 247 | # Since there are multiple workflows, uncomment next line to ignore bower_components 248 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 249 | #bower_components/ 250 | 251 | # RIA/Silverlight projects 252 | Generated_Code/ 253 | 254 | # Backup & report files from converting an old project file 255 | # to a newer Visual Studio version. 
Backup files are not needed, 256 | # because we have git ;-) 257 | _UpgradeReport_Files/ 258 | Backup*/ 259 | UpgradeLog*.XML 260 | UpgradeLog*.htm 261 | ServiceFabricBackup/ 262 | *.rptproj.bak 263 | 264 | # SQL Server files 265 | *.mdf 266 | *.ldf 267 | *.ndf 268 | 269 | # Business Intelligence projects 270 | *.rdl.data 271 | *.bim.layout 272 | *.bim_*.settings 273 | *.rptproj.rsuser 274 | *- [Bb]ackup.rdl 275 | *- [Bb]ackup ([0-9]).rdl 276 | *- [Bb]ackup ([0-9][0-9]).rdl 277 | 278 | # Microsoft Fakes 279 | FakesAssemblies/ 280 | 281 | # GhostDoc plugin setting file 282 | *.GhostDoc.xml 283 | 284 | # Node.js Tools for Visual Studio 285 | .ntvs_analysis.dat 286 | node_modules/ 287 | 288 | # Visual Studio 6 build log 289 | *.plg 290 | 291 | # Visual Studio 6 workspace options file 292 | *.opt 293 | 294 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 295 | *.vbw 296 | 297 | # Visual Studio LightSwitch build output 298 | **/*.HTMLClient/GeneratedArtifacts 299 | **/*.DesktopClient/GeneratedArtifacts 300 | **/*.DesktopClient/ModelManifest.xml 301 | **/*.Server/GeneratedArtifacts 302 | **/*.Server/ModelManifest.xml 303 | _Pvt_Extensions 304 | 305 | # Paket dependency manager 306 | .paket/paket.exe 307 | paket-files/ 308 | 309 | # FAKE - F# Make 310 | .fake/ 311 | 312 | # CodeRush personal settings 313 | .cr/personal 314 | 315 | # Python Tools for Visual Studio (PTVS) 316 | __pycache__/ 317 | *.pyc 318 | 319 | # Cake - Uncomment if you are using it 320 | # tools/** 321 | # !tools/packages.config 322 | 323 | # Tabs Studio 324 | *.tss 325 | 326 | # Telerik's JustMock configuration file 327 | *.jmconfig 328 | 329 | # BizTalk build output 330 | *.btp.cs 331 | *.btm.cs 332 | *.odx.cs 333 | *.xsd.cs 334 | 335 | # OpenCover UI analysis results 336 | OpenCover/ 337 | 338 | # Azure Stream Analytics local run output 339 | ASALocalRun/ 340 | 341 | # MSBuild Binary and Structured Log 342 | *.binlog 343 | 344 | # NVidia Nsight GPU debugger configuration file 345 | *.nvuser 346 | 347 | # MFractors (Xamarin productivity tool) working folder 348 | .mfractor/ 349 | 350 | # Local History for Visual Studio 351 | .localhistory/ 352 | 353 | # BeatPulse healthcheck temp database 354 | healthchecksdb 355 | 356 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 357 | MigrationBackup/ 358 | 359 | # Ionide (cross platform F# VS Code tools) working folder 360 | .ionide/ 361 | 362 | # Fody - auto-generated XML schema 363 | FodyWeavers.xsd 364 | 365 | # build 366 | build 367 | monotonic_align/core.c 368 | *.o 369 | *.so 370 | *.dll 371 | 372 | # data 373 | /config.json 374 | /*.pth 375 | *.wav 376 | /monotonic_align/monotonic_align 377 | /resources 378 | /MoeGoe.spec 379 | /dist/MoeGoe 380 | /dist 381 | 382 | # MacOS 383 | .DS_Store 384 | -------------------------------------------------------------------------------- /ChatWaifu.py: -------------------------------------------------------------------------------- 1 | from scipy.io.wavfile import write 2 | from mel_processing import spectrogram_torch 3 | from text import text_to_sequence, _clean_text 4 | from models import SynthesizerTrn 5 | import utils 6 | import commons 7 | import sys 8 | import re 9 | from torch import no_grad, LongTensor 10 | import logging 11 | from winsound import PlaySound 12 | from openai import OpenAI 13 | 14 | chinese_model_path = ".\model\CN\model.pth" 15 | chinese_config_path = ".\model\CN\config.json" 16 | japanese_model_path = ".\model\H_excluded.pth" 17 | japanese_config_path = 
".\model\config.json" 18 | 19 | #################################### 20 | #CHATGPT INITIALIZE 21 | from pyChatGPT import ChatGPT 22 | import json 23 | 24 | modelmessage = """ID Output Language 25 | 0 Chinese 26 | 1 Japanese 27 | """ 28 | 29 | idmessage_cn = """ID Speaker 30 | 0 綾地寧々 31 | 1 在原七海 32 | 2 小茸 33 | 3 唐乐吟 34 | """ 35 | 36 | idmessage_jp = """ID Speaker 37 | 0 綾地寧々 38 | 1 因幡めぐる 39 | 2 朝武芳乃 40 | 3 常陸茉子 41 | 4 ムラサメ 42 | 5 鞍馬小春 43 | 6 在原七海 44 | """ 45 | 46 | def get_input(): 47 | # prompt for input 48 | print("You:") 49 | user_input = input() 50 | return user_input 51 | 52 | def get_input_jp(): 53 | # prompt for input 54 | print("You:") 55 | usr_in = input() 56 | if usr_in == 'quit()': 57 | return usr_in 58 | else: 59 | user_input = usr_in +" 使用日本语" 60 | return user_input 61 | 62 | def get_token(): 63 | token = input("Your API Key: \n") 64 | return token 65 | 66 | 67 | ################################################ 68 | 69 | 70 | logging.getLogger('numba').setLevel(logging.WARNING) 71 | 72 | def ex_print(text, escape=False): 73 | if escape: 74 | print(text.encode('unicode_escape').decode()) 75 | else: 76 | print(text) 77 | 78 | 79 | def get_text(text, hps, cleaned=False): 80 | if cleaned: 81 | text_norm = text_to_sequence(text, hps.symbols, []) 82 | else: 83 | text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners) 84 | if hps.data.add_blank: 85 | text_norm = commons.intersperse(text_norm, 0) 86 | text_norm = LongTensor(text_norm) 87 | return text_norm 88 | 89 | 90 | def ask_if_continue(): 91 | while True: 92 | answer = input('Continue? (y/n): ') 93 | if answer == 'y': 94 | break 95 | elif answer == 'n': 96 | sys.exit(0) 97 | 98 | 99 | def print_speakers(speakers, escape=False): 100 | if len(speakers) > 100: 101 | return 102 | print('ID\tSpeaker') 103 | for id, name in enumerate(speakers): 104 | ex_print(str(id) + '\t' + name, escape) 105 | 106 | 107 | def get_speaker_id(message): 108 | speaker_id = input(message) 109 | if speaker_id == '': 110 | print(str(speaker_id) + ' is not a valid ID!') 111 | sys.exit(1) 112 | else: 113 | try: 114 | speaker_id = int(speaker_id) 115 | except: 116 | print(str(speaker_id) + ' is not a valid ID!') 117 | sys.exit(1) 118 | return speaker_id 119 | 120 | def get_model_id(message): 121 | model_id = input(message) 122 | if model_id == '': 123 | print(str(model_id) + ' is not a valid ID!') 124 | sys.exit(1) 125 | else: 126 | try: 127 | model_id = int(model_id) 128 | except: 129 | print(str(model_id) + ' is not a valid ID!') 130 | sys.exit(1) 131 | return model_id 132 | 133 | def get_label_value(text, label, default, warning_name='value'): 134 | value = re.search(rf'\[{label}=(.+?)\]', text) 135 | if value: 136 | try: 137 | text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1) 138 | value = float(value.group(1)) 139 | except: 140 | print(f'Invalid {warning_name}!') 141 | sys.exit(1) 142 | else: 143 | value = default 144 | return value, text 145 | 146 | 147 | def get_label(text, label): 148 | if f'[{label}]' in text: 149 | return True, text.replace(f'[{label}]', '') 150 | else: 151 | return False, text 152 | 153 | def get_reponse(input): 154 | msg = [ 155 | {"role": "user", "content": input} 156 | ] 157 | 158 | # Call the OpenAI API with the prompt 159 | response = client.chat.completions.create( 160 | model="gpt-3.5-turbo", # Adjust based on available engine versions 161 | messages=msg, 162 | temperature=0 163 | ) 164 | # Extract and return the text from the API response 165 | return response.choices[0].message.content 166 | 167 | 168 | def 
generateSound(inputString, id, model_id): 169 | if '--escape' in sys.argv: 170 | escape = True 171 | else: 172 | escape = False 173 | 174 | #model = input('0: Chinese') 175 | #config = input('Path of a config file: ') 176 | if model_id == 0: 177 | model = chinese_model_path 178 | config = chinese_config_path 179 | elif model_id == 1: 180 | model = japanese_model_path 181 | config = japanese_config_path 182 | 183 | 184 | hps_ms = utils.get_hparams_from_file(config) 185 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0 186 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0 187 | emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False 188 | 189 | net_g_ms = SynthesizerTrn( 190 | n_symbols, 191 | hps_ms.data.filter_length // 2 + 1, 192 | hps_ms.train.segment_size // hps_ms.data.hop_length, 193 | n_speakers=n_speakers, 194 | emotion_embedding=emotion_embedding, 195 | **hps_ms.model) 196 | _ = net_g_ms.eval() 197 | utils.load_checkpoint(model, net_g_ms) 198 | 199 | if n_symbols != 0: 200 | if not emotion_embedding: 201 | #while True: 202 | if(1 == 1): 203 | choice = 't' 204 | if choice == 't': 205 | text = inputString 206 | if text == '[ADVANCED]': 207 | text = "我不会说" 208 | 209 | length_scale, text = get_label_value( 210 | text, 'LENGTH', 1, 'length scale') 211 | noise_scale, text = get_label_value( 212 | text, 'NOISE', 0.667, 'noise scale') 213 | noise_scale_w, text = get_label_value( 214 | text, 'NOISEW', 0.8, 'deviation of noise') 215 | cleaned, text = get_label(text, 'CLEANED') 216 | 217 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 218 | 219 | speaker_id = id 220 | out_path = "output.wav" 221 | 222 | with no_grad(): 223 | x_tst = stn_tst.unsqueeze(0) 224 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 225 | sid = LongTensor([speaker_id]) 226 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 227 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy() 228 | 229 | write(out_path, hps_ms.data.sampling_rate, audio) 230 | if __name__ == "__main__": 231 | # Set OpenAI API key 232 | api_key = get_token() 233 | print() 234 | client = OpenAI(api_key=api_key, timeout=600) 235 | model_id = -1 236 | while True: 237 | print(modelmessage) 238 | model_id = int(get_model_id('选择回复语言: ')) 239 | if model_id == 0 or model_id == 1: 240 | break 241 | else: 242 | print(str(model_id) + ' is not a valid ID!\n') 243 | print() 244 | 245 | speaker_id = -1 246 | while True: 247 | if model_id == 0: 248 | print("\n" + idmessage_cn) 249 | elif model_id == 1: 250 | print("\n" + idmessage_jp) 251 | 252 | speaker_id = get_speaker_id('选择角色: ') 253 | if (model_id == 0 and speaker_id in list(range(4))) or (model_id == 1 and speaker_id in list(range(7))): 254 | break 255 | else: 256 | print(str(speaker_id) + ' is not a valid ID!\n') 257 | print() 258 | 259 | while True: 260 | if model_id == 0: 261 | usr_in = get_input() 262 | 263 | if(usr_in == "quit()"): 264 | break 265 | resp = get_reponse(usr_in) 266 | print("ChatGPT:") 267 | answer = resp.replace('\n','') 268 | generateSound("[ZH]"+answer+"[ZH]", speaker_id, model_id) 269 | print(answer) 270 | PlaySound(r'./output.wav', flags=1) 271 | elif model_id == 1: 272 | usr_in = get_input_jp() 273 | if(usr_in == "quit()"): 274 | break 275 | resp = get_reponse(usr_in) 276 | print("ChatGPT:") 277 | answer = resp.replace('\n','') 278 | generateSound(answer, speaker_id, model_id) 279 | print(answer) 280 | 
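# Play the synthesized reply without blocking; flags=1 corresponds to winsound.SND_ASYNC.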
PlaySound(r'./output.wav', flags=1) -------------------------------------------------------------------------------- /ChatWaifuCN.py: -------------------------------------------------------------------------------- 1 | from scipy.io.wavfile import write 2 | from mel_processing import spectrogram_torch 3 | from text import text_to_sequence, _clean_text 4 | from models import SynthesizerTrn 5 | import utils 6 | import commons 7 | import sys 8 | import re 9 | from torch import no_grad, LongTensor 10 | import logging 11 | from winsound import PlaySound 12 | 13 | #################################### 14 | #CHATGPT INITIALIZE 15 | from pyChatGPT import ChatGPT 16 | import json 17 | idmessage = """ID Speaker 18 | 0 綾地寧々 19 | 1 在原七海 20 | 2 小茸 21 | 3 唐乐吟 22 | """ 23 | speakerID = 0 24 | 25 | def get_input(): 26 | # prompt for input 27 | print("You:") 28 | user_input = input() 29 | return user_input 30 | 31 | def get_token(): 32 | token = input("Copy your token from ChatGPT and press Enter \n") 33 | return token; 34 | 35 | 36 | ################################################ 37 | 38 | 39 | logging.getLogger('numba').setLevel(logging.WARNING) 40 | 41 | 42 | def ex_print(text, escape=False): 43 | if escape: 44 | print(text.encode('unicode_escape').decode()) 45 | else: 46 | print(text) 47 | 48 | 49 | def get_text(text, hps, cleaned=False): 50 | if cleaned: 51 | text_norm = text_to_sequence(text, hps.symbols, []) 52 | else: 53 | text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners) 54 | if hps.data.add_blank: 55 | text_norm = commons.intersperse(text_norm, 0) 56 | text_norm = LongTensor(text_norm) 57 | return text_norm 58 | 59 | 60 | def ask_if_continue(): 61 | while True: 62 | answer = input('Continue? (y/n): ') 63 | if answer == 'y': 64 | break 65 | elif answer == 'n': 66 | sys.exit(0) 67 | 68 | 69 | def print_speakers(speakers, escape=False): 70 | if len(speakers) > 100: 71 | return 72 | print('ID\tSpeaker') 73 | for id, name in enumerate(speakers): 74 | ex_print(str(id) + '\t' + name, escape) 75 | 76 | 77 | def get_speaker_id(message): 78 | speaker_id = input(message) 79 | try: 80 | speaker_id = int(speaker_id) 81 | except: 82 | print(str(speaker_id) + ' is not a valid ID!') 83 | sys.exit(1) 84 | return speaker_id 85 | 86 | 87 | def get_label_value(text, label, default, warning_name='value'): 88 | value = re.search(rf'\[{label}=(.+?)\]', text) 89 | if value: 90 | try: 91 | text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1) 92 | value = float(value.group(1)) 93 | except: 94 | print(f'Invalid {warning_name}!') 95 | sys.exit(1) 96 | else: 97 | value = default 98 | return value, text 99 | 100 | 101 | def get_label(text, label): 102 | if f'[{label}]' in text: 103 | return True, text.replace(f'[{label}]', '') 104 | else: 105 | return False, text 106 | 107 | 108 | 109 | def generateSound(inputString): 110 | if '--escape' in sys.argv: 111 | escape = True 112 | else: 113 | escape = False 114 | 115 | #model = input('Path of a VITS model: ') 116 | #config = input('Path of a config file: ') 117 | model = r".\model\CN\model.pth" 118 | config = r".\model\CN\config.json" 119 | 120 | 121 | hps_ms = utils.get_hparams_from_file(config) 122 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0 123 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0 124 | speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0'] 125 | use_f0 = hps_ms.data.use_f0 if 'use_f0' in hps_ms.data.keys() else False 126 | emotion_embedding = hps_ms.data.emotion_embedding 
if 'emotion_embedding' in hps_ms.data.keys() else False 127 | 128 | net_g_ms = SynthesizerTrn( 129 | n_symbols, 130 | hps_ms.data.filter_length // 2 + 1, 131 | hps_ms.train.segment_size // hps_ms.data.hop_length, 132 | n_speakers=n_speakers, 133 | emotion_embedding=emotion_embedding, 134 | **hps_ms.model) 135 | _ = net_g_ms.eval() 136 | utils.load_checkpoint(model, net_g_ms) 137 | 138 | def voice_conversion(): 139 | audio_path = input('Path of an audio file to convert:\n') 140 | print_speakers(speakers) 141 | audio = utils.load_audio_to_torch( 142 | audio_path, hps_ms.data.sampling_rate) 143 | 144 | originnal_id = get_speaker_id('Original speaker ID: ') 145 | target_id = get_speaker_id('Target speaker ID: ') 146 | out_path = input('Path to save: ') 147 | 148 | y = audio.unsqueeze(0) 149 | 150 | spec = spectrogram_torch(y, hps_ms.data.filter_length, 151 | hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length, 152 | center=False) 153 | spec_lengths = LongTensor([spec.size(-1)]) 154 | sid_src = LongTensor([originnal_id]) 155 | 156 | with no_grad(): 157 | sid_tgt = LongTensor([target_id]) 158 | audio = net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[ 159 | 0][0, 0].data.cpu().float().numpy() 160 | return audio, out_path 161 | 162 | if n_symbols != 0: 163 | if not emotion_embedding: 164 | #while True: 165 | if(1==1): 166 | #choice = input('TTS or VC? (t/v):') 167 | choice = 't' 168 | if choice == 't': 169 | #text = input('Text to read: ') 170 | text = inputString 171 | if text == '[ADVANCED]': 172 | #text = input('Raw text:') 173 | text = "我不会说" 174 | #print('Cleaned text is:') 175 | #ex_print(_clean_text( 176 | # text, hps_ms.data.text_cleaners), escape) 177 | #continue 178 | 179 | length_scale, text = get_label_value( 180 | text, 'LENGTH', 1, 'length scale') 181 | noise_scale, text = get_label_value( 182 | text, 'NOISE', 0.667, 'noise scale') 183 | noise_scale_w, text = get_label_value( 184 | text, 'NOISEW', 0.8, 'deviation of noise') 185 | cleaned, text = get_label(text, 'CLEANED') 186 | 187 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 188 | 189 | #print_speakers(speakers, escape) 190 | #speaker_id = get_speaker_id('Speaker ID: ') 191 | speaker_id = speakerID 192 | #out_path = input('Path to save: ') 193 | out_path = "output.wav" 194 | 195 | with no_grad(): 196 | x_tst = stn_tst.unsqueeze(0) 197 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 198 | sid = LongTensor([speaker_id]) 199 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 200 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy() 201 | 202 | elif choice == 'v': 203 | audio, out_path = voice_conversion() 204 | 205 | write(out_path, hps_ms.data.sampling_rate, audio) 206 | print('Successfully saved!') 207 | #ask_if_continue() 208 | else: 209 | import os 210 | import librosa 211 | import numpy as np 212 | from torch import FloatTensor 213 | import audonnx 214 | w2v2_folder = input('Path of a w2v2 dimensional emotion model: ') 215 | w2v2_model = audonnx.load(os.path.dirname(w2v2_folder)) 216 | #while True: 217 | if(1==1): 218 | #choice = input('TTS or VC? 
(t/v):') 219 | choice = 't' 220 | if choice == 't': 221 | #text = input('Text to read: ') 222 | text = inputString 223 | if text == '[ADVANCED]': 224 | #text = input('Raw text:') 225 | text = "我不会说" 226 | #print('Cleaned text is:') 227 | #ex_print(_clean_text( 228 | # text, hps_ms.data.text_cleaners), escape) 229 | #continue 230 | 231 | length_scale, text = get_label_value( 232 | text, 'LENGTH', 1, 'length scale') 233 | noise_scale, text = get_label_value( 234 | text, 'NOISE', 0.667, 'noise scale') 235 | noise_scale_w, text = get_label_value( 236 | text, 'NOISEW', 0.8, 'deviation of noise') 237 | cleaned, text = get_label(text, 'CLEANED') 238 | 239 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 240 | 241 | #print_speakers(speakers, escape) 242 | #speaker_id = get_speaker_id('Speaker ID: ') 243 | speaker_id = speakerID 244 | 245 | emotion_reference = input('Path of an emotion reference: ') 246 | if emotion_reference.endswith('.npy'): 247 | emotion = np.load(emotion_reference) 248 | emotion = FloatTensor(emotion).unsqueeze(0) 249 | else: 250 | audio16000, sampling_rate = librosa.load( 251 | emotion_reference, sr=16000, mono=True) 252 | emotion = w2v2_model(audio16000, sampling_rate)[ 253 | 'hidden_states'] 254 | emotion_reference = re.sub( 255 | r'\..*$', '', emotion_reference) 256 | np.save(emotion_reference, emotion.squeeze(0)) 257 | emotion = FloatTensor(emotion) 258 | 259 | #out_path = input('Path to save: ') 260 | out_path = "output.wav" 261 | 262 | with no_grad(): 263 | x_tst = stn_tst.unsqueeze(0) 264 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 265 | sid = LongTensor([speaker_id]) 266 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, 267 | length_scale=length_scale, emotion_embedding=emotion)[0][0, 0].data.cpu().float().numpy() 268 | 269 | elif choice == 'v': 270 | audio, out_path = voice_conversion() 271 | 272 | write(out_path, hps_ms.data.sampling_rate, audio) 273 | print('Successfully saved!') 274 | #ask_if_continue() 275 | else: 276 | model = input('Path of a hubert-soft model: ') 277 | from hubert_model import hubert_soft 278 | hubert = hubert_soft(model) 279 | 280 | while True: 281 | audio_path = input('Path of an audio file to convert:\n') 282 | 283 | if audio_path != '[VC]': 284 | import librosa 285 | if use_f0: 286 | audio, sampling_rate = librosa.load( 287 | audio_path, sr=hps_ms.data.sampling_rate, mono=True) 288 | audio16000 = librosa.resample( 289 | audio, orig_sr=sampling_rate, target_sr=16000) 290 | else: 291 | audio16000, sampling_rate = librosa.load( 292 | audio_path, sr=16000, mono=True) 293 | 294 | #print_speakers(speakers, escape) 295 | target_id = get_speaker_id('Target speaker ID: ') 296 | out_path = input('Path to save: ') 297 | length_scale, out_path = get_label_value( 298 | out_path, 'LENGTH', 1, 'length scale') 299 | noise_scale, out_path = get_label_value( 300 | out_path, 'NOISE', 0.1, 'noise scale') 301 | noise_scale_w, out_path = get_label_value( 302 | out_path, 'NOISEW', 0.1, 'deviation of noise') 303 | 304 | from torch import inference_mode, FloatTensor 305 | import numpy as np 306 | with inference_mode(): 307 | units = hubert.units(FloatTensor(audio16000).unsqueeze( 308 | 0).unsqueeze(0)).squeeze(0).numpy() 309 | if use_f0: 310 | f0_scale, out_path = get_label_value( 311 | out_path, 'F0', 1, 'f0 scale') 312 | f0 = librosa.pyin(audio, sr=sampling_rate, 313 | fmin=librosa.note_to_hz('C0'), 314 | fmax=librosa.note_to_hz('C7'), 315 | frame_length=1780)[0] 316 | target_length = len(units[:, 
0]) 317 | f0 = np.nan_to_num(np.interp(np.arange(0, len(f0)*target_length, len(f0))/target_length, 318 | np.arange(0, len(f0)), f0)) * f0_scale 319 | units[:, 0] = f0 / 10 320 | 321 | stn_tst = FloatTensor(units) 322 | with no_grad(): 323 | x_tst = stn_tst.unsqueeze(0) 324 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 325 | sid = LongTensor([target_id]) 326 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 327 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy() 328 | 329 | else: 330 | audio, out_path = voice_conversion() 331 | 332 | write(out_path, hps_ms.data.sampling_rate, audio) 333 | print('Successfully saved!') 334 | #ask_if_continue() 335 | 336 | if __name__ == "__main__": 337 | session_token = get_token() 338 | api = ChatGPT(session_token) 339 | print(idmessage) 340 | peaker_id = input() 341 | while True: 342 | resp = api.send_message(get_input()) 343 | answer = resp["message"].replace('\n','') 344 | print("ChatGPT:") 345 | print(answer) 346 | generateSound("[ZH]"+answer+"[ZH]") 347 | PlaySound(r'.\output.wav', flags=1) 348 | 349 | -------------------------------------------------------------------------------- /ChatWaifuJP.py: -------------------------------------------------------------------------------- 1 | from scipy.io.wavfile import write 2 | from mel_processing import spectrogram_torch 3 | from text import text_to_sequence, _clean_text 4 | from models import SynthesizerTrn 5 | import utils 6 | import commons 7 | import sys 8 | import re 9 | from torch import no_grad, LongTensor 10 | import logging 11 | from winsound import PlaySound 12 | 13 | #################################### 14 | #CHATGPT INITIALIZE 15 | from pyChatGPT import ChatGPT 16 | import json 17 | idmessage = """ID Speaker 18 | 0 綾地寧々 19 | 1 因幡めぐる 20 | 2 朝武芳乃 21 | 3 常陸茉子 22 | 4 ムラサメ 23 | 5 鞍馬小春 24 | 6 在原七海 25 | """ 26 | speakerID = 0 27 | 28 | def get_input(): 29 | # prompt for input 30 | print("You:") 31 | user_input = input() +" 使用日本语" 32 | return user_input 33 | 34 | def get_token(): 35 | token = input("Copy your token from ChatGPT and press Enter \n") 36 | return token; 37 | 38 | 39 | ################################################ 40 | 41 | 42 | logging.getLogger('numba').setLevel(logging.WARNING) 43 | 44 | 45 | def ex_print(text, escape=False): 46 | if escape: 47 | print(text.encode('unicode_escape').decode()) 48 | else: 49 | print(text) 50 | 51 | 52 | def get_text(text, hps, cleaned=False): 53 | if cleaned: 54 | text_norm = text_to_sequence(text, hps.symbols, []) 55 | else: 56 | text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners) 57 | if hps.data.add_blank: 58 | text_norm = commons.intersperse(text_norm, 0) 59 | text_norm = LongTensor(text_norm) 60 | return text_norm 61 | 62 | 63 | def ask_if_continue(): 64 | while True: 65 | answer = input('Continue? 
(y/n): ') 66 | if answer == 'y': 67 | break 68 | elif answer == 'n': 69 | sys.exit(0) 70 | 71 | 72 | def print_speakers(speakers, escape=False): 73 | if len(speakers) > 100: 74 | return 75 | print('ID\tSpeaker') 76 | for id, name in enumerate(speakers): 77 | ex_print(str(id) + '\t' + name, escape) 78 | 79 | 80 | def get_speaker_id(message): 81 | speaker_id = input(message) 82 | try: 83 | speaker_id = int(speaker_id) 84 | except: 85 | print(str(speaker_id) + ' is not a valid ID!') 86 | sys.exit(1) 87 | return speaker_id 88 | 89 | 90 | def get_label_value(text, label, default, warning_name='value'): 91 | value = re.search(rf'\[{label}=(.+?)\]', text) 92 | if value: 93 | try: 94 | text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1) 95 | value = float(value.group(1)) 96 | except: 97 | print(f'Invalid {warning_name}!') 98 | sys.exit(1) 99 | else: 100 | value = default 101 | return value, text 102 | 103 | 104 | def get_label(text, label): 105 | if f'[{label}]' in text: 106 | return True, text.replace(f'[{label}]', '') 107 | else: 108 | return False, text 109 | 110 | 111 | 112 | def generateSound(inputString): 113 | if '--escape' in sys.argv: 114 | escape = True 115 | else: 116 | escape = False 117 | 118 | #model = input('Path of a VITS model: ') 119 | #config = input('Path of a config file: ') 120 | model = r".\model\H_excluded.pth" 121 | config = r".\model\config.json" 122 | 123 | 124 | hps_ms = utils.get_hparams_from_file(config) 125 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0 126 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0 127 | speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0'] 128 | use_f0 = hps_ms.data.use_f0 if 'use_f0' in hps_ms.data.keys() else False 129 | emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False 130 | 131 | net_g_ms = SynthesizerTrn( 132 | n_symbols, 133 | hps_ms.data.filter_length // 2 + 1, 134 | hps_ms.train.segment_size // hps_ms.data.hop_length, 135 | n_speakers=n_speakers, 136 | emotion_embedding=emotion_embedding, 137 | **hps_ms.model) 138 | _ = net_g_ms.eval() 139 | utils.load_checkpoint(model, net_g_ms) 140 | 141 | def voice_conversion(): 142 | audio_path = input('Path of an audio file to convert:\n') 143 | print_speakers(speakers) 144 | audio = utils.load_audio_to_torch( 145 | audio_path, hps_ms.data.sampling_rate) 146 | 147 | originnal_id = get_speaker_id('Original speaker ID: ') 148 | target_id = get_speaker_id('Target speaker ID: ') 149 | out_path = input('Path to save: ') 150 | 151 | y = audio.unsqueeze(0) 152 | 153 | spec = spectrogram_torch(y, hps_ms.data.filter_length, 154 | hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length, 155 | center=False) 156 | spec_lengths = LongTensor([spec.size(-1)]) 157 | sid_src = LongTensor([originnal_id]) 158 | 159 | with no_grad(): 160 | sid_tgt = LongTensor([target_id]) 161 | audio = net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[ 162 | 0][0, 0].data.cpu().float().numpy() 163 | return audio, out_path 164 | 165 | if n_symbols != 0: 166 | if not emotion_embedding: 167 | #while True: 168 | if(1==1): 169 | #choice = input('TTS or VC? 
(t/v):') 170 | choice = 't' 171 | if choice == 't': 172 | #text = input('Text to read: ') 173 | text = inputString 174 | if text == '[ADVANCED]': 175 | #text = input('Raw text:') 176 | text = "我不会说" 177 | #print('Cleaned text is:') 178 | #ex_print(_clean_text( 179 | # text, hps_ms.data.text_cleaners), escape) 180 | #continue 181 | 182 | length_scale, text = get_label_value( 183 | text, 'LENGTH', 1, 'length scale') 184 | noise_scale, text = get_label_value( 185 | text, 'NOISE', 0.667, 'noise scale') 186 | noise_scale_w, text = get_label_value( 187 | text, 'NOISEW', 0.8, 'deviation of noise') 188 | cleaned, text = get_label(text, 'CLEANED') 189 | 190 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 191 | 192 | #print_speakers(speakers, escape) 193 | #speaker_id = get_speaker_id('Speaker ID: ') 194 | speaker_id = speakerID 195 | #out_path = input('Path to save: ') 196 | out_path = "output.wav" 197 | 198 | with no_grad(): 199 | x_tst = stn_tst.unsqueeze(0) 200 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 201 | sid = LongTensor([speaker_id]) 202 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 203 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy() 204 | 205 | elif choice == 'v': 206 | audio, out_path = voice_conversion() 207 | 208 | write(out_path, hps_ms.data.sampling_rate, audio) 209 | print('Successfully saved!') 210 | #ask_if_continue() 211 | else: 212 | import os 213 | import librosa 214 | import numpy as np 215 | from torch import FloatTensor 216 | import audonnx 217 | w2v2_folder = input('Path of a w2v2 dimensional emotion model: ') 218 | w2v2_model = audonnx.load(os.path.dirname(w2v2_folder)) 219 | #while True: 220 | if(1==1): 221 | #choice = input('TTS or VC? (t/v):') 222 | choice = 't' 223 | if choice == 't': 224 | #text = input('Text to read: ') 225 | text = inputString 226 | if text == '[ADVANCED]': 227 | #text = input('Raw text:') 228 | text = "我不会说" 229 | #print('Cleaned text is:') 230 | #ex_print(_clean_text( 231 | # text, hps_ms.data.text_cleaners), escape) 232 | #continue 233 | 234 | length_scale, text = get_label_value( 235 | text, 'LENGTH', 1, 'length scale') 236 | noise_scale, text = get_label_value( 237 | text, 'NOISE', 0.667, 'noise scale') 238 | noise_scale_w, text = get_label_value( 239 | text, 'NOISEW', 0.8, 'deviation of noise') 240 | cleaned, text = get_label(text, 'CLEANED') 241 | 242 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 243 | 244 | #print_speakers(speakers, escape) 245 | #speaker_id = get_speaker_id('Speaker ID: ') 246 | speaker_id = speakerID 247 | 248 | emotion_reference = input('Path of an emotion reference: ') 249 | if emotion_reference.endswith('.npy'): 250 | emotion = np.load(emotion_reference) 251 | emotion = FloatTensor(emotion).unsqueeze(0) 252 | else: 253 | audio16000, sampling_rate = librosa.load( 254 | emotion_reference, sr=16000, mono=True) 255 | emotion = w2v2_model(audio16000, sampling_rate)[ 256 | 'hidden_states'] 257 | emotion_reference = re.sub( 258 | r'\..*$', '', emotion_reference) 259 | np.save(emotion_reference, emotion.squeeze(0)) 260 | emotion = FloatTensor(emotion) 261 | 262 | #out_path = input('Path to save: ') 263 | out_path = "output.wav" 264 | 265 | with no_grad(): 266 | x_tst = stn_tst.unsqueeze(0) 267 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 268 | sid = LongTensor([speaker_id]) 269 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, 270 | length_scale=length_scale, 
emotion_embedding=emotion)[0][0, 0].data.cpu().float().numpy() 271 | 272 | elif choice == 'v': 273 | audio, out_path = voice_conversion() 274 | 275 | write(out_path, hps_ms.data.sampling_rate, audio) 276 | print('Successfully saved!') 277 | #ask_if_continue() 278 | else: 279 | model = input('Path of a hubert-soft model: ') 280 | from hubert_model import hubert_soft 281 | hubert = hubert_soft(model) 282 | 283 | while True: 284 | audio_path = input('Path of an audio file to convert:\n') 285 | 286 | if audio_path != '[VC]': 287 | import librosa 288 | if use_f0: 289 | audio, sampling_rate = librosa.load( 290 | audio_path, sr=hps_ms.data.sampling_rate, mono=True) 291 | audio16000 = librosa.resample( 292 | audio, orig_sr=sampling_rate, target_sr=16000) 293 | else: 294 | audio16000, sampling_rate = librosa.load( 295 | audio_path, sr=16000, mono=True) 296 | 297 | #print_speakers(speakers, escape) 298 | target_id = get_speaker_id('Target speaker ID: ') 299 | out_path = input('Path to save: ') 300 | length_scale, out_path = get_label_value( 301 | out_path, 'LENGTH', 1, 'length scale') 302 | noise_scale, out_path = get_label_value( 303 | out_path, 'NOISE', 0.1, 'noise scale') 304 | noise_scale_w, out_path = get_label_value( 305 | out_path, 'NOISEW', 0.1, 'deviation of noise') 306 | 307 | from torch import inference_mode, FloatTensor 308 | import numpy as np 309 | with inference_mode(): 310 | units = hubert.units(FloatTensor(audio16000).unsqueeze( 311 | 0).unsqueeze(0)).squeeze(0).numpy() 312 | if use_f0: 313 | f0_scale, out_path = get_label_value( 314 | out_path, 'F0', 1, 'f0 scale') 315 | f0 = librosa.pyin(audio, sr=sampling_rate, 316 | fmin=librosa.note_to_hz('C0'), 317 | fmax=librosa.note_to_hz('C7'), 318 | frame_length=1780)[0] 319 | target_length = len(units[:, 0]) 320 | f0 = np.nan_to_num(np.interp(np.arange(0, len(f0)*target_length, len(f0))/target_length, 321 | np.arange(0, len(f0)), f0)) * f0_scale 322 | units[:, 0] = f0 / 10 323 | 324 | stn_tst = FloatTensor(units) 325 | with no_grad(): 326 | x_tst = stn_tst.unsqueeze(0) 327 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 328 | sid = LongTensor([target_id]) 329 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 330 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy() 331 | 332 | else: 333 | audio, out_path = voice_conversion() 334 | 335 | write(out_path, hps_ms.data.sampling_rate, audio) 336 | print('Successfully saved!') 337 | #ask_if_continue() 338 | 339 | if __name__ == "__main__": 340 | session_token = get_token() 341 | api = ChatGPT(session_token) 342 | print(idmessage) 343 | peaker_id = input() 344 | while True: 345 | resp = api.send_message(get_input()) 346 | answer = resp["message"].replace('\n','') 347 | print("ChatGPT:") 348 | print(answer) 349 | generateSound(answer) 350 | PlaySound(r'.\output.wav', flags=1) 351 | 352 | -------------------------------------------------------------------------------- /ChatWaifuJPVoiceEN.py: -------------------------------------------------------------------------------- 1 | from scipy.io.wavfile import write 2 | from mel_processing import spectrogram_torch 3 | from text import text_to_sequence, _clean_text 4 | from models import SynthesizerTrn 5 | import utils 6 | import commons 7 | import sys 8 | import re 9 | from torch import no_grad, LongTensor 10 | import logging 11 | from winsound import PlaySound 12 | import argparse 13 | import queue 14 | import sounddevice as sd 15 | from vosk import Model, KaldiRecognizer 16 | 17 | q = 
queue.Queue() 18 | def int_or_str(text): 19 | """Helper function for argument parsing.""" 20 | try: 21 | return int(text) 22 | except ValueError: 23 | return text 24 | 25 | 26 | def callback(indata, frames, time, status): 27 | """This is called (from a separate thread) for each audio block.""" 28 | if status: 29 | print(status, file=sys.stderr) 30 | q.put(bytes(indata)) 31 | 32 | 33 | parser = argparse.ArgumentParser(add_help=False) 34 | parser.add_argument( 35 | "-l", "--list-devices", action="store_true", 36 | help="show list of audio devices and exit") 37 | args, remaining = parser.parse_known_args() 38 | if args.list_devices: 39 | parser.exit(0) 40 | parser = argparse.ArgumentParser( 41 | description=__doc__, 42 | formatter_class=argparse.RawDescriptionHelpFormatter, 43 | parents=[parser]) 44 | parser.add_argument( 45 | "-f", "--filename", type=str, metavar="FILENAME", 46 | help="audio file to store recording to") 47 | parser.add_argument( 48 | "-d", "--device", type=int_or_str, 49 | help="input device (numeric ID or substring)") 50 | parser.add_argument( 51 | "-r", "--samplerate", type=int, help="sampling rate") 52 | parser.add_argument( 53 | "-m", "--model", type=str, help="language model; e.g. en-us, fr, nl; default is en-us") 54 | args = parser.parse_args(remaining) 55 | try: 56 | if args.samplerate is None: 57 | device_info = sd.query_devices(args.device, "input") 58 | # soundfile expects an int, sounddevice provides a float: 59 | args.samplerate = int(device_info["default_samplerate"]) 60 | 61 | if args.model is None: 62 | model = Model(lang="en-us") 63 | else: 64 | model = Model(lang=args.model) 65 | 66 | if args.filename: 67 | dump_fn = open(args.filename, "wb") 68 | else: 69 | dump_fn = None 70 | 71 | 72 | 73 | except KeyboardInterrupt: 74 | print("\nDone") 75 | parser.exit(0) 76 | 77 | #################################### 78 | #CHATGPT INITIALIZE 79 | from pyChatGPT import ChatGPT 80 | import json 81 | idmessage = """ID Speaker 82 | 0 綾地寧々 83 | 1 因幡めぐる 84 | 2 朝武芳乃 85 | 3 常陸茉子 86 | 4 ムラサメ 87 | 5 鞍馬小春 88 | 6 在原七海 89 | """ 90 | speakerID = 0 91 | 92 | def voice_input(): 93 | print("You:") 94 | with sd.RawInputStream(samplerate=args.samplerate, blocksize=8000, device=args.device, 95 | dtype="int16", channels=1, callback=callback): 96 | 97 | rec = KaldiRecognizer(model, args.samplerate) 98 | while True: 99 | data = q.get() 100 | if rec.AcceptWaveform(data): 101 | a = json.loads(rec.Result()) 102 | a = str(a['text']) 103 | a = ''.join(a.split()) 104 | if(len(a) > 0): 105 | print(a) 106 | user_input = a + " 使用日本语" 107 | return user_input 108 | if dump_fn is not None: 109 | dump_fn.write(data) 110 | 111 | 112 | def get_token(): 113 | token = input("Copy your token from ChatGPT and press Enter \n") 114 | return token; 115 | 116 | 117 | ################################################ 118 | 119 | 120 | logging.getLogger('numba').setLevel(logging.WARNING) 121 | 122 | 123 | def ex_print(text, escape=False): 124 | if escape: 125 | print(text.encode('unicode_escape').decode()) 126 | else: 127 | print(text) 128 | 129 | 130 | def get_text(text, hps, cleaned=False): 131 | if cleaned: 132 | text_norm = text_to_sequence(text, hps.symbols, []) 133 | else: 134 | text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners) 135 | if hps.data.add_blank: 136 | text_norm = commons.intersperse(text_norm, 0) 137 | text_norm = LongTensor(text_norm) 138 | return text_norm 139 | 140 | 141 | def ask_if_continue(): 142 | while True: 143 | answer = input('Continue? 
(y/n): ') 144 | if answer == 'y': 145 | break 146 | elif answer == 'n': 147 | sys.exit(0) 148 | 149 | 150 | def print_speakers(speakers, escape=False): 151 | if len(speakers) > 100: 152 | return 153 | print('ID\tSpeaker') 154 | for id, name in enumerate(speakers): 155 | ex_print(str(id) + '\t' + name, escape) 156 | 157 | 158 | def get_speaker_id(message): 159 | speaker_id = input(message) 160 | try: 161 | speaker_id = int(speaker_id) 162 | except: 163 | print(str(speaker_id) + ' is not a valid ID!') 164 | sys.exit(1) 165 | return speaker_id 166 | 167 | 168 | def get_label_value(text, label, default, warning_name='value'): 169 | value = re.search(rf'\[{label}=(.+?)\]', text) 170 | if value: 171 | try: 172 | text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1) 173 | value = float(value.group(1)) 174 | except: 175 | print(f'Invalid {warning_name}!') 176 | sys.exit(1) 177 | else: 178 | value = default 179 | return value, text 180 | 181 | 182 | def get_label(text, label): 183 | if f'[{label}]' in text: 184 | return True, text.replace(f'[{label}]', '') 185 | else: 186 | return False, text 187 | 188 | 189 | 190 | def generateSound(inputString): 191 | if '--escape' in sys.argv: 192 | escape = True 193 | else: 194 | escape = False 195 | 196 | #model = input('Path of a VITS model: ') 197 | #config = input('Path of a config file: ') 198 | model = r".\model\H_excluded.pth" 199 | config = r".\model\config.json" 200 | 201 | 202 | hps_ms = utils.get_hparams_from_file(config) 203 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0 204 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0 205 | speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0'] 206 | use_f0 = hps_ms.data.use_f0 if 'use_f0' in hps_ms.data.keys() else False 207 | emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False 208 | 209 | net_g_ms = SynthesizerTrn( 210 | n_symbols, 211 | hps_ms.data.filter_length // 2 + 1, 212 | hps_ms.train.segment_size // hps_ms.data.hop_length, 213 | n_speakers=n_speakers, 214 | emotion_embedding=emotion_embedding, 215 | **hps_ms.model) 216 | _ = net_g_ms.eval() 217 | utils.load_checkpoint(model, net_g_ms) 218 | 219 | def voice_conversion(): 220 | audio_path = input('Path of an audio file to convert:\n') 221 | print_speakers(speakers) 222 | audio = utils.load_audio_to_torch( 223 | audio_path, hps_ms.data.sampling_rate) 224 | 225 | originnal_id = get_speaker_id('Original speaker ID: ') 226 | target_id = get_speaker_id('Target speaker ID: ') 227 | out_path = input('Path to save: ') 228 | 229 | y = audio.unsqueeze(0) 230 | 231 | spec = spectrogram_torch(y, hps_ms.data.filter_length, 232 | hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length, 233 | center=False) 234 | spec_lengths = LongTensor([spec.size(-1)]) 235 | sid_src = LongTensor([originnal_id]) 236 | 237 | with no_grad(): 238 | sid_tgt = LongTensor([target_id]) 239 | audio = net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[ 240 | 0][0, 0].data.cpu().float().numpy() 241 | return audio, out_path 242 | 243 | if n_symbols != 0: 244 | if not emotion_embedding: 245 | #while True: 246 | if(1==1): 247 | #choice = input('TTS or VC? 
(t/v):') 248 | choice = 't' 249 | if choice == 't': 250 | #text = input('Text to read: ') 251 | text = inputString 252 | if text == '[ADVANCED]': 253 | #text = input('Raw text:') 254 | text = "我不会说" 255 | #print('Cleaned text is:') 256 | #ex_print(_clean_text( 257 | # text, hps_ms.data.text_cleaners), escape) 258 | #continue 259 | 260 | length_scale, text = get_label_value( 261 | text, 'LENGTH', 1, 'length scale') 262 | noise_scale, text = get_label_value( 263 | text, 'NOISE', 0.667, 'noise scale') 264 | noise_scale_w, text = get_label_value( 265 | text, 'NOISEW', 0.8, 'deviation of noise') 266 | cleaned, text = get_label(text, 'CLEANED') 267 | 268 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 269 | 270 | #print_speakers(speakers, escape) 271 | #speaker_id = get_speaker_id('Speaker ID: ') 272 | speaker_id = speakerID 273 | #out_path = input('Path to save: ') 274 | out_path = "output.wav" 275 | 276 | with no_grad(): 277 | x_tst = stn_tst.unsqueeze(0) 278 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 279 | sid = LongTensor([speaker_id]) 280 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 281 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy() 282 | 283 | elif choice == 'v': 284 | audio, out_path = voice_conversion() 285 | 286 | write(out_path, hps_ms.data.sampling_rate, audio) 287 | #print('Successfully saved!') 288 | #ask_if_continue() 289 | else: 290 | import os 291 | import librosa 292 | import numpy as np 293 | from torch import FloatTensor 294 | import audonnx 295 | w2v2_folder = input('Path of a w2v2 dimensional emotion model: ') 296 | w2v2_model = audonnx.load(os.path.dirname(w2v2_folder)) 297 | #while True: 298 | if(1==1): 299 | #choice = input('TTS or VC? (t/v):') 300 | choice = 't' 301 | if choice == 't': 302 | #text = input('Text to read: ') 303 | text = inputString 304 | if text == '[ADVANCED]': 305 | #text = input('Raw text:') 306 | text = "我不会说" 307 | #print('Cleaned text is:') 308 | #ex_print(_clean_text( 309 | # text, hps_ms.data.text_cleaners), escape) 310 | #continue 311 | 312 | length_scale, text = get_label_value( 313 | text, 'LENGTH', 1, 'length scale') 314 | noise_scale, text = get_label_value( 315 | text, 'NOISE', 0.667, 'noise scale') 316 | noise_scale_w, text = get_label_value( 317 | text, 'NOISEW', 0.8, 'deviation of noise') 318 | cleaned, text = get_label(text, 'CLEANED') 319 | 320 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 321 | 322 | #print_speakers(speakers, escape) 323 | #speaker_id = get_speaker_id('Speaker ID: ') 324 | speaker_id = speakerID 325 | 326 | emotion_reference = input('Path of an emotion reference: ') 327 | if emotion_reference.endswith('.npy'): 328 | emotion = np.load(emotion_reference) 329 | emotion = FloatTensor(emotion).unsqueeze(0) 330 | else: 331 | audio16000, sampling_rate = librosa.load( 332 | emotion_reference, sr=16000, mono=True) 333 | emotion = w2v2_model(audio16000, sampling_rate)[ 334 | 'hidden_states'] 335 | emotion_reference = re.sub( 336 | r'\..*$', '', emotion_reference) 337 | np.save(emotion_reference, emotion.squeeze(0)) 338 | emotion = FloatTensor(emotion) 339 | 340 | #out_path = input('Path to save: ') 341 | out_path = "output.wav" 342 | 343 | with no_grad(): 344 | x_tst = stn_tst.unsqueeze(0) 345 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 346 | sid = LongTensor([speaker_id]) 347 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, 348 | length_scale=length_scale, 
emotion_embedding=emotion)[0][0, 0].data.cpu().float().numpy() 349 | 350 | elif choice == 'v': 351 | audio, out_path = voice_conversion() 352 | 353 | write(out_path, hps_ms.data.sampling_rate, audio) 354 | #print('Successfully saved!') 355 | #ask_if_continue() 356 | else: 357 | model = input('Path of a hubert-soft model: ') 358 | from hubert_model import hubert_soft 359 | hubert = hubert_soft(model) 360 | 361 | while True: 362 | audio_path = input('Path of an audio file to convert:\n') 363 | 364 | if audio_path != '[VC]': 365 | import librosa 366 | if use_f0: 367 | audio, sampling_rate = librosa.load( 368 | audio_path, sr=hps_ms.data.sampling_rate, mono=True) 369 | audio16000 = librosa.resample( 370 | audio, orig_sr=sampling_rate, target_sr=16000) 371 | else: 372 | audio16000, sampling_rate = librosa.load( 373 | audio_path, sr=16000, mono=True) 374 | 375 | #print_speakers(speakers, escape) 376 | target_id = get_speaker_id('Target speaker ID: ') 377 | out_path = input('Path to save: ') 378 | length_scale, out_path = get_label_value( 379 | out_path, 'LENGTH', 1, 'length scale') 380 | noise_scale, out_path = get_label_value( 381 | out_path, 'NOISE', 0.1, 'noise scale') 382 | noise_scale_w, out_path = get_label_value( 383 | out_path, 'NOISEW', 0.1, 'deviation of noise') 384 | 385 | from torch import inference_mode, FloatTensor 386 | import numpy as np 387 | with inference_mode(): 388 | units = hubert.units(FloatTensor(audio16000).unsqueeze( 389 | 0).unsqueeze(0)).squeeze(0).numpy() 390 | if use_f0: 391 | f0_scale, out_path = get_label_value( 392 | out_path, 'F0', 1, 'f0 scale') 393 | f0 = librosa.pyin(audio, sr=sampling_rate, 394 | fmin=librosa.note_to_hz('C0'), 395 | fmax=librosa.note_to_hz('C7'), 396 | frame_length=1780)[0] 397 | target_length = len(units[:, 0]) 398 | f0 = np.nan_to_num(np.interp(np.arange(0, len(f0)*target_length, len(f0))/target_length, 399 | np.arange(0, len(f0)), f0)) * f0_scale 400 | units[:, 0] = f0 / 10 401 | 402 | stn_tst = FloatTensor(units) 403 | with no_grad(): 404 | x_tst = stn_tst.unsqueeze(0) 405 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 406 | sid = LongTensor([target_id]) 407 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 408 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy() 409 | 410 | else: 411 | audio, out_path = voice_conversion() 412 | 413 | write(out_path, hps_ms.data.sampling_rate, audio) 414 | #print('Successfully saved!') 415 | #ask_if_continue() 416 | 417 | if __name__ == "__main__": 418 | session_token = get_token() 419 | api = ChatGPT(session_token) 420 | print(idmessage) 421 | peaker_id = input() 422 | while True: 423 | resp = api.send_message(voice_input()) 424 | answer = resp["message"].replace('\n','') 425 | print("ChatGPT:") 426 | print(answer) 427 | generateSound(answer) 428 | PlaySound(r'.\output.wav', flags=0) 429 | 430 | -------------------------------------------------------------------------------- /ChatWaifuJPVoiceJP.py: -------------------------------------------------------------------------------- 1 | from scipy.io.wavfile import write 2 | from mel_processing import spectrogram_torch 3 | from text import text_to_sequence, _clean_text 4 | from models import SynthesizerTrn 5 | import utils 6 | import commons 7 | import sys 8 | import re 9 | from torch import no_grad, LongTensor 10 | import logging 11 | from winsound import PlaySound 12 | import argparse 13 | import queue 14 | import sounddevice as sd 15 | from vosk import Model, KaldiRecognizer 16 | 17 | q = 
queue.Queue() 18 | def int_or_str(text): 19 | """Helper function for argument parsing.""" 20 | try: 21 | return int(text) 22 | except ValueError: 23 | return text 24 | 25 | 26 | def callback(indata, frames, time, status): 27 | """This is called (from a separate thread) for each audio block.""" 28 | if status: 29 | print(status, file=sys.stderr) 30 | q.put(bytes(indata)) 31 | 32 | 33 | parser = argparse.ArgumentParser(add_help=False) 34 | parser.add_argument( 35 | "-l", "--list-devices", action="store_true", 36 | help="show list of audio devices and exit") 37 | args, remaining = parser.parse_known_args() 38 | if args.list_devices: 39 | parser.exit(0) 40 | parser = argparse.ArgumentParser( 41 | description=__doc__, 42 | formatter_class=argparse.RawDescriptionHelpFormatter, 43 | parents=[parser]) 44 | parser.add_argument( 45 | "-f", "--filename", type=str, metavar="FILENAME", 46 | help="audio file to store recording to") 47 | parser.add_argument( 48 | "-d", "--device", type=int_or_str, 49 | help="input device (numeric ID or substring)") 50 | parser.add_argument( 51 | "-r", "--samplerate", type=int, help="sampling rate") 52 | parser.add_argument( 53 | "-m", "--model", type=str, help="language model; e.g. en-us, fr, nl; default is en-us") 54 | args = parser.parse_args(remaining) 55 | try: 56 | if args.samplerate is None: 57 | device_info = sd.query_devices(args.device, "input") 58 | # soundfile expects an int, sounddevice provides a float: 59 | args.samplerate = int(device_info["default_samplerate"]) 60 | 61 | if args.model is None: 62 | model = Model(lang="ja") 63 | else: 64 | model = Model(lang=args.model) 65 | 66 | if args.filename: 67 | dump_fn = open(args.filename, "wb") 68 | else: 69 | dump_fn = None 70 | 71 | 72 | 73 | except KeyboardInterrupt: 74 | print("\nDone") 75 | parser.exit(0) 76 | 77 | #################################### 78 | #CHATGPT INITIALIZE 79 | from pyChatGPT import ChatGPT 80 | import json 81 | idmessage = """ID Speaker 82 | 0 綾地寧々 83 | 1 因幡めぐる 84 | 2 朝武芳乃 85 | 3 常陸茉子 86 | 4 ムラサメ 87 | 5 鞍馬小春 88 | 6 在原七海 89 | """ 90 | speakerID = 0 91 | 92 | def voice_input(): 93 | print("You:") 94 | with sd.RawInputStream(samplerate=args.samplerate, blocksize=8000, device=args.device, 95 | dtype="int16", channels=1, callback=callback): 96 | 97 | rec = KaldiRecognizer(model, args.samplerate) 98 | while True: 99 | data = q.get() 100 | if rec.AcceptWaveform(data): 101 | a = json.loads(rec.Result()) 102 | a = str(a['text']) 103 | a = ''.join(a.split()) 104 | if(len(a) > 0): 105 | print(a) 106 | user_input = a + " 使用日本语" 107 | return user_input 108 | if dump_fn is not None: 109 | dump_fn.write(data) 110 | 111 | 112 | def get_token(): 113 | token = input("Copy your token from ChatGPT and press Enter \n") 114 | return token; 115 | 116 | 117 | ################################################ 118 | 119 | 120 | logging.getLogger('numba').setLevel(logging.WARNING) 121 | 122 | 123 | def ex_print(text, escape=False): 124 | if escape: 125 | print(text.encode('unicode_escape').decode()) 126 | else: 127 | print(text) 128 | 129 | 130 | def get_text(text, hps, cleaned=False): 131 | if cleaned: 132 | text_norm = text_to_sequence(text, hps.symbols, []) 133 | else: 134 | text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners) 135 | if hps.data.add_blank: 136 | text_norm = commons.intersperse(text_norm, 0) 137 | text_norm = LongTensor(text_norm) 138 | return text_norm 139 | 140 | 141 | def ask_if_continue(): 142 | while True: 143 | answer = input('Continue? 
(y/n): ') 144 | if answer == 'y': 145 | break 146 | elif answer == 'n': 147 | sys.exit(0) 148 | 149 | 150 | def print_speakers(speakers, escape=False): 151 | if len(speakers) > 100: 152 | return 153 | print('ID\tSpeaker') 154 | for id, name in enumerate(speakers): 155 | ex_print(str(id) + '\t' + name, escape) 156 | 157 | 158 | def get_speaker_id(message): 159 | speaker_id = input(message) 160 | try: 161 | speaker_id = int(speaker_id) 162 | except: 163 | print(str(speaker_id) + ' is not a valid ID!') 164 | sys.exit(1) 165 | return speaker_id 166 | 167 | 168 | def get_label_value(text, label, default, warning_name='value'): 169 | value = re.search(rf'\[{label}=(.+?)\]', text) 170 | if value: 171 | try: 172 | text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1) 173 | value = float(value.group(1)) 174 | except: 175 | print(f'Invalid {warning_name}!') 176 | sys.exit(1) 177 | else: 178 | value = default 179 | return value, text 180 | 181 | 182 | def get_label(text, label): 183 | if f'[{label}]' in text: 184 | return True, text.replace(f'[{label}]', '') 185 | else: 186 | return False, text 187 | 188 | 189 | 190 | def generateSound(inputString): 191 | if '--escape' in sys.argv: 192 | escape = True 193 | else: 194 | escape = False 195 | 196 | #model = input('Path of a VITS model: ') 197 | #config = input('Path of a config file: ') 198 | model = r".\model\H_excluded.pth" 199 | config = r".\model\config.json" 200 | 201 | 202 | hps_ms = utils.get_hparams_from_file(config) 203 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0 204 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0 205 | speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0'] 206 | use_f0 = hps_ms.data.use_f0 if 'use_f0' in hps_ms.data.keys() else False 207 | emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False 208 | 209 | net_g_ms = SynthesizerTrn( 210 | n_symbols, 211 | hps_ms.data.filter_length // 2 + 1, 212 | hps_ms.train.segment_size // hps_ms.data.hop_length, 213 | n_speakers=n_speakers, 214 | emotion_embedding=emotion_embedding, 215 | **hps_ms.model) 216 | _ = net_g_ms.eval() 217 | utils.load_checkpoint(model, net_g_ms) 218 | 219 | def voice_conversion(): 220 | audio_path = input('Path of an audio file to convert:\n') 221 | print_speakers(speakers) 222 | audio = utils.load_audio_to_torch( 223 | audio_path, hps_ms.data.sampling_rate) 224 | 225 | originnal_id = get_speaker_id('Original speaker ID: ') 226 | target_id = get_speaker_id('Target speaker ID: ') 227 | out_path = input('Path to save: ') 228 | 229 | y = audio.unsqueeze(0) 230 | 231 | spec = spectrogram_torch(y, hps_ms.data.filter_length, 232 | hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length, 233 | center=False) 234 | spec_lengths = LongTensor([spec.size(-1)]) 235 | sid_src = LongTensor([originnal_id]) 236 | 237 | with no_grad(): 238 | sid_tgt = LongTensor([target_id]) 239 | audio = net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[ 240 | 0][0, 0].data.cpu().float().numpy() 241 | return audio, out_path 242 | 243 | if n_symbols != 0: 244 | if not emotion_embedding: 245 | #while True: 246 | if(1==1): 247 | #choice = input('TTS or VC? 
(t/v):') 248 | choice = 't' 249 | if choice == 't': 250 | #text = input('Text to read: ') 251 | text = inputString 252 | if text == '[ADVANCED]': 253 | #text = input('Raw text:') 254 | text = "我不会说" 255 | #print('Cleaned text is:') 256 | #ex_print(_clean_text( 257 | # text, hps_ms.data.text_cleaners), escape) 258 | #continue 259 | 260 | length_scale, text = get_label_value( 261 | text, 'LENGTH', 1, 'length scale') 262 | noise_scale, text = get_label_value( 263 | text, 'NOISE', 0.667, 'noise scale') 264 | noise_scale_w, text = get_label_value( 265 | text, 'NOISEW', 0.8, 'deviation of noise') 266 | cleaned, text = get_label(text, 'CLEANED') 267 | 268 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 269 | 270 | #print_speakers(speakers, escape) 271 | #speaker_id = get_speaker_id('Speaker ID: ') 272 | speaker_id = speakerID 273 | #out_path = input('Path to save: ') 274 | out_path = "output.wav" 275 | 276 | with no_grad(): 277 | x_tst = stn_tst.unsqueeze(0) 278 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 279 | sid = LongTensor([speaker_id]) 280 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 281 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy() 282 | 283 | elif choice == 'v': 284 | audio, out_path = voice_conversion() 285 | 286 | write(out_path, hps_ms.data.sampling_rate, audio) 287 | #print('Successfully saved!') 288 | #ask_if_continue() 289 | else: 290 | import os 291 | import librosa 292 | import numpy as np 293 | from torch import FloatTensor 294 | import audonnx 295 | w2v2_folder = input('Path of a w2v2 dimensional emotion model: ') 296 | w2v2_model = audonnx.load(os.path.dirname(w2v2_folder)) 297 | #while True: 298 | if(1==1): 299 | #choice = input('TTS or VC? (t/v):') 300 | choice = 't' 301 | if choice == 't': 302 | #text = input('Text to read: ') 303 | text = inputString 304 | if text == '[ADVANCED]': 305 | #text = input('Raw text:') 306 | text = "我不会说" 307 | #print('Cleaned text is:') 308 | #ex_print(_clean_text( 309 | # text, hps_ms.data.text_cleaners), escape) 310 | #continue 311 | 312 | length_scale, text = get_label_value( 313 | text, 'LENGTH', 1, 'length scale') 314 | noise_scale, text = get_label_value( 315 | text, 'NOISE', 0.667, 'noise scale') 316 | noise_scale_w, text = get_label_value( 317 | text, 'NOISEW', 0.8, 'deviation of noise') 318 | cleaned, text = get_label(text, 'CLEANED') 319 | 320 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 321 | 322 | #print_speakers(speakers, escape) 323 | #speaker_id = get_speaker_id('Speaker ID: ') 324 | speaker_id = speakerID 325 | 326 | emotion_reference = input('Path of an emotion reference: ') 327 | if emotion_reference.endswith('.npy'): 328 | emotion = np.load(emotion_reference) 329 | emotion = FloatTensor(emotion).unsqueeze(0) 330 | else: 331 | audio16000, sampling_rate = librosa.load( 332 | emotion_reference, sr=16000, mono=True) 333 | emotion = w2v2_model(audio16000, sampling_rate)[ 334 | 'hidden_states'] 335 | emotion_reference = re.sub( 336 | r'\..*$', '', emotion_reference) 337 | np.save(emotion_reference, emotion.squeeze(0)) 338 | emotion = FloatTensor(emotion) 339 | 340 | #out_path = input('Path to save: ') 341 | out_path = "output.wav" 342 | 343 | with no_grad(): 344 | x_tst = stn_tst.unsqueeze(0) 345 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 346 | sid = LongTensor([speaker_id]) 347 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, 348 | length_scale=length_scale, 
emotion_embedding=emotion)[0][0, 0].data.cpu().float().numpy() 349 | 350 | elif choice == 'v': 351 | audio, out_path = voice_conversion() 352 | 353 | write(out_path, hps_ms.data.sampling_rate, audio) 354 | #print('Successfully saved!') 355 | #ask_if_continue() 356 | else: 357 | model = input('Path of a hubert-soft model: ') 358 | from hubert_model import hubert_soft 359 | hubert = hubert_soft(model) 360 | 361 | while True: 362 | audio_path = input('Path of an audio file to convert:\n') 363 | 364 | if audio_path != '[VC]': 365 | import librosa 366 | if use_f0: 367 | audio, sampling_rate = librosa.load( 368 | audio_path, sr=hps_ms.data.sampling_rate, mono=True) 369 | audio16000 = librosa.resample( 370 | audio, orig_sr=sampling_rate, target_sr=16000) 371 | else: 372 | audio16000, sampling_rate = librosa.load( 373 | audio_path, sr=16000, mono=True) 374 | 375 | #print_speakers(speakers, escape) 376 | target_id = get_speaker_id('Target speaker ID: ') 377 | out_path = input('Path to save: ') 378 | length_scale, out_path = get_label_value( 379 | out_path, 'LENGTH', 1, 'length scale') 380 | noise_scale, out_path = get_label_value( 381 | out_path, 'NOISE', 0.1, 'noise scale') 382 | noise_scale_w, out_path = get_label_value( 383 | out_path, 'NOISEW', 0.1, 'deviation of noise') 384 | 385 | from torch import inference_mode, FloatTensor 386 | import numpy as np 387 | with inference_mode(): 388 | units = hubert.units(FloatTensor(audio16000).unsqueeze( 389 | 0).unsqueeze(0)).squeeze(0).numpy() 390 | if use_f0: 391 | f0_scale, out_path = get_label_value( 392 | out_path, 'F0', 1, 'f0 scale') 393 | f0 = librosa.pyin(audio, sr=sampling_rate, 394 | fmin=librosa.note_to_hz('C0'), 395 | fmax=librosa.note_to_hz('C7'), 396 | frame_length=1780)[0] 397 | target_length = len(units[:, 0]) 398 | f0 = np.nan_to_num(np.interp(np.arange(0, len(f0)*target_length, len(f0))/target_length, 399 | np.arange(0, len(f0)), f0)) * f0_scale 400 | units[:, 0] = f0 / 10 401 | 402 | stn_tst = FloatTensor(units) 403 | with no_grad(): 404 | x_tst = stn_tst.unsqueeze(0) 405 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 406 | sid = LongTensor([target_id]) 407 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 408 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy() 409 | 410 | else: 411 | audio, out_path = voice_conversion() 412 | 413 | write(out_path, hps_ms.data.sampling_rate, audio) 414 | #print('Successfully saved!') 415 | #ask_if_continue() 416 | 417 | if __name__ == "__main__": 418 | session_token = get_token() 419 | api = ChatGPT(session_token) 420 | print(idmessage) 421 | peaker_id = input() 422 | while True: 423 | resp = api.send_message(voice_input()) 424 | answer = resp["message"].replace('\n','') 425 | print("ChatGPT:") 426 | print(answer) 427 | generateSound(answer) 428 | PlaySound(r'.\output.wav', flags=0) 429 | 430 | -------------------------------------------------------------------------------- /ChatWaifuVoice.py: -------------------------------------------------------------------------------- 1 | from scipy.io.wavfile import write 2 | from mel_processing import spectrogram_torch 3 | from text import text_to_sequence, _clean_text 4 | from models import SynthesizerTrn 5 | import utils 6 | import commons 7 | import sys 8 | import re 9 | from torch import no_grad, LongTensor 10 | import logging 11 | from winsound import PlaySound 12 | import argparse 13 | import queue 14 | import sounddevice as sd 15 | from vosk import Model, KaldiRecognizer 16 | 17 | 
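# Default locations of the pretrained VITS checkpoints and configs; the downloaded
# models are expected under .\model as described in step 2 of the README.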
chinese_model_path = ".\model\CN\model.pth" 18 | chinese_config_path = ".\model\CN\config.json" 19 | japanese_model_path = ".\model\H_excluded.pth" 20 | japanese_config_path = ".\model\config.json" 21 | 22 | q = queue.Queue() 23 | def int_or_str(text): 24 | """Helper function for argument parsing.""" 25 | try: 26 | return int(text) 27 | except ValueError: 28 | return text 29 | 30 | 31 | def callback(indata, frames, time, status): 32 | """This is called (from a separate thread) for each audio block.""" 33 | if status: 34 | print(status, file=sys.stderr) 35 | q.put(bytes(indata)) 36 | 37 | 38 | parser = argparse.ArgumentParser(add_help=False) 39 | parser.add_argument( 40 | "-l", "--list-devices", action="store_true", 41 | help="show list of audio devices and exit") 42 | args, remaining = parser.parse_known_args() 43 | if args.list_devices: 44 | parser.exit(0) 45 | parser = argparse.ArgumentParser( 46 | description=__doc__, 47 | formatter_class=argparse.RawDescriptionHelpFormatter, 48 | parents=[parser]) 49 | parser.add_argument( 50 | "-f", "--filename", type=str, metavar="FILENAME", 51 | help="audio file to store recording to") 52 | parser.add_argument( 53 | "-d", "--device", type=int_or_str, 54 | help="input device (numeric ID or substring)") 55 | parser.add_argument( 56 | "-r", "--samplerate", type=int, help="sampling rate") 57 | parser.add_argument( 58 | "-m", "--model", type=str, help="language model; e.g. en-us, fr, nl; default is en-us") 59 | args = parser.parse_args(remaining) 60 | try: 61 | if args.samplerate is None: 62 | device_info = sd.query_devices(args.device, "input") 63 | # soundfile expects an int, sounddevice provides a float: 64 | args.samplerate = int(device_info["default_samplerate"]) 65 | 66 | if args.model is None: 67 | model = Model(lang="en-us") 68 | else: 69 | model = Model(lang=args.model) 70 | 71 | if args.filename: 72 | dump_fn = open(args.filename, "wb") 73 | else: 74 | dump_fn = None 75 | 76 | 77 | 78 | except KeyboardInterrupt: 79 | print("\nDone") 80 | parser.exit(0) 81 | 82 | #################################### 83 | #CHATGPT INITIALIZE 84 | from pyChatGPT import ChatGPT 85 | import json 86 | 87 | modelmessage = """ID Output Language 88 | 0 Chinese 89 | 1 Japanese 90 | """ 91 | 92 | idmessage_cn = """ID Speaker 93 | 0 綾地寧々 94 | 1 在原七海 95 | 2 小茸 96 | 3 唐乐吟 97 | """ 98 | 99 | idmessage_jp = """ID Speaker 100 | 0 綾地寧々 101 | 1 因幡めぐる 102 | 2 朝武芳乃 103 | 3 常陸茉子 104 | 4 ムラサメ 105 | 5 鞍馬小春 106 | 6 在原七海 107 | """ 108 | 109 | inputLanguage = """ID Input Language 110 | 0 Chinese 111 | 1 Japanese 112 | 2 English 113 | """ 114 | 115 | def voice_input_jp(): 116 | model = Model(lang="cn") 117 | print("You:") 118 | with sd.RawInputStream(samplerate=args.samplerate, blocksize=8000, device=args.device, 119 | dtype="int16", channels=1, callback=callback): 120 | 121 | rec = KaldiRecognizer(model, args.samplerate) 122 | while True: 123 | data = q.get() 124 | if rec.AcceptWaveform(data): 125 | a = json.loads(rec.Result()) 126 | a = str(a['text']) 127 | a = ''.join(a.split()) 128 | if(len(a) > 0): 129 | print(a) 130 | user_input = a + " 使用日本语" 131 | return user_input 132 | if dump_fn is not None: 133 | dump_fn.write(data) 134 | 135 | def voice_input_cn(): 136 | model = Model(lang="cn") 137 | print("You:") 138 | with sd.RawInputStream(samplerate=args.samplerate, blocksize=8000, device=args.device, 139 | dtype="int16", channels=1, callback=callback): 140 | 141 | rec = KaldiRecognizer(model, args.samplerate) 142 | while True: 143 | data = q.get() 144 | if rec.AcceptWaveform(data): 145 
| a = json.loads(rec.Result()) 146 | a = str(a['text']) 147 | a = ''.join(a.split()) 148 | if(len(a) > 0): 149 | print(a) 150 | user_input = a 151 | return user_input 152 | if dump_fn is not None: 153 | dump_fn.write(data) 154 | 155 | def voice_input_jpjp(): 156 | model = Model(lang="ja") 157 | print("You:") 158 | with sd.RawInputStream(samplerate=args.samplerate, blocksize=8000, device=args.device, 159 | dtype="int16", channels=1, callback=callback): 160 | 161 | rec = KaldiRecognizer(model, args.samplerate) 162 | while True: 163 | data = q.get() 164 | if rec.AcceptWaveform(data): 165 | a = json.loads(rec.Result()) 166 | a = str(a['text']) 167 | a = ''.join(a.split()) 168 | if(len(a) > 0): 169 | print(a) 170 | user_input = a + " 使用日本语" 171 | return user_input 172 | if dump_fn is not None: 173 | dump_fn.write(data) 174 | 175 | def voice_input_enjp(): 176 | model = Model(lang="en-us") 177 | print("You:") 178 | with sd.RawInputStream(samplerate=args.samplerate, blocksize=8000, device=args.device, 179 | dtype="int16", channels=1, callback=callback): 180 | 181 | rec = KaldiRecognizer(model, args.samplerate) 182 | while True: 183 | data = q.get() 184 | if rec.AcceptWaveform(data): 185 | a = json.loads(rec.Result()) 186 | a = str(a['text']) 187 | a = ''.join(a.split()) 188 | if(len(a) > 0): 189 | print(a) 190 | user_input = a + " 使用日本语" 191 | return user_input 192 | if dump_fn is not None: 193 | dump_fn.write(data) 194 | 195 | 196 | def get_token(): 197 | token = input("Copy your token from ChatGPT and press Enter \n") 198 | return token 199 | 200 | 201 | ################################################ 202 | logging.getLogger('numba').setLevel(logging.WARNING) 203 | 204 | def get_text(text, hps, cleaned=False): 205 | if cleaned: 206 | text_norm = text_to_sequence(text, hps.symbols, []) 207 | else: 208 | text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners) 209 | if hps.data.add_blank: 210 | text_norm = commons.intersperse(text_norm, 0) 211 | text_norm = LongTensor(text_norm) 212 | return text_norm 213 | 214 | def get_speaker_id(message): 215 | speaker_id = input(message) 216 | try: 217 | speaker_id = int(speaker_id) 218 | except: 219 | print(str(speaker_id) + ' is not a valid ID!') 220 | sys.exit(1) 221 | return speaker_id 222 | 223 | def get_model_id(message): 224 | speaker_id = input(message) 225 | try: 226 | speaker_id = int(speaker_id) 227 | except: 228 | print(str(speaker_id) + ' is not a valid ID!') 229 | sys.exit(1) 230 | return speaker_id 231 | 232 | def get_language_id(message): 233 | speaker_id = input(message) 234 | try: 235 | speaker_id = int(speaker_id) 236 | except: 237 | print(str(speaker_id) + ' is not a valid ID!') 238 | sys.exit(1) 239 | return speaker_id 240 | 241 | 242 | def get_label_value(text, label, default, warning_name='value'): 243 | value = re.search(rf'\[{label}=(.+?)\]', text) 244 | if value: 245 | try: 246 | text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1) 247 | value = float(value.group(1)) 248 | except: 249 | print(f'Invalid {warning_name}!') 250 | sys.exit(1) 251 | else: 252 | value = default 253 | return value, text 254 | 255 | 256 | def get_label(text, label): 257 | if f'[{label}]' in text: 258 | return True, text.replace(f'[{label}]', '') 259 | else: 260 | return False, text 261 | 262 | 263 | def generateSound(inputString, id, model_id): 264 | if '--escape' in sys.argv: 265 | escape = True 266 | else: 267 | escape = False 268 | 269 | #model = input('0: Chinese') 270 | #config = input('Path of a config file: ') 271 | if model_id == 0: 272 | 
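# model_id 0 selects the Chinese VITS model, model_id 1 the Japanese one
# (see modelmessage above).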
model = chinese_model_path 273 | config = chinese_config_path 274 | elif model_id == 1: 275 | model = japanese_model_path 276 | config = japanese_config_path 277 | 278 | 279 | hps_ms = utils.get_hparams_from_file(config) 280 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0 281 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0 282 | emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False 283 | 284 | net_g_ms = SynthesizerTrn( 285 | n_symbols, 286 | hps_ms.data.filter_length // 2 + 1, 287 | hps_ms.train.segment_size // hps_ms.data.hop_length, 288 | n_speakers=n_speakers, 289 | emotion_embedding=emotion_embedding, 290 | **hps_ms.model) 291 | _ = net_g_ms.eval() 292 | utils.load_checkpoint(model, net_g_ms) 293 | 294 | if n_symbols != 0: 295 | if not emotion_embedding: 296 | #while True: 297 | if(1 == 1): 298 | choice = 't' 299 | if choice == 't': 300 | text = inputString 301 | if text == '[ADVANCED]': 302 | text = "我不会说" 303 | 304 | length_scale, text = get_label_value( 305 | text, 'LENGTH', 1, 'length scale') 306 | noise_scale, text = get_label_value( 307 | text, 'NOISE', 0.667, 'noise scale') 308 | noise_scale_w, text = get_label_value( 309 | text, 'NOISEW', 0.8, 'deviation of noise') 310 | cleaned, text = get_label(text, 'CLEANED') 311 | 312 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 313 | 314 | speaker_id = id 315 | out_path = "output.wav" 316 | 317 | with no_grad(): 318 | x_tst = stn_tst.unsqueeze(0) 319 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 320 | sid = LongTensor([speaker_id]) 321 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 322 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy() 323 | 324 | write(out_path, hps_ms.data.sampling_rate, audio) 325 | print('Successfully saved!') 326 | 327 | if __name__ == "__main__": 328 | session_token = get_token() 329 | api = ChatGPT(session_token) 330 | 331 | print(inputLanguage) 332 | language_id = get_language_id("选择输入语言:") 333 | if language_id == 0: #cn 334 | print(modelmessage) 335 | model_id = get_model_id('选择回复语言: ') 336 | if model_id == 0: 337 | print("\n" + idmessage_cn) 338 | id = get_speaker_id('选择角色: ') 339 | elif model_id == 1: 340 | print("\n" + idmessage_jp) 341 | id = get_speaker_id('选择角色: ') 342 | elif language_id == 1: #jp 343 | model_id = 1 344 | print("\n" + idmessage_jp) 345 | id = get_speaker_id('选择角色: ') 346 | elif language_id == 2: #en 347 | model_id = 1 348 | print("\n" + idmessage_cn) 349 | id = get_speaker_id('选择角色: ') 350 | 351 | print() 352 | while True: 353 | 354 | if language_id == 0 and model_id == 0: #input=cn output=cn 355 | resp = api.send_message(voice_input_cn()) 356 | if(resp == "quit()"): 357 | break 358 | answer = resp["message"].replace('\n','') 359 | print("ChatGPT:") 360 | print(answer) 361 | generateSound("[ZH]"+answer+"[ZH]", id, model_id) 362 | PlaySound(r'.\output.wav', flags=1) 363 | elif language_id == 0 and model_id == 1: #input=cn output=jp 364 | resp = api.send_message(voice_input_jp()) 365 | if(resp == "quit()"): 366 | break 367 | answer = resp["message"].replace('\n','') 368 | print("ChatGPT:") 369 | print(answer) 370 | generateSound(answer, id, model_id) 371 | PlaySound(r'.\output.wav', flags=1) 372 | elif language_id == 1: #input=jp output=jp 373 | resp = api.send_message(voice_input_jpjp()) 374 | answer = resp["message"].replace('\n','') 375 | print("ChatGPT:") 376 | print(answer) 377 | 
generateSound(answer, id, model_id) 378 | PlaySound(r'.\output.wav', flags=0) 379 | elif language_id == 2: #input=en output=jp 380 | resp = api.send_message(voice_input_enjp()) 381 | answer = resp["message"].replace('\n','') 382 | print("ChatGPT:") 383 | print(answer) 384 | generateSound(answer, id, model_id) 385 | PlaySound(r'.\output.wav', flags=0) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 CjangCjengh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |  2 | 3 | [中文](README.md "中文") [English](eng-README.md "English") [日本語](jp-README.md "日本語") 4 | 5 |
10 | 11 | # 12 | 13 | ### 这是一个使用TTS+VITS的ChatGPT语音对话程序! 14 | 15 | 效果演示BiliBIli:[《青春猪头少年不会梦见赛博女友》](https://www.bilibili.com/video/BV1rv4y1Q7eT "BiliBili") 16 | 17 | **当前支持功能:** 18 | * [x] ChatGPT的对话聊天 19 | * [x] 回答转语音 20 | * [x] 多角色语音 21 | * [x] 语音识别对话 (研发了一款真正人性化的智能语音Q宝 22 | * [x] [对接Marai机器人](https://github.com/MuBai-He/ChatWaifu-marai) 23 | * [x] [对接Live2D的UI版本](https://github.com/cjyaddone/ChatWaifuL2D) 24 | * [x] [使用gpt3官方api,并支持cuda加速的版本(当前仅源码](https://github.com/cjyaddone/ChatWaifu-API) 25 | 26 | 27 | 28 | # 目录 29 | ### 本项目均默认使用Chrome浏览器 30 | * [1.安装环境:](#1.) 31 | * 1.1 [使用cd命令进入项目文件夹](#cd) 32 | * 1.2 [创建Python虚拟环境:](#99) 33 | * 1.3 [进入创建好的虚拟环境:](#venv) 34 | * 1.4 [pip安装项目所需要的库文件:](#pip) 35 | * [2.导入模型到根目录model文件夹(如果没有自行创建):](#.model) 36 | * 2.1 [双击导入model](#cd1) 37 | * [3.运行(快和我的老婆们对话吧:](#22) 38 | * 3.1 [获取ChatGPT Token](#333) 39 | * 3.2 [开始和CyberWaifu聊天](#444) 40 | * [4.鸣谢](#915) 41 | ## 1.安装环境: 42 | > **安装anaconda环境或Python>=3.7** 43 | > 44 | > **本例使用的环境名称是:chatWaifu** 45 | 46 | ### 1.1 使用cd命令进入项目文件夹 47 | `cd 你的项目路径` 48 |  49 | ### 1.2 创建Python虚拟环境: 50 | 51 | Conda:`conda create --name chatWaifu python=3.10` 52 |  53 |  54 | Python:`python -m venv chatWaifu` 55 |  56 | 57 | ### 1.3 进入创建好的虚拟环境: 58 | Conda:`conda activate chatWaifu` 59 |  60 | 61 | Python:`.\chatWaifu\Scripts\activate.bat` 62 |  63 | 64 | ### 1.4 pip安装项目所需要的库文件: 65 | `pip install -r requirements.txt` 66 |  67 | 68 | ## 2.导入模型到根目录model文件夹: 69 | Google Drive:https://drive.google.com/file/d/1tMCafhnUoL7FbevVQ44VQi-WznDjt23_/view?usp=sharing 70 | 71 | 阿里云盘: https://www.aliyundrive.com/s/9JEj1mp1ZRv 提取码: m2y3 72 | 73 | ### 2.1移动到项目根目录下双击导入model 74 | 75 | ## 3.运行(快和老婆们对话吧: 76 | 77 | 打字输入版:`python ChatWaifu.py` 78 | 79 | 语音对话版(日语和英语输入默认日语输出):`python ChatWaifuVoice.py` 80 | 81 | ### 以下也可以使用,每个文件只对应一种语音输出输入模式 82 | 83 | 打字日语版:`python ChatWaifuJP.py` 84 | 85 | 打字中文版:`python ChatWaifuCN.py` 86 | 87 | 日语语音对话版(使用中文):`python ChatWaifuJPVoice.py` 88 | 89 | 中文语音对话版(使用中文):`python ChatWaifuCNVoice.py` 90 | 91 | 日文语音对话版(使用英文):`python ChatWaifuJPVoiceEN.py` 92 | 93 | 日语语音对话版(使用日文):`python ChatWaifuJPVoiceJP.py` 94 | 95 | ### 3.1 获取ChatGPT Token 96 | #### 在浏览器登入https://chat.openai.com 97 | #### 按F12进入开发控制台 98 | #### 找到 应用程序 -> cookie -> __Secure-next-auth.session-token 99 |  100 | #### 将值复制进入终端并回车 101 | 102 | ### 3.2 开始和CyberWaifu聊天!!! 
103 | 104 | **语音对话版:** 当控制台提示"You:"时开始说话,说完并出现句子录音结束并发送到ChatGPT对话。 105 | 106 | 附赠:[ChatGPT 中文调教指南](https://github.com/PlexPt/awesome-chatgpt-prompts-zh) 107 | 108 | ## 4.鸣谢: 109 | - [MoeGoe_GUI]https://github.com/CjangCjengh/MoeGoe_GUI 110 | - [Pretrained models]https://github.com/CjangCjengh/TTSModels 111 | - [PyChatGPT]https://github.com/terry3041/pyChatGPT 112 | -------------------------------------------------------------------------------- /attentions.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import commons 7 | from modules import LayerNorm 8 | 9 | 10 | class Encoder(nn.Module): 11 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs): 12 | super().__init__() 13 | self.hidden_channels = hidden_channels 14 | self.filter_channels = filter_channels 15 | self.n_heads = n_heads 16 | self.n_layers = n_layers 17 | self.kernel_size = kernel_size 18 | self.p_dropout = p_dropout 19 | self.window_size = window_size 20 | 21 | self.drop = nn.Dropout(p_dropout) 22 | self.attn_layers = nn.ModuleList() 23 | self.norm_layers_1 = nn.ModuleList() 24 | self.ffn_layers = nn.ModuleList() 25 | self.norm_layers_2 = nn.ModuleList() 26 | for i in range(self.n_layers): 27 | self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size)) 28 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 29 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)) 30 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 31 | 32 | def forward(self, x, x_mask): 33 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 34 | x = x * x_mask 35 | for i in range(self.n_layers): 36 | y = self.attn_layers[i](x, x, attn_mask) 37 | y = self.drop(y) 38 | x = self.norm_layers_1[i](x + y) 39 | 40 | y = self.ffn_layers[i](x, x_mask) 41 | y = self.drop(y) 42 | x = self.norm_layers_2[i](x + y) 43 | x = x * x_mask 44 | return x 45 | 46 | 47 | class Decoder(nn.Module): 48 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs): 49 | super().__init__() 50 | self.hidden_channels = hidden_channels 51 | self.filter_channels = filter_channels 52 | self.n_heads = n_heads 53 | self.n_layers = n_layers 54 | self.kernel_size = kernel_size 55 | self.p_dropout = p_dropout 56 | self.proximal_bias = proximal_bias 57 | self.proximal_init = proximal_init 58 | 59 | self.drop = nn.Dropout(p_dropout) 60 | self.self_attn_layers = nn.ModuleList() 61 | self.norm_layers_0 = nn.ModuleList() 62 | self.encdec_attn_layers = nn.ModuleList() 63 | self.norm_layers_1 = nn.ModuleList() 64 | self.ffn_layers = nn.ModuleList() 65 | self.norm_layers_2 = nn.ModuleList() 66 | for i in range(self.n_layers): 67 | self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)) 68 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 69 | self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)) 70 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 71 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, 
causal=True)) 72 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 73 | 74 | def forward(self, x, x_mask, h, h_mask): 75 | """ 76 | x: decoder input 77 | h: encoder output 78 | """ 79 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) 80 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 81 | x = x * x_mask 82 | for i in range(self.n_layers): 83 | y = self.self_attn_layers[i](x, x, self_attn_mask) 84 | y = self.drop(y) 85 | x = self.norm_layers_0[i](x + y) 86 | 87 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) 88 | y = self.drop(y) 89 | x = self.norm_layers_1[i](x + y) 90 | 91 | y = self.ffn_layers[i](x, x_mask) 92 | y = self.drop(y) 93 | x = self.norm_layers_2[i](x + y) 94 | x = x * x_mask 95 | return x 96 | 97 | 98 | class MultiHeadAttention(nn.Module): 99 | def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False): 100 | super().__init__() 101 | assert channels % n_heads == 0 102 | 103 | self.channels = channels 104 | self.out_channels = out_channels 105 | self.n_heads = n_heads 106 | self.p_dropout = p_dropout 107 | self.window_size = window_size 108 | self.heads_share = heads_share 109 | self.block_length = block_length 110 | self.proximal_bias = proximal_bias 111 | self.proximal_init = proximal_init 112 | self.attn = None 113 | 114 | self.k_channels = channels // n_heads 115 | self.conv_q = nn.Conv1d(channels, channels, 1) 116 | self.conv_k = nn.Conv1d(channels, channels, 1) 117 | self.conv_v = nn.Conv1d(channels, channels, 1) 118 | self.conv_o = nn.Conv1d(channels, out_channels, 1) 119 | self.drop = nn.Dropout(p_dropout) 120 | 121 | if window_size is not None: 122 | n_heads_rel = 1 if heads_share else n_heads 123 | rel_stddev = self.k_channels**-0.5 124 | self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 125 | self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 126 | 127 | nn.init.xavier_uniform_(self.conv_q.weight) 128 | nn.init.xavier_uniform_(self.conv_k.weight) 129 | nn.init.xavier_uniform_(self.conv_v.weight) 130 | if proximal_init: 131 | with torch.no_grad(): 132 | self.conv_k.weight.copy_(self.conv_q.weight) 133 | self.conv_k.bias.copy_(self.conv_q.bias) 134 | 135 | def forward(self, x, c, attn_mask=None): 136 | q = self.conv_q(x) 137 | k = self.conv_k(c) 138 | v = self.conv_v(c) 139 | 140 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 141 | 142 | x = self.conv_o(x) 143 | return x 144 | 145 | def attention(self, query, key, value, mask=None): 146 | # reshape [b, d, t] -> [b, n_h, t, d_k] 147 | b, d, t_s, t_t = (*key.size(), query.size(2)) 148 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 149 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 150 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 151 | 152 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) 153 | if self.window_size is not None: 154 | assert t_s == t_t, "Relative attention is only available for self-attention." 
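# Windowed relative-position attention: look up the learned relative-key embeddings
# for offsets in [-window_size, window_size], score the queries against them, convert
# the resulting [b, n_h, t, 2*t-1] relative logits to absolute positions [b, n_h, t, t],
# and add them to the content-based scores.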
155 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 156 | rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings) 157 | scores_local = self._relative_position_to_absolute_position(rel_logits) 158 | scores = scores + scores_local 159 | if self.proximal_bias: 160 | assert t_s == t_t, "Proximal bias is only available for self-attention." 161 | scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) 162 | if mask is not None: 163 | scores = scores.masked_fill(mask == 0, -1e4) 164 | if self.block_length is not None: 165 | assert t_s == t_t, "Local attention is only available for self-attention." 166 | block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) 167 | scores = scores.masked_fill(block_mask == 0, -1e4) 168 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 169 | p_attn = self.drop(p_attn) 170 | output = torch.matmul(p_attn, value) 171 | if self.window_size is not None: 172 | relative_weights = self._absolute_position_to_relative_position(p_attn) 173 | value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) 174 | output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) 175 | output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] 176 | return output, p_attn 177 | 178 | def _matmul_with_relative_values(self, x, y): 179 | """ 180 | x: [b, h, l, m] 181 | y: [h or 1, m, d] 182 | ret: [b, h, l, d] 183 | """ 184 | ret = torch.matmul(x, y.unsqueeze(0)) 185 | return ret 186 | 187 | def _matmul_with_relative_keys(self, x, y): 188 | """ 189 | x: [b, h, l, d] 190 | y: [h or 1, m, d] 191 | ret: [b, h, l, m] 192 | """ 193 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 194 | return ret 195 | 196 | def _get_relative_embeddings(self, relative_embeddings, length): 197 | max_relative_position = 2 * self.window_size + 1 198 | # Pad first before slice to avoid using cond ops. 199 | pad_length = max(length - (self.window_size + 1), 0) 200 | slice_start_position = max((self.window_size + 1) - length, 0) 201 | slice_end_position = slice_start_position + 2 * length - 1 202 | if pad_length > 0: 203 | padded_relative_embeddings = F.pad( 204 | relative_embeddings, 205 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) 206 | else: 207 | padded_relative_embeddings = relative_embeddings 208 | used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position] 209 | return used_relative_embeddings 210 | 211 | def _relative_position_to_absolute_position(self, x): 212 | """ 213 | x: [b, h, l, 2*l-1] 214 | ret: [b, h, l, l] 215 | """ 216 | batch, heads, length, _ = x.size() 217 | # Concat columns of pad to shift from relative to absolute indexing. 218 | x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]])) 219 | 220 | # Concat extra elements so to add up to shape (len+1, 2*len-1). 221 | x_flat = x.view([batch, heads, length * 2 * length]) 222 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]])) 223 | 224 | # Reshape and slice out the padded elements. 
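# After the two pads above, viewing the flat tensor as [b, h, l+1, 2*l-1] lines the
# relative scores up so that keeping the first l rows and the columns from offset l-1
# onward yields the absolute-position matrix [b, h, l, l].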
225 | x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:] 226 | return x_final 227 | 228 | def _absolute_position_to_relative_position(self, x): 229 | """ 230 | x: [b, h, l, l] 231 | ret: [b, h, l, 2*l-1] 232 | """ 233 | batch, heads, length, _ = x.size() 234 | # padd along column 235 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]])) 236 | x_flat = x.view([batch, heads, length**2 + length*(length -1)]) 237 | # add 0's in the beginning that will skew the elements after reshape 238 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 239 | x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:] 240 | return x_final 241 | 242 | def _attention_bias_proximal(self, length): 243 | """Bias for self-attention to encourage attention to close positions. 244 | Args: 245 | length: an integer scalar. 246 | Returns: 247 | a Tensor with shape [1, 1, length, length] 248 | """ 249 | r = torch.arange(length, dtype=torch.float32) 250 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 251 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 252 | 253 | 254 | class FFN(nn.Module): 255 | def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False): 256 | super().__init__() 257 | self.in_channels = in_channels 258 | self.out_channels = out_channels 259 | self.filter_channels = filter_channels 260 | self.kernel_size = kernel_size 261 | self.p_dropout = p_dropout 262 | self.activation = activation 263 | self.causal = causal 264 | 265 | if causal: 266 | self.padding = self._causal_padding 267 | else: 268 | self.padding = self._same_padding 269 | 270 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) 271 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) 272 | self.drop = nn.Dropout(p_dropout) 273 | 274 | def forward(self, x, x_mask): 275 | x = self.conv_1(self.padding(x * x_mask)) 276 | if self.activation == "gelu": 277 | x = x * torch.sigmoid(1.702 * x) 278 | else: 279 | x = torch.relu(x) 280 | x = self.drop(x) 281 | x = self.conv_2(self.padding(x * x_mask)) 282 | return x * x_mask 283 | 284 | def _causal_padding(self, x): 285 | if self.kernel_size == 1: 286 | return x 287 | pad_l = self.kernel_size - 1 288 | pad_r = 0 289 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 290 | x = F.pad(x, commons.convert_pad_shape(padding)) 291 | return x 292 | 293 | def _same_padding(self, x): 294 | if self.kernel_size == 1: 295 | return x 296 | pad_l = (self.kernel_size - 1) // 2 297 | pad_r = self.kernel_size // 2 298 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 299 | x = F.pad(x, commons.convert_pad_shape(padding)) 300 | return x 301 | -------------------------------------------------------------------------------- /commons.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | import torch.jit 4 | 5 | 6 | def script_method(fn, _rcb=None): 7 | return fn 8 | 9 | 10 | def script(obj, optimize=True, _frames_up=0, _rcb=None): 11 | return obj 12 | 13 | 14 | torch.jit.script_method = script_method 15 | torch.jit.script = script 16 | 17 | 18 | def init_weights(m, mean=0.0, std=0.01): 19 | classname = m.__class__.__name__ 20 | if classname.find("Conv") != -1: 21 | m.weight.data.normal_(mean, std) 22 | 23 | 24 | def get_padding(kernel_size, dilation=1): 25 | return int((kernel_size*dilation - dilation)/2) 26 | 27 | 28 | 
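# Interleave `item` between the elements of `lst` (and at both ends); the TTS code
# uses this to insert blank tokens into the symbol sequence when hps.data.add_blank is set.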
def intersperse(lst, item): 29 | result = [item] * (len(lst) * 2 + 1) 30 | result[1::2] = lst 31 | return result 32 | 33 | 34 | def slice_segments(x, ids_str, segment_size=4): 35 | ret = torch.zeros_like(x[:, :, :segment_size]) 36 | for i in range(x.size(0)): 37 | idx_str = ids_str[i] 38 | idx_end = idx_str + segment_size 39 | ret[i] = x[i, :, idx_str:idx_end] 40 | return ret 41 | 42 | 43 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 44 | b, d, t = x.size() 45 | if x_lengths is None: 46 | x_lengths = t 47 | ids_str_max = x_lengths - segment_size + 1 48 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 49 | ret = slice_segments(x, ids_str, segment_size) 50 | return ret, ids_str 51 | 52 | 53 | def subsequent_mask(length): 54 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 55 | return mask 56 | 57 | 58 | @torch.jit.script 59 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 60 | n_channels_int = n_channels[0] 61 | in_act = input_a + input_b 62 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 63 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 64 | acts = t_act * s_act 65 | return acts 66 | 67 | 68 | def convert_pad_shape(pad_shape): 69 | l = pad_shape[::-1] 70 | pad_shape = [item for sublist in l for item in sublist] 71 | return pad_shape 72 | 73 | 74 | def sequence_mask(length, max_length=None): 75 | if max_length is None: 76 | max_length = length.max() 77 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 78 | return x.unsqueeze(0) < length.unsqueeze(1) 79 | 80 | 81 | def generate_path(duration, mask): 82 | """ 83 | duration: [b, 1, t_x] 84 | mask: [b, 1, t_y, t_x] 85 | """ 86 | device = duration.device 87 | 88 | b, _, t_y, t_x = mask.shape 89 | cum_duration = torch.cumsum(duration, -1) 90 | 91 | cum_duration_flat = cum_duration.view(b * t_x) 92 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 93 | path = path.view(b, t_x, t_y) 94 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 95 | path = path.unsqueeze(1).transpose(2,3) * mask 96 | return path 97 | -------------------------------------------------------------------------------- /eng-README.md: -------------------------------------------------------------------------------- 1 |  2 | 3 | [中文](README.md "中文") [English](eng-README.md "English") [日本語](jp-README.md "日本語") 4 | 5 | 10 | 11 | # 12 | 13 | > ### This is a chatting Waifu program based on VITS & ChatGPT! 14 | 15 | Effect demonstration on BiliBIli:[《青春猪头少年不会梦见赛博女友》](https://www.bilibili.com/video/BV1rv4y1Q7eT "BiliBili") 16 | 17 | **Functioning Now:** 18 | * [x] Talking with ChatGPT 19 | * [x] Convert AI's Response to wav file 20 | * [x] Multi-Character voice generator 21 | * [x] Voice Recognition 22 | * [x] [Connect to Marai Robort](https://github.com/MuBai-He/ChatWaifu-marai) 23 | * [x] [Connect to Live2D](https://github.com/cjyaddone/ChatWaifuL2D) 24 | 25 | # Catalogue 26 | ### This project assumes that you are using chrome explorer 27 | * [1.Install Python venv:](#1.) 
28 | * 1.1 [Enter directory with cd commend](#cd) 29 | * 1.2 [Create Python Venv:](#99) 30 | * 1.3 [Enter Python Venv:](#venv) 31 | * 1.4 [Install required library with Pip:](#pip) 32 | * [2.Import pre-trained models to "model" folder(create a new one if doesn't exist):](#.model) 33 | * 2.1 [Double click model.exe to import Models](#cd1) 34 | * [3.Run(Talk to your Waifu:](#22) 35 | * 3.1 [Get ChatGPT Token](#333) 36 | * 3.2 [Start chatting with CyberWaifu](#444) 37 | * [4.Contributions](#915) 38 | ## 1.Install Python Venv: 39 | > **Install Anaconda or Python>=3.7** 40 | > 41 | > **This example name the venv:chatWaifu** 42 | 43 | ### 1.1 Enter project directory with cd command 44 | `cd YOUR_PROJECT_RESPORY` 45 |  46 | ### 1.2 Create Python Venv: 47 | 48 | Conda:`conda create --name chatWaifu python=3.10` 49 |  50 |  51 | 52 | Python:`python -m venv chatWaifu` 53 |  54 | 55 | ### 1.3 Activate created venv: 56 | Conda:`conda activate chatWaifu` 57 | 58 |  59 | 60 | Python:`.\chatWaifu\Scripts\activate.bat` 61 |  62 | 63 | ### 1.4 Install required library with Pip: 64 | `pip install -r requirements.txt` 65 |  66 | 67 | ## 2.import pre-trained models to root directory: 68 | Google Drive:https://drive.google.com/file/d/1tMCafhnUoL7FbevVQ44VQi-WznDjt23_/view?usp=sharing 69 | 70 | Ali Drive: https://www.aliyundrive.com/s/9JEj1mp1ZRv 提取码: m2y3 71 | 72 | ### 2.1Double click model.exe to import Models 73 | 74 | ## 3.RUN(Start chatting with CyberWaifu: 75 | Japanese Ver:`python ChatWaifuJP.py` 76 | 77 | Chinese Ver:`python ChatWaifuCN.py` 78 | 79 | Japanese voice conversation Ver(use Chinese):`python ChatWaifuJPVoice.py` 80 | 81 | Chinese voice conversation Ver(use Chinese):`python ChatWaifuCNVoice.py` 82 | 83 | Japanese voice conversation Ver(use English):`python ChatWaifuJPVoiceEN.py` 84 | 85 | Japanese voice conversation Ver(use Japanese):`python ChatWaifuJPVoiceJP.py` 86 | 87 | 88 | ### 3.1 Get ChatGPT Token 89 | #### Log in to ChatGPT whith link:https://chat.openai.com 90 | #### Press F12 to enter command center 91 | #### Find Application -> cookie -> __Secure-next-auth.session-token 92 |  93 | #### Copy the value into cmd and press ENTER 94 | 95 | ### 3.2 Start chatting with CyberWaifu 96 | 97 | **voice conversation Ver:** Start talking when the console prompts "You:" and then the sentence is recorded and sent to the ChatGPT conversation. 
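For reference, the voice scripts boil down to the loop below. This is a condensed sketch of the `__main__` block in `ChatWaifuJPVoiceJP.py`; `get_token`, `voice_input` and `generateSound` are helpers defined in that script, not external APIs.

```python
from pyChatGPT import ChatGPT
from winsound import PlaySound

session_token = get_token()    # the __Secure-next-auth.session-token value from step 3.1
api = ChatGPT(session_token)

while True:
    prompt = voice_input()                     # record speech with vosk, return recognized text
    answer = api.send_message(prompt)["message"].replace("\n", "")
    print("ChatGPT:")
    print(answer)
    generateSound(answer)                      # synthesize the reply to output.wav with the VITS model
    PlaySound(r".\output.wav", flags=0)        # play the generated audio
```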
98 | 99 | ## 4.Contribution: 100 | - [MoeGoe_GUI]https://github.com/CjangCjengh/MoeGoe_GUI 101 | - [Pretrained models]https://github.com/CjangCjengh/TTSModels 102 | - [PyChatGPT]https://github.com/terry3041/pyChatGPT 103 | -------------------------------------------------------------------------------- /hubert_model.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Optional, Tuple 3 | import random 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present 9 | 10 | class Hubert(nn.Module): 11 | def __init__(self, num_label_embeddings: int = 100, mask: bool = True): 12 | super().__init__() 13 | self._mask = mask 14 | self.feature_extractor = FeatureExtractor() 15 | self.feature_projection = FeatureProjection() 16 | self.positional_embedding = PositionalConvEmbedding() 17 | self.norm = nn.LayerNorm(768) 18 | self.dropout = nn.Dropout(0.1) 19 | self.encoder = TransformerEncoder( 20 | nn.TransformerEncoderLayer( 21 | 768, 12, 3072, activation="gelu", batch_first=True 22 | ), 23 | 12, 24 | ) 25 | self.proj = nn.Linear(768, 256) 26 | 27 | self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_()) 28 | self.label_embedding = nn.Embedding(num_label_embeddings, 256) 29 | 30 | def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 31 | mask = None 32 | if self.training and self._mask: 33 | mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2) 34 | x[mask] = self.masked_spec_embed.to(x.dtype) 35 | return x, mask 36 | 37 | def encode( 38 | self, x: torch.Tensor, layer: Optional[int] = None 39 | ) -> Tuple[torch.Tensor, torch.Tensor]: 40 | x = self.feature_extractor(x) 41 | x = self.feature_projection(x.transpose(1, 2)) 42 | x, mask = self.mask(x) 43 | x = x + self.positional_embedding(x) 44 | x = self.dropout(self.norm(x)) 45 | x = self.encoder(x, output_layer=layer) 46 | return x, mask 47 | 48 | def logits(self, x: torch.Tensor) -> torch.Tensor: 49 | logits = torch.cosine_similarity( 50 | x.unsqueeze(2), 51 | self.label_embedding.weight.unsqueeze(0).unsqueeze(0), 52 | dim=-1, 53 | ) 54 | return logits / 0.1 55 | 56 | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 57 | x, mask = self.encode(x) 58 | x = self.proj(x) 59 | logits = self.logits(x) 60 | return logits, mask 61 | 62 | 63 | class HubertSoft(Hubert): 64 | def __init__(self): 65 | super().__init__() 66 | 67 | @torch.inference_mode() 68 | def units(self, wav: torch.Tensor) -> torch.Tensor: 69 | wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) 70 | x, _ = self.encode(wav) 71 | return self.proj(x) 72 | 73 | 74 | class FeatureExtractor(nn.Module): 75 | def __init__(self): 76 | super().__init__() 77 | self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False) 78 | self.norm0 = nn.GroupNorm(512, 512) 79 | self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False) 80 | self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False) 81 | self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False) 82 | self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False) 83 | self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False) 84 | self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False) 85 | 86 | def forward(self, x: torch.Tensor) -> torch.Tensor: 87 | x = F.gelu(self.norm0(self.conv0(x))) 88 | x = F.gelu(self.conv1(x)) 89 | x = F.gelu(self.conv2(x)) 90 | x = F.gelu(self.conv3(x)) 91 | x = F.gelu(self.conv4(x)) 92 | x = F.gelu(self.conv5(x)) 93 | x = 
F.gelu(self.conv6(x)) 94 | return x 95 | 96 | 97 | class FeatureProjection(nn.Module): 98 | def __init__(self): 99 | super().__init__() 100 | self.norm = nn.LayerNorm(512) 101 | self.projection = nn.Linear(512, 768) 102 | self.dropout = nn.Dropout(0.1) 103 | 104 | def forward(self, x: torch.Tensor) -> torch.Tensor: 105 | x = self.norm(x) 106 | x = self.projection(x) 107 | x = self.dropout(x) 108 | return x 109 | 110 | 111 | class PositionalConvEmbedding(nn.Module): 112 | def __init__(self): 113 | super().__init__() 114 | self.conv = nn.Conv1d( 115 | 768, 116 | 768, 117 | kernel_size=128, 118 | padding=128 // 2, 119 | groups=16, 120 | ) 121 | self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2) 122 | 123 | def forward(self, x: torch.Tensor) -> torch.Tensor: 124 | x = self.conv(x.transpose(1, 2)) 125 | x = F.gelu(x[:, :, :-1]) 126 | return x.transpose(1, 2) 127 | 128 | 129 | class TransformerEncoder(nn.Module): 130 | def __init__( 131 | self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int 132 | ) -> None: 133 | super(TransformerEncoder, self).__init__() 134 | self.layers = nn.ModuleList( 135 | [copy.deepcopy(encoder_layer) for _ in range(num_layers)] 136 | ) 137 | self.num_layers = num_layers 138 | 139 | def forward( 140 | self, 141 | src: torch.Tensor, 142 | mask: torch.Tensor = None, 143 | src_key_padding_mask: torch.Tensor = None, 144 | output_layer: Optional[int] = None, 145 | ) -> torch.Tensor: 146 | output = src 147 | for layer in self.layers[:output_layer]: 148 | output = layer( 149 | output, src_mask=mask, src_key_padding_mask=src_key_padding_mask 150 | ) 151 | return output 152 | 153 | 154 | def _compute_mask( 155 | shape: Tuple[int, int], 156 | mask_prob: float, 157 | mask_length: int, 158 | device: torch.device, 159 | min_masks: int = 0, 160 | ) -> torch.Tensor: 161 | batch_size, sequence_length = shape 162 | 163 | if mask_length < 1: 164 | raise ValueError("`mask_length` has to be bigger than 0.") 165 | 166 | if mask_length > sequence_length: 167 | raise ValueError( 168 | f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" 169 | ) 170 | 171 | # compute number of masked spans in batch 172 | num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random()) 173 | num_masked_spans = max(num_masked_spans, min_masks) 174 | 175 | # make sure num masked indices <= sequence_length 176 | if num_masked_spans * mask_length > sequence_length: 177 | num_masked_spans = sequence_length // mask_length 178 | 179 | # SpecAugment mask to fill 180 | mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool) 181 | 182 | # uniform distribution to sample from, make sure that offset samples are < sequence_length 183 | uniform_dist = torch.ones( 184 | (batch_size, sequence_length - (mask_length - 1)), device=device 185 | ) 186 | 187 | # get random indices to mask 188 | mask_indices = torch.multinomial(uniform_dist, num_masked_spans) 189 | 190 | # expand masked indices to masked spans 191 | mask_indices = ( 192 | mask_indices.unsqueeze(dim=-1) 193 | .expand((batch_size, num_masked_spans, mask_length)) 194 | .reshape(batch_size, num_masked_spans * mask_length) 195 | ) 196 | offsets = ( 197 | torch.arange(mask_length, device=device)[None, None, :] 198 | .expand((batch_size, num_masked_spans, mask_length)) 199 | .reshape(batch_size, num_masked_spans * mask_length) 200 | ) 201 | mask_idxs = mask_indices + offsets 202 | 203 | # scatter indices to mask 204 | 
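# mask_idxs holds the start index of every masked span expanded by the in-span
# offsets ([batch, num_masked_spans * mask_length]); scattering True at those
# positions produces the boolean SpecAugment-style span mask.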
mask = mask.scatter(1, mask_idxs, True) 205 | 206 | return mask 207 | 208 | 209 | def hubert_soft( 210 | path: str 211 | ) -> HubertSoft: 212 | r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`. 213 | Args: 214 | path (str): path of a pretrained model 215 | """ 216 | hubert = HubertSoft() 217 | checkpoint = torch.load(path) 218 | consume_prefix_in_state_dict_if_present(checkpoint, "module.") 219 | hubert.load_state_dict(checkpoint) 220 | hubert.eval() 221 | return hubert 222 | -------------------------------------------------------------------------------- /mel_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data 3 | from librosa.filters import mel as librosa_mel_fn 4 | 5 | MAX_WAV_VALUE = 32768.0 6 | 7 | 8 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 9 | """ 10 | PARAMS 11 | ------ 12 | C: compression factor 13 | """ 14 | return torch.log(torch.clamp(x, min=clip_val) * C) 15 | 16 | 17 | def dynamic_range_decompression_torch(x, C=1): 18 | """ 19 | PARAMS 20 | ------ 21 | C: compression factor used to compress 22 | """ 23 | return torch.exp(x) / C 24 | 25 | 26 | def spectral_normalize_torch(magnitudes): 27 | output = dynamic_range_compression_torch(magnitudes) 28 | return output 29 | 30 | 31 | def spectral_de_normalize_torch(magnitudes): 32 | output = dynamic_range_decompression_torch(magnitudes) 33 | return output 34 | 35 | 36 | mel_basis = {} 37 | hann_window = {} 38 | 39 | 40 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 41 | if torch.min(y) < -1.: 42 | print('min value is ', torch.min(y)) 43 | if torch.max(y) > 1.: 44 | print('max value is ', torch.max(y)) 45 | 46 | global hann_window 47 | dtype_device = str(y.dtype) + '_' + str(y.device) 48 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 49 | if wnsize_dtype_device not in hann_window: 50 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 51 | 52 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 53 | y = y.squeeze(1) 54 | 55 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 56 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) 57 | 58 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 59 | return spec 60 | 61 | 62 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 63 | global mel_basis 64 | dtype_device = str(spec.dtype) + '_' + str(spec.device) 65 | fmax_dtype_device = str(fmax) + '_' + dtype_device 66 | if fmax_dtype_device not in mel_basis: 67 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 68 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) 69 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 70 | spec = spectral_normalize_torch(spec) 71 | return spec 72 | 73 | 74 | def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 75 | if torch.min(y) < -1.: 76 | print('min value is ', torch.min(y)) 77 | if torch.max(y) > 1.: 78 | print('max value is ', torch.max(y)) 79 | 80 | global mel_basis, hann_window 81 | dtype_device = str(y.dtype) + '_' + str(y.device) 82 | fmax_dtype_device = str(fmax) + '_' + dtype_device 83 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 84 | if 
fmax_dtype_device not in mel_basis: 85 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 86 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) 87 | if wnsize_dtype_device not in hann_window: 88 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 89 | 90 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 91 | y = y.squeeze(1) 92 | 93 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 94 | center=center, pad_mode='reflect', normalized=False, onesided=True) 95 | 96 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 97 | 98 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 99 | spec = spectral_normalize_torch(spec) 100 | 101 | return spec 102 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import commons 7 | import modules 8 | import attentions 9 | 10 | from torch.nn import Conv1d, ConvTranspose1d 11 | from torch.nn.utils import weight_norm 12 | from commons import init_weights 13 | 14 | 15 | class StochasticDurationPredictor(nn.Module): 16 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0): 17 | super().__init__() 18 | filter_channels = in_channels # it needs to be removed from future version. 19 | self.in_channels = in_channels 20 | self.filter_channels = filter_channels 21 | self.kernel_size = kernel_size 22 | self.p_dropout = p_dropout 23 | self.n_flows = n_flows 24 | self.gin_channels = gin_channels 25 | 26 | self.log_flow = modules.Log() 27 | self.flows = nn.ModuleList() 28 | self.flows.append(modules.ElementwiseAffine(2)) 29 | for i in range(n_flows): 30 | self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) 31 | self.flows.append(modules.Flip()) 32 | 33 | self.post_pre = nn.Conv1d(1, filter_channels, 1) 34 | self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) 35 | self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) 36 | self.post_flows = nn.ModuleList() 37 | self.post_flows.append(modules.ElementwiseAffine(2)) 38 | for i in range(4): 39 | self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) 40 | self.post_flows.append(modules.Flip()) 41 | 42 | self.pre = nn.Conv1d(in_channels, filter_channels, 1) 43 | self.proj = nn.Conv1d(filter_channels, filter_channels, 1) 44 | self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) 45 | if gin_channels != 0: 46 | self.cond = nn.Conv1d(gin_channels, filter_channels, 1) 47 | 48 | def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0): 49 | x = torch.detach(x) 50 | x = self.pre(x) 51 | if g is not None: 52 | g = torch.detach(g) 53 | x = x + self.cond(g) 54 | x = self.convs(x, x_mask) 55 | x = self.proj(x) * x_mask 56 | 57 | if not reverse: 58 | flows = self.flows 59 | assert w is not None 60 | 61 | logdet_tot_q = 0 62 | h_w = self.post_pre(w) 63 | h_w = self.post_convs(h_w, x_mask) 64 | h_w = self.post_proj(h_w) * x_mask 65 | e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask 66 | z_q = e_q 67 | for flow in self.post_flows: 68 | z_q, 
logdet_q = flow(z_q, x_mask, g=(x + h_w)) 69 | logdet_tot_q += logdet_q 70 | z_u, z1 = torch.split(z_q, [1, 1], 1) 71 | u = torch.sigmoid(z_u) * x_mask 72 | z0 = (w - u) * x_mask 73 | logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2]) 74 | logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q 75 | 76 | logdet_tot = 0 77 | z0, logdet = self.log_flow(z0, x_mask) 78 | logdet_tot += logdet 79 | z = torch.cat([z0, z1], 1) 80 | for flow in flows: 81 | z, logdet = flow(z, x_mask, g=x, reverse=reverse) 82 | logdet_tot = logdet_tot + logdet 83 | nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot 84 | return nll + logq # [b] 85 | else: 86 | flows = list(reversed(self.flows)) 87 | flows = flows[:-2] + [flows[-1]] # remove a useless vflow 88 | z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale 89 | for flow in flows: 90 | z = flow(z, x_mask, g=x, reverse=reverse) 91 | z0, z1 = torch.split(z, [1, 1], 1) 92 | logw = z0 93 | return logw 94 | 95 | 96 | class DurationPredictor(nn.Module): 97 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0): 98 | super().__init__() 99 | 100 | self.in_channels = in_channels 101 | self.filter_channels = filter_channels 102 | self.kernel_size = kernel_size 103 | self.p_dropout = p_dropout 104 | self.gin_channels = gin_channels 105 | 106 | self.drop = nn.Dropout(p_dropout) 107 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2) 108 | self.norm_1 = modules.LayerNorm(filter_channels) 109 | self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2) 110 | self.norm_2 = modules.LayerNorm(filter_channels) 111 | self.proj = nn.Conv1d(filter_channels, 1, 1) 112 | 113 | if gin_channels != 0: 114 | self.cond = nn.Conv1d(gin_channels, in_channels, 1) 115 | 116 | def forward(self, x, x_mask, g=None): 117 | x = torch.detach(x) 118 | if g is not None: 119 | g = torch.detach(g) 120 | x = x + self.cond(g) 121 | x = self.conv_1(x * x_mask) 122 | x = torch.relu(x) 123 | x = self.norm_1(x) 124 | x = self.drop(x) 125 | x = self.conv_2(x * x_mask) 126 | x = torch.relu(x) 127 | x = self.norm_2(x) 128 | x = self.drop(x) 129 | x = self.proj(x * x_mask) 130 | return x * x_mask 131 | 132 | 133 | class TextEncoder(nn.Module): 134 | def __init__(self, 135 | n_vocab, 136 | out_channels, 137 | hidden_channels, 138 | filter_channels, 139 | n_heads, 140 | n_layers, 141 | kernel_size, 142 | p_dropout, 143 | emotion_embedding): 144 | super().__init__() 145 | self.n_vocab = n_vocab 146 | self.out_channels = out_channels 147 | self.hidden_channels = hidden_channels 148 | self.filter_channels = filter_channels 149 | self.n_heads = n_heads 150 | self.n_layers = n_layers 151 | self.kernel_size = kernel_size 152 | self.p_dropout = p_dropout 153 | self.emotion_embedding = emotion_embedding 154 | 155 | if self.n_vocab!=0: 156 | self.emb = nn.Embedding(n_vocab, hidden_channels) 157 | if emotion_embedding: 158 | self.emo_proj = nn.Linear(1024, hidden_channels) 159 | nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) 160 | 161 | self.encoder = attentions.Encoder( 162 | hidden_channels, 163 | filter_channels, 164 | n_heads, 165 | n_layers, 166 | kernel_size, 167 | p_dropout) 168 | self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1) 169 | 170 | def forward(self, x, x_lengths, emotion_embedding=None): 171 | if self.n_vocab!=0: 172 | x = self.emb(x) * 
math.sqrt(self.hidden_channels) # [b, t, h] 173 | if emotion_embedding is not None: 174 | x = x + self.emo_proj(emotion_embedding.unsqueeze(1)) 175 | x = torch.transpose(x, 1, -1) # [b, h, t] 176 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 177 | 178 | x = self.encoder(x * x_mask, x_mask) 179 | stats = self.proj(x) * x_mask 180 | 181 | m, logs = torch.split(stats, self.out_channels, dim=1) 182 | return x, m, logs, x_mask 183 | 184 | 185 | class ResidualCouplingBlock(nn.Module): 186 | def __init__(self, 187 | channels, 188 | hidden_channels, 189 | kernel_size, 190 | dilation_rate, 191 | n_layers, 192 | n_flows=4, 193 | gin_channels=0): 194 | super().__init__() 195 | self.channels = channels 196 | self.hidden_channels = hidden_channels 197 | self.kernel_size = kernel_size 198 | self.dilation_rate = dilation_rate 199 | self.n_layers = n_layers 200 | self.n_flows = n_flows 201 | self.gin_channels = gin_channels 202 | 203 | self.flows = nn.ModuleList() 204 | for i in range(n_flows): 205 | self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) 206 | self.flows.append(modules.Flip()) 207 | 208 | def forward(self, x, x_mask, g=None, reverse=False): 209 | if not reverse: 210 | for flow in self.flows: 211 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 212 | else: 213 | for flow in reversed(self.flows): 214 | x = flow(x, x_mask, g=g, reverse=reverse) 215 | return x 216 | 217 | 218 | class PosteriorEncoder(nn.Module): 219 | def __init__(self, 220 | in_channels, 221 | out_channels, 222 | hidden_channels, 223 | kernel_size, 224 | dilation_rate, 225 | n_layers, 226 | gin_channels=0): 227 | super().__init__() 228 | self.in_channels = in_channels 229 | self.out_channels = out_channels 230 | self.hidden_channels = hidden_channels 231 | self.kernel_size = kernel_size 232 | self.dilation_rate = dilation_rate 233 | self.n_layers = n_layers 234 | self.gin_channels = gin_channels 235 | 236 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 237 | self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) 238 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 239 | 240 | def forward(self, x, x_lengths, g=None): 241 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 242 | x = self.pre(x) * x_mask 243 | x = self.enc(x, x_mask, g=g) 244 | stats = self.proj(x) * x_mask 245 | m, logs = torch.split(stats, self.out_channels, dim=1) 246 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 247 | return z, m, logs, x_mask 248 | 249 | 250 | class Generator(torch.nn.Module): 251 | def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): 252 | super(Generator, self).__init__() 253 | self.num_kernels = len(resblock_kernel_sizes) 254 | self.num_upsamples = len(upsample_rates) 255 | self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) 256 | resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 257 | 258 | self.ups = nn.ModuleList() 259 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 260 | self.ups.append(weight_norm( 261 | ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)), 262 | k, u, padding=(k-u)//2))) 263 | 264 | self.resblocks = nn.ModuleList() 265 | for i in 
range(len(self.ups)): 266 | ch = upsample_initial_channel//(2**(i+1)) 267 | for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): 268 | self.resblocks.append(resblock(ch, k, d)) 269 | 270 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 271 | self.ups.apply(init_weights) 272 | 273 | if gin_channels != 0: 274 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 275 | 276 | def forward(self, x, g=None): 277 | x = self.conv_pre(x) 278 | if g is not None: 279 | x = x + self.cond(g) 280 | 281 | for i in range(self.num_upsamples): 282 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 283 | x = self.ups[i](x) 284 | xs = None 285 | for j in range(self.num_kernels): 286 | if xs is None: 287 | xs = self.resblocks[i*self.num_kernels+j](x) 288 | else: 289 | xs += self.resblocks[i*self.num_kernels+j](x) 290 | x = xs / self.num_kernels 291 | x = F.leaky_relu(x) 292 | x = self.conv_post(x) 293 | x = torch.tanh(x) 294 | 295 | return x 296 | 297 | 298 | class SynthesizerTrn(nn.Module): 299 | """ 300 | Synthesizer for Training 301 | """ 302 | 303 | def __init__(self, 304 | n_vocab, 305 | spec_channels, 306 | segment_size, 307 | inter_channels, 308 | hidden_channels, 309 | filter_channels, 310 | n_heads, 311 | n_layers, 312 | kernel_size, 313 | p_dropout, 314 | resblock, 315 | resblock_kernel_sizes, 316 | resblock_dilation_sizes, 317 | upsample_rates, 318 | upsample_initial_channel, 319 | upsample_kernel_sizes, 320 | n_speakers=0, 321 | gin_channels=0, 322 | use_sdp=True, 323 | emotion_embedding=False, 324 | **kwargs): 325 | 326 | super().__init__() 327 | self.n_vocab = n_vocab 328 | self.spec_channels = spec_channels 329 | self.inter_channels = inter_channels 330 | self.hidden_channels = hidden_channels 331 | self.filter_channels = filter_channels 332 | self.n_heads = n_heads 333 | self.n_layers = n_layers 334 | self.kernel_size = kernel_size 335 | self.p_dropout = p_dropout 336 | self.resblock = resblock 337 | self.resblock_kernel_sizes = resblock_kernel_sizes 338 | self.resblock_dilation_sizes = resblock_dilation_sizes 339 | self.upsample_rates = upsample_rates 340 | self.upsample_initial_channel = upsample_initial_channel 341 | self.upsample_kernel_sizes = upsample_kernel_sizes 342 | self.segment_size = segment_size 343 | self.n_speakers = n_speakers 344 | self.gin_channels = gin_channels 345 | 346 | self.use_sdp = use_sdp 347 | 348 | self.enc_p = TextEncoder(n_vocab, 349 | inter_channels, 350 | hidden_channels, 351 | filter_channels, 352 | n_heads, 353 | n_layers, 354 | kernel_size, 355 | p_dropout, 356 | emotion_embedding) 357 | self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) 358 | self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) 359 | self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) 360 | 361 | if use_sdp: 362 | self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels) 363 | else: 364 | self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels) 365 | 366 | if n_speakers > 1: 367 | self.emb_g = nn.Embedding(n_speakers, gin_channels) 368 | 369 | def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None, emotion_embedding=None): 370 | x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, 
emotion_embedding) 371 | if self.n_speakers > 0: 372 | g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] 373 | else: 374 | g = None 375 | 376 | if self.use_sdp: 377 | logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) 378 | else: 379 | logw = self.dp(x, x_mask, g=g) 380 | w = torch.exp(logw) * x_mask * length_scale 381 | w_ceil = torch.ceil(w) 382 | y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() 383 | y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype) 384 | attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) 385 | attn = commons.generate_path(w_ceil, attn_mask) 386 | 387 | m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] 388 | logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] 389 | 390 | z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale 391 | z = self.flow(z_p, y_mask, g=g, reverse=True) 392 | o = self.dec((z * y_mask)[:,:,:max_len], g=g) 393 | return o, attn, y_mask, (z, z_p, m_p, logs_p) 394 | 395 | def voice_conversion(self, y, y_lengths, sid_src, sid_tgt): 396 | assert self.n_speakers > 0, "n_speakers have to be larger than 0." 397 | g_src = self.emb_g(sid_src).unsqueeze(-1) 398 | g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) 399 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) 400 | z_p = self.flow(z, y_mask, g=g_src) 401 | z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) 402 | o_hat = self.dec(z_hat * y_mask, g=g_tgt) 403 | return o_hat, y_mask, (z, z_p, z_hat) 404 | 405 | -------------------------------------------------------------------------------- /modules.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | from torch.nn import Conv1d 7 | from torch.nn.utils import weight_norm, remove_weight_norm 8 | 9 | import commons 10 | from commons import init_weights, get_padding 11 | from transforms import piecewise_rational_quadratic_transform 12 | 13 | 14 | LRELU_SLOPE = 0.1 15 | 16 | 17 | class LayerNorm(nn.Module): 18 | def __init__(self, channels, eps=1e-5): 19 | super().__init__() 20 | self.channels = channels 21 | self.eps = eps 22 | 23 | self.gamma = nn.Parameter(torch.ones(channels)) 24 | self.beta = nn.Parameter(torch.zeros(channels)) 25 | 26 | def forward(self, x): 27 | x = x.transpose(1, -1) 28 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 29 | return x.transpose(1, -1) 30 | 31 | 32 | class ConvReluNorm(nn.Module): 33 | def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): 34 | super().__init__() 35 | self.in_channels = in_channels 36 | self.hidden_channels = hidden_channels 37 | self.out_channels = out_channels 38 | self.kernel_size = kernel_size 39 | self.n_layers = n_layers 40 | self.p_dropout = p_dropout 41 | assert n_layers > 1, "Number of layers should be larger than 0." 
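        # Descriptive sketch of what follows (inferred from this class's own code): the layers
        # built below form a Conv1d -> LayerNorm -> ReLU -> Dropout pre-net repeated n_layers
        # times, ending in a zero-initialized 1x1 projection that forward() adds back to the
        # input as a residual; note the assert above actually requires at least two layers,
        # despite its message.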
42 | 43 | self.conv_layers = nn.ModuleList() 44 | self.norm_layers = nn.ModuleList() 45 | self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 46 | self.norm_layers.append(LayerNorm(hidden_channels)) 47 | self.relu_drop = nn.Sequential( 48 | nn.ReLU(), 49 | nn.Dropout(p_dropout)) 50 | for _ in range(n_layers-1): 51 | self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 52 | self.norm_layers.append(LayerNorm(hidden_channels)) 53 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 54 | self.proj.weight.data.zero_() 55 | self.proj.bias.data.zero_() 56 | 57 | def forward(self, x, x_mask): 58 | x_org = x 59 | for i in range(self.n_layers): 60 | x = self.conv_layers[i](x * x_mask) 61 | x = self.norm_layers[i](x) 62 | x = self.relu_drop(x) 63 | x = x_org + self.proj(x) 64 | return x * x_mask 65 | 66 | 67 | class DDSConv(nn.Module): 68 | """ 69 | Dilated and Depth-Separable Convolution 70 | """ 71 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): 72 | super().__init__() 73 | self.channels = channels 74 | self.kernel_size = kernel_size 75 | self.n_layers = n_layers 76 | self.p_dropout = p_dropout 77 | 78 | self.drop = nn.Dropout(p_dropout) 79 | self.convs_sep = nn.ModuleList() 80 | self.convs_1x1 = nn.ModuleList() 81 | self.norms_1 = nn.ModuleList() 82 | self.norms_2 = nn.ModuleList() 83 | for i in range(n_layers): 84 | dilation = kernel_size ** i 85 | padding = (kernel_size * dilation - dilation) // 2 86 | self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, 87 | groups=channels, dilation=dilation, padding=padding 88 | )) 89 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) 90 | self.norms_1.append(LayerNorm(channels)) 91 | self.norms_2.append(LayerNorm(channels)) 92 | 93 | def forward(self, x, x_mask, g=None): 94 | if g is not None: 95 | x = x + g 96 | for i in range(self.n_layers): 97 | y = self.convs_sep[i](x * x_mask) 98 | y = self.norms_1[i](y) 99 | y = F.gelu(y) 100 | y = self.convs_1x1[i](y) 101 | y = self.norms_2[i](y) 102 | y = F.gelu(y) 103 | y = self.drop(y) 104 | x = x + y 105 | return x * x_mask 106 | 107 | 108 | class WN(torch.nn.Module): 109 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): 110 | super(WN, self).__init__() 111 | assert(kernel_size % 2 == 1) 112 | self.hidden_channels =hidden_channels 113 | self.kernel_size = kernel_size, 114 | self.dilation_rate = dilation_rate 115 | self.n_layers = n_layers 116 | self.gin_channels = gin_channels 117 | self.p_dropout = p_dropout 118 | 119 | self.in_layers = torch.nn.ModuleList() 120 | self.res_skip_layers = torch.nn.ModuleList() 121 | self.drop = nn.Dropout(p_dropout) 122 | 123 | if gin_channels != 0: 124 | cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) 125 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 126 | 127 | for i in range(n_layers): 128 | dilation = dilation_rate ** i 129 | padding = int((kernel_size * dilation - dilation) / 2) 130 | in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, 131 | dilation=dilation, padding=padding) 132 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 133 | self.in_layers.append(in_layer) 134 | 135 | # last one is not necessary 136 | if i < n_layers - 1: 137 | res_skip_channels = 2 * hidden_channels 138 | else: 139 | res_skip_channels = hidden_channels 140 | 141 | res_skip_layer = 
torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 142 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 143 | self.res_skip_layers.append(res_skip_layer) 144 | 145 | def forward(self, x, x_mask, g=None, **kwargs): 146 | output = torch.zeros_like(x) 147 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 148 | 149 | if g is not None: 150 | g = self.cond_layer(g) 151 | 152 | for i in range(self.n_layers): 153 | x_in = self.in_layers[i](x) 154 | if g is not None: 155 | cond_offset = i * 2 * self.hidden_channels 156 | g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] 157 | else: 158 | g_l = torch.zeros_like(x_in) 159 | 160 | acts = commons.fused_add_tanh_sigmoid_multiply( 161 | x_in, 162 | g_l, 163 | n_channels_tensor) 164 | acts = self.drop(acts) 165 | 166 | res_skip_acts = self.res_skip_layers[i](acts) 167 | if i < self.n_layers - 1: 168 | res_acts = res_skip_acts[:,:self.hidden_channels,:] 169 | x = (x + res_acts) * x_mask 170 | output = output + res_skip_acts[:,self.hidden_channels:,:] 171 | else: 172 | output = output + res_skip_acts 173 | return output * x_mask 174 | 175 | def remove_weight_norm(self): 176 | if self.gin_channels != 0: 177 | torch.nn.utils.remove_weight_norm(self.cond_layer) 178 | for l in self.in_layers: 179 | torch.nn.utils.remove_weight_norm(l) 180 | for l in self.res_skip_layers: 181 | torch.nn.utils.remove_weight_norm(l) 182 | 183 | 184 | class ResBlock1(torch.nn.Module): 185 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 186 | super(ResBlock1, self).__init__() 187 | self.convs1 = nn.ModuleList([ 188 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 189 | padding=get_padding(kernel_size, dilation[0]))), 190 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 191 | padding=get_padding(kernel_size, dilation[1]))), 192 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 193 | padding=get_padding(kernel_size, dilation[2]))) 194 | ]) 195 | self.convs1.apply(init_weights) 196 | 197 | self.convs2 = nn.ModuleList([ 198 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 199 | padding=get_padding(kernel_size, 1))), 200 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 201 | padding=get_padding(kernel_size, 1))), 202 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 203 | padding=get_padding(kernel_size, 1))) 204 | ]) 205 | self.convs2.apply(init_weights) 206 | 207 | def forward(self, x, x_mask=None): 208 | for c1, c2 in zip(self.convs1, self.convs2): 209 | xt = F.leaky_relu(x, LRELU_SLOPE) 210 | if x_mask is not None: 211 | xt = xt * x_mask 212 | xt = c1(xt) 213 | xt = F.leaky_relu(xt, LRELU_SLOPE) 214 | if x_mask is not None: 215 | xt = xt * x_mask 216 | xt = c2(xt) 217 | x = xt + x 218 | if x_mask is not None: 219 | x = x * x_mask 220 | return x 221 | 222 | def remove_weight_norm(self): 223 | for l in self.convs1: 224 | remove_weight_norm(l) 225 | for l in self.convs2: 226 | remove_weight_norm(l) 227 | 228 | 229 | class ResBlock2(torch.nn.Module): 230 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 231 | super(ResBlock2, self).__init__() 232 | self.convs = nn.ModuleList([ 233 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 234 | padding=get_padding(kernel_size, dilation[0]))), 235 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 236 | padding=get_padding(kernel_size, dilation[1]))) 237 | ]) 
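        # Descriptive note (inferred from the code in this file): ResBlock2 is the lighter
        # residual-block variant, using a single stack of two dilated convolutions per block
        # instead of the paired conv stacks of ResBlock1; Generator selects it whenever its
        # `resblock` argument is not '1'.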
238 | self.convs.apply(init_weights) 239 | 240 | def forward(self, x, x_mask=None): 241 | for c in self.convs: 242 | xt = F.leaky_relu(x, LRELU_SLOPE) 243 | if x_mask is not None: 244 | xt = xt * x_mask 245 | xt = c(xt) 246 | x = xt + x 247 | if x_mask is not None: 248 | x = x * x_mask 249 | return x 250 | 251 | def remove_weight_norm(self): 252 | for l in self.convs: 253 | remove_weight_norm(l) 254 | 255 | 256 | class Log(nn.Module): 257 | def forward(self, x, x_mask, reverse=False, **kwargs): 258 | if not reverse: 259 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask 260 | logdet = torch.sum(-y, [1, 2]) 261 | return y, logdet 262 | else: 263 | x = torch.exp(x) * x_mask 264 | return x 265 | 266 | 267 | class Flip(nn.Module): 268 | def forward(self, x, *args, reverse=False, **kwargs): 269 | x = torch.flip(x, [1]) 270 | if not reverse: 271 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 272 | return x, logdet 273 | else: 274 | return x 275 | 276 | 277 | class ElementwiseAffine(nn.Module): 278 | def __init__(self, channels): 279 | super().__init__() 280 | self.channels = channels 281 | self.m = nn.Parameter(torch.zeros(channels,1)) 282 | self.logs = nn.Parameter(torch.zeros(channels,1)) 283 | 284 | def forward(self, x, x_mask, reverse=False, **kwargs): 285 | if not reverse: 286 | y = self.m + torch.exp(self.logs) * x 287 | y = y * x_mask 288 | logdet = torch.sum(self.logs * x_mask, [1,2]) 289 | return y, logdet 290 | else: 291 | x = (x - self.m) * torch.exp(-self.logs) * x_mask 292 | return x 293 | 294 | 295 | class ResidualCouplingLayer(nn.Module): 296 | def __init__(self, 297 | channels, 298 | hidden_channels, 299 | kernel_size, 300 | dilation_rate, 301 | n_layers, 302 | p_dropout=0, 303 | gin_channels=0, 304 | mean_only=False): 305 | assert channels % 2 == 0, "channels should be divisible by 2" 306 | super().__init__() 307 | self.channels = channels 308 | self.hidden_channels = hidden_channels 309 | self.kernel_size = kernel_size 310 | self.dilation_rate = dilation_rate 311 | self.n_layers = n_layers 312 | self.half_channels = channels // 2 313 | self.mean_only = mean_only 314 | 315 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 316 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) 317 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) 318 | self.post.weight.data.zero_() 319 | self.post.bias.data.zero_() 320 | 321 | def forward(self, x, x_mask, g=None, reverse=False): 322 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 323 | h = self.pre(x0) * x_mask 324 | h = self.enc(h, x_mask, g=g) 325 | stats = self.post(h) * x_mask 326 | if not self.mean_only: 327 | m, logs = torch.split(stats, [self.half_channels]*2, 1) 328 | else: 329 | m = stats 330 | logs = torch.zeros_like(m) 331 | 332 | if not reverse: 333 | x1 = m + x1 * torch.exp(logs) * x_mask 334 | x = torch.cat([x0, x1], 1) 335 | logdet = torch.sum(logs, [1,2]) 336 | return x, logdet 337 | else: 338 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 339 | x = torch.cat([x0, x1], 1) 340 | return x 341 | 342 | 343 | class ConvFlow(nn.Module): 344 | def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0): 345 | super().__init__() 346 | self.in_channels = in_channels 347 | self.filter_channels = filter_channels 348 | self.kernel_size = kernel_size 349 | self.n_layers = n_layers 350 | self.num_bins = num_bins 351 | self.tail_bound = tail_bound 352 | 
self.half_channels = in_channels // 2 353 | 354 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) 355 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.) 356 | self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1) 357 | self.proj.weight.data.zero_() 358 | self.proj.bias.data.zero_() 359 | 360 | def forward(self, x, x_mask, g=None, reverse=False): 361 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 362 | h = self.pre(x0) 363 | h = self.convs(h, x_mask, g=g) 364 | h = self.proj(h) * x_mask 365 | 366 | b, c, t = x0.shape 367 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 368 | 369 | unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels) 370 | unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels) 371 | unnormalized_derivatives = h[..., 2 * self.num_bins:] 372 | 373 | x1, logabsdet = piecewise_rational_quadratic_transform(x1, 374 | unnormalized_widths, 375 | unnormalized_heights, 376 | unnormalized_derivatives, 377 | inverse=reverse, 378 | tails='linear', 379 | tail_bound=self.tail_bound 380 | ) 381 | 382 | x = torch.cat([x0, x1], 1) * x_mask 383 | logdet = torch.sum(logabsdet * x_mask, [1,2]) 384 | if not reverse: 385 | return x, logdet 386 | else: 387 | return x 388 | -------------------------------------------------------------------------------- /readme/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjyaddone/ChatWaifu/eb9c697be020a34aa09acce488948bf62ee33e58/readme/1.png -------------------------------------------------------------------------------- /readme/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjyaddone/ChatWaifu/eb9c697be020a34aa09acce488948bf62ee33e58/readme/2.png -------------------------------------------------------------------------------- /readme/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjyaddone/ChatWaifu/eb9c697be020a34aa09acce488948bf62ee33e58/readme/3.png -------------------------------------------------------------------------------- /readme/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjyaddone/ChatWaifu/eb9c697be020a34aa09acce488948bf62ee33e58/readme/4.png -------------------------------------------------------------------------------- /readme/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjyaddone/ChatWaifu/eb9c697be020a34aa09acce488948bf62ee33e58/readme/5.png -------------------------------------------------------------------------------- /readme/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjyaddone/ChatWaifu/eb9c697be020a34aa09acce488948bf62ee33e58/readme/6.png -------------------------------------------------------------------------------- /readme/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjyaddone/ChatWaifu/eb9c697be020a34aa09acce488948bf62ee33e58/readme/7.png -------------------------------------------------------------------------------- /readme/cyberchat.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cjyaddone/ChatWaifu/eb9c697be020a34aa09acce488948bf62ee33e58/readme/cyberchat.png -------------------------------------------------------------------------------- /readme/token.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjyaddone/ChatWaifu/eb9c697be020a34aa09acce488948bf62ee33e58/readme/token.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numba 2 | librosa 3 | numpy 4 | scipy 5 | torch 6 | unidecode 7 | openjtalk>=0.3.0.dev2 8 | jamo 9 | pypinyin 10 | jieba 11 | protobuf 12 | cn2an 13 | inflect 14 | eng_to_ipa 15 | ko_pron 16 | indic_transliteration 17 | num_thai 18 | opencc 19 | pyChatGPT 20 | vosk 21 | sounddevice 22 | -------------------------------------------------------------------------------- /text/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | from text import cleaners 3 | 4 | 5 | def text_to_sequence(text, symbols, cleaner_names): 6 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
7 | Args: 8 | text: string to convert to a sequence 9 | cleaner_names: names of the cleaner functions to run the text through 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | ''' 13 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 14 | 15 | sequence = [] 16 | 17 | clean_text = _clean_text(text, cleaner_names) 18 | for symbol in clean_text: 19 | if symbol not in _symbol_to_id.keys(): 20 | continue 21 | symbol_id = _symbol_to_id[symbol] 22 | sequence += [symbol_id] 23 | return sequence 24 | 25 | 26 | def _clean_text(text, cleaner_names): 27 | for name in cleaner_names: 28 | cleaner = getattr(cleaners, name) 29 | if not cleaner: 30 | raise Exception('Unknown cleaner: %s' % name) 31 | text = cleaner(text) 32 | return text 33 | -------------------------------------------------------------------------------- /text/cantonese.py: -------------------------------------------------------------------------------- 1 | import re 2 | import cn2an 3 | import opencc 4 | 5 | 6 | converter = opencc.OpenCC('jyutjyu') 7 | 8 | # List of (Latin alphabet, ipa) pairs: 9 | _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 10 | ('A', 'ei˥'), 11 | ('B', 'biː˥'), 12 | ('C', 'siː˥'), 13 | ('D', 'tiː˥'), 14 | ('E', 'iː˥'), 15 | ('F', 'e˥fuː˨˩'), 16 | ('G', 'tsiː˥'), 17 | ('H', 'ɪk̚˥tsʰyː˨˩'), 18 | ('I', 'ɐi˥'), 19 | ('J', 'tsei˥'), 20 | ('K', 'kʰei˥'), 21 | ('L', 'e˥llou˨˩'), 22 | ('M', 'ɛːm˥'), 23 | ('N', 'ɛːn˥'), 24 | ('O', 'ou˥'), 25 | ('P', 'pʰiː˥'), 26 | ('Q', 'kʰiːu˥'), 27 | ('R', 'aː˥lou˨˩'), 28 | ('S', 'ɛː˥siː˨˩'), 29 | ('T', 'tʰiː˥'), 30 | ('U', 'juː˥'), 31 | ('V', 'wiː˥'), 32 | ('W', 'tʊk̚˥piː˥juː˥'), 33 | ('X', 'ɪk̚˥siː˨˩'), 34 | ('Y', 'waːi˥'), 35 | ('Z', 'iː˨sɛːt̚˥') 36 | ]] 37 | 38 | 39 | def number_to_cantonese(text): 40 | return re.sub(r'\d+(?:\.?\d+)?', lambda x: cn2an.an2cn(x.group()), text) 41 | 42 | 43 | def latin_to_ipa(text): 44 | for regex, replacement in _latin_to_ipa: 45 | text = re.sub(regex, replacement, text) 46 | return text 47 | 48 | 49 | def cantonese_to_ipa(text): 50 | text = number_to_cantonese(text.upper()) 51 | text = converter.convert(text).replace('-','').replace('$',' ') 52 | text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text) 53 | text = re.sub(r'[、;:]', ',', text) 54 | text = re.sub(r'\s*,\s*', ', ', text) 55 | text = re.sub(r'\s*。\s*', '. ', text) 56 | text = re.sub(r'\s*?\s*', '? ', text) 57 | text = re.sub(r'\s*!\s*', '! 
', text) 58 | text = re.sub(r'\s*$', '', text) 59 | return text 60 | -------------------------------------------------------------------------------- /text/cleaners.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def japanese_cleaners(text): 5 | from text.japanese import japanese_to_romaji_with_accent 6 | text = japanese_to_romaji_with_accent(text) 7 | text = re.sub(r'([A-Za-z])$', r'\1.', text) 8 | return text 9 | 10 | 11 | def japanese_cleaners2(text): 12 | return japanese_cleaners(text).replace('ts', 'ʦ').replace('...', '…') 13 | 14 | 15 | def korean_cleaners(text): 16 | '''Pipeline for Korean text''' 17 | from text.korean import latin_to_hangul, number_to_hangul, divide_hangul 18 | text = latin_to_hangul(text) 19 | text = number_to_hangul(text) 20 | text = divide_hangul(text) 21 | text = re.sub(r'([\u3131-\u3163])$', r'\1.', text) 22 | return text 23 | 24 | 25 | def chinese_cleaners(text): 26 | '''Pipeline for Chinese text''' 27 | from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo 28 | text = number_to_chinese(text) 29 | text = chinese_to_bopomofo(text) 30 | text = latin_to_bopomofo(text) 31 | text = re.sub(r'([ˉˊˇˋ˙])$', r'\1。', text) 32 | return text 33 | 34 | 35 | def zh_ja_mixture_cleaners(text): 36 | from text.mandarin import chinese_to_romaji 37 | from text.japanese import japanese_to_romaji_with_accent 38 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 39 | lambda x: chinese_to_romaji(x.group(1))+' ', text) 40 | text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_romaji_with_accent( 41 | x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')+' ', text) 42 | text = re.sub(r'\s+$', '', text) 43 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 44 | return text 45 | 46 | 47 | def sanskrit_cleaners(text): 48 | text = text.replace('॥', '।').replace('ॐ', 'ओम्') 49 | text = re.sub(r'([^।])$', r'\1।', text) 50 | return text 51 | 52 | 53 | def cjks_cleaners(text): 54 | from text.mandarin import chinese_to_lazy_ipa 55 | from text.japanese import japanese_to_ipa 56 | from text.korean import korean_to_lazy_ipa 57 | from text.sanskrit import devanagari_to_ipa 58 | from text.english import english_to_lazy_ipa 59 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 60 | lambda x: chinese_to_lazy_ipa(x.group(1))+' ', text) 61 | text = re.sub(r'\[JA\](.*?)\[JA\]', 62 | lambda x: japanese_to_ipa(x.group(1))+' ', text) 63 | text = re.sub(r'\[KO\](.*?)\[KO\]', 64 | lambda x: korean_to_lazy_ipa(x.group(1))+' ', text) 65 | text = re.sub(r'\[SA\](.*?)\[SA\]', 66 | lambda x: devanagari_to_ipa(x.group(1))+' ', text) 67 | text = re.sub(r'\[EN\](.*?)\[EN\]', 68 | lambda x: english_to_lazy_ipa(x.group(1))+' ', text) 69 | text = re.sub(r'\s+$', '', text) 70 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 71 | return text 72 | 73 | 74 | def cjke_cleaners(text): 75 | from text.mandarin import chinese_to_lazy_ipa 76 | from text.japanese import japanese_to_ipa 77 | from text.korean import korean_to_ipa 78 | from text.english import english_to_ipa2 79 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace( 80 | 'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')+' ', text) 81 | text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace( 82 | 'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')+' ', text) 83 | text = re.sub(r'\[KO\](.*?)\[KO\]', 84 | lambda x: korean_to_ipa(x.group(1))+' ', text) 85 | text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: 
english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace( 86 | 'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')+' ', text) 87 | text = re.sub(r'\s+$', '', text) 88 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 89 | return text 90 | 91 | 92 | def cjke_cleaners2(text): 93 | from text.mandarin import chinese_to_ipa 94 | from text.japanese import japanese_to_ipa2 95 | from text.korean import korean_to_ipa 96 | from text.english import english_to_ipa2 97 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 98 | lambda x: chinese_to_ipa(x.group(1))+' ', text) 99 | text = re.sub(r'\[JA\](.*?)\[JA\]', 100 | lambda x: japanese_to_ipa2(x.group(1))+' ', text) 101 | text = re.sub(r'\[KO\](.*?)\[KO\]', 102 | lambda x: korean_to_ipa(x.group(1))+' ', text) 103 | text = re.sub(r'\[EN\](.*?)\[EN\]', 104 | lambda x: english_to_ipa2(x.group(1))+' ', text) 105 | text = re.sub(r'\s+$', '', text) 106 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 107 | return text 108 | 109 | 110 | def thai_cleaners(text): 111 | from text.thai import num_to_thai, latin_to_thai 112 | text = num_to_thai(text) 113 | text = latin_to_thai(text) 114 | return text 115 | 116 | 117 | def shanghainese_cleaners(text): 118 | from text.shanghainese import shanghainese_to_ipa 119 | text = shanghainese_to_ipa(text) 120 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 121 | return text 122 | 123 | 124 | def chinese_dialect_cleaners(text): 125 | from text.mandarin import chinese_to_ipa2 126 | from text.japanese import japanese_to_ipa3 127 | from text.shanghainese import shanghainese_to_ipa 128 | from text.cantonese import cantonese_to_ipa 129 | from text.english import english_to_lazy_ipa2 130 | from text.ngu_dialect import ngu_dialect_to_ipa 131 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 132 | lambda x: chinese_to_ipa2(x.group(1))+' ', text) 133 | text = re.sub(r'\[JA\](.*?)\[JA\]', 134 | lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text) 135 | text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5', 136 | '˧˧˦').replace('6', '˩˩˧').replace('7', '˥').replace('8', '˩˨').replace('ᴀ', 'ɐ').replace('ᴇ', 'e')+' ', text) 137 | text = re.sub(r'\[GD\](.*?)\[GD\]', 138 | lambda x: cantonese_to_ipa(x.group(1))+' ', text) 139 | text = re.sub(r'\[EN\](.*?)\[EN\]', 140 | lambda x: english_to_lazy_ipa2(x.group(1))+' ', text) 141 | text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', lambda x: ngu_dialect_to_ipa(x.group(2), x.group( 142 | 1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ')+' ', text) 143 | text = re.sub(r'\s+$', '', text) 144 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 145 | return text 146 | -------------------------------------------------------------------------------- /text/english.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. 
"basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 13 | ''' 14 | 15 | 16 | # Regular expression matching whitespace: 17 | 18 | 19 | import re 20 | import inflect 21 | from unidecode import unidecode 22 | import eng_to_ipa as ipa 23 | _inflect = inflect.engine() 24 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 25 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 26 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 27 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 28 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 29 | _number_re = re.compile(r'[0-9]+') 30 | 31 | # List of (regular expression, replacement) pairs for abbreviations: 32 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ 33 | ('mrs', 'misess'), 34 | ('mr', 'mister'), 35 | ('dr', 'doctor'), 36 | ('st', 'saint'), 37 | ('co', 'company'), 38 | ('jr', 'junior'), 39 | ('maj', 'major'), 40 | ('gen', 'general'), 41 | ('drs', 'doctors'), 42 | ('rev', 'reverend'), 43 | ('lt', 'lieutenant'), 44 | ('hon', 'honorable'), 45 | ('sgt', 'sergeant'), 46 | ('capt', 'captain'), 47 | ('esq', 'esquire'), 48 | ('ltd', 'limited'), 49 | ('col', 'colonel'), 50 | ('ft', 'fort'), 51 | ]] 52 | 53 | 54 | # List of (ipa, lazy ipa) pairs: 55 | _lazy_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 56 | ('r', 'ɹ'), 57 | ('æ', 'e'), 58 | ('ɑ', 'a'), 59 | ('ɔ', 'o'), 60 | ('ð', 'z'), 61 | ('θ', 's'), 62 | ('ɛ', 'e'), 63 | ('ɪ', 'i'), 64 | ('ʊ', 'u'), 65 | ('ʒ', 'ʥ'), 66 | ('ʤ', 'ʥ'), 67 | ('ˈ', '↓'), 68 | ]] 69 | 70 | # List of (ipa, lazy ipa2) pairs: 71 | _lazy_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 72 | ('r', 'ɹ'), 73 | ('ð', 'z'), 74 | ('θ', 's'), 75 | ('ʒ', 'ʑ'), 76 | ('ʤ', 'dʑ'), 77 | ('ˈ', '↓'), 78 | ]] 79 | 80 | # List of (ipa, ipa2) pairs 81 | _ipa_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 82 | ('r', 'ɹ'), 83 | ('ʤ', 'dʒ'), 84 | ('ʧ', 'tʃ') 85 | ]] 86 | 87 | 88 | def expand_abbreviations(text): 89 | for regex, replacement in _abbreviations: 90 | text = re.sub(regex, replacement, text) 91 | return text 92 | 93 | 94 | def collapse_whitespace(text): 95 | return re.sub(r'\s+', ' ', text) 96 | 97 | 98 | def _remove_commas(m): 99 | return m.group(1).replace(',', '') 100 | 101 | 102 | def _expand_decimal_point(m): 103 | return m.group(1).replace('.', ' point ') 104 | 105 | 106 | def _expand_dollars(m): 107 | match = m.group(1) 108 | parts = match.split('.') 109 | if len(parts) > 2: 110 | return match + ' dollars' # Unexpected format 111 | dollars = int(parts[0]) if parts[0] else 0 112 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 113 | if dollars and cents: 114 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 115 | cent_unit = 'cent' if cents == 1 else 'cents' 116 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 117 | elif dollars: 118 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 119 | return '%s %s' % (dollars, dollar_unit) 120 | elif cents: 121 | cent_unit = 'cent' if cents == 1 else 'cents' 122 | return '%s %s' % (cents, cent_unit) 123 | else: 124 | return 'zero dollars' 125 | 126 | 127 | def _expand_ordinal(m): 128 | return _inflect.number_to_words(m.group(0)) 129 | 130 | 131 | def _expand_number(m): 132 | num = int(m.group(0)) 133 | if num > 1000 and num < 3000: 134 | if num == 2000: 135 | return 'two thousand' 136 | elif num > 2000 and num < 2010: 137 | return 'two thousand ' + _inflect.number_to_words(num % 100) 138 | elif num % 100 == 
0: 139 | return _inflect.number_to_words(num // 100) + ' hundred' 140 | else: 141 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 142 | else: 143 | return _inflect.number_to_words(num, andword='') 144 | 145 | 146 | def normalize_numbers(text): 147 | text = re.sub(_comma_number_re, _remove_commas, text) 148 | text = re.sub(_pounds_re, r'\1 pounds', text) 149 | text = re.sub(_dollars_re, _expand_dollars, text) 150 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 151 | text = re.sub(_ordinal_re, _expand_ordinal, text) 152 | text = re.sub(_number_re, _expand_number, text) 153 | return text 154 | 155 | 156 | def mark_dark_l(text): 157 | return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ'+x.group(1), text) 158 | 159 | 160 | def english_to_ipa(text): 161 | text = unidecode(text).lower() 162 | text = expand_abbreviations(text) 163 | text = normalize_numbers(text) 164 | phonemes = ipa.convert(text) 165 | phonemes = collapse_whitespace(phonemes) 166 | return phonemes 167 | 168 | 169 | def english_to_lazy_ipa(text): 170 | text = english_to_ipa(text) 171 | for regex, replacement in _lazy_ipa: 172 | text = re.sub(regex, replacement, text) 173 | return text 174 | 175 | 176 | def english_to_ipa2(text): 177 | text = english_to_ipa(text) 178 | text = mark_dark_l(text) 179 | for regex, replacement in _ipa_to_ipa2: 180 | text = re.sub(regex, replacement, text) 181 | return text.replace('...', '…') 182 | 183 | 184 | def english_to_lazy_ipa2(text): 185 | text = english_to_ipa(text) 186 | for regex, replacement in _lazy_ipa2: 187 | text = re.sub(regex, replacement, text) 188 | return text 189 | -------------------------------------------------------------------------------- /text/japanese.py: -------------------------------------------------------------------------------- 1 | import re 2 | from unidecode import unidecode 3 | import pyopenjtalk 4 | 5 | 6 | # Regular expression matching Japanese without punctuation marks: 7 | _japanese_characters = re.compile( 8 | r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 9 | 10 | # Regular expression matching non-Japanese characters or punctuation marks: 11 | _japanese_marks = re.compile( 12 | r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 13 | 14 | # List of (symbol, Japanese) pairs for marks: 15 | _symbols_to_japanese = [(re.compile('%s' % x[0]), x[1]) for x in [ 16 | ('%', 'パーセント') 17 | ]] 18 | 19 | # List of (romaji, ipa) pairs for marks: 20 | _romaji_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 21 | ('ts', 'ʦ'), 22 | ('u', 'ɯ'), 23 | ('j', 'ʥ'), 24 | ('y', 'j'), 25 | ('ni', 'n^i'), 26 | ('nj', 'n^'), 27 | ('hi', 'çi'), 28 | ('hj', 'ç'), 29 | ('f', 'ɸ'), 30 | ('I', 'i*'), 31 | ('U', 'ɯ*'), 32 | ('r', 'ɾ') 33 | ]] 34 | 35 | # List of (romaji, ipa2) pairs for marks: 36 | _romaji_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 37 | ('u', 'ɯ'), 38 | ('ʧ', 'tʃ'), 39 | ('j', 'dʑ'), 40 | ('y', 'j'), 41 | ('ni', 'n^i'), 42 | ('nj', 'n^'), 43 | ('hi', 'çi'), 44 | ('hj', 'ç'), 45 | ('f', 'ɸ'), 46 | ('I', 'i*'), 47 | ('U', 'ɯ*'), 48 | ('r', 'ɾ') 49 | ]] 50 | 51 | # List of (consonant, sokuon) pairs: 52 | _real_sokuon = [(re.compile('%s' % x[0]), x[1]) for x in [ 53 | (r'Q([↑↓]*[kg])', r'k#\1'), 54 | (r'Q([↑↓]*[tdjʧ])', r't#\1'), 55 | (r'Q([↑↓]*[sʃ])', r's\1'), 56 | (r'Q([↑↓]*[pb])', r'p#\1') 57 | ]] 58 | 59 | # List of (consonant, hatsuon) pairs: 60 | _real_hatsuon = [(re.compile('%s' % x[0]), x[1]) 
for x in [ 61 | (r'N([↑↓]*[pbm])', r'm\1'), 62 | (r'N([↑↓]*[ʧʥj])', r'n^\1'), 63 | (r'N([↑↓]*[tdn])', r'n\1'), 64 | (r'N([↑↓]*[kg])', r'ŋ\1') 65 | ]] 66 | 67 | 68 | def symbols_to_japanese(text): 69 | for regex, replacement in _symbols_to_japanese: 70 | text = re.sub(regex, replacement, text) 71 | return text 72 | 73 | 74 | def japanese_to_romaji_with_accent(text): 75 | '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html''' 76 | text = symbols_to_japanese(text) 77 | sentences = re.split(_japanese_marks, text) 78 | marks = re.findall(_japanese_marks, text) 79 | text = '' 80 | for i, sentence in enumerate(sentences): 81 | if re.match(_japanese_characters, sentence): 82 | if text != '': 83 | text += ' ' 84 | labels = pyopenjtalk.extract_fullcontext(sentence) 85 | for n, label in enumerate(labels): 86 | phoneme = re.search(r'\-([^\+]*)\+', label).group(1) 87 | if phoneme not in ['sil', 'pau']: 88 | text += phoneme.replace('ch', 'ʧ').replace('sh', 89 | 'ʃ').replace('cl', 'Q') 90 | else: 91 | continue 92 | # n_moras = int(re.search(r'/F:(\d+)_', label).group(1)) 93 | a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1)) 94 | a2 = int(re.search(r"\+(\d+)\+", label).group(1)) 95 | a3 = int(re.search(r"\+(\d+)/", label).group(1)) 96 | if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil', 'pau']: 97 | a2_next = -1 98 | else: 99 | a2_next = int( 100 | re.search(r"\+(\d+)\+", labels[n + 1]).group(1)) 101 | # Accent phrase boundary 102 | if a3 == 1 and a2_next == 1: 103 | text += ' ' 104 | # Falling 105 | elif a1 == 0 and a2_next == a2 + 1: 106 | text += '↓' 107 | # Rising 108 | elif a2 == 1 and a2_next == 2: 109 | text += '↑' 110 | if i < len(marks): 111 | text += unidecode(marks[i]).replace(' ', '') 112 | return text 113 | 114 | 115 | def get_real_sokuon(text): 116 | for regex, replacement in _real_sokuon: 117 | text = re.sub(regex, replacement, text) 118 | return text 119 | 120 | 121 | def get_real_hatsuon(text): 122 | for regex, replacement in _real_hatsuon: 123 | text = re.sub(regex, replacement, text) 124 | return text 125 | 126 | 127 | def japanese_to_ipa(text): 128 | text = japanese_to_romaji_with_accent(text).replace('...', '…') 129 | text = re.sub( 130 | r'([aiueo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text) 131 | text = get_real_sokuon(text) 132 | text = get_real_hatsuon(text) 133 | for regex, replacement in _romaji_to_ipa: 134 | text = re.sub(regex, replacement, text) 135 | return text 136 | 137 | 138 | def japanese_to_ipa2(text): 139 | text = japanese_to_romaji_with_accent(text).replace('...', '…') 140 | text = get_real_sokuon(text) 141 | text = get_real_hatsuon(text) 142 | for regex, replacement in _romaji_to_ipa2: 143 | text = re.sub(regex, replacement, text) 144 | return text 145 | 146 | 147 | def japanese_to_ipa3(text): 148 | text = japanese_to_ipa2(text).replace('n^', 'ȵ').replace( 149 | 'ʃ', 'ɕ').replace('*', '\u0325').replace('#', '\u031a') 150 | text = re.sub( 151 | r'([aiɯeo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text) 152 | text = re.sub(r'((?:^|\s)(?:ts|tɕ|[kpt]))', r'\1ʰ', text) 153 | return text 154 | -------------------------------------------------------------------------------- /text/korean.py: -------------------------------------------------------------------------------- 1 | import re 2 | from jamo import h2j, j2hcj 3 | import ko_pron 4 | 5 | 6 | # This is a list of Korean classifiers preceded by pure Korean numerals. 
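# Illustrative examples (based on number_to_hangul() below): numbers followed by one of these
# classifiers are read with native-Korean numerals, e.g. '3마리' -> '세마리', while other nouns
# fall back to sino-Korean readings, e.g. '3층' -> '삼층'.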
7 | _korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통' 8 | 9 | # List of (hangul, hangul divided) pairs: 10 | _hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [ 11 | ('ㄳ', 'ㄱㅅ'), 12 | ('ㄵ', 'ㄴㅈ'), 13 | ('ㄶ', 'ㄴㅎ'), 14 | ('ㄺ', 'ㄹㄱ'), 15 | ('ㄻ', 'ㄹㅁ'), 16 | ('ㄼ', 'ㄹㅂ'), 17 | ('ㄽ', 'ㄹㅅ'), 18 | ('ㄾ', 'ㄹㅌ'), 19 | ('ㄿ', 'ㄹㅍ'), 20 | ('ㅀ', 'ㄹㅎ'), 21 | ('ㅄ', 'ㅂㅅ'), 22 | ('ㅘ', 'ㅗㅏ'), 23 | ('ㅙ', 'ㅗㅐ'), 24 | ('ㅚ', 'ㅗㅣ'), 25 | ('ㅝ', 'ㅜㅓ'), 26 | ('ㅞ', 'ㅜㅔ'), 27 | ('ㅟ', 'ㅜㅣ'), 28 | ('ㅢ', 'ㅡㅣ'), 29 | ('ㅑ', 'ㅣㅏ'), 30 | ('ㅒ', 'ㅣㅐ'), 31 | ('ㅕ', 'ㅣㅓ'), 32 | ('ㅖ', 'ㅣㅔ'), 33 | ('ㅛ', 'ㅣㅗ'), 34 | ('ㅠ', 'ㅣㅜ') 35 | ]] 36 | 37 | # List of (Latin alphabet, hangul) pairs: 38 | _latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 39 | ('a', '에이'), 40 | ('b', '비'), 41 | ('c', '시'), 42 | ('d', '디'), 43 | ('e', '이'), 44 | ('f', '에프'), 45 | ('g', '지'), 46 | ('h', '에이치'), 47 | ('i', '아이'), 48 | ('j', '제이'), 49 | ('k', '케이'), 50 | ('l', '엘'), 51 | ('m', '엠'), 52 | ('n', '엔'), 53 | ('o', '오'), 54 | ('p', '피'), 55 | ('q', '큐'), 56 | ('r', '아르'), 57 | ('s', '에스'), 58 | ('t', '티'), 59 | ('u', '유'), 60 | ('v', '브이'), 61 | ('w', '더블유'), 62 | ('x', '엑스'), 63 | ('y', '와이'), 64 | ('z', '제트') 65 | ]] 66 | 67 | # List of (ipa, lazy ipa) pairs: 68 | _ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 69 | ('t͡ɕ','ʧ'), 70 | ('d͡ʑ','ʥ'), 71 | ('ɲ','n^'), 72 | ('ɕ','ʃ'), 73 | ('ʷ','w'), 74 | ('ɭ','l`'), 75 | ('ʎ','ɾ'), 76 | ('ɣ','ŋ'), 77 | ('ɰ','ɯ'), 78 | ('ʝ','j'), 79 | ('ʌ','ə'), 80 | ('ɡ','g'), 81 | ('\u031a','#'), 82 | ('\u0348','='), 83 | ('\u031e',''), 84 | ('\u0320',''), 85 | ('\u0339','') 86 | ]] 87 | 88 | 89 | def latin_to_hangul(text): 90 | for regex, replacement in _latin_to_hangul: 91 | text = re.sub(regex, replacement, text) 92 | return text 93 | 94 | 95 | def divide_hangul(text): 96 | text = j2hcj(h2j(text)) 97 | for regex, replacement in _hangul_divided: 98 | text = re.sub(regex, replacement, text) 99 | return text 100 | 101 | 102 | def hangul_number(num, sino=True): 103 | '''Reference https://github.com/Kyubyong/g2pK''' 104 | num = re.sub(',', '', num) 105 | 106 | if num == '0': 107 | return '영' 108 | if not sino and num == '20': 109 | return '스무' 110 | 111 | digits = '123456789' 112 | names = '일이삼사오육칠팔구' 113 | digit2name = {d: n for d, n in zip(digits, names)} 114 | 115 | modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉' 116 | decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔' 117 | digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())} 118 | digit2dec = {d: dec for d, dec in zip(digits, decimals.split())} 119 | 120 | spelledout = [] 121 | for i, digit in enumerate(num): 122 | i = len(num) - i - 1 123 | if sino: 124 | if i == 0: 125 | name = digit2name.get(digit, '') 126 | elif i == 1: 127 | name = digit2name.get(digit, '') + '십' 128 | name = name.replace('일십', '십') 129 | else: 130 | if i == 0: 131 | name = digit2mod.get(digit, '') 132 | elif i == 1: 133 | name = digit2dec.get(digit, '') 134 | if digit == '0': 135 | if i % 4 == 0: 136 | last_three = spelledout[-min(3, len(spelledout)):] 137 | if ''.join(last_three) == '': 138 | spelledout.append('') 139 | continue 140 | else: 141 | spelledout.append('') 142 | continue 143 | if i == 2: 144 | name = digit2name.get(digit, '') + '백' 145 | name = name.replace('일백', '백') 146 | elif i == 3: 147 | name = digit2name.get(digit, '') + '천' 148 | name = name.replace('일천', '천') 149 | elif i == 4: 150 | name = digit2name.get(digit, '') + '만' 151 | name = name.replace('일만', '만') 152 | elif i == 5: 
153 | name = digit2name.get(digit, '') + '십' 154 | name = name.replace('일십', '십') 155 | elif i == 6: 156 | name = digit2name.get(digit, '') + '백' 157 | name = name.replace('일백', '백') 158 | elif i == 7: 159 | name = digit2name.get(digit, '') + '천' 160 | name = name.replace('일천', '천') 161 | elif i == 8: 162 | name = digit2name.get(digit, '') + '억' 163 | elif i == 9: 164 | name = digit2name.get(digit, '') + '십' 165 | elif i == 10: 166 | name = digit2name.get(digit, '') + '백' 167 | elif i == 11: 168 | name = digit2name.get(digit, '') + '천' 169 | elif i == 12: 170 | name = digit2name.get(digit, '') + '조' 171 | elif i == 13: 172 | name = digit2name.get(digit, '') + '십' 173 | elif i == 14: 174 | name = digit2name.get(digit, '') + '백' 175 | elif i == 15: 176 | name = digit2name.get(digit, '') + '천' 177 | spelledout.append(name) 178 | return ''.join(elem for elem in spelledout) 179 | 180 | 181 | def number_to_hangul(text): 182 | '''Reference https://github.com/Kyubyong/g2pK''' 183 | tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text)) 184 | for token in tokens: 185 | num, classifier = token 186 | if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers: 187 | spelledout = hangul_number(num, sino=False) 188 | else: 189 | spelledout = hangul_number(num, sino=True) 190 | text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}') 191 | # digit by digit for remaining digits 192 | digits = '0123456789' 193 | names = '영일이삼사오육칠팔구' 194 | for d, n in zip(digits, names): 195 | text = text.replace(d, n) 196 | return text 197 | 198 | 199 | def korean_to_lazy_ipa(text): 200 | text = latin_to_hangul(text) 201 | text = number_to_hangul(text) 202 | text=re.sub('[\uac00-\ud7af]+',lambda x:ko_pron.romanise(x.group(0),'ipa').split('] ~ [')[0],text) 203 | for regex, replacement in _ipa_to_lazy_ipa: 204 | text = re.sub(regex, replacement, text) 205 | return text 206 | 207 | 208 | def korean_to_ipa(text): 209 | text = korean_to_lazy_ipa(text) 210 | return text.replace('ʧ','tʃ').replace('ʥ','dʑ') 211 | -------------------------------------------------------------------------------- /text/mandarin.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import re 4 | from pypinyin import lazy_pinyin, BOPOMOFO 5 | import jieba 6 | import cn2an 7 | import logging 8 | 9 | logging.getLogger('jieba').setLevel(logging.WARNING) 10 | jieba.set_dictionary(r'./jieba/dict.txt') 11 | jieba.initialize() 12 | 13 | 14 | # List of (Latin alphabet, bopomofo) pairs: 15 | _latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 16 | ('a', 'ㄟˉ'), 17 | ('b', 'ㄅㄧˋ'), 18 | ('c', 'ㄙㄧˉ'), 19 | ('d', 'ㄉㄧˋ'), 20 | ('e', 'ㄧˋ'), 21 | ('f', 'ㄝˊㄈㄨˋ'), 22 | ('g', 'ㄐㄧˋ'), 23 | ('h', 'ㄝˇㄑㄩˋ'), 24 | ('i', 'ㄞˋ'), 25 | ('j', 'ㄐㄟˋ'), 26 | ('k', 'ㄎㄟˋ'), 27 | ('l', 'ㄝˊㄛˋ'), 28 | ('m', 'ㄝˊㄇㄨˋ'), 29 | ('n', 'ㄣˉ'), 30 | ('o', 'ㄡˉ'), 31 | ('p', 'ㄆㄧˉ'), 32 | ('q', 'ㄎㄧㄡˉ'), 33 | ('r', 'ㄚˋ'), 34 | ('s', 'ㄝˊㄙˋ'), 35 | ('t', 'ㄊㄧˋ'), 36 | ('u', 'ㄧㄡˉ'), 37 | ('v', 'ㄨㄧˉ'), 38 | ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'), 39 | ('x', 'ㄝˉㄎㄨˋㄙˋ'), 40 | ('y', 'ㄨㄞˋ'), 41 | ('z', 'ㄗㄟˋ') 42 | ]] 43 | 44 | # List of (bopomofo, romaji) pairs: 45 | _bopomofo_to_romaji = [(re.compile('%s' % x[0]), x[1]) for x in [ 46 | ('ㄅㄛ', 'p⁼wo'), 47 | ('ㄆㄛ', 'pʰwo'), 48 | ('ㄇㄛ', 'mwo'), 49 | ('ㄈㄛ', 'fwo'), 50 | ('ㄅ', 'p⁼'), 51 | ('ㄆ', 'pʰ'), 52 | ('ㄇ', 'm'), 53 | ('ㄈ', 'f'), 54 | ('ㄉ', 't⁼'), 55 | ('ㄊ', 'tʰ'), 56 | ('ㄋ', 'n'), 57 | ('ㄌ', 'l'), 58 | ('ㄍ', 'k⁼'), 59 | ('ㄎ', 'kʰ'), 60 | ('ㄏ', 
'h'), 61 | ('ㄐ', 'ʧ⁼'), 62 | ('ㄑ', 'ʧʰ'), 63 | ('ㄒ', 'ʃ'), 64 | ('ㄓ', 'ʦ`⁼'), 65 | ('ㄔ', 'ʦ`ʰ'), 66 | ('ㄕ', 's`'), 67 | ('ㄖ', 'ɹ`'), 68 | ('ㄗ', 'ʦ⁼'), 69 | ('ㄘ', 'ʦʰ'), 70 | ('ㄙ', 's'), 71 | ('ㄚ', 'a'), 72 | ('ㄛ', 'o'), 73 | ('ㄜ', 'ə'), 74 | ('ㄝ', 'e'), 75 | ('ㄞ', 'ai'), 76 | ('ㄟ', 'ei'), 77 | ('ㄠ', 'au'), 78 | ('ㄡ', 'ou'), 79 | ('ㄧㄢ', 'yeNN'), 80 | ('ㄢ', 'aNN'), 81 | ('ㄧㄣ', 'iNN'), 82 | ('ㄣ', 'əNN'), 83 | ('ㄤ', 'aNg'), 84 | ('ㄧㄥ', 'iNg'), 85 | ('ㄨㄥ', 'uNg'), 86 | ('ㄩㄥ', 'yuNg'), 87 | ('ㄥ', 'əNg'), 88 | ('ㄦ', 'əɻ'), 89 | ('ㄧ', 'i'), 90 | ('ㄨ', 'u'), 91 | ('ㄩ', 'ɥ'), 92 | ('ˉ', '→'), 93 | ('ˊ', '↑'), 94 | ('ˇ', '↓↑'), 95 | ('ˋ', '↓'), 96 | ('˙', ''), 97 | (',', ','), 98 | ('。', '.'), 99 | ('!', '!'), 100 | ('?', '?'), 101 | ('—', '-') 102 | ]] 103 | 104 | # List of (romaji, ipa) pairs: 105 | _romaji_to_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 106 | ('ʃy', 'ʃ'), 107 | ('ʧʰy', 'ʧʰ'), 108 | ('ʧ⁼y', 'ʧ⁼'), 109 | ('NN', 'n'), 110 | ('Ng', 'ŋ'), 111 | ('y', 'j'), 112 | ('h', 'x') 113 | ]] 114 | 115 | # List of (bopomofo, ipa) pairs: 116 | _bopomofo_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 117 | ('ㄅㄛ', 'p⁼wo'), 118 | ('ㄆㄛ', 'pʰwo'), 119 | ('ㄇㄛ', 'mwo'), 120 | ('ㄈㄛ', 'fwo'), 121 | ('ㄅ', 'p⁼'), 122 | ('ㄆ', 'pʰ'), 123 | ('ㄇ', 'm'), 124 | ('ㄈ', 'f'), 125 | ('ㄉ', 't⁼'), 126 | ('ㄊ', 'tʰ'), 127 | ('ㄋ', 'n'), 128 | ('ㄌ', 'l'), 129 | ('ㄍ', 'k⁼'), 130 | ('ㄎ', 'kʰ'), 131 | ('ㄏ', 'x'), 132 | ('ㄐ', 'tʃ⁼'), 133 | ('ㄑ', 'tʃʰ'), 134 | ('ㄒ', 'ʃ'), 135 | ('ㄓ', 'ts`⁼'), 136 | ('ㄔ', 'ts`ʰ'), 137 | ('ㄕ', 's`'), 138 | ('ㄖ', 'ɹ`'), 139 | ('ㄗ', 'ts⁼'), 140 | ('ㄘ', 'tsʰ'), 141 | ('ㄙ', 's'), 142 | ('ㄚ', 'a'), 143 | ('ㄛ', 'o'), 144 | ('ㄜ', 'ə'), 145 | ('ㄝ', 'ɛ'), 146 | ('ㄞ', 'aɪ'), 147 | ('ㄟ', 'eɪ'), 148 | ('ㄠ', 'ɑʊ'), 149 | ('ㄡ', 'oʊ'), 150 | ('ㄧㄢ', 'jɛn'), 151 | ('ㄩㄢ', 'ɥæn'), 152 | ('ㄢ', 'an'), 153 | ('ㄧㄣ', 'in'), 154 | ('ㄩㄣ', 'ɥn'), 155 | ('ㄣ', 'ən'), 156 | ('ㄤ', 'ɑŋ'), 157 | ('ㄧㄥ', 'iŋ'), 158 | ('ㄨㄥ', 'ʊŋ'), 159 | ('ㄩㄥ', 'jʊŋ'), 160 | ('ㄥ', 'əŋ'), 161 | ('ㄦ', 'əɻ'), 162 | ('ㄧ', 'i'), 163 | ('ㄨ', 'u'), 164 | ('ㄩ', 'ɥ'), 165 | ('ˉ', '→'), 166 | ('ˊ', '↑'), 167 | ('ˇ', '↓↑'), 168 | ('ˋ', '↓'), 169 | ('˙', ''), 170 | (',', ','), 171 | ('。', '.'), 172 | ('!', '!'), 173 | ('?', '?'), 174 | ('—', '-') 175 | ]] 176 | 177 | # List of (bopomofo, ipa2) pairs: 178 | _bopomofo_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 179 | ('ㄅㄛ', 'pwo'), 180 | ('ㄆㄛ', 'pʰwo'), 181 | ('ㄇㄛ', 'mwo'), 182 | ('ㄈㄛ', 'fwo'), 183 | ('ㄅ', 'p'), 184 | ('ㄆ', 'pʰ'), 185 | ('ㄇ', 'm'), 186 | ('ㄈ', 'f'), 187 | ('ㄉ', 't'), 188 | ('ㄊ', 'tʰ'), 189 | ('ㄋ', 'n'), 190 | ('ㄌ', 'l'), 191 | ('ㄍ', 'k'), 192 | ('ㄎ', 'kʰ'), 193 | ('ㄏ', 'h'), 194 | ('ㄐ', 'tɕ'), 195 | ('ㄑ', 'tɕʰ'), 196 | ('ㄒ', 'ɕ'), 197 | ('ㄓ', 'tʂ'), 198 | ('ㄔ', 'tʂʰ'), 199 | ('ㄕ', 'ʂ'), 200 | ('ㄖ', 'ɻ'), 201 | ('ㄗ', 'ts'), 202 | ('ㄘ', 'tsʰ'), 203 | ('ㄙ', 's'), 204 | ('ㄚ', 'a'), 205 | ('ㄛ', 'o'), 206 | ('ㄜ', 'ɤ'), 207 | ('ㄝ', 'ɛ'), 208 | ('ㄞ', 'aɪ'), 209 | ('ㄟ', 'eɪ'), 210 | ('ㄠ', 'ɑʊ'), 211 | ('ㄡ', 'oʊ'), 212 | ('ㄧㄢ', 'jɛn'), 213 | ('ㄩㄢ', 'yæn'), 214 | ('ㄢ', 'an'), 215 | ('ㄧㄣ', 'in'), 216 | ('ㄩㄣ', 'yn'), 217 | ('ㄣ', 'ən'), 218 | ('ㄤ', 'ɑŋ'), 219 | ('ㄧㄥ', 'iŋ'), 220 | ('ㄨㄥ', 'ʊŋ'), 221 | ('ㄩㄥ', 'jʊŋ'), 222 | ('ㄥ', 'ɤŋ'), 223 | ('ㄦ', 'əɻ'), 224 | ('ㄧ', 'i'), 225 | ('ㄨ', 'u'), 226 | ('ㄩ', 'y'), 227 | ('ˉ', '˥'), 228 | ('ˊ', '˧˥'), 229 | ('ˇ', '˨˩˦'), 230 | ('ˋ', '˥˩'), 231 | ('˙', ''), 232 | (',', ','), 233 | ('。', '.'), 234 | ('!', '!'), 235 | ('?', '?'), 236 | ('—', '-') 237 | ]] 238 | 239 | 240 | def number_to_chinese(text): 241 | numbers = 
re.findall(r'\d+(?:\.?\d+)?', text) 242 | for number in numbers: 243 | text = text.replace(number, cn2an.an2cn(number), 1) 244 | return text 245 | 246 | 247 | def chinese_to_bopomofo(text): 248 | text = text.replace('、', ',').replace(';', ',').replace(':', ',') 249 | words = jieba.lcut(text, cut_all=False) 250 | text = '' 251 | for word in words: 252 | bopomofos = lazy_pinyin(word, BOPOMOFO) 253 | if not re.search('[\u4e00-\u9fff]', word): 254 | text += word 255 | continue 256 | for i in range(len(bopomofos)): 257 | bopomofos[i] = re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i]) 258 | if text != '': 259 | text += ' ' 260 | text += ''.join(bopomofos) 261 | return text 262 | 263 | 264 | def latin_to_bopomofo(text): 265 | for regex, replacement in _latin_to_bopomofo: 266 | text = re.sub(regex, replacement, text) 267 | return text 268 | 269 | 270 | def bopomofo_to_romaji(text): 271 | for regex, replacement in _bopomofo_to_romaji: 272 | text = re.sub(regex, replacement, text) 273 | return text 274 | 275 | 276 | def bopomofo_to_ipa(text): 277 | for regex, replacement in _bopomofo_to_ipa: 278 | text = re.sub(regex, replacement, text) 279 | return text 280 | 281 | 282 | def bopomofo_to_ipa2(text): 283 | for regex, replacement in _bopomofo_to_ipa2: 284 | text = re.sub(regex, replacement, text) 285 | return text 286 | 287 | 288 | def chinese_to_romaji(text): 289 | text = number_to_chinese(text) 290 | text = chinese_to_bopomofo(text) 291 | text = latin_to_bopomofo(text) 292 | text = bopomofo_to_romaji(text) 293 | text = re.sub('i([aoe])', r'y\1', text) 294 | text = re.sub('u([aoəe])', r'w\1', text) 295 | text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', 296 | r'\1ɹ`\2', text).replace('ɻ', 'ɹ`') 297 | text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text) 298 | return text 299 | 300 | 301 | def chinese_to_lazy_ipa(text): 302 | text = chinese_to_romaji(text) 303 | for regex, replacement in _romaji_to_ipa: 304 | text = re.sub(regex, replacement, text) 305 | return text 306 | 307 | 308 | def chinese_to_ipa(text): 309 | text = number_to_chinese(text) 310 | text = chinese_to_bopomofo(text) 311 | text = latin_to_bopomofo(text) 312 | text = bopomofo_to_ipa(text) 313 | text = re.sub('i([aoe])', r'j\1', text) 314 | text = re.sub('u([aoəe])', r'w\1', text) 315 | text = re.sub('([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', 316 | r'\1ɹ`\2', text).replace('ɻ', 'ɹ`') 317 | text = re.sub('([s][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text) 318 | return text 319 | 320 | 321 | def chinese_to_ipa2(text): 322 | text = number_to_chinese(text) 323 | text = chinese_to_bopomofo(text) 324 | text = latin_to_bopomofo(text) 325 | text = bopomofo_to_ipa2(text) 326 | text = re.sub(r'i([aoe])', r'j\1', text) 327 | text = re.sub(r'u([aoəe])', r'w\1', text) 328 | text = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2', text) 329 | text = re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', text) 330 | return text 331 | -------------------------------------------------------------------------------- /text/ngu_dialect.py: -------------------------------------------------------------------------------- 1 | import re 2 | import opencc 3 | 4 | 5 | dialects = {'SZ': 'suzhou', 'WX': 'wuxi', 'CZ': 'changzhou', 'HZ': 'hangzhou', 6 | 'SX': 'shaoxing', 'NB': 'ningbo', 'JJ': 'jingjiang', 'YX': 'yixing', 7 | 'JD': 'jiading', 'ZR': 'zhenru', 'PH': 'pinghu', 'TX': 'tongxiang', 8 | 'JS': 'jiashan', 'HN': 'xiashi', 'LP': 'linping', 'XS': 'xiaoshan', 9 | 'FY': 'fuyang', 'RA': 'ruao', 'CX': 'cixi', 'SM': 'sanmen', 10 | 'TT': 'tiantai', 'WZ': 'wenzhou', 'SC': 'suichang', 'YB': 'youbu'} 11 | 12 | converters 
= {} 13 | 14 | for dialect in dialects.values(): 15 | try: 16 | converters[dialect] = opencc.OpenCC(dialect) 17 | except: 18 | pass 19 | 20 | 21 | def ngu_dialect_to_ipa(text, dialect): 22 | dialect = dialects[dialect] 23 | text = converters[dialect].convert(text).replace('-','').replace('$',' ') 24 | text = re.sub(r'[、;:]', ',', text) 25 | text = re.sub(r'\s*,\s*', ', ', text) 26 | text = re.sub(r'\s*。\s*', '. ', text) 27 | text = re.sub(r'\s*?\s*', '? ', text) 28 | text = re.sub(r'\s*!\s*', '! ', text) 29 | text = re.sub(r'\s*$', '', text) 30 | return text 31 | -------------------------------------------------------------------------------- /text/sanskrit.py: -------------------------------------------------------------------------------- 1 | import re 2 | from indic_transliteration import sanscript 3 | 4 | 5 | # List of (iast, ipa) pairs: 6 | _iast_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 7 | ('a', 'ə'), 8 | ('ā', 'aː'), 9 | ('ī', 'iː'), 10 | ('ū', 'uː'), 11 | ('ṛ', 'ɹ`'), 12 | ('ṝ', 'ɹ`ː'), 13 | ('ḷ', 'l`'), 14 | ('ḹ', 'l`ː'), 15 | ('e', 'eː'), 16 | ('o', 'oː'), 17 | ('k', 'k⁼'), 18 | ('k⁼h', 'kʰ'), 19 | ('g', 'g⁼'), 20 | ('g⁼h', 'gʰ'), 21 | ('ṅ', 'ŋ'), 22 | ('c', 'ʧ⁼'), 23 | ('ʧ⁼h', 'ʧʰ'), 24 | ('j', 'ʥ⁼'), 25 | ('ʥ⁼h', 'ʥʰ'), 26 | ('ñ', 'n^'), 27 | ('ṭ', 't`⁼'), 28 | ('t`⁼h', 't`ʰ'), 29 | ('ḍ', 'd`⁼'), 30 | ('d`⁼h', 'd`ʰ'), 31 | ('ṇ', 'n`'), 32 | ('t', 't⁼'), 33 | ('t⁼h', 'tʰ'), 34 | ('d', 'd⁼'), 35 | ('d⁼h', 'dʰ'), 36 | ('p', 'p⁼'), 37 | ('p⁼h', 'pʰ'), 38 | ('b', 'b⁼'), 39 | ('b⁼h', 'bʰ'), 40 | ('y', 'j'), 41 | ('ś', 'ʃ'), 42 | ('ṣ', 's`'), 43 | ('r', 'ɾ'), 44 | ('l̤', 'l`'), 45 | ('h', 'ɦ'), 46 | ("'", ''), 47 | ('~', '^'), 48 | ('ṃ', '^') 49 | ]] 50 | 51 | 52 | def devanagari_to_ipa(text): 53 | text = text.replace('ॐ', 'ओम्') 54 | text = re.sub(r'\s*।\s*$', '.', text) 55 | text = re.sub(r'\s*।\s*', ', ', text) 56 | text = re.sub(r'\s*॥', '.', text) 57 | text = sanscript.transliterate(text, sanscript.DEVANAGARI, sanscript.IAST) 58 | for regex, replacement in _iast_to_ipa: 59 | text = re.sub(regex, replacement, text) 60 | text = re.sub('(.)[`ː]*ḥ', lambda x: x.group(0) 61 | [:-1]+'h'+x.group(1)+'*', text) 62 | return text 63 | -------------------------------------------------------------------------------- /text/shanghainese.py: -------------------------------------------------------------------------------- 1 | import re 2 | import cn2an 3 | import opencc 4 | 5 | 6 | converter = opencc.OpenCC('zaonhe') 7 | 8 | # List of (Latin alphabet, ipa) pairs: 9 | _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 10 | ('A', 'ᴇ'), 11 | ('B', 'bi'), 12 | ('C', 'si'), 13 | ('D', 'di'), 14 | ('E', 'i'), 15 | ('F', 'ᴇf'), 16 | ('G', 'dʑi'), 17 | ('H', 'ᴇtɕʰ'), 18 | ('I', 'ᴀi'), 19 | ('J', 'dʑᴇ'), 20 | ('K', 'kʰᴇ'), 21 | ('L', 'ᴇl'), 22 | ('M', 'ᴇm'), 23 | ('N', 'ᴇn'), 24 | ('O', 'o'), 25 | ('P', 'pʰi'), 26 | ('Q', 'kʰiu'), 27 | ('R', 'ᴀl'), 28 | ('S', 'ᴇs'), 29 | ('T', 'tʰi'), 30 | ('U', 'ɦiu'), 31 | ('V', 'vi'), 32 | ('W', 'dᴀbɤliu'), 33 | ('X', 'ᴇks'), 34 | ('Y', 'uᴀi'), 35 | ('Z', 'zᴇ') 36 | ]] 37 | 38 | 39 | def _number_to_shanghainese(num): 40 | num = cn2an.an2cn(num).replace('一十','十').replace('二十', '廿').replace('二', '两') 41 | return re.sub(r'((?:^|[^三四五六七八九])十|廿)两', r'\1二', num) 42 | 43 | 44 | def number_to_shanghainese(text): 45 | return re.sub(r'\d+(?:\.?\d+)?', lambda x: _number_to_shanghainese(x.group()), text) 46 | 47 | 48 | def latin_to_ipa(text): 49 | for regex, replacement in _latin_to_ipa: 50 | text = re.sub(regex, replacement, text) 51 | return text 52 | 53 | 
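# shanghainese_to_ipa (below) chains the helpers defined above: numbers are
# spelled out in Shanghainese, the text is converted through the OpenCC
# 'zaonhe' scheme, any remaining Latin letters are mapped to IPA, and CJK
# punctuation is normalised to its ASCII equivalents.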
54 | def shanghainese_to_ipa(text): 55 | text = number_to_shanghainese(text.upper()) 56 | text = converter.convert(text).replace('-','').replace('$',' ') 57 | text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text) 58 | text = re.sub(r'[、;:]', ',', text) 59 | text = re.sub(r'\s*,\s*', ', ', text) 60 | text = re.sub(r'\s*。\s*', '. ', text) 61 | text = re.sub(r'\s*?\s*', '? ', text) 62 | text = re.sub(r'\s*!\s*', '! ', text) 63 | text = re.sub(r'\s*$', '', text) 64 | return text 65 | -------------------------------------------------------------------------------- /text/thai.py: -------------------------------------------------------------------------------- 1 | import re 2 | from num_thai.thainumbers import NumThai 3 | 4 | 5 | num = NumThai() 6 | 7 | # List of (Latin alphabet, Thai) pairs: 8 | _latin_to_thai = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 9 | ('a', 'เอ'), 10 | ('b','บี'), 11 | ('c','ซี'), 12 | ('d','ดี'), 13 | ('e','อี'), 14 | ('f','เอฟ'), 15 | ('g','จี'), 16 | ('h','เอช'), 17 | ('i','ไอ'), 18 | ('j','เจ'), 19 | ('k','เค'), 20 | ('l','แอล'), 21 | ('m','เอ็ม'), 22 | ('n','เอ็น'), 23 | ('o','โอ'), 24 | ('p','พี'), 25 | ('q','คิว'), 26 | ('r','แอร์'), 27 | ('s','เอส'), 28 | ('t','ที'), 29 | ('u','ยู'), 30 | ('v','วี'), 31 | ('w','ดับเบิลยู'), 32 | ('x','เอ็กซ์'), 33 | ('y','วาย'), 34 | ('z','ซี') 35 | ]] 36 | 37 | 38 | def num_to_thai(text): 39 | return re.sub(r'(?:\d+(?:,?\d+)?)+(?:\.\d+(?:,?\d+)?)?', lambda x: ''.join(num.NumberToTextThai(float(x.group(0).replace(',', '')))), text) 40 | 41 | def latin_to_thai(text): 42 | for regex, replacement in _latin_to_thai: 43 | text = re.sub(regex, replacement, text) 44 | return text 45 | -------------------------------------------------------------------------------- /transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | import numpy as np 5 | 6 | 7 | DEFAULT_MIN_BIN_WIDTH = 1e-3 8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3 9 | DEFAULT_MIN_DERIVATIVE = 1e-3 10 | 11 | 12 | def piecewise_rational_quadratic_transform(inputs, 13 | unnormalized_widths, 14 | unnormalized_heights, 15 | unnormalized_derivatives, 16 | inverse=False, 17 | tails=None, 18 | tail_bound=1., 19 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 20 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 21 | min_derivative=DEFAULT_MIN_DERIVATIVE): 22 | 23 | if tails is None: 24 | spline_fn = rational_quadratic_spline 25 | spline_kwargs = {} 26 | else: 27 | spline_fn = unconstrained_rational_quadratic_spline 28 | spline_kwargs = { 29 | 'tails': tails, 30 | 'tail_bound': tail_bound 31 | } 32 | 33 | outputs, logabsdet = spline_fn( 34 | inputs=inputs, 35 | unnormalized_widths=unnormalized_widths, 36 | unnormalized_heights=unnormalized_heights, 37 | unnormalized_derivatives=unnormalized_derivatives, 38 | inverse=inverse, 39 | min_bin_width=min_bin_width, 40 | min_bin_height=min_bin_height, 41 | min_derivative=min_derivative, 42 | **spline_kwargs 43 | ) 44 | return outputs, logabsdet 45 | 46 | 47 | def searchsorted(bin_locations, inputs, eps=1e-6): 48 | bin_locations[..., -1] += eps 49 | return torch.sum( 50 | inputs[..., None] >= bin_locations, 51 | dim=-1 52 | ) - 1 53 | 54 | 55 | def unconstrained_rational_quadratic_spline(inputs, 56 | unnormalized_widths, 57 | unnormalized_heights, 58 | unnormalized_derivatives, 59 | inverse=False, 60 | tails='linear', 61 | tail_bound=1., 62 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 63 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 64 | 
min_derivative=DEFAULT_MIN_DERIVATIVE): 65 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) 66 | outside_interval_mask = ~inside_interval_mask 67 | 68 | outputs = torch.zeros_like(inputs) 69 | logabsdet = torch.zeros_like(inputs) 70 | 71 | if tails == 'linear': 72 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) 73 | constant = np.log(np.exp(1 - min_derivative) - 1) 74 | unnormalized_derivatives[..., 0] = constant 75 | unnormalized_derivatives[..., -1] = constant 76 | 77 | outputs[outside_interval_mask] = inputs[outside_interval_mask] 78 | logabsdet[outside_interval_mask] = 0 79 | else: 80 | raise RuntimeError('{} tails are not implemented.'.format(tails)) 81 | 82 | outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline( 83 | inputs=inputs[inside_interval_mask], 84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :], 85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :], 86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], 87 | inverse=inverse, 88 | left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound, 89 | min_bin_width=min_bin_width, 90 | min_bin_height=min_bin_height, 91 | min_derivative=min_derivative 92 | ) 93 | 94 | return outputs, logabsdet 95 | 96 | def rational_quadratic_spline(inputs, 97 | unnormalized_widths, 98 | unnormalized_heights, 99 | unnormalized_derivatives, 100 | inverse=False, 101 | left=0., right=1., bottom=0., top=1., 102 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 103 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 104 | min_derivative=DEFAULT_MIN_DERIVATIVE): 105 | if torch.min(inputs) < left or torch.max(inputs) > right: 106 | raise ValueError('Input to a transform is not within its domain') 107 | 108 | num_bins = unnormalized_widths.shape[-1] 109 | 110 | if min_bin_width * num_bins > 1.0: 111 | raise ValueError('Minimal bin width too large for the number of bins') 112 | if min_bin_height * num_bins > 1.0: 113 | raise ValueError('Minimal bin height too large for the number of bins') 114 | 115 | widths = F.softmax(unnormalized_widths, dim=-1) 116 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths 117 | cumwidths = torch.cumsum(widths, dim=-1) 118 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0) 119 | cumwidths = (right - left) * cumwidths + left 120 | cumwidths[..., 0] = left 121 | cumwidths[..., -1] = right 122 | widths = cumwidths[..., 1:] - cumwidths[..., :-1] 123 | 124 | derivatives = min_derivative + F.softplus(unnormalized_derivatives) 125 | 126 | heights = F.softmax(unnormalized_heights, dim=-1) 127 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights 128 | cumheights = torch.cumsum(heights, dim=-1) 129 | cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0) 130 | cumheights = (top - bottom) * cumheights + bottom 131 | cumheights[..., 0] = bottom 132 | cumheights[..., -1] = top 133 | heights = cumheights[..., 1:] - cumheights[..., :-1] 134 | 135 | if inverse: 136 | bin_idx = searchsorted(cumheights, inputs)[..., None] 137 | else: 138 | bin_idx = searchsorted(cumwidths, inputs)[..., None] 139 | 140 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] 141 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0] 142 | 143 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] 144 | delta = heights / widths 145 | input_delta = delta.gather(-1, bin_idx)[..., 0] 146 | 147 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] 
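    # Gather the derivative at the upper knot of each selected bin; together with
    # `input_derivatives` these provide the d_k and d_{k+1} terms of the monotonic
    # rational-quadratic spline (Durkan et al., "Neural Spline Flows").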
148 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] 149 | 150 | input_heights = heights.gather(-1, bin_idx)[..., 0] 151 | 152 | if inverse: 153 | a = (((inputs - input_cumheights) * (input_derivatives 154 | + input_derivatives_plus_one 155 | - 2 * input_delta) 156 | + input_heights * (input_delta - input_derivatives))) 157 | b = (input_heights * input_derivatives 158 | - (inputs - input_cumheights) * (input_derivatives 159 | + input_derivatives_plus_one 160 | - 2 * input_delta)) 161 | c = - input_delta * (inputs - input_cumheights) 162 | 163 | discriminant = b.pow(2) - 4 * a * c 164 | assert (discriminant >= 0).all() 165 | 166 | root = (2 * c) / (-b - torch.sqrt(discriminant)) 167 | outputs = root * input_bin_widths + input_cumwidths 168 | 169 | theta_one_minus_theta = root * (1 - root) 170 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 171 | * theta_one_minus_theta) 172 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2) 173 | + 2 * input_delta * theta_one_minus_theta 174 | + input_derivatives * (1 - root).pow(2)) 175 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 176 | 177 | return outputs, -logabsdet 178 | else: 179 | theta = (inputs - input_cumwidths) / input_bin_widths 180 | theta_one_minus_theta = theta * (1 - theta) 181 | 182 | numerator = input_heights * (input_delta * theta.pow(2) 183 | + input_derivatives * theta_one_minus_theta) 184 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 185 | * theta_one_minus_theta) 186 | outputs = input_cumheights + numerator / denominator 187 | 188 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2) 189 | + 2 * input_delta * theta_one_minus_theta 190 | + input_derivatives * (1 - theta).pow(2)) 191 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 192 | 193 | return outputs, logabsdet 194 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from json import loads 3 | from torch import load, FloatTensor 4 | from numpy import float32 5 | import librosa 6 | 7 | 8 | class HParams(): 9 | def __init__(self, **kwargs): 10 | for k, v in kwargs.items(): 11 | if type(v) == dict: 12 | v = HParams(**v) 13 | self[k] = v 14 | 15 | def keys(self): 16 | return self.__dict__.keys() 17 | 18 | def items(self): 19 | return self.__dict__.items() 20 | 21 | def values(self): 22 | return self.__dict__.values() 23 | 24 | def __len__(self): 25 | return len(self.__dict__) 26 | 27 | def __getitem__(self, key): 28 | return getattr(self, key) 29 | 30 | def __setitem__(self, key, value): 31 | return setattr(self, key, value) 32 | 33 | def __contains__(self, key): 34 | return key in self.__dict__ 35 | 36 | def __repr__(self): 37 | return self.__dict__.__repr__() 38 | 39 | 40 | def load_checkpoint(checkpoint_path, model): 41 | checkpoint_dict = load(checkpoint_path, map_location='cpu') 42 | iteration = checkpoint_dict['iteration'] 43 | saved_state_dict = checkpoint_dict['model'] 44 | if hasattr(model, 'module'): 45 | state_dict = model.module.state_dict() 46 | else: 47 | state_dict = model.state_dict() 48 | new_state_dict= {} 49 | for k, v in state_dict.items(): 50 | try: 51 | new_state_dict[k] = saved_state_dict[k] 52 | except: 53 | logging.info("%s is not in the checkpoint" % k) 54 | 
new_state_dict[k] = v 55 | if hasattr(model, 'module'): 56 | model.module.load_state_dict(new_state_dict) 57 | else: 58 | model.load_state_dict(new_state_dict) 59 | logging.info("Loaded checkpoint '{}' (iteration {})" .format( 60 | checkpoint_path, iteration)) 61 | return 62 | 63 | 64 | def get_hparams_from_file(config_path): 65 | with open(config_path, "r") as f: 66 | data = f.read() 67 | config = loads(data) 68 | 69 | hparams = HParams(**config) 70 | return hparams 71 | 72 | 73 | def load_audio_to_torch(full_path, target_sampling_rate): 74 | audio, sampling_rate = librosa.load(full_path, sr=target_sampling_rate, mono=True) 75 | return FloatTensor(audio.astype(float32)) 76 | --------------------------------------------------------------------------------
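# A minimal usage sketch for the helpers in utils.py. It assumes that models.py
# defines the VITS generator `SynthesizerTrn` and that the config follows the
# usual VITS layout (hps.symbols, hps.data, hps.train, hps.model); the paths
# below are placeholders rather than files shipped with the repository.
import utils
from models import SynthesizerTrn  # assumed: VITS generator defined in models.py

hps = utils.get_hparams_from_file('./model/config.json')    # placeholder path
net_g = SynthesizerTrn(
    len(hps.symbols),                                       # assumed config layout
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model)
net_g.eval()
utils.load_checkpoint('./model/model.pth', net_g)           # placeholder path

# Resample a wav to the model's sampling rate and wrap it in a FloatTensor.
audio = utils.load_audio_to_torch('./sample.wav', hps.data.sampling_rate)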