├── .gitattributes
├── .github
└── workflows
│ └── build-validation.yml
├── .gitignore
├── .vscode
├── launch.json
└── tasks.json
├── Examples
└── Godot
│ └── README.md
├── LICENSE
├── NeMoOnnxSharp.Example
├── ModelDownloader.cs
├── NeMoOnnxSharp.Example.csproj
├── PretrainedModelInfo.cs
└── Program.cs
├── NeMoOnnxSharp.Tests
├── AudioFeatureBufferTest.cs
├── Data
│ ├── 61-70968-0000-mod.wav
│ ├── 61-70968-0000.wav
│ ├── cmudict-test
│ ├── heteronyms-test
│ ├── mel_spectrogram.bin
│ └── mfcc.bin
├── FFTTest.cs
├── NeMoOnnxSharp.Tests.csproj
├── PreprocessorTest.cs
├── TextTokenizersTest.cs
└── WaveFileTest.cs
├── NeMoOnnxSharp.sln
├── NeMoOnnxSharp
├── AudioPreprocessing
│ ├── AudioFeatureBuffer.cs
│ ├── AudioToMFCCPreprocessor.cs
│ ├── AudioToMelSpectrogramPreprocessor.cs
│ ├── FFT.cs
│ ├── FeatureNormalize.cs
│ ├── HTKMelBands.cs
│ ├── IAudioFeatureBuffer.cs
│ ├── IAudioPreprocessor.cs
│ ├── IFeaturizer.cs
│ ├── MFCC.cs
│ ├── MFCCNorm.cs
│ ├── MelBands.cs
│ ├── MelNorm.cs
│ ├── MelScale.cs
│ ├── SlaneyMelBands.cs
│ ├── Window.cs
│ └── WindowFunction.cs
├── FrameVAD.cs
├── Models
│ ├── ASRModel.cs
│ ├── CharTokenizer.cs
│ ├── EncDecCTCConfig.cs
│ ├── EncDecCTCModel.cs
│ ├── EncDecClassificationConfig.cs
│ ├── EncDecClassificationModel.cs
│ ├── Model.cs
│ ├── ModelConfig.cs
│ ├── SpectrogramGenerator.cs
│ ├── SpectrogramGeneratorConfig.cs
│ ├── Vocoder.cs
│ └── VocoderConfig.cs
├── NeMoOnnxSharp.csproj
├── SpeechConfig.cs
├── SpeechRecognitionEventArgs.cs
├── SpeechRecognizer.cs
├── SpeechSynthesisResult.cs
├── SpeechSynthesizer.cs
├── TTSTokenizers
│ ├── BaseCharsTokenizerr.cs
│ ├── BaseTokenizer.cs
│ ├── EnglishG2p.cs
│ ├── EnglishPhonemesTokenizer.cs
│ ├── GermanCharsTokenizer.cs
│ └── TokenizerUtils.cs
└── WaveFile.cs
├── Python
├── .flake8
├── convert_librispeech.py
├── export_models.py
└── make_test.py
├── README.md
└── test_data
├── .gitignore
├── 61-70968-0000.wav
├── 61-70968-0001.wav
├── 61-70968-0002.wav
├── 61-70968-0003.wav
├── 61-70968-0004.wav
├── 61-70968-0005.wav
├── 61-70968-0006.wav
├── 61-70968-0007.wav
├── 61-70968-0008.wav
├── 61-70968-0009.wav
├── 61-70968-0010.wav
├── 61-70968-0011.wav
├── 61-70968-0012.wav
├── 61-70968-0013.wav
├── 61-70968-0014.wav
├── 61-70968-0015.wav
├── 61-70968-0016.wav
├── 61-70968-0017.wav
├── 61-70968-0018.wav
├── 61-70968-0019.wav
├── 61-70968-0020.wav
├── 61-70968-0021.wav
├── 61-70968-0022.wav
├── 61-70968-0023.wav
├── 61-70968-0024.wav
├── 61-70968-0025.wav
├── 61-70968-0026.wav
├── 61-70968-0027.wav
├── 61-70968-0028.wav
├── 61-70968-0029.wav
├── 61-70968-0030.wav
├── 61-70968-0031.wav
├── 61-70968-0032.wav
├── 61-70968-0033.wav
├── 61-70968-0034.wav
├── 61-70968-0035.wav
├── 61-70968-0036.wav
├── 61-70968-0037.wav
├── 61-70968-0038.wav
├── 61-70968-0039.wav
├── 61-70968-0040.wav
├── 61-70968-0041.wav
├── 61-70968-0042.wav
├── 61-70968-0043.wav
├── 61-70968-0044.wav
├── 61-70968-0045.wav
├── 61-70968-0046.wav
├── 61-70968-0047.wav
├── 61-70968-0048.wav
├── 61-70968-0049.wav
├── 61-70968-0050.wav
├── 61-70968-0051.wav
├── 61-70968-0052.wav
├── 61-70968-0053.wav
├── 61-70968-0054.wav
├── 61-70968-0055.wav
├── 61-70968-0056.wav
├── 61-70968-0057.wav
├── 61-70968-0058.wav
├── 61-70968-0059.wav
├── 61-70968-0060.wav
├── 61-70968-0061.wav
├── 61-70968-0062.wav
├── README.md
├── SpeechCommands_demo.wav
├── samples_thorsten-21.06-emotional_neutral.wav
└── transcript.txt
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.onnx filter=lfs diff=lfs merge=lfs -text
2 | *.wav filter=lfs diff=lfs merge=lfs -text
3 |
--------------------------------------------------------------------------------
/.github/workflows/build-validation.yml:
--------------------------------------------------------------------------------
# Build validation: restores, builds and tests the solution whenever C# sources
# or project files change on main (direct pushes and pull requests).
name: build

on:
  push:
    branches: [ main ]
    paths:
    - '**.cs'
    - '**.csproj'
  pull_request:
    branches: [ main ]
    paths:
    - '**.cs'
    - '**.csproj'

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    # *.wav and *.onnx are tracked with Git LFS (see .gitattributes), so the
    # checkout has to fetch the LFS objects as well.
    - uses: actions/checkout@v3
      with:
        lfs: true
    - name: Checkout LFS objects
      run: git lfs checkout
    - name: Setup .NET
      uses: actions/setup-dotnet@v3
      with:
        dotnet-version: 6.0.x
    - uses: actions/cache@v3
      with:
        path: ~/.nuget/packages
        # Look to see if there is a cache hit for the corresponding requirements file
        key: ${{ runner.os }}-nuget-${{ hashFiles('**/packages.lock.json') }}
        restore-keys: |
          ${{ runner.os }}-nuget
    - name: Restore dependencies
      run: dotnet restore
    - name: Build
      run: dotnet build --configuration Release --no-restore
    - name: Test
      # Test the Release binaries produced by the Build step instead of
      # rebuilding everything in the default Debug configuration.
      # --no-build implies --no-restore.
      run: dotnet test --configuration Release --no-build --verbosity normal
43 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 | ##
4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
5 |
6 | # User-specific files
7 | *.rsuser
8 | *.suo
9 | *.user
10 | *.userosscache
11 | *.sln.docstates
12 |
13 | # User-specific files (MonoDevelop/Xamarin Studio)
14 | *.userprefs
15 |
16 | # Mono auto generated files
17 | mono_crash.*
18 |
19 | # Build results
20 | [Dd]ebug/
21 | [Dd]ebugPublic/
22 | [Rr]elease/
23 | [Rr]eleases/
24 | x64/
25 | x86/
26 | [Aa][Rr][Mm]/
27 | [Aa][Rr][Mm]64/
28 | bld/
29 | [Bb]in/
30 | [Oo]bj/
31 | [Ll]og/
32 | [Ll]ogs/
33 |
34 | # Visual Studio 2015/2017 cache/options directory
35 | .vs/
36 | # Uncomment if you have tasks that create the project's static files in wwwroot
37 | #wwwroot/
38 |
39 | # Visual Studio 2017 auto generated files
40 | Generated\ Files/
41 |
42 | # MSTest test Results
43 | [Tt]est[Rr]esult*/
44 | [Bb]uild[Ll]og.*
45 |
46 | # NUnit
47 | *.VisualState.xml
48 | TestResult.xml
49 | nunit-*.xml
50 |
51 | # Build Results of an ATL Project
52 | [Dd]ebugPS/
53 | [Rr]eleasePS/
54 | dlldata.c
55 |
56 | # Benchmark Results
57 | BenchmarkDotNet.Artifacts/
58 |
59 | # .NET Core
60 | project.lock.json
61 | project.fragment.lock.json
62 | artifacts/
63 |
64 | # StyleCop
65 | StyleCopReport.xml
66 |
67 | # Files built by Visual Studio
68 | *_i.c
69 | *_p.c
70 | *_h.h
71 | *.ilk
72 | *.meta
73 | *.obj
74 | *.iobj
75 | *.pch
76 | *.pdb
77 | *.ipdb
78 | *.pgc
79 | *.pgd
80 | *.rsp
81 | *.sbr
82 | *.tlb
83 | *.tli
84 | *.tlh
85 | *.tmp
86 | *.tmp_proj
87 | *_wpftmp.csproj
88 | *.log
89 | *.vspscc
90 | *.vssscc
91 | .builds
92 | *.pidb
93 | *.svclog
94 | *.scc
95 |
96 | # Chutzpah Test files
97 | _Chutzpah*
98 |
99 | # Visual C++ cache files
100 | ipch/
101 | *.aps
102 | *.ncb
103 | *.opendb
104 | *.opensdf
105 | *.sdf
106 | *.cachefile
107 | *.VC.db
108 | *.VC.VC.opendb
109 |
110 | # Visual Studio profiler
111 | *.psess
112 | *.vsp
113 | *.vspx
114 | *.sap
115 |
116 | # Visual Studio Trace Files
117 | *.e2e
118 |
119 | # TFS 2012 Local Workspace
120 | $tf/
121 |
122 | # Guidance Automation Toolkit
123 | *.gpState
124 |
125 | # ReSharper is a .NET coding add-in
126 | _ReSharper*/
127 | *.[Rr]e[Ss]harper
128 | *.DotSettings.user
129 |
130 | # TeamCity is a build add-in
131 | _TeamCity*
132 |
133 | # DotCover is a Code Coverage Tool
134 | *.dotCover
135 |
136 | # AxoCover is a Code Coverage Tool
137 | .axoCover/*
138 | !.axoCover/settings.json
139 |
140 | # Visual Studio code coverage results
141 | *.coverage
142 | *.coveragexml
143 |
144 | # NCrunch
145 | _NCrunch_*
146 | .*crunch*.local.xml
147 | nCrunchTemp_*
148 |
149 | # MightyMoose
150 | *.mm.*
151 | AutoTest.Net/
152 |
153 | # Web workbench (sass)
154 | .sass-cache/
155 |
156 | # Installshield output folder
157 | [Ee]xpress/
158 |
159 | # DocProject is a documentation generator add-in
160 | DocProject/buildhelp/
161 | DocProject/Help/*.HxT
162 | DocProject/Help/*.HxC
163 | DocProject/Help/*.hhc
164 | DocProject/Help/*.hhk
165 | DocProject/Help/*.hhp
166 | DocProject/Help/Html2
167 | DocProject/Help/html
168 |
169 | # Click-Once directory
170 | publish/
171 |
172 | # Publish Web Output
173 | *.[Pp]ublish.xml
174 | *.azurePubxml
175 | # Note: Comment the next line if you want to checkin your web deploy settings,
176 | # but database connection strings (with potential passwords) will be unencrypted
177 | *.pubxml
178 | *.publishproj
179 |
180 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
181 | # checkin your Azure Web App publish settings, but sensitive information contained
182 | # in these scripts will be unencrypted
183 | PublishScripts/
184 |
185 | # NuGet Packages
186 | *.nupkg
187 | # NuGet Symbol Packages
188 | *.snupkg
189 | # The packages folder can be ignored because of Package Restore
190 | **/[Pp]ackages/*
191 | # except build/, which is used as an MSBuild target.
192 | !**/[Pp]ackages/build/
193 | # Uncomment if necessary however generally it will be regenerated when needed
194 | #!**/[Pp]ackages/repositories.config
195 | # NuGet v3's project.json files produces more ignorable files
196 | *.nuget.props
197 | *.nuget.targets
198 |
199 | # Microsoft Azure Build Output
200 | csx/
201 | *.build.csdef
202 |
203 | # Microsoft Azure Emulator
204 | ecf/
205 | rcf/
206 |
207 | # Windows Store app package directories and files
208 | AppPackages/
209 | BundleArtifacts/
210 | Package.StoreAssociation.xml
211 | _pkginfo.txt
212 | *.appx
213 | *.appxbundle
214 | *.appxupload
215 |
216 | # Visual Studio cache files
217 | # files ending in .cache can be ignored
218 | *.[Cc]ache
219 | # but keep track of directories ending in .cache
220 | !?*.[Cc]ache/
221 |
222 | # Others
223 | ClientBin/
224 | ~$*
225 | *~
226 | *.dbmdl
227 | *.dbproj.schemaview
228 | *.jfm
229 | *.pfx
230 | *.publishsettings
231 | orleans.codegen.cs
232 |
233 | # Including strong name files can present a security risk
234 | # (https://github.com/github/gitignore/pull/2483#issue-259490424)
235 | #*.snk
236 |
237 | # Since there are multiple workflows, uncomment next line to ignore bower_components
238 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
239 | #bower_components/
240 |
241 | # RIA/Silverlight projects
242 | Generated_Code/
243 |
244 | # Backup & report files from converting an old project file
245 | # to a newer Visual Studio version. Backup files are not needed,
246 | # because we have git ;-)
247 | _UpgradeReport_Files/
248 | Backup*/
249 | UpgradeLog*.XML
250 | UpgradeLog*.htm
251 | ServiceFabricBackup/
252 | *.rptproj.bak
253 |
254 | # SQL Server files
255 | *.mdf
256 | *.ldf
257 | *.ndf
258 |
259 | # Business Intelligence projects
260 | *.rdl.data
261 | *.bim.layout
262 | *.bim_*.settings
263 | *.rptproj.rsuser
264 | *- [Bb]ackup.rdl
265 | *- [Bb]ackup ([0-9]).rdl
266 | *- [Bb]ackup ([0-9][0-9]).rdl
267 |
268 | # Microsoft Fakes
269 | FakesAssemblies/
270 |
271 | # GhostDoc plugin setting file
272 | *.GhostDoc.xml
273 |
274 | # Node.js Tools for Visual Studio
275 | .ntvs_analysis.dat
276 | node_modules/
277 |
278 | # Visual Studio 6 build log
279 | *.plg
280 |
281 | # Visual Studio 6 workspace options file
282 | *.opt
283 |
284 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
285 | *.vbw
286 |
287 | # Visual Studio LightSwitch build output
288 | **/*.HTMLClient/GeneratedArtifacts
289 | **/*.DesktopClient/GeneratedArtifacts
290 | **/*.DesktopClient/ModelManifest.xml
291 | **/*.Server/GeneratedArtifacts
292 | **/*.Server/ModelManifest.xml
293 | _Pvt_Extensions
294 |
295 | # Paket dependency manager
296 | .paket/paket.exe
297 | paket-files/
298 |
299 | # FAKE - F# Make
300 | .fake/
301 |
302 | # CodeRush personal settings
303 | .cr/personal
304 |
305 | # Python Tools for Visual Studio (PTVS)
306 | __pycache__/
307 | *.pyc
308 |
309 | # Cake - Uncomment if you are using it
310 | # tools/**
311 | # !tools/packages.config
312 |
313 | # Tabs Studio
314 | *.tss
315 |
316 | # Telerik's JustMock configuration file
317 | *.jmconfig
318 |
319 | # BizTalk build output
320 | *.btp.cs
321 | *.btm.cs
322 | *.odx.cs
323 | *.xsd.cs
324 |
325 | # OpenCover UI analysis results
326 | OpenCover/
327 |
328 | # Azure Stream Analytics local run output
329 | ASALocalRun/
330 |
331 | # MSBuild Binary and Structured Log
332 | *.binlog
333 |
334 | # NVidia Nsight GPU debugger configuration file
335 | *.nvuser
336 |
337 | # MFractors (Xamarin productivity tool) working folder
338 | .mfractor/
339 |
340 | # Local History for Visual Studio
341 | .localhistory/
342 |
343 | # BeatPulse healthcheck temp database
344 | healthchecksdb
345 |
346 | # Backup folder for Package Reference Convert tool in Visual Studio 2017
347 | MigrationBackup/
348 |
349 | # Ionide (cross platform F# VS Code tools) working folder
350 | .ionide/
351 |
352 | # ONNX runtime
353 | *.onnx
354 |
--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": "0.2.0",
3 | "configurations": [
4 | {
5 | // Use IntelliSense to find out which attributes exist for C# debugging
6 | // Use hover for the description of the existing attributes
7 | // For further information visit https://github.com/dotnet/vscode-csharp/blob/main/debugger-launchjson.md
8 | "name": ".NET Core Launch (console)",
9 | "type": "coreclr",
10 | "request": "launch",
11 | "preLaunchTask": "build",
12 | // If you have changed target frameworks, make sure to update the program path.
13 | "program": "${workspaceFolder}/NeMoOnnxSharp.Example/bin/Debug/net7.0/NeMoOnnxSharp.Example.dll",
14 | "args": [],
15 | "cwd": "${workspaceFolder}/NeMoOnnxSharp.Example",
16 | // For more information about the 'console' field, see https://aka.ms/VSCode-CS-LaunchJson-Console
17 | "console": "internalConsole",
18 | "stopAtEntry": false
19 | },
20 | {
21 | "name": ".NET Core Attach",
22 | "type": "coreclr",
23 | "request": "attach"
24 | }
25 | ]
26 | }
--------------------------------------------------------------------------------
/.vscode/tasks.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": "2.0.0",
3 | "tasks": [
4 | {
5 | "label": "build",
6 | "command": "dotnet",
7 | "type": "process",
8 | "args": [
9 | "build",
10 | "${workspaceFolder}/NeMoOnnxSharp.sln",
11 | "/property:GenerateFullPaths=true",
12 | "/consoleloggerparameters:NoSummary"
13 | ],
14 | "problemMatcher": "$msCompile"
15 | },
16 | {
17 | "label": "publish",
18 | "command": "dotnet",
19 | "type": "process",
20 | "args": [
21 | "publish",
22 | "${workspaceFolder}/NeMoOnnxSharp.sln",
23 | "/property:GenerateFullPaths=true",
24 | "/consoleloggerparameters:NoSummary"
25 | ],
26 | "problemMatcher": "$msCompile"
27 | },
28 | {
29 | "label": "watch",
30 | "command": "dotnet",
31 | "type": "process",
32 | "args": [
33 | "watch",
34 | "run",
35 | "--project",
36 | "${workspaceFolder}/NeMoOnnxSharp.sln"
37 | ],
38 | "problemMatcher": "$msCompile"
39 | }
40 | ]
41 | }
--------------------------------------------------------------------------------
/Examples/Godot/README.md:
--------------------------------------------------------------------------------
1 | # NeMoOnnxGodot
2 |
3 | Moved to
4 | [Neural Speech Engine with NVIDIA NeMo and ONNX Runtime](https://godotengine.org/asset-library/asset/2298)
5 | of Godot Asset Library
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2022 Katsuya Iida
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp.Example/ModelDownloader.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.IO;
7 | using System.Net.Http;
8 | using System.Security.Cryptography;
9 | using System.Text;
10 | using System.Threading;
11 | using System.Threading.Tasks;
12 |
13 | namespace NeMoOnnxSharp.Example
14 | {
/// <summary>
/// Downloads files over HTTP into a local path and reuses an already
/// downloaded file when its SHA-256 checksum matches the expected one.
/// </summary>
internal sealed class ModelDownloader : IDisposable
{
    private readonly HttpClient _httpClient;

    public ModelDownloader()
    {
        _httpClient = new HttpClient();
    }

    /// <summary>Disposes the underlying <see cref="HttpClient"/>.</summary>
    public void Dispose()
    {
        _httpClient.Dispose();
    }

    /// <summary>
    /// Computes the SHA-256 hash of a file.
    /// </summary>
    /// <param name="path">Path of the file to hash.</param>
    /// <returns>The hash as a lower-case hexadecimal string.</returns>
    private static string GetFileChecksum(string path)
    {
        using SHA256 sha256 = SHA256.Create();
        using var stream = File.OpenRead(path);
        byte[] hashValue = sha256.ComputeHash(stream);
        // Convert.ToHexString yields upper-case digits; keep the lower-case
        // form this method has always produced.
        return Convert.ToHexString(hashValue).ToLowerInvariant();
    }

    /// <summary>
    /// Checks whether a file exists and has the expected SHA-256 checksum.
    /// A file whose checksum does not match is deleted.
    /// </summary>
    /// <param name="cacheFilePath">Path of the file to verify.</param>
    /// <param name="expectedChecksum">Expected SHA-256 as a hexadecimal string (case-insensitive).</param>
    /// <returns><c>true</c> if the file exists and matches; otherwise <c>false</c>.</returns>
    private static bool CheckCacheFile(string cacheFilePath, string expectedChecksum)
    {
        if (!File.Exists(cacheFilePath))
        {
            return false;
        }

        string checksum = GetFileChecksum(cacheFilePath);
        // Hex digests are plain ASCII, so compare ordinally rather than with
        // the current culture's rules.
        if (string.Equals(checksum, expectedChecksum, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }

        // The file does not match: remove it so it cannot be mistaken for a valid copy.
        File.Delete(cacheFilePath);
        return false;
    }

    /// <summary>
    /// Writes the current download progress to the console, overwriting the
    /// current console line.
    /// </summary>
    /// <param name="progress">Number of bytes received so far.</param>
    /// <param name="total">Total number of bytes, if the server reported it.</param>
    private static void ShowProgress(long progress, long? total)
    {
        if (total.HasValue)
        {
            Console.Write("\rDownloading... [{0}/{1} bytes]", progress, total);
        }
        else
        {
            Console.Write("\rDownloading... [{0} bytes]", progress);
        }
    }

    /// <summary>
    /// Ensures that <paramref name="filePath"/> holds the content identified by
    /// <paramref name="sha256"/>, downloading it from <paramref name="url"/>
    /// unless a matching file is already present.
    /// </summary>
    /// <param name="filePath">Local path of the file.</param>
    /// <param name="url">URL to download the file from.</param>
    /// <param name="sha256">Expected SHA-256 of the file as a hexadecimal string.</param>
    /// <param name="cancellationToken">Token that cancels the download.</param>
    /// <exception cref="InvalidDataException">
    /// The downloaded file does not have the expected checksum.
    /// </exception>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="cancellationToken"/> was cancelled before or during the download.
    /// </exception>
    public async Task MayDownloadAsync(
        string filePath, string url, string sha256,
        CancellationToken cancellationToken = default)
    {
        if (CheckCacheFile(filePath, sha256))
        {
            Console.WriteLine("Using cached `{0}'.", url);
            return;
        }

        // Do not start any network activity for an already cancelled request.
        cancellationToken.ThrowIfCancellationRequested();

        // The token must be forwarded, otherwise the download cannot be cancelled.
        await DownloadAsync(url, filePath, cancellationToken);

        if (!CheckCacheFile(filePath, sha256))
        {
            // CheckCacheFile has already deleted the mismatching file.
            throw new InvalidDataException(
                $"SHA-256 checksum mismatch for the file downloaded from `{url}'.");
        }
    }

    /// <summary>
    /// Streams the response body of <paramref name="url"/> into
    /// <paramref name="path"/>, reporting progress on the console at most
    /// about once per second.
    /// </summary>
    /// <param name="url">URL to download.</param>
    /// <param name="path">Destination file; an existing file is overwritten.</param>
    /// <param name="cancellationToken">Token that cancels the download.</param>
    private async Task DownloadAsync(
        string url, string path,
        CancellationToken cancellationToken = default)
    {
        // ResponseHeadersRead lets the body be streamed instead of buffered in memory.
        using (var response = await _httpClient.GetAsync(url, HttpCompletionOption.ResponseHeadersRead, cancellationToken))
        {
            response.EnsureSuccessStatusCode();
            long currentPosition = 0;
            long? contentLength = response.Content.Headers.ContentLength;
            using (var reader = await response.Content.ReadAsStreamAsync(cancellationToken))
            // File.Create truncates an existing file, so no stale bytes from a
            // previous, longer file can survive past the end of the new content.
            using (var writer = File.Create(path))
            {
                var lastDateTime = DateTime.UtcNow;
                byte[] buffer = new byte[4096];
                int bytesRead;
                while ((bytesRead = await reader.ReadAsync(buffer, 0, buffer.Length, cancellationToken)) != 0)
                {
                    await writer.WriteAsync(buffer, 0, bytesRead, cancellationToken);
                    currentPosition += bytesRead;
                    var currentDateTime = DateTime.UtcNow;
                    // TotalSeconds is the whole elapsed time; Seconds is only the
                    // 0-59 component and wraps around every minute.
                    if ((currentDateTime - lastDateTime).TotalSeconds >= 1)
                    {
                        lastDateTime = currentDateTime;
                        ShowProgress(currentPosition, contentLength);
                    }
                }
            }
        }
        // Finish the line that ShowProgress keeps rewriting.
        Console.WriteLine();
    }
}
120 | }
121 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp.Example/NeMoOnnxSharp.Example.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Exe
5 | net7.0
6 | enable
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp.Example/PretrainedModelInfo.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 |
6 | namespace NeMoOnnxSharp.Example
7 | {
8 | internal class PretrainedModelInfo
9 | {
10 | private static PretrainedModelInfo[]? _modelList = null;
11 |
/// <summary>
/// Gets the list of pretrained models. The list is created by
/// <c>CreateModelList</c> on first access and the same array is returned
/// on every later access.
/// </summary>
public static PretrainedModelInfo[] ModelList => _modelList ??= CreateModelList();
23 |
24 | private static PretrainedModelInfo[] CreateModelList()
25 | {
26 | return new PretrainedModelInfo[]
27 | {
28 | new PretrainedModelInfo(
29 | "stt_en_quartznet15x5",
30 | "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.1/stt_en_quartznet15x5.onnx",
31 | "dde27f0528e92c05f7bc220a9be4a7bb99927da0a3a25db8f2f861e3559da90d"
32 | ),
33 | new PretrainedModelInfo(
34 | "QuartzNet15x5Base-En",
35 | "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.1/QuartzNet15x5Base-En.onnx",
36 | "ee1b72102fd0c5422d088e80f929dbdee7e889d256a4ce1e412cd49916823695"
37 | ),
38 | new PretrainedModelInfo(
39 | "vad_marblenet",
40 | "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.1/vad_marblenet.onnx",
41 | "edaf8a7bb62e4335f97aa70d1a447ccbd3942b58b870e08a20c0408a0fb106e0"
42 | ),
43 | new PretrainedModelInfo(
44 | "commandrecognition_en_matchboxnet3x1x64_v2",
45 | "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.1/commandrecognition_en_matchboxnet3x1x64_v2.onnx",
46 | "a0c5e4d14e83d3b6afdaf239265a390c2ca513bcdedf3d295bc1f9f97f19868a"
47 | ),
48 | new PretrainedModelInfo(
49 | "cmudict-0.7b_nv22.10",
50 | "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.2/cmudict-0.7b_nv22.10",
51 | "d330f3a3554d4c7ff8ef7bfc0c338ed74831d5f54109508fb829bdd82173608b"
52 | ),
53 | new PretrainedModelInfo(
54 | "heteronyms-052722",
55 | "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.2/heteronyms-052722",
56 | "b701909aedf753172eff223950f8859cd4b9b4c80199cf0a6e9ac4a307c8f8ec"
57 | ),
58 | new PretrainedModelInfo(
59 | "stt_de_quartznet15x5",
60 | "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.3/stt_de_quartznet15x5.onnx",
61 | "c6499961539c349117c4c724ba5f333d26b3242d2d39571fde44c3baa66d55fc"
62 | ),
63 | new PretrainedModelInfo(
64 | "tts_de_fastpitch_singleSpeaker_thorstenNeutral_2210",
65 | "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.3/tts_de_fastpitch_singleSpeaker_thorstenNeutral_2210.onnx",
66 | "35d351dcb5113a3af2eecc5051b42b747623328168a57b36b311f3396d5c1c74"
67 | ),
68 | new PretrainedModelInfo(
69 | "tts_de_hifigan_singleSpeaker_thorstenNeutral_2210",
70 | "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.3/tts_de_hifigan_singleSpeaker_thorstenNeutral_2210.onnx",
71 | "6be4e33bcc7e34b111d34be79157922802b224c2c4f1cc93dd62a5c19d936ade"
72 | ),
73 | new PretrainedModelInfo(
74 | "tts_en_fastpitch",
75 | "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.2/tts_en_fastpitch.onnx",
76 | "a297174dea1084bd34d1af1a8447bc07f6c8aab7a4fea312c610eba6bc3d0eac"
77 | ),
78 | new PretrainedModelInfo(
79 | "tts_en_hifigan",
80 | "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.2/tts_en_hifigan.onnx",
81 | "54501000b9de86b724931478b5bb8911e1b6ca6e293f68e9e10f60351f1949a3"
82 | )
83 | };
84 | }
85 |
86 | public static PretrainedModelInfo Get(string pretrainedModelName)
87 | {
88 | foreach (var info in ModelList)
89 | {
90 | if (pretrainedModelName == info.PretrainedModelName)
91 | {
92 | return info;
93 | }
94 | }
95 |
96 | throw new IndexOutOfRangeException();
97 | }
98 |
99 | public string PretrainedModelName { get; private set; }
100 | public string Location { get; private set; }
101 | public string Hash { get; private set; }
102 |
103 | public PretrainedModelInfo(
104 | string pretrainedModelName,
105 | string location,
106 | string hash)
107 | {
108 | PretrainedModelName = pretrainedModelName;
109 | Location = location;
110 | Hash = hash;
111 | }
112 | }
113 | }
--------------------------------------------------------------------------------
/NeMoOnnxSharp.Tests/AudioFeatureBufferTest.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.VisualStudio.TestTools.UnitTesting;
2 | using NeMoOnnxSharp.AudioPreprocessing;
3 | using NuGet.Frameworks;
4 | using System;
5 | using System.Diagnostics;
6 | using System.IO;
7 | using System.Reflection;
8 | using System.Runtime.InteropServices;
9 | using System.Security.Cryptography;
10 |
11 | namespace NeMoOnnxSharp.Tests
12 | {
/// <summary>
/// Tests the streaming behaviour of AudioFeatureBuffer with a 400-sample MFCC
/// window, a 160-sample hop and 64 output values per frame.
/// </summary>
[TestClass]
public class AudioFeatureBufferTest
{
    // Fixed seed so that a failing run of Test2 can be reproduced exactly.
    private const int RandomSeed = 12345;

    // 16-bit PCM samples in, float features out.
    private AudioFeatureBuffer<short, float>? _buffer;

    [TestInitialize]
    public void Initialize()
    {
        int sampleRate = 16000;
        var transform = new MFCC(
            sampleRate: sampleRate,
            window: WindowFunction.Hann,
            winLength: 400,
            nFFT: 512,
            nMels: 64,
            nMFCC: 64,
            fMin: 0.0,
            fMax: 0.0,
            logMels: true,
            melScale: MelScale.HTK,
            melNorm: MelNorm.None);
        _buffer = new AudioFeatureBuffer<short, float>(
            transform,
            hopLength: 160);
    }

    /// <summary>
    /// Feeds fixed-size chunks and checks how many output values become available:
    /// nothing before a full window, then 64 values per additional hop.
    /// </summary>
    [TestMethod]
    public void Test1()
    {
        Assert.IsNotNull(_buffer);
        int written;
        Assert.AreEqual(0, _buffer.OutputCount);

        // One sample short of a window: no frame yet.
        written = _buffer.Write(new short[399]);
        Assert.AreEqual(399, written);
        Assert.AreEqual(0, _buffer.OutputCount);

        // The 400th sample completes the first window.
        written = _buffer.Write(new short[1]);
        Assert.AreEqual(1, written);
        Assert.AreEqual(64, _buffer.OutputCount);
        _buffer.ConsumeOutput(64);
        Assert.AreEqual(0, _buffer.OutputCount);

        // Each further hop of 160 samples yields one more frame.
        written = _buffer.Write(new short[160 * 3]);
        Assert.AreEqual(160 * 3, written);
        Assert.AreEqual(64 * 3, _buffer.OutputCount);
        written = _buffer.Write(new short[480]);
        Assert.AreEqual(480, written);
        Assert.AreEqual(64 * 6, _buffer.OutputCount);
    }

    /// <summary>
    /// Feeds 1000 chunks of pseudo-random size and checks after every write that
    /// the total number of output values matches the total number of samples written.
    /// </summary>
    [TestMethod]
    public void Test2()
    {
        Assert.IsNotNull(_buffer);
        int totalWritten = 0;
        int totalOutput = 0;
        var rng = new Random(RandomSeed);
        for (int i = 0; i < 1000; i++)
        {
            int n = rng.Next(1024);
            int written = _buffer.Write(new short[n]);
            // Output is always a whole number of 64-value frames.
            Assert.AreEqual(0, _buffer.OutputCount % 64);
            totalWritten += written;
            totalOutput += _buffer.OutputCount;
            if (totalWritten < 400)
            {
                Assert.AreEqual(0, totalOutput);
            }
            else
            {
                // One frame for the first window plus one per completed hop.
                int m = (totalWritten - 400) / 160 + 1;
                Assert.AreEqual(m * 64, totalOutput);
            }
            _buffer.ConsumeOutput(_buffer.OutputCount);
        }
    }
}
88 | }
--------------------------------------------------------------------------------
/NeMoOnnxSharp.Tests/Data/61-70968-0000-mod.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:69766a33e70720c6187892ba96ba948e9c4f0daaa1a946a6de6741ff76b7e2bb
3 | size 216446
4 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp.Tests/Data/61-70968-0000.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3f53c11bcec66e60659c3e53015f4f914d79b04eba0770347e644a4776fbe633
3 | size 157004
4 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp.Tests/Data/cmudict-test:
--------------------------------------------------------------------------------
1 | # Comment
2 | YOU'VE Y UW1 V
3 | READ R EH1 D
4 | READ(1) R IY1 D
5 | BOOK B UH1 K
6 | THE DH AH0
7 | THE(1) DH AH1
8 | THE(2) DH IY0
9 | OPERATING AA1 P ER0 EY2 T IH0 NG
10 | OPERATING(1) AO1 P ER0 EY2 T IH0 NG
11 | SYSTEM S IH1 S T AH0 M
12 | DESIGN D IH0 Z AY1 N
13 | AND AH0 N D
14 | AND(1) AE1 N D
15 | IMPLEMENTATION IH2 M P L AH0 M EH0 N T EY1 SH AH0 N
16 | THIRD TH ER1 D
17 | EDITION AH0 D IH1 SH AH0 N
18 | EDITION(1) IH0 D IH1 SH AH0 N
19 | DID D IH1 D
20 | DID(1) D IH0 D
21 | YOU Y UW1
22 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp.Tests/Data/heteronyms-test:
--------------------------------------------------------------------------------
1 | read
--------------------------------------------------------------------------------
/NeMoOnnxSharp.Tests/Data/mel_spectrogram.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kaiidams/NeMoOnnxSharp/ad2ffe375e525bb63c59c9b1cd5154afe70351a0/NeMoOnnxSharp.Tests/Data/mel_spectrogram.bin
--------------------------------------------------------------------------------
/NeMoOnnxSharp.Tests/Data/mfcc.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kaiidams/NeMoOnnxSharp/ad2ffe375e525bb63c59c9b1cd5154afe70351a0/NeMoOnnxSharp.Tests/Data/mfcc.bin
--------------------------------------------------------------------------------
/NeMoOnnxSharp.Tests/FFTTest.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.VisualStudio.TestTools.UnitTesting;
2 | using NeMoOnnxSharp.AudioPreprocessing;
3 | using System;
4 | using System.IO;
5 | using System.Runtime.InteropServices;
6 |
7 | namespace NeMoOnnxSharp.Tests
8 | {
/// <summary>
/// Compares FFT.CFFT against a direct O(N^2) discrete Fourier transform.
/// </summary>
[TestClass]
public class FFTTest
{
    // Fixed seed so that a failing input can be regenerated.
    private const int RandomSeed = 12345;

    /// <summary>
    /// Reference transform. Computes the DFT of the real sequence <paramref name="xr"/>
    /// and overwrites <paramref name="xr"/>/<paramref name="xi"/> with the real and
    /// imaginary parts of the result. The incoming contents of <paramref name="xi"/>
    /// are not read.
    /// </summary>
    private static void CFFTRef(double[] xr, double[] xi, int N)
    {
        double[] yr = new double[N];
        double[] yi = new double[N];
        for (int i = 0; i < N; i++)
        {
            double vr = 0.0;
            double vi = 0.0;
            for (int k = 0; k < N; k++)
            {
                double angle = -2 * Math.PI * k * i / N;
                vr += Math.Cos(angle) * xr[k];
                vi += Math.Sin(angle) * xr[k];
            }
            yr[i] = vr;
            yi[i] = vi;
        }
        Array.Copy(yr, xr, N);
        Array.Copy(yi, xi, N);
    }

    /// <summary>
    /// Mean squared error between two arrays of equal length.
    /// </summary>
    /// <exception cref="ArgumentException">The arrays differ in length.</exception>
    private static double MSE(double[] a, double[] b)
    {
        if (a.Length != b.Length) throw new ArgumentException();
        double err = 0.0;
        for (int i = 0; i < a.Length; i++)
        {
            double diff = a[i] - b[i];
            err += diff * diff;
        }
        return err / a.Length;
    }

    /// <summary>
    /// Runs ten pseudo-random inputs for each power-of-two size from 256 to 2048.
    /// </summary>
    [TestMethod]
    public void TestCFFT()
    {
        var rng = new Random(RandomSeed);
        for (int N = 256; N <= 2048; N *= 2)
        {
            var xr0 = new double[N];
            var xi0 = new double[N];
            var xr1 = new double[N];
            var xi1 = new double[N];
            for (int i = 0; i < 10; i++)
            {
                for (int j = 0; j < N; j++)
                {
                    xr0[j] = rng.NextDouble();
                    xi0[j] = rng.NextDouble();
                    xr1[j] = xr0[j];
                    xi1[j] = rng.NextDouble();
                }
                CFFTRef(xr0, xi0, N);
                FFT.CFFT(xr1, xi1, N);
                // NOTE(review): the reference real part is compared with the second
                // array passed to FFT.CFFT and the reference imaginary part with the
                // first. Kept exactly as written; confirm against FFT.CFFT's output
                // convention.
                double error = MSE(xr0, xi1);
                Assert.IsTrue(error < 1e-20, $"N={N}, run {i}: MSE(xr0, xi1)={error}");
                error = MSE(xi0, xr1);
                Assert.IsTrue(error < 1e-20, $"N={N}, run {i}: MSE(xi0, xr1)={error}");
            }
        }
    }
}
77 | }
--------------------------------------------------------------------------------
/NeMoOnnxSharp.Tests/NeMoOnnxSharp.Tests.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | net7.0
5 | enable
6 | enable
7 |
8 | false
9 | true
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 | PreserveNewest
24 |
25 |
26 | PreserveNewest
27 |
28 |
29 | PreserveNewest
30 |
31 |
32 | PreserveNewest
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 | PreserveNewest
50 |
51 |
52 | PreserveNewest
53 |
54 |
55 | Never
56 |
57 |
58 |
59 |
60 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp.Tests/PreprocessorTest.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.VisualStudio.TestTools.UnitTesting;
2 | using NeMoOnnxSharp.AudioPreprocessing;
3 | using System;
4 | using System.Diagnostics;
5 | using System.IO;
6 | using System.Reflection;
7 | using System.Runtime.InteropServices;
8 |
9 | namespace NeMoOnnxSharp.Tests
10 | {
/// <summary>
/// Checks the audio preprocessors against reference feature dumps stored in the
/// test "Data" directory.
/// </summary>
[TestClass]
public class PreprocessorTest
{
    private const int SampleRate = 16000;
    private const string SampleWAVSpeechFile = "61-70968-0000.wav";

    // Samples of the reference WAV file, loaded before each test.
    private short[]? audioSignal;

    /// <summary>
    /// Reads a binary file from the "Data" directory and reinterprets its bytes
    /// as an array of 32-bit floats.
    /// </summary>
    private static float[] ReadData(string file)
    {
        string appDirPath = AppDomain.CurrentDomain.BaseDirectory;
        string path = Path.Combine(appDirPath, "Data", file);
        var bytes = File.ReadAllBytes(path);
        // The target element type cannot be inferred, so both type arguments are required.
        return MemoryMarshal.Cast<byte, float>(bytes).ToArray();
    }

    /// <summary>
    /// Asserts that <paramref name="x"/> is within <paramref name="threshold"/>
    /// mean squared error of the reference data file named by <paramref name="path"/>.
    /// </summary>
    private static void AssertMSE(string path, float[] x, double threshold = 1e-3)
    {
        var truth = ReadData(path);
        double mse = MSE(truth, x);
        Console.WriteLine("MSE: {0}", mse);
        Assert.IsTrue(mse < threshold);
    }

    /// <summary>
    /// Mean squared error between two arrays of equal length.
    /// </summary>
    /// <exception cref="ArgumentException">The arrays differ in length.</exception>
    private static double MSE(float[] a, float[] b)
    {
        if (a.Length != b.Length) throw new ArgumentException();
        double err = 0.0;
        for (int i = 0; i < a.Length; i++)
        {
            double diff = a[i] - b[i];
            err += diff * diff;
        }
        return err / a.Length;
    }

    [TestInitialize]
    public void Initialize()
    {
        string appDirPath = AppDomain.CurrentDomain.BaseDirectory;
        string waveFile = Path.Combine(appDirPath, "Data", SampleWAVSpeechFile);
        audioSignal = WaveFile.ReadWAV(waveFile, SampleRate);
    }

    [TestMethod]
    public void TestMelSpectrogram()
    {
        // A null array would silently convert to an empty span; fail loudly instead.
        Assert.IsNotNull(audioSignal);
        var preprocessor = new AudioToMelSpectrogramPreprocessor(
            sampleRate: 16000,
            window: WindowFunction.Hann,
            windowSize: 0.02,
            windowStride: 0.01,
            nFFT: 512,
            features: 64);
        var x = preprocessor.GetFeatures(audioSignal);
        // NeMo pads the result to a multiple of 16 time steps.
        var y = new float[((x.Length / 64 + 15) / 16) * 16 * 64];
        Array.Copy(x, y, x.Length);
        AssertMSE("mel_spectrogram.bin", y);
    }

    [TestMethod]
    public void TestMFCC()
    {
        Assert.IsNotNull(audioSignal);
        var preprocessor = new AudioToMFCCPreprocessor(
            sampleRate: 16000,
            windowSize: 0.025,
            windowStride: 0.01,
            //preNormalize: 0.8,
            window: WindowFunction.Hann,
            nMels: 64,
            nMFCC: 64,
            nFFT: 512);
        var processedSignal = preprocessor.GetFeatures(audioSignal);
        AssertMSE("mfcc.bin", processedSignal, threshold: 1e-2);
    }
}
89 | }
--------------------------------------------------------------------------------
/NeMoOnnxSharp.Tests/TextTokenizersTest.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.VisualStudio.TestTools.UnitTesting;
2 | using NeMoOnnxSharp.TTSTokenizers;
3 | using System;
4 | using System.Diagnostics;
5 | using System.IO;
6 |
7 | namespace NeMoOnnxSharp.Tests
8 | {
/// <summary>
/// Tests the English grapheme-to-phoneme converter and the English and German
/// TTS tokenizers against fixed expected vocabularies and encodings.
/// </summary>
[TestClass]
public class TextTokenizersTest
{
    private EnglishG2p? _g2p;
    private BaseTokenizer? _tokenizer;

    // NOTE(review): the last three entries are empty strings here. They look like
    // special tokens whose names were lost -- confirm against the tokenizer.
    private readonly static string[] ExpectedTokens =
    {
        " ", "B", "CH", "D", "DH", "F", "G", "HH", "JH", "K", "L", "M",
        "N", "NG", "P", "R", "S", "SH", "T", "TH", "V", "W", "Y", "Z", "ZH",
        "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0",
        "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "EH0", "EH1",
        "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "IH0", "IH1", "IH2",
        "IY0", "IY1", "IY2", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "UH0",
        "UH1", "UH2", "UW0", "UW1", "UW2", "a", "b", "c", "d", "e", "f", "g",
        "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u",
        "v", "w", "x", "y", "z", "'", ",", ".", "!", "?", "-", ":", ";", "/",
        "\"", "(", ")", "[", "]", "{", "}", "", "", ""
    };

    private const string SampleText =
        "You've read the book “Operating Systems Design and Implementation, 3rd edition”. Did you?";
    private const string NormalizedSampleText =
        "You've read the book “Operating Systems Design and Implementation, third edition”. Did you?";
    private const string SamplePronText =
        "Y|UW1|V| |r|e|a|d| |t|h|e| |B|UH1|K| |“|o|p|e|r|a|t|i|n|g| |"
        + "S|IH1|S|T|AH0|M|Z| |D|IH0|Z|AY1|N| |a|n|d| |IH2|M|P|L|AH0|"
        + "M|EH0|N|T|EY1|SH|AH0|N|,| |TH|ER1|D| |e|d|i|t|i|o|n|”|.| |"
        + "d|i|d| |Y|UW1|?";

    // Expected token ids for NormalizedSampleText, in output order.
    private readonly static int[] SampleParsed =
    {
        0, 22, 68, 20, 0, 87, 74, 70, 73, 0, 89, 77, 74,
        0, 1, 65, 9, 0, 105, 84, 85, 74, 87, 70, 89, 78,
        83, 76, 0, 16, 53, 16, 18, 31, 11, 23, 0, 3, 52,
        23, 41, 12, 0, 70, 83, 73, 0, 54, 11, 14, 10, 31,
        11, 43, 12, 18, 50, 17, 31, 12, 97, 0, 19, 47, 3,
        0, 74, 73, 78, 89, 78, 84, 83, 105, 98, 0, 73, 78,
        73, 0, 22, 68, 100, 0
    };

    private static readonly char[] ExpectedGermanTokens =
    {
        ' ',
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
        'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
        'U', 'V', 'W', 'X', 'Y', 'Z', 'Ä', 'Ö', 'Ü', 'ẞ',
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
        'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
        'u', 'v', 'w', 'x', 'y', 'z', 'ä', 'ö', 'ü', 'ß',
        '\'',
        '!', '"', '(', ')', ',', '-', '.', '/', ':', ';',
        '?', '[', ']', '{', '}', '«', '»', '‒', '–', '—',
        '‘', '‚', '“', '„', '‹', '›'
    };

    private const string GermanText = "Mist, wieder nichts geschafft.";

    // Expected token ids for GermanText, in output order.
    private readonly static int[] GermanParsed =
    {
        0, 13, 39, 49, 50, 66, 0, 53, 39, 35, 34, 35, 48, 0, 44, 39, 33, 38,
        50, 49, 0, 37, 35, 49, 33, 38, 31, 36, 36, 50, 68, 0
    };

    /// <summary>
    /// Builds the English G2P from the small test dictionaries in "Data" and an
    /// English phoneme tokenizer on top of it.
    /// </summary>
    [TestInitialize]
    public void Initialize()
    {
        string appDirPath = AppDomain.CurrentDomain.BaseDirectory;
        _g2p = new EnglishG2p(
            phonemeDict: Path.Combine(appDirPath, "Data", "cmudict-test"),
            heteronyms: Path.Combine(appDirPath, "Data", "heteronyms-test"),
            phonemeProbability: 1.0);
        _tokenizer = new EnglishPhonemesTokenizer(
            _g2p,
            punct: true,
            stresses: true,
            chars: true,
            apostrophe: true,
            padWithSpace: true,
            addBlankAt: BaseTokenizer.AddBlankAt.True);
    }

    [TestMethod]
    public void TestTokenizerVocab()
    {
        Assert.IsNotNull(_tokenizer);
        CollectionAssert.AreEquivalent(ExpectedTokens, _tokenizer.Tokens);
    }

    [TestMethod]
    public void TestEnglishG2p()
    {
        Assert.IsNotNull(_g2p);
        var pron = string.Join("|", _g2p.Parse(NormalizedSampleText));
        Assert.AreEqual(SamplePronText, pron);
    }

    [TestMethod]
    public void TestEnglishEncode()
    {
        Assert.IsNotNull(_tokenizer);
        var parsed = _tokenizer.Encode(NormalizedSampleText);
        // An encoding is a sequence: the order of the ids must match, not just
        // the multiset of ids.
        CollectionAssert.AreEqual(SampleParsed, parsed);
    }

    [TestMethod]
    public void TestGermanVocab()
    {
        _tokenizer = new GermanCharsTokenizer(padWithSpace: true);
        var expectedTokens = ExpectedGermanTokens.Select(c => c.ToString()).ToList();
        // NOTE(review): two empty-string tokens are appended here. They look like
        // special tokens whose names were lost -- confirm against the tokenizer.
        expectedTokens.Add("");
        expectedTokens.Add("");
        CollectionAssert.AreEquivalent(expectedTokens, _tokenizer.Tokens);

        var parsed = _tokenizer.Encode(GermanText);
        // Order-sensitive for the same reason as in TestEnglishEncode.
        CollectionAssert.AreEqual(GermanParsed, parsed);
    }
}
127 | }
--------------------------------------------------------------------------------
/NeMoOnnxSharp.Tests/WaveFileTest.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.VisualStudio.TestTools.UnitTesting;
2 | using System;
3 | using System.Diagnostics;
4 | using System.IO;
5 | using System.Reflection;
6 | using System.Runtime.InteropServices;
7 |
8 | namespace NeMoOnnxSharp.Tests
9 | {
/// <summary>
/// Tests reading and writing WAV data through WaveFile.
/// </summary>
[TestClass]
public class WaveFileTest
{
    private const int SampleRate = 16000;
    private const string SampleWAVSpeech1File = "61-70968-0000.wav";
    private const int SampleWAVSpeech1Length = 78480;
    private const string SampleWAVSpeech2File = "61-70968-0000-mod.wav";
    private const int SampleWAVSpeech2Length = 78480 / 2;
    private const string TempFile = "temp.wav";

    /// <summary>
    /// Reads the first sample file, writes it back out and checks that reading
    /// the written file yields the same samples.
    /// </summary>
    [TestMethod]
    public void Test1()
    {
        string appDirPath = AppDomain.CurrentDomain.BaseDirectory;
        string waveFile = Path.Combine(appDirPath, "Data", SampleWAVSpeech1File);
        var waveform = WaveFile.ReadWAV(waveFile, SampleRate);
        Assert.AreEqual(SampleWAVSpeech1Length, waveform.Length);

        try
        {
            WaveFile.WriteWAV(TempFile, waveform, SampleRate);
            var waveform2 = WaveFile.ReadWAV(TempFile, SampleRate);
            Assert.IsTrue(IsArraysEqual(waveform, waveform2));
        }
        finally
        {
            // Do not leave the scratch file behind, whether the test passed or not.
            File.Delete(TempFile);
        }
    }

    /// <summary>
    /// Reads the second sample file and checks the number of samples returned and
    /// the size of the in-memory WAV produced from them (2 bytes per sample plus
    /// 44 more bytes).
    /// </summary>
    [TestMethod]
    public void Test2()
    {
        string appDirPath = AppDomain.CurrentDomain.BaseDirectory;
        string waveFile = Path.Combine(appDirPath, "Data", SampleWAVSpeech2File);
        var waveform = WaveFile.ReadWAV(waveFile, SampleRate);
        Assert.AreEqual(SampleWAVSpeech2Length, waveform.Length);

        byte[] bytes = WaveFile.GetWAVBytes(waveform, SampleRate);
        Assert.AreEqual(SampleWAVSpeech2Length * 2 + 44, bytes.Length);
    }

    /// <summary>
    /// Returns true when both arrays have the same length and equal elements at
    /// every index.
    /// </summary>
    private static bool IsArraysEqual<T>(T[] x, T[] y) where T : struct
    {
        if (x.Length != y.Length) return false;
        for (int i = 0; i < x.Length; i++)
        {
            if (!x[i].Equals(y[i])) return false;
        }
        return true;
    }
}
55 | }
--------------------------------------------------------------------------------
/NeMoOnnxSharp.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 17
4 | VisualStudioVersion = 17.0.32014.148
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "NeMoOnnxSharp.Example", "NeMoOnnxSharp.Example\NeMoOnnxSharp.Example.csproj", "{D583F4A1-65A9-4BD2-91D5-8A24E0B325E0}"
7 | EndProject
8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "NeMoOnnxSharp", "NeMoOnnxSharp\NeMoOnnxSharp.csproj", "{69A674F7-593C-48C4-A5C7-5BCBC205E281}"
9 | EndProject
10 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "NeMoOnnxSharp.Tests", "NeMoOnnxSharp.Tests\NeMoOnnxSharp.Tests.csproj", "{4D0C8A9F-0574-4645-A0C2-51393982ACC8}"
11 | EndProject
12 | Global
13 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
14 | Debug|Any CPU = Debug|Any CPU
15 | Release|Any CPU = Release|Any CPU
16 | EndGlobalSection
17 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
18 | {D583F4A1-65A9-4BD2-91D5-8A24E0B325E0}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
19 | {D583F4A1-65A9-4BD2-91D5-8A24E0B325E0}.Debug|Any CPU.Build.0 = Debug|Any CPU
20 | {D583F4A1-65A9-4BD2-91D5-8A24E0B325E0}.Release|Any CPU.ActiveCfg = Release|Any CPU
21 | {D583F4A1-65A9-4BD2-91D5-8A24E0B325E0}.Release|Any CPU.Build.0 = Release|Any CPU
22 | {69A674F7-593C-48C4-A5C7-5BCBC205E281}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
23 | {69A674F7-593C-48C4-A5C7-5BCBC205E281}.Debug|Any CPU.Build.0 = Debug|Any CPU
24 | {69A674F7-593C-48C4-A5C7-5BCBC205E281}.Release|Any CPU.ActiveCfg = Release|Any CPU
25 | {69A674F7-593C-48C4-A5C7-5BCBC205E281}.Release|Any CPU.Build.0 = Release|Any CPU
26 | {4D0C8A9F-0574-4645-A0C2-51393982ACC8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
27 | {4D0C8A9F-0574-4645-A0C2-51393982ACC8}.Debug|Any CPU.Build.0 = Debug|Any CPU
28 | {4D0C8A9F-0574-4645-A0C2-51393982ACC8}.Release|Any CPU.ActiveCfg = Release|Any CPU
29 | {4D0C8A9F-0574-4645-A0C2-51393982ACC8}.Release|Any CPU.Build.0 = Release|Any CPU
30 | EndGlobalSection
31 | GlobalSection(SolutionProperties) = preSolution
32 | HideSolutionNode = FALSE
33 | EndGlobalSection
34 | GlobalSection(ExtensibilityGlobals) = postSolution
35 | SolutionGuid = {E1B7E2B0-48B8-4C5A-9DEE-02037FFE0EA9}
36 | EndGlobalSection
37 | EndGlobal
38 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/AudioFeatureBuffer.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.IO;
7 | using System.Linq;
8 | using System.Text;
9 |
10 | namespace NeMoOnnxSharp.AudioPreprocessing
11 | {
/// <summary>
/// Streaming adapter around a featurizer. Samples are written in arbitrary-sized
/// chunks; every time a full window of <see cref="WinLength"/> samples is
/// available, one frame of <see cref="NumOutputChannels"/> values is appended to
/// <see cref="OutputBuffer"/>, and the window then advances by
/// <see cref="HopLength"/> samples.
/// </summary>
/// <typeparam name="T1">Input sample type.</typeparam>
/// <typeparam name="T2">Output feature type.</typeparam>
public class AudioFeatureBuffer<T1, T2> : IAudioFeatureBuffer<T1, T2>
{
    private readonly IFeaturizer<T1, T2> _transform;
    private readonly int _numInputChannels;
    private readonly int _numOutputChannels;
    private readonly int _hopLength;
    private readonly int _winLength;
    // Samples carried over between Write calls. Between calls it always holds
    // fewer than _winLength samples.
    private readonly T1[] _inputBuffer;
    private int _inputCount;
    // Produced frames, stored back to back; _outputCount counts values, not frames.
    private readonly T2[] _outputBuffer;
    private int _outputCount;

    public int NumInputChannels => _numInputChannels;
    public int NumOutputChannels => _numOutputChannels;
    public int HopLength => _hopLength;
    public int WinLength => _winLength;
    /// <summary>Number of valid values at the start of <see cref="OutputBuffer"/>.</summary>
    public int OutputCount => _outputCount;
    public T2[] OutputBuffer => _outputBuffer;
    public int OutputPosition => _outputCount / _numOutputChannels * _hopLength + _inputCount;

    /// <summary>
    /// Creates a buffer for the given featurizer.
    /// </summary>
    /// <param name="transform">Featurizer; its InputLength is the window length
    /// and its OutputLength the number of values per frame.</param>
    /// <param name="hopLength">Samples between the starts of consecutive windows.</param>
    /// <param name="numOutputFrames">Capacity of the output buffer in frames.</param>
    /// <exception cref="ArgumentNullException"><paramref name="transform"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="hopLength"/> is not positive.</exception>
    public AudioFeatureBuffer(
        IFeaturizer<T1, T2> transform,
        int hopLength,
        int numOutputFrames = 100)
    {
        if (transform == null)
        {
            throw new ArgumentNullException(nameof(transform));
        }
        if (hopLength <= 0)
        {
            // hopLength is used as a divisor below and in Write.
            throw new ArgumentOutOfRangeException(nameof(hopLength), "hopLength must be positive.");
        }

        _transform = transform;
        _hopLength = hopLength;
        _winLength = transform.InputLength;
        _numInputChannels = 1;
        _numOutputChannels = transform.OutputLength;
        // Large enough for the carried-over samples plus everything Write adds
        // before the buffered windows are flushed.
        _inputBuffer = new T1[_winLength / _hopLength * _hopLength + _winLength];
        _inputCount = 0;
        _outputBuffer = new T2[_numOutputChannels * numOutputFrames];
        _outputCount = 0;
    }

    /// <summary>
    /// Writes <paramref name="count"/> samples of <paramref name="input"/> starting
    /// at <paramref name="offset"/>. See <see cref="Write(Span{T1})"/>.
    /// </summary>
    public int Write(T1[] input, int offset, int count)
    {
        return Write(input.AsSpan(offset, count));
    }

    /// <summary>
    /// Consumes samples from <paramref name="input"/> and appends every frame that
    /// becomes complete to the output buffer.
    /// </summary>
    /// <returns>
    /// The number of samples consumed. This is less than the input length when the
    /// output buffer has no room for another frame; call
    /// <see cref="ConsumeOutput"/> and write the remaining samples again.
    /// </returns>
    public int Write(Span<T1> input)
    {
        int written = 0;

        if (_inputCount > 0)
        {
            // Here _inputCount < _winLength. Top the internal buffer up so that
            // every window starting inside the carried-over samples is complete,
            // e.g. with a window of 400 and a hop of 160:
            //   0 < _inputCount <= 160   -> needed = _winLength - _inputCount
            // 160 < _inputCount <= 320   -> needed = _hopLength + _winLength - _inputCount
            // 320 < _inputCount <  400   -> needed = 2 * _hopLength + _winLength - _inputCount
            int needed = (_inputCount - 1) / _hopLength * _hopLength + _winLength - _inputCount;
            written = Math.Min(needed, input.Length);

            // Never take more samples than the output buffer has room to turn
            // into frames: a total of (_winLength + f * _hopLength - 1) buffered
            // samples yields at most f frames.
            long maxWritable = Math.Max(
                0L,
                _winLength + (long)FreeOutputFrames() * _hopLength - 1 - _inputCount);
            if (written > maxWritable)
            {
                written = (int)maxWritable;
            }

            input.Slice(0, written).CopyTo(_inputBuffer.AsSpan(_inputCount, written));
            _inputCount += written;

            int inputBufferOffset = 0;
            while (inputBufferOffset + _winLength <= _inputCount)
            {
                _transform.GetFeatures(
                    _inputBuffer.AsSpan(inputBufferOffset, _numInputChannels * _winLength),
                    _outputBuffer.AsSpan(_outputCount, _numOutputChannels));
                _outputCount += _numOutputChannels;
                inputBufferOffset += _hopLength;
            }

            if (written < needed)
            {
                // Input exhausted (or output full): keep the unprocessed remainder
                // at the front of the internal buffer for the next call.
                Array.Copy(_inputBuffer, inputBufferOffset, _inputBuffer, 0, _inputCount - inputBufferOffset);
                _inputCount -= inputBufferOffset;
                return written;
            }

            // All buffered windows are flushed. The next window starts inside
            // `input`, (_winLength - _hopLength) samples before the position
            // reached so far, so step back by that overlap.
            _inputCount = 0;
            written -= _winLength - _hopLength;
        }

        // Process windows that lie entirely inside `input`, without copying.
        while (written + _winLength <= input.Length)
        {
            if (_outputCount + _numOutputChannels > _outputBuffer.Length)
            {
                // No room for another frame; the caller resumes from `written`.
                return written;
            }
            _transform.GetFeatures(
                input.Slice(written, _numInputChannels * _winLength),
                _outputBuffer.AsSpan(_outputCount, _numOutputChannels));
            _outputCount += _numOutputChannels;
            written += _hopLength;
        }

        // Fewer than _winLength samples remain: carry them over to the next call.
        input.Slice(written).CopyTo(_inputBuffer);
        _inputCount = input.Length - written;
        written = input.Length;
        return written;
    }

    /// <summary>
    /// Removes the first <paramref name="count"/> values from the output buffer
    /// and moves the remaining values to the front.
    /// </summary>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="count"/> is negative or greater than <see cref="OutputCount"/>.
    /// </exception>
    public void ConsumeOutput(int count)
    {
        if (count < 0 || count > _outputCount)
        {
            throw new ArgumentOutOfRangeException(nameof(count));
        }
        Array.Copy(_outputBuffer, count, _outputBuffer, 0, _outputCount - count);
        _outputCount -= count;
    }

    // Number of whole frames that still fit into the output buffer.
    private int FreeOutputFrames()
    {
        if (_numOutputChannels <= 0)
        {
            // Zero-length frames never fill the buffer.
            return int.MaxValue;
        }
        return (_outputBuffer.Length - _outputCount) / _numOutputChannels;
    }
}
115 | }
116 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/AudioToMFCCPreprocessor.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 |
6 | namespace NeMoOnnxSharp.AudioPreprocessing
7 | {
/// <summary>
/// Converts a whole 16-bit audio signal into MFCC features, one frame every
/// <c>_nWindowStride</c> samples, by applying an MFCC featurizer to windows of
/// <c>_nWindowSize</c> samples.
/// </summary>
public class AudioToMFCCPreprocessor : IAudioPreprocessor<short, float>
{
    private readonly bool _center;
    protected readonly int _nWindowSize;
    protected readonly int _nWindowStride;
    private readonly double _preNormalize;
    private readonly IFeaturizer<short, float> _featurizer;

    public int SampleRate => _featurizer.SampleRate;

    /// <summary>
    /// Creates the preprocessor.
    /// </summary>
    /// <param name="sampleRate">Sample rate of the input signal in Hz.</param>
    /// <param name="windowSize">Window length in seconds; used when <paramref name="nWindowSize"/> is null.</param>
    /// <param name="windowStride">Window stride in seconds; used when <paramref name="nWindowStride"/> is null.</param>
    /// <param name="nWindowSize">Window length in samples; overrides <paramref name="windowSize"/>.</param>
    /// <param name="nWindowStride">Window stride in samples; overrides <paramref name="windowStride"/>.</param>
    /// <param name="window">Window function passed to the MFCC featurizer.</param>
    /// <param name="nFFT">FFT size; defaults to the smallest power of two not below the window length.</param>
    /// <param name="preNormalize">Stored, but currently not applied by <see cref="GetFeatures"/>.</param>
    /// <param name="center">Selects the formula used for the number of output frames.</param>
    /// <param name="lowFreq">Passed to the MFCC featurizer as fMin.</param>
    /// <param name="highFreq">Passed to the MFCC featurizer as fMax.</param>
    /// <param name="nMels">Passed to the MFCC featurizer.</param>
    /// <param name="nMFCC">Passed to the MFCC featurizer.</param>
    /// <param name="dctType">Passed to the MFCC featurizer.</param>
    /// <param name="norm">Passed to the MFCC featurizer as mfccNorm.</param>
    /// <param name="log">Passed to the MFCC featurizer as logMels.</param>
    public AudioToMFCCPreprocessor(
        int sampleRate = 16000,
        double windowSize = 0.02,
        double windowStride = 0.01,
        int? nWindowSize = null,
        int? nWindowStride = null,
        WindowFunction window = WindowFunction.Hann,
        int? nFFT = null,
        double preNormalize = 0.0,
        bool center = true,
        double lowFreq = 0.0,
        double? highFreq = null,
        int nMels = 64,
        int nMFCC = 64,
        int dctType = 2,
        MFCCNorm norm = MFCCNorm.Ortho,
        bool log = true)
    {
        _preNormalize = preNormalize;
        _center = center;
        _nWindowSize = nWindowSize ?? (int)(windowSize * sampleRate);
        _nWindowStride = nWindowStride ?? (int)(windowStride * sampleRate);
        int fftSize = nFFT ?? (int)Math.Pow(2, Math.Ceiling(Math.Log(_nWindowSize, 2)));
        _featurizer = new MFCC(
            sampleRate: sampleRate,
            window: window,
            winLength: _nWindowSize,
            nFFT: fftSize,
            fMin: lowFreq,
            fMax: highFreq,
            nMels: nMels,
            nMFCC: nMFCC,
            dctType: dctType,
            mfccNorm: norm,
            logMels: log);
    }

    /// <summary>
    /// Computes the features of every frame of <paramref name="input"/>.
    /// </summary>
    /// <returns>
    /// The frames stored back to back, each of the featurizer's OutputLength values.
    /// </returns>
    public float[] GetFeatures(Span<short> input)
    {
        // NOTE(review): the scale factor from GetScaleFactor was computed here but
        // never used, so preNormalize has no effect on the output. The dead
        // computation was removed; wire it into the featurizer if pre-normalization
        // is meant to be supported.
        int outputLength = GetOutputLength(input.Length);
        int outputStep = _featurizer.OutputLength;
        float[] output = new float[outputStep * outputLength];

        // One scratch window for frames that overlap the edges of the signal.
        // It is allocated once: stackalloc inside the loop would grow the stack
        // on every edge frame.
        Span<short> padded = stackalloc short[_nWindowSize];

        // The first window is centred on sample 0, so it starts half a window early.
        int inputOffset = -(_nWindowSize / 2);
        for (int outputOffset = 0; outputOffset < output.Length; outputOffset += outputStep)
        {
            Span<float> frame = output.AsSpan(outputOffset, outputStep);
            if (inputOffset >= 0 && inputOffset + _nWindowSize <= input.Length)
            {
                // Window lies entirely inside the signal: use it in place.
                _featurizer.GetFeatures(input.Slice(inputOffset, _nWindowSize), frame);
            }
            else
            {
                // Window sticks out of the signal: copy the part that exists and
                // leave the rest zero.
                padded.Clear();
                int start = Math.Max(inputOffset, 0);
                int end = Math.Min(inputOffset + _nWindowSize, input.Length);
                if (end > start)
                {
                    input.Slice(start, end - start).CopyTo(padded.Slice(start - inputOffset));
                }
                _featurizer.GetFeatures(padded, frame);
            }
            inputOffset += _nWindowStride;
        }
        return output;
    }

    /// <summary>
    /// Number of frames produced for a signal of <paramref name="inputLength"/> samples.
    /// </summary>
    private int GetOutputLength(int inputLength)
    {
        if (_center)
        {
            // ceil(inputLength / stride)
            return (inputLength + _nWindowStride - 1) / _nWindowStride;
        }
        else
        {
            // NOTE(review): this subtracts the stride, not the window size, and
            // GetFeatures starts half a window early regardless of _center.
            // Confirm the intended non-centred behaviour.
            return (inputLength - _nWindowStride) / _nWindowStride + 1;
        }
    }

    /// <summary>
    /// Factor that would scale the samples: preNormalize divided by the peak
    /// absolute sample when preNormalize is positive, otherwise 1 / short.MaxValue.
    /// </summary>
    private double GetScaleFactor(Span<short> input)
    {
        double scale;
        if (_preNormalize > 0)
        {
            scale = _preNormalize / MaxAbsValue(input);
        }
        else
        {
            scale = 1.0 / short.MaxValue;
        }

        return scale;
    }

    /// <summary>
    /// Largest absolute sample value, but never less than 1, so the result is
    /// always safe to divide by.
    /// </summary>
    private int MaxAbsValue(Span<short> input)
    {
        int maxValue = 1;
        for (int i = 0; i < input.Length; i++)
        {
            // Widened to int first so that negating short.MinValue cannot overflow.
            int value = input[i];
            if (value < 0) value = -value;
            if (maxValue < value) maxValue = value;
        }
        return maxValue;
    }
}
137 | }
138 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/AudioToMelSpectrogramPreprocessor.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 |
6 | namespace NeMoOnnxSharp.AudioPreprocessing
7 | {
8 | public class AudioToMelSpectrogramPreprocessor : IAudioPreprocessor
9 | {
10 | private enum FrameType
11 | {
12 | None,
13 | Preemph,
14 | Center,
15 | CenterPreemph
16 | }
17 |
18 | private const double FeatureStdOffset = 1e-5;
19 |
20 | private static FrameType GetFrameType(bool center, double preemph)
21 | {
22 | if (preemph == 0.0)
23 | {
24 | return center ? FrameType.Center : FrameType.None;
25 | }
26 | else
27 | {
28 | return center ? FrameType.CenterPreemph : FrameType.Preemph;
29 | }
30 | }
31 |
// Configuration captured by the constructor; none of it changes afterwards.
protected readonly int _sampleRate;
protected readonly double[] _window;          // window coefficients, one per sample of the analysis window
private readonly FrameType _frameType;        // which ReadFrame* variant to use
protected readonly int _nWindowStride;        // hop between consecutive frames, in samples
protected readonly FeatureNormalize _normalize;
private readonly double _preNormalize;        // > 0 enables peak normalization of the input
protected readonly double _preemph;
protected readonly double[] _melBands;        // features + 2 band edge frequencies
protected readonly int _nFFT;
protected readonly int _features;
private readonly MelNorm _melNorm;
private readonly int _magPower;
private readonly double _logZeroGuardValue;
private readonly bool _log;

public int SampleRate => _sampleRate;

/// <summary>
/// Creates a mel spectrogram preprocessor.
/// </summary>
/// <param name="sampleRate">Sample rate of the input waveform.</param>
/// <param name="windowSize">Window length; multiplied by <paramref name="sampleRate"/> to get samples. Ignored when <paramref name="nWindowSize"/> is given.</param>
/// <param name="windowStride">Frame hop; multiplied by <paramref name="sampleRate"/> to get samples. Ignored when <paramref name="nWindowStride"/> is given.</param>
/// <param name="nWindowSize">Window length in samples.</param>
/// <param name="nWindowStride">Frame hop in samples.</param>
/// <param name="window">Window function.</param>
/// <param name="normalize">Feature normalization; <c>None</c> and <c>PerFeature</c> are supported.</param>
/// <param name="preNormalize">When positive, the input is scaled so that its peak equals this value.</param>
/// <param name="nFFT">FFT size; defaults to the smallest power of two not below the window length.</param>
/// <param name="preemph">Pre-emphasis coefficient; 0.0 disables pre-emphasis.</param>
/// <param name="center">Whether frames are centered on their nominal position.</param>
/// <param name="features">Number of mel bands.</param>
/// <param name="lowFreq">Lowest band edge in Hz.</param>
/// <param name="highFreq">Highest band edge in Hz; defaults to half the sample rate.</param>
/// <param name="htk">Use the HTK mel scale instead of the Slaney one.</param>
/// <param name="melNorm">Mel filter normalization.</param>
/// <param name="log">Whether to take the logarithm of the mel energies.</param>
/// <param name="logZeroGuardValue">Offset added before the logarithm; defaults to 2^-24.</param>
/// <param name="magPower">Power applied to the spectrum magnitude (1 or 2).</param>
/// <exception cref="ArgumentException">The requested normalization is not supported.</exception>
public AudioToMelSpectrogramPreprocessor(
    int sampleRate = 16000,
    double windowSize = 0.02,
    double windowStride = 0.01,
    int? nWindowSize = null,
    int? nWindowStride = null,
    WindowFunction window = WindowFunction.Hann,
    FeatureNormalize normalize = FeatureNormalize.PerFeature,
    double preNormalize = 0.0,
    int? nFFT = null,
    double preemph = 0.97,
    bool center = true,
    int features = 64,
    double lowFreq = 0.0,
    double? highFreq = null,
    bool htk = false,
    MelNorm melNorm = MelNorm.Slaney,
    bool log = true,
    double? logZeroGuardValue = null,
    int magPower = 2)
{
    _sampleRate = sampleRate;
    _preNormalize = preNormalize;
    _preemph = preemph;
    _window = Window.MakeWindow(window, nWindowSize ?? (int)(windowSize * sampleRate));
    _frameType = GetFrameType(center, preemph);
    _nWindowStride = nWindowStride ?? (int)(windowStride * sampleRate);
    // GetFeatures skips normalization for None and normalizes per feature otherwise,
    // so those are the two modes that can be honoured.
    if (normalize != FeatureNormalize.None && normalize != FeatureNormalize.PerFeature)
    {
        throw new ArgumentException("Only FeatureNormalize.None and FeatureNormalize.PerFeature are supported");
    }
    _normalize = normalize;
    _melBands = MelBands.MakeMelBands(
        lowFreq,
        // Floating-point division keeps the exact Nyquist frequency for odd sample rates.
        highFreq ?? sampleRate / 2.0,
        features,
        htk ? MelScale.HTK : MelScale.Slaney);
    _melNorm = melNorm;
    _nFFT = nFFT ?? (int)Math.Pow(2, Math.Ceiling(Math.Log(_window.Length, 2)));
    _features = features;
    _magPower = magPower;
    _log = log;
    _logZeroGuardValue = logZeroGuardValue ?? Math.Pow(2, -24);
}
92 |
/// <summary>
/// Computes the mel spectrogram of a whole waveform.
/// </summary>
/// <param name="input">16-bit waveform samples.</param>
/// <returns>
/// A flat array holding one group of <c>_features</c> values per frame,
/// frames stored one after another.
/// </returns>
public float[] GetFeatures(Span<short> input)
{
    double scale = GetScaleFactor(input);
    int frameSize = _features;
    int frameCount = GetOutputLength(input);
    float[] features = new float[frameSize * frameCount];

    int samplePosition = 0;
    int writePosition = 0;
    while (writePosition < features.Length)
    {
        MelSpectrogramStep(input, samplePosition, scale, features.AsSpan(writePosition));
        writePosition += frameSize;
        samplePosition += _nWindowStride;
    }

    if (_normalize != FeatureNormalize.None)
    {
        NormalizeBatch(features, frameSize);
    }
    return features;
}
111 |
/// <summary>
/// Returns the number of frames that <c>GetFeatures</c> produces for <paramref name="input"/>.
/// </summary>
/// <param name="input">Waveform whose length determines the frame count.</param>
/// <returns>
/// For centered frames, the input length divided by the stride, rounded up.
/// Otherwise the number of whole windows that fit, which is 0 when the input
/// is shorter than one window.
/// </returns>
private int GetOutputLength(Span<short> input)
{
    bool centered = _frameType == FrameType.Center || _frameType == FrameType.CenterPreemph;
    if (centered)
    {
        return (input.Length + _nWindowStride - 1) / _nWindowStride;
    }

    // Integer division truncates toward zero, so without this guard an input just
    // shorter than the window would report one frame (and a much shorter one a
    // negative count), making the frame reader index past the end of the input.
    if (input.Length < _window.Length)
    {
        return 0;
    }
    return (input.Length - _window.Length) / _nWindowStride + 1;
}
123 |
/// <summary>
/// Returns the factor by which raw samples are multiplied before analysis.
/// </summary>
/// <remarks>
/// With pre-normalization enabled (<c>_preNormalize &gt; 0</c>) the factor maps the
/// input's peak magnitude to <c>_preNormalize</c>; otherwise it is
/// <c>1 / short.MaxValue</c>.
/// </remarks>
private double GetScaleFactor(Span<short> input)
{
    return _preNormalize > 0
        ? _preNormalize / MaxAbsValue(input)
        : 1.0 / short.MaxValue;
}
138 |
/// <summary>
/// Returns the largest absolute sample value found in <paramref name="input"/>,
/// but never less than 1.
/// </summary>
/// <param name="input">Samples to scan.</param>
/// <returns>The peak magnitude, or 1 when the input is empty or all zero.</returns>
private int MaxAbsValue(Span<short> input)
{
    // Start from 1 instead of 0: GetScaleFactor divides by this value.
    int peak = 1;
    foreach (short sample in input)
    {
        // Widen to int before taking the magnitude; |short.MinValue| does not fit in a short.
        int magnitude = Math.Abs((int)sample);
        if (magnitude > peak)
        {
            peak = magnitude;
        }
    }
    return peak;
}
150 |
/// <summary>
/// Computes the mel spectrogram of a single frame.
/// </summary>
/// <param name="input">Whole waveform.</param>
/// <param name="waveformOffset">Nominal position of the frame within <paramref name="input"/>.</param>
/// <param name="scale">Factor applied to every raw sample.</param>
/// <param name="output">Receives <c>_features</c> values.</param>
public void MelSpectrogramStep(
    Span<short> input, int waveformOffset,
    double scale, Span<float> output)
{
    // Two scratch buffers that swap roles from step to step.
    Span<double> first = stackalloc double[_nFFT];
    Span<double> second = stackalloc double[_nFFT];

    // first <- windowed, scaled frame
    ReadFrame(input, waveformOffset, scale, first);
    // CFFT leaves the real part in its second argument and the imaginary part in its first.
    FFT.CFFT(first, second, _nFFT);
    // second <- magnitude (or squared magnitude) of the spectrum
    ToMagnitude(second, first, _nFFT);
    // first <- mel energies
    MelBands.ToMelSpectrogram(
        second, _melBands, _sampleRate, _nFFT, _features, _melNorm, _log, _logZeroGuardValue, first);

    for (int band = 0; band < _features; band++)
    {
        output[band] = (float)first[band];
    }
}
164 |
/// <summary>
/// Fills <paramref name="frame"/> with one windowed frame of <paramref name="input"/>,
/// using the reader that matches the configured frame type.
/// </summary>
/// <exception cref="NotImplementedException">
/// Pre-emphasis without centering was requested.
/// </exception>
protected void ReadFrame(Span<short> input, int offset, double scale, Span<double> frame)
{
    if (_frameType == FrameType.None)
    {
        ReadFrameNone(input, offset, scale, frame);
    }
    else if (_frameType == FrameType.Preemph)
    {
        throw new NotImplementedException();
    }
    else if (_frameType == FrameType.Center)
    {
        ReadFrameCenter(input, offset, scale, frame);
    }
    else if (_frameType == FrameType.CenterPreemph)
    {
        ReadFrameCenterPreemphasis(input, offset, scale, frame);
    }
}
182 |
/// <summary>
/// Reads a frame that starts exactly at <paramref name="offset"/>: the windowed
/// samples go to the start of <paramref name="frame"/> and the rest is zeroed.
/// </summary>
private void ReadFrameNone(Span<short> input, int offset, double scale, Span<double> frame)
{
    int windowLength = _window.Length;
    for (int n = 0; n < windowLength; n++)
    {
        frame[n] = input[offset + n] * _window[n] * scale;
    }
    // Zero-pad up to the FFT size.
    for (int n = windowLength; n < frame.Length; n++)
    {
        frame[n] = 0.0;
    }
}
194 |
/// <summary>
/// Reads a frame whose window is centered on <paramref name="offset"/>. The window is
/// placed in the middle of <paramref name="frame"/>, and samples that fall outside
/// <paramref name="input"/> are treated as zero.
/// </summary>
private void ReadFrameCenter(Span<short> input, int offset, double scale, Span<double> frame)
{
    int halfWindow = _window.Length / 2;
    int leadingZeros = frame.Length / 2 - halfWindow;

    for (int n = 0; n < leadingZeros; n++)
    {
        frame[n] = 0;
    }

    int firstSample = offset - halfWindow;
    for (int n = 0; n < _window.Length; n++)
    {
        int sampleIndex = n + firstSample;
        double sample;
        if (sampleIndex >= 0 && sampleIndex < input.Length)
        {
            sample = input[sampleIndex];
        }
        else
        {
            sample = 0;
        }
        frame[n + leadingZeros] = scale * sample * _window[n];
    }

    for (int n = leadingZeros + _window.Length; n < frame.Length; n++)
    {
        frame[n] = 0;
    }
}
214 |
/// <summary>
/// Reads a centered frame and applies pre-emphasis: each sample has
/// <c>_preemph</c> times its predecessor subtracted before windowing.
/// Samples outside <paramref name="input"/> are treated as zero.
/// </summary>
private void ReadFrameCenterPreemphasis(Span<short> input, int offset, double scale, Span<double> frame)
{
    int halfWindow = (_window.Length - 1) / 2;
    int leadingZeros = (frame.Length - 1) / 2 - halfWindow;

    for (int n = 0; n < leadingZeros; n++)
    {
        frame[n] = 0;
    }

    int firstSample = offset - halfWindow;
    for (int n = 0; n < _window.Length; n++)
    {
        int current = n + firstSample;
        int previous = current - 1;

        double sample = 0;
        if (current >= 0 && current < input.Length)
        {
            sample = input[current];
        }
        if (previous >= 0 && previous < input.Length)
        {
            sample -= _preemph * input[previous];
        }
        frame[n + leadingZeros] = scale * sample * _window[n];
    }

    for (int n = leadingZeros + _window.Length; n < frame.Length; n++)
    {
        frame[n] = 0;
    }
}
236 |
/// <summary>
/// Replaces the first <paramref name="length"/> entries of <paramref name="xr"/> with
/// the magnitude of the complex values (<paramref name="xr"/>, <paramref name="xi"/>),
/// raised to the configured power.
/// </summary>
/// <exception cref="NotImplementedException">The configured power is neither 1 nor 2.</exception>
private void ToMagnitude(Span<double> xr, Span<double> xi, int length)
{
    switch (_magPower)
    {
        case 2:
            ToSquareMagnitude(xr, xi, length);
            break;
        case 1:
            ToAbsoluteMagnitude(xr, xi, length);
            break;
        default:
            throw new NotImplementedException("power must be 1 or 2.");
    }
}
252 |
/// <summary>
/// Overwrites <paramref name="xr"/>[i] with sqrt(xr[i]^2 + xi[i]^2) for the first
/// <paramref name="length"/> entries.
/// </summary>
private static void ToAbsoluteMagnitude(Span<double> xr, Span<double> xi, int length)
{
    for (int bin = 0; bin < length; bin++)
    {
        double re = xr[bin];
        double im = xi[bin];
        xr[bin] = Math.Sqrt(re * re + im * im);
    }
}
260 |
/// <summary>
/// Overwrites <paramref name="xr"/>[i] with xr[i]^2 + xi[i]^2 for the first
/// <paramref name="length"/> entries.
/// </summary>
private static void ToSquareMagnitude(Span<double> xr, Span<double> xi, int length)
{
    for (int bin = 0; bin < length; bin++)
    {
        double re = xr[bin];
        double im = xi[bin];
        xr[bin] = re * re + im * im;
    }
}
268 |
/// <summary>
/// Normalizes every feature in place to zero mean and unit variance across frames.
/// </summary>
/// <param name="output">Flat feature array, <paramref name="outputStep"/> values per frame.</param>
/// <param name="outputStep">Number of features per frame.</param>
private void NormalizeBatch(float[] output, int outputStep)
{
    int frameCount = output.Length / outputStep;
    for (int feature = 0; feature < outputStep; feature++)
    {
        // Pass 1: mean of this feature over all frames.
        double total = 0;
        for (int frame = 0; frame < frameCount; frame++)
        {
            double value = output[feature + outputStep * frame];
            total += value;
        }
        float mean = (float)(total / frameCount);

        // Pass 2: standard deviation around that mean.
        total = 0;
        for (int frame = 0; frame < frameCount; frame++)
        {
            double deviation = output[feature + outputStep * frame] - mean;
            total += deviation * deviation;
        }
        double std = Math.Sqrt(total / frameCount);
        // The offset keeps the reciprocal finite for a constant feature.
        float invStd = (float)(1.0 / (FeatureStdOffset + std));

        // Pass 3: rewrite the values.
        for (int frame = 0; frame < frameCount; frame++)
        {
            int index = feature + outputStep * frame;
            float value = output[index];
            output[index] = (value - mean) * invStd;
        }
    }
}
297 | }
298 | }
299 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/FFT.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.Text;
7 |
8 | namespace NeMoOnnxSharp.AudioPreprocessing
9 | {
/// <summary>
/// Radix-2 FFT and a direct DCT-II used by the audio featurizers.
/// </summary>
public static class FFT
{
    /// <summary>
    /// Computes the forward DFT of a real signal of length <paramref name="N"/>.
    /// </summary>
    /// <remarks>
    /// The two buffers swap roles: on entry <paramref name="xr"/> holds the real input
    /// samples (the previous content of <paramref name="xi"/> is ignored); on return the
    /// real part of the spectrum is in <paramref name="xi"/> and the imaginary part is in
    /// <paramref name="xr"/>.
    /// </remarks>
    /// <param name="xr">Input samples; receives the imaginary part of the result.</param>
    /// <param name="xi">Receives the real part of the result.</param>
    /// <param name="N">Transform size; must be a positive power of two.</param>
    /// <exception cref="ArgumentException"><paramref name="N"/> is not a positive power of two.</exception>
    public static void CFFT(Span<double> xr, Span<double> xi, int N)
    {
        Span<double> t = xi;
        xi = xr;
        xr = t;
        // From here on xr/xi are the real/imaginary working buffers.
        Swap(xr, xi, N);
        for (int n = 1; n < N; n *= 2)
        {
            for (int k = 0; k < n; k++)
            {
                // The twiddle factor depends only on k and n, so it is computed once
                // per (n, k) and reused for every butterfly group j.
                double ar = Math.Cos(-Math.PI * k / n);
                double ai = Math.Sin(-Math.PI * k / n);
                for (int j = 0; j < N; j += n * 2)
                {
                    int even = j + k;
                    int odd = even + n;
                    double er = xr[even];
                    double ei = xi[even];
                    double or = xr[odd];
                    double oi = xi[odd];
                    double aor = ar * or - ai * oi;
                    double aoi = ai * or + ar * oi;
                    xr[even] = er + aor;
                    xi[even] = ei + aoi;
                    xr[odd] = er - aor;
                    xi[odd] = ei - aoi;
                }
            }
        }
    }

    /// <summary>
    /// Computes the orthonormal <paramref name="N"/>-point DCT-II of
    /// <paramref name="xr"/>[0..N) into <paramref name="xi"/>[0..N).
    /// </summary>
    /// <param name="xr">Input values.</param>
    /// <param name="xi">Receives the coefficients.</param>
    /// <param name="N">Number of input values and of output coefficients.</param>
    public static void DCT2(Span<double> xr, Span<double> xi, int N)
    {
        // TODO Implement more efficiently.
        for (int i = 0; i < N; i++)
        {
            double s = 0;
            for (int j = 0; j < N; j++)
            {
                s += xr[j] * Math.Cos(Math.PI * (j + 0.5) * i / N);
            }
            // N / 2.0: integer division would give the wrong scale for odd N.
            xi[i] = i == 0 ? s / Math.Sqrt(N) : s / Math.Sqrt(N / 2.0);
        }
    }

    /// <summary>
    /// Copies <paramref name="xi"/> into <paramref name="xr"/> in bit-reversed order and
    /// then clears <paramref name="xi"/>, preparing both for the in-place butterflies.
    /// </summary>
    private static void Swap(Span<double> xr, Span<double> xi, int N)
    {
        if (N <= 0 || (N & (N - 1)) != 0)
        {
            throw new ArgumentException("N must be a positive power of two", nameof(N));
        }

        int bitCount = 0;
        while ((1 << bitCount) < N)
        {
            bitCount++;
        }

        for (int i = 0; i < N; i++)
        {
            xr[i] = xi[ReverseBits(i, bitCount)];
        }
        for (int i = 0; i < N; i++)
        {
            xi[i] = 0.0;
        }
    }

    /// <summary>
    /// Reverses the lowest <paramref name="bitCount"/> bits of <paramref name="value"/>.
    /// </summary>
    private static int ReverseBits(int value, int bitCount)
    {
        int reversed = 0;
        for (int b = 0; b < bitCount; b++)
        {
            reversed = (reversed << 1) | (value & 1);
            value >>= 1;
        }
        return reversed;
    }
}
154 | }
155 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/FeatureNormalize.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | namespace NeMoOnnxSharp.AudioPreprocessing
5 | {
/// <summary>
/// Selects how computed features are normalized.
/// </summary>
public enum FeatureNormalize
{
    /// <summary>No normalization.</summary>
    None = 0,
    /// <summary>Normalization applied separately to each feature.</summary>
    PerFeature = 1,
    /// <summary>Normalization over all features together.</summary>
    AllFeatures = 2
}
12 | }
13 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/HTKMelBands.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.Linq;
7 | using System.Text;
8 | using System.Threading.Tasks;
9 |
10 | namespace NeMoOnnxSharp.AudioPreprocessing
11 | {
/// <summary>
/// Mel band edges on the HTK mel scale, mel = 2595 * log10(1 + hz / 700).
/// </summary>
internal static class HTKMelBands
{
    /// <summary>
    /// Returns <paramref name="nMelBanks"/> + 2 frequencies in Hz, evenly spaced on the
    /// mel scale from <paramref name="melMinHz"/> to <paramref name="melMaxHz"/> inclusive.
    /// </summary>
    public static double[] MakeMelBands(double melMinHz, double melMaxHz, int nMelBanks)
    {
        double lowMel = HzToMel(melMinHz);
        double highMel = HzToMel(melMaxHz);
        int edgeCount = nMelBanks + 2;
        double[] edges = new double[edgeCount];
        for (int n = 0; n < edgeCount; n++)
        {
            edges[n] = MelToHz((highMel - lowMel) * n / (nMelBanks + 1) + lowMel);
        }
        return edges;
    }

    /// <summary>Converts a frequency in Hz to HTK mels.</summary>
    private static double HzToMel(double hz) => 2595 * Math.Log10(1 + hz / 700);

    /// <summary>Converts HTK mels back to a frequency in Hz.</summary>
    private static double MelToHz(double mel) => (Math.Pow(10, mel / 2595) - 1) * 700;
}
37 | }
38 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/IAudioFeatureBuffer.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.Linq;
7 | using System.Text;
8 |
9 | namespace NeMoOnnxSharp.AudioPreprocessing
10 | {
/// <summary>
/// A buffer that accepts waveform samples of type <typeparamref name="T"/> and exposes
/// computed features of type <typeparamref name="S"/>.
/// </summary>
/// <typeparam name="T">Input sample type.</typeparam>
/// <typeparam name="S">Output feature type.</typeparam>
public interface IAudioFeatureBuffer<T, S>
{
    int NumInputChannels { get; }

    int NumOutputChannels { get; }

    int HopLength { get; }

    int WinLength { get; }

    /// <summary>Number of values currently available in <see cref="OutputBuffer"/>.</summary>
    int OutputCount { get; }

    S[] OutputBuffer { get; }

    /// <summary>Writes part of an array; the return value reports how much was accepted.</summary>
    int Write(T[] waveform, int offset, int count);

    /// <summary>Writes a span; the return value reports how much was accepted.</summary>
    int Write(Span<T> waveform);

    /// <summary>Discards <paramref name="count"/> values from the output.</summary>
    void ConsumeOutput(int count);
}
23 | }
24 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/IAudioPreprocessor.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.Linq;
7 | using System.Text;
8 |
9 | namespace NeMoOnnxSharp.AudioPreprocessing
10 | {
/// <summary>
/// Converts a whole waveform into an array of features.
/// </summary>
/// <typeparam name="T1">Input sample type.</typeparam>
/// <typeparam name="T2">Output feature type.</typeparam>
public interface IAudioPreprocessor<T1, T2>
{
    /// <summary>Sample rate of the waveforms this preprocessor expects.</summary>
    int SampleRate { get; }

    /// <summary>Computes the features of <paramref name="input"/>.</summary>
    /// <param name="input">Waveform samples.</param>
    /// <returns>The computed features.</returns>
    T2[] GetFeatures(Span<T1> input);
}
17 | }
18 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/IFeaturizer.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 |
6 | namespace NeMoOnnxSharp.AudioPreprocessing
7 | {
/// <summary>
/// Converts one fixed-size block of input samples into one block of features.
/// </summary>
/// <typeparam name="T1">Input sample type.</typeparam>
/// <typeparam name="T2">Output feature type.</typeparam>
public interface IFeaturizer<T1, T2>
{
    /// <summary>Sample rate of the input this featurizer expects.</summary>
    int SampleRate { get; }

    /// <summary>Number of input samples read by one <see cref="GetFeatures"/> call.</summary>
    int InputLength { get; }

    /// <summary>Number of values written by one <see cref="GetFeatures"/> call.</summary>
    int OutputLength { get; }

    /// <summary>Computes the features of <paramref name="input"/> into <paramref name="output"/>.</summary>
    void GetFeatures(Span<T1> input, Span<T2> output);
}
15 | }
--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/MFCC.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 |
6 | namespace NeMoOnnxSharp.AudioPreprocessing
7 | {
8 | public class MFCC : IFeaturizer
9 | {
// Maps 16-bit samples to roughly [-1, 1].
private const double InvMaxShort = 1.0 / short.MaxValue;
// Added to the mel energies before the logarithm.
private const double LogOffset = 1e-6;

protected readonly int _sampleRate;
protected readonly double[] _window;
protected readonly double[] _melBands;
protected readonly int _nFFT;
protected readonly int _nMels;
private readonly MelNorm _melNorm;
private readonly int _power;
// NOTE(review): stored, but GetFeatures always requests log-mel output; confirm intent.
private readonly bool _logMels;
private readonly int _nMFCC;

public int SampleRate => _sampleRate;
public int InputLength => _window.Length;
public int OutputLength => _nMFCC;

/// <summary>
/// Creates an MFCC featurizer.
/// </summary>
/// <param name="sampleRate">Sample rate of the input waveform.</param>
/// <param name="window">Window function.</param>
/// <param name="winLength">Window length in samples; defaults to <paramref name="nFFT"/>.</param>
/// <param name="nFFT">
/// FFT size. It is passed to <c>FFT.CFFT</c>, which rejects sizes it does not support;
/// the default of 400 is one of them, so pass a supported size such as 512.
/// </param>
/// <param name="power">Power applied to the spectrum magnitude (1 or 2).</param>
/// <param name="normalized">Must be false.</param>
/// <param name="fMin">Lowest mel band edge in Hz.</param>
/// <param name="fMax">Highest mel band edge in Hz; defaults to half the sample rate.</param>
/// <param name="nMels">Number of mel bands.</param>
/// <param name="melNorm">Mel filter normalization.</param>
/// <param name="melScale">Mel scale used to place the bands.</param>
/// <param name="nMFCC">Number of cepstral coefficients produced.</param>
/// <param name="dctType">Must be 2.</param>
/// <param name="mfccNorm">Must be <c>MFCCNorm.Ortho</c>.</param>
/// <param name="logMels">Stored only; see the note on the field.</param>
/// <exception cref="ArgumentException">An unsupported option was requested.</exception>
public MFCC(
    int sampleRate = 16000,
    WindowFunction window = WindowFunction.Hann,
    int? winLength = null,
    int nFFT = 400,
    int power = 2,
    bool normalized = false,
    double fMin = 0.0,
    double? fMax = null,
    int nMels = 128,
    MelNorm melNorm = MelNorm.None,
    MelScale melScale = MelScale.HTK,
    int nMFCC = 40,
    int dctType = 2,
    MFCCNorm mfccNorm = MFCCNorm.Ortho,
    bool logMels = false)
{
    if (dctType != 2)
    {
        throw new ArgumentException("Only DCT-II is supported");
    }
    if (normalized)
    {
        throw new ArgumentException("Normalizing by magnitude after stft is not supported");
    }
    if (mfccNorm != MFCCNorm.Ortho)
    {
        throw new ArgumentException("Only Ortho is supported for MFCC norm");
    }
    _sampleRate = sampleRate;
    _window = Window.MakeWindow(window, winLength ?? nFFT);
    // Floating-point division keeps the exact Nyquist frequency for odd sample rates.
    _melBands = MelBands.MakeMelBands(fMin, fMax ?? sampleRate / 2.0, nMels, melScale);
    _melNorm = melNorm;
    _nFFT = nFFT;
    _nMels = nMels;
    _power = power;
    _logMels = logMels;
    _nMFCC = nMFCC;
}
66 |
/// <summary>
/// Computes the MFCCs of one frame.
/// </summary>
/// <param name="input">At least <c>InputLength</c> waveform samples.</param>
/// <param name="output">Receives <c>OutputLength</c> coefficients.</param>
public void GetFeatures(Span<short> input, Span<float> output)
{
    Span<double> temp1 = stackalloc double[_nFFT];
    Span<double> temp2 = stackalloc double[_nFFT];
    ReadFrame(input, temp1);
    // CFFT leaves the real part in temp2 and the imaginary part in temp1.
    FFT.CFFT(temp1, temp2, _nFFT);
    ToMagnitude(temp2, temp1);
    MelBands.ToMelSpectrogram(
        temp2, _melBands, _sampleRate, _nFFT, _nMels, _melNorm, true, LogOffset, temp1);
    // The DCT runs over all _nMels log-mel values and keeps the first _nMFCC
    // coefficients; transforming only the first _nMFCC mel bins would be wrong
    // whenever the two counts differ.
    OrthoDCT2(temp1.Slice(0, _nMels), temp2.Slice(0, _nMFCC));
    for (int i = 0; i < _nMFCC; i++) output[i] = (float)temp2[i];
}

/// <summary>
/// Writes the first <c>output.Length</c> coefficients of the orthonormal DCT-II of
/// <paramref name="input"/> into <paramref name="output"/>.
/// </summary>
private static void OrthoDCT2(Span<double> input, Span<double> output)
{
    int n = input.Length;
    for (int k = 0; k < output.Length; k++)
    {
        double s = 0;
        for (int j = 0; j < n; j++)
        {
            s += input[j] * Math.Cos(Math.PI * (j + 0.5) * k / n);
        }
        output[k] = k == 0 ? s / Math.Sqrt(n) : s / Math.Sqrt(n / 2.0);
    }
}
79 |
/// <summary>
/// Replaces <paramref name="xr"/> with the magnitude of the complex values
/// (<paramref name="xr"/>, <paramref name="xi"/>), raised to the configured power.
/// </summary>
/// <exception cref="NotImplementedException">The configured power is neither 1 nor 2.</exception>
private void ToMagnitude(Span<double> xr, Span<double> xi)
{
    switch (_power)
    {
        case 2:
            ToSquareMagnitude(xr, xi);
            break;
        case 1:
            ToAbsoluteMagnitude(xr, xi);
            break;
        default:
            throw new NotImplementedException("power must be 1 or 2.");
    }
}
95 |
/// <summary>
/// Overwrites every <paramref name="xr"/>[i] with sqrt(xr[i]^2 + xi[i]^2).
/// </summary>
private static void ToAbsoluteMagnitude(Span<double> xr, Span<double> xi)
{
    int count = xr.Length;
    for (int bin = 0; bin < count; bin++)
    {
        double re = xr[bin];
        double im = xi[bin];
        xr[bin] = Math.Sqrt(re * re + im * im);
    }
}
103 |
/// <summary>
/// Overwrites every <paramref name="xr"/>[i] with xr[i]^2 + xi[i]^2.
/// </summary>
private static void ToSquareMagnitude(Span<double> xr, Span<double> xi)
{
    int count = xr.Length;
    for (int bin = 0; bin < count; bin++)
    {
        double re = xr[bin];
        double im = xi[bin];
        xr[bin] = re * re + im * im;
    }
}
111 |
/// <summary>
/// Copies one window of <paramref name="waveform"/> into the middle of
/// <paramref name="frame"/>, scaled and windowed, and zeroes the rest of the frame.
/// </summary>
private void ReadFrame(Span<short> waveform, Span<double> frame)
{
    int windowLength = _window.Length;
    int padding = frame.Length / 2 - windowLength / 2;

    frame.Slice(0, padding).Fill(0);
    for (int n = 0; n < windowLength; n++)
    {
        frame[padding + n] = InvMaxShort * waveform[n] * _window[n];
    }
    frame.Slice(padding + windowLength).Fill(0);
}
122 | }
123 | }
124 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/MFCCNorm.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | namespace NeMoOnnxSharp.AudioPreprocessing
5 | {
/// <summary>
/// Normalization applied to the DCT that produces the MFCCs.
/// </summary>
public enum MFCCNorm
{
    /// <summary>No normalization.</summary>
    None = 0,
    /// <summary>Orthonormal scaling.</summary>
    Ortho = 1
}
11 | }
--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/MelBands.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.Linq;
7 | using System.Text;
8 | using System.Threading.Tasks;
9 |
10 | namespace NeMoOnnxSharp.AudioPreprocessing
11 | {
12 | public static class MelBands
13 | {
/// <summary>
/// Returns the mel band edge frequencies for the requested mel scale.
/// </summary>
/// <param name="melMinHz">Lowest edge in Hz.</param>
/// <param name="melMaxHz">Highest edge in Hz.</param>
/// <param name="nMelBanks">Number of mel bands.</param>
/// <param name="melScale">Mel scale to use.</param>
/// <exception cref="ArgumentException"><paramref name="melScale"/> is not a known scale.</exception>
public static double[] MakeMelBands(double melMinHz, double melMaxHz, int nMelBanks, MelScale melScale)
{
    switch (melScale)
    {
        case MelScale.HTK:
            return HTKMelBands.MakeMelBands(melMinHz, melMaxHz, nMelBanks);
        case MelScale.Slaney:
            return SlaneyMelBands.MakeMelBands(melMinHz, melMaxHz, nMelBanks);
        default:
            throw new ArgumentException();
    }
}
29 |
/// <summary>
/// Converts a magnitude spectrum into log mel energies.
/// </summary>
/// <param name="spec">Spectrum, indexed by FFT bin.</param>
/// <param name="melBands">Band edge frequencies in Hz (<paramref name="nMels"/> + 2 values).</param>
/// <param name="sampleRate">Sample rate used to convert bins to Hz.</param>
/// <param name="nFFT">FFT size.</param>
/// <param name="nMels">Number of mel bands to produce.</param>
/// <param name="norm">Filter normalization.</param>
/// <param name="log">Must be true.</param>
/// <param name="logOffset">Added to each band energy before the logarithm.</param>
/// <param name="melspec">Receives <paramref name="nMels"/> values.</param>
/// <exception cref="NotImplementedException"><paramref name="log"/> is false.</exception>
public static void ToMelSpectrogram(
    Span<double> spec, double[] melBands, double sampleRate,
    int nFFT, int nMels,
    MelNorm norm,
    bool log, double logOffset,
    Span<double> melspec)
{
    if (!log)
    {
        throw new NotImplementedException();
    }

    if (norm == MelNorm.None)
    {
        ToMelSpectrogramNone(spec, melBands, sampleRate, nFFT, nMels, logOffset, melspec);
    }
    else if (norm == MelNorm.Slaney)
    {
        ToMelSpectrogramSlaney(spec, melBands, sampleRate, nFFT, nMels, logOffset, melspec);
    }
}
48 |
/// <summary>
/// Applies unnormalized triangular mel filters to <paramref name="spec"/> and stores
/// the logarithm of each band energy in <paramref name="melspec"/>.
/// </summary>
private static void ToMelSpectrogramNone(
    Span<double> spec, double[] melBands, double sampleRate,
    int nFFT, int nMels, double logOffset,
    Span<double> melspec)
{
    for (int band = 0; band < nMels; band++)
    {
        double startHz = melBands[band];
        double peakHz = melBands[band + 1];
        double endHz = melBands[band + 2];
        double energy = 0.0;

        // First FFT bin strictly above the lower edge.
        int bin = (int)(startHz * nFFT / sampleRate) + 1;

        // Rising slope: lower edge up to and including the peak.
        for (; ; bin++)
        {
            double hz = bin * sampleRate / nFFT;
            if (hz > peakHz)
                break;
            double weight = (hz - startHz) / (peakHz - startHz);
            energy += spec[bin] * weight;
        }

        // Falling slope: above the peak up to and including the upper edge.
        for (; ; bin++)
        {
            double hz = bin * sampleRate / nFFT;
            if (hz > endHz)
                break;
            double weight = (endHz - hz) / (endHz - peakHz);
            energy += spec[bin] * weight;
        }

        melspec[band] = (float)Math.Log(energy + logOffset);
    }
}
82 |
/// <summary>
/// Applies triangular mel filters, each scaled by 2 / (endHz - startHz), to
/// <paramref name="spec"/> and stores the logarithm of each band energy in
/// <paramref name="melspec"/>.
/// </summary>
private static void ToMelSpectrogramSlaney(
    Span<double> spec, double[] melBands, double sampleRate,
    int nFFT, int nMels, double logOffset,
    Span<double> melspec)
{
    for (int band = 0; band < nMels; band++)
    {
        double startHz = melBands[band];
        double peakHz = melBands[band + 1];
        double endHz = melBands[band + 2];
        double energy = 0.0;

        // First FFT bin strictly above the lower edge.
        int bin = (int)(startHz * nFFT / sampleRate) + 1;

        // Rising slope: lower edge up to and including the peak.
        for (; ; bin++)
        {
            double hz = bin * sampleRate / nFFT;
            if (hz > peakHz)
                break;
            double weight = (hz - startHz) / (peakHz - startHz);
            energy += spec[bin] * weight * 2 / (endHz - startHz);
        }

        // Falling slope: above the peak up to and including the upper edge.
        for (; ; bin++)
        {
            double hz = bin * sampleRate / nFFT;
            if (hz > endHz)
                break;
            double weight = (endHz - hz) / (endHz - peakHz);
            energy += spec[bin] * weight * 2 / (endHz - startHz);
        }

        melspec[band] = (float)Math.Log(energy + logOffset);
    }
}
116 | }
117 | }
118 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/MelNorm.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.Text;
7 |
8 | namespace NeMoOnnxSharp.AudioPreprocessing
9 | {
/// <summary>
/// Normalization applied to the triangular mel filters.
/// </summary>
public enum MelNorm
{
    /// <summary>Filters are used as they are.</summary>
    None = 0,
    /// <summary>Each filter is scaled by 2 divided by its bandwidth in Hz.</summary>
    Slaney = 1
}
15 | }
16 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/MelScale.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | namespace NeMoOnnxSharp.AudioPreprocessing
5 | {
/// <summary>
/// Mel scale used to place the mel band edges.
/// </summary>
public enum MelScale
{
    /// <summary>HTK mel scale.</summary>
    HTK = 0,
    /// <summary>Slaney mel scale.</summary>
    Slaney = 1,
}
11 | }
--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/SlaneyMelBands.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.Linq;
7 | using System.Text;
8 | using System.Threading.Tasks;
9 |
10 | namespace NeMoOnnxSharp.AudioPreprocessing
11 | {
/// <summary>
/// Mel band edges on the Slaney mel scale: linear below 1000 Hz and
/// logarithmic from 1000 Hz upwards.
/// </summary>
internal static class SlaneyMelBands
{
    // Beginning of the log region in Hz.
    private const double MinLogHz = 1000.0;
    // Width of one mel in the linear region, in Hz.
    private const double LinearMelHz = 200.0 / 3;
    // Beginning of the log region in mels.
    private const double MinLogMel = MinLogHz / LinearMelHz;

    /// <summary>
    /// Returns <paramref name="nMelBanks"/> + 2 frequencies in Hz, evenly spaced on the
    /// mel scale from <paramref name="melMinHz"/> to <paramref name="melMaxHz"/> inclusive.
    /// </summary>
    public static double[] MakeMelBands(double melMinHz, double melMaxHz, int nMelBanks)
    {
        double lowMel = HzToMel(melMinHz);
        double highMel = HzToMel(melMaxHz);
        int edgeCount = nMelBanks + 2;
        double[] edges = new double[edgeCount];
        for (int n = 0; n < edgeCount; n++)
        {
            edges[n] = MelToHz((highMel - lowMel) * n / (nMelBanks + 1) + lowMel);
        }
        return edges;
    }

    /// <summary>Converts a frequency in Hz to Slaney mels.</summary>
    private static double HzToMel(double hz)
    {
        if (hz >= MinLogHz)
        {
            // Log region
            double logStep = Math.Log(6.4) / 27.0;
            return MinLogMel + Math.Log(hz / MinLogHz) / logStep;
        }

        // Linear region
        return hz / LinearMelHz;
    }

    /// <summary>Converts Slaney mels back to a frequency in Hz.</summary>
    private static double MelToHz(double mel)
    {
        if (mel >= MinLogMel)
        {
            // Log region
            double logStep = Math.Log(6.4) / 27.0;
            return MinLogHz * Math.Exp(logStep * (mel - MinLogMel));
        }

        // Linear region
        return LinearMelHz * mel;
    }
}
71 | }
72 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/Window.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.Text;
7 |
8 | namespace NeMoOnnxSharp.AudioPreprocessing
9 | {
/// <summary>
/// Factory for analysis window coefficients.
/// </summary>
public static class Window
{
    /// <summary>
    /// Creates the coefficients of the requested window.
    /// </summary>
    /// <param name="function">Window function; only <c>WindowFunction.Hann</c> is implemented.</param>
    /// <param name="length">Number of coefficients.</param>
    /// <exception cref="ArgumentException"><paramref name="function"/> is not implemented.</exception>
    public static double[] MakeWindow(WindowFunction function, int length)
    {
        if (function == WindowFunction.Hann)
        {
            return MakeHannWindow(length);
        }
        else
        {
            throw new ArgumentException("Unknown windows name");
        }
    }

    /// <summary>
    /// Creates a symmetric Hann window, 0.5 * (1 - cos(2 * pi * i / (length - 1))).
    /// </summary>
    private static double[] MakeHannWindow(int length)
    {
        double[] window = new double[length];
        if (length == 1)
        {
            // The general formula divides by length - 1 == 0 and would yield NaN;
            // a single-point window is just 1.
            window[0] = 1.0;
            return window;
        }
        for (int i = 0; i < length; i++)
        {
            window[i] = 0.5 * (1 - Math.Cos(2 * Math.PI * i / (length - 1)));
        }
        return window;
    }
}
34 | }
35 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/WindowFunction.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.Text;
7 |
8 | namespace NeMoOnnxSharp.AudioPreprocessing
9 | {
/// <summary>
/// Names of window functions.
/// </summary>
public enum WindowFunction
{
    Bartlett = 0,
    Blackman = 1,
    Hamming = 2,
    Hann = 3,
    Kaiser = 4
}
18 | }
19 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/FrameVAD.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.IO;
7 | using System.Linq;
8 | using NeMoOnnxSharp.AudioPreprocessing;
9 | using NeMoOnnxSharp.Models;
10 |
11 | namespace NeMoOnnxSharp
12 | {
13 | public sealed class FrameVAD : IDisposable
14 | {
private readonly int _sampleRate;
// Number of feature frames fed to the model per prediction.
private readonly int _modelWinLength;
// Number of feature frames consumed after each prediction.
private readonly int _modelHopLength;
// Next slot of _predictWindow to overwrite.
private int _predictIndex;
// Circular buffer holding the most recent per-frame predictions.
private float[] _predictWindow;
private readonly AudioFeatureBuffer<short, float> _featureBuffer;
private readonly EncDecClassificationModel _vad;

/// <summary>
/// Creates a frame-level voice activity detector.
/// </summary>
/// <param name="config">Configuration of the classification model.</param>
/// <param name="smoothingWinLength">
/// Number of consecutive predictions averaged to produce each output value; at least 1.
/// </param>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="smoothingWinLength"/> is less than 1.
/// </exception>
public FrameVAD(EncDecClassificationConfig config, int smoothingWinLength = 64)
{
    // Fail here rather than with an indexing error inside Transcribe.
    if (smoothingWinLength < 1)
    {
        throw new ArgumentOutOfRangeException(
            nameof(smoothingWinLength), "smoothingWinLength must be at least 1");
    }
    _sampleRate = 16000;
    _modelWinLength = 32;
    _modelHopLength = 1;
    _predictIndex = 0;
    _predictWindow = new float[smoothingWinLength];
    var transform = new MFCC(
        sampleRate: _sampleRate,
        window: WindowFunction.Hann,
        winLength: 400,
        nFFT: 512,
        nMels: 64,
        nMFCC: 64,
        fMin: 0.0,
        fMax: null,
        logMels: true,
        melScale: MelScale.HTK,
        melNorm: MelNorm.None);
    _featureBuffer = new AudioFeatureBuffer<short, float>(
        transform,
        hopLength: 160);
    _vad = new EncDecClassificationModel(config);
}
47 |
/// <summary>Number of waveform samples between two consecutive predictions.</summary>
public int HopLength => _featureBuffer.HopLength * _modelHopLength;

/// <summary>Sample rate of the expected input.</summary>
public int SampleRate => _sampleRate;

/// <summary>
/// Position derived from the feature buffer's output position, shifted by half of the
/// total span covered by the smoothing window and by half of the feature window length.
/// </summary>
public int PredictionOffset
{
    get
    {
        // Feature frames covered by all predictions in the smoothing window.
        int totalFrames = (_predictWindow.Length - 1) * _modelHopLength + _modelWinLength;
        return _featureBuffer.OutputPosition
            + _featureBuffer.HopLength * (totalFrames / 2 - _modelWinLength)
            - _featureBuffer.WinLength / 2;
    }
}
59 |
/// <summary>Releases the underlying classification model.</summary>
public void Dispose() => _vad.Dispose();
64 |
/// <summary>
/// Runs the detector on <paramref name="count"/> samples of <paramref name="input"/>
/// starting at <paramref name="offset"/>.
/// </summary>
/// <returns>One smoothed prediction per model step completed by this call.</returns>
public float[] Transcribe(short[] input, int offset, int count)
{
    Span<short> segment = input.AsSpan(offset, count);
    return Transcribe(segment);
}
69 |
/// <summary>
/// Feeds <paramref name="input"/> to the detector and returns the smoothed predictions
/// that became available.
/// </summary>
/// <param name="input">Waveform samples.</param>
/// <returns>One value per model step: the mean of the smoothing window after that step.</returns>
/// <exception cref="InvalidDataException">The feature buffer accepted no samples.</exception>
public float[] Transcribe(Span<short> input)
{
    var smoothed = new List<float>();
    while (input.Length > 0)
    {
        int consumed = _featureBuffer.Write(input);
        if (consumed == 0)
        {
            // No progress is possible; bail out instead of looping forever.
            throw new InvalidDataException();
        }

        // Run the model for as long as a full model window of features is buffered.
        while (_featureBuffer.OutputCount >= _featureBuffer.NumOutputChannels * _modelWinLength)
        {
            var logits = _vad.Predict(_featureBuffer.OutputBuffer.AsSpan(0, _featureBuffer.NumOutputChannels * _modelWinLength));
            // 1 / (1 + exp(l0 - l1)) is the two-class softmax probability of class 1.
            double ratio = Math.Exp(logits[0] - logits[1]);
            _predictWindow[_predictIndex] = (float)(1 / (ratio + 1));
            _predictIndex = (_predictIndex + 1) % _predictWindow.Length;

            smoothed.Add(_predictWindow.Average());
            _featureBuffer.ConsumeOutput(_featureBuffer.NumOutputChannels * _modelHopLength);
        }

        input = input.Slice(consumed);
    }
    return smoothed.ToArray();
}
94 | }
95 | }
96 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/Models/ASRModel.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.Linq;
7 | using System.Text;
8 |
9 | namespace NeMoOnnxSharp.Models
10 | {
/// <summary>
/// Base class of speech recognition models.
/// </summary>
public abstract class ASRModel : Model
{
    protected ASRModel(ModelConfig config) : base(config)
    {
    }

    /// <summary>Transcribes a waveform into text.</summary>
    /// <param name="inputSignal">Waveform samples.</param>
    public abstract string Transcribe(Span<short> inputSignal);

    /// <summary>
    /// Transposes a row-major (rows x <paramref name="nFeatures"/>) matrix into a
    /// row-major (<paramref name="nFeatures"/> x rows) matrix.
    /// </summary>
    /// <param name="inputSignal">Flat matrix with <paramref name="nFeatures"/> values per row.</param>
    /// <param name="nFeatures">Number of columns of the input.</param>
    /// <returns>
    /// A new array of the same length as the input. Values beyond the last complete
    /// input row are not copied.
    /// </returns>
    protected float[] TransposeInputSignal(Span<float> inputSignal, int nFeatures)
    {
        var transposed = new float[inputSignal.Length];
        int rowCount = inputSignal.Length / nFeatures;
        for (int feature = 0; feature < nFeatures; feature++)
        {
            int destinationRow = feature * rowCount;
            for (int row = 0; row < rowCount; row++)
            {
                transposed[destinationRow + row] = inputSignal[row * nFeatures + feature];
            }
        }
        return transposed;
    }
}
33 | }
34 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/Models/CharTokenizer.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.Linq;
7 | using System.Text;
8 | using System.Text.RegularExpressions;
9 |
10 | namespace NeMoOnnxSharp.Models
11 | {
/// <summary>
/// Maps characters to vocabulary indices and back.
/// </summary>
public class CharTokenizer
{
    private const string DefaultVocabulary = "_ abcdefghijklmnopqrstuvwxyz'";
    // Matches a run of two or more identical characters.
    private static readonly Regex MergeRx = new Regex(@"(.)\1+");

    private readonly IDictionary<char, long> _v2i;
    private readonly string _i2v;

    /// <summary>Creates a tokenizer for the default vocabulary.</summary>
    public CharTokenizer() : this(DefaultVocabulary)
    {
    }

    /// <summary>
    /// Creates a tokenizer whose vocabulary is the characters of
    /// <paramref name="characters"/>; a character's index is its position in the string.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="characters"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="characters"/> is empty.</exception>
    public CharTokenizer(string characters)
    {
        if (characters == null)
        {
            throw new ArgumentNullException(nameof(characters));
        }
        if (characters.Length == 0)
        {
            throw new ArgumentException("The vocabulary must not be empty", nameof(characters));
        }
        _i2v = characters;
        _v2i = new Dictionary<char, long>();
        for (int i = 0; i < _i2v.Length; i++) _v2i[_i2v[i]] = i;
    }

    /// <summary>
    /// Lower-cases and trims <paramref name="text"/>, then returns the vocabulary index
    /// of every character that is in the vocabulary; other characters are skipped.
    /// </summary>
    public long[] Encode(string text)
    {
        // Invariant casing: under e.g. the Turkish culture "I" would lower-case to a
        // dotless i that is not in the vocabulary and would be dropped.
        string lower = text.ToLowerInvariant().Trim();
        long[] encoded = new long[lower.Length];
        int j = 0;
        for (int i = 0; i < lower.Length; i++)
        {
            // On a miss the slot is overwritten by the next hit or trimmed below.
            if (_v2i.TryGetValue(lower[i], out encoded[j]))
            {
                j++;
            }
        }
        return encoded.AsSpan(0, j).ToArray();
    }

    /// <summary>
    /// Converts indices back to characters. Indices outside the vocabulary are
    /// replaced by the character at index 0.
    /// </summary>
    public string Decode(long[] encoded)
    {
        char[] chars = new char[encoded.Length];
        for (int i = 0; i < chars.Length; i++)
        {
            long index = encoded[i];
            if (index < 0 || index >= _i2v.Length) index = 0;
            chars[i] = _i2v[(int)index];
        }
        return new string(chars);
    }

    /// <summary>
    /// Collapses every run of a repeated character into one character and then
    /// removes all underscores.
    /// </summary>
    public string MergeRepeated(string text)
    {
        return MergeRx.Replace(text, @"$1").Replace("_", "");
    }
}
65 | }
--------------------------------------------------------------------------------
/NeMoOnnxSharp/Models/EncDecCTCConfig.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.Linq;
7 |
8 | namespace NeMoOnnxSharp.Models
9 | {
/// <summary>
/// Configuration for <c>EncDecCTCModel</c>: the model location inherited from
/// <c>ModelConfig</c> plus the character vocabulary used to decode the output.
/// </summary>
public class EncDecCTCConfig : ModelConfig
{
    /// <summary>Ready-made vocabulary string for English models.</summary>
    public const string EnglishVocabulary = " abcdefghijklmnopqrstuvwxyz'_";

    /// <summary>Ready-made vocabulary string for German models.</summary>
    public const string GermanVocabulary = " abcdefghijklmnopqrstuvwxyzäöüß_";

    /// <summary>
    /// Vocabulary handed to the tokenizer; the character at index <c>i</c> is the
    /// token with id <c>i</c>. Must be set before the model is constructed.
    /// </summary>
    public string? vocabulary;
}
17 | }
18 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/Models/EncDecCTCModel.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using Microsoft.ML.OnnxRuntime;
5 | using Microsoft.ML.OnnxRuntime.Tensors;
6 | using NeMoOnnxSharp.AudioPreprocessing;
7 | using System;
8 | using System.Collections.Generic;
9 | using System.Linq;
10 |
11 | namespace NeMoOnnxSharp.Models
12 | {
/// <summary>
/// CTC speech recognition model: mel-spectrogram features are fed to the ONNX
/// session and the per-frame arg-max of its <c>logprobs</c> output is decoded
/// into text with a <see cref="CharTokenizer"/>.
/// </summary>
public sealed class EncDecCTCModel : ASRModel, IDisposable
{
    private readonly IAudioPreprocessor _preProcessor;
    private readonly CharTokenizer _tokenizer;
    private readonly int _features;

    public IAudioPreprocessor PreProcessor => _preProcessor;
    public int SampleRate => _preProcessor.SampleRate;

    /// <summary>Loads the model described by <paramref name="config"/>.</summary>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="config"/> or its <c>vocabulary</c> is null.
    /// </exception>
    public EncDecCTCModel(EncDecCTCConfig config) : base(RequireVocabulary(config))
    {
        _features = 64;
        _preProcessor = new AudioToMelSpectrogramPreprocessor(
            sampleRate: 16000,
            window: WindowFunction.Hann,
            windowSize: 0.02,
            windowStride: 0.01,
            nFFT: 512,
            features: _features);
        _tokenizer = new CharTokenizer(config.vocabulary!);
    }

    public void Dispose()
    {
        _inferSess.Dispose();
    }

    /// <summary>
    /// Transcribes <paramref name="inputSignal"/>: computes features, transposes them to
    /// feature-major order, runs the session, and decodes the per-frame best tokens,
    /// merging repeated characters.
    /// </summary>
    public override string Transcribe(Span<short> inputSignal)
    {
        string text = string.Empty;
        var processedSignal = _preProcessor.GetFeatures(inputSignal);
        processedSignal = TransposeInputSignal(processedSignal, _features);
        var container = new List<NamedOnnxValue>();
        var audioSignalData = new DenseTensor<float>(
            processedSignal,
            new int[3] { 1, _features, processedSignal.Length / _features });
        container.Add(NamedOnnxValue.CreateFromTensor("audio_signal", audioSignalData));
        using (var res = _inferSess.Run(container, new string[] { "logprobs" }))
        {
            var logprobs = res.First();
            long[] preds = ArgMax(logprobs.AsTensor<float>());
            text = _tokenizer.Decode(preds);
            text = _tokenizer.MergeRepeated(text);
        }
        return text;
    }

    // Validates the configuration BEFORE the base constructor opens the ONNX session,
    // so a missing vocabulary no longer leaves an undisposed session behind.
    private static EncDecCTCConfig RequireVocabulary(EncDecCTCConfig config)
    {
        if (config == null)
        {
            throw new ArgumentNullException(nameof(config));
        }
        if (config.vocabulary == null)
        {
            throw new ArgumentNullException(nameof(config), "config.vocabulary must be set.");
        }
        return config;
    }

    // For every step along dimension 1, returns the index along dimension 2 holding the
    // largest value (first one wins on ties). A step with no value above float.MinValue
    // yields -1, which CharTokenizer.Decode maps to the character at index 0.
    private static long[] ArgMax(Tensor<float> logprobs)
    {
        int steps = logprobs.Dimensions[1];
        int classes = logprobs.Dimensions[2];
        long[] preds = new long[steps];
        for (int step = 0; step < steps; step++)
        {
            int best = -1;
            float bestValue = float.MinValue;
            for (int cls = 0; cls < classes; cls++)
            {
                float value = logprobs[0, step, cls];
                if (bestValue < value)
                {
                    best = cls;
                    bestValue = value;
                }
            }
            preds[step] = best;
        }

        return preds;
    }
}
82 | }
83 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/Models/EncDecClassificationConfig.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.Linq;
7 |
8 | namespace NeMoOnnxSharp.Models
9 | {
/// <summary>
/// Configuration for <c>EncDecClassificationModel</c>: the model location inherited
/// from <c>ModelConfig</c> plus the class labels, indexed by class id.
/// </summary>
public class EncDecClassificationConfig : ModelConfig
{
    /// <summary>Ready-made label set for speech-command models.</summary>
    public static readonly string[] SpeechCommandsLabels =
    {
        "visual", "wow", "learn", "backward", "dog",
        "two", "left", "happy", "nine", "go",
        "up", "bed", "stop", "one", "zero",
        "tree", "seven", "on", "four", "bird",
        "right", "eight", "no", "six", "forward",
        "house", "marvin", "sheila", "five", "off",
        "three", "down", "cat", "follow", "yes"
    };

    /// <summary>Ready-made label set for voice activity detection models.</summary>
    public static readonly string[] VADLabels =
    {
        "background",
        "speech"
    };

    /// <summary>Labels used by the model; must be set before the model is constructed.</summary>
    public string[]? labels;
}
30 | }
31 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/Models/EncDecClassificationModel.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using Microsoft.ML.OnnxRuntime;
5 | using Microsoft.ML.OnnxRuntime.Tensors;
6 | using NeMoOnnxSharp.AudioPreprocessing;
7 | using System;
8 | using System.Collections.Generic;
9 | using System.Linq;
10 |
11 | namespace NeMoOnnxSharp.Models
12 | {
/// <summary>
/// Audio classification model: MFCC features are fed to the ONNX session and its
/// <c>logits</c> output is mapped to one of the configured labels.
/// </summary>
public sealed class EncDecClassificationModel : ASRModel, IDisposable
{
    private readonly IAudioPreprocessor _preProcessor;
    private readonly int _nMelBands;
    private readonly string[] _labels;

    public IAudioPreprocessor PreProcessor => _preProcessor;

    /// <summary>Loads the model described by <paramref name="config"/>.</summary>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="config"/> or its <c>labels</c> is null.
    /// </exception>
    public EncDecClassificationModel(EncDecClassificationConfig config) : base(RequireLabels(config))
    {
        _nMelBands = 64;
        _preProcessor = new AudioToMFCCPreprocessor(
            sampleRate: 16000,
            window: WindowFunction.Hann,
            windowSize: 0.025,
            windowStride: 0.01,
            nFFT: 512,
            //preNormalize: 0.8,
            nMels: _nMelBands,
            nMFCC: _nMelBands);
        _labels = config.labels!;
    }

    public void Dispose()
    {
        _inferSess.Dispose();
    }

    /// <summary>
    /// Classifies <paramref name="inputSignal"/> and returns the label with the highest logit.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The best class index has no corresponding entry in the configured labels.
    /// </exception>
    public override string Transcribe(Span<short> inputSignal)
    {
        var processedSignal = _preProcessor.GetFeatures(inputSignal);
        var container = CreateInputs(processedSignal);
        using (var res = _inferSess.Run(container, new string[] { "logits" }))
        {
            var scoreTensor = res.First();
            long pred = ArgMax(scoreTensor.AsTensor<float>());
            if (pred < 0 || pred >= _labels.Length)
            {
                // Previously surfaced as a bare IndexOutOfRangeException.
                throw new InvalidOperationException(
                    $"The model predicted class {pred}, but only {_labels.Length} labels are configured.");
            }
            return _labels[pred];
        }
    }

    /// <summary>
    /// Runs the model on already computed features and returns the flattened <c>logits</c> output.
    /// </summary>
    /// <param name="processedSignal">Features, <c>_nMelBands</c> values per frame.</param>
    public float[] Predict(Span<float> processedSignal)
    {
        var container = CreateInputs(processedSignal);
        using (var res = _inferSess.Run(container, new string[] { "logits" }))
        {
            var logitsTensor = res.First();
            return logitsTensor.AsTensor<float>().ToArray();
        }
    }

    // Validates the configuration BEFORE the base constructor opens the ONNX session,
    // so missing labels no longer leave an undisposed session behind.
    private static EncDecClassificationConfig RequireLabels(EncDecClassificationConfig config)
    {
        if (config == null)
        {
            throw new ArgumentNullException(nameof(config));
        }
        if (config.labels == null)
        {
            throw new ArgumentNullException(nameof(config.labels), "config.labels must be set.");
        }
        return config;
    }

    // Builds the single "audio_signal" input: the features transposed to feature-major
    // order and wrapped in a tensor of shape { 1, _nMelBands, frames }.
    private List<NamedOnnxValue> CreateInputs(Span<float> processedSignal)
    {
        var transposed = TransposeInputSignal(processedSignal, _nMelBands);
        var audioSignalData = new DenseTensor<float>(
            transposed,
            new int[3] { 1, _nMelBands, transposed.Length / _nMelBands });
        return new List<NamedOnnxValue>
        {
            NamedOnnxValue.CreateFromTensor("audio_signal", audioSignalData)
        };
    }

    // Index along dimension 1 of the largest value in row 0 (first one wins on ties);
    // -1 when no value is above float.MinValue.
    private static long ArgMax(Tensor<float> score)
    {
        int best = -1;
        float bestValue = float.MinValue;
        int classes = score.Dimensions[1];
        for (int cls = 0; cls < classes; cls++)
        {
            float value = score[0, cls];
            if (bestValue < value)
            {
                best = cls;
                bestValue = value;
            }
        }
        return best;
    }
}
93 | }
94 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/Models/Model.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using Microsoft.ML.OnnxRuntime;
5 | using Microsoft.ML.OnnxRuntime.Tensors;
6 | using System;
7 | using System.Collections.Generic;
8 | using System.IO;
9 | using System.Linq;
10 |
11 | namespace NeMoOnnxSharp.Models
12 | {
/// <summary>
/// Base class of all models: owns the ONNX Runtime inference session created from
/// the model bytes or the model file named in the configuration.
/// </summary>
public abstract class Model
{
    /// <summary>Inference session shared with derived classes, which dispose it.</summary>
    protected readonly InferenceSession _inferSess;

    /// <summary>
    /// Opens the inference session. In-memory model bytes take precedence over the
    /// model path when both are set.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
    /// <exception cref="InvalidDataException">Neither <c>model</c> nor <c>modelPath</c> is set.</exception>
    protected Model(ModelConfig config)
    {
        if (config == null)
        {
            throw new ArgumentNullException(nameof(config));
        }

        if (config.model != null)
        {
            _inferSess = new InferenceSession(config.model);
        }
        else if (config.modelPath != null)
        {
            _inferSess = new InferenceSession(config.modelPath);
        }
        else
        {
            throw new InvalidDataException(
                "The model configuration must provide either model bytes (model) or a model file path (modelPath).");
        }
    }
}
33 | }
34 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/Models/ModelConfig.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.Linq;
7 |
8 | namespace NeMoOnnxSharp.Models
9 | {
/// <summary>
/// Location of an ONNX model, given either as a file path or as the raw model bytes.
/// </summary>
public class ModelConfig
{
    /// <summary>Path of the model file, or null when not set.</summary>
    public string? modelPath;

    /// <summary>Serialized model held in memory, or null when not set.</summary>
    public byte[]? model;
}
15 | }
16 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/Models/SpectrogramGenerator.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using Microsoft.ML.OnnxRuntime;
5 | using Microsoft.ML.OnnxRuntime.Tensors;
6 | using NeMoOnnxSharp.TTSTokenizers;
7 | using System;
8 | using System.Collections.Generic;
9 | using System.Linq;
10 |
11 | namespace NeMoOnnxSharp.Models
12 | {
/// <summary>
/// Text-to-spectrogram model: tokenizes text with the configured tokenizer and runs
/// the ONNX session to obtain its <c>spect</c> output.
/// </summary>
public sealed class SpectrogramGenerator : Model, IDisposable
{
    private readonly BaseTokenizer _tokenizer;

    /// <summary>Loads the model and sets up the tokenizer named in <paramref name="config"/>.</summary>
    /// <exception cref="ArgumentNullException">A dictionary path required by the tokenizer is missing.</exception>
    /// <exception cref="ArgumentException"><c>textTokenizer</c> names an unsupported tokenizer.</exception>
    public SpectrogramGenerator(SpectrogramGeneratorConfig config) : base(config)
    {
        try
        {
            _tokenizer = _SetupTokenizer(config);
        }
        catch
        {
            // The base constructor has already opened the session; release it, because
            // the caller never receives an instance it could dispose.
            _inferSess.Dispose();
            throw;
        }
    }

    public void Dispose()
    {
        _inferSess.Dispose();
    }

    /// <summary>Converts text into token ids.</summary>
    /// <param name="strInput">Text to tokenize.</param>
    /// <param name="normalize">Whether to pass the text through text normalization first.</param>
    public int[] Parse(string strInput, bool normalize = true)
    {
        if (normalize)
        {
            strInput = _NormalizeText(strInput);
        }
        return _tokenizer.Encode(strInput);
    }

    private static BaseTokenizer _SetupTokenizer(SpectrogramGeneratorConfig config)
    {
        switch (config.textTokenizer)
        {
            case "EnglishPhonemesTokenizer":
                if (config.phonemeDictPath == null)
                {
                    throw new ArgumentNullException(
                        nameof(config.phonemeDictPath),
                        "phonemeDictPath is required by EnglishPhonemesTokenizer.");
                }
                if (config.heteronymsPath == null)
                {
                    throw new ArgumentNullException(
                        nameof(config.heteronymsPath),
                        "heteronymsPath is required by EnglishPhonemesTokenizer.");
                }
                var g2p = new EnglishG2p(
                    phonemeDict: config.phonemeDictPath,
                    heteronyms: config.heteronymsPath,
                    phonemeProbability: 1.0);
                return new EnglishPhonemesTokenizer(
                    g2p,
                    punct: true,
                    stresses: true,
                    chars: true,
                    apostrophe: true,
                    padWithSpace: true,
                    addBlankAt: BaseTokenizer.AddBlankAt.True);

            case "GermanCharsTokenizer":
                return new GermanCharsTokenizer(
                    padWithSpace: true);

            default:
                throw new ArgumentException(
                    $"Unsupported textTokenizer '{config.textTokenizer}'. " +
                    "Supported values are 'EnglishPhonemesTokenizer' and 'GermanCharsTokenizer'.",
                    nameof(config));
        }
    }

    // Text normalization is currently a no-op: the input is returned unchanged.
    private string _NormalizeText(string strInput)
    {
        return strInput;
    }

    /// <summary>
    /// Runs the model on <paramref name="tokens"/> and returns the flattened <c>spect</c> output.
    /// </summary>
    /// <param name="tokens">Token ids, as returned by <see cref="Parse"/>.</param>
    /// <param name="pace">Value fed to the model's <c>pace</c> input.</param>
    public float[] GenerateSpectrogram(int[] tokens, double pace = 1.0)
    {
        var textData = new DenseTensor<long>(
            Array.ConvertAll(tokens, token => (long)token),
            new int[2] { 1, tokens.Length });
        var paceData = new DenseTensor<float>(
            new float[] { (float)pace },
            new int[2] { 1, 1 });
        // The "pitch" input is fed all zeros, one per token.
        var pitchData = new DenseTensor<float>(
            new float[tokens.Length],
            new int[2] { 1, tokens.Length });
        var container = new List<NamedOnnxValue>
        {
            NamedOnnxValue.CreateFromTensor("text", textData),
            NamedOnnxValue.CreateFromTensor("pace", paceData),
            NamedOnnxValue.CreateFromTensor("pitch", pitchData)
        };
        using (var res = _inferSess.Run(container, new string[] { "spect" }))
        {
            return res.First().AsTensor<float>().ToArray();
        }
    }
}
98 | }
99 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/Models/SpectrogramGeneratorConfig.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.IO;
7 | using System.Linq;
8 |
9 | namespace NeMoOnnxSharp.Models
10 | {
/// <summary>
/// Configuration for <c>SpectrogramGenerator</c>: the model location inherited from
/// <c>ModelConfig</c> plus the tokenizer selection and its dictionary files.
/// </summary>
public class SpectrogramGeneratorConfig : ModelConfig
{
    /// <summary>Path of the phoneme dictionary handed to the English G2P.</summary>
    public string? phonemeDictPath;

    /// <summary>Path of the heteronyms list handed to the English G2P.</summary>
    public string? heteronymsPath;

    /// <summary>Name of the tokenizer to use.</summary>
    public string? textTokenizer;
}
17 | }
18 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/Models/Vocoder.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using Microsoft.ML.OnnxRuntime;
5 | using Microsoft.ML.OnnxRuntime.Tensors;
6 | using System;
7 | using System.Collections.Generic;
8 | using System.Linq;
9 |
10 | namespace NeMoOnnxSharp.Models
11 | {
/// <summary>
/// Vocoder model: converts a spectrogram into 16-bit PCM audio by running the ONNX
/// session and scaling its <c>audio</c> output.
/// </summary>
public sealed class Vocoder : Model, IDisposable
{
    private readonly int _nfilt;
    private readonly int _sampleRate;

    public Vocoder(VocoderConfig config) : base(config)
    {
        _nfilt = 80;
        _sampleRate = 22050;
    }

    /// <summary>Sample rate reported for the generated audio.</summary>
    public int SampleRate { get { return _sampleRate; } }

    public void Dispose()
    {
        _inferSess.Dispose();
    }

    /// <summary>
    /// Runs the model on <paramref name="spec"/> and converts the floating point
    /// output to 16-bit samples.
    /// </summary>
    /// <param name="spec">Flattened spectrogram, reshaped to <c>{ 1, 80, spec.Length / 80 }</c>.</param>
    /// <returns>One 16-bit sample per value of the model's <c>audio</c> output.</returns>
    public short[] ConvertSpectrogramToAudio(float[] spec)
    {
        var specData = new DenseTensor<float>(
            spec,
            new int[3] { 1, _nfilt, spec.Length / _nfilt });
        var container = new List<NamedOnnxValue>
        {
            NamedOnnxValue.CreateFromTensor("spec", specData)
        };
        float[] audio;
        using (var res = _inferSess.Run(container, new string[] { "audio" }))
        {
            audio = res.First().AsTensor<float>().ToArray();
        }
        return Array.ConvertAll(audio, ToPcm16);
    }

    // Scales a sample to 16 bits. Values outside [-1, 1] are clamped and NaN becomes
    // silence; a plain (short) cast of an out-of-range float yields an unspecified
    // value, which is audible as loud clicks.
    private static short ToPcm16(float sample)
    {
        if (float.IsNaN(sample))
        {
            return 0;
        }
        float clamped = Math.Max(-1.0f, Math.Min(1.0f, sample));
        return (short)(clamped * short.MaxValue);
    }
}
46 | }
47 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/Models/VocoderConfig.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.Linq;
7 |
8 | namespace NeMoOnnxSharp.Models
9 | {
/// <summary>
/// Configuration for <c>Vocoder</c>. Adds nothing to the model location inherited
/// from <c>ModelConfig</c>.
/// </summary>
public class VocoderConfig : ModelConfig
{
}
13 | }
14 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/NeMoOnnxSharp.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | netstandard2.1
5 | enable
6 | $(VersionPrefix)1.3.0
7 | https://github.com/kaiidams/NeMoOnnxSharp
8 | nemo onnx text-to-speech csharp speech tts speech-synthesis speech-recognition asr
9 | Copyright (C) 2022 Katsuya Iida. All rights reserved.
10 | Text-to-speech and speech recognition, VAD with NVIDIA NeMo and ONNX Runtime for .NET Core.
11 | https://github.com/kaiidams/NeMoOnnxSharp
12 | Katsuya Iida
13 |
14 | Apache-2.0
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/SpeechConfig.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.IO;
7 | using System.Linq;
8 | using NeMoOnnxSharp.Models;
9 |
10 | namespace NeMoOnnxSharp
11 | {
/// <summary>
/// Bundle of the per-model configurations used by the speech recognizer and the
/// speech synthesizer. Every member starts out as an empty configuration.
/// </summary>
public class SpeechConfig
{
    /// <summary>Configuration of the voice activity detection model.</summary>
    public EncDecClassificationConfig vad = new EncDecClassificationConfig();

    /// <summary>Configuration of the speech recognition model.</summary>
    public EncDecCTCConfig asr = new EncDecCTCConfig();

    /// <summary>Configuration of the spectrogram generator model.</summary>
    public SpectrogramGeneratorConfig specGen = new SpectrogramGeneratorConfig();

    /// <summary>Configuration of the vocoder model.</summary>
    public VocoderConfig vocoder = new VocoderConfig();

    public SpeechConfig()
    {
    }
}
27 | }
28 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/SpeechRecognitionEventArgs.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.IO;
7 | using System.Linq;
8 |
9 | namespace NeMoOnnxSharp
10 | {
/// <summary>
/// Data passed with the speech recognizer's events. Derives from
/// <see cref="EventArgs"/>, following the standard .NET event pattern.
/// </summary>
public class SpeechRecognitionEventArgs : EventArgs
{
    /// <summary>Creates the event data.</summary>
    /// <param name="offset">Position in the audio stream the event refers to.</param>
    /// <param name="text">Recognized text, or null when the event carries none.</param>
    /// <param name="audio">Audio samples of the segment, or null when the event carries none.</param>
    public SpeechRecognitionEventArgs(ulong offset, string? text = null, short[]? audio = null)
    {
        Offset = offset;
        Text = text;
        Audio = audio;
    }

    /// <summary>Position in the audio stream the event refers to.</summary>
    public ulong Offset { get; }

    /// <summary>Recognized text, or null.</summary>
    public string? Text { get; }

    /// <summary>Audio samples of the segment, or null.</summary>
    public short[]? Audio { get; }
}
24 | }
25 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/SpeechRecognizer.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.IO;
7 | using System.Linq;
8 | using System.Runtime.InteropServices;
9 | using NeMoOnnxSharp.Models;
10 |
11 | namespace NeMoOnnxSharp
12 | {
/// <summary>
/// Streaming speech recognizer. Audio written to it is scored with a
/// <see cref="FrameVAD"/>; when the score crosses the start threshold a speech
/// segment begins, and when it falls below the end threshold the buffered segment
/// is transcribed with an <see cref="EncDecCTCModel"/> and reported through events.
/// </summary>
public sealed class SpeechRecognizer : IDisposable
{
    private readonly FrameVAD _frameVad;
    private readonly EncDecCTCModel _asrModel;
    private readonly int _audioBufferIncrease;
    private readonly int _audioBufferSize;
    private readonly float _speechStartThreshold;
    private readonly float _speechEndThreshold;
    private int _audioBufferIndex;
    private long _currentPosition;
    private byte[] _audioBuffer;
    private bool _isSpeech;

    public SpeechRecognizer(SpeechConfig config)
    {
        _frameVad = new FrameVAD(config.vad);
        try
        {
            _asrModel = new EncDecCTCModel(config.asr);
        }
        catch
        {
            // Do not leave the VAD model loaded when the ASR model fails to load.
            _frameVad.Dispose();
            throw;
        }
        _currentPosition = 0;
        _audioBufferIndex = 0;
        _audioBufferSize = sizeof(short) * _frameVad.SampleRate * 2; // 2 seconds
        _audioBufferIncrease = sizeof(short) * 5 * _frameVad.SampleRate; // 5 seconds
        _audioBuffer = new byte[_audioBufferSize];
        _isSpeech = false;
        _speechStartThreshold = 0.7f;
        _speechEndThreshold = 0.3f;
    }

    public int SampleRate => _frameVad.SampleRate;

    /// <summary>Raised when a speech segment has ended and has been transcribed.</summary>
    public event EventHandler<SpeechRecognitionEventArgs>? Recognized;

    /// <summary>Raised when the start of a speech segment is detected.</summary>
    public event EventHandler<SpeechRecognitionEventArgs>? SpeechStartDetected;

    /// <summary>Raised when the end of a speech segment is detected.</summary>
    public event EventHandler<SpeechRecognitionEventArgs>? SpeechEndDetected;

    public void Dispose()
    {
        _frameVad.Dispose();
        _asrModel.Dispose();
    }

    /// <summary>Writes <paramref name="count"/> bytes of 16-bit audio starting at <paramref name="offset"/>.</summary>
    public void Write(byte[] input, int offset, int count)
    {
        Write(input.AsSpan(offset, count));
    }

    /// <summary>Writes <paramref name="count"/> samples starting at <paramref name="offset"/>.</summary>
    public void Write(short[] input, int offset, int count)
    {
        Write(input.AsSpan(offset, count));
    }

    /// <summary>Writes 16-bit audio samples.</summary>
    public void Write(Span<short> input)
    {
        var bytes = MemoryMarshal.Cast<short, byte>(input);
        Write(bytes);
    }

    /// <summary>
    /// Writes audio given as the raw bytes of 16-bit samples. Outside of speech the
    /// internal buffer is reused from the start whenever it is full; during speech it
    /// grows so that the whole segment is kept.
    /// </summary>
    public void Write(Span<byte> input)
    {
        // NOTE(review): an odd number of bytes leaves a dangling byte in the buffer;
        // callers are presumably expected to pass whole samples - confirm.
        while (input.Length > 0)
        {
            int len = input.Length;
            if (_isSpeech)
            {
                int required = _audioBufferIndex + len;
                if (required > _audioBuffer.Length)
                {
                    // Grow by the usual increment, or more if a single write needs it.
                    // A single fixed increment overran the buffer on large writes.
                    int newLength = Math.Max(required, _audioBuffer.Length + _audioBufferIncrease);
                    var grown = new byte[newLength];
                    Array.Copy(_audioBuffer, grown, _audioBufferIndex);
                    _audioBuffer = grown;
                }
            }
            else
            {
                if (_audioBufferIndex >= _audioBuffer.Length)
                {
                    _audioBufferIndex = 0;
                }
                len = Math.Min(_audioBuffer.Length - _audioBufferIndex, len);
            }
            input.Slice(0, len).CopyTo(_audioBuffer.AsSpan(_audioBufferIndex, len));
            input = input.Slice(len);
            int wholeSampleBytes = (len / sizeof(short)) * sizeof(short);
            var audioSignal = MemoryMarshal.Cast<byte, short>(
                _audioBuffer.AsSpan(_audioBufferIndex, wholeSampleBytes));
            _audioBufferIndex += len;
            _currentPosition += audioSignal.Length;
            _Transcribe(audioSignal);
        }
    }

    // Feeds the new samples to the VAD and walks the returned probabilities, switching
    // between the "speech" and "no speech" states and raising the events.
    private void _Transcribe(Span<short> audioSignal)
    {
        // Negative sample offset, relative to the end of the data written so far, that
        // the current probability refers to; advanced by one hop per probability.
        var pos = -(audioSignal.Length + _frameVad.PredictionOffset);
        var result = _frameVad.Transcribe(audioSignal);
        foreach (var prob in result)
        {
            if (_isSpeech)
            {
                if (prob < _speechEndThreshold)
                {
                    _isSpeech = false;
                    int posBytes = pos * sizeof(short);
                    var recognized = Recognized;
                    if (recognized != null)
                    {
                        // Only run the ASR model when somebody listens for the result.
                        var audio = _audioBuffer.AsSpan(0, _audioBufferIndex + posBytes);
                        var samples = MemoryMarshal.Cast<byte, short>(audio).ToArray();
                        string predictText = _asrModel.Transcribe(samples);
                        recognized(this, new SpeechRecognitionEventArgs(
                            (ulong)(_currentPosition + pos), predictText, samples));
                    }
                    SpeechEndDetected?.Invoke(this, new SpeechRecognitionEventArgs(
                        (ulong)(_currentPosition + pos)));
                    _ResetAudioBuffer(posBytes);
                }
            }
            else if (prob >= _speechStartThreshold)
            {
                _isSpeech = true;
                SpeechStartDetected?.Invoke(this, new SpeechRecognitionEventArgs(
                    (ulong)(_currentPosition + pos)));
                _ChangeAudioBufferForSpeech(pos * sizeof(short));
            }
            pos += _frameVad.HopLength;
        }
    }

    // Replaces the buffer with a fresh one of the initial size that holds only the
    // last -posBytes bytes of the old buffer (the audio after the end of the segment).
    private void _ResetAudioBuffer(int posBytes)
    {
        var fresh = new byte[_audioBufferSize];
        Array.Copy(
            _audioBuffer, _audioBufferIndex + posBytes,
            fresh, 0,
            -posBytes);
        _audioBuffer = fresh;
        _audioBufferIndex = -posBytes;
    }

    // Rearranges the buffer so that it starts with the last -posBytes bytes written,
    // i.e. with the beginning of the speech segment, ready to be appended to.
    private void _ChangeAudioBufferForSpeech(int posBytes)
    {
        int audioBufferStart = _audioBufferIndex + posBytes;
        int audioBufferEnd = _audioBufferIndex;
        int oldLength = _audioBuffer.Length;
        if (audioBufferStart >= 0)
        {
            // The segment start lies inside the data written since the last wrap:
            // move it to the front in place.
            Array.Copy(
                _audioBuffer, audioBufferStart,
                _audioBuffer, 0,
                audioBufferEnd - audioBufferStart);
            _audioBufferIndex = audioBufferEnd - audioBufferStart;
        }
        else if (audioBufferStart + oldLength >= audioBufferEnd)
        {
            // The segment start lies before the wrap point: take the tail of the
            // buffer first, then the data from the front.
            var unwrapped = new byte[oldLength + _audioBufferIncrease];
            Array.Copy(
                _audioBuffer, audioBufferStart + oldLength,
                unwrapped, 0,
                -audioBufferStart);
            Array.Copy(
                _audioBuffer, 0,
                unwrapped, -audioBufferStart,
                audioBufferEnd);
            _audioBuffer = unwrapped;
            _audioBufferIndex = audioBufferEnd - audioBufferStart;
        }
        else
        {
            // The segment start is older than everything still buffered: keep the whole
            // buffer content, oldest data first.
            var unwrapped = new byte[oldLength + _audioBufferIncrease];
            Array.Copy(
                _audioBuffer, audioBufferEnd,
                unwrapped, 0,
                oldLength - audioBufferEnd);
            Array.Copy(
                _audioBuffer, 0,
                unwrapped, oldLength - audioBufferEnd,
                audioBufferEnd);
            _audioBuffer = unwrapped;
            // Exactly oldLength bytes were copied. Using the length of the enlarged
            // buffer here would leave a gap of zeros in the recorded segment.
            _audioBufferIndex = oldLength;
        }
    }
}
197 | }
198 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/SpeechSynthesisResult.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.IO;
7 | using System.Linq;
8 |
9 | namespace NeMoOnnxSharp
10 | {
/// <summary>
/// Result of a speech synthesis request: the generated samples and their sample rate.
/// </summary>
public class SpeechSynthesisResult
{
    public SpeechSynthesisResult()
    {
    }

    /// <summary>Sample rate of <see cref="AudioData"/>.</summary>
    public int SampleRate { get; set; }

    /// <summary>Generated 16-bit audio samples, or null when not set.</summary>
    public short[]? AudioData { get; set; }
}
20 | }
21 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/SpeechSynthesizer.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.IO;
7 | using System.Linq;
8 | using NeMoOnnxSharp.Models;
9 |
10 | namespace NeMoOnnxSharp
11 | {
/// <summary>
/// Text-to-speech front end: a <see cref="SpectrogramGenerator"/> turns text into a
/// spectrogram and a <see cref="Vocoder"/> turns the spectrogram into audio.
/// </summary>
public sealed class SpeechSynthesizer : IDisposable
{
    private readonly SpectrogramGenerator _specGen;
    private readonly Vocoder _vocoder;

    public SpeechSynthesizer(SpeechConfig config)
    {
        _specGen = new SpectrogramGenerator(config.specGen);
        try
        {
            _vocoder = new Vocoder(config.vocoder);
        }
        catch
        {
            // Do not leave the spectrogram generator loaded when the vocoder fails to load.
            _specGen.Dispose();
            throw;
        }
    }

    public void Dispose()
    {
        _specGen.Dispose();
        _vocoder.Dispose();
    }

    /// <summary>Synthesizes speech for <paramref name="text"/>.</summary>
    /// <returns>The generated samples together with the vocoder's sample rate.</returns>
    public SpeechSynthesisResult SpeakText(string text)
    {
        int[] tokens = _specGen.Parse(text);
        float[] spectrogram = _specGen.GenerateSpectrogram(tokens, pace: 1.0);
        short[] samples = _vocoder.ConvertSpectrogramToAudio(spectrogram);
        return new SpeechSynthesisResult()
        {
            AudioData = samples,
            SampleRate = _vocoder.SampleRate
        };
    }
}
41 | }
42 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/TTSTokenizers/BaseCharsTokenizerr.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | // A number of implementation details in this file have been translated from the Python scripts of NVIDIA NeMo,
5 | // largely located in the files found in this folder:
6 | //
7 | // https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/torch/tts_tokenizers.py
8 | //
9 | // The origin has the following copyright notice and license:
10 | //
11 | // https://github.com/NVIDIA/NeMo/blob/main/LICENSE
12 | //
13 |
14 | using System;
15 | using System.Collections.Generic;
16 | using System.Linq;
17 | using System.Text;
18 |
19 | namespace NeMoOnnxSharp.TTSTokenizers
20 | {
21 | // nemo.collections.tts.torch.tts_tokenizers.BaseCharsTokenizer
/// <summary>
/// Tokenizer whose tokens are single characters: a space, the given characters, an
/// optional apostrophe, optional punctuation, and the utility tokens (pad, optional
/// blank, out-of-vocabulary).
/// </summary>
public class BaseCharsTokenizer : BaseTokenizer
{
    // Derived from LJSpeech and "/" additionally
    private static readonly string[] PunctList =
    {
        ",", ".", "!", "?", "-",
        ":", ";", "/", "\"", "(",
        ")", "[", "]", "{", "}",
    };

    private readonly bool _punct;

    /// <summary>Builds the token table.</summary>
    /// <param name="chars">Characters of the alphabet; each becomes one token.</param>
    /// <param name="punct">Whether punctuation tokens are added and kept by <see cref="Encode"/>.</param>
    /// <param name="nonDefaultPunctList">Punctuation tokens to use instead of the default list.</param>
    /// <param name="apostrophe">Whether the apostrophe is added as a token.</param>
    /// <param name="oov">Text of the out-of-vocabulary token.</param>
    /// <param name="sep">Separator used when decoding.</param>
    /// <param name="addBlankAt">Whether a blank token is added; <c>Last</c> is not implemented.</param>
    /// <param name="padWithSpace">Whether <see cref="Encode"/> surrounds its output with spaces.</param>
    /// <exception cref="NotImplementedException"><paramref name="addBlankAt"/> is <c>Last</c>.</exception>
    public BaseCharsTokenizer(
        string chars,
        bool punct = true,
        string[]? nonDefaultPunctList = null,
        bool apostrophe = true,
        string oov = OOV,
        string sep = "|", // To be able to distinguish between 2/3 letters codes.
        AddBlankAt addBlankAt = AddBlankAt.None,
        bool padWithSpace = false)
    {
        _space = 0;
        var tokens = new List<string> { " " };
        tokens.AddRange(chars.Select(ch => ch.ToString()));
        if (apostrophe)
        {
            tokens.Add("'"); // Apostrophe for saving "don't" and "Joe's"
        }

        if (punct)
        {
            tokens.AddRange(nonDefaultPunctList ?? PunctList);
        }

        // Each id is the index the token is about to be stored at, so it must be taken
        // BEFORE the token is appended. Taking it afterwards made the pad id point at
        // the following token and the oov id point past the end of the table.
        _pad = tokens.Count;
        tokens.Add(Pad);

        bool hasBlank = addBlankAt != AddBlankAt.None;
        if (hasBlank)
        {
            _blank = tokens.Count;
            tokens.Add(Blank);
        }

        _oov = tokens.Count;
        tokens.Add(oov); // Out Of Vocabulary

        if (addBlankAt == AddBlankAt.Last)
        {
            throw new NotImplementedException();
        }

        _sep = sep;
        _punct = punct;
        _padWithSpace = padWithSpace;

        _id2token = tokens.ToArray();
        _token2id = new Dictionary<string, int>(
            Enumerable.Range(0, _id2token.Length)
                .Select(i => new KeyValuePair<string, int>(_id2token[i], i)));

        // Without a blank token _blank keeps its default value 0, which is the id of the
        // space token; it must not be treated as a utility id, or decoding drops spaces.
        _utilIds = new HashSet<int>() { _pad, _oov };
        if (hasBlank)
        {
            _utilIds.Add(_blank);
        }
    }

    /// <summary>
    /// Preprocesses <paramref name="text"/> and maps it character by character to token
    /// ids. Leading, trailing and repeated spaces are dropped, characters without a
    /// token are skipped, and the result is surrounded by spaces when padding is enabled.
    /// </summary>
    public override int[] Encode(string text)
    {
        var cs = new List<string>();
        var space = _id2token[_space];

        text = TextPreprocessingFunc(text);
        foreach (char ch in text)
        {
            string c = ch.ToString();
            bool isAlnum = char.IsLetterOrDigit(ch);

            if (c == space)
            {
                // Add a whitespace only if the previous char is not a whitespace. The
                // space is itself a token, so it must not reach the punctuation branch.
                if (cs.Count > 0 && cs[cs.Count - 1] != space)
                {
                    cs.Add(c);
                }
            }
            // Add the current char that is an alphanumeric or an apostrophe.
            else if ((isAlnum || c == "'") && _token2id.ContainsKey(c))
            {
                cs.Add(c);
            }
            // Add a punctuation that has a single char.
            else if (!isAlnum && _punct && _token2id.ContainsKey(c))
            {
                cs.Add(c);
            }
            // Anything else is an unknown char and is skipped.
        }

        // Remove trailing spaces
        while (cs.Count > 0 && cs[cs.Count - 1] == space)
        {
            cs.RemoveAt(cs.Count - 1);
        }

        if (_padWithSpace)
        {
            cs.Insert(0, space);
            cs.Add(space);
        }
        return cs.Select(c => _token2id[c]).ToArray();
    }

    /// <summary>Text normalization applied by <see cref="Encode"/> before tokenizing.</summary>
    protected virtual string TextPreprocessingFunc(string text)
    {
        return TokenizerUtils.AnyLocaleTextPreprocessing(text);
    }
}
149 | }
150 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/TTSTokenizers/BaseTokenizer.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | // A number of implementation details in this file have been translated from the Python scripts of NVIDIA NeMo,
5 | // largely located in the files found in this folder:
6 | //
7 | // https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/torch/tts_tokenizers.py
8 | //
9 | // The origin has the following copyright notice and license:
10 | //
11 | // https://github.com/NVIDIA/NeMo/blob/main/LICENSE
12 | //
13 |
14 | using System;
15 | using System.Collections.Generic;
16 | using System.Linq;
17 | using System.Text;
18 |
19 | namespace NeMoOnnxSharp.TTSTokenizers
20 | {
/// <summary>
/// Base class of the text tokenizers: holds the token table and the ids of the
/// utility tokens, and implements decoding.
/// </summary>
public abstract class BaseTokenizer
{
    /// <summary>Where, if at all, a blank token is added to the token table.</summary>
    public enum AddBlankAt
    {
        None,
        True,
        Last
    }

    // The three utility tokens must be distinct strings because they are keys of the
    // same token table; the values are those used by NeMo's BaseTokenizer.
    protected const string Pad = "<pad>";
    protected const string Blank = "<blank>";
    protected const string OOV = "<oov>";

    protected BaseTokenizer()
    {
        _sep = string.Empty;
        _id2token = Array.Empty<string>();
        _token2id = new Dictionary<string, int>();
        _utilIds = new HashSet<int>();
    }

    /// <summary>
    /// Turns str text into int tokens.
    /// </summary>
    public abstract int[] Encode(string text);

    /// <summary>
    /// Turns ints tokens into str text. Utility tokens are skipped and the remaining
    /// tokens are joined with the separator.
    /// </summary>
    public string Decode(int[] tokens)
    {
        return string.Join(
            _sep,
            tokens
                .Where(t => !_utilIds.Contains(t))
                .Select(t => _id2token[t]));
    }

    public string[] Tokens { get { return _id2token; } }
    public int PadId { get { return _pad; } }
    public int BlankId { get { return _blank; } }
    public int OOVId { get { return _oov; } }
    public string Sep { get { return _sep; } }

    protected string[] _id2token;
    protected IDictionary<string, int> _token2id;
    protected ISet<int> _utilIds;
    protected int _space;
    protected int _pad;
    protected int _blank;
    protected int _oov;
    protected string _sep;
    protected bool _padWithSpace;
}
75 | }
76 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/TTSTokenizers/EnglishG2p.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | // A number of implementation details in this file have been translated from the Python scripts of NVIDIA NeMo,
5 | // largely located in the files found in this folder:
6 | //
7 | // https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/g2p/models/en_us_arpabet.py
8 | //
9 | // The origin has the following copyright notice and license:
10 | //
11 | // https://github.com/NVIDIA/NeMo/blob/main/LICENSE
12 | //
13 |
14 | using System;
15 | using System.Collections.Generic;
16 | using System.IO;
17 | using System.Linq;
18 | using System.Text;
19 | using System.Text.RegularExpressions;
20 |
21 | namespace NeMoOnnxSharp.TTSTokenizers
22 | {
23 | // nemo.collections.tts.torch.g2ps.EnglishG2p
24 |
/// <summary>
/// English G2P module. This module converts words from grapheme to phoneme representation using phoneme_dict in CMU dict format.
/// Words that are heteronyms, words that are ambiguous in the dictionary (when
/// <c>ignoreAmbiguousWords</c> is set) and tokens marked as unchangeable by the
/// word tokenizer are not phonemized.
/// </summary>
public class EnglishG2p
{
    private readonly IDictionary<string, string[]> _phonemeDict;
    private readonly HashSet<string> _heteronyms;
    private readonly double _phonemeProbability;
    private readonly Random _random;
    private readonly Regex _alnumRx;
    private readonly bool _ignoreAmbiguousWords;

    /// <summary>
    /// Loads the pronunciation dictionary and the heteronym list.
    /// </summary>
    /// <param name="phonemeDict">Path to file in CMUdict format.</param>
    /// <param name="heteronyms">Path to file with heteronyms (every line is new word).</param>
    /// <param name="ignoreAmbiguousWords">
    /// When true, words with more than one dictionary pronunciation are not phonemized.
    /// </param>
    /// <param name="encoding">Encoding of both files. Defaults to ISO-8859-1.</param>
    /// <param name="phonemeProbability">
    /// Probability that a word goes through the phoneme lookup. With a value
    /// below 1.0 a word is returned as plain characters whenever a random draw
    /// in [0, 1) exceeds this value; with 1.0 or more the lookup always runs.
    /// </param>
    public EnglishG2p(
        string phonemeDict,
        string heteronyms,
        bool ignoreAmbiguousWords = true,
        Encoding? encoding = null,
        double phonemeProbability = 0.5)
    {
        encoding = encoding ?? Encoding.GetEncoding("iso-8859-1");
        _phonemeDict = _ParseAsCmuDict(phonemeDict, encoding);
        _heteronyms = new HashSet<string>(_ParseFileByLines(heteronyms, encoding));
        _phonemeProbability = phonemeProbability;
        _random = new Random();
        _alnumRx = new Regex(@"[a-zA-ZÀ-ÿ\d]");
        _ignoreAmbiguousWords = ignoreAmbiguousWords;
    }

    /// <summary>
    /// Converts text into a sequence of phonemes and/or single characters.
    /// </summary>
    /// <param name="text">Preprocessed input text.</param>
    /// <returns>
    /// Phoneme codes for the words that could be phonemized, single-character
    /// strings for everything else, and the tokens of unchangeable spans as they are.
    /// </returns>
    public string[] Parse(string text)
    {
        var words = TokenizerUtils.EnglishWordTokenize(text);
        var prons = new List<string>();
        foreach (var (word, withoutChanges) in words)
        {
            if (withoutChanges)
            {
                prons.AddRange(word);
                continue;
            }

            var wordStr = word[0];
            var (pron, isHandled) = ParseOneWord(wordStr);

            // An unhandled hyphenated word is retried part by part, with the
            // hyphens kept between the parts.
            var wordByHyphen = wordStr.Split('-');
            if (!isHandled && wordByHyphen.Length > 1)
            {
                pron = new List<string>();
                foreach (var subWord in wordByHyphen)
                {
                    var (p, _) = ParseOneWord(subWord);
                    pron.AddRange(p);
                    pron.Add("-");
                }
                pron.RemoveAt(pron.Count - 1); // drop the hyphen after the last part
            }
            prons.AddRange(pron);
        }
        return prons.ToArray();
    }

    /// <summary>
    /// Phonemizes a single word.
    /// </summary>
    /// <returns>
    /// The pronunciation (or the word's characters) and whether the word is
    /// considered handled; only a word that is neither skipped nor found in the
    /// dictionary is reported as unhandled.
    /// </returns>
    private (List<string> pron, bool isHandled) ParseOneWord(string word)
    {
        if (_phonemeProbability < 1.0 && _random.NextDouble() > _phonemeProbability)
        {
            return (StringToStringList(word), true);
        }

        // punctuation or whitespace.
        if (!_alnumRx.IsMatch(word))
        {
            return (StringToStringList(word), true);
        }

        // heteronyms
        if (_heteronyms.Contains(word))
        {
            return (StringToStringList(word), true);
        }

        // `'s` suffix, then `s` suffix
        var suffixed = ParseWithSuffix(word, "'s") ?? ParseWithSuffix(word, "s");
        if (suffixed != null)
        {
            return (suffixed, true);
        }

        // phoneme dict
        var direct = LookupPronunciation(word);
        if (direct != null)
        {
            return (direct, true);
        }

        return (StringToStringList(word), false);
    }

    /// <summary>
    /// Handles a word that is not in the dictionary itself but whose stem
    /// (the word without <paramref name="suffix"/>) is: returns the stem's
    /// pronunciation followed by "Z", or null when the rule does not apply.
    /// </summary>
    private List<string>? ParseWithSuffix(string word, string suffix)
    {
        if (word.Length <= suffix.Length
            || !word.EndsWith(suffix, StringComparison.Ordinal)
            || _phonemeDict.ContainsKey(word))
        {
            return null;
        }

        var pron = LookupPronunciation(word.Substring(0, word.Length - suffix.Length));
        pron?.Add("Z");
        return pron;
    }

    /// <summary>
    /// Returns the first dictionary pronunciation of the word as a phoneme list,
    /// or null when the word is unknown or is ambiguous while ambiguous words
    /// are ignored.
    /// </summary>
    private List<string>? LookupPronunciation(string word)
    {
        if (!_phonemeDict.TryGetValue(word, out var prons))
        {
            return null;
        }
        if (_ignoreAmbiguousWords && prons.Length != 1)
        {
            return null;
        }
        return prons[0]
            .Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)
            .ToList();
    }

    /// <summary>Splits a word into one-character strings.</summary>
    private static List<string> StringToStringList(string word)
    {
        return word.Select(x => x.ToString()).ToList();
    }

    /// <summary>
    /// Reads a CMUdict-format file into a map from lower-cased word to its
    /// pronunciations (each a space-separated phoneme string).
    /// Only lines starting with 'A'-'Z' or an apostrophe are entries; a
    /// "(N)" alternative marker in the word is removed so alternatives are
    /// collected under the same key. Lines without a pronunciation are skipped.
    /// </summary>
    private static IDictionary<string, string[]> _ParseAsCmuDict(string phonemeDictPath, Encoding encoding)
    {
        var altRx = new Regex(@"\([0-9]+\)");
        var entries = new Dictionary<string, List<string>>();
        // Open read-only: FileStream(path, FileMode.Open) would request
        // read/write access and fail on read-only files.
        using (var stream = new FileStream(phonemeDictPath, FileMode.Open, FileAccess.Read, FileShare.Read))
        using (var reader = new StreamReader(stream, encoding))
        {
            string? line;
            while ((line = reader.ReadLine()) != null)
            {
                if (line.Length == 0)
                {
                    continue;
                }
                var first = line[0];
                if (!(('A' <= first && first <= 'Z') || first == '\''))
                {
                    continue;
                }

                // The word and its pronunciation are separated by one or more
                // blanks (CMUdict uses two spaces).
                var parts = line.Split(new[] { ' ', '\t' }, 2, StringSplitOptions.RemoveEmptyEntries);
                if (parts.Length < 2)
                {
                    continue;
                }

                var word = altRx.Replace(parts[0], "").ToLowerInvariant();
                var pronunciation = parts[1].Trim();

                if (!entries.TryGetValue(word, out var list))
                {
                    list = new List<string>();
                    entries[word] = list;
                }
                list.Add(pronunciation);
            }
        }

        var g2pDict = new Dictionary<string, string[]>(entries.Count);
        foreach (var entry in entries)
        {
            g2pDict[entry.Key] = entry.Value.ToArray();
        }
        return g2pDict;
    }

    /// <summary>
    /// Reads a text file and returns its lines with trailing whitespace removed.
    /// </summary>
    private static string[] _ParseFileByLines(string p, Encoding encoding)
    {
        var res = new List<string>();
        using (var stream = new FileStream(p, FileMode.Open, FileAccess.Read, FileShare.Read))
        using (var reader = new StreamReader(stream, encoding))
        {
            string? line;
            while ((line = reader.ReadLine()) != null)
            {
                res.Add(line.TrimEnd());
            }
        }
        return res.ToArray();
    }
}
213 | }
214 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/TTSTokenizers/EnglishPhonemesTokenizer.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | // A number of implementation details in this file have been translated from the Python scripts of NVIDIA NeMo,
5 | // largely located in the files found in this folder:
6 | //
7 | // https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/torch/tts_tokenizers.py
8 | //
9 | // The origin has the following copyright notice and license:
10 | //
11 | // https://github.com/NVIDIA/NeMo/blob/main/LICENSE
12 | //
13 |
14 | using System;
15 | using System.Collections.Generic;
16 | using System.Linq;
17 | using System.Text;
18 | using static System.Net.Mime.MediaTypeNames;
19 |
20 | namespace NeMoOnnxSharp.TTSTokenizers
21 | {
22 | // nemo.collections.tts.torch.tts_tokenizers.EnglishPhonemesTokenizer
/// <summary>
/// English phoneme-based tokenizer: text is run through a G2P module and the
/// resulting phonemes, characters and punctuation are mapped to token ids.
/// </summary>
public class EnglishPhonemesTokenizer : BaseTokenizer
{
    /// <summary>
    /// English phoneme-based tokenizer.
    /// </summary>
    /// <param name="g2p">Grapheme to phoneme module.</param>
    /// <param name="punct">Whether to reserve grapheme for basic punctuation or not.</param>
    /// <param name="nonDefaultPunctList">List of punctuation marks which will be used instead default.</param>
    /// <param name="stresses">Whether to use phonemes codes with stresses (0-2) or not.</param>
    /// <param name="chars">Whether to additionally use chars together with phonemes. It is useful if g2p module can return chars too.</param>
    /// <param name="space">Space token as string.</param>
    /// <param name="silence">Silence token as string (will be disabled if it is null). Not implemented: any non-null value throws.</param>
    /// <param name="apostrophe">Whether to use apostrophe or not.</param>
    /// <param name="oov">OOV token as string.</param>
    /// <param name="sep">Separation token as string.</param>
    /// <param name="addBlankAt">Whether a blank token is added after the pad token.
    /// <see cref="BaseTokenizer.AddBlankAt.Last"/> is not implemented and throws.</param>
    /// <param name="padWithSpace">Whether to pad text with spaces at the beginning and at the end or not.</param>
    /// <exception cref="NotImplementedException">
    /// <paramref name="silence"/> is not null, or <paramref name="addBlankAt"/> is Last.
    /// </exception>
    public EnglishPhonemesTokenizer(
        EnglishG2p g2p,
        bool punct = true,
        string[]? nonDefaultPunctList = null,
        bool stresses = false,
        bool chars = false,
        string space = " ",
        string? silence = null,
        bool apostrophe = true,
        string oov = BaseTokenizer.OOV,
        string sep = "|", // To be able to distinguish between 2/3 letters codes.
        AddBlankAt addBlankAt = AddBlankAt.None,
        bool padWithSpace = false)
    {
        _phonemeProbability = null;
        _g2p = g2p;

        // The space token always has id 0.
        _space = 0;
        var tokens = new List<string> { space };

        if (silence != null)
        {
            throw new NotImplementedException();
        }

        tokens.AddRange(Consonants);

        var vowels = Vowels;
        if (stresses)
        {
            // Every vowel gets one token per stress level 0..2, e.g. AA0, AA1, AA2.
            vowels = vowels.SelectMany(p => Enumerable.Range(0, 3), (p, s) => $"{p}{s}").ToArray();
        }
        tokens.AddRange(vowels);

        if (chars || _phonemeProbability != null)
        {
            tokens.AddRange(AsciiLowercase.Select(ch => ch.ToString()));
        }

        if (apostrophe)
        {
            tokens.Add("'"); // Apostrophe
        }

        if (punct)
        {
            tokens.AddRange(nonDefaultPunctList ?? PunctList);
        }

        // Each utility id is the index at which its token is appended, so the
        // id must be taken BEFORE the token is added.
        _pad = tokens.Count;
        tokens.Add(Pad);

        var hasBlank = addBlankAt != AddBlankAt.None;
        if (hasBlank)
        {
            _blank = tokens.Count;
            tokens.Add(Blank);
        }

        _oov = tokens.Count;
        tokens.Add(oov); // Out Of Vocabulary

        if (addBlankAt == AddBlankAt.Last)
        {
            throw new NotImplementedException();
        }

        _sep = sep;
        _padWithSpace = padWithSpace;

        _id2token = tokens.ToArray();
        var token2id = new Dictionary<string, int>(_id2token.Length);
        for (var i = 0; i < _id2token.Length; i++)
        {
            token2id.Add(_id2token[i], i); // throws on a duplicated token
        }
        _token2id = token2id;

        // Without a blank token _blank keeps its default value 0, which is the
        // id of the space token; it must not be treated as a utility id or
        // Decode would drop every space.
        _utilIds = new HashSet<int> { _pad, _oov };
        if (hasBlank)
        {
            _utilIds.Add(_blank);
        }

        _stresses = stresses;
        _punct = punct;
    }

    /// <summary>
    /// Preprocesses the text, runs it through the G2P module and encodes the result.
    /// </summary>
    /// <param name="text">Text to encode.</param>
    /// <returns>Token ids.</returns>
    public override int[] Encode(string text)
    {
        // Lower-casing must not happen here because the text may contain
        // upper-case phoneme codes inside |...| spans; the G2P word tokenizer
        // lower-cases ordinary words itself.
        text = TokenizerUtils.EnglishTextPreprocessing(text, lower: false);
        var g2pText = _g2p.Parse(text);
        return EncodeFromG2p(g2pText);
    }

    /// <summary>
    /// Encodes text that has already been run through G2P.
    /// Called for encoding to tokens after text preprocessing and G2P.
    /// </summary>
    /// <param name="g2pText">G2P's output, could be a mixture of phonemes and graphemes,
    /// e.g. "see OOV" -> ['S', 'IY1', ' ', 'O', 'O', 'V']</param>
    /// <returns>
    /// Token ids. Items that are not in the token table are skipped,
    /// consecutive spaces are collapsed, leading and trailing spaces are
    /// removed, and a space id is added at both ends when padding with space
    /// is enabled.
    /// </returns>
    public int[] EncodeFromG2p(string[] g2pText)
    {
        var ps = new List<string>();
        var space = _id2token[_space];
        foreach (var item in g2pText)
        {
            if (string.IsNullOrEmpty(item))
            {
                continue;
            }

            var p = item;
            // Remove stress
            if (p.Length == 3 && !_stresses && p.All(char.IsLetterOrDigit))
            {
                p = p.Substring(0, 2);
            }

            if (p == space)
            {
                // Add space if last one isn't one; never start with a space.
                if (ps.Count > 0 && ps[ps.Count - 1] != space)
                {
                    ps.Add(p);
                }
                continue;
            }

            // Phonemes, chars and the apostrophe are always accepted when they
            // are in the token table; anything else (punctuation) only when
            // punctuation is enabled. Unknown items are skipped.
            var isWordToken = char.IsLetterOrDigit(p, 0) || p == "'";
            if ((isWordToken || _punct) && _token2id.ContainsKey(p))
            {
                ps.Add(p);
            }
        }

        // Remove trailing spaces
        while (ps.Count > 0 && ps[ps.Count - 1] == space)
        {
            ps.RemoveAt(ps.Count - 1);
        }

        var res = new List<int>(ps.Count + 2);
        if (_padWithSpace)
        {
            res.Add(_space);
        }
        res.AddRange(ps.Select(p => _token2id[p]));
        if (_padWithSpace)
        {
            res.Add(_space);
        }
        return res.ToArray();
    }

    private readonly string[] PunctList =
    {   // Derived from LJSpeech and "/" additionally
        ",", ".", "!", "?", "-",
        ":", ";", "/", "\"", "(",
        ")", "[", "]", "{", "}",
    };
    private readonly string[] Vowels = {
        "AA", "AE", "AH", "AO", "AW",
        "AY", "EH", "ER", "EY", "IH",
        "IY", "OW", "OY", "UH", "UW",
    };
    private readonly string[] Consonants = {
        "B", "CH", "D", "DH", "F", "G",
        "HH", "JH", "K", "L", "M", "N",
        "NG", "P", "R", "S", "SH", "T",
        "TH", "V", "W", "Y", "Z", "ZH",
    };

    private const string AsciiLowercase = "abcdefghijklmnopqrstuvwxyz";

    private readonly EnglishG2p _g2p;
    private readonly object? _phonemeProbability;
    private readonly bool _stresses;
    private readonly bool _punct;
}
229 | }
230 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/TTSTokenizers/GermanCharsTokenizer.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | // A number of implementation details in this file have been translated from the Python scripts of NVIDIA NeMo,
5 | // largely located in the files found in this folder:
6 | //
7 | // https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/torch/tts_tokenizers.py
8 | //
9 | // The origin has the following copyright notice and license:
10 | //
11 | // https://github.com/NVIDIA/NeMo/blob/main/LICENSE
12 | //
13 |
14 | using System;
15 | using System.Collections.Generic;
16 | using System.Linq;
17 | using System.Text;
18 | using static NeMoOnnxSharp.TTSTokenizers.BaseTokenizer;
19 |
20 | namespace NeMoOnnxSharp.TTSTokenizers
21 | {
22 | // nemo.collections.tts.torch.tts_tokenizers.GermanCharsTokenizer
/// <summary>
/// Character tokenizer configured with the German alphabet (including
/// umlauts and sharp s) and an extended punctuation list.
/// </summary>
public class GermanCharsTokenizer : BaseCharsTokenizer
{
    // Upper-case letters followed by lower-case letters, in table order.
    private const string UpperCaseLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜẞ";
    private const string LowerCaseLetters = "abcdefghijklmnopqrstuvwxyzäöüß";

    // Punctuation marks, one token per character.
    private const string PunctuationMarks = "!\"(),-./:;?[]{}«»‒–—‘‚“„‹›";

    /// <param name="padWithSpace">Forwarded to the base tokenizer's parameter of the same name.</param>
    public GermanCharsTokenizer(bool padWithSpace = false)
        : base(
            chars: UpperCaseLetters + LowerCaseLetters,
            punct: true,
            addBlankAt: AddBlankAt.None,
            apostrophe: true,
            padWithSpace: padWithSpace,
            nonDefaultPunctList: PunctuationMarks.Select(mark => mark.ToString()).ToArray())
    {
    }
}
55 | }
56 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/TTSTokenizers/TokenizerUtils.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | // A number of implementation details in this file have been translated from the Python scripts of NVIDIA NeMo,
5 | // largely located in the files found in this folder:
6 | //
7 | // https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py
8 | //
9 | // The origin has the following copyright notice and license:
10 | //
11 | // https://github.com/NVIDIA/NeMo/blob/main/LICENSE
12 | //
13 |
14 | using System;
15 | using System.Collections.Generic;
16 | using System.Globalization;
17 | using System.IO;
18 | using System.Linq;
19 | using System.Text;
20 | using System.Text.RegularExpressions;
21 |
22 | namespace NeMoOnnxSharp.TTSTokenizers
23 | {
/// <summary>
/// Text normalization and word tokenization helpers shared by the TTS tokenizers.
/// </summary>
public static class TokenizerUtils
{
    private static readonly Dictionary<char, char> _synoGlyph2Ascii;
    private static readonly Regex _wordsReEn;

    static TokenizerUtils()
    {
        // ASCII character -> typographic look-alikes that are replaced by it.
        (char Ascii, char[] Glyphs)[] synoglyphs =
        {
            ('\'', new[] { '’' }),
            ('"', new[] { '”', '“' }),
        };

        _synoGlyph2Ascii = new Dictionary<char, char>();
        foreach (var (asc, glyphs) in synoglyphs)
        {
            foreach (var g in glyphs)
            {
                _synoGlyph2Ascii[g] = asc;
            }
        }

        // define char set based on https://en.wikipedia.org/wiki/List_of_Unicode_characters
        // Group 1: a word (letters, inner hyphens/apostrophes).
        // Group 2: an unchangeable span enclosed in '|'.
        // Group 3: anything else (punctuation, whitespace, digits).
        var latinAlphabetBasic = "A-Za-z";
        _wordsReEn = new Regex(@$"([{latinAlphabetBasic}]+(?:[{latinAlphabetBasic}\-']*[{latinAlphabetBasic}]+)*)|(\|[^|]*\|)|([^{latinAlphabetBasic}|]+)");
    }

    /// <summary>
    /// Normalize unicode text with "NFC", and convert right single quotation mark (U+2019, decimal 8217) as an apostrophe.
    /// </summary>
    /// <param name="text">the original input sentence.</param>
    /// <returns>normalized text.</returns>
    public static string AnyLocaleTextPreprocessing(string text)
    {
        var res = new List<char>();
        foreach (var c in NormalizeUnicodeText(text))
        {
            if (c == '’') // right single quotation mark (U+2019, decimal 8217) as an apostrophe
            {
                res.Add('\'');
            }
            else
            {
                res.Add(c);
            }
        }
        return new string(res.ToArray());
    }

    /// <summary>
    /// Normalizes text to Unicode NFC form so that a letter with a diacritic
    /// such as 'ö' is a single character rather than a base letter followed by
    /// a combining mark; a tokenizer that reads the text char by char would
    /// otherwise skip the combining mark and not distinguish 'ö' from 'o'.
    /// </summary>
    /// <param name="text">the original input sentence.</param>
    /// <returns>NFC normalized sentence.</returns>
    private static string NormalizeUnicodeText(string text)
    {
        // normalize word with NFC form
        return text.Normalize(NormalizationForm.FormC);
    }

    /// <summary>
    /// Strips diacritics (NFD decomposition with non-spacing marks removed),
    /// replaces typographic quotes by their ASCII counterparts and optionally
    /// lower-cases the text.
    /// </summary>
    /// <param name="text">the original input sentence.</param>
    /// <param name="lower">whether to lower-case the result.</param>
    /// <returns>the preprocessed text.</returns>
    public static string EnglishTextPreprocessing(string text, bool lower = true)
    {
        text = new string(
            text.Normalize(NormalizationForm.FormD)
                .Where(ch => CharUnicodeInfo.GetUnicodeCategory(ch) != UnicodeCategory.NonSpacingMark)
                .Select(ch => _synoGlyph2Ascii.TryGetValue(ch, out var asc) ? asc : ch)
                .ToArray());

        if (lower)
        {
            // Invariant casing: the result must not depend on the current culture.
            text = text.ToLowerInvariant();
        }
        return text;
    }

    /// <summary>
    /// Converts regex matches into tokens with an indicator showing whether
    /// each token is unchangeable. A match is either a valid word, a substring
    /// from | to | (unchangeable), or punctuation marks including whitespaces.
    /// Unchangeable spans are stripped of the enclosing | and split by spaces.
    /// For example "Hello World |NVIDIA unchanged|!" becomes
    /// <code>
    /// (["Hello"], false), ([" "], false), (["World"], false), ([" "], false),
    /// (["NVIDIA", "unchanged"], true), (["!"], false)
    /// </code>
    /// </summary>
    /// <param name="words">matches of a pattern with the three capture groups
    /// (word, unchangeable span, punctuation), in that order.</param>
    /// <param name="isLower">a flag to trigger lowercase all words. By default, it is false.</param>
    /// <returns>a list of tuples like `(a list of words, is_unchanged)`.</returns>
    /// <exception cref="InvalidDataException">a match has all three groups empty.</exception>
    private static (string[], bool)[] _wordTokenize(MatchCollection words, bool isLower = false)
    {
        var result = new List<(string[], bool)>();
        foreach (Match word in words)
        {
            // Groups[0] is the whole match; the capture groups start at 1.
            var maybeWord = word.Groups[1].Value;
            var maybeWithoutChanges = word.Groups[2].Value;
            var maybePunct = word.Groups[3].Value;

            var withoutChanges = false;
            string[] token;
            if (!string.IsNullOrEmpty(maybeWord))
            {
                token = new[] { isLower ? maybeWord.ToLowerInvariant() : maybeWord };
            }
            else if (!string.IsNullOrEmpty(maybePunct))
            {
                token = new[] { maybePunct };
            }
            else if (!string.IsNullOrEmpty(maybeWithoutChanges))
            {
                withoutChanges = true;
                token = maybeWithoutChanges.Substring(1, maybeWithoutChanges.Length - 2).Split(' ');
            }
            else
            {
                throw new InvalidDataException(
                    $"This is not expected. Found empty string: <{word}>. " +
                    $"Please validate your regular expression pattern '_WORDS_RE_EN' or '_WORDS_RE_ANY_LOCALE'."
                );
            }

            result.Add((token, withoutChanges));
        }
        return result.ToArray();
    }

    /// <summary>
    /// Splits English text into lower-cased words, punctuation/whitespace runs
    /// and unchangeable |...| spans.
    /// </summary>
    /// <param name="text">text to tokenize.</param>
    /// <returns>a list of tuples like `(a list of words, is_unchanged)`.</returns>
    public static (string[], bool)[] EnglishWordTokenize(string text)
    {
        var words = _wordsReEn.Matches(text);
        return _wordTokenize(words, isLower: true);
    }
}
189 | }
190 |
--------------------------------------------------------------------------------
/NeMoOnnxSharp/WaveFile.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Katsuya Iida. All Rights Reserved.
2 | // See LICENSE in the project root for license information.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.IO;
7 | using System.Runtime.InteropServices;
8 | using System.Text;
9 |
10 | namespace NeMoOnnxSharp
11 | {
/// <summary>
/// A static class to read and write WAV files.
/// Only 16-bit PCM data is supported.
/// </summary>
public static class WaveFile
{
    /// <summary>
    /// Load a WAV file as a short array. The result is resampled
    /// with the target sampling rate and Multi-channel audio
    /// is converted to mono.
    /// </summary>
    /// <param name="path">File to read.</param>
    /// <param name="rate">the target sampling rate</param>
    /// <returns>Waveform data.</returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="rate"/> is not positive.</exception>
    /// <exception cref="InvalidDataException">The file is not a supported WAV file.</exception>
    public static short[] ReadWAV(string path, int rate)
    {
        if (rate <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(rate));
        }
        using (var stream = File.OpenRead(path))
        using (var reader = new BinaryReader(stream, Encoding.ASCII))
        {
            var waveform = ReadWAV(reader, out int originalRate, out short originalNumChannels);
            return PostProcess(waveform, originalRate, originalNumChannels, rate);
        }
    }

    /// <summary>
    /// Save a short array as a mono 16-bit PCM WAV file. An existing file is replaced.
    /// </summary>
    /// <param name="path">File to write.</param>
    /// <param name="waveform">Waveform data.</param>
    /// <param name="rate">Sampling rate written to the header.</param>
    public static void WriteWAV(string path, short[] waveform, int rate)
    {
        short numChannels = 1;
        // File.Create truncates an existing file. File.OpenWrite would keep the
        // old length and leave stale bytes after the new data.
        using (var stream = File.Create(path))
        {
            WriteWAV(stream, waveform, rate, numChannels);
        }
    }

    /// <summary>
    /// Encode a short array into a byte array in WAV format.
    /// </summary>
    /// <param name="waveform">Waveform data.</param>
    /// <param name="rate">Sampling rate written to the header.</param>
    /// <returns>A byte array in WAV format</returns>
    public static byte[] GetWAVBytes(short[] waveform, int rate)
    {
        short numChannels = 1;
        using (var stream = new MemoryStream())
        {
            WriteWAV(stream, waveform, rate, numChannels);
            // MemoryStream.ToArray is still valid after the writer closed the stream.
            return stream.ToArray();
        }
    }

    /// <summary>
    /// Parses a RIFF/WAVE stream and returns the raw interleaved samples of
    /// its first "data" chunk. Chunks other than "fmt " and "data" are skipped.
    /// </summary>
    /// <param name="reader">Reader positioned at the start of the RIFF header.</param>
    /// <param name="rate">Receives the sampling rate from the "fmt " chunk.</param>
    /// <param name="numChannels">Receives the channel count from the "fmt " chunk.</param>
    /// <exception cref="InvalidDataException">
    /// The header is malformed, the format is not 16-bit PCM, or the "data"
    /// chunk comes before any "fmt " chunk.
    /// </exception>
    /// <exception cref="EndOfStreamException">The stream ends before a "data" chunk is found.</exception>
    private static short[] ReadWAV(BinaryReader reader, out int rate, out short numChannels)
    {
        rate = 0;
        numChannels = 0;
        if (new string(reader.ReadChars(4)) != "RIFF")
            throw new InvalidDataException();
        reader.ReadInt32(); // overall RIFF length, not needed
        if (new string(reader.ReadChars(4)) != "WAVE")
            throw new InvalidDataException();

        bool hasFormat = false;
        while (true)
        {
            string fourCC = new string(reader.ReadChars(4));
            int chunkLen = reader.ReadInt32();
            if (chunkLen < 0) throw new InvalidDataException();

            if (fourCC == "fmt ")
            {
                if (chunkLen < 16) throw new InvalidDataException();
                short formatTag = reader.ReadInt16();
                if (formatTag != 1) throw new InvalidDataException("Only PCM format is supported");
                numChannels = reader.ReadInt16();
                rate = reader.ReadInt32();
                int avgBytesPerSec = reader.ReadInt32();
                short blockAlign = reader.ReadInt16();
                short bitsPerSample = reader.ReadInt16();
                if (numChannels <= 0 || rate <= 0)
                {
                    throw new InvalidDataException();
                }
                if (avgBytesPerSec * 8 != rate * bitsPerSample * numChannels || blockAlign * 8 != bitsPerSample * numChannels)
                {
                    throw new InvalidDataException();
                }
                // The sample data is reinterpreted as 16-bit integers below.
                if (bitsPerSample != 16)
                {
                    throw new InvalidDataException("Only 16-bit PCM is supported");
                }
                if (chunkLen > 16)
                {
                    reader.ReadBytes(chunkLen - 16);
                }
                hasFormat = true;
            }
            else if (fourCC == "data")
            {
                if (!hasFormat)
                {
                    throw new InvalidDataException();
                }
                byte[] byteData = reader.ReadBytes(chunkLen);
                return MemoryMarshal.Cast<byte, short>(byteData.AsSpan()).ToArray();
            }
            else
            {
                // Unknown chunk (e.g. LIST, JUNK); it may legally precede "fmt ".
                reader.ReadBytes(chunkLen);
            }

            // RIFF chunks are word aligned: an odd-sized chunk is followed by
            // one pad byte that is not counted in its length.
            if ((chunkLen & 1) != 0)
            {
                reader.ReadBytes(1);
            }
        }
    }

    /// <summary>
    /// Writes a complete WAV file to the stream. The stream is closed afterwards.
    /// </summary>
    private static void WriteWAV(Stream stream, short[] waveform, int rate, short numChannels)
    {
        using (var writer = new BinaryWriter(stream, Encoding.ASCII))
        {
            WriteWAV(writer, waveform, rate, numChannels);
        }
    }

    /// <summary>
    /// Writes the RIFF header, the 16-byte PCM "fmt " chunk and the "data" chunk.
    /// </summary>
    private static void WriteWAV(BinaryWriter writer, short[] waveform, int rate, short numChannels)
    {
        short formatTag = 1; // PCM
        short bitsPerSample = 16;
        int avgBytesPerSec = rate * bitsPerSample * numChannels / 8;
        short blockAlign = (short)(numChannels * bitsPerSample / 8);
        int dataLen = waveform.Length * (bitsPerSample / 8);

        writer.Write("RIFF".ToCharArray());
        // 36 = "WAVE" (4) + "fmt " chunk (8 + 16) + "data" chunk header (8)
        writer.Write(36 + dataLen);
        writer.Write("WAVE".ToCharArray());

        writer.Write("fmt ".ToCharArray());
        writer.Write(16);
        writer.Write(formatTag);
        writer.Write(numChannels);
        writer.Write(rate);
        writer.Write(avgBytesPerSec);
        writer.Write(blockAlign);
        writer.Write(bitsPerSample);

        writer.Write("data".ToCharArray());
        writer.Write(dataLen);
        writer.Write(MemoryMarshal.Cast<short, byte>(waveform.AsSpan()).ToArray());
    }

    /// <summary>
    /// Converts the waveform to mono and then to the target sampling rate.
    /// </summary>
    private static short[] PostProcess(short[] waveform, int sourceRate, int sourceNumChannels, int targetRate)
    {
        waveform = ToMono(waveform, sourceNumChannels);
        waveform = Resample(waveform, sourceRate, targetRate);
        return waveform;
    }

    /// <summary>
    /// Changes the sampling rate by picking, for each output sample, the input
    /// sample at the proportional position (no interpolation or filtering).
    /// </summary>
    private static short[] Resample(short[] waveform, int sourceRate, int targetRate)
    {
        if (sourceRate == targetRate) return waveform;
        if (waveform.Length == 0) return Array.Empty<short>();
        long targetLength = (waveform.LongLength - 1) * targetRate / sourceRate + 1;
        short[] result = new short[targetLength];
        for (long i = 0; i < result.LongLength; i++)
        {
            result[i] = waveform[i * sourceRate / targetRate];
        }
        return result;
    }

    /// <summary>
    /// Averages the interleaved channels of each frame into a single sample.
    /// </summary>
    private static short[] ToMono(short[] waveform, int numChannels)
    {
        if (numChannels == 1) return waveform;
        int length = waveform.Length / numChannels;
        short[] result = new short[length];
        for (int i = 0; i < length; i++)
        {
            int value = 0;
            for (int j = 0; j < numChannels; j++)
            {
                value += waveform[i * numChannels + j];
            }
            result[i] = (short)(value / numChannels);
        }
        return result;
    }
}
202 | }
203 |
--------------------------------------------------------------------------------
/Python/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = E501
--------------------------------------------------------------------------------
/Python/convert_librispeech.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import soundfile as sf
4 | import librosa
5 |
# Converts one chapter of the LibriSpeech "test-clean" set (speaker id1,
# chapter id2) from FLAC to 16 kHz 16-bit PCM WAV files in ../test_data and
# writes a "name.wav|lower-cased text" transcript next to them.
#
# Usage: python convert_librispeech.py <LibriSpeech root directory>
if len(sys.argv) < 2:
    sys.exit("usage: convert_librispeech.py <LibriSpeech root directory>")

id1, id2 = 61, 70968
input_dir = os.path.join(sys.argv[1], "test-clean", str(id1), str(id2))
output_dir = os.path.join("..", "test_data")
transcript_file = os.path.join(input_dir, "%d-%d.trans.txt" % (id1, id2))
output_file = os.path.join(output_dir, "transcript.txt")
sample_rate = 16000

os.makedirs(output_dir, exist_ok=True)
with open(transcript_file, 'rt', encoding='utf-8') as f, \
        open(output_file, 'wt', encoding='utf-8') as outf:
    for line in f:
        # Each transcript line is "<utterance id> <TEXT>".
        name, _, text = line.rstrip('\r\n').partition(" ")
        text = text.lower()
        audio_file = os.path.join(input_dir, name + ".flac")
        wav_file = os.path.join(output_dir, name + ".wav")
        x, orig_sample_rate = sf.read(audio_file)
        assert x.ndim == 1
        # The sampling rates must be passed by keyword: recent librosa
        # releases no longer accept them positionally.
        x = librosa.resample(x, orig_sr=orig_sample_rate, target_sr=sample_rate)
        print("Writing %s..." % (wav_file,))
        outf.write("%s.wav|%s\n" % (name, text))
        sf.write(wav_file, x, samplerate=sample_rate, subtype="PCM_16")
27 |
--------------------------------------------------------------------------------
/Python/export_models.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | from omegaconf import OmegaConf
3 |
4 |
def get_class(cls_path):
    """Resolve a dotted path such as ``package.module.Name`` to the object it names.

    Everything before the last dot is imported as a module and the part after
    it is looked up as an attribute of that module.
    """
    module_path, _, attr_name = cls_path.rpartition('.')
    module = importlib.import_module(module_path)
    return getattr(module, attr_name)
10 |
11 |
def export(cls_path: str, model_name: str):
    """Load a pretrained model, export it to ONNX and print its configuration.

    Args:
        cls_path: Dotted path of the model class, resolved with ``get_class``.
        model_name: Name passed to the class's ``from_pretrained``; also used
            as the stem of the output file ``<model_name>.onnx``.
    """
    model_cls = get_class(cls_path)
    pretrained = model_cls.from_pretrained(model_name)
    onnx_path = f'{model_name}.onnx'
    pretrained.export(onnx_path)
    # Dump the model configuration as YAML to standard output.
    config_yaml = OmegaConf.to_yaml(pretrained._cfg)
    print(config_yaml)
17 |
18 |
# Model to export. Exactly one class path and one model name are used per run;
# edit the two assignments below to export a different model.
#
# Class paths that were toggled here before:
#   nemo.collections.asr.models.EncDecClassificationModel
#   nemo.collections.asr.models.EncDecCTCModel
# Model names that were toggled here before:
#   vad_marblenet
#   stt_en_quartznet15x5
#   stt_en_jasper10x5dr
#   commandrecognition_en_matchboxnet3x1x64_v2
# NOTE(review): which class goes with which model name is not recorded in this
# script - confirm against the NeMo model catalogue before switching.
cls_path = 'nemo.collections.asr.models.EncDecClassificationModel'
model_name = 'commandrecognition_en_matchboxnet3x1x64_v2'
export(cls_path, model_name)
27 |
--------------------------------------------------------------------------------
/Python/make_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Katsuya Iida. All Rights Reserved.
2 | # See LICENSE in the project root for license information.
3 |
4 | import librosa
5 | import torch
6 | from nemo.collections.asr.modules import (
7 | AudioToMelSpectrogramPreprocessor,
8 | AudioToMFCCPreprocessor
9 | )
10 |
11 |
def main():
    """Load the reference WAV file and regenerate both feature fixtures.

    The audio is loaded as mono at 16 kHz, turned into a tensor with a leading
    dimension of size 1, and passed together with its sample count to
    ``convert_mfcc`` and ``convert_mel_spectrogram``.
    """
    # The test project directory is spelled "NeMoOnnxSharp.Tests"; the exact
    # case matters on case-sensitive file systems.
    wavpath = "../NeMoOnnxSharp.Tests/Data/61-70968-0000.wav"
    sr = 16000
    audio_signal, sr = librosa.load(wavpath, sr=sr, mono=True)
    assert audio_signal.ndim == 1
    audio_signal = torch.from_numpy(audio_signal)
    # Add a leading dimension of size 1 in front of the sample axis.
    audio_signal = torch.unsqueeze(audio_signal, 0)
    # One-element int64 tensor holding the number of samples.
    length = torch.tensor([audio_signal.shape[1]], dtype=torch.int64)
    convert_mfcc(audio_signal, length)
    convert_mel_spectrogram(audio_signal, length)
22 |
23 |
def convert_mel_spectrogram(audio_signal, length):
    """Compute mel-spectrogram features and write them to ``mel_spectrogram.bin``.

    Args:
        audio_signal: Tensor passed to the preprocessor as ``input_signal``.
        length: Tensor passed to the preprocessor as ``length``.

    The first item of the preprocessor output is transposed and written as raw
    C-order bytes. The input shape, the output tensor and the output shape are
    printed along the way.
    """
    print(audio_signal.shape)
    preprocessor = AudioToMelSpectrogramPreprocessor(
        normalize="per_feature",
        window_size=0.02,
        sample_rate=16000,
        window_stride=0.01,
        window="hann",
        features=64,
        n_fft=512,
        frame_splicing=1,
        dither=0.00001,
        stft_conv=False)
    with torch.no_grad():
        processed_signal, processed_signal_length = preprocessor(input_signal=audio_signal, length=length)
    print(processed_signal, processed_signal_length)
    print(processed_signal.shape, processed_signal_length)
    # The test project directory is spelled "NeMoOnnxSharp.Tests"; the exact
    # case matters on case-sensitive file systems.
    with open("../NeMoOnnxSharp.Tests/Data/mel_spectrogram.bin", 'wb') as fp:
        # presumably (time, feature) order after the transpose - TODO confirm
        fp.write(processed_signal[0].T.numpy().tobytes("C"))
43 |
44 |
def convert_mfcc(audio_signal, length):
    """Compute MFCC features and write them to ``mfcc.bin``.

    Args:
        audio_signal: Tensor passed to the preprocessor as ``input_signal``.
        length: Tensor passed to the preprocessor as ``length``.

    The first item of the preprocessor output is transposed and written as raw
    C-order bytes. The input shape, the output tensor and the output shape are
    printed along the way.
    """
    print(audio_signal.shape)
    preprocessor = AudioToMFCCPreprocessor(
        window_size=0.025,
        window_stride=0.01,
        window="hann",
        n_mels=64,
        n_mfcc=64,
        n_fft=512)
    with torch.no_grad():
        processed_signal, processed_signal_length = preprocessor(input_signal=audio_signal, length=length)
    print(processed_signal, processed_signal_length)
    print(processed_signal.shape, processed_signal_length)
    # The test project directory is spelled "NeMoOnnxSharp.Tests"; the exact
    # case matters on case-sensitive file systems.
    with open("../NeMoOnnxSharp.Tests/Data/mfcc.bin", 'wb') as fp:
        # presumably (time, coefficient) order after the transpose - TODO confirm
        fp.write(processed_signal[0].T.numpy().tobytes("C"))
60 |
61 |
# Regenerate the fixtures only when this file is run as a script, not when it
# is imported.
if __name__ == "__main__":
    main()
64 |
--------------------------------------------------------------------------------
/test_data/.gitignore:
--------------------------------------------------------------------------------
1 | /generated-*.wav
2 | /recognized-*.wav
3 | /result.txt
--------------------------------------------------------------------------------
/test_data/61-70968-0000.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3f53c11bcec66e60659c3e53015f4f914d79b04eba0770347e644a4776fbe633
3 | size 157004
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0001.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:df55c8a1e89386975f650a98f1e513d8d6e0c12cc5cb2d92ad00501e21a1d8e4
3 | size 115564
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0002.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ad96038297ec7d5ce499bbacb9fc0c7e2d462d031c6d7380f4960e31e3ef9bf3
3 | size 95084
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0003.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f9b573b19f504f3eaf341f126f788adbe73b379bfe3f0cfae0a5d51aadbfcb7d
3 | size 138124
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0004.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:94751ed39e020fab4c61c6fbdc750be55c2bea19d4e010c0ccf1fe5f1e19aedb
3 | size 124364
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0005.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5fd859700c65c3a15a3caea1c321914ed1c0318c183554fb60121ece797fe9b7
3 | size 162284
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0006.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:a680ba9d1a32b2b8a0875165c4c1f2ff4fd3d84b7a4ffdc92daa7e345db0233d
3 | size 93964
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0007.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:9e037a99f31885f1bba0a52181609462f832c2e6f46f064bd428c1731bc25a06
3 | size 113644
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0008.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:82a1bc1ef6e77e00b9af883b03a2c661cccdc67305731b679da6a21ca33138a6
3 | size 113164
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0009.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:07686334b7bc345a231eca201f614b4eb301ce1ef73834f4ae61296142ccddb6
3 | size 144364
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0010.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:1e7dadf546cc01ce15394a155ba2f2fadd17c9dfba4a951fd24b366d1b7009d5
3 | size 265484
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0011.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ac0ec71479cc2cd204cf3caead4a7929d15fdd20d7eb08c9b35310c3ac5b0e88
3 | size 204044
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0012.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:252f2dc7532d5d679c1b57402018cd2b6cfd5f5607563e9e8d15fd6d049a8750
3 | size 83564
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0013.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ec88f797bcb446927417a91a63fa9a92aa3cab67a4d4e68b9f0d84070fdfd12a
3 | size 142444
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0014.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:663af9404ce03c9df89ddf341636eb4790a8fa3f78fa2646b7ac26ccd018f611
3 | size 239564
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0015.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3753e342574252e6414abdabcd09e8ccf1721696e8859f815ffdcdea5d799ae5
3 | size 172044
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0016.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:82acd474f1c191d996bf577c00685b49d4a7342a8000c7841afc245de3a7f44c
3 | size 119084
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0017.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:d7e768f637aee1e4695f952002ee1c86209ef7c3fdba5f2860910e7d826750de
3 | size 163564
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0018.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3548563c071f784640611b9d5661351a90e26ef020d6a7cd22496afbc1594628
3 | size 77004
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0019.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:de5b2ae3f62ab70003299d154cf8ca174abecece9861654dc63154b1cbf15ec8
3 | size 175244
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0020.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:2d3ff04464a3db8d8c7aee34a6cde6610b716b6632d495179db3b65388375ea3
3 | size 163404
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0021.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:2e463d40cd4e60e5b23fd25f13c84d45bc453e246d83b1a63bca2d620db73fa0
3 | size 86124
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0022.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:66607ac384005d3429abf3548964630223d86262951a66a3d717538f4497fca6
3 | size 149484
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0023.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:52e72e87ce68c09af43f94cd1099021fae0781872e93e5da34f92dc5f88fd853
3 | size 160844
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0024.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:30d3317ad7492d7eefa5380caec36f259f912297891b3a0f83096c93af4f9849
3 | size 192844
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0025.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:eb6123b80871c0c31f43972ec9f77b7bce667cfc9a26774e6376a31f0ed7e660
3 | size 141164
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0026.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:012921ed3a7f5c1369188ca346d468ddf99abbd3c398810c3932e523f378429d
3 | size 157484
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0027.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:40c3b9f25002e201f671e14fe9fd8c12f218b08389d345b58abf26fc4b4fdf35
3 | size 219884
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0028.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ee119982e4c44c558f790c44b032f74f4f071c0ad67656647523504ba6ef3d6b
3 | size 171404
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0029.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:4b84dceeddf76e96d8e20934c018cf7d1cb5805db138442cc9cf9681e1e7bddb
3 | size 111884
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0030.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:2767e40fab3748caf7f07dc47297ecb6b5bf19b53f12bb29fe5626b791a795e0
3 | size 181964
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0031.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:9d448f100c78ed193bb571b7140a405f960667705cc9a69a2779d697b48fc2e7
3 | size 93644
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0032.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:bd3527051e39a0b2ae431630aa1cec818b7903cbe94e0fe0bd644c4f9e80b453
3 | size 137004
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0033.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6048875e595a68c17f4d79b2c36bfff5e60066aaad246b19a18d6d6101acfad3
3 | size 181964
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0034.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6054ef7bb5140173bc76ae5a7cf5d1fed714117fc2bb4320f370df247d1228d6
3 | size 135404
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0035.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e08be96d41087ff39c63d47d24fe0230a710e36b7acce0cefdb5c71ec0451f39
3 | size 254444
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0036.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:dede8e54d9f432d920dc7f63f108862265e77371f2a742cd41c82a783f09dd6c
3 | size 93962
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0037.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3df69bcdd3a6690d746700dd8e2af7c6db4c7eb426064ec289051bc1029ac605
3 | size 138124
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0038.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:a2a3869fcae4ae7aa5f52f984cca42692aaedc70b479cc62c8742167d3eceece
3 | size 65644
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0039.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:81825272e79ee9e1d42eba1c7a51d8cd6cb8ecdd227d366ba211ca18cf1128da
3 | size 121804
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0040.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:433d9cadaa4a025491d857fa8a36729a7c5177d1fe44ede3967c7c7ea6b2196d
3 | size 126444
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0041.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ea941cbef651c558f0dcee117854f6f290f8f4ff3aeff6bc6ed669aa380b8b03
3 | size 218444
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0042.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:0e4c658f1e067baf1083c919353436ff77d7ba0233def4f563de7a517b92fc65
3 | size 89164
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0043.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ac7dfa89b0ef04f305ebdd310bc2fed86e2bf314f69a3a6bb7e688f0ed91351a
3 | size 215564
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0044.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b0aa32154d539b8b39c6d44ece0b0ef1fd877bc0011826eeb60cff7ebd04ab33
3 | size 88684
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0045.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:d95be7473028936629cfbc24ef1f9e550585d266fc15e6aed818a4f4d52a3c6f
3 | size 111244
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0046.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:2f8d9401c832ca20fad55ef6f8934596c3746576f4077d0f24a9dd0d8921ba44
3 | size 113644
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0047.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:54fca568c27b79c7755e4267eb39a553f7ccf16389f14a1e0b9d03be6d24e803
3 | size 152844
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0048.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6a94a5b4b1b1818a1be154b875f8a5de2bf92dc5c292a95cc240b80485613893
3 | size 96684
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0049.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:0c6976d30ebd5bfe7c829c055d2bf729b095741b6e7c7a5d08943391cb5773c5
3 | size 264044
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0050.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:847f20ecfbd7e24f565d21ed9e419dad341ad89919cb5e670d7315de1c2b1a4b
3 | size 178604
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0051.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:47fdb16ef5745504b705f7854b3bc201487f6bd0d9e8811c5c1b98c4030a5839
3 | size 100364
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0052.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:abbc8181efacff8a2dbc2d37c3f2f55e02879523dbd58eb5f3e14eb8e6f0a6bd
3 | size 84844
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0053.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6cf6c59d1589886118cffa1ed2eca8665e728019c4d3a7f21f81db5e402ba8f9
3 | size 135084
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0054.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:234f107f134babc6c1b0fdd041187ea14dc8518cd6a1e237e7da31474b41e157
3 | size 251564
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0055.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b9734d146511c648889a14fa1ddd092974fb33c472c5dd15eee5cc979130915a
3 | size 126924
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0056.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3645021d26b85c14262b1208860d5a8b81a339493b1e9ce3cd773d7213c2e306
3 | size 114124
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0057.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:04a851beed6a653b79f0147f008826aa30b41e04db371764540daee9671be383
3 | size 162124
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0058.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3ad240f4d20e9179b043859f3a0a1354c3fbf0ad23d50148b23d39f4217e2934
3 | size 58284
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0059.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:d153ad1d451e53595e995df6a7941e8e671a001e3e2ee44b2957a54ee210f20b
3 | size 69484
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0060.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:14fb8ad902b2bc1ad34d4397f83c96125f2549c3301a232557b13e1f6243e665
3 | size 119564
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0061.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5a592bf3f9830627adcb5a4a6f76e2bf43bbcc9c62e431c714cfe595abce178a
3 | size 177004
4 |
--------------------------------------------------------------------------------
/test_data/61-70968-0062.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:362ba15629217d4b91c7f8dc08b82738d10db86bf85d20d1d107d247b4c6a6b4
3 | size 81804
4 |
--------------------------------------------------------------------------------
/test_data/README.md:
--------------------------------------------------------------------------------
1 | # Test data
2 |
3 | These files are from [LibriSpeech](http://www.openslr.org/12)
4 |
5 | - `transcript.txt`
6 | - `61-70968-0052.wav`
7 | - ...
8 |
9 | This file is from [NVIDIA NeMo tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/Online_Offline_Speech_Commands_Demo.ipynb)
10 |
11 | - `SpeechCommands_demo.wav`
12 |
13 | This file is from [Thorsten Voice](https://github.com/thorstenMueller/Thorsten-Voice)
14 |
15 | - `samples_thorsten-21.06-emotional_neutral.wav`
16 |
--------------------------------------------------------------------------------
/test_data/SpeechCommands_demo.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:bb9ca4c26860bc0cbea94ade4f18f1dd53ac79bbf6caef82507becaa1b4a083f
3 | size 54524
4 |
--------------------------------------------------------------------------------
/test_data/samples_thorsten-21.06-emotional_neutral.wav:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:1ba03644402c5b1edc3c4cd7f76967febb66cbce6a8adbf294e0a6b2de87cd1f
3 | size 70454
4 |
--------------------------------------------------------------------------------
/test_data/transcript.txt:
--------------------------------------------------------------------------------
1 | 61-70968-0000.wav|he began a confused complaint against the wizard who had vanished behind the curtain on the left
2 | 61-70968-0001.wav|give not so earnest a mind to these mummeries child
3 | 61-70968-0002.wav|a golden fortune and a happy life
4 | 61-70968-0003.wav|he was like unto my father in a way and yet was not my father
5 | 61-70968-0004.wav|also there was a stripling page who turned into a maid
6 | 61-70968-0005.wav|this was so sweet a lady sir and in some manner i do think she died
7 | 61-70968-0006.wav|but then the picture was gone as quickly as it came
8 | 61-70968-0007.wav|sister nell do you hear these marvels
9 | 61-70968-0008.wav|take your place and let us see what the crystal can show to you
10 | 61-70968-0009.wav|like as not young master though i am an old man
11 | 61-70968-0010.wav|forthwith all ran to the opening of the tent to see what might be amiss but master will who peeped out first needed no more than one glance
12 | 61-70968-0011.wav|he gave way to the others very readily and retreated unperceived by the squire and mistress fitzooth to the rear of the tent
13 | 61-70968-0012.wav|cries of a nottingham a nottingham
14 | 61-70968-0013.wav|before them fled the stroller and his three sons capless and terrified
15 | 61-70968-0014.wav|what is the tumult and rioting cried out the squire authoritatively and he blew twice on a silver whistle which hung at his belt
16 | 61-70968-0015.wav|nay we refused their request most politely most noble said the little stroller
17 | 61-70968-0016.wav|and then they became vexed and would have snatched your purse from us
18 | 61-70968-0017.wav|i could not see my boy injured excellence for but doing his duty as one of cumberland's sons
19 | 61-70968-0018.wav|so i did push this fellow
20 | 61-70968-0019.wav|it is enough said george gamewell sharply and he turned upon the crowd
21 | 61-70968-0020.wav|shame on you citizens cried he i blush for my fellows of nottingham
22 | 61-70968-0021.wav|surely we can submit with good grace
23 | 61-70968-0022.wav|tis fine for you to talk old man answered the lean sullen apprentice
24 | 61-70968-0023.wav|but i wrestled with this fellow and do know that he played unfairly in the second bout
25 | 61-70968-0024.wav|spoke the squire losing all patience and it was to you that i gave another purse in consolation
26 | 61-70968-0025.wav|come to me men here here he raised his voice still louder
27 | 61-70968-0026.wav|the strollers took their part in it with hearty zest now that they had some chance of beating off their foes
28 | 61-70968-0027.wav|robin and the little tumbler between them tried to force the squire to stand back and very valiantly did these two comport themselves
29 | 61-70968-0028.wav|the head and chief of the riot the nottingham apprentice with clenched fists threatened montfichet
30 | 61-70968-0029.wav|the squire helped to thrust them all in and entered swiftly himself
31 | 61-70968-0030.wav|now be silent on your lives he began but the captured apprentice set up an instant shout
32 | 61-70968-0031.wav|silence you knave cried montfichet
33 | 61-70968-0032.wav|he felt for and found the wizard's black cloth the squire was quite out of breath
34 | 61-70968-0033.wav|thrusting open the proper entrance of the tent robin suddenly rushed forth with his burden with a great shout
35 | 61-70968-0034.wav|a montfichet a montfichet gamewell to the rescue
36 | 61-70968-0035.wav|taking advantage of this the squire's few men redoubled their efforts and encouraged by robin's and the little stroller's cries fought their way to him
37 | 61-70968-0036.wav|george montfichet will never forget this day
38 | 61-70968-0037.wav|what is your name lording asked the little stroller presently
39 | 61-70968-0038.wav|robin fitzooth
40 | 61-70968-0039.wav|and mine is will stuteley shall we be comrades
41 | 61-70968-0040.wav|right willingly for between us we have won the battle answered robin
42 | 61-70968-0041.wav|i like you will you are the second will that i have met and liked within two days is there a sign in that
43 | 61-70968-0042.wav|montfichet called out for robin to give him an arm
44 | 61-70968-0043.wav|friends said montfichet faintly to the wrestlers bear us escort so far as the sheriff's house
45 | 61-70968-0044.wav|it will not be safe for you to stay here now
46 | 61-70968-0045.wav|pray follow us with mine and my lord sheriff's men
47 | 61-70968-0046.wav|nottingham castle was reached and admittance was demanded
48 | 61-70968-0047.wav|master monceux the sheriff of nottingham was mightily put about when told of the rioting
49 | 61-70968-0048.wav|and henry might return to england at any moment
50 | 61-70968-0049.wav|have your will child if the boy also wills it montfichet answered feeling too ill to oppose anything very strongly just then
51 | 61-70968-0050.wav|he made an effort to hide his condition from them all and robin felt his fingers tighten upon his arm
52 | 61-70968-0051.wav|beg me a room of the sheriff child quickly
53 | 61-70968-0052.wav|but who is this fellow plucking at your sleeve
54 | 61-70968-0053.wav|he is my esquire excellency returned robin with dignity
55 | 61-70968-0054.wav|mistress fitzooth had been carried off by the sheriff's daughter and her maids as soon as they had entered the house so that robin alone had the care of montfichet
56 | 61-70968-0055.wav|robin was glad when at length they were left to their own devices
57 | 61-70968-0056.wav|the wine did certainly bring back the color to the squire's cheeks
58 | 61-70968-0057.wav|these escapades are not for old gamewell lad his day has come to twilight
59 | 61-70968-0058.wav|will you forgive me now
60 | 61-70968-0059.wav|it will be no disappointment to me
61 | 61-70968-0060.wav|no thanks i am glad to give you such easy happiness
62 | 61-70968-0061.wav|you are a worthy leech will presently whispered robin the wine has worked a marvel
63 | 61-70968-0062.wav|ay and show you some pretty tricks
64 |
--------------------------------------------------------------------------------