├── .gitattributes ├── .github └── workflows │ └── build-validation.yml ├── .gitignore ├── .vscode ├── launch.json └── tasks.json ├── Examples └── Godot │ └── README.md ├── LICENSE ├── NeMoOnnxSharp.Example ├── ModelDownloader.cs ├── NeMoOnnxSharp.Example.csproj ├── PretrainedModelInfo.cs └── Program.cs ├── NeMoOnnxSharp.Tests ├── AudioFeatureBufferTest.cs ├── Data │ ├── 61-70968-0000-mod.wav │ ├── 61-70968-0000.wav │ ├── cmudict-test │ ├── heteronyms-test │ ├── mel_spectrogram.bin │ └── mfcc.bin ├── FFTTest.cs ├── NeMoOnnxSharp.Tests.csproj ├── PreprocessorTest.cs ├── TextTokenizersTest.cs └── WaveFileTest.cs ├── NeMoOnnxSharp.sln ├── NeMoOnnxSharp ├── AudioPreprocessing │ ├── AudioFeatureBuffer.cs │ ├── AudioToMFCCPreprocessor.cs │ ├── AudioToMelSpectrogramPreprocessor.cs │ ├── FFT.cs │ ├── FeatureNormalize.cs │ ├── HTKMelBands.cs │ ├── IAudioFeatureBuffer.cs │ ├── IAudioPreprocessor.cs │ ├── IFeaturizer.cs │ ├── MFCC.cs │ ├── MFCCNorm.cs │ ├── MelBands.cs │ ├── MelNorm.cs │ ├── MelScale.cs │ ├── SlaneyMelBands.cs │ ├── Window.cs │ └── WindowFunction.cs ├── FrameVAD.cs ├── Models │ ├── ASRModel.cs │ ├── CharTokenizer.cs │ ├── EncDecCTCConfig.cs │ ├── EncDecCTCModel.cs │ ├── EncDecClassificationConfig.cs │ ├── EncDecClassificationModel.cs │ ├── Model.cs │ ├── ModelConfig.cs │ ├── SpectrogramGenerator.cs │ ├── SpectrogramGeneratorConfig.cs │ ├── Vocoder.cs │ └── VocoderConfig.cs ├── NeMoOnnxSharp.csproj ├── SpeechConfig.cs ├── SpeechRecognitionEventArgs.cs ├── SpeechRecognizer.cs ├── SpeechSynthesisResult.cs ├── SpeechSynthesizer.cs ├── TTSTokenizers │ ├── BaseCharsTokenizerr.cs │ ├── BaseTokenizer.cs │ ├── EnglishG2p.cs │ ├── EnglishPhonemesTokenizer.cs │ ├── GermanCharsTokenizer.cs │ └── TokenizerUtils.cs └── WaveFile.cs ├── Python ├── .flake8 ├── convert_librispeech.py ├── export_models.py └── make_test.py ├── README.md └── test_data ├── .gitignore ├── 61-70968-0000.wav ├── 61-70968-0001.wav ├── 61-70968-0002.wav ├── 61-70968-0003.wav ├── 61-70968-0004.wav ├── 
61-70968-0005.wav ├── 61-70968-0006.wav ├── 61-70968-0007.wav ├── 61-70968-0008.wav ├── 61-70968-0009.wav ├── 61-70968-0010.wav ├── 61-70968-0011.wav ├── 61-70968-0012.wav ├── 61-70968-0013.wav ├── 61-70968-0014.wav ├── 61-70968-0015.wav ├── 61-70968-0016.wav ├── 61-70968-0017.wav ├── 61-70968-0018.wav ├── 61-70968-0019.wav ├── 61-70968-0020.wav ├── 61-70968-0021.wav ├── 61-70968-0022.wav ├── 61-70968-0023.wav ├── 61-70968-0024.wav ├── 61-70968-0025.wav ├── 61-70968-0026.wav ├── 61-70968-0027.wav ├── 61-70968-0028.wav ├── 61-70968-0029.wav ├── 61-70968-0030.wav ├── 61-70968-0031.wav ├── 61-70968-0032.wav ├── 61-70968-0033.wav ├── 61-70968-0034.wav ├── 61-70968-0035.wav ├── 61-70968-0036.wav ├── 61-70968-0037.wav ├── 61-70968-0038.wav ├── 61-70968-0039.wav ├── 61-70968-0040.wav ├── 61-70968-0041.wav ├── 61-70968-0042.wav ├── 61-70968-0043.wav ├── 61-70968-0044.wav ├── 61-70968-0045.wav ├── 61-70968-0046.wav ├── 61-70968-0047.wav ├── 61-70968-0048.wav ├── 61-70968-0049.wav ├── 61-70968-0050.wav ├── 61-70968-0051.wav ├── 61-70968-0052.wav ├── 61-70968-0053.wav ├── 61-70968-0054.wav ├── 61-70968-0055.wav ├── 61-70968-0056.wav ├── 61-70968-0057.wav ├── 61-70968-0058.wav ├── 61-70968-0059.wav ├── 61-70968-0060.wav ├── 61-70968-0061.wav ├── 61-70968-0062.wav ├── README.md ├── SpeechCommands_demo.wav ├── samples_thorsten-21.06-emotional_neutral.wav └── transcript.txt /.gitattributes: -------------------------------------------------------------------------------- 1 | *.onnx filter=lfs diff=lfs merge=lfs -text 2 | *.wav filter=lfs diff=lfs merge=lfs -text 3 | -------------------------------------------------------------------------------- /.github/workflows/build-validation.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | paths: 7 | - '**.cs' 8 | - '**.csproj' 9 | pull_request: 10 | branches: [ main ] 11 | paths: 12 | - '**.cs' 13 | - '**.csproj' 14 | 15 | jobs: 16 | build: 
17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | with: 23 | lfs: true 24 | - name: Checkout LFS objects 25 | run: git lfs checkout 26 | - name: Setup .NET 27 | uses: actions/setup-dotnet@v3 28 | with: 29 | dotnet-version: 6.0.x 30 | - uses: actions/cache@v3 31 | with: 32 | path: ~/.nuget/packages 33 | # Look to see if there is a cache hit for the corresponding requirements file 34 | key: ${{ runner.os }}-nuget-${{ hashFiles('**/packages.lock.json') }} 35 | restore-keys: | 36 | ${{ runner.os }}-nuget 37 | - name: Restore dependencies 38 | run: dotnet restore 39 | - name: Build 40 | run: dotnet build --configuration Release --no-restore 41 | - name: Test 42 | run: dotnet test --no-restore --verbosity normal 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Aa][Rr][Mm]/ 27 | [Aa][Rr][Mm]64/ 28 | bld/ 29 | [Bb]in/ 30 | [Oo]bj/ 31 | [Ll]og/ 32 | [Ll]ogs/ 33 | 34 | # Visual Studio 2015/2017 cache/options directory 35 | .vs/ 36 | # Uncomment if you have tasks that create the project's static files in wwwroot 37 | #wwwroot/ 38 | 39 | # Visual Studio 2017 auto generated files 40 | Generated\ Files/ 41 | 42 | # MSTest test Results 43 | [Tt]est[Rr]esult*/ 44 | [Bb]uild[Ll]og.* 45 | 46 | # NUnit 47 | *.VisualState.xml 48 | TestResult.xml 49 | nunit-*.xml 50 | 51 | # Build Results of an ATL Project 52 | [Dd]ebugPS/ 53 | [Rr]eleasePS/ 54 | dlldata.c 55 | 56 | # Benchmark Results 57 | BenchmarkDotNet.Artifacts/ 58 | 59 | # .NET Core 60 | project.lock.json 61 | project.fragment.lock.json 62 | artifacts/ 63 | 64 | # StyleCop 65 | StyleCopReport.xml 66 | 67 | # Files built by Visual Studio 68 | *_i.c 69 | *_p.c 70 | *_h.h 71 | *.ilk 72 | *.meta 73 | *.obj 74 | *.iobj 75 | *.pch 76 | *.pdb 77 | *.ipdb 78 | *.pgc 79 | *.pgd 80 | *.rsp 81 | *.sbr 82 | *.tlb 83 | *.tli 84 | *.tlh 85 | *.tmp 86 | *.tmp_proj 87 | *_wpftmp.csproj 88 | *.log 89 | *.vspscc 90 | *.vssscc 91 | .builds 92 | *.pidb 93 | *.svclog 94 | *.scc 95 | 96 | # Chutzpah Test files 97 | _Chutzpah* 98 | 99 | # Visual C++ cache files 100 | ipch/ 101 | *.aps 102 | *.ncb 103 | *.opendb 104 | *.opensdf 105 | *.sdf 106 | *.cachefile 107 | *.VC.db 108 | *.VC.VC.opendb 109 | 110 | # Visual Studio profiler 111 | *.psess 112 | *.vsp 113 | *.vspx 114 | *.sap 115 | 116 | # Visual Studio Trace Files 117 | *.e2e 118 | 119 | # TFS 
2012 Local Workspace 120 | $tf/ 121 | 122 | # Guidance Automation Toolkit 123 | *.gpState 124 | 125 | # ReSharper is a .NET coding add-in 126 | _ReSharper*/ 127 | *.[Rr]e[Ss]harper 128 | *.DotSettings.user 129 | 130 | # TeamCity is a build add-in 131 | _TeamCity* 132 | 133 | # DotCover is a Code Coverage Tool 134 | *.dotCover 135 | 136 | # AxoCover is a Code Coverage Tool 137 | .axoCover/* 138 | !.axoCover/settings.json 139 | 140 | # Visual Studio code coverage results 141 | *.coverage 142 | *.coveragexml 143 | 144 | # NCrunch 145 | _NCrunch_* 146 | .*crunch*.local.xml 147 | nCrunchTemp_* 148 | 149 | # MightyMoose 150 | *.mm.* 151 | AutoTest.Net/ 152 | 153 | # Web workbench (sass) 154 | .sass-cache/ 155 | 156 | # Installshield output folder 157 | [Ee]xpress/ 158 | 159 | # DocProject is a documentation generator add-in 160 | DocProject/buildhelp/ 161 | DocProject/Help/*.HxT 162 | DocProject/Help/*.HxC 163 | DocProject/Help/*.hhc 164 | DocProject/Help/*.hhk 165 | DocProject/Help/*.hhp 166 | DocProject/Help/Html2 167 | DocProject/Help/html 168 | 169 | # Click-Once directory 170 | publish/ 171 | 172 | # Publish Web Output 173 | *.[Pp]ublish.xml 174 | *.azurePubxml 175 | # Note: Comment the next line if you want to checkin your web deploy settings, 176 | # but database connection strings (with potential passwords) will be unencrypted 177 | *.pubxml 178 | *.publishproj 179 | 180 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 181 | # checkin your Azure Web App publish settings, but sensitive information contained 182 | # in these scripts will be unencrypted 183 | PublishScripts/ 184 | 185 | # NuGet Packages 186 | *.nupkg 187 | # NuGet Symbol Packages 188 | *.snupkg 189 | # The packages folder can be ignored because of Package Restore 190 | **/[Pp]ackages/* 191 | # except build/, which is used as an MSBuild target. 
192 | !**/[Pp]ackages/build/ 193 | # Uncomment if necessary however generally it will be regenerated when needed 194 | #!**/[Pp]ackages/repositories.config 195 | # NuGet v3's project.json files produces more ignorable files 196 | *.nuget.props 197 | *.nuget.targets 198 | 199 | # Microsoft Azure Build Output 200 | csx/ 201 | *.build.csdef 202 | 203 | # Microsoft Azure Emulator 204 | ecf/ 205 | rcf/ 206 | 207 | # Windows Store app package directories and files 208 | AppPackages/ 209 | BundleArtifacts/ 210 | Package.StoreAssociation.xml 211 | _pkginfo.txt 212 | *.appx 213 | *.appxbundle 214 | *.appxupload 215 | 216 | # Visual Studio cache files 217 | # files ending in .cache can be ignored 218 | *.[Cc]ache 219 | # but keep track of directories ending in .cache 220 | !?*.[Cc]ache/ 221 | 222 | # Others 223 | ClientBin/ 224 | ~$* 225 | *~ 226 | *.dbmdl 227 | *.dbproj.schemaview 228 | *.jfm 229 | *.pfx 230 | *.publishsettings 231 | orleans.codegen.cs 232 | 233 | # Including strong name files can present a security risk 234 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 235 | #*.snk 236 | 237 | # Since there are multiple workflows, uncomment next line to ignore bower_components 238 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 239 | #bower_components/ 240 | 241 | # RIA/Silverlight projects 242 | Generated_Code/ 243 | 244 | # Backup & report files from converting an old project file 245 | # to a newer Visual Studio version. 
Backup files are not needed, 246 | # because we have git ;-) 247 | _UpgradeReport_Files/ 248 | Backup*/ 249 | UpgradeLog*.XML 250 | UpgradeLog*.htm 251 | ServiceFabricBackup/ 252 | *.rptproj.bak 253 | 254 | # SQL Server files 255 | *.mdf 256 | *.ldf 257 | *.ndf 258 | 259 | # Business Intelligence projects 260 | *.rdl.data 261 | *.bim.layout 262 | *.bim_*.settings 263 | *.rptproj.rsuser 264 | *- [Bb]ackup.rdl 265 | *- [Bb]ackup ([0-9]).rdl 266 | *- [Bb]ackup ([0-9][0-9]).rdl 267 | 268 | # Microsoft Fakes 269 | FakesAssemblies/ 270 | 271 | # GhostDoc plugin setting file 272 | *.GhostDoc.xml 273 | 274 | # Node.js Tools for Visual Studio 275 | .ntvs_analysis.dat 276 | node_modules/ 277 | 278 | # Visual Studio 6 build log 279 | *.plg 280 | 281 | # Visual Studio 6 workspace options file 282 | *.opt 283 | 284 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 285 | *.vbw 286 | 287 | # Visual Studio LightSwitch build output 288 | **/*.HTMLClient/GeneratedArtifacts 289 | **/*.DesktopClient/GeneratedArtifacts 290 | **/*.DesktopClient/ModelManifest.xml 291 | **/*.Server/GeneratedArtifacts 292 | **/*.Server/ModelManifest.xml 293 | _Pvt_Extensions 294 | 295 | # Paket dependency manager 296 | .paket/paket.exe 297 | paket-files/ 298 | 299 | # FAKE - F# Make 300 | .fake/ 301 | 302 | # CodeRush personal settings 303 | .cr/personal 304 | 305 | # Python Tools for Visual Studio (PTVS) 306 | __pycache__/ 307 | *.pyc 308 | 309 | # Cake - Uncomment if you are using it 310 | # tools/** 311 | # !tools/packages.config 312 | 313 | # Tabs Studio 314 | *.tss 315 | 316 | # Telerik's JustMock configuration file 317 | *.jmconfig 318 | 319 | # BizTalk build output 320 | *.btp.cs 321 | *.btm.cs 322 | *.odx.cs 323 | *.xsd.cs 324 | 325 | # OpenCover UI analysis results 326 | OpenCover/ 327 | 328 | # Azure Stream Analytics local run output 329 | ASALocalRun/ 330 | 331 | # MSBuild Binary and Structured Log 332 | *.binlog 333 | 334 | # NVidia Nsight GPU debugger 
configuration file 335 | *.nvuser 336 | 337 | # MFractors (Xamarin productivity tool) working folder 338 | .mfractor/ 339 | 340 | # Local History for Visual Studio 341 | .localhistory/ 342 | 343 | # BeatPulse healthcheck temp database 344 | healthchecksdb 345 | 346 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 347 | MigrationBackup/ 348 | 349 | # Ionide (cross platform F# VS Code tools) working folder 350 | .ionide/ 351 | 352 | # ONNX runtime 353 | *.onnx 354 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | // Use IntelliSense to find out which attributes exist for C# debugging 6 | // Use hover for the description of the existing attributes 7 | // For further information visit https://github.com/dotnet/vscode-csharp/blob/main/debugger-launchjson.md 8 | "name": ".NET Core Launch (console)", 9 | "type": "coreclr", 10 | "request": "launch", 11 | "preLaunchTask": "build", 12 | // If you have changed target frameworks, make sure to update the program path. 
13 | "program": "${workspaceFolder}/NeMoOnnxSharp.Example/bin/Debug/net7.0/NeMoOnnxSharp.Example.dll", 14 | "args": [], 15 | "cwd": "${workspaceFolder}/NeMoOnnxSharp.Example", 16 | // For more information about the 'console' field, see https://aka.ms/VSCode-CS-LaunchJson-Console 17 | "console": "internalConsole", 18 | "stopAtEntry": false 19 | }, 20 | { 21 | "name": ".NET Core Attach", 22 | "type": "coreclr", 23 | "request": "attach" 24 | } 25 | ] 26 | } -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0.0", 3 | "tasks": [ 4 | { 5 | "label": "build", 6 | "command": "dotnet", 7 | "type": "process", 8 | "args": [ 9 | "build", 10 | "${workspaceFolder}/NeMoOnnxSharp.sln", 11 | "/property:GenerateFullPaths=true", 12 | "/consoleloggerparameters:NoSummary" 13 | ], 14 | "problemMatcher": "$msCompile" 15 | }, 16 | { 17 | "label": "publish", 18 | "command": "dotnet", 19 | "type": "process", 20 | "args": [ 21 | "publish", 22 | "${workspaceFolder}/NeMoOnnxSharp.sln", 23 | "/property:GenerateFullPaths=true", 24 | "/consoleloggerparameters:NoSummary" 25 | ], 26 | "problemMatcher": "$msCompile" 27 | }, 28 | { 29 | "label": "watch", 30 | "command": "dotnet", 31 | "type": "process", 32 | "args": [ 33 | "watch", 34 | "run", 35 | "--project", 36 | "${workspaceFolder}/NeMoOnnxSharp.sln" 37 | ], 38 | "problemMatcher": "$msCompile" 39 | } 40 | ] 41 | } -------------------------------------------------------------------------------- /Examples/Godot/README.md: -------------------------------------------------------------------------------- 1 | # NeMoOnnxGodot 2 | 3 | Moved to 4 | [Neural Speech Engine with NVIDIA NeMo and ONNX Runtime](https://godotengine.org/asset-library/asset/2298) 5 | of Godot Asset Library -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2022 Katsuya Iida 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /NeMoOnnxSharp.Example/ModelDownloader.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Katsuya Iida. All Rights Reserved. 2 | // See LICENSE in the project root for license information. 
using System;
using System.Collections.Generic;
using System.IO;
using System.Net.Http;
using System.Security.Cryptography;
using System.Text;
using System.Threading;
using System.Threading.Tasks;

namespace NeMoOnnxSharp.Example
{
    /// <summary>
    /// Downloads a file over HTTP into a local cache path and validates it
    /// against an expected SHA-256 checksum. A file that is already present
    /// with a matching checksum is reused without touching the network.
    /// </summary>
    internal sealed class ModelDownloader : IDisposable
    {
        private readonly HttpClient _httpClient;

        public ModelDownloader()
        {
            _httpClient = new HttpClient();
        }

        /// <summary>Releases the underlying <see cref="HttpClient"/>.</summary>
        public void Dispose()
        {
            _httpClient.Dispose();
        }

        /// <summary>
        /// Computes the SHA-256 digest of the file at <paramref name="path"/>.
        /// </summary>
        /// <returns>The digest as a lower-case hexadecimal string.</returns>
        private string GetFileChecksum(string path)
        {
            using SHA256 sha256 = SHA256.Create();
            using var stream = File.OpenRead(path);
            var hashValue = sha256.ComputeHash(stream);
            var sb = new StringBuilder(hashValue.Length * 2);
            foreach (var value in hashValue)
            {
                sb.Append(value.ToString("x2"));
            }
            return sb.ToString();
        }

        /// <summary>
        /// Checks whether <paramref name="cacheFilePath"/> exists and has the
        /// expected checksum. A file that exists but does not match is deleted.
        /// </summary>
        /// <returns><c>true</c> when the file exists and its checksum matches.</returns>
        private bool CheckCacheFile(string cacheFilePath, string expectedChecksum)
        {
            if (!File.Exists(cacheFilePath))
            {
                return false;
            }

            string checksum = GetFileChecksum(cacheFilePath);
            // Hex digests are plain ASCII: compare ordinally so the result does
            // not depend on the current culture.
            if (string.Equals(checksum, expectedChecksum, StringComparison.OrdinalIgnoreCase))
            {
                return true;
            }

            File.Delete(cacheFilePath);
            return false;
        }

        /// <summary>
        /// Rewrites the current console line with the number of bytes received,
        /// and the total when the server reported one.
        /// </summary>
        private void ShowProgress(long progress, long? total)
        {
            if (total.HasValue)
            {
                Console.Write("\rDownloading... [{0}/{1} bytes]", progress, total);
            }
            else
            {
                Console.Write("\rDownloading... [{0} bytes]", progress);
            }
        }

        /// <summary>
        /// Ensures that <paramref name="filePath"/> holds the content of
        /// <paramref name="url"/>, downloading it only when no valid cached copy exists.
        /// </summary>
        /// <param name="filePath">Destination (cache) path on disk.</param>
        /// <param name="url">Address to download from.</param>
        /// <param name="sha256">Expected SHA-256 of the file as hex (case-insensitive).</param>
        /// <param name="cancellationToken">Token used to cancel the download.</param>
        /// <exception cref="InvalidDataException">
        /// The downloaded file does not match <paramref name="sha256"/>; the file is removed.
        /// </exception>
        public async Task MayDownloadAsync(
            string filePath, string url, string sha256,
            CancellationToken cancellationToken = default)
        {
            if (CheckCacheFile(filePath, sha256))
            {
                Console.WriteLine("Using cached `{0}'.", url);
                return;
            }

            // Forward the caller's token; previously it was dropped here and the
            // download could not be cancelled.
            await DownloadAsync(url, filePath, cancellationToken);

            // CheckCacheFile deletes the file itself when the checksum is wrong,
            // so no additional cleanup is required before throwing.
            if (!CheckCacheFile(filePath, sha256))
            {
                throw new InvalidDataException(
                    $"SHA-256 checksum mismatch for `{url}'.");
            }
        }

        /// <summary>
        /// Streams <paramref name="url"/> to <paramref name="path"/>, printing
        /// progress to the console roughly once per second and once on completion.
        /// </summary>
        /// <exception cref="HttpRequestException">The server returned a non-success status.</exception>
        private async Task DownloadAsync(
            string url, string path,
            CancellationToken cancellationToken = default)
        {
            using (var response = await _httpClient.GetAsync(url, HttpCompletionOption.ResponseHeadersRead, cancellationToken))
            {
                response.EnsureSuccessStatusCode();
                long currentPosition = 0;
                long? contentLength = response.Content.Headers.ContentLength;
                using (var reader = await response.Content.ReadAsStreamAsync(cancellationToken))
                // File.Create truncates an existing file; File.OpenWrite would
                // leave stale trailing bytes behind a shorter download.
                using (var writer = File.Create(path))
                {
                    var lastDateTime = DateTime.UtcNow;
                    byte[] buffer = new byte[4096];
                    int bytesRead;
                    while ((bytesRead = await reader.ReadAsync(buffer, 0, buffer.Length, cancellationToken)) != 0)
                    {
                        await writer.WriteAsync(buffer, 0, bytesRead, cancellationToken);
                        currentPosition += bytesRead;
                        var currentDateTime = DateTime.UtcNow;
                        // TotalSeconds is the whole elapsed interval; Seconds is only
                        // the 0-59 component and wraps around every minute.
                        if ((currentDateTime - lastDateTime).TotalSeconds >= 1)
                        {
                            lastDateTime = currentDateTime;
                            ShowProgress(currentPosition, contentLength);
                        }
                    }

                    // Always report the final byte count, including for downloads
                    // that complete in under a second.
                    ShowProgress(currentPosition, contentLength);
                }
            }
            // Terminate the "\r"-rewritten progress line.
            Console.WriteLine();
        }
    }
}
-------------------------------------------------------------------------------- /NeMoOnnxSharp.Example/PretrainedModelInfo.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Katsuya Iida. All Rights Reserved. 2 | // See LICENSE in the project root for license information. 3 | 4 | using System; 5 | 6 | namespace NeMoOnnxSharp.Example 7 | { 8 | internal class PretrainedModelInfo 9 | { 10 | private static PretrainedModelInfo[]? _modelList = null; 11 | 12 | public static PretrainedModelInfo[] ModelList 13 | { 14 | get 15 | { 16 | if (_modelList == null) 17 | { 18 | _modelList = CreateModelList(); 19 | } 20 | return _modelList; 21 | } 22 | } 23 | 24 | private static PretrainedModelInfo[] CreateModelList() 25 | { 26 | return new PretrainedModelInfo[] 27 | { 28 | new PretrainedModelInfo( 29 | "stt_en_quartznet15x5", 30 | "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.1/stt_en_quartznet15x5.onnx", 31 | "dde27f0528e92c05f7bc220a9be4a7bb99927da0a3a25db8f2f861e3559da90d" 32 | ), 33 | new PretrainedModelInfo( 34 | "QuartzNet15x5Base-En", 35 | "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.1/QuartzNet15x5Base-En.onnx", 36 | "ee1b72102fd0c5422d088e80f929dbdee7e889d256a4ce1e412cd49916823695" 37 | ), 38 | new PretrainedModelInfo( 39 | "vad_marblenet", 40 | "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.1/vad_marblenet.onnx", 41 | "edaf8a7bb62e4335f97aa70d1a447ccbd3942b58b870e08a20c0408a0fb106e0" 42 | ), 43 | new PretrainedModelInfo( 44 | "commandrecognition_en_matchboxnet3x1x64_v2", 45 | "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.1/commandrecognition_en_matchboxnet3x1x64_v2.onnx", 46 | "a0c5e4d14e83d3b6afdaf239265a390c2ca513bcdedf3d295bc1f9f97f19868a" 47 | ), 48 | new PretrainedModelInfo( 49 | "cmudict-0.7b_nv22.10", 50 | "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.2/cmudict-0.7b_nv22.10", 51 | 
"d330f3a3554d4c7ff8ef7bfc0c338ed74831d5f54109508fb829bdd82173608b" 52 | ), 53 | new PretrainedModelInfo( 54 | "heteronyms-052722", 55 | "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.2/heteronyms-052722", 56 | "b701909aedf753172eff223950f8859cd4b9b4c80199cf0a6e9ac4a307c8f8ec" 57 | ), 58 | new PretrainedModelInfo( 59 | "stt_de_quartznet15x5", 60 | "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.3/stt_de_quartznet15x5.onnx", 61 | "c6499961539c349117c4c724ba5f333d26b3242d2d39571fde44c3baa66d55fc" 62 | ), 63 | new PretrainedModelInfo( 64 | "tts_de_fastpitch_singleSpeaker_thorstenNeutral_2210", 65 | "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.3/tts_de_fastpitch_singleSpeaker_thorstenNeutral_2210.onnx", 66 | "35d351dcb5113a3af2eecc5051b42b747623328168a57b36b311f3396d5c1c74" 67 | ), 68 | new PretrainedModelInfo( 69 | "tts_de_hifigan_singleSpeaker_thorstenNeutral_2210", 70 | "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.3/tts_de_hifigan_singleSpeaker_thorstenNeutral_2210.onnx", 71 | "6be4e33bcc7e34b111d34be79157922802b224c2c4f1cc93dd62a5c19d936ade" 72 | ), 73 | new PretrainedModelInfo( 74 | "tts_en_fastpitch", 75 | "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.2/tts_en_fastpitch.onnx", 76 | "a297174dea1084bd34d1af1a8447bc07f6c8aab7a4fea312c610eba6bc3d0eac" 77 | ), 78 | new PretrainedModelInfo( 79 | "tts_en_hifigan", 80 | "https://github.com/kaiidams/NeMoOnnxSharp/releases/download/v1.2/tts_en_hifigan.onnx", 81 | "54501000b9de86b724931478b5bb8911e1b6ca6e293f68e9e10f60351f1949a3" 82 | ) 83 | }; 84 | } 85 | 86 | public static PretrainedModelInfo Get(string pretrainedModelName) 87 | { 88 | foreach (var info in ModelList) 89 | { 90 | if (pretrainedModelName == info.PretrainedModelName) 91 | { 92 | return info; 93 | } 94 | } 95 | 96 | throw new IndexOutOfRangeException(); 97 | } 98 | 99 | public string PretrainedModelName { get; private set; } 100 | public string Location { get; private set; 
} 101 | public string Hash { get; private set; } 102 | 103 | public PretrainedModelInfo( 104 | string pretrainedModelName, 105 | string location, 106 | string hash) 107 | { 108 | PretrainedModelName = pretrainedModelName; 109 | Location = location; 110 | Hash = hash; 111 | } 112 | } 113 | } -------------------------------------------------------------------------------- /NeMoOnnxSharp.Tests/AudioFeatureBufferTest.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using NeMoOnnxSharp.AudioPreprocessing; 3 | using NuGet.Frameworks; 4 | using System; 5 | using System.Diagnostics; 6 | using System.IO; 7 | using System.Reflection; 8 | using System.Runtime.InteropServices; 9 | using System.Security.Cryptography; 10 | 11 | namespace NeMoOnnxSharp.Tests 12 | { 13 | [TestClass] 14 | public class AudioFeatureBufferTest 15 | { 16 | private AudioFeatureBuffer? _buffer; 17 | 18 | [TestInitialize] 19 | public void Initialize() 20 | { 21 | int sampleRate = 16000; 22 | var transform = new MFCC( 23 | sampleRate: sampleRate, 24 | window: WindowFunction.Hann, 25 | winLength: 400, 26 | nFFT: 512, 27 | nMels: 64, 28 | nMFCC: 64, 29 | fMin: 0.0, 30 | fMax: 0.0, 31 | logMels: true, 32 | melScale: MelScale.HTK, 33 | melNorm: MelNorm.None); 34 | _buffer = new AudioFeatureBuffer( 35 | transform, 36 | hopLength: 160); 37 | } 38 | 39 | [TestMethod] 40 | public void Test1() 41 | { 42 | Assert.IsNotNull(_buffer); 43 | int written; 44 | Assert.AreEqual(0, _buffer.OutputCount); 45 | written = _buffer.Write(new short[399]); 46 | Assert.AreEqual(399, written); 47 | Assert.AreEqual(0, _buffer.OutputCount); 48 | written = _buffer.Write(new short[1]); 49 | Assert.AreEqual(1, written); 50 | Assert.AreEqual(64, _buffer.OutputCount); 51 | _buffer.ConsumeOutput(64); 52 | Assert.AreEqual(0, _buffer.OutputCount); 53 | written = _buffer.Write(new short[160 * 3]); 54 | Assert.AreEqual(160 * 3, written); 55 | 
Assert.AreEqual(64 * 3, _buffer.OutputCount); 56 | written = _buffer.Write(new short[480]); 57 | Assert.AreEqual(480, written); 58 | Assert.AreEqual(64 * 6, _buffer.OutputCount); 59 | } 60 | 61 | [TestMethod] 62 | public void Test2() 63 | { 64 | Assert.IsNotNull(_buffer); 65 | int totalWritten = 0; 66 | int totalOutput = 0; 67 | var rng = new Random(); 68 | for (int i = 0; i < 1000; i++) 69 | { 70 | int n = rng.Next(1024); 71 | int written = _buffer.Write(new short[n]); 72 | Assert.AreEqual(0, _buffer.OutputCount % 64); 73 | totalWritten += written; 74 | totalOutput += _buffer.OutputCount; 75 | if (totalWritten < 400) 76 | { 77 | Assert.AreEqual(0, totalOutput); 78 | } 79 | else 80 | { 81 | int m = (totalWritten - 400) / 160 + 1; 82 | Assert.AreEqual(m * 64, totalOutput); 83 | } 84 | _buffer.ConsumeOutput(_buffer.OutputCount); 85 | } 86 | } 87 | } 88 | } -------------------------------------------------------------------------------- /NeMoOnnxSharp.Tests/Data/61-70968-0000-mod.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:69766a33e70720c6187892ba96ba948e9c4f0daaa1a946a6de6741ff76b7e2bb 3 | size 216446 4 | -------------------------------------------------------------------------------- /NeMoOnnxSharp.Tests/Data/61-70968-0000.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3f53c11bcec66e60659c3e53015f4f914d79b04eba0770347e644a4776fbe633 3 | size 157004 4 | -------------------------------------------------------------------------------- /NeMoOnnxSharp.Tests/Data/cmudict-test: -------------------------------------------------------------------------------- 1 | # Comment 2 | YOU'VE Y UW1 V 3 | READ R EH1 D 4 | READ(1) R IY1 D 5 | BOOK B UH1 K 6 | THE DH AH0 7 | THE(1) DH AH1 8 | THE(2) DH IY0 9 | OPERATING AA1 P ER0 EY2 T IH0 NG 10 | OPERATING(1) AO1 P ER0 EY2 T IH0 NG 11 
| SYSTEM S IH1 S T AH0 M 12 | DESIGN D IH0 Z AY1 N 13 | AND AH0 N D 14 | AND(1) AE1 N D 15 | IMPLEMENTATION IH2 M P L AH0 M EH0 N T EY1 SH AH0 N 16 | THIRD TH ER1 D 17 | EDITION AH0 D IH1 SH AH0 N 18 | EDITION(1) IH0 D IH1 SH AH0 N 19 | DID D IH1 D 20 | DID(1) D IH0 D 21 | YOU Y UW1 22 | -------------------------------------------------------------------------------- /NeMoOnnxSharp.Tests/Data/heteronyms-test: -------------------------------------------------------------------------------- 1 | read -------------------------------------------------------------------------------- /NeMoOnnxSharp.Tests/Data/mel_spectrogram.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaiidams/NeMoOnnxSharp/ad2ffe375e525bb63c59c9b1cd5154afe70351a0/NeMoOnnxSharp.Tests/Data/mel_spectrogram.bin -------------------------------------------------------------------------------- /NeMoOnnxSharp.Tests/Data/mfcc.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaiidams/NeMoOnnxSharp/ad2ffe375e525bb63c59c9b1cd5154afe70351a0/NeMoOnnxSharp.Tests/Data/mfcc.bin -------------------------------------------------------------------------------- /NeMoOnnxSharp.Tests/FFTTest.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using NeMoOnnxSharp.AudioPreprocessing; 3 | using System; 4 | using System.IO; 5 | using System.Runtime.InteropServices; 6 | 7 | namespace NeMoOnnxSharp.Tests 8 | { 9 | [TestClass] 10 | public class FFTTest 11 | { 12 | private static void CFFTRef(double[] xr, double[] xi, int N) 13 | { 14 | double[] yr = new double[N]; 15 | double[] yi = new double[N]; 16 | for (int i = 0; i < N; i++) 17 | { 18 | double vr = 0.0; 19 | double vi = 0.0; 20 | for (int k = 0; k < N; k++) 21 | { 22 | vr += Math.Cos(-2 * Math.PI * k * i / N) * xr[k]; 23 | vi += Math.Sin(-2 * 
Math.PI * k * i / N) * xr[k]; 24 | } 25 | yr[i] = vr; 26 | yi[i] = vi; 27 | } 28 | for (int i = 0; i < N; i++) 29 | { 30 | xr[i] = yr[i]; 31 | xi[i] = yi[i]; 32 | } 33 | } 34 | 35 | private static double MSE(double[] a, double[] b) 36 | { 37 | if (a.Length != b.Length) throw new ArgumentException(); 38 | int len = Math.Min(a.Length, b.Length); 39 | double err = 0.0; 40 | for (int i = 0; i < len; i++) 41 | { 42 | double diff = a[i] - b[i]; 43 | err += diff * diff; 44 | } 45 | return err / len; 46 | } 47 | 48 | [TestMethod] 49 | public void TestCFFT() 50 | { 51 | var rng = new Random(); 52 | for (int N = 256; N <= 2048; N *= 2) 53 | { 54 | var xr0 = new double[N]; 55 | var xi0 = new double[N]; 56 | var xr1 = new double[N]; 57 | var xi1 = new double[N]; 58 | for (int i = 0; i < 10; i++) 59 | { 60 | for (int j = 0; j < N; j++) 61 | { 62 | xr0[j] = rng.NextDouble(); 63 | xi0[j] = rng.NextDouble(); 64 | xr1[j] = xr0[j]; 65 | xi1[j] = rng.NextDouble(); 66 | } 67 | CFFTRef(xr0, xi0, N); 68 | FFT.CFFT(xr1, xi1, N); 69 | double error = MSE(xr0, xi1); 70 | Assert.IsTrue(error < 1e-20); 71 | error = MSE(xi0, xr1); 72 | Assert.IsTrue(error < 1e-20); 73 | } 74 | } 75 | } 76 | } 77 | } -------------------------------------------------------------------------------- /NeMoOnnxSharp.Tests/NeMoOnnxSharp.Tests.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net7.0 5 | enable 6 | enable 7 | 8 | false 9 | true 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | PreserveNewest 24 | 25 | 26 | PreserveNewest 27 | 28 | 29 | PreserveNewest 30 | 31 | 32 | PreserveNewest 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | PreserveNewest 50 | 51 | 52 | PreserveNewest 53 | 54 | 55 | Never 56 | 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /NeMoOnnxSharp.Tests/PreprocessorTest.cs: 
-------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using NeMoOnnxSharp.AudioPreprocessing; 3 | using System; 4 | using System.Diagnostics; 5 | using System.IO; 6 | using System.Reflection; 7 | using System.Runtime.InteropServices; 8 | 9 | namespace NeMoOnnxSharp.Tests 10 | { 11 | [TestClass] 12 | public class PreprocessorTest 13 | { 14 | private const int SampleRate = 16000; 15 | private const string SampleWAVSpeechFile = "61-70968-0000.wav"; 16 | 17 | private static float[] ReadData(string file) 18 | { 19 | string appDirPath = AppDomain.CurrentDomain.BaseDirectory; 20 | string path = Path.Combine(appDirPath, "Data", file); 21 | var bytes = File.ReadAllBytes(path); 22 | return MemoryMarshal.Cast(bytes).ToArray(); 23 | } 24 | 25 | private static void AssertMSE(string path, float[] x, double threshold = 1e-3) 26 | { 27 | var truth = ReadData(path); 28 | double mse = MSE(truth, x); 29 | Console.WriteLine("MSE: {0}", mse); 30 | Assert.IsTrue(mse < threshold); 31 | } 32 | 33 | private static double MSE(float[] a, float[] b) 34 | { 35 | if (a.Length != b.Length) throw new ArgumentException(); 36 | int len = Math.Min(a.Length, b.Length); 37 | double err = 0.0; 38 | for (int i = 0; i < len; i++) 39 | { 40 | double diff = a[i] - b[i]; 41 | err += diff * diff; 42 | } 43 | return err / len; 44 | } 45 | 46 | short[]? 
audioSignal; 47 | 48 | [TestInitialize] 49 | public void Initialize() 50 | { 51 | string appDirPath = AppDomain.CurrentDomain.BaseDirectory; 52 | string waveFile = Path.Combine(appDirPath, "Data", SampleWAVSpeechFile); 53 | audioSignal = WaveFile.ReadWAV(waveFile, SampleRate); 54 | } 55 | 56 | [TestMethod] 57 | public void TestMelSpectrogram() 58 | { 59 | var preprocessor = new AudioToMelSpectrogramPreprocessor( 60 | sampleRate: 16000, 61 | window: WindowFunction.Hann, 62 | windowSize: 0.02, 63 | windowStride: 0.01, 64 | nFFT: 512, 65 | features: 64); 66 | var x = preprocessor.GetFeatures(audioSignal); 67 | // NeMo pads the result to a multiple of 16 time steps. 68 | var y = new float[((x.Length / 64 + 15) / 16) * 16 * 64]; 69 | Array.Copy(x, y, x.Length); 70 | AssertMSE("mel_spectrogram.bin", y); 71 | } 72 | 73 | [TestMethod] 74 | public void TestMFCC() 75 | { 76 | var preprocessor = new AudioToMFCCPreprocessor( 77 | sampleRate: 16000, 78 | windowSize: 0.025, 79 | windowStride: 0.01, 80 | //preNormalize: 0.8, 81 | window: WindowFunction.Hann, 82 | nMels: 64, 83 | nMFCC: 64, 84 | nFFT: 512); 85 | var processedSignal = preprocessor.GetFeatures(audioSignal); 86 | AssertMSE("mfcc.bin", processedSignal, threshold: 1e-2); 87 | } 88 | } 89 | } -------------------------------------------------------------------------------- /NeMoOnnxSharp.Tests/TextTokenizersTest.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using NeMoOnnxSharp.TTSTokenizers; 3 | using System; 4 | using System.Diagnostics; 5 | using System.IO; 6 | 7 | namespace NeMoOnnxSharp.Tests 8 | { 9 | [TestClass] 10 | public class TextTokenizersTest 11 | { 12 | private readonly static string[] ExpectedTokens = 13 | { 14 | " ", "B", "CH", "D", "DH", "F", "G", "HH", "JH", "K", "L", "M", 15 | "N", "NG", "P", "R", "S", "SH", "T", "TH", "V", "W", "Y", "Z", "ZH", 16 | "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", 
17 | "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "EH0", "EH1", 18 | "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "IH0", "IH1", "IH2", 19 | "IY0", "IY1", "IY2", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "UH0", 20 | "UH1", "UH2", "UW0", "UW1", "UW2", "a", "b", "c", "d", "e", "f", "g", 21 | "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", 22 | "v", "w", "x", "y", "z", "'", ",", ".", "!", "?", "-", ":", ";", "/", 23 | "\"", "(", ")", "[", "]", "{", "}", "", "", "" 24 | }; 25 | 26 | private const string SampleText = 27 | "You've read the book “Operating Systems Design and Implementation, 3rd edition”. Did you?"; 28 | private const string NormalizedSampleText = 29 | "You've read the book “Operating Systems Design and Implementation, third edition”. Did you?"; 30 | private const string SamplePronText = 31 | "Y|UW1|V| |r|e|a|d| |t|h|e| |B|UH1|K| |“|o|p|e|r|a|t|i|n|g| |" 32 | + "S|IH1|S|T|AH0|M|Z| |D|IH0|Z|AY1|N| |a|n|d| |IH2|M|P|L|AH0|" 33 | + "M|EH0|N|T|EY1|SH|AH0|N|,| |TH|ER1|D| |e|d|i|t|i|o|n|”|.| |" 34 | + "d|i|d| |Y|UW1|?"; 35 | 36 | private readonly static int[] SampleParsed = 37 | { 38 | 0, 22, 68, 20, 0, 87, 74, 70, 73, 0, 89, 77, 74, 39 | 0, 1, 65, 9, 0, 105, 84, 85, 74, 87, 70, 89, 78, 40 | 83, 76, 0, 16, 53, 16, 18, 31, 11, 23, 0, 3, 52, 41 | 23, 41, 12, 0, 70, 83, 73, 0, 54, 11, 14, 10, 31, 42 | 11, 43, 12, 18, 50, 17, 31, 12, 97, 0, 19, 47, 3, 43 | 0, 74, 73, 78, 89, 78, 84, 83, 105, 98, 0, 73, 78, 44 | 73, 0, 22, 68, 100, 0 45 | }; 46 | 47 | private static readonly char[] ExpectedGermanTokens = 48 | { 49 | ' ', 50 | 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 51 | 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 52 | 'U', 'V', 'W', 'X', 'Y', 'Z', 'Ä', 'Ö', 'Ü', 'ẞ', 53 | 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 54 | 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 55 | 'u', 'v', 'w', 'x', 'y', 'z', 'ä', 'ö', 'ü', 'ß', 56 | '\'', 57 | '!', '"', '(', ')', ',', '-', '.', '/', ':', ';', 58 | '?', '[', ']', 
'{', '}', '«', '»', '‒', '–', '—', 59 | '‘', '‚', '“', '„', '‹', '›' 60 | }; 61 | 62 | private const string GermanText = "Mist, wieder nichts geschafft."; 63 | 64 | private readonly static int[] GermanParsed = 65 | { 66 | 0, 13, 39, 49, 50, 66, 0, 53, 39, 35, 34, 35, 48, 0, 44, 39, 33, 38, 67 | 50, 49, 0, 37, 35, 49, 33, 38, 31, 36, 36, 50, 68, 0 68 | }; 69 | 70 | [TestInitialize] 71 | public void Initialize() 72 | { 73 | string appDirPath = AppDomain.CurrentDomain.BaseDirectory; 74 | _g2p = new EnglishG2p( 75 | phonemeDict: Path.Combine(appDirPath, "Data", "cmudict-test"), 76 | heteronyms: Path.Combine(appDirPath, "Data", "heteronyms-test"), 77 | phonemeProbability: 1.0); 78 | _tokenizer = new EnglishPhonemesTokenizer( 79 | _g2p, 80 | punct: true, 81 | stresses: true, 82 | chars: true, 83 | apostrophe: true, 84 | padWithSpace: true, 85 | addBlankAt: BaseTokenizer.AddBlankAt.True); 86 | } 87 | 88 | [TestMethod] 89 | public void TestTokenizerVocab() 90 | { 91 | Assert.IsNotNull(_tokenizer); 92 | CollectionAssert.AreEquivalent(ExpectedTokens, _tokenizer.Tokens); 93 | } 94 | 95 | [TestMethod] 96 | public void TestEnglishG2p() 97 | { 98 | Assert.IsNotNull(_g2p); 99 | var pron = string.Join("|", _g2p.Parse(NormalizedSampleText)); 100 | Assert.AreEqual(SamplePronText, pron); 101 | } 102 | 103 | [TestMethod] 104 | public void TestEnglishEncode() 105 | { 106 | Assert.IsNotNull(_tokenizer); 107 | var parsed = _tokenizer.Encode(NormalizedSampleText); 108 | CollectionAssert.AreEquivalent(SampleParsed, parsed); 109 | } 110 | 111 | [TestMethod] 112 | public void TestGermanVocab() 113 | { 114 | _tokenizer = new GermanCharsTokenizer(padWithSpace: true); 115 | var expectedTokens = ExpectedGermanTokens.Select(c => c.ToString()).ToList(); 116 | expectedTokens.Add(""); 117 | expectedTokens.Add(""); 118 | CollectionAssert.AreEquivalent(expectedTokens, _tokenizer.Tokens); 119 | 120 | var parsed = _tokenizer.Encode(GermanText); 121 | CollectionAssert.AreEquivalent(GermanParsed, parsed); 
122 | } 123 | 124 | private EnglishG2p? _g2p; 125 | private BaseTokenizer? _tokenizer; 126 | } 127 | } -------------------------------------------------------------------------------- /NeMoOnnxSharp.Tests/WaveFileTest.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using System; 3 | using System.Diagnostics; 4 | using System.IO; 5 | using System.Reflection; 6 | using System.Runtime.InteropServices; 7 | 8 | namespace NeMoOnnxSharp.Tests 9 | { 10 | [TestClass] 11 | public class WaveFileTest 12 | { 13 | private const int SampleRate = 16000; 14 | private const string SampleWAVSpeech1File = "61-70968-0000.wav"; 15 | private const int SampleWAVSpeech1Length = 78480; 16 | private const string SampleWAVSpeech2File = "61-70968-0000-mod.wav"; 17 | private const int SampleWAVSpeech2Length = 78480 / 2; 18 | private const string TempFile = "temp.wav"; 19 | 20 | [TestMethod] 21 | public void Test1() 22 | { 23 | string appDirPath = AppDomain.CurrentDomain.BaseDirectory; 24 | string waveFile = Path.Combine(appDirPath, "Data", SampleWAVSpeech1File); 25 | var waveform = WaveFile.ReadWAV(waveFile, SampleRate); 26 | Assert.AreEqual(waveform.Length, SampleWAVSpeech1Length); 27 | 28 | WaveFile.WriteWAV(TempFile, waveform, SampleRate); 29 | var waveform2 = WaveFile.ReadWAV(TempFile, SampleRate); 30 | Assert.IsTrue(IsArraysEqual(waveform, waveform2)); 31 | } 32 | 33 | [TestMethod] 34 | public void Test2() 35 | { 36 | string appDirPath = AppDomain.CurrentDomain.BaseDirectory; 37 | string waveFile = Path.Combine(appDirPath, "Data", SampleWAVSpeech2File); 38 | var waveform = WaveFile.ReadWAV(waveFile, SampleRate); 39 | Assert.AreEqual(waveform.Length, SampleWAVSpeech2Length); 40 | 41 | byte[] bytes = WaveFile.GetWAVBytes(waveform, SampleRate); 42 | Assert.AreEqual(bytes.Length, SampleWAVSpeech2Length * 2 + 44); 43 | } 44 | 45 | private bool IsArraysEqual(T[] x, T[] y) where T : struct 46 | { 47 | 
if (x.Length != y.Length) return false; 48 | for (int i = 0; i < x.Length; i++) 49 | { 50 | if (!x[i].Equals(y[i])) return false; 51 | } 52 | return true; 53 | } 54 | } 55 | } -------------------------------------------------------------------------------- /NeMoOnnxSharp.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 17 4 | VisualStudioVersion = 17.0.32014.148 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "NeMoOnnxSharp.Example", "NeMoOnnxSharp.Example\NeMoOnnxSharp.Example.csproj", "{D583F4A1-65A9-4BD2-91D5-8A24E0B325E0}" 7 | EndProject 8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "NeMoOnnxSharp", "NeMoOnnxSharp\NeMoOnnxSharp.csproj", "{69A674F7-593C-48C4-A5C7-5BCBC205E281}" 9 | EndProject 10 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "NeMoOnnxSharp.Tests", "NeMoOnnxSharp.Tests\NeMoOnnxSharp.Tests.csproj", "{4D0C8A9F-0574-4645-A0C2-51393982ACC8}" 11 | EndProject 12 | Global 13 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 14 | Debug|Any CPU = Debug|Any CPU 15 | Release|Any CPU = Release|Any CPU 16 | EndGlobalSection 17 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 18 | {D583F4A1-65A9-4BD2-91D5-8A24E0B325E0}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 19 | {D583F4A1-65A9-4BD2-91D5-8A24E0B325E0}.Debug|Any CPU.Build.0 = Debug|Any CPU 20 | {D583F4A1-65A9-4BD2-91D5-8A24E0B325E0}.Release|Any CPU.ActiveCfg = Release|Any CPU 21 | {D583F4A1-65A9-4BD2-91D5-8A24E0B325E0}.Release|Any CPU.Build.0 = Release|Any CPU 22 | {69A674F7-593C-48C4-A5C7-5BCBC205E281}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 23 | {69A674F7-593C-48C4-A5C7-5BCBC205E281}.Debug|Any CPU.Build.0 = Debug|Any CPU 24 | {69A674F7-593C-48C4-A5C7-5BCBC205E281}.Release|Any CPU.ActiveCfg = Release|Any CPU 25 | {69A674F7-593C-48C4-A5C7-5BCBC205E281}.Release|Any CPU.Build.0 = 
Release|Any CPU 26 | {4D0C8A9F-0574-4645-A0C2-51393982ACC8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 27 | {4D0C8A9F-0574-4645-A0C2-51393982ACC8}.Debug|Any CPU.Build.0 = Debug|Any CPU 28 | {4D0C8A9F-0574-4645-A0C2-51393982ACC8}.Release|Any CPU.ActiveCfg = Release|Any CPU 29 | {4D0C8A9F-0574-4645-A0C2-51393982ACC8}.Release|Any CPU.Build.0 = Release|Any CPU 30 | EndGlobalSection 31 | GlobalSection(SolutionProperties) = preSolution 32 | HideSolutionNode = FALSE 33 | EndGlobalSection 34 | GlobalSection(ExtensibilityGlobals) = postSolution 35 | SolutionGuid = {E1B7E2B0-48B8-4C5A-9DEE-02037FFE0EA9} 36 | EndGlobalSection 37 | EndGlobal 38 | -------------------------------------------------------------------------------- /NeMoOnnxSharp/AudioPreprocessing/AudioFeatureBuffer.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Katsuya Iida. All Rights Reserved. 2 | // See LICENSE in the project root for license information. 3 | 4 | using System; 5 | using System.Collections.Generic; 6 | using System.IO; 7 | using System.Linq; 8 | using System.Text; 9 | 10 | namespace NeMoOnnxSharp.AudioPreprocessing 11 | { 12 | public class AudioFeatureBuffer : IAudioFeatureBuffer // Accepts input samples in arbitrary-sized chunks, runs _transform on overlapping windows of _winLength samples taken every _hopLength samples, and appends each result to _outputBuffer. 13 | { 14 | private readonly IFeaturizer _transform; 15 | private readonly int _numInputChannels; 16 | private readonly int _numOutputChannels; 17 | private readonly int _hopLength; 18 | private readonly int _winLength; 19 | private readonly T1[] _inputBuffer; 20 | private int _inputCount; 21 | private readonly T2[] _outputBuffer; 22 | private int _outputCount; 23 | 24 | public int NumInputChannels => _numInputChannels; 25 | public int NumOutputChannels => _numOutputChannels; 26 | public int HopLength => _hopLength; 27 | public int WinLength => _winLength; 28 | public int OutputCount => _outputCount; 29 | public T2[] OutputBuffer => _outputBuffer; 30 | public int OutputPosition => _outputCount / _numOutputChannels * _hopLength + _inputCount; 31 | // transform supplies the window length (InputLength) and the number of values per frame (OutputLength); numOutputFrames sizes the output buffer in frames. 32 | public
AudioFeatureBuffer( 33 | IFeaturizer transform, 34 | int hopLength, 35 | int numOutputFrames = 100) 36 | { 37 | _transform = transform; 38 | _hopLength = hopLength; 39 | _winLength = transform.InputLength; 40 | _numInputChannels = 1; 41 | _numOutputChannels = transform.OutputLength; 42 | _inputBuffer = new T1[_winLength / _hopLength * _hopLength + _winLength]; 43 | _inputCount = 0; 44 | _outputBuffer = new T2[_numOutputChannels * numOutputFrames]; 45 | _outputCount = 0; 46 | } 47 | // Array overload: forwards the count elements of input starting at offset to the span overload. 48 | public int Write(T1[] input, int offset, int count) 49 | { 50 | return Write(input.AsSpan(offset, count)); 51 | } 52 | // Consumes samples from input and emits one frame into _outputBuffer per complete window. Returns the number of input samples consumed, which is input.Length unless the free-space check in the second loop stops early. Samples that do not yet complete a window are kept in _inputBuffer for the next call. 53 | public int Write(Span input) 54 | { 55 | int written = 0; 56 | // Phase 1: samples left over from a previous call are completed inside _inputBuffer first. 57 | if (_inputCount > 0) 58 | { 59 | // Here _inputCount < _winLength. Copy n elements where 60 | // 0 < _inputCount <= 160 -> n = _winLength - _inputCount 61 | // 160 < _inputCount <= 320 -> n = _hopLength + _winLength - _inputCount 62 | // 320 < _inputCount < 400 -> n = 2 * _hopLength + _winLength - _inputCount 63 | int needed = (_inputCount - 1) / _hopLength * _hopLength + _winLength - _inputCount; 64 | written = Math.Min(needed, input.Length); 65 | 66 | input.Slice(0, written).CopyTo(_inputBuffer.AsSpan(_inputCount, written)); 67 | _inputCount += written; 68 | // NOTE(review): unlike the loop further down, this loop does not check for free space in _outputBuffer before writing - confirm callers always drain the output in time. 69 | int inputBufferOffset = 0; 70 | while (inputBufferOffset + _winLength <= _inputCount) 71 | { 72 | _transform.GetFeatures( 73 | _inputBuffer.AsSpan(inputBufferOffset, _numInputChannels * _winLength), 74 | _outputBuffer.AsSpan(_outputCount, _numOutputChannels)); 75 | _outputCount += _numOutputChannels; 76 | inputBufferOffset += _hopLength; 77 | } 78 | // Input ran out before the buffer was filled: keep the unprocessed tail at the front of _inputBuffer. 79 | if (written < needed) 80 | { 81 | Array.Copy(_inputBuffer, inputBufferOffset, _inputBuffer, 0, _inputCount - inputBufferOffset); 82 | _inputCount -= inputBufferOffset; 83 | return written; 84 | } 85 | // Buffer fully processed: step written back by the window overlap (_winLength - _hopLength) so the next frame is read directly from input. 86 | _inputCount = 0; 87 | written -= _winLength - _hopLength; 88 | } 89 | // Phase 2: process whole windows directly from input, advancing by _hopLength per frame. 90 | while (written + _winLength <= input.Length) 91 | { 92 | if (_outputCount + _numOutputChannels >=
_outputBuffer.Length) 93 | { 94 | return written; 95 | } 96 | _transform.GetFeatures( 97 | input.Slice(written, _numInputChannels * _winLength), 98 | _outputBuffer.AsSpan(_outputCount, _numOutputChannels)); 99 | _outputCount += _numOutputChannels; 100 | written += _hopLength; 101 | } 102 | // Stash the remaining tail (shorter than one window) for the next call and report the whole input as consumed. 103 | input.Slice(written).CopyTo(_inputBuffer); 104 | _inputCount = input.Length - written; 105 | written = input.Length; 106 | return written; 107 | } 108 | // Drops the first count values from the output by shifting the remaining values to the front of _outputBuffer. 109 | public void ConsumeOutput(int count) 110 | { 111 | Array.Copy(_outputBuffer, count, _outputBuffer, 0, _outputCount - count); 112 | _outputCount -= count; 113 | } 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /NeMoOnnxSharp/AudioPreprocessing/AudioToMFCCPreprocessor.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Katsuya Iida. All Rights Reserved. 2 | // See LICENSE in the project root for license information. 3 | 4 | using System; 5 | 6 | namespace NeMoOnnxSharp.AudioPreprocessing 7 | { 8 | public class AudioToMFCCPreprocessor : IAudioPreprocessor 9 | { 10 | private readonly bool _center; 11 | protected readonly int _nWindowSize; 12 | protected readonly int _nWindowStride; 13 | private readonly double _preNormalize; 14 | private readonly IFeaturizer _featurizer; 15 | 16 | public int SampleRate => _featurizer.SampleRate; 17 | 18 | public AudioToMFCCPreprocessor( 19 | int sampleRate = 16000, 20 | double windowSize = 0.02, 21 | double windowStride = 0.01, 22 | int? nWindowSize = null, 23 | int? nWindowStride = null, 24 | WindowFunction window = WindowFunction.Hann, 25 | int? nFFT = null, 26 | double preNormalize = 0.0, 27 | bool center = true, 28 | double lowFreq = 0.0, 29 | double?
highFreq = null, 30 | int nMels = 64, 31 | int nMFCC = 64, 32 | int dctType = 2, 33 | MFCCNorm norm = MFCCNorm.Ortho, 34 | bool log = true) 35 | { 36 | _preNormalize = preNormalize; 37 | _center = center; 38 | _nWindowSize = nWindowSize ?? (int)(windowSize * sampleRate); 39 | _nWindowStride = nWindowStride ?? (int)(windowStride * sampleRate); 40 | int _nFFT = nFFT ?? (int)Math.Pow(2, Math.Ceiling(Math.Log(_nWindowSize, 2))); 41 | _featurizer = new MFCC( 42 | sampleRate: sampleRate, 43 | window: window, 44 | winLength: _nWindowSize, 45 | nFFT: _nFFT, 46 | fMin: lowFreq, 47 | fMax: highFreq, 48 | nMels: nMels, 49 | nMFCC: nMFCC, 50 | dctType: dctType, 51 | mfccNorm: norm, 52 | logMels: log); 53 | } 54 | // Returns GetOutputLength(input.Length) frames of _featurizer.OutputLength values each, frame k starting _nWindowStride samples after frame k-1. A frame that reaches outside input is assembled in a temporary buffer whose uncovered part keeps its freshly allocated content (zero by default). 55 | public float[] GetFeatures(Span input) 56 | { 57 | double scale = GetScaleFactor(input); // NOTE(review): scale is computed but never used in this method, so preNormalize has no effect here - confirm intent 58 | int outputLength = GetOutputLength(input.Length); 59 | int outputStep = _featurizer.OutputLength; 60 | float[] output = new float[outputStep * outputLength]; 61 | int inputOffset = _center ? -(_nWindowSize / 2) : 0; // centered frames start half a window before sample 0; non-centered frames start at sample 0 62 | for (int outputOffset = 0; outputOffset < output.Length; outputOffset += outputStep) 63 | { 64 | if (inputOffset >= 0 && inputOffset + _nWindowSize <= input.Length) 65 | { 66 | _featurizer.GetFeatures( 67 | input.Slice(inputOffset, _nWindowSize), 68 | output.AsSpan(outputOffset, outputStep)); 69 | } 70 | else 71 | { 72 | Span temp = stackalloc short[_nWindowSize]; 73 | int start = inputOffset; 74 | int end = inputOffset + _nWindowSize; 75 | int offset = 0; 76 | if (start < 0) 77 | { 78 | offset = -start; 79 | start = 0; 80 | } 81 | if (end >= input.Length) 82 | { 83 | end = input.Length; 84 | } 85 | if (end > start) 86 | { 87 | input.Slice(start, end - start).CopyTo(temp.Slice(offset)); 88 | } 89 | _featurizer.GetFeatures( 90 | temp, 91 | output.AsSpan(outputOffset, outputStep)); 92 | } 93 | inputOffset += _nWindowStride; 94 | } 95 | return output; 96 | } 97 | // Number of frames for inputLength samples: ceil(inputLength / _nWindowStride) when centered, otherwise the number of whole windows that fit (0 when the input is shorter than one window). 98 | private int GetOutputLength(int inputLength) 99 | { 100 | if (_center) 101 | { 102 | return (inputLength
+ _nWindowStride - 1) / _nWindowStride; 103 | } 104 | else 105 | { 106 | return inputLength < _nWindowSize ? 0 : (inputLength - _nWindowSize) / _nWindowStride + 1; // subtract the window size (not the stride), matching AudioToMelSpectrogramPreprocessor.GetOutputLength 107 | } 108 | } 109 | // Scale for raw samples: _preNormalize divided by the peak absolute sample when _preNormalize > 0, otherwise 1 / short.MaxValue. 110 | private double GetScaleFactor(Span input) 111 | { 112 | double scale; 113 | if (_preNormalize > 0) 114 | { 115 | scale = _preNormalize / MaxAbsValue(input); 116 | } 117 | else 118 | { 119 | scale = 1.0 / short.MaxValue; 120 | } 121 | 122 | return scale; 123 | } 124 | // Largest absolute sample value in input, never less than 1 so it is safe to divide by. 125 | private int MaxAbsValue(Span input) 126 | { 127 | int maxValue = 1; 128 | for (int i = 0; i < input.Length; i++) 129 | { 130 | int value = input[i]; 131 | if (value < 0) value = -value; 132 | if (maxValue < value) maxValue = value; 133 | } 134 | return maxValue; 135 | } 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /NeMoOnnxSharp/AudioPreprocessing/AudioToMelSpectrogramPreprocessor.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Katsuya Iida. All Rights Reserved. 2 | // See LICENSE in the project root for license information. 3 | 4 | using System; 5 | 6 | namespace NeMoOnnxSharp.AudioPreprocessing 7 | { 8 | public class AudioToMelSpectrogramPreprocessor : IAudioPreprocessor 9 | { 10 | private enum FrameType 11 | { 12 | None, 13 | Preemph, 14 | Center, 15 | CenterPreemph 16 | } 17 | 18 | private const double FeatureStdOffset = 1e-5; 19 | 20 | private static FrameType GetFrameType(bool center, double preemph) 21 | { 22 | if (preemph == 0.0) 23 | { 24 | return center ? FrameType.Center : FrameType.None; 25 | } 26 | else 27 | { 28 | return center ?
FrameType.CenterPreemph : FrameType.Preemph; 29 | } 30 | } 31 | 32 | protected readonly int _sampleRate; 33 | protected readonly double[] _window; 34 | private readonly FrameType _frameType; 35 | protected readonly int _nWindowStride; 36 | protected readonly FeatureNormalize _normalize; 37 | private readonly double _preNormalize; 38 | protected readonly double _preemph; 39 | protected readonly double[] _melBands; 40 | protected readonly int _nFFT; 41 | protected readonly int _features; 42 | private readonly MelNorm _melNorm; 43 | private readonly int _magPower; 44 | private readonly double _logZeroGuardValue; 45 | private readonly bool _log; 46 | 47 | public int SampleRate => _sampleRate; 48 | 49 | public AudioToMelSpectrogramPreprocessor( 50 | int sampleRate = 16000, 51 | double windowSize = 0.02, 52 | double windowStride = 0.01, 53 | int? nWindowSize = null, 54 | int? nWindowStride = null, 55 | WindowFunction window = WindowFunction.Hann, 56 | FeatureNormalize normalize = FeatureNormalize.PerFeature, 57 | double preNormalize = 0.0, 58 | int? nFFT = null, 59 | double preemph = 0.97, 60 | bool center = true, 61 | int features = 64, 62 | double lowFreq = 0.0, 63 | double? highFreq = null, 64 | bool htk = false, 65 | MelNorm melNorm = MelNorm.Slaney, 66 | bool log = true, 67 | double? logZeroGuardValue = null, 68 | int magPower = 2) 69 | { 70 | _sampleRate = sampleRate; 71 | _preNormalize = preNormalize; 72 | _preemph = preemph; 73 | _window = Window.MakeWindow(window, nWindowSize ?? (int)(windowSize * sampleRate)); 74 | _frameType = GetFrameType(center, preemph); 75 | _nWindowStride = nWindowStride ?? (int)(windowStride * sampleRate); 76 | _normalize = normalize; 77 | if (normalize != FeatureNormalize.PerFeature) 78 | { 79 | throw new ArgumentException("Only FeatureNormalize.PerFeature is supported"); 80 | } 81 | _melBands = MelBands.MakeMelBands( 82 | lowFreq, highFreq ?? sampleRate / 2, 83 | features, 84 | htk ? 
MelScale.HTK : MelScale.Slaney); 85 | _melNorm = melNorm; 86 | _nFFT = nFFT ?? (int)Math.Pow(2, Math.Ceiling(Math.Log(_window.Length, 2))); 87 | _features = features; 88 | _magPower = magPower; 89 | _log = log; 90 | _logZeroGuardValue = logZeroGuardValue ?? Math.Pow(2, -24); 91 | } 92 | 93 | public float[] GetFeatures(Span input) 94 | { 95 | double scale = GetScaleFactor(input); 96 | int outputStep = _features; 97 | int outputLength = GetOutputLength(input); 98 | float[] output = new float[outputStep * outputLength]; 99 | int waveformOffset = 0; 100 | for (int outputOffset = 0; outputOffset < output.Length; outputOffset += outputStep) 101 | { 102 | MelSpectrogramStep(input, waveformOffset, scale, output.AsSpan(outputOffset)); 103 | waveformOffset += _nWindowStride; 104 | } 105 | if (_normalize != FeatureNormalize.None) 106 | { 107 | NormalizeBatch(output, outputStep); 108 | } 109 | return output; 110 | } 111 | 112 | private int GetOutputLength(Span input) 113 | { 114 | if (_frameType == FrameType.Center || _frameType == FrameType.CenterPreemph) 115 | { 116 | return (input.Length + _nWindowStride - 1) / _nWindowStride; 117 | } 118 | else 119 | { 120 | return (input.Length - _window.Length) / _nWindowStride + 1; 121 | } 122 | } 123 | 124 | private double GetScaleFactor(Span input) 125 | { 126 | double scale; 127 | if (_preNormalize > 0) 128 | { 129 | scale = _preNormalize / MaxAbsValue(input); 130 | } 131 | else 132 | { 133 | scale = 1.0 / short.MaxValue; 134 | } 135 | 136 | return scale; 137 | } 138 | 139 | private int MaxAbsValue(Span input) 140 | { 141 | int maxValue = 1; 142 | for (int i = 0; i < input.Length; i++) 143 | { 144 | int value = input[i]; 145 | if (value < 0) value = -value; 146 | if (maxValue < value) maxValue = value; 147 | } 148 | return maxValue; 149 | } 150 | 151 | public void MelSpectrogramStep( 152 | Span input, int waveformOffset, 153 | double scale, Span output) 154 | { 155 | Span temp1 = stackalloc double[_nFFT]; 156 | Span temp2 = 
stackalloc double[_nFFT]; 157 | ReadFrame(input, waveformOffset, scale, temp1); 158 | FFT.CFFT(temp1, temp2, _nFFT); 159 | ToMagnitude(temp2, temp1, _nFFT); 160 | MelBands.ToMelSpectrogram( 161 | temp2, _melBands, _sampleRate, _nFFT, _features, _melNorm, _log, _logZeroGuardValue, temp1); 162 | for (int i = 0; i < _features; i++) output[i] = (float)temp1[i]; 163 | } 164 | 165 | protected void ReadFrame(Span input, int offset, double scale, Span frame) 166 | { 167 | switch (_frameType) 168 | { 169 | case FrameType.None: 170 | ReadFrameNone(input, offset, scale, frame); 171 | break; 172 | case FrameType.Preemph: 173 | throw new NotImplementedException(); 174 | case FrameType.Center: 175 | ReadFrameCenter(input, offset, scale, frame); 176 | break; 177 | case FrameType.CenterPreemph: 178 | ReadFrameCenterPreemphasis(input, offset, scale, frame); 179 | break; 180 | } 181 | } 182 | 183 | private void ReadFrameNone(Span input, int offset, double scale, Span frame) 184 | { 185 | for (int i = 0; i < _window.Length; i++) 186 | { 187 | frame[i] = input[offset + i] * _window[i] * scale; 188 | } 189 | for (int i = _window.Length; i < frame.Length; i++) 190 | { 191 | frame[i] = 0.0; 192 | } 193 | } 194 | 195 | private void ReadFrameCenter(Span input, int offset, double scale, Span frame) 196 | { 197 | int frameOffset = frame.Length / 2 - _window.Length / 2; 198 | for (int i = 0; i < frameOffset; i++) 199 | { 200 | frame[i] = 0; 201 | } 202 | int waveformOffset = offset - _window.Length / 2; 203 | for (int i = 0; i < _window.Length; i++) 204 | { 205 | int k = i + waveformOffset; 206 | double v = k >= 0 && k < input.Length ? 
input[k] : 0; 207 | frame[i + frameOffset] = scale * v * _window[i]; 208 | } 209 | for (int i = frameOffset + _window.Length; i < frame.Length; i++) 210 | { 211 | frame[i] = 0; 212 | } 213 | } 214 | 215 | private void ReadFrameCenterPreemphasis(Span input, int offset, double scale, Span frame) 216 | { 217 | int frameOffset = (frame.Length - 1) / 2 - (_window.Length - 1) / 2; 218 | for (int i = 0; i < frameOffset; i++) 219 | { 220 | frame[i] = 0; 221 | } 222 | int waveformOffset = offset - (_window.Length - 1) / 2; 223 | for (int i = 0; i < _window.Length; i++) 224 | { 225 | int k = i + waveformOffset; 226 | double v = k >= 0 && k < input.Length ? input[k] : 0; 227 | k--; 228 | if (k >= 0 && k < input.Length) v -= _preemph * input[k]; 229 | frame[i + frameOffset] = scale * v * _window[i]; 230 | } 231 | for (int i = frameOffset + _window.Length; i < frame.Length; i++) 232 | { 233 | frame[i] = 0; 234 | } 235 | } 236 | 237 | private void ToMagnitude(Span xr, Span xi, int length) 238 | { 239 | if (_magPower == 2) 240 | { 241 | ToSquareMagnitude(xr, xi, length); 242 | } 243 | else if (_magPower == 1) 244 | { 245 | ToAbsoluteMagnitude(xr, xi, length); 246 | } 247 | else 248 | { 249 | throw new NotImplementedException("power must be 1 or 2."); 250 | } 251 | } 252 | 253 | private static void ToAbsoluteMagnitude(Span xr, Span xi, int length) 254 | { 255 | for (int i = 0; i < length; i++) 256 | { 257 | xr[i] = Math.Sqrt(xr[i] * xr[i] + xi[i] * xi[i]); 258 | } 259 | } 260 | 261 | private static void ToSquareMagnitude(Span xr, Span xi, int length) 262 | { 263 | for (int i = 0; i < length; i++) 264 | { 265 | xr[i] = xr[i] * xr[i] + xi[i] * xi[i]; 266 | } 267 | } 268 | 269 | private void NormalizeBatch(float[] output, int outputStep) 270 | { 271 | int melspecLength = output.Length / outputStep; 272 | for (int i = 0; i < outputStep; i++) 273 | { 274 | double sum = 0; 275 | for (int j = 0; j < melspecLength; j++) 276 | { 277 | double v = output[i + outputStep * j]; 278 | sum += v; 
279 | } 280 | float mean = (float)(sum / melspecLength); 281 | sum = 0; 282 | for (int j = 0; j < melspecLength; j++) 283 | { 284 | double v = output[i + outputStep * j] - mean; 285 | sum += v * v; 286 | } 287 | double std = Math.Sqrt(sum / melspecLength); 288 | float invStd = (float)(1.0 / (FeatureStdOffset + std)); 289 | 290 | for (int j = 0; j < melspecLength; j++) 291 | { 292 | float v = output[i + outputStep * j]; 293 | output[i + outputStep * j] = (v - mean) * invStd; 294 | } 295 | } 296 | } 297 | } 298 | } 299 | -------------------------------------------------------------------------------- /NeMoOnnxSharp/AudioPreprocessing/FFT.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Katsuya Iida. All Rights Reserved. 2 | // See LICENSE in the project root for license information. 3 | 4 | using System; 5 | using System.Collections.Generic; 6 | using System.Text; 7 | 8 | namespace NeMoOnnxSharp.AudioPreprocessing 9 | { 10 | public static class FFT 11 | { 12 | public static void CFFT(Span xr, Span xi, int N) 13 | { 14 | Span t = xi; 15 | xi = xr; 16 | xr = t; 17 | Swap(xr, xi, N); 18 | for (int n = 1; n < N; n *= 2) 19 | { 20 | for (int j = 0; j < N; j += n * 2) 21 | { 22 | for (int k = 0; k < n; k++) 23 | { 24 | double ar = Math.Cos(-Math.PI * k / n); 25 | double ai = Math.Sin(-Math.PI * k / n); 26 | double er = xr[j + k]; 27 | double ei = xi[j + k]; 28 | double or = xr[j + k + n]; 29 | double oi = xi[j + k + n]; 30 | double aor = ar * or - ai * oi; 31 | double aoi = ai * or + ar * oi; 32 | xr[j + k] = er + aor; 33 | xi[j + k] = ei + aoi; 34 | xr[j + k + n] = er - aor; 35 | xi[j + k + n] = ei - aoi; 36 | //Console.WriteLine("{0} {1}", j + k, j + k + n); 37 | } 38 | } 39 | } 40 | } 41 | 42 | public static void DCT2(Span xr, Span xi, int N) 43 | { 44 | // TODO Implement more efficiently. 
45 | for (int i = 0; i < N; i++) 46 | { 47 | double s = 0; 48 | for (int j = 0; j < N; j++) 49 | { 50 | s += xr[j] * Math.Cos(Math.PI * (j + 0.5) * i / N); 51 | } 52 | xi[i] = i == 0 ? s / Math.Sqrt(N) : s / Math.Sqrt(N / 2); 53 | } 54 | } 55 | 56 | private static void Swap(Span xr, Span xi, int N) 57 | { 58 | if (N == 256) 59 | { 60 | Swap256(xr, xi); 61 | } 62 | else if (N == 512) 63 | { 64 | Swap512(xr, xi); 65 | } 66 | else if (N == 1024) 67 | { 68 | Swap1024(xr, xi); 69 | } 70 | else if (N == 2048) 71 | { 72 | Swap2048(xr, xi); 73 | } 74 | else 75 | { 76 | throw new ArgumentException("Only 256, 512, 1024 or 2048 is supported for N"); 77 | } 78 | for (int i = 0; i < N; i++) 79 | { 80 | xi[i] = 0.0; 81 | } 82 | } 83 | 84 | private static void Swap256(Span xr, Span xi) 85 | { 86 | for (int i = 0; i < 256; i++) 87 | { 88 | int j = (i >> 7 & 0x01) 89 | + (i >> 5 & 0x02) 90 | + (i >> 3 & 0x04) 91 | + (i >> 1 & 0x08) 92 | + (i << 1 & 0x10) 93 | + (i << 3 & 0x20) 94 | + (i << 5 & 0x40) 95 | + (i << 7 & 0x80); 96 | xr[i] = xi[j]; 97 | } 98 | } 99 | 100 | private static void Swap512(Span xr, Span xi) 101 | { 102 | for (int i = 0; i < 512; i++) 103 | { 104 | int j = (i >> 8 & 0x01) 105 | + (i >> 6 & 0x02) 106 | + (i >> 4 & 0x04) 107 | + (i >> 2 & 0x08) 108 | + (i & 0x10) 109 | + (i << 2 & 0x20) 110 | + (i << 4 & 0x40) 111 | + (i << 6 & 0x80) 112 | + (i << 8 & 0x100); 113 | xr[i] = xi[j]; 114 | } 115 | } 116 | 117 | private static void Swap1024(Span xr, Span xi) 118 | { 119 | for (int i = 0; i < 1024; i++) 120 | { 121 | int j = (i >> 9 & 0x01) 122 | + (i >> 7 & 0x02) 123 | + (i >> 5 & 0x04) 124 | + (i >> 3 & 0x08) 125 | + (i >> 1 & 0x10) 126 | + (i << 1 & 0x20) 127 | + (i << 3 & 0x40) 128 | + (i << 5 & 0x80) 129 | + (i << 7 & 0x100) 130 | + (i << 9 & 0x200); 131 | xr[i] = xi[j]; 132 | } 133 | } 134 | 135 | private static void Swap2048(Span xr, Span xi) 136 | { 137 | for (int i = 0; i < 2048; i++) 138 | { 139 | int j = (i >> 10 & 0x01) 140 | + (i >> 8 & 0x02) 141 | + (i >> 
6 & 0x04) 142 | + (i >> 4 & 0x08) 143 | + (i >> 2 & 0x10) 144 | + (i & 0x20) 145 | + (i << 2 & 0x40) 146 | + (i << 4 & 0x80) 147 | + (i << 6 & 0x100) 148 | + (i << 8 & 0x200) 149 | + (i << 10 & 0x400); 150 | xr[i] = xi[j]; 151 | } 152 | } 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /NeMoOnnxSharp/AudioPreprocessing/FeatureNormalize.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Katsuya Iida. All Rights Reserved. 2 | // See LICENSE in the project root for license information. 3 | 4 | namespace NeMoOnnxSharp.AudioPreprocessing 5 | { 6 | public enum FeatureNormalize 7 | { 8 | None, 9 | PerFeature, 10 | AllFeatures 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /NeMoOnnxSharp/AudioPreprocessing/HTKMelBands.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Katsuya Iida. All Rights Reserved. 2 | // See LICENSE in the project root for license information. 
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace NeMoOnnxSharp.AudioPreprocessing
{
    /// <summary>Mel band edges on the HTK mel scale.</summary>
    internal static class HTKMelBands
    {
        /// <summary>
        /// Returns <c>nMelBanks + 2</c> frequencies in Hz, evenly spaced on the
        /// mel scale from <paramref name="melMinHz"/> to <paramref name="melMaxHz"/>.
        /// </summary>
        public static double[] MakeMelBands(double melMinHz, double melMaxHz, int nMelBanks)
        {
            double lowMel = HzToMel(melMinHz);
            double highMel = HzToMel(melMaxHz);
            var edges = new double[nMelBanks + 2];
            for (int idx = 0; idx < edges.Length; idx++)
            {
                edges[idx] = MelToHz((highMel - lowMel) * idx / (nMelBanks + 1) + lowMel);
            }
            return edges;
        }

        // mel = 2595 * log10(1 + hz / 700)
        private static double HzToMel(double hz) => 2595 * Math.Log10(1 + hz / 700);

        // Inverse of HzToMel.
        private static double MelToHz(double mel) => (Math.Pow(10, mel / 2595) - 1) * 700;
    }
}

--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/IAudioFeatureBuffer.cs:
--------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace NeMoOnnxSharp.AudioPreprocessing
{
    /// <summary>
    /// Buffer that accepts waveform samples of type <typeparamref name="T"/>
    /// and exposes features of type <typeparamref name="S"/>.
    /// </summary>
    public interface IAudioFeatureBuffer<T, S>
    {
        int NumInputChannels { get; }
        int NumOutputChannels { get; }
        int HopLength { get; }
        int WinLength { get; }

        /// <summary>Number of valid values currently in <see cref="OutputBuffer"/>.</summary>
        int OutputCount { get; }
        S[] OutputBuffer { get; }

        /// <summary>Feeds samples; returns how many were accepted.</summary>
        int Write(T[] waveform, int offset, int count);

        /// <summary>Feeds samples; returns how many were accepted.</summary>
        int Write(Span<T> waveform);

        /// <summary>Discards <paramref name="count"/> values from the output.</summary>
        void ConsumeOutput(int count);
    }
}

--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/IAudioPreprocessor.cs:
--------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace NeMoOnnxSharp.AudioPreprocessing
{
    /// <summary>
    /// Converts a whole waveform of <typeparamref name="T1"/> samples into an
    /// array of <typeparamref name="T2"/> features.
    /// </summary>
    public interface IAudioPreprocessor<T1, T2>
    {
        int SampleRate { get; }

        T2[] GetFeatures(Span<T1> input);
    }
}

--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/IFeaturizer.cs:
--------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.
using System;

namespace NeMoOnnxSharp.AudioPreprocessing
{
    /// <summary>
    /// Converts one fixed-size window of <typeparamref name="T1"/> samples into
    /// <see cref="OutputLength"/> features of type <typeparamref name="T2"/>.
    /// </summary>
    public interface IFeaturizer<T1, T2>
    {
        int SampleRate { get; }
        int InputLength { get; }
        int OutputLength { get; }
        void GetFeatures(Span<T1> input, Span<T2> output);
    }
}
--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/MFCC.cs:
--------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.

using System;

namespace NeMoOnnxSharp.AudioPreprocessing
{
    /// <summary>
    /// Computes MFCC features for a single window of 16-bit PCM samples:
    /// window, FFT, magnitude, log-mel filter bank, orthonormal DCT-II.
    /// </summary>
    public class MFCC : IFeaturizer<short, float>
    {
        private const double InvMaxShort = 1.0 / short.MaxValue;
        private const double LogOffset = 1e-6;

        protected readonly int _sampleRate;
        protected readonly double[] _window;
        protected readonly double[] _melBands;
        protected readonly int _nFFT;
        protected readonly int _nMels;
        private readonly MelNorm _melNorm;
        private readonly int _power;
        // Stored but not consulted: GetFeatures always takes log(mel + LogOffset).
        private readonly bool _logMels;
        private readonly int _nMFCC;

        public int SampleRate => _sampleRate;
        public int InputLength => _window.Length;
        public int OutputLength => _nMFCC;

        /// <summary>Creates the featurizer.</summary>
        /// <exception cref="ArgumentException">
        /// An unsupported option is requested (non DCT-II, normalized STFT,
        /// non-ortho MFCC norm) or <paramref name="nMFCC"/> exceeds
        /// <paramref name="nMels"/>.
        /// </exception>
        public MFCC(
            int sampleRate = 16000,
            WindowFunction window = WindowFunction.Hann,
            int? winLength = null,
            int nFFT = 400,
            int power = 2,
            bool normalized = false,
            double fMin = 0.0,
            double? fMax = null,
            int nMels = 128,
            MelNorm melNorm = MelNorm.None,
            MelScale melScale = MelScale.HTK,
            int nMFCC = 40,
            int dctType = 2,
            MFCCNorm mfccNorm = MFCCNorm.Ortho,
            bool logMels = false)
        {
            if (dctType != 2)
            {
                throw new ArgumentException("Only DCT-II is supported");
            }
            if (normalized)
            {
                throw new ArgumentException("Normalizing by magnitude after stft is not supported");
            }
            if (mfccNorm != MFCCNorm.Ortho)
            {
                throw new ArgumentException("Only Ortho is supported for MFCC norm");
            }
            if (nMFCC > nMels)
            {
                // The cepstrum of nMels bins has only nMels coefficients.
                throw new ArgumentException("Cannot select more MFCC coefficients than mel bins");
            }
            _sampleRate = sampleRate;
            _window = Window.MakeWindow(window, winLength ?? nFFT);
            _melBands = MelBands.MakeMelBands(fMin, fMax ?? sampleRate / 2, nMels, melScale);
            _melNorm = melNorm;
            _nFFT = nFFT;
            _nMels = nMels;
            _power = power;
            _logMels = logMels;
            _nMFCC = nMFCC;
        }

        /// <summary>
        /// Computes the MFCCs of the first <see cref="InputLength"/> samples of
        /// <paramref name="input"/> and writes <see cref="OutputLength"/> values
        /// to <paramref name="output"/>.
        /// </summary>
        public void GetFeatures(Span<short> input, Span<float> output)
        {
            Span<double> temp1 = stackalloc double[_nFFT];
            Span<double> temp2 = stackalloc double[_nFFT];
            ReadFrame(input, temp1);
            // CFFT leaves the real part in temp2 and the imaginary part in temp1.
            FFT.CFFT(temp1, temp2, _nFFT);
            ToMagnitude(temp2, temp1);
            MelBands.ToMelSpectrogram(
                temp2, _melBands, _sampleRate, _nFFT, _nMels, _melNorm, true, LogOffset, temp1);
            // The DCT must span all mel bins; only the first nMFCC coefficients
            // are kept. Transforming just the first nMFCC bins would give a
            // different (wrong) result whenever nMFCC < nMels.
            FFT.DCT2(temp1, temp2, _nMels);
            for (int i = 0; i < _nMFCC; i++) output[i] = (float)temp2[i];
        }

        // Replaces xr with |x| (power 1) or |x|^2 (power 2).
        private void ToMagnitude(Span<double> xr, Span<double> xi)
        {
            if (_power == 2)
            {
                ToSquareMagnitude(xr, xi);
            }
            else if (_power == 1)
            {
                ToAbsoluteMagnitude(xr, xi);
            }
            else
            {
                throw new NotImplementedException("power must be 1 or 2.");
            }
        }

        private static void ToAbsoluteMagnitude(Span<double> xr, Span<double> xi)
        {
            for (int i = 0; i < xr.Length; i++)
            {
                xr[i] = Math.Sqrt(xr[i] * xr[i] + xi[i] * xi[i]);
            }
        }

        private static void ToSquareMagnitude(Span<double> xr, Span<double> xi)
        {
            for (int i = 0; i < xr.Length; i++)
            {
                xr[i] = xr[i] * xr[i] + xi[i] * xi[i];
            }
        }

        // Scales and windows the samples into the middle of the FFT buffer,
        // zero-padding both sides.
        private void ReadFrame(Span<short> waveform, Span<double> frame)
        {
            int frameOffset = frame.Length / 2 - _window.Length / 2;
            frame.Slice(0, frameOffset).Fill(0);
            for (int i = 0; i < _window.Length; i++)
            {
                frame[i + frameOffset] = InvMaxShort * waveform[i] * _window[i];
            }
            frame.Slice(frameOffset + _window.Length).Fill(0);
        }
    }
}

--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/MFCCNorm.cs:
--------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.

namespace NeMoOnnxSharp.AudioPreprocessing
{
    public enum MFCCNorm
    {
        None,
        Ortho
    }
}
--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/MelBands.cs:
--------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace NeMoOnnxSharp.AudioPreprocessing
{
    /// <summary>Mel filter-bank construction and application.</summary>
    public static class MelBands
    {
        /// <summary>
        /// Returns the <c>nMelBanks + 2</c> band-edge frequencies (Hz) for the
        /// requested mel scale.
        /// </summary>
        /// <exception cref="ArgumentException">Unknown <paramref name="melScale"/>.</exception>
        public static double[] MakeMelBands(double melMinHz, double melMaxHz, int nMelBanks, MelScale melScale)
        {
            if (melScale == MelScale.HTK)
            {
                return HTKMelBands.MakeMelBands(melMinHz, melMaxHz, nMelBanks);
            }
            else if (melScale == MelScale.Slaney)
            {
                return SlaneyMelBands.MakeMelBands(melMinHz, melMaxHz, nMelBanks);
            }
            else
            {
                throw new ArgumentException("Unknown mel scale", nameof(melScale));
            }
        }

        /// <summary>
        /// Applies triangular mel filters to <paramref name="spec"/> and writes
        /// <c>log(energy + logOffset)</c> for each of the <paramref name="nMels"/>
        /// bands to <paramref name="melspec"/>.
        /// </summary>
        /// <exception cref="NotImplementedException"><paramref name="log"/> is false.</exception>
        /// <exception cref="ArgumentException">Unknown <paramref name="norm"/>.</exception>
        public static void ToMelSpectrogram(
            Span<double> spec, double[] melBands, double sampleRate,
            int nFFT, int nMels,
            MelNorm norm,
            bool log, double logOffset,
            Span<double> melspec)
        {
            if (!log) throw new NotImplementedException();
            switch (norm)
            {
                case MelNorm.None:
                    ApplyTriangularFilters(spec, melBands, sampleRate, nFFT, nMels, logOffset, melspec, false);
                    break;
                case MelNorm.Slaney:
                    ApplyTriangularFilters(spec, melBands, sampleRate, nFFT, nMels, logOffset, melspec, true);
                    break;
                default:
                    // Previously an unknown value fell through and left melspec
                    // unwritten, so callers consumed stale buffer contents.
                    throw new ArgumentException("Unknown mel norm", nameof(norm));
            }
        }

        // Band i is a triangle rising from melBands[i] to melBands[i + 1] and
        // falling to melBands[i + 2]. With slaneyNorm each weight is additionally
        // multiplied by 2 / (endHz - startHz).
        private static void ApplyTriangularFilters(
            Span<double> spec, double[] melBands, double sampleRate,
            int nFFT, int nMels, double logOffset,
            Span<double> melspec, bool slaneyNorm)
        {
            for (int i = 0; i < nMels; i++)
            {
                double startHz = melBands[i];
                double peakHz = melBands[i + 1];
                double endHz = melBands[i + 2];
                double v = 0.0;
                // First FFT bin strictly above the lower band edge.
                int j = (int)(startHz * nFFT / sampleRate) + 1;
                // Rising slope.
                while (true)
                {
                    double hz = j * sampleRate / nFFT;
                    if (hz > peakHz)
                        break;
                    double r = (hz - startHz) / (peakHz - startHz);
                    double term = spec[j] * r;
                    if (slaneyNorm) term = term * 2 / (endHz - startHz);
                    v += term;
                    j++;
                }
                // Falling slope.
                while (true)
                {
                    double hz = j * sampleRate / nFFT;
                    if (hz > endHz)
                        break;
                    double r = (endHz - hz) / (endHz - peakHz);
                    double term = spec[j] * r;
                    if (slaneyNorm) term = term * 2 / (endHz - startHz);
                    v += term;
                    j++;
                }
                // The value is deliberately rounded through float before storing.
                melspec[i] = (float)Math.Log(v + logOffset);
            }
        }
    }
}

--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/MelNorm.cs:
--------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.

using System;
using System.Collections.Generic;
using System.Text;

namespace NeMoOnnxSharp.AudioPreprocessing
{
    public enum MelNorm
    {
        None,
        Slaney
    }
}

--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/MelScale.cs:
--------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.
namespace NeMoOnnxSharp.AudioPreprocessing
{
    public enum MelScale
    {
        HTK,
        Slaney,
    }
}
--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/SlaneyMelBands.cs:
--------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace NeMoOnnxSharp.AudioPreprocessing
{
    /// <summary>
    /// Mel band edges on the Slaney mel scale: linear below 1000 Hz,
    /// logarithmic above.
    /// </summary>
    internal static class SlaneyMelBands
    {
        /// <summary>
        /// Returns <c>nMelBanks + 2</c> frequencies in Hz, evenly spaced on the
        /// mel scale from <paramref name="melMinHz"/> to <paramref name="melMaxHz"/>.
        /// </summary>
        public static double[] MakeMelBands(double melMinHz, double melMaxHz, int nMelBanks)
        {
            double melMin = HzToMel(melMinHz);
            double melMax = HzToMel(melMaxHz);
            double[] melBanks = new double[nMelBanks + 2];
            for (int i = 0; i < nMelBanks + 2; i++)
            {
                double mel = (melMax - melMin) * i / (nMelBanks + 1) + melMin;
                melBanks[i] = MelToHz(mel);
            }
            return melBanks;
        }

        private static double HzToMel(double hz)
        {
            const double minLogHz = 1000.0; // beginning of log region in Hz
            const double linearMelHz = 200.0 / 3;
            double mel;
            if (hz >= minLogHz)
            {
                // Log region
                const double minLogMel = minLogHz / linearMelHz;
                double logStep = Math.Log(6.4) / 27.0;
                mel = minLogMel + Math.Log(hz / minLogHz) / logStep;
            }
            else
            {
                // Linear region
                mel = hz / linearMelHz;
            }

            return mel;
        }

        private static double MelToHz(double mel)
        {
            const double minLogHz = 1000.0; // beginning of log region in Hz
            const double linearMelHz = 200.0 / 3;
            const double minLogMel = minLogHz / linearMelHz; // same (Mels)
            double freq;

            if (mel >= minLogMel)
            {
                // Log region
                double logStep = Math.Log(6.4) / 27.0;
                freq = minLogHz * Math.Exp(logStep * (mel - minLogMel));
            }
            else
            {
                // Linear region
                freq = linearMelHz * mel;
            }

            return freq;
        }
    }
}

--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/Window.cs:
--------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.

using System;
using System.Collections.Generic;
using System.Text;

namespace NeMoOnnxSharp.AudioPreprocessing
{
    /// <summary>Window function tables.</summary>
    public static class Window
    {
        /// <summary>
        /// Builds a window of <paramref name="length"/> coefficients.
        /// </summary>
        /// <exception cref="ArgumentException">
        /// <paramref name="function"/> is anything other than Hann.
        /// </exception>
        public static double[] MakeWindow(WindowFunction function, int length)
        {
            if (function == WindowFunction.Hann)
            {
                return MakeHannWindow(length);
            }
            else
            {
                throw new ArgumentException("Unknown windows name");
            }
        }

        // Symmetric Hann window: w[i] = 0.5 * (1 - cos(2 * pi * i / (length - 1))).
        private static double[] MakeHannWindow(int length)
        {
            double[] window = new double[length];
            if (length == 1)
            {
                // The formula divides 0 by 0 here and would produce NaN; a
                // one-point window is conventionally a single 1.
                window[0] = 1.0;
                return window;
            }
            for (int i = 0; i < length; i++)
            {
                window[i] = 0.5 * (1 - Math.Cos(2 * Math.PI * i / (length - 1)));
            }
            return window;
        }
    }
}

--------------------------------------------------------------------------------
/NeMoOnnxSharp/AudioPreprocessing/WindowFunction.cs:
--------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.
using System;
using System.Collections.Generic;
using System.Text;

namespace NeMoOnnxSharp.AudioPreprocessing
{
    public enum WindowFunction
    {
        Bartlett,
        Blackman,
        Hamming,
        Hann,
        Kaiser
    }
}

--------------------------------------------------------------------------------
/NeMoOnnxSharp/FrameVAD.cs:
--------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using NeMoOnnxSharp.AudioPreprocessing;
using NeMoOnnxSharp.Models;

namespace NeMoOnnxSharp
{
    /// <summary>
    /// Streaming frame-level voice activity detector. Audio is converted to
    /// MFCC frames, a classification model is run over a sliding window of
    /// frames, and the resulting probabilities are smoothed with a moving
    /// average.
    /// </summary>
    public sealed class FrameVAD : IDisposable
    {
        private readonly int _sampleRate;
        private readonly int _modelWinLength;
        private readonly int _modelHopLength;
        private int _predictIndex;
        private float[] _predictWindow;
        private readonly AudioFeatureBuffer<short, float> _featureBuffer;
        private readonly EncDecClassificationModel _vad;

        /// <param name="config">Configuration of the classification model.</param>
        /// <param name="smoothingWinLength">
        /// Number of most recent predictions averaged for each output value.
        /// </param>
        public FrameVAD(EncDecClassificationConfig config, int smoothingWinLength = 64)
        {
            _sampleRate = 16000;
            _modelWinLength = 32;
            _modelHopLength = 1;
            _predictIndex = 0;
            _predictWindow = new float[smoothingWinLength];
            var mfcc = new MFCC(
                sampleRate: _sampleRate,
                window: WindowFunction.Hann,
                winLength: 400,
                nFFT: 512,
                nMels: 64,
                nMFCC: 64,
                fMin: 0.0,
                fMax: null,
                logMels: true,
                melScale: MelScale.HTK,
                melNorm: MelNorm.None);
            _featureBuffer = new AudioFeatureBuffer<short, float>(
                mfcc,
                hopLength: 160);
            _vad = new EncDecClassificationModel(config);
        }

        /// <summary>Samples consumed per emitted prediction.</summary>
        public int HopLength => _featureBuffer.HopLength * _modelHopLength;

        public int SampleRate => _sampleRate;

        /// <summary>
        /// Position derived from the feature buffer's output position, shifted
        /// by half of the total smoothed span and half of the analysis window.
        /// </summary>
        public int PredictionOffset
        {
            get
            {
                int smoothedSpan = (_predictWindow.Length - 1) * _modelHopLength + _modelWinLength;
                int position = _featureBuffer.OutputPosition;
                int frameShift = smoothedSpan / 2 - _modelWinLength;
                position += _featureBuffer.HopLength * frameShift;
                return position - _featureBuffer.WinLength / 2;
            }
        }

        public void Dispose()
        {
            _vad.Dispose();
        }

        /// <summary>Array overload of <see cref="Transcribe(Span{short})"/>.</summary>
        public float[] Transcribe(short[] input, int offset, int count)
        {
            Span<short> segment = input.AsSpan(offset, count);
            return Transcribe(segment);
        }

        /// <summary>
        /// Feeds samples to the detector and returns one smoothed probability
        /// per model step that became available.
        /// </summary>
        /// <exception cref="InvalidDataException">
        /// The feature buffer accepted no samples.
        /// </exception>
        public float[] Transcribe(Span<short> input)
        {
            var smoothed = new List<float>();
            while (input.Length > 0)
            {
                int accepted = _featureBuffer.Write(input);
                if (accepted == 0)
                {
                    throw new InvalidDataException();
                }
                while (_featureBuffer.OutputCount >= _featureBuffer.NumOutputChannels * _modelWinLength)
                {
                    int windowValues = _featureBuffer.NumOutputChannels * _modelWinLength;
                    var logits = _vad.Predict(_featureBuffer.OutputBuffer.AsSpan(0, windowValues));
                    // 1 / (1 + exp(l0 - l1)) is the two-class softmax probability of class 1.
                    double ratio = Math.Exp(logits[0] - logits[1]);

                    _predictWindow[_predictIndex] = (float)(1 / (ratio + 1));
                    _predictIndex = (_predictIndex + 1) % _predictWindow.Length;
                    smoothed.Add(_predictWindow.Average());
                    _featureBuffer.ConsumeOutput(_featureBuffer.NumOutputChannels * _modelHopLength);
                }
                input = input.Slice(accepted);
            }
            return smoothed.ToArray();
        }
    }
}

--------------------------------------------------------------------------------
/NeMoOnnxSharp/Models/ASRModel.cs:
--------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace NeMoOnnxSharp.Models
{
    /// <summary>Base class for speech recognition models.</summary>
    public abstract class ASRModel : Model
    {
        protected ASRModel(ModelConfig config) : base(config)
        {
        }

        /// <summary>Transcribes PCM samples to text.</summary>
        public abstract string Transcribe(Span<short> inputSignal);

        /// <summary>
        /// Transposes a row-major (rows x nFeatures) matrix into a new
        /// row-major (nFeatures x rows) array of the same total length.
        /// </summary>
        protected float[] TransposeInputSignal(Span<float> inputSignal, int nFeatures)
        {
            var result = new float[inputSignal.Length];
            int frameCount = inputSignal.Length / nFeatures;
            int used = frameCount * nFeatures;
            for (int src = 0; src < used; src++)
            {
                int frame = src / nFeatures;
                int feature = src % nFeatures;
                result[feature * frameCount + frame] = inputSignal[src];
            }
            return result;
        }
    }
}

--------------------------------------------------------------------------------
/NeMoOnnxSharp/Models/CharTokenizer.cs:
--------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace NeMoOnnxSharp.Models
{
    /// <summary>
    /// Character-level tokenizer. A character's id is its index in the
    /// vocabulary string.
    /// </summary>
    public class CharTokenizer
    {
        private const string DefaultVocabulary = "_ abcdefghijklmnopqrstuvwxyz'";
        // Matches a run of two or more identical characters.
        private static readonly Regex MergeRx = new Regex(@"(.)\1+");

        private readonly IDictionary<char, long> _v2i;
        private readonly string _i2v;

        public CharTokenizer() : this(DefaultVocabulary)
        {
        }

        /// <param name="characters">Vocabulary; position in the string is the token id.</param>
        /// <exception cref="ArgumentNullException"><paramref name="characters"/> is null.</exception>
        public CharTokenizer(string characters)
        {
            if (characters == null) throw new ArgumentNullException(nameof(characters));
            // The former constructor also built a never-used Regex by pasting the
            // vocabulary unescaped into a character class, which threw for valid
            // vocabularies containing regex metacharacters such as '-' or ']'.
            _i2v = characters;
            _v2i = new Dictionary<char, long>();
            for (int i = 0; i < _i2v.Length; i++) _v2i[_i2v[i]] = i;
        }

        /// <summary>
        /// Lower-cases and trims <paramref name="text"/>, then maps each
        /// character to its id. Characters not in the vocabulary are dropped.
        /// </summary>
        public long[] Encode(string text)
        {
            // Invariant lower-casing: culture-sensitive ToLower() maps 'I' to a
            // dotless 'ı' under e.g. Turkish culture, which is not in the vocabulary.
            string lower = text.ToLowerInvariant().Trim();
            long[] encoded = new long[lower.Length];
            int j = 0;
            for (int i = 0; i < lower.Length; i++)
            {
                if (_v2i.TryGetValue(lower[i], out encoded[j]))
                {
                    j++;
                }
            }
            return encoded.AsSpan(0, j).ToArray();
        }

        /// <summary>
        /// Maps ids back to characters. Out-of-range ids decode as the
        /// character at index 0.
        /// </summary>
        public string Decode(long[] encoded)
        {
            char[] chars = new char[encoded.Length];
            for (int i = 0; i < chars.Length; i++)
            {
                long index = encoded[i];
                if (index < 0 || index >= _i2v.Length) index = 0;
                chars[i] = _i2v[(int)index];
            }
            return new string(chars);
        }

        /// <summary>
        /// Collapses runs of repeated characters to one and then removes every
        /// '_' character.
        /// </summary>
        public string MergeRepeated(string text)
        {
            return MergeRx.Replace(text, @"$1").Replace("_", "");
        }
    }
}
--------------------------------------------------------------------------------
/NeMoOnnxSharp/Models/EncDecCTCConfig.cs:
--------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.
using System;
using System.Collections.Generic;
using System.Linq;

namespace NeMoOnnxSharp.Models
{
    /// <summary>
    /// Configuration for <see cref="EncDecCTCModel"/>: the model source inherited from
    /// <see cref="ModelConfig"/> plus the output vocabulary.
    /// </summary>
    public class EncDecCTCConfig : ModelConfig
    {
        public const string EnglishVocabulary = " abcdefghijklmnopqrstuvwxyz'_";
        public const string GermanVocabulary = " abcdefghijklmnopqrstuvwxyzäöüß_";

        // Characters in model output order; required by EncDecCTCModel.
        public string? vocabulary;
    }
}

// --------------------------------------------------------------------------------
// /NeMoOnnxSharp/Models/EncDecCTCModel.cs:
// --------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.

using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using NeMoOnnxSharp.AudioPreprocessing;
using System;
using System.Collections.Generic;
using System.Linq;

namespace NeMoOnnxSharp.Models
{
    /// <summary>
    /// Speech-to-text model: mel-spectrogram features are fed to the ONNX session and the
    /// per-frame arg-max of the "logprobs" output is decoded with a <see cref="CharTokenizer"/>.
    /// </summary>
    public sealed class EncDecCTCModel : ASRModel, IDisposable
    {
        private readonly IAudioPreprocessor _preProcessor;
        private readonly CharTokenizer _tokenizer;
        private readonly int _features;

        public IAudioPreprocessor PreProcessor => _preProcessor;
        public int SampleRate => _preProcessor.SampleRate;

        /// <exception cref="ArgumentNullException"><c>config.vocabulary</c> is null.</exception>
        public EncDecCTCModel(EncDecCTCConfig config) : base(config)
        {
            if (config.vocabulary == null)
            {
                // The base constructor has already opened the inference session;
                // release it, since the caller never receives an object to dispose.
                _inferSess.Dispose();
                throw new ArgumentNullException(nameof(config), "config.vocabulary must not be null.");
            }

            _features = 64;
            _preProcessor = new AudioToMelSpectrogramPreprocessor(
                sampleRate: 16000,
                window: WindowFunction.Hann,
                windowSize: 0.02,
                windowStride: 0.01,
                nFFT: 512,
                features: _features);
            _tokenizer = new CharTokenizer(config.vocabulary);
        }

        public void Dispose()
        {
            _inferSess.Dispose();
        }

        /// <summary>
        /// Transcribes 16-bit audio samples to text.
        /// </summary>
        /// <param name="inputSignal">Audio samples handed to the preprocessor.</param>
        public override string Transcribe(Span<short> inputSignal)
        {
            string text = string.Empty;
            var processedSignal = _preProcessor.GetFeatures(inputSignal);
            // The features are transposed so that the tensor below can be
            // shaped { 1, features, frames }.
            processedSignal = TransposeInputSignal(processedSignal, _features);
            var container = new List<NamedOnnxValue>();
            var audioSignalData = new DenseTensor<float>(
                processedSignal,
                new int[3] { 1, _features, processedSignal.Length / _features });
            container.Add(NamedOnnxValue.CreateFromTensor("audio_signal", audioSignalData));
            using (var res = _inferSess.Run(container, new string[] { "logprobs" }))
            {
                var logprobs = res.First();
                long[] preds = ArgMax(logprobs.AsTensor<float>());
                text = _tokenizer.Decode(preds);
                text = _tokenizer.MergeRepeated(text);
            }
            return text;
        }

        // For every index along dimension 1, returns the index along dimension 2 holding
        // the largest value (-1 when no value exceeds float.MinValue).
        private long[] ArgMax(Tensor<float> logprobs)
        {
            long[] preds = new long[logprobs.Dimensions[1]];
            for (int l = 0; l < preds.Length; l++)
            {
                int k = -1;
                float m = float.MinValue;
                for (int j = 0; j < logprobs.Dimensions[2]; j++)
                {
                    if (m < logprobs[0, l, j])
                    {
                        k = j;
                        m = logprobs[0, l, j];
                    }
                }
                preds[l] = k;
            }

            return preds;
        }
    }
}

// --------------------------------------------------------------------------------
// /NeMoOnnxSharp/Models/EncDecClassificationConfig.cs:
// --------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.
using System;
using System.Collections.Generic;
using System.Linq;

namespace NeMoOnnxSharp.Models
{
    /// <summary>
    /// Configuration for <c>EncDecClassificationModel</c>: the model source inherited from
    /// <see cref="ModelConfig"/> plus the class labels, and two ready-made label sets.
    /// </summary>
    public class EncDecClassificationConfig : ModelConfig
    {
        /// <summary>Label set for the speech-commands classifier.</summary>
        public static readonly string[] SpeechCommandsLabels =
        {
            "visual", "wow", "learn", "backward", "dog",
            "two", "left", "happy", "nine", "go",
            "up", "bed", "stop", "one", "zero",
            "tree", "seven", "on", "four", "bird",
            "right", "eight", "no", "six", "forward",
            "house", "marvin", "sheila", "five", "off",
            "three", "down", "cat", "follow", "yes",
        };

        /// <summary>Label set for voice activity detection.</summary>
        public static readonly string[] VADLabels = { "background", "speech" };

        // Labels indexed by the class id the model predicts.
        public string[]? labels;
    }
}

// --------------------------------------------------------------------------------
// /NeMoOnnxSharp/Models/EncDecClassificationModel.cs:
// --------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using NeMoOnnxSharp.AudioPreprocessing;
using System;
using System.Collections.Generic;
using System.Linq;

namespace NeMoOnnxSharp.Models
{
    /// <summary>
    /// Audio classifier: MFCC features are fed to the ONNX session and the arg-max of the
    /// "logits" output selects one of the configured labels.
    /// </summary>
    public sealed class EncDecClassificationModel : ASRModel, IDisposable
    {
        private readonly IAudioPreprocessor _preProcessor;
        private readonly int _nMelBands;
        private readonly string[] _labels;

        public IAudioPreprocessor PreProcessor => _preProcessor;

        /// <exception cref="ArgumentNullException"><c>config.labels</c> is null.</exception>
        public EncDecClassificationModel(EncDecClassificationConfig config) : base(config)
        {
            if (config.labels == null)
            {
                // The base constructor has already opened the inference session;
                // release it, since the caller never receives an object to dispose.
                _inferSess.Dispose();
                throw new ArgumentNullException(nameof(config), "config.labels must not be null.");
            }

            _nMelBands = 64;
            _preProcessor = new AudioToMFCCPreprocessor(
                sampleRate: 16000,
                window: WindowFunction.Hann,
                windowSize: 0.025,
                windowStride: 0.01,
                nFFT: 512,
                //preNormalize: 0.8,
                nMels: 64,
                nMFCC: 64);
            _labels = config.labels;
        }

        public void Dispose()
        {
            _inferSess.Dispose();
        }

        /// <summary>
        /// Classifies 16-bit audio samples and returns the label with the highest score.
        /// </summary>
        public override string Transcribe(Span<short> inputSignal)
        {
            string text = string.Empty;
            var processedSignal = _preProcessor.GetFeatures(inputSignal);
            processedSignal = TransposeInputSignal(processedSignal, _nMelBands);
            using (var res = _inferSess.Run(CreateInputs(processedSignal), new string[] { "logits" }))
            {
                var scoreTensor = res.First();
                long pred = ArgMax(scoreTensor.AsTensor<float>());
                text = _labels[pred];
            }
            return text;
        }

        /// <summary>
        /// Runs the model on already extracted features and returns the raw logits.
        /// </summary>
        /// <param name="processedSignal">Features laid out as rows of 64 values; transposed before inference.</param>
        public float[] Predict(Span<float> processedSignal)
        {
            var transposedProcessedSignal = TransposeInputSignal(processedSignal, _nMelBands);
            float[] logits;
            using (var res = _inferSess.Run(CreateInputs(transposedProcessedSignal), new string[] { "logits" }))
            {
                var logitsTensor = res.First();
                logits = logitsTensor.AsTensor<float>().ToArray();
            }
            return logits;
        }

        // Wraps transposed features in the single "audio_signal" input,
        // shaped { 1, features, frames }.
        private List<NamedOnnxValue> CreateInputs(float[] transposedSignal)
        {
            var audioSignalData = new DenseTensor<float>(
                transposedSignal,
                new int[3] { 1, _nMelBands, transposedSignal.Length / _nMelBands });
            return new List<NamedOnnxValue>
            {
                NamedOnnxValue.CreateFromTensor("audio_signal", audioSignalData)
            };
        }

        // Index along dimension 1 of the largest score (-1 when no value exceeds float.MinValue).
        private long ArgMax(Tensor<float> score)
        {
            int k = -1;
            float m = float.MinValue;
            for (int j = 0; j < score.Dimensions[1]; j++)
            {
                if (m < score[0, j])
                {
                    k = j;
                    m = score[0, j];
                }
            }
            return k;
        }
    }
}

// --------------------------------------------------------------------------------
// /NeMoOnnxSharp/Models/Model.cs:
// --------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.

using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;

namespace NeMoOnnxSharp.Models
{
    /// <summary>
    /// Base class owning the ONNX Runtime inference session shared by all models.
    /// </summary>
    public abstract class Model
    {
        protected readonly InferenceSession _inferSess;

        /// <summary>
        /// Opens the session from <c>config.model</c> (in-memory bytes) when present,
        /// otherwise from <c>config.modelPath</c>.
        /// </summary>
        /// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
        /// <exception cref="InvalidDataException">Neither a model nor a model path is configured.</exception>
        protected Model(ModelConfig config)
        {
            if (config == null) throw new ArgumentNullException(nameof(config));

            if (config.model != null)
            {
                _inferSess = new InferenceSession(config.model);
            }
            else if (config.modelPath != null)
            {
                _inferSess = new InferenceSession(config.modelPath);
            }
            else
            {
                throw new InvalidDataException("ModelConfig must provide either model or modelPath.");
            }
        }
    }
}

// --------------------------------------------------------------------------------
// /NeMoOnnxSharp/Models/ModelConfig.cs:
// --------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.

using System;
using System.Collections.Generic;
using System.Linq;

namespace NeMoOnnxSharp.Models
{
    /// <summary>
    /// Where a model is loaded from. <c>Model</c> prefers <see cref="model"/> over
    /// <see cref="modelPath"/> when both are set.
    /// </summary>
    public class ModelConfig
    {
        public string? modelPath;
        public byte[]? model;
    }
}

// --------------------------------------------------------------------------------
// /NeMoOnnxSharp/Models/SpectrogramGenerator.cs:
// --------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.

using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using NeMoOnnxSharp.TTSTokenizers;
using System;
using System.Collections.Generic;
using System.Linq;

namespace NeMoOnnxSharp.Models
{
    /// <summary>
    /// Text-to-spectrogram model: tokenizes text and runs the ONNX session to obtain
    /// the "spect" output.
    /// </summary>
    public sealed class SpectrogramGenerator : Model, IDisposable
    {
        private readonly BaseTokenizer _tokenizer;

        /// <exception cref="ArgumentNullException">A path required by the selected tokenizer is missing.</exception>
        /// <exception cref="ArgumentException"><c>config.textTokenizer</c> names no supported tokenizer.</exception>
        public SpectrogramGenerator(SpectrogramGeneratorConfig config) : base(config)
        {
            try
            {
                _tokenizer = _SetupTokenizer(config);
            }
            catch
            {
                // The base constructor has already opened the inference session;
                // release it, since the caller never receives an object to dispose.
                _inferSess.Dispose();
                throw;
            }
        }

        public void Dispose()
        {
            _inferSess.Dispose();
        }

        /// <summary>
        /// Converts text to token ids with the configured tokenizer.
        /// </summary>
        /// <param name="strInput">Text to tokenize.</param>
        /// <param name="normalize">Whether to run text normalization first (currently a pass-through).</param>
        public int[] Parse(string strInput, bool normalize = true)
        {
            if (normalize)
            {
                strInput = _NormalizeText(strInput);
            }
            var encoded = _tokenizer.Encode(strInput);
            return encoded;
        }

        // Instantiates the tokenizer named by config.textTokenizer.
        private static BaseTokenizer _SetupTokenizer(SpectrogramGeneratorConfig config)
        {
            BaseTokenizer tokenizer;
            if (config.textTokenizer == "EnglishPhonemesTokenizer")
            {
                if (config.phonemeDictPath == null)
                {
                    throw new ArgumentNullException(
                        nameof(config), "config.phonemeDictPath is required by EnglishPhonemesTokenizer.");
                }
                if (config.heteronymsPath == null)
                {
                    throw new ArgumentNullException(
                        nameof(config), "config.heteronymsPath is required by EnglishPhonemesTokenizer.");
                }
                var g2p = new EnglishG2p(
                    phonemeDict: config.phonemeDictPath,
                    heteronyms: config.heteronymsPath,
                    phonemeProbability: 1.0);
                tokenizer = new EnglishPhonemesTokenizer(
                    g2p,
                    punct: true,
                    stresses: true,
                    chars: true,
                    apostrophe: true,
                    padWithSpace: true,
                    addBlankAt: BaseTokenizer.AddBlankAt.True);
            }
            else if (config.textTokenizer == "GermanCharsTokenizer")
            {
                tokenizer = new GermanCharsTokenizer(
                    padWithSpace: true);
            }
            else
            {
                throw new ArgumentException(
                    "Unsupported textTokenizer: " + (config.textTokenizer ?? "(null)"),
                    nameof(config));
            }
            return tokenizer;
        }

        // Placeholder: returns the input unchanged.
        private string _NormalizeText(string strInput)
        {
            return strInput;
        }

        /// <summary>
        /// Runs the model on token ids and returns the flattened "spect" output.
        /// </summary>
        /// <param name="tokens">Token ids as produced by <see cref="Parse"/>.</param>
        /// <param name="pace">Value passed as the model's "pace" input.</param>
        /// <exception cref="ArgumentNullException"><paramref name="tokens"/> is null.</exception>
        public float[] GenerateSpectrogram(int[] tokens, double pace = 1.0)
        {
            if (tokens == null) throw new ArgumentNullException(nameof(tokens));

            var container = new List<NamedOnnxValue>();
            var textData = new DenseTensor<long>(
                tokens.Select(p => (long)p).ToArray(),
                new int[2] { 1, tokens.Length });
            container.Add(NamedOnnxValue.CreateFromTensor("text", textData));
            var paceData = new DenseTensor<float>(
                new float[] { (float)pace },
                new int[2] { 1, 1 });
            container.Add(NamedOnnxValue.CreateFromTensor("pace", paceData));
            // One zero per token: a freshly allocated array is already zero-filled.
            var pitchData = new DenseTensor<float>(
                new float[tokens.Length],
                new int[2] { 1, tokens.Length });
            container.Add(NamedOnnxValue.CreateFromTensor("pitch", pitchData));
            float[] spec;
            using (var res = _inferSess.Run(container, new string[] { "spect" }))
            {
                var spectTensor = res.First().AsTensor<float>();
                spec = spectTensor.ToArray();
            }
            return spec;
        }
    }
}

// --------------------------------------------------------------------------------
// /NeMoOnnxSharp/Models/SpectrogramGeneratorConfig.cs:
// --------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;

namespace NeMoOnnxSharp.Models
{
    /// <summary>
    /// Configuration for <see cref="SpectrogramGenerator"/>: the model source inherited from
    /// <see cref="ModelConfig"/>, the tokenizer name and the dictionary files it needs.
    /// </summary>
    public class SpectrogramGeneratorConfig : ModelConfig
    {
        public string? phonemeDictPath;
        public string? heteronymsPath;
        public string? textTokenizer;
    }
}

// --------------------------------------------------------------------------------
// /NeMoOnnxSharp/Models/Vocoder.cs:
// --------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.

using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using System;
using System.Collections.Generic;
using System.Linq;

namespace NeMoOnnxSharp.Models
{
    /// <summary>
    /// Spectrogram-to-audio model: feeds a spectrogram to the ONNX session and converts
    /// the "audio" output to 16-bit samples.
    /// </summary>
    public sealed class Vocoder : Model, IDisposable
    {
        private readonly int _nfilt;
        private readonly int _sampleRate;

        public Vocoder(VocoderConfig config) : base(config)
        {
            _nfilt = 80;
            _sampleRate = 22050;
        }

        public int SampleRate { get { return _sampleRate; } }

        public void Dispose()
        {
            _inferSess.Dispose();
        }

        /// <summary>
        /// Converts a flattened spectrogram to 16-bit audio samples.
        /// </summary>
        /// <param name="spec">Spectrogram values; reshaped to { 1, 80, spec.Length / 80 }.</param>
        /// <returns>
        /// One sample per model output value, scaled by <see cref="short.MaxValue"/>.
        /// Values outside [-1, 1] are clamped instead of overflowing the conversion.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="spec"/> is null.</exception>
        public short[] ConvertSpectrogramToAudio(float[] spec)
        {
            if (spec == null) throw new ArgumentNullException(nameof(spec));

            var container = new List<NamedOnnxValue>();
            var specData = new DenseTensor<float>(
                spec,
                new int[3] { 1, _nfilt, spec.Length / _nfilt });
            container.Add(NamedOnnxValue.CreateFromTensor("spec", specData));
            float[] audio;
            using (var res = _inferSess.Run(container, new string[] { "audio" }))
            {
                var audioTensor = res.First().AsTensor<float>();
                audio = audioTensor.ToArray();
            }
            return audio.Select(ToPcm16).ToArray();
        }

        // Scales a sample to the 16-bit range. An out-of-range float-to-short conversion
        // has an unspecified result in an unchecked context, so the sample is clamped first;
        // NaN becomes silence.
        private static short ToPcm16(float sample)
        {
            if (float.IsNaN(sample)) return 0;
            float clamped = Math.Max(-1.0f, Math.Min(1.0f, sample));
            return (short)(clamped * short.MaxValue);
        }
    }
}

// --------------------------------------------------------------------------------
// /NeMoOnnxSharp/Models/VocoderConfig.cs:
--------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.

using System;
using System.Collections.Generic;
using System.Linq;

namespace NeMoOnnxSharp.Models
{
    // Configuration type for the Vocoder model. It declares no members of its own;
    // everything it carries comes from ModelConfig.
    public class VocoderConfig : ModelConfig
    {
    }
}

--------------------------------------------------------------------------------
/NeMoOnnxSharp/NeMoOnnxSharp.csproj:
--------------------------------------------------------------------------------



netstandard2.1
enable
$(VersionPrefix)1.3.0
https://github.com/kaiidams/NeMoOnnxSharp
nemo onnx text-to-speech csharp speech tts speech-synthesis speech-recognition asr
Copyright (C) 2022 Katsuya Iida. All rights reserved.
Text-to-speech and speech recognition, VAD with NVIDIA NeMo and ONNX Runtime for .NET Core.
https://github.com/kaiidams/NeMoOnnxSharp
Katsuya Iida

Apache-2.0








--------------------------------------------------------------------------------
/NeMoOnnxSharp/SpeechConfig.cs:
--------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using NeMoOnnxSharp.Models;

namespace NeMoOnnxSharp
{
    /// <summary>
    /// Bundles the configuration of every model used by the speech recognizer and the
    /// speech synthesizer. Each part starts out as an empty configuration object.
    /// </summary>
    public class SpeechConfig
    {
        public EncDecClassificationConfig vad = new EncDecClassificationConfig();
        public EncDecCTCConfig asr = new EncDecCTCConfig();
        public SpectrogramGeneratorConfig specGen = new SpectrogramGeneratorConfig();
        public VocoderConfig vocoder = new VocoderConfig();

        public SpeechConfig()
        {
        }
    }
}

// --------------------------------------------------------------------------------
// /NeMoOnnxSharp/SpeechRecognitionEventArgs.cs:
// --------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;

namespace NeMoOnnxSharp
{
    /// <summary>
    /// Payload of the recognizer events: a position in the audio stream and, when
    /// available, the recognised text and the audio it was recognised from.
    /// </summary>
    public class SpeechRecognitionEventArgs
    {
        public SpeechRecognitionEventArgs(ulong offset, string? text = null, short[]? audio = null)
        {
            Audio = audio;
            Text = text;
            Offset = offset;
        }

        /// <summary>Position the event refers to.</summary>
        public ulong Offset { get; }

        /// <summary>Recognised text, or null when the event carries none.</summary>
        public string? Text { get; }

        /// <summary>Audio samples the text was recognised from, or null.</summary>
        public short[]? Audio { get; }
    }
}

// --------------------------------------------------------------------------------
// /NeMoOnnxSharp/SpeechRecognizer.cs:
// --------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.InteropServices;
using NeMoOnnxSharp.Models;

namespace NeMoOnnxSharp
{
    /// <summary>
    /// Streaming speech recognizer. Audio written to it is buffered and passed to a
    /// frame-level voice activity detector; when the detector's probability drops below
    /// the end threshold, the buffered speech is transcribed and <see cref="Recognized"/>
    /// is raised.
    /// </summary>
    public sealed class SpeechRecognizer : IDisposable
    {
        private readonly FrameVAD _frameVad;
        private readonly EncDecCTCModel _asrModel;
        private readonly int _audioBufferIncrease;
        private readonly int _audioBufferSize;
        int _audioBufferIndex;
        long _currentPosition;
        byte[] _audioBuffer;
        bool _isSpeech;
        private readonly float _speechStartThreadhold;
        private readonly float _speechEndThreadhold;

        public SpeechRecognizer(SpeechConfig config)
        {
            _frameVad = new FrameVAD(config.vad);
            try
            {
                _asrModel = new EncDecCTCModel(config.asr);
            }
            catch
            {
                // Do not leak the VAD when the ASR model cannot be created.
                _frameVad.Dispose();
                throw;
            }
            _currentPosition = 0;
            _audioBufferIndex = 0;
            _audioBufferSize = sizeof(short) * _frameVad.SampleRate * 2; // 2sec
            _audioBufferIncrease = sizeof(short) * 5 * _frameVad.SampleRate; // 5sec
            _audioBuffer = new byte[_audioBufferSize];
            _isSpeech = false;
            _speechStartThreadhold = 0.7f;
            _speechEndThreadhold = 0.3f;
        }

        public int SampleRate => _frameVad.SampleRate;

        /// <summary>Raised with the transcription and audio of a finished speech segment.</summary>
        public event EventHandler<SpeechRecognitionEventArgs>? Recognized;

        /// <summary>Raised when the VAD probability reaches the start threshold.</summary>
        public event EventHandler<SpeechRecognitionEventArgs>? SpeechStartDetected;

        /// <summary>Raised when the VAD probability falls below the end threshold.</summary>
        public event EventHandler<SpeechRecognitionEventArgs>? SpeechEndDetected;

        public void Dispose()
        {
            _frameVad.Dispose();
            _asrModel.Dispose();
        }

        /// <summary>Writes 16-bit audio given as raw bytes.</summary>
        public void Write(byte[] input, int offset, int count)
        {
            Write(input.AsSpan(offset, count));
        }

        /// <summary>Writes 16-bit audio samples.</summary>
        public void Write(short[] input, int offset, int count)
        {
            Write(input.AsSpan(offset, count));
        }

        /// <summary>Writes 16-bit audio samples.</summary>
        public void Write(Span<short> input)
        {
            var bytes = MemoryMarshal.Cast<short, byte>(input);
            Write(bytes);
        }

        /// <summary>
        /// Writes 16-bit audio given as raw bytes. Outside speech the internal buffer is
        /// reused from its start once full; during speech it grows so that the whole
        /// segment is kept.
        /// </summary>
        public void Write(Span<byte> input)
        {
            while (input.Length > 0)
            {
                int len = input.Length;
                if (_isSpeech)
                {
                    if (len > _audioBuffer.Length - _audioBufferIndex)
                    {
                        // Grow by the usual increment, or by more when the chunk needs it;
                        // a single fixed increment is not enough for a large chunk.
                        int newLength = Math.Max(
                            _audioBuffer.Length + _audioBufferIncrease,
                            _audioBufferIndex + len);
                        var tmp = new byte[newLength];
                        Array.Copy(_audioBuffer, tmp, _audioBufferIndex);
                        _audioBuffer = tmp;
                    }
                }
                else
                {
                    if (_audioBufferIndex >= _audioBuffer.Length)
                    {
                        _audioBufferIndex = 0;
                    }
                    len = Math.Min(_audioBuffer.Length - _audioBufferIndex, len);
                }
                input.Slice(0, len).CopyTo(_audioBuffer.AsSpan(_audioBufferIndex, len));
                input = input.Slice(len);
                // Only whole samples are handed to the detector.
                int len2 = (len / sizeof(short)) * sizeof(short);
                var audioSignal = MemoryMarshal.Cast<byte, short>(_audioBuffer.AsSpan(_audioBufferIndex, len2));
                _audioBufferIndex += len;
                _currentPosition += audioSignal.Length;
                _Transcribe(audioSignal);
            }
        }

        // Feeds freshly written samples to the VAD and reacts to threshold crossings.
        // pos is the sample position of the current VAD result relative to the end of
        // the written data (zero or negative), advanced by the VAD hop length per result.
        private void _Transcribe(Span<short> audioSignal)
        {
            var pos = -(audioSignal.Length + _frameVad.PredictionOffset);
            var result = _frameVad.Transcribe(audioSignal);
            foreach (var prob in result)
            {
                if (_isSpeech)
                {
                    if (prob < _speechEndThreadhold)
                    {
                        _isSpeech = false;
                        int posBytes = pos * sizeof(short);
                        if (Recognized != null)
                        {
                            var audio = _audioBuffer.AsSpan(0, _audioBufferIndex + posBytes);
                            var x = MemoryMarshal.Cast<byte, short>(audio).ToArray();
                            string predictText = _asrModel.Transcribe(x);
                            Recognized(this, new SpeechRecognitionEventArgs(
                                (ulong)(_currentPosition + pos), predictText, x));
                        }
                        if (SpeechEndDetected != null)
                        {
                            SpeechEndDetected(this, new SpeechRecognitionEventArgs(
                                (ulong)(_currentPosition + pos)));
                        }
                        _ResetAudioBuffer(posBytes);
                    }
                }
                else
                {
                    if (prob >= _speechStartThreadhold)
                    {
                        _isSpeech = true;
                        if (SpeechStartDetected != null)
                        {
                            SpeechStartDetected(this, new SpeechRecognitionEventArgs(
                                (ulong)(_currentPosition + pos)));
                        }
                        int pos2 = pos * sizeof(short);
                        _ChangeAudioBufferForSpeech(pos2);
                    }
                }
                pos += _frameVad.HopLength;
            }
        }

        // After a speech segment ends: start over with a buffer of the initial size that
        // holds only the -posBytes bytes written after the end position.
        private void _ResetAudioBuffer(int posBytes)
        {
            var tmp = new byte[_audioBufferSize];
            Array.Copy(
                _audioBuffer, _audioBufferIndex + posBytes,
                tmp, 0,
                -posBytes);
            _audioBuffer = tmp;
            _audioBufferIndex = -posBytes;
        }

        // When speech starts: rearrange the buffer so that it begins at the speech start
        // position (posBytes bytes back from the write index, zero or negative).
        private void _ChangeAudioBufferForSpeech(int posBytes)
        {
            int audioBufferStart = _audioBufferIndex + posBytes;
            int audioBufferEnd = _audioBufferIndex;
            if (audioBufferStart >= 0)
            {
                // The start lies in the current pass over the buffer: shift it to the front.
                Array.Copy(
                    _audioBuffer, audioBufferStart,
                    _audioBuffer, 0,
                    audioBufferEnd - audioBufferStart);
                _audioBufferIndex = audioBufferEnd - audioBufferStart;
            }
            else if (audioBufferStart + _audioBuffer.Length >= audioBufferEnd)
            {
                // The start lies in the previous pass and has not been overwritten yet:
                // take the tail from there, then the head up to the write index.
                var tmp = new byte[_audioBuffer.Length + _audioBufferIncrease];
                Array.Copy(
                    _audioBuffer, audioBufferStart + _audioBuffer.Length,
                    tmp, 0,
                    -audioBufferStart);
                Array.Copy(
                    _audioBuffer, 0,
                    tmp, -audioBufferStart,
                    audioBufferEnd);
                _audioBuffer = tmp;
                _audioBufferIndex = audioBufferEnd - audioBufferStart;
            }
            else
            {
                // The start has already been overwritten: keep everything still in the
                // buffer, oldest data first.
                int oldLength = _audioBuffer.Length;
                var tmp = new byte[oldLength + _audioBufferIncrease];
                Array.Copy(
                    _audioBuffer, audioBufferEnd,
                    tmp, 0,
                    oldLength - audioBufferEnd);
                Array.Copy(
                    _audioBuffer, 0,
                    tmp, oldLength - audioBufferEnd,
                    audioBufferEnd);
                _audioBuffer = tmp;
                // Exactly oldLength bytes were copied. Reading the length from the
                // enlarged buffer would leave a gap of zeros inside the speech segment.
                _audioBufferIndex = oldLength;
            }
        }
    }
}

// --------------------------------------------------------------------------------
// /NeMoOnnxSharp/SpeechSynthesisResult.cs:
// --------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;

namespace NeMoOnnxSharp
{
    /// <summary>
    /// Synthesized audio together with its sample rate.
    /// </summary>
    public class SpeechSynthesisResult
    {
        public SpeechSynthesisResult()
        {
        }

        public short[]? AudioData { get; set; }
        public int SampleRate { get; set; }
    }
}

// --------------------------------------------------------------------------------
// /NeMoOnnxSharp/SpeechSynthesizer.cs:
// --------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using NeMoOnnxSharp.Models;

namespace NeMoOnnxSharp
{
    /// <summary>
    /// Text-to-speech pipeline: a spectrogram generator followed by a vocoder.
    /// </summary>
    public sealed class SpeechSynthesizer : IDisposable
    {
        private readonly SpectrogramGenerator _specGen;
        private readonly Vocoder _vocoder;

        public SpeechSynthesizer(SpeechConfig config)
        {
            _specGen = new SpectrogramGenerator(config.specGen);
            try
            {
                _vocoder = new Vocoder(config.vocoder);
            }
            catch
            {
                // Do not leak the first model when the second cannot be created.
                _specGen.Dispose();
                throw;
            }
        }

        public void Dispose()
        {
            _specGen.Dispose();
            _vocoder.Dispose();
        }

        /// <summary>
        /// Synthesizes <paramref name="text"/>.
        /// </summary>
        /// <returns>The audio samples and the vocoder's sample rate.</returns>
        public SpeechSynthesisResult SpeakText(string text)
        {
            var parsed = _specGen.Parse(text);
            var spec = _specGen.GenerateSpectrogram(parsed, pace: 1.0);
            var audio = _vocoder.ConvertSpectrogramToAudio(spec);
            return new SpeechSynthesisResult()
            {
                AudioData = audio,
                SampleRate = _vocoder.SampleRate
            };
        }
    }
}

// --------------------------------------------------------------------------------
// /NeMoOnnxSharp/TTSTokenizers/BaseCharsTokenizerr.cs:
// --------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.
// A number of implementation details in this file have been translated from the Python scripts of NVIDIA NeMo,
// largely located in the files found in this folder:
//
// https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/torch/tts_tokenizers.py
//
// The origin has the following copyright notice and license:
//
// https://github.com/NVIDIA/NeMo/blob/main/LICENSE
//

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace NeMoOnnxSharp.TTSTokenizers
{
    // nemo.collections.tts.torch.tts_tokenizers.BaseCharsTokenizer

    /// <summary>
    /// Character tokenizer. The vocabulary is, in order: space, the given characters,
    /// optionally the apostrophe, optionally punctuation, then the pad token, optionally
    /// the blank token, and the out-of-vocabulary token.
    /// </summary>
    public class BaseCharsTokenizer : BaseTokenizer
    {
        /// <param name="chars">Characters of the vocabulary, one token each.</param>
        /// <param name="punct">Whether punctuation tokens are part of the vocabulary.</param>
        /// <param name="nonDefaultPunctList">Punctuation tokens to use instead of the built-in list.</param>
        /// <param name="apostrophe">Whether the apostrophe is part of the vocabulary.</param>
        /// <param name="oov">Token text used for the out-of-vocabulary entry.</param>
        /// <param name="sep">Separator placed between tokens when decoding.</param>
        /// <param name="addBlankAt">Whether a blank token is added after the pad token.</param>
        /// <param name="padWithSpace">Whether encoded text is wrapped in a leading and a trailing space.</param>
        /// <exception cref="NotImplementedException"><paramref name="addBlankAt"/> is <c>AddBlankAt.Last</c>.</exception>
        public BaseCharsTokenizer(
            string chars,
            bool punct = true,
            string[]? nonDefaultPunctList = null,
            bool apostrophe = true,
            string oov = OOV,
            string sep = "|",  // To be able to distinguish between 2/3 letters codes.
            AddBlankAt addBlankAt = AddBlankAt.None,
            bool padWithSpace = false)
            // object? text_preprocessing_func=lambda text: english_text_preprocessing(text, lower=false),
        {
            _space = 0;
            var tokens = new List<string>();
            tokens.Add(" ");
            tokens.AddRange(chars.Select(ch => ch.ToString()));
            if (apostrophe)
            {
                tokens.Add("'");  // Apostrophe for saving "don't" and "Joe's"
            }

            if (punct)
            {
                if (nonDefaultPunctList != null)
                {
                    tokens.AddRange(nonDefaultPunctList);
                }
                else
                {
                    tokens.AddRange(PunctList);
                }
            }

            // Each id is the list size before its token is appended, i.e. the index the
            // token ends up at. Reading Count after the Add yields the next token's index.
            _pad = tokens.Count;
            tokens.Add(Pad);
            bool hasBlank = addBlankAt != AddBlankAt.None;
            if (hasBlank)
            {
                _blank = tokens.Count;
                tokens.Add(Blank);
            }

            _oov = tokens.Count;
            tokens.Add(oov);  // Out Of Vocabulary

            if (addBlankAt == AddBlankAt.Last)
            {
                throw new NotImplementedException();
            }

            _sep = sep;
            _punct = punct;
            _padWithSpace = padWithSpace;

            _id2token = tokens.ToArray();
            _token2id = new Dictionary<string, int>(
                Enumerable.Range(0, _id2token.Length)
                .Select(i => new KeyValuePair<string, int>(_id2token[i], i)));
            // Without a blank token _blank keeps its default of 0, which is the id of the
            // space token; listing it as a utility id would make Decode drop every space.
            _utilIds = new HashSet<int>() { _pad, _oov };
            if (hasBlank)
            {
                _utilIds.Add(_blank);
            }
        }

        /// <summary>
        /// Converts text to token ids. Runs of spaces collapse to one, leading and trailing
        /// spaces are dropped, and characters outside the vocabulary are skipped.
        /// </summary>
        public override int[] Encode(string text)
        {
            var cs = new List<string>();
            var space = _id2token[_space];

            text = TextPreprocessingFunc(text);
            foreach (var c_ in text)
            {
                string c = c_.ToString();

                // Add a whitespace if the current char is a whitespace while the previous char is not a whitespace.
                if (c == space && cs.Count > 0 && cs[cs.Count - 1] != space)
                {
                    cs.Add(c);
                }
                // Add the current char that is an alphanumeric or an apostrophe.
                else if ((char.IsLetterOrDigit(c, 0) || c == "'") && _token2id.ContainsKey(c))
                {
                    cs.Add(c);
                }
                // Add a punctuation that has a single char.
                else if (!char.IsLetterOrDigit(c, 0) && _token2id.ContainsKey(c) && _punct)
                {
                    cs.Add(c);
                }
                // Warn about unknown char
                else if (c != space)
                {
                    // Text: [{text}] contains unknown char: [{c}]. Symbol will be skipped.
                }
            }

            // Remove trailing spaces
            if (cs.Count > 0)
            {
                while (cs[cs.Count - 1] == space)
                {
                    cs.RemoveAt(cs.Count - 1);
                }
            }

            if (_padWithSpace)
            {
                cs.Insert(0, space);
                cs.Add(space);
            }
            return cs.Select(c => _token2id[c]).ToArray();
        }

        /// <summary>Text normalization applied before tokenizing; subclasses may override it.</summary>
        protected virtual string TextPreprocessingFunc(string text)
        {
            return TokenizerUtils.AnyLocaleTextPreprocessing(text);
        }

        private readonly string[] PunctList =
        {  // Derived from LJSpeech and "/" additionally
            ",", ".", "!", "?", "-",
            ":", ";", "/", "\"", "(",
            ")", "[", "]", "{", "}",
        };

        private readonly bool _punct;
    }
}

// --------------------------------------------------------------------------------
// /NeMoOnnxSharp/TTSTokenizers/BaseTokenizer.cs:
// --------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.
// A number of implementation details in this file have been translated from the Python scripts of NVIDIA NeMo,
// largely located in the files found in this folder:
//
// https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/torch/tts_tokenizers.py
//
// The origin has the following copyright notice and license:
//
// https://github.com/NVIDIA/NeMo/blob/main/LICENSE
//

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace NeMoOnnxSharp.TTSTokenizers
{
    /// <summary>
    /// Common state and decoding logic for the text tokenizers.
    /// </summary>
    /// <remarks>
    /// NOTE(review): in this copy of the file the generic type arguments are absent and the
    /// three token constants are empty strings - presumably angle-bracketed text was lost
    /// during extraction. Confirm against the repository before building.
    /// </remarks>
    public abstract class BaseTokenizer
    {
        /// <summary>Whether and where a blank token is added to the vocabulary.</summary>
        public enum AddBlankAt
        {
            None,
            True,
            Last
        }

        protected const string Pad = "";
        protected const string Blank = "";
        protected const string OOV = "";

        /// <summary>Starts with an empty vocabulary; subclasses fill in the fields.</summary>
        protected BaseTokenizer()
        {
            _utilIds = new HashSet();
            _token2id = new Dictionary();
            _id2token = Array.Empty();
            _sep = string.Empty;
        }

        /// <summary>
        /// Turns str text into int tokens.
        /// </summary>
        public abstract int[] Encode(string text);

        /// <summary>
        /// Turns ints tokens into str text.
        /// </summary>
        public string Decode(int[] tokens)
        {
            // Utility ids are skipped; the remaining tokens are joined with the separator.
            var visibleTokens = tokens
                .Where(id => !_utilIds.Contains(id))
                .Select(id => _id2token[id]);
            return string.Join(_sep, visibleTokens);
        }

        public string[] Tokens => _id2token;
        public int PadId => _pad;
        public int BlankId => _blank;
        public int OOVId => _oov;
        public string Sep => _sep;

        protected string[] _id2token;
        protected IDictionary _token2id;
        protected ISet _utilIds;
        protected int _space;
        protected int _pad;
        protected int _blank;
        protected int _oov;
        protected string _sep;
        protected bool _padWithSpace;
    }
}

// --------------------------------------------------------------------------------
// /NeMoOnnxSharp/TTSTokenizers/EnglishG2p.cs:
// --------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.

// A number of implementation details in this file have been translated from the Python scripts of NVIDIA NeMo,
// largely located in the files found in this folder:
//
// https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/g2p/models/en_us_arpabet.py
//
// The origin has the following copyright notice and license:
//
// https://github.com/NVIDIA/NeMo/blob/main/LICENSE
//

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace NeMoOnnxSharp.TTSTokenizers
{
    // nemo.collections.tts.torch.g2ps.EnglishG2p

    ///
    /// English G2P module. This module converts words from grapheme to phoneme representation using phoneme_dict in CMU dict format.
27 | /// Optionally, it can ignore words which are heteronyms, ambiguous or marked as unchangeable by word_tokenize_func(see code for details). 28 | /// Ignored words are left unchanged or passed through apply_to_oov_word for handling. 29 | /// 30 | public class EnglishG2p 31 | { 32 | private readonly IDictionary _phonemeDict; 33 | private readonly HashSet _heteronyms; 34 | private readonly double _phonemeProbability; 35 | private readonly Random _random; 36 | private readonly Regex _alnumRx; 37 | private readonly bool _ignoreAmbiguousWords; 38 | 39 | /// 40 | /// Path to file in CMUdict format or dictionary of CMUdict-like entries. 41 | /// word_tokenize_func: Function for tokenizing text to words. 42 | /// Path to file with heteronyms (every line is new word) or list of words. 43 | /// The probability (0. 47 | public EnglishG2p( 48 | string phonemeDict, 49 | string heteronyms, 50 | bool ignoreAmbiguousWords = true, 51 | Encoding? encoding = null, 52 | double phonemeProbability = 0.5) 53 | { 54 | encoding = encoding ?? 
Encoding.GetEncoding("iso-8859-1"); 55 | _phonemeDict = _ParseAsCmuDict(phonemeDict, encoding); 56 | _heteronyms = new HashSet(_ParseFileByLines(heteronyms, encoding)); 57 | _phonemeProbability = phonemeProbability; 58 | _random = new Random(); 59 | _alnumRx = new Regex(@"[a-zA-ZÀ-ÿ\d]"); 60 | _ignoreAmbiguousWords = ignoreAmbiguousWords; 61 | } 62 | 63 | public string[] Parse(string text) 64 | { 65 | var words = TokenizerUtils.EnglishWordTokenize(text); 66 | var prons = new List(); 67 | foreach (var (word, withoutChanges) in words) 68 | { 69 | if (withoutChanges) 70 | { 71 | prons.AddRange(word); 72 | continue; 73 | } 74 | 75 | var wordStr = word[0]; 76 | var wordByHyphen = wordStr.Split('-'); 77 | var (pron, isHandled) = ParseOneWord(wordStr); 78 | 79 | if (!isHandled && wordByHyphen.Length > 1) 80 | { 81 | pron = new List(); 82 | foreach (var subWord in wordByHyphen) 83 | { 84 | var (p, _) = ParseOneWord(subWord); 85 | pron.AddRange(p); 86 | pron.Add("-"); 87 | } 88 | pron.RemoveAt(pron.Count - 1); 89 | } 90 | prons.AddRange(pron); 91 | } 92 | return prons.ToArray(); 93 | } 94 | 95 | private (List pron, bool isHandled) ParseOneWord(string word) 96 | { 97 | if (_phonemeProbability < 1.0 && _random.NextDouble() > _phonemeProbability) 98 | { 99 | return (StringToStringList(word), true); 100 | } 101 | 102 | // punctuation or whitespace. 
103 | if (!_alnumRx.IsMatch(word)) 104 | { 105 | return (StringToStringList(word), true); 106 | } 107 | 108 | // heteronyms 109 | if (_heteronyms != null && _heteronyms.Contains(word)) 110 | { 111 | return (StringToStringList(word), true); 112 | } 113 | 114 | // `'s` suffix 115 | if (word.Length > 2 116 | && word.EndsWith("'s") 117 | && !_phonemeDict.ContainsKey(word)) 118 | { 119 | var sword = word.Substring(0, word.Length - 2); 120 | if (_phonemeDict.ContainsKey(sword) 121 | && (!_ignoreAmbiguousWords || _IsUniqueInPhonemeDict(sword))) 122 | { 123 | var pron = _phonemeDict[sword][0].Split(" ").ToList(); 124 | pron.Add("Z"); 125 | return (pron, true); 126 | } 127 | } 128 | 129 | // `s` suffix 130 | if (word.Length > 1 131 | && word.EndsWith("s") 132 | && !_phonemeDict.ContainsKey(word)) 133 | { 134 | var sword = word.Substring(0, word.Length - 1); 135 | if (_phonemeDict.ContainsKey(sword) 136 | && (!_ignoreAmbiguousWords || _IsUniqueInPhonemeDict(sword))) 137 | { 138 | var pron = _phonemeDict[sword][0].Split(" ").ToList(); 139 | pron.Add("Z"); 140 | return (pron, true); 141 | } 142 | } 143 | 144 | // phoneme dict 145 | if (_phonemeDict.ContainsKey(word) && (!_ignoreAmbiguousWords || _IsUniqueInPhonemeDict(word))) 146 | { 147 | return (_phonemeDict[word][0].Split(" ").ToList(), true); 148 | } 149 | 150 | return (StringToStringList(word), false); 151 | } 152 | 153 | private List StringToStringList(string word) 154 | { 155 | return word.Select(x => x.ToString()).ToList(); 156 | } 157 | 158 | private bool _IsUniqueInPhonemeDict(string word) 159 | { 160 | return _phonemeDict[word].Length == 1; 161 | } 162 | 163 | private static IDictionary _ParseAsCmuDict(string phonemeDictPath, Encoding encoding) 164 | { 165 | var _alt_re = new Regex(@"\([0-9]+\)"); 166 | var g2pDict = new Dictionary(); 167 | using (var stream = new FileStream(phonemeDictPath, FileMode.Open)) 168 | using (var reader = new StreamReader(stream, encoding)) 169 | { 170 | string line; 171 | while ((line = 
reader.ReadLine()) != null) 172 | { 173 | if (line.Length > 0 && (('A' <= line[0] && line[0] <= 'Z') || line[0] == '\'')) 174 | { 175 | var parts = line.Split(" "); 176 | var word = _alt_re.Replace(parts[0], ""); 177 | word = word.ToLower(); 178 | 179 | var pronunciation = parts[1].Trim(); 180 | if (g2pDict.ContainsKey(word)) 181 | { 182 | var v = new List(g2pDict[word]) 183 | { 184 | pronunciation 185 | }; 186 | g2pDict[word] = v.ToArray(); 187 | } 188 | else 189 | { 190 | g2pDict[word] = new string[] { pronunciation }; 191 | } 192 | } 193 | } 194 | } 195 | return g2pDict; 196 | } 197 | 198 | private static string[] _ParseFileByLines(string p, Encoding encoding) 199 | { 200 | var res = new List(); 201 | using (var stream = new FileStream(p, FileMode.Open)) 202 | using (var reader = new StreamReader(stream, encoding)) 203 | { 204 | string line; 205 | while ((line = reader.ReadLine()) != null) 206 | { 207 | res.Add(line.TrimEnd()); 208 | } 209 | } 210 | return res.ToArray(); 211 | } 212 | } 213 | } 214 | -------------------------------------------------------------------------------- /NeMoOnnxSharp/TTSTokenizers/EnglishPhonemesTokenizer.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Katsuya Iida. All Rights Reserved. 2 | // See LICENSE in the project root for license information. 
// A number of implementation details in this file have been translated from the Python scripts of NVIDIA NeMo,
// largely located in the files found in this folder:
//
// https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/torch/tts_tokenizers.py
//
// The origin has the following copyright notice and license:
//
// https://github.com/NVIDIA/NeMo/blob/main/LICENSE
//

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using static System.Net.Mime.MediaTypeNames;

namespace NeMoOnnxSharp.TTSTokenizers
{
    // nemo.collections.tts.torch.tts_tokenizers.EnglishPhonemesTokenizer
    public class EnglishPhonemesTokenizer : BaseTokenizer
    {
        /// <summary>
        /// English phoneme-based tokenizer.
        /// </summary>
        /// <param name="g2p">Grapheme to phoneme module.</param>
        /// <param name="punct">Whether to reserve grapheme for basic punctuation or not.</param>
        /// <param name="nonDefaultPunctList">List of punctuation marks which will be used instead default.</param>
        /// <param name="stresses">Whether to use phonemes codes with stresses (0-2) or not.</param>
        /// <param name="chars">Whether to additionally use chars together with phonemes. It is useful if g2p module can return chars too.</param>
        /// <param name="space">Space token as string.</param>
        /// <param name="silence">Silence token as string (will be disabled if it is None). Not implemented: must be null.</param>
        /// <param name="apostrophe">Whether to use apostrophe or not.</param>
        /// <param name="oov">OOV token as string.</param>
        /// <param name="sep">Separation token as string.</param>
        /// <param name="addBlankAt">Add blank to labels in the specified order ("last") or after tokens (any non None),
        /// if None then no blank in labels. <c>Last</c> is not implemented.</param>
        /// <param name="padWithSpace">Whether to pad text with spaces at the beginning and at the end or not.</param>
        /// <remarks>
        /// text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer.
        /// Basically, it replaces all non-unicode characters with unicode ones.
        /// Note that lower() function shouldn't be applied here, in case the text contains phonemes (it will be handled by g2p).
        /// </remarks>
        public EnglishPhonemesTokenizer(
            EnglishG2p g2p,
            bool punct = true,
            string[]? nonDefaultPunctList = null,
            bool stresses = false,
            bool chars = false,
            string space = " ",
            string? silence = null,
            bool apostrophe = true,
            string oov = BaseTokenizer.OOV,
            string sep = "|", // To be able to distinguish between 2/3 letters codes.
            AddBlankAt addBlankAt = AddBlankAt.None,
            bool padWithSpace = false)
        // object? text_preprocessing_func=lambda text: english_text_preprocessing(text, lower=false),
        {
            _phonemeProbability = null;
            _g2p = g2p;
            _space = 0;
            var tokens = new List<string> { space };

            if (silence != null)
            {
                throw new NotImplementedException();
            }

            tokens.AddRange(Consonants);
            var vowels = Vowels;

            if (stresses)
            {
                vowels = vowels.SelectMany(p => Enumerable.Range(0, 3), (p, s) => $"{p}{s}").ToArray();
            }
            tokens.AddRange(vowels);

            if (chars || _phonemeProbability != null)
            {
                if (!chars)
                {
                    // logging.warning(
                    //     "phoneme_probability was not None, characters will be enabled even though "
                    //     "chars was set to False."
                    // );
                }
                tokens.AddRange(AsciiLowercase.Select(ch => ch.ToString()));
            }

            if (apostrophe)
            {
                tokens.Add("'"); // Apostrophe
            }

            if (punct)
            {
                tokens.AddRange(nonDefaultPunctList ?? PunctList);
            }

            // Each id is the index the token is about to receive, so it must
            // be recorded BEFORE the token is appended.
            _pad = tokens.Count;
            tokens.Add(Pad);

            bool hasBlank = addBlankAt != AddBlankAt.None;
            if (hasBlank)
            {
                _blank = tokens.Count;
                tokens.Add(Blank);
            }

            _oov = tokens.Count;
            tokens.Add(oov); // Out Of Vocabulary

            if (addBlankAt == AddBlankAt.Last)
            {
                throw new NotImplementedException();
            }

            _sep = sep;
            _padWithSpace = padWithSpace;

            _id2token = tokens.ToArray();
            _token2id = new Dictionary<string, int>(_id2token.Length);
            for (int i = 0; i < _id2token.Length; i++)
            {
                // Add throws on duplicate tokens, same as the dictionary
                // constructor this loop replaces.
                _token2id.Add(_id2token[i], i);
            }

            // Without a blank token _blank keeps its default 0, which is the
            // space id; it must not be treated as a utility id or Decode
            // would drop every space.
            _utilIds = new HashSet<int> { _pad, _oov };
            if (hasBlank)
            {
                _utilIds.Add(_blank);
            }

            _stresses = stresses;
            _punct = punct;
        }

        /// <summary>
        /// Preprocesses the text, runs G2P and encodes the result.
        /// </summary>
        public override int[] Encode(string text)
        {
            // Do not lowercase here: spans marked as unchangeable may contain
            // upper-case phoneme codes. Ordinary words are lowercased by the
            // word tokenizer inside G2P.
            text = TokenizerUtils.EnglishTextPreprocessing(text, lower: false);
            var g2pText = _g2p.Parse(text);
            return EncodeFromG2p(g2pText);
        }

        /// <summary>
        /// Encodes text that has already been run through G2P.
        /// Called for encoding to tokens after text preprocessing and G2P.
        /// </summary>
        /// <param name="g2pText">G2P's output, could be a mixture of phonemes and graphemes,
        /// e.g. "see OOV" -> ['S', 'IY1', ' ', 'O', 'O', 'V']</param>
        /// <returns>Token ids. Unknown items are dropped, repeated and leading/trailing
        /// spaces are removed, and a space id is added at both ends when padding is enabled.</returns>
        public int[] EncodeFromG2p(string[] g2pText)
        {
            var space = _id2token[_space];
            var ps = new List<string>();
            foreach (var item in g2pText)
            {
                if (string.IsNullOrEmpty(item))
                {
                    continue; // nothing to encode; also guards IsLetterOrDigit(p, 0)
                }

                string p = item;
                // Remove stress
                if (p.Length == 3 && !_stresses)
                {
                    p = p.Substring(0, 2);
                }

                if (p == space)
                {
                    // Add space if last one isn't one
                    if (ps.Count > 0 && ps[ps.Count - 1] != space)
                    {
                        ps.Add(p);
                    }
                    continue;
                }

                bool startsAlnum = char.IsLetterOrDigit(p, 0);
                if ((startsAlnum || p == "'") && _token2id.ContainsKey(p))
                {
                    // Add next phoneme or char (if chars=true)
                    ps.Add(p);
                }
                else if (_punct && !startsAlnum && _token2id.ContainsKey(p))
                {
                    // Add punct
                    ps.Add(p);
                }
                // else: unknown char/phoneme, dropped
            }

            // Remove trailing spaces
            while (ps.Count > 0 && ps[ps.Count - 1] == space)
            {
                ps.RemoveAt(ps.Count - 1);
            }

            var res = new List<int>(ps.Count + 2);
            if (_padWithSpace)
            {
                res.Add(_space);
            }
            // Encode the filtered list, not the raw G2P output.
            res.AddRange(ps.Select(p => _token2id[p]));
            if (_padWithSpace)
            {
                res.Add(_space);
            }
            return res.ToArray();
        }

        private static readonly string[] PunctList =
        {   // Derived from LJSpeech and "/" additionally
            ",", ".", "!", "?", "-",
            ":", ";", "/", "\"", "(",
            ")", "[", "]", "{", "}",
        };
        private static readonly string[] Vowels = {
            "AA", "AE", "AH", "AO", "AW",
            "AY", "EH", "ER", "EY", "IH",
            "IY", "OW", "OY", "UH", "UW",
        };
        private static readonly string[] Consonants = {
            "B", "CH", "D", "DH", "F", "G",
            "HH", "JH", "K", "L", "M", "N",
            "NG", "P", "R", "S", "SH", "T",
            "TH", "V", "W", "Y", "Z", "ZH",
        };

        private const string AsciiLowercase = "abcdefghijklmnopqrstuvwxyz";

        private readonly EnglishG2p _g2p;
        private readonly object? _phonemeProbability;
        private readonly bool _stresses;
        private readonly bool _punct;
    }
}

--------------------------------------------------------------------------------
/NeMoOnnxSharp/TTSTokenizers/GermanCharsTokenizer.cs:
--------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.

// A number of implementation details in this file have been translated from the Python scripts of NVIDIA NeMo,
// largely located in the files found in this folder:
//
// https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/torch/tts_tokenizers.py
//
// The origin has the following copyright notice and license:
//
// https://github.com/NVIDIA/NeMo/blob/main/LICENSE
//

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using static NeMoOnnxSharp.TTSTokenizers.BaseTokenizer;

namespace NeMoOnnxSharp.TTSTokenizers
{
    // nemo.collections.tts.torch.tts_tokenizers.GermanCharsTokenizer

    /// <summary>
    /// Character tokenizer configured with the German letter set and
    /// punctuation list defined below.
    /// </summary>
    public class GermanCharsTokenizer : BaseCharsTokenizer
    {
        /// <param name="padWithSpace">Forwarded unchanged to the base tokenizer.</param>
        public GermanCharsTokenizer(
            bool padWithSpace = false
        ) : base(
            chars: new string(_CharsetStr),
            punct: true,
            addBlankAt: AddBlankAt.None,
            apostrophe: true,
            padWithSpace: padWithSpace,
            nonDefaultPunctList: _PunctList.Select(c => c.ToString()).ToArray()
        )
        {
        }

        private static readonly char[] _CharsetStr = new char[]
        {
            'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
            'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
            'U', 'V', 'W', 'X', 'Y', 'Z', 'Ä', 'Ö', 'Ü', 'ẞ',
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
            'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
            'u', 'v', 'w', 'x', 'y', 'z', 'ä', 'ö', 'ü', 'ß',
        };

        private static readonly char[] _PunctList = new char[]
        {
            '!', '"', '(', ')', ',', '-', '.', '/', ':', ';',
            '?', '[', ']', '{', '}', '«', '»', '‒', '–', '—',
            '‘', '‚', '“', '„', '‹', '›'
        };
    }
}

--------------------------------------------------------------------------------
/NeMoOnnxSharp/TTSTokenizers/TokenizerUtils.cs:
--------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.

// A number of implementation details in this file have been translated from the Python scripts of NVIDIA NeMo,
// largely located in the files found in this folder:
//
// https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py
//
// The origin has the following copyright notice and license:
//
// https://github.com/NVIDIA/NeMo/blob/main/LICENSE
//

using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace NeMoOnnxSharp.TTSTokenizers
{
    public static class TokenizerUtils
    {
        private static readonly Dictionary<char, char> _synoGlyph2Ascii;
        private static readonly Regex _wordsReEn;

        static TokenizerUtils()
        {
            // ASCII character -> look-alike glyphs that are replaced by it.
            (char Ascii, char[] Glyphs)[] synoglyphs =
            {
                ('\'', new[] { '’' }),
                ('"', new[] { '”', '“' }),
            };

            _synoGlyph2Ascii = new Dictionary<char, char>();
            foreach (var (asc, glyphs) in synoglyphs)
            {
                foreach (var g in glyphs)
                {
                    _synoGlyph2Ascii[g] = asc;
                }
            }

            // define char set based on https://en.wikipedia.org/wiki/List_of_Unicode_characters
            var latinAlphabetBasic = "A-Za-z";
            // Group 1: a word; group 2: a |...| unchangeable span; group 3: anything else.
            _wordsReEn = new Regex(@$"([{latinAlphabetBasic}]+(?:[{latinAlphabetBasic}\-']*[{latinAlphabetBasic}]+)*)|(\|[^|]*\|)|([^{latinAlphabetBasic}|]+)");
        }

        /// <summary>
        /// Normalize unicode text with "NFC", and convert right single quotation mark (U+2019, decimal 8217) as an apostrophe.
        /// </summary>
        /// <param name="text">the original input sentence.</param>
        /// <returns>normalized text.</returns>
        public static string AnyLocaleTextPreprocessing(string text)
        {
            // right single quotation mark (U+2019, decimal 8217) as an apostrophe
            return NormalizeUnicodeText(text).Replace('’', '\'');
        }

        /// <summary>
        /// TODO @xueyang: Apply NFC form may be too aggressive since it would ignore some accented characters that do not exist
        /// in predefined German alphabet(nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon.IPA_CHARACTER_SETS),
        /// such as 'é'. This is not expected. A better solution is to add an extra normalization with NFD to discard the
        /// diacritics and consider 'é' and 'e' produce similar pronunciations.
        ///
        /// Note that the tokenizer needs to run `unicodedata.normalize("NFC", x)` before calling `encode` function,
        /// especially for the characters that have diacritics, such as 'ö' in the German alphabet. 'ö' can be encoded as
        /// b'\xc3\xb6' (one char) as well as b'o\xcc\x88' (two chars). Without the normalization of composing two chars
        /// together and without a complete predefined set of diacritics, when the tokenizer reads the input sentence
        /// char-by-char, it would skip the combining diaeresis b'\xcc\x88', resulting in indistinguishable pronunciations
        /// for 'ö' and 'o'.
        /// </summary>
        /// <param name="text">the original input sentence.</param>
        /// <returns>NFC normalized sentence.</returns>
        private static string NormalizeUnicodeText(string text)
        {
            // normalize word with NFC form
            return text.Normalize(NormalizationForm.FormC);
        }

        /// <summary>
        /// Strips diacritics (NFD decomposition with non-spacing marks removed),
        /// replaces typographic quotes by their ASCII equivalents and optionally
        /// lowercases the text.
        /// </summary>
        /// <param name="text">the original input sentence.</param>
        /// <param name="lower">whether to lowercase the result.</param>
        /// <returns>preprocessed text.</returns>
        public static string EnglishTextPreprocessing(string text, bool lower = true)
        {
            var sb = new StringBuilder(text.Length);
            foreach (var ch in text.Normalize(NormalizationForm.FormD))
            {
                if (CharUnicodeInfo.GetUnicodeCategory(ch) == UnicodeCategory.NonSpacingMark)
                {
                    continue;
                }
                sb.Append(_synoGlyph2Ascii.TryGetValue(ch, out var ascii) ? ascii : ch);
            }

            var result = sb.ToString();
            // Invariant casing: the result must not depend on the current culture.
            return lower ? result.ToLowerInvariant() : result;
        }

        /// <summary>
        /// Process a list of words and attach indicators showing if each word is unchangeable or not. Each word representation
        /// can be one of valid word, any substring starting from | to | (unchangeable word), or punctuation marks including
        /// whitespaces. This function will split unchanged strings by whitespaces and return them as `List[str]`. For example,
        ///
        /// .. code-block::python
        ///     [
        ///         ('Hello', '', ''),  # valid word
        ///         ('', '', ' '),  # punctuation mark
        ///         ('World', '', ''),  # valid word
        ///         ('', '', ' '),  # punctuation mark
        ///         ('', '|NVIDIA unchanged|', ''),  # unchangeable word
        ///         ('', '', '!')  # punctuation mark
        ///     ]
        ///
        /// will be converted into,
        ///
        /// .. code-block::python
        ///     [
        ///         (["Hello"], false),
        ///         ([" "], false),
        ///         (["World"], false),
        ///         ([" "], false),
        ///         (["NVIDIA", "unchanged"], True),
        ///         (["!"], false)
        ///     ]
        /// </summary>
        /// <param name="words">a list of tuples like `(maybe_word, maybe_without_changes, maybe_punct)` where each element
        /// corresponds to a non-overlapping match of either `_WORDS_RE_EN` or `_WORDS_RE_ANY_LOCALE`.</param>
        /// <param name="isLower">a flag to trigger lowercase all words. By default, it is false.</param>
        /// <returns>a list of tuples like `(a list of words, is_unchanged)`.</returns>
        private static (string[], bool)[] _wordTokenize(MatchCollection words, bool isLower = false)
        {
            var result = new List<(string[], bool)>();
            foreach (Match word in words)
            {
                // Groups[0] is the whole match; the capture groups start at 1.
                var maybeWord = word.Groups[1].Value;
                var maybeWithoutChanges = word.Groups[2].Value;
                var maybePunct = word.Groups[3].Value;

                var withoutChanges = false;
                string[] token;
                if (!string.IsNullOrEmpty(maybeWord))
                {
                    token = new[] { isLower ? maybeWord.ToLowerInvariant() : maybeWord };
                }
                else if (!string.IsNullOrEmpty(maybePunct))
                {
                    token = new[] { maybePunct };
                }
                else if (!string.IsNullOrEmpty(maybeWithoutChanges))
                {
                    withoutChanges = true;
                    // Drop the enclosing pipes and split the content on spaces.
                    token = maybeWithoutChanges.Substring(1, maybeWithoutChanges.Length - 2).Split(' ');
                }
                else
                {
                    throw new InvalidDataException(
                        $"This is not expected. Found empty string: <{word}>. " +
                        $"Please validate your regular expression pattern '_WORDS_RE_EN' or '_WORDS_RE_ANY_LOCALE'."
                    );
                }

                result.Add((token, withoutChanges));
            }
            return result.ToArray();
        }

        /// <summary>
        /// Splits English text into words, unchangeable |...| spans and
        /// punctuation/whitespace runs. Words are lowercased.
        /// </summary>
        public static (string[], bool)[] EnglishWordTokenize(string text)
        {
            var words = _wordsReEn.Matches(text);
            return _wordTokenize(words, isLower: true);
        }
    }
}

--------------------------------------------------------------------------------
/NeMoOnnxSharp/WaveFile.cs:
--------------------------------------------------------------------------------
// Copyright (c) Katsuya Iida. All Rights Reserved.
// See LICENSE in the project root for license information.
using System;
using System.Collections.Generic;
using System.IO;
using System.Runtime.InteropServices;
using System.Text;

namespace NeMoOnnxSharp
{
    /// <summary>
    /// A static class to read and write WAV files.
    /// </summary>
    public static class WaveFile
    {
        /// <summary>
        /// Load a WAV file as a short array. The result is resampled
        /// with the target sampling rate and Multi-channel audio
        /// is converted to mono.
        /// </summary>
        /// <param name="path">File to read.</param>
        /// <param name="rate">the target sampling rate</param>
        /// <returns>Waveform data.</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="rate"/> is not positive.</exception>
        /// <exception cref="InvalidDataException">The file is not a 16-bit PCM WAV file.</exception>
        public static short[] ReadWAV(string path, int rate)
        {
            if (rate <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(rate));
            }

            using (var stream = File.OpenRead(path))
            using (var reader = new BinaryReader(stream, Encoding.ASCII))
            {
                var waveform = ReadWAV(reader, out int originalRate, out short originalNumChannels);
                return PostProcess(waveform, originalRate, originalNumChannels, rate);
            }
        }

        /// <summary>
        /// Save a short array as a WAV file. An existing file is replaced.
        /// </summary>
        /// <param name="path">File to write.</param>
        /// <param name="waveform">Waveform data.</param>
        /// <param name="rate">Sampling rate written to the header.</param>
        public static void WriteWAV(string path, short[] waveform, int rate)
        {
            short numChannels = 1;
            // File.Create truncates; File.OpenWrite would leave the tail of a
            // longer existing file in place after the new data.
            using (var stream = File.Create(path))
            {
                WriteWAV(stream, waveform, rate, numChannels);
            }
        }

        /// <summary>
        /// Encode a short array into a byte array in WAV format.
        /// </summary>
        /// <param name="waveform">Waveform data.</param>
        /// <param name="rate">Sampling rate written to the header.</param>
        /// <returns>A byte array in WAV format</returns>
        public static byte[] GetWAVBytes(short[] waveform, int rate)
        {
            short numChannels = 1;
            using (var stream = new MemoryStream())
            {
                WriteWAV(stream, waveform, rate, numChannels);
                return stream.ToArray();
            }
        }

        /// <summary>
        /// Parses the RIFF container and returns the raw samples of the first
        /// "data" chunk. The "fmt " chunk must come before it.
        /// </summary>
        private static short[] ReadWAV(BinaryReader reader, out int rate, out short numChannels)
        {
            rate = 0;
            numChannels = 0;
            if (ReadFourCC(reader) != "RIFF")
                throw new InvalidDataException();
            reader.ReadInt32(); // total RIFF length, not needed
            if (ReadFourCC(reader) != "WAVE")
                throw new InvalidDataException();
            while (true)
            {
                string fourCC = ReadFourCC(reader);
                int chunkLen = reader.ReadInt32();
                if (chunkLen < 0) throw new InvalidDataException();
                if (fourCC == "fmt ")
                {
                    if (chunkLen < 16) throw new InvalidDataException();
                    short formatTag = reader.ReadInt16();
                    if (formatTag != 1) throw new InvalidDataException("Only PCM format is supported");
                    numChannels = reader.ReadInt16();
                    rate = reader.ReadInt32();
                    int avgBytesPerSec = reader.ReadInt32();
                    short blockAlign = reader.ReadInt16();
                    short bitsPerSample = reader.ReadInt16();
                    if (numChannels <= 0 || rate <= 0)
                    {
                        throw new InvalidDataException();
                    }
                    // The samples are reinterpreted as 16-bit values below.
                    if (bitsPerSample != 16)
                    {
                        throw new InvalidDataException("Only 16-bit PCM is supported");
                    }
                    if (avgBytesPerSec * 8 != rate * bitsPerSample * numChannels || blockAlign * 8 != bitsPerSample * numChannels)
                    {
                        throw new InvalidDataException();
                    }
                    if (chunkLen > 16)
                    {
                        reader.ReadBytes(chunkLen - 16);
                    }
                }
                else
                {
                    if (rate == 0)
                    {
                        // No "fmt " chunk seen yet.
                        throw new InvalidDataException();
                    }
                    byte[] byteData = reader.ReadBytes(chunkLen);
                    if (fourCC == "data")
                    {
                        return MemoryMarshal.Cast<byte, short>(byteData).ToArray();
                    }
                }

                // RIFF chunks are padded to an even length; the pad byte is
                // not counted in the chunk length.
                if ((chunkLen & 1) == 1)
                {
                    reader.ReadByte();
                }
            }
        }

        /// <summary>Reads a four-character chunk identifier.</summary>
        private static string ReadFourCC(BinaryReader reader)
        {
            return new string(reader.ReadChars(4));
        }

        private static void WriteWAV(Stream stream, short[] waveform, int rate, short numChannels)
        {
            // leaveOpen: the callers own and dispose the stream.
            using (var writer = new BinaryWriter(stream, Encoding.ASCII, leaveOpen: true))
            {
                WriteWAV(writer, waveform, rate, numChannels);
            }
        }

        /// <summary>
        /// Writes a RIFF/WAVE header followed by the samples as 16-bit PCM.
        /// </summary>
        private static void WriteWAV(BinaryWriter writer, short[] waveform, int rate, short numChannels)
        {
            short formatTag = 1; // PCM
            short bitsPerSample = 16;
            int avgBytesPerSec = rate * bitsPerSample * numChannels / 8;
            short blockAlign = (short)(numChannels * bitsPerSample / 8);
            int dataLen = waveform.Length * (bitsPerSample / 8);

            writer.Write("RIFF".ToCharArray());
            writer.Write(36 + dataLen); // everything after this field

            writer.Write("WAVE".ToCharArray());

            writer.Write("fmt ".ToCharArray());
            writer.Write(16);
            writer.Write(formatTag);
            writer.Write(numChannels);
            writer.Write(rate);
            writer.Write(avgBytesPerSec);
            writer.Write(blockAlign);
            writer.Write(bitsPerSample);

            writer.Write("data".ToCharArray());
            writer.Write(dataLen);
            var waveformBytes = MemoryMarshal.Cast<short, byte>(waveform);
            writer.Write(waveformBytes.ToArray());
        }

        private static short[] PostProcess(short[] waveform, int sourceRate, int sourceNumChannels, int targetRate)
        {
            waveform = ToMono(waveform, sourceNumChannels);
            waveform = Resample(waveform, sourceRate, targetRate);
            return waveform;
        }

        /// <summary>
        /// Changes the sampling rate by picking, for each output sample, the
        /// input sample at the proportional index (no interpolation).
        /// </summary>
        private static short[] Resample(short[] waveform, int sourceRate, int targetRate)
        {
            if (sourceRate == targetRate) return waveform;
            if (waveform.Length == 0) return Array.Empty<short>();
            long targetLength = (waveform.LongLength - 1) * targetRate / sourceRate + 1;
            short[] result = new short[targetLength];
            for (long i = 0; i < result.LongLength; i++)
            {
                result[i] = waveform[i * sourceRate / targetRate];
            }
            return result;
        }

        /// <summary>
        /// Averages interleaved channels into a single channel.
        /// </summary>
        private static short[] ToMono(short[] waveform, int numChannels)
        {
            if (numChannels == 1) return waveform;
            int length = waveform.Length / numChannels;
            short[] result = new short[length];
            for (int i = 0; i < length; i++)
            {
                int value = 0;
                for (int j = 0; j < numChannels; j++)
                {
                    value += waveform[i * numChannels + j];
                }
                result[i] = (short)(value / numChannels);
            }
            return result;
        }
    }
}

--------------------------------------------------------------------------------
/Python/.flake8:
--------------------------------------------------------------------------------
[flake8]
ignore = E501
--------------------------------------------------------------------------------
/Python/convert_librispeech.py:
--------------------------------------------------------------------------------
import os
import sys
import soundfile as sf
import librosa

# Converts one LibriSpeech test-clean chapter (FLAC) into 16 kHz PCM WAV files
# plus a "name.wav|text" transcript. Usage: convert_librispeech.py <LibriSpeech dir>
id1, id2 = 61, 70968
input_dir = os.path.join(sys.argv[1], "test-clean", str(id1), str(id2))
output_dir = os.path.join("..", "test_data")
transcript_file = os.path.join(input_dir, "%d-%d.trans.txt" % (id1, id2))
output_file = os.path.join(output_dir, "transcript.txt")
sample_rate = 16000

os.makedirs(output_dir, exist_ok=True)
with open(transcript_file, 'rt') as f:
    with open(output_file, 'wt') as outf:
        for line in f:
            name, _, text = line.rstrip('\r\n').partition(" ")
            text = text.lower()
            audio_file = os.path.join(input_dir, name + ".flac")
            wav_file = os.path.join(output_dir, name + ".wav")
            x, orig_sample_rate = sf.read(audio_file)
            assert x.ndim == 1
            # The sample rates must be passed by keyword: they are
            # keyword-only in current librosa releases.
            x = librosa.resample(x, orig_sr=orig_sample_rate, target_sr=sample_rate)
            print("Writing %s..." % (wav_file,))
            outf.write("%s.wav|%s\n" % (name, text))
            sf.write(wav_file, x, samplerate=sample_rate, subtype="PCM_16")

--------------------------------------------------------------------------------
/Python/export_models.py:
--------------------------------------------------------------------------------
import importlib
from omegaconf import OmegaConf


def get_class(cls_path):
    """Import and return the class named by a dotted path such as 'pkg.mod.Class'."""
    package_path, _, cls_name = cls_path.rpartition('.')
    package = importlib.import_module(package_path)
    return getattr(package, cls_name)


def export(cls_path: str, model_name: str):
    """Load a pretrained model, export it to '<model_name>.onnx' and print its config as YAML."""
    cls = get_class(cls_path)
    model = cls.from_pretrained(model_name)
    model.export(f'{model_name}.onnx')
    print(OmegaConf.to_yaml(model._cfg))


if __name__ == "__main__":
    # Other combinations that were previously selected by editing this file:
    #   cls_path = 'nemo.collections.asr.models.EncDecCTCModel'
    #   model_name = 'vad_marblenet'
    #   model_name = 'stt_en_quartznet15x5'
    #   model_name = 'stt_en_jasper10x5dr'
    cls_path = 'nemo.collections.asr.models.EncDecClassificationModel'
    model_name = 'commandrecognition_en_matchboxnet3x1x64_v2'
    export(cls_path, model_name)

--------------------------------------------------------------------------------
/Python/make_test.py:
--------------------------------------------------------------------------------
# Copyright (c) Katsuya Iida. All Rights Reserved.
# See LICENSE in the project root for license information.
import os

import librosa
import torch
from nemo.collections.asr.modules import (
    AudioToMelSpectrogramPreprocessor,
    AudioToMFCCPreprocessor
)

# Fixture directory of the test project. It is resolved from this file's
# location so the script works from any working directory, and it uses the
# on-disk spelling "NeMoOnnxSharp.Tests" so it also works on case-sensitive
# filesystems.
DATA_DIR = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "..", "NeMoOnnxSharp.Tests", "Data")


def main():
    """Load the reference wav file and regenerate both feature fixtures."""
    wavpath = os.path.join(DATA_DIR, "61-70968-0000.wav")
    sr = 16000
    audio_signal, sr = librosa.load(wavpath, sr=sr, mono=True)
    assert audio_signal.ndim == 1
    audio_signal = torch.from_numpy(audio_signal)
    # Add a leading axis of size 1 before handing the signal to the
    # preprocessors; `length` holds the size of the second axis.
    audio_signal = torch.unsqueeze(audio_signal, 0)
    length = torch.tensor([audio_signal.shape[1]], dtype=torch.int64)
    convert_mfcc(audio_signal, length)
    convert_mel_spectrogram(audio_signal, length)


def _write_features(preprocessor, audio_signal, length, filename):
    """Run `preprocessor` on the signal and dump the result to DATA_DIR.

    The first item of the processed signal is transposed and written as raw
    bytes in C order to `filename` inside DATA_DIR. The processed signal,
    its shape and its length are printed for inspection.
    """
    with torch.no_grad():
        processed_signal, processed_signal_length = preprocessor(
            input_signal=audio_signal, length=length)
    print(processed_signal, processed_signal_length)
    print(processed_signal.shape, processed_signal_length)
    with open(os.path.join(DATA_DIR, filename), 'wb') as fp:
        fp.write(processed_signal[0].T.numpy().tobytes("C"))


def convert_mel_spectrogram(audio_signal, length):
    """Write the mel spectrogram fixture `mel_spectrogram.bin`.

    Args:
        audio_signal: Tensor produced in `main` (signal with a leading axis
            of size 1).
        length: int64 tensor holding the signal length.
    """
    print(audio_signal.shape)
    # NOTE(review): dither is non-zero; if it adds random noise the output
    # is presumably not bit-for-bit reproducible between runs - confirm.
    preprocessor = AudioToMelSpectrogramPreprocessor(
        normalize="per_feature",
        window_size=0.02,
        sample_rate=16000,
        window_stride=0.01,
        window="hann",
        features=64,
        n_fft=512,
        frame_splicing=1,
        dither=0.00001,
        stft_conv=False)
    _write_features(
        preprocessor, audio_signal, length, "mel_spectrogram.bin")


def convert_mfcc(audio_signal, length):
    """Write the MFCC fixture `mfcc.bin`.

    Args:
        audio_signal: Tensor produced in `main` (signal with a leading axis
            of size 1).
        length: int64 tensor holding the signal length.
    """
    print(audio_signal.shape)
    preprocessor = AudioToMFCCPreprocessor(
        window_size=0.025,
        window_stride=0.01,
        window="hann",
        n_mels=64,
        n_mfcc=64,
        n_fft=512)
    _write_features(preprocessor, audio_signal, length, "mfcc.bin")


if __name__ == "__main__":
    main()
sha256:94751ed39e020fab4c61c6fbdc750be55c2bea19d4e010c0ccf1fe5f1e19aedb 3 | size 124364 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0005.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5fd859700c65c3a15a3caea1c321914ed1c0318c183554fb60121ece797fe9b7 3 | size 162284 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0006.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a680ba9d1a32b2b8a0875165c4c1f2ff4fd3d84b7a4ffdc92daa7e345db0233d 3 | size 93964 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0007.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:9e037a99f31885f1bba0a52181609462f832c2e6f46f064bd428c1731bc25a06 3 | size 113644 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0008.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:82a1bc1ef6e77e00b9af883b03a2c661cccdc67305731b679da6a21ca33138a6 3 | size 113164 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0009.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:07686334b7bc345a231eca201f614b4eb301ce1ef73834f4ae61296142ccddb6 3 | size 144364 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0010.wav: -------------------------------------------------------------------------------- 1 | version 
https://git-lfs.github.com/spec/v1 2 | oid sha256:1e7dadf546cc01ce15394a155ba2f2fadd17c9dfba4a951fd24b366d1b7009d5 3 | size 265484 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0011.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ac0ec71479cc2cd204cf3caead4a7929d15fdd20d7eb08c9b35310c3ac5b0e88 3 | size 204044 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0012.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:252f2dc7532d5d679c1b57402018cd2b6cfd5f5607563e9e8d15fd6d049a8750 3 | size 83564 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0013.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ec88f797bcb446927417a91a63fa9a92aa3cab67a4d4e68b9f0d84070fdfd12a 3 | size 142444 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0014.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:663af9404ce03c9df89ddf341636eb4790a8fa3f78fa2646b7ac26ccd018f611 3 | size 239564 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0015.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3753e342574252e6414abdabcd09e8ccf1721696e8859f815ffdcdea5d799ae5 3 | size 172044 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0016.wav: 
-------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:82acd474f1c191d996bf577c00685b49d4a7342a8000c7841afc245de3a7f44c 3 | size 119084 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0017.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d7e768f637aee1e4695f952002ee1c86209ef7c3fdba5f2860910e7d826750de 3 | size 163564 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0018.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3548563c071f784640611b9d5661351a90e26ef020d6a7cd22496afbc1594628 3 | size 77004 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0019.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:de5b2ae3f62ab70003299d154cf8ca174abecece9861654dc63154b1cbf15ec8 3 | size 175244 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0020.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2d3ff04464a3db8d8c7aee34a6cde6610b716b6632d495179db3b65388375ea3 3 | size 163404 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0021.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2e463d40cd4e60e5b23fd25f13c84d45bc453e246d83b1a63bca2d620db73fa0 3 | size 86124 4 | 
-------------------------------------------------------------------------------- /test_data/61-70968-0022.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:66607ac384005d3429abf3548964630223d86262951a66a3d717538f4497fca6 3 | size 149484 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0023.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:52e72e87ce68c09af43f94cd1099021fae0781872e93e5da34f92dc5f88fd853 3 | size 160844 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0024.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:30d3317ad7492d7eefa5380caec36f259f912297891b3a0f83096c93af4f9849 3 | size 192844 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0025.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:eb6123b80871c0c31f43972ec9f77b7bce667cfc9a26774e6376a31f0ed7e660 3 | size 141164 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0026.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:012921ed3a7f5c1369188ca346d468ddf99abbd3c398810c3932e523f378429d 3 | size 157484 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0027.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid 
sha256:40c3b9f25002e201f671e14fe9fd8c12f218b08389d345b58abf26fc4b4fdf35 3 | size 219884 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0028.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ee119982e4c44c558f790c44b032f74f4f071c0ad67656647523504ba6ef3d6b 3 | size 171404 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0029.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:4b84dceeddf76e96d8e20934c018cf7d1cb5805db138442cc9cf9681e1e7bddb 3 | size 111884 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0030.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2767e40fab3748caf7f07dc47297ecb6b5bf19b53f12bb29fe5626b791a795e0 3 | size 181964 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0031.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:9d448f100c78ed193bb571b7140a405f960667705cc9a69a2779d697b48fc2e7 3 | size 93644 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0032.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:bd3527051e39a0b2ae431630aa1cec818b7903cbe94e0fe0bd644c4f9e80b453 3 | size 137004 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0033.wav: -------------------------------------------------------------------------------- 1 | version 
https://git-lfs.github.com/spec/v1 2 | oid sha256:6048875e595a68c17f4d79b2c36bfff5e60066aaad246b19a18d6d6101acfad3 3 | size 181964 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0034.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6054ef7bb5140173bc76ae5a7cf5d1fed714117fc2bb4320f370df247d1228d6 3 | size 135404 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0035.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e08be96d41087ff39c63d47d24fe0230a710e36b7acce0cefdb5c71ec0451f39 3 | size 254444 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0036.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:dede8e54d9f432d920dc7f63f108862265e77371f2a742cd41c82a783f09dd6c 3 | size 93962 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0037.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3df69bcdd3a6690d746700dd8e2af7c6db4c7eb426064ec289051bc1029ac605 3 | size 138124 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0038.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a2a3869fcae4ae7aa5f52f984cca42692aaedc70b479cc62c8742167d3eceece 3 | size 65644 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0039.wav: 
-------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:81825272e79ee9e1d42eba1c7a51d8cd6cb8ecdd227d366ba211ca18cf1128da 3 | size 121804 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0040.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:433d9cadaa4a025491d857fa8a36729a7c5177d1fe44ede3967c7c7ea6b2196d 3 | size 126444 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0041.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ea941cbef651c558f0dcee117854f6f290f8f4ff3aeff6bc6ed669aa380b8b03 3 | size 218444 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0042.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:0e4c658f1e067baf1083c919353436ff77d7ba0233def4f563de7a517b92fc65 3 | size 89164 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0043.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ac7dfa89b0ef04f305ebdd310bc2fed86e2bf314f69a3a6bb7e688f0ed91351a 3 | size 215564 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0044.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b0aa32154d539b8b39c6d44ece0b0ef1fd877bc0011826eeb60cff7ebd04ab33 3 | size 88684 4 | 
-------------------------------------------------------------------------------- /test_data/61-70968-0045.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d95be7473028936629cfbc24ef1f9e550585d266fc15e6aed818a4f4d52a3c6f 3 | size 111244 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0046.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2f8d9401c832ca20fad55ef6f8934596c3746576f4077d0f24a9dd0d8921ba44 3 | size 113644 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0047.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:54fca568c27b79c7755e4267eb39a553f7ccf16389f14a1e0b9d03be6d24e803 3 | size 152844 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0048.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6a94a5b4b1b1818a1be154b875f8a5de2bf92dc5c292a95cc240b80485613893 3 | size 96684 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0049.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:0c6976d30ebd5bfe7c829c055d2bf729b095741b6e7c7a5d08943391cb5773c5 3 | size 264044 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0050.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid 
sha256:847f20ecfbd7e24f565d21ed9e419dad341ad89919cb5e670d7315de1c2b1a4b 3 | size 178604 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0051.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:47fdb16ef5745504b705f7854b3bc201487f6bd0d9e8811c5c1b98c4030a5839 3 | size 100364 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0052.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:abbc8181efacff8a2dbc2d37c3f2f55e02879523dbd58eb5f3e14eb8e6f0a6bd 3 | size 84844 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0053.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6cf6c59d1589886118cffa1ed2eca8665e728019c4d3a7f21f81db5e402ba8f9 3 | size 135084 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0054.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:234f107f134babc6c1b0fdd041187ea14dc8518cd6a1e237e7da31474b41e157 3 | size 251564 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0055.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b9734d146511c648889a14fa1ddd092974fb33c472c5dd15eee5cc979130915a 3 | size 126924 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0056.wav: -------------------------------------------------------------------------------- 1 | version 
https://git-lfs.github.com/spec/v1 2 | oid sha256:3645021d26b85c14262b1208860d5a8b81a339493b1e9ce3cd773d7213c2e306 3 | size 114124 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0057.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:04a851beed6a653b79f0147f008826aa30b41e04db371764540daee9671be383 3 | size 162124 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0058.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3ad240f4d20e9179b043859f3a0a1354c3fbf0ad23d50148b23d39f4217e2934 3 | size 58284 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0059.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d153ad1d451e53595e995df6a7941e8e671a001e3e2ee44b2957a54ee210f20b 3 | size 69484 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0060.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:14fb8ad902b2bc1ad34d4397f83c96125f2549c3301a232557b13e1f6243e665 3 | size 119564 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0061.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5a592bf3f9830627adcb5a4a6f76e2bf43bbcc9c62e431c714cfe595abce178a 3 | size 177004 4 | -------------------------------------------------------------------------------- /test_data/61-70968-0062.wav: 
-------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:362ba15629217d4b91c7f8dc08b82738d10db86bf85d20d1d107d247b4c6a6b4 3 | size 81804 4 | -------------------------------------------------------------------------------- /test_data/README.md: -------------------------------------------------------------------------------- 1 | # Test data 2 | 3 | These files are from [LibriSpeech](http://www.openslr.org/12) 4 | 5 | - `transcript.txt` 6 | - `61-70968-0052.wav` 7 | - ... 8 | 9 | This file is from [NVIDIA NeMo tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/Online_Offline_Speech_Commands_Demo.ipynb) 10 | 11 | - `SpeechCommands_demo.wav` 12 | 13 | This file is from [Thorsten Voice](https://github.com/thorstenMueller/Thorsten-Voice) 14 | 15 | - `samples_thorsten-21.06-emotional_neutral.wav` 16 | -------------------------------------------------------------------------------- /test_data/SpeechCommands_demo.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:bb9ca4c26860bc0cbea94ade4f18f1dd53ac79bbf6caef82507becaa1b4a083f 3 | size 54524 4 | -------------------------------------------------------------------------------- /test_data/samples_thorsten-21.06-emotional_neutral.wav: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:1ba03644402c5b1edc3c4cd7f76967febb66cbce6a8adbf294e0a6b2de87cd1f 3 | size 70454 4 | -------------------------------------------------------------------------------- /test_data/transcript.txt: -------------------------------------------------------------------------------- 1 | 61-70968-0000.wav|he began a confused complaint against the wizard who had vanished behind the curtain on the left 2 | 61-70968-0001.wav|give not so earnest a mind to these mummeries child 3 | 
61-70968-0002.wav|a golden fortune and a happy life 4 | 61-70968-0003.wav|he was like unto my father in a way and yet was not my father 5 | 61-70968-0004.wav|also there was a stripling page who turned into a maid 6 | 61-70968-0005.wav|this was so sweet a lady sir and in some manner i do think she died 7 | 61-70968-0006.wav|but then the picture was gone as quickly as it came 8 | 61-70968-0007.wav|sister nell do you hear these marvels 9 | 61-70968-0008.wav|take your place and let us see what the crystal can show to you 10 | 61-70968-0009.wav|like as not young master though i am an old man 11 | 61-70968-0010.wav|forthwith all ran to the opening of the tent to see what might be amiss but master will who peeped out first needed no more than one glance 12 | 61-70968-0011.wav|he gave way to the others very readily and retreated unperceived by the squire and mistress fitzooth to the rear of the tent 13 | 61-70968-0012.wav|cries of a nottingham a nottingham 14 | 61-70968-0013.wav|before them fled the stroller and his three sons capless and terrified 15 | 61-70968-0014.wav|what is the tumult and rioting cried out the squire authoritatively and he blew twice on a silver whistle which hung at his belt 16 | 61-70968-0015.wav|nay we refused their request most politely most noble said the little stroller 17 | 61-70968-0016.wav|and then they became vexed and would have snatched your purse from us 18 | 61-70968-0017.wav|i could not see my boy injured excellence for but doing his duty as one of cumberland's sons 19 | 61-70968-0018.wav|so i did push this fellow 20 | 61-70968-0019.wav|it is enough said george gamewell sharply and he turned upon the crowd 21 | 61-70968-0020.wav|shame on you citizens cried he i blush for my fellows of nottingham 22 | 61-70968-0021.wav|surely we can submit with good grace 23 | 61-70968-0022.wav|tis fine for you to talk old man answered the lean sullen apprentice 24 | 61-70968-0023.wav|but i wrestled with this fellow and do know that he played unfairly in 
the second bout 25 | 61-70968-0024.wav|spoke the squire losing all patience and it was to you that i gave another purse in consolation 26 | 61-70968-0025.wav|come to me men here here he raised his voice still louder 27 | 61-70968-0026.wav|the strollers took their part in it with hearty zest now that they had some chance of beating off their foes 28 | 61-70968-0027.wav|robin and the little tumbler between them tried to force the squire to stand back and very valiantly did these two comport themselves 29 | 61-70968-0028.wav|the head and chief of the riot the nottingham apprentice with clenched fists threatened montfichet 30 | 61-70968-0029.wav|the squire helped to thrust them all in and entered swiftly himself 31 | 61-70968-0030.wav|now be silent on your lives he began but the captured apprentice set up an instant shout 32 | 61-70968-0031.wav|silence you knave cried montfichet 33 | 61-70968-0032.wav|he felt for and found the wizard's black cloth the squire was quite out of breath 34 | 61-70968-0033.wav|thrusting open the proper entrance of the tent robin suddenly rushed forth with his burden with a great shout 35 | 61-70968-0034.wav|a montfichet a montfichet gamewell to the rescue 36 | 61-70968-0035.wav|taking advantage of this the squire's few men redoubled their efforts and encouraged by robin's and the little stroller's cries fought their way to him 37 | 61-70968-0036.wav|george montfichet will never forget this day 38 | 61-70968-0037.wav|what is your name lording asked the little stroller presently 39 | 61-70968-0038.wav|robin fitzooth 40 | 61-70968-0039.wav|and mine is will stuteley shall we be comrades 41 | 61-70968-0040.wav|right willingly for between us we have won the battle answered robin 42 | 61-70968-0041.wav|i like you will you are the second will that i have met and liked within two days is there a sign in that 43 | 61-70968-0042.wav|montfichet called out for robin to give him an arm 44 | 61-70968-0043.wav|friends said montfichet faintly to the 
wrestlers bear us escort so far as the sheriff's house 45 | 61-70968-0044.wav|it will not be safe for you to stay here now 46 | 61-70968-0045.wav|pray follow us with mine and my lord sheriff's men 47 | 61-70968-0046.wav|nottingham castle was reached and admittance was demanded 48 | 61-70968-0047.wav|master monceux the sheriff of nottingham was mightily put about when told of the rioting 49 | 61-70968-0048.wav|and henry might return to england at any moment 50 | 61-70968-0049.wav|have your will child if the boy also wills it montfichet answered feeling too ill to oppose anything very strongly just then 51 | 61-70968-0050.wav|he made an effort to hide his condition from them all and robin felt his fingers tighten upon his arm 52 | 61-70968-0051.wav|beg me a room of the sheriff child quickly 53 | 61-70968-0052.wav|but who is this fellow plucking at your sleeve 54 | 61-70968-0053.wav|he is my esquire excellency returned robin with dignity 55 | 61-70968-0054.wav|mistress fitzooth had been carried off by the sheriff's daughter and her maids as soon as they had entered the house so that robin alone had the care of montfichet 56 | 61-70968-0055.wav|robin was glad when at length they were left to their own devices 57 | 61-70968-0056.wav|the wine did certainly bring back the color to the squire's cheeks 58 | 61-70968-0057.wav|these escapades are not for old gamewell lad his day has come to twilight 59 | 61-70968-0058.wav|will you forgive me now 60 | 61-70968-0059.wav|it will be no disappointment to me 61 | 61-70968-0060.wav|no thanks i am glad to give you such easy happiness 62 | 61-70968-0061.wav|you are a worthy leech will presently whispered robin the wine has worked a marvel 63 | 61-70968-0062.wav|ay and show you some pretty tricks 64 | --------------------------------------------------------------------------------