├── VadSharp
├── .gitignore
├── VadSpeechSegment.cs
├── VadSharp.csproj
├── VadSharp.sln
├── VadOnnxModel.cs
└── VadDetector.cs
├── VadSharp.Example
├── VadSharp.Example.csproj
└── Program.cs
├── LICENSE
├── README.md
└── .gitignore
/VadSharp/.gitignore:
--------------------------------------------------------------------------------
1 | /bin
2 | /obj
3 | /packages
--------------------------------------------------------------------------------
/VadSharp.Example/VadSharp.Example.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Exe
5 | net9.0
6 | enable
7 | enable
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/VadSharp/VadSpeechSegment.cs:
--------------------------------------------------------------------------------
1 | namespace VadSharp;
2 |
/// <summary>
/// Plain data holder for one speech segment: a start/end pair expressed as
/// offsets and the same pair expressed in seconds. Every value is optional.
/// </summary>
public class VadSpeechSegment
{
    /// <summary>Creates a segment with every value left unset (<c>null</c>).</summary>
    public VadSpeechSegment()
    {
    }

    /// <summary>Creates a segment with all four values supplied up front.</summary>
    /// <param name="startOffset">Offset at which the segment begins.</param>
    /// <param name="endOffset">Offset at which the segment ends, or <c>null</c> if not known.</param>
    /// <param name="startSecond">Start time in seconds, or <c>null</c> if not known.</param>
    /// <param name="endSecond">End time in seconds, or <c>null</c> if not known.</param>
    public VadSpeechSegment(int startOffset, int? endOffset, float? startSecond, float? endSecond)
    {
        (StartOffset, EndOffset) = (startOffset, endOffset);
        (StartSecond, EndSecond) = (startSecond, endSecond);
    }

    /// <summary>Offset at which the segment begins.</summary>
    public int? StartOffset { get; set; }

    /// <summary>Offset at which the segment ends.</summary>
    public int? EndOffset { get; set; }

    /// <summary>Start of the segment, in seconds.</summary>
    public float? StartSecond { get; set; }

    /// <summary>End of the segment, in seconds.</summary>
    public float? EndSecond { get; set; }
}
--------------------------------------------------------------------------------
/VadSharp/VadSharp.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | net6.0
5 | enable
6 | enable
7 | AnyCPU;x64
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 Gabriele Bologna
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/VadSharp/VadSharp.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 17
4 | VisualStudioVersion = 17.12.35707.178
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "VadSharp", "VadSharp.csproj", "{D9C9F6E2-E55A-4A5D-9F72-60EF8D921DC4}"
7 | EndProject
8 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "VadSharp.Example", "..\VadSharp.Example\VadSharp.Example.csproj", "{9720E5DF-3AEC-4506-855D-9D20D8EED77F}"
9 | EndProject
10 | Global
11 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
12 | Debug|Any CPU = Debug|Any CPU
13 | Debug|x64 = Debug|x64
14 | Release|Any CPU = Release|Any CPU
15 | Release|x64 = Release|x64
16 | EndGlobalSection
17 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
18 | {D9C9F6E2-E55A-4A5D-9F72-60EF8D921DC4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
19 | {D9C9F6E2-E55A-4A5D-9F72-60EF8D921DC4}.Debug|Any CPU.Build.0 = Debug|Any CPU
20 | {D9C9F6E2-E55A-4A5D-9F72-60EF8D921DC4}.Debug|x64.ActiveCfg = Debug|x64
21 | {D9C9F6E2-E55A-4A5D-9F72-60EF8D921DC4}.Debug|x64.Build.0 = Debug|x64
22 | {D9C9F6E2-E55A-4A5D-9F72-60EF8D921DC4}.Release|Any CPU.ActiveCfg = Release|Any CPU
23 | {D9C9F6E2-E55A-4A5D-9F72-60EF8D921DC4}.Release|Any CPU.Build.0 = Release|Any CPU
24 | {D9C9F6E2-E55A-4A5D-9F72-60EF8D921DC4}.Release|x64.ActiveCfg = Release|x64
25 | {D9C9F6E2-E55A-4A5D-9F72-60EF8D921DC4}.Release|x64.Build.0 = Release|x64
26 | {9720E5DF-3AEC-4506-855D-9D20D8EED77F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
27 | {9720E5DF-3AEC-4506-855D-9D20D8EED77F}.Debug|Any CPU.Build.0 = Debug|Any CPU
28 | {9720E5DF-3AEC-4506-855D-9D20D8EED77F}.Debug|x64.ActiveCfg = Debug|Any CPU
29 | {9720E5DF-3AEC-4506-855D-9D20D8EED77F}.Debug|x64.Build.0 = Debug|Any CPU
30 | {9720E5DF-3AEC-4506-855D-9D20D8EED77F}.Release|Any CPU.ActiveCfg = Release|Any CPU
31 | {9720E5DF-3AEC-4506-855D-9D20D8EED77F}.Release|Any CPU.Build.0 = Release|Any CPU
32 | {9720E5DF-3AEC-4506-855D-9D20D8EED77F}.Release|x64.ActiveCfg = Release|Any CPU
33 | {9720E5DF-3AEC-4506-855D-9D20D8EED77F}.Release|x64.Build.0 = Release|Any CPU
34 | EndGlobalSection
35 | GlobalSection(SolutionProperties) = preSolution
36 | HideSolutionNode = FALSE
37 | EndGlobalSection
38 | EndGlobal
39 |
--------------------------------------------------------------------------------
/VadSharp.Example/Program.cs:
--------------------------------------------------------------------------------
1 | using System.Security.Cryptography;
2 | using System.Text;
3 | using NAudio.Wave;
4 | using VadSharp;
5 |
/// <summary>
/// Console sample: runs the detector over <c>resources/test.wav</c> twice, once through the
/// in-memory <c>float[]</c> overload and once through the file-path overload, and prints
/// the start/end second of every segment returned.
/// </summary>
public class Program
{
    private const int SAMPLE_RATE = 16000;
    private const float THRESHOLD = 0.5f;
    private const int MIN_SPEECH_DURATION_MS = 250;
    private const float MAX_SPEECH_DURATION_SECONDS = float.PositiveInfinity;
    private const int MIN_SILENCE_DURATION_MS = 100;
    private const int SPEECH_PAD_MS = 30;
    private const bool USE_DIRECT_ML = true;

    /// <summary>Entry point. Exits early with a message when the model or audio file is missing.</summary>
    public static void Main()
    {
        Console.Title = "VadSharpExample | Made by https://github.com/GabryB03/";

        string modelPath = Path.Combine(AppContext.BaseDirectory, "resources", "silero_vad.onnx");
        string audioPath = Path.Combine(AppContext.BaseDirectory, "resources", "test.wav");

        if (!File.Exists(modelPath))
        {
            Console.WriteLine($"Model file not found: {modelPath}");
            return;
        }

        if (!File.Exists(audioPath))
        {
            Console.WriteLine($"Audio file not found: {audioPath}");
            return;
        }

        VadDetector vadDetector = new VadDetector(modelPath, THRESHOLD, SAMPLE_RATE, MIN_SPEECH_DURATION_MS, MAX_SPEECH_DURATION_SECONDS, MIN_SILENCE_DURATION_MS, SPEECH_PAD_MS, USE_DIRECT_ML);

        // NOTE(review): 8000 is passed as the source sample rate no matter what the file
        // actually contains — confirm that test.wav really is 8 kHz mono.
        PrintSegments(vadDetector.GetSpeechSegmentList(LoadAudioFile(audioPath), 8000));

        PrintSegments(vadDetector.GetSpeechSegmentList(audioPath));

        Console.ReadLine();
    }

    /// <summary>Writes one line per segment, followed by a blank line, to the console.</summary>
    private static void PrintSegments(List<VadSpeechSegment> speechTimeList)
    {
        StringBuilder sb = new StringBuilder();

        foreach (VadSpeechSegment speechSegment in speechTimeList)
        {
            sb.AppendLine($"[-] Start second: {FormatSeconds(speechSegment.StartSecond)}s, end second: {FormatSeconds(speechSegment.EndSecond)}s");
        }

        Console.WriteLine(sb.ToString());
    }

    /// <summary>
    /// Formats a seconds value with '.' as the decimal separator regardless of the current
    /// culture; a missing value becomes an empty string.
    /// </summary>
    private static string FormatSeconds(float? seconds) =>
        seconds?.ToString(System.Globalization.CultureInfo.InvariantCulture) ?? string.Empty;

    /// <summary>
    /// Reads every sample of an audio file into memory as 32-bit floats.
    /// </summary>
    /// <param name="filePath">Path of the audio file to decode.</param>
    /// <returns>All samples that the reader produced, in reading order.</returns>
    public static float[] LoadAudioFile(string filePath)
    {
        using var reader = new AudioFileReader(filePath);

        // reader.Length is a byte count; dividing by the bytes per sample gives the number of
        // samples, which is used here only to pre-size the list.
        int sampleCount = (int)(reader.Length / (reader.WaveFormat.BitsPerSample / 8));
        List<float> samples = new List<float>(Math.Max(sampleCount, 0));
        float[] buffer = new float[reader.WaveFormat.SampleRate];
        int readSamples;

        while ((readSamples = reader.Read(buffer, 0, buffer.Length)) > 0)
        {
            samples.AddRange(new ArraySegment<float>(buffer, 0, readSamples));
        }

        return samples.ToArray();
    }
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # VadSharp - Voice Activity Detection in C#.NET
2 |
3 |     
4 |
5 | ## 🚀 The Best VAD Implementation in C#
6 |
7 | VadSharp is the **first and most efficient** implementation of **[Silero VAD](https://github.com/snakers4/silero-vad/) in C#**, supporting the latest **V5 model** with all its advanced features. It is faster than the original Python version and runs on **any GPU (NVIDIA, AMD, Intel) and CPU** with **ONNXRuntime and DirectML**.
8 |
9 | This project represents **my first significant contribution to the world of artificial intelligence in terms of development**, and I sincerely hope that users will appreciate this effort. Your support and feedback are highly valued! 🙌
10 |
11 | ---
12 |
13 | ## 🛠 Features & Benefits
14 |
15 | ✅ **Stellar Accuracy** - Excellent results in speech detection tasks.
16 |
17 | ⚡ **Fast** - Processes 30ms+ audio chunks in under **1ms** on a single CPU thread, even faster with batching or GPU acceleration.
18 |
19 | 📦 **Lightweight** - Model size is only **~2MB**.
20 |
21 | 🌎 **General** - Trained on **6,000+ languages**, handling various background noises and recording conditions.
22 |
23 | 🎚 **Flexible Sampling Rate** - Supports **8000 Hz and 16000 Hz**.
24 |
25 | 🌍 **Highly Portable** - Runs anywhere **ONNX and ML.NET** are available.
26 |
27 | 🔓 **No Strings Attached** - **MIT License**, no telemetry, no registration, no vendor lock-in.
28 |
29 | ---
30 |
31 | ## 📌 Installation
32 |
33 | ```sh
34 | # Install ONNXRuntime and ML.NET
35 | Install-Package Microsoft.ML.OnnxRuntime
36 | Install-Package Microsoft.ML
37 | Install-Package NAudio
38 | ```
39 |
40 | ---
41 |
42 | ## 🧑💻 Example Usage
43 |
44 | ```csharp
45 | using System.Text;
46 | using VadSharp;
47 |
/// <summary>Minimal usage sample: detect speech in a WAV file and print each segment.</summary>
public class Program
{
    private const int SAMPLE_RATE = 16000;
    private const float THRESHOLD = 0.5f;
    private const int MIN_SPEECH_DURATION_MS = 250;
    private const float MAX_SPEECH_DURATION_SECONDS = float.PositiveInfinity;
    private const int MIN_SILENCE_DURATION_MS = 100;
    private const int SPEECH_PAD_MS = 30;

    public static void Main()
    {
        Console.Title = "VadSharpExample | Made by https://github.com/GabryB03/";

        string modelPath = Path.Combine(AppContext.BaseDirectory, "resources", "silero_vad.onnx");
        string audioPath = Path.Combine(AppContext.BaseDirectory, "resources", "test.wav");

        if (!File.Exists(modelPath))
        {
            Console.WriteLine($"Model file not found: {modelPath}");
            return;
        }

        if (!File.Exists(audioPath))
        {
            Console.WriteLine($"Audio file not found: {audioPath}");
            return;
        }

        VadDetector vadDetector = new VadDetector(modelPath, THRESHOLD, SAMPLE_RATE, MIN_SPEECH_DURATION_MS, MAX_SPEECH_DURATION_SECONDS, MIN_SILENCE_DURATION_MS, SPEECH_PAD_MS);
        List<VadSpeechSegment> speechTimeList = vadDetector.GetSpeechSegmentList(audioPath);
        StringBuilder sb = new StringBuilder();

        // Format with the invariant culture so the decimal separator is always '.'.
        var invariant = System.Globalization.CultureInfo.InvariantCulture;

        foreach (VadSpeechSegment speechSegment in speechTimeList)
        {
            sb.AppendLine($"[-] Start second: {speechSegment.StartSecond?.ToString(invariant)}s, end second: {speechSegment.EndSecond?.ToString(invariant)}s");
        }

        Console.WriteLine(sb.ToString());
        Console.ReadLine();
    }
}
89 | ```
90 |
91 | ---
92 |
93 | ## 🌟 Contributing
94 |
95 | Contributions are welcome! 🚀 Follow these steps to create a **Pull Request (PR):**
96 |
97 | 1. **Fork the repository**
98 | 2. **Clone your fork**:
99 | ```sh
100 | git clone https://github.com/your-username/VadSharp.git
101 | ```
102 | 3. **Create a new branch**:
103 | ```sh
104 | git checkout -b feature-branch
105 | ```
106 | 4. **Make your changes & commit**:
107 | ```sh
108 | git add .
109 | git commit -m "Your awesome feature!"
110 | ```
111 | 5. **Push the branch & create a PR**:
112 | ```sh
113 | git push origin feature-branch
114 | ```
115 | 6. **Open a PR on GitHub**
116 |
117 | ---
118 |
119 | ## 🐛 Issues & Bug Reports
120 |
121 | If you find a bug or have a feature request, please **open an issue**:
122 |
123 | 1. Go to the [Issues Tab](https://github.com/GabryB03/VadSharp/issues).
124 | 2. Click on **"New Issue"**.
125 | 3. Provide a **clear and concise** description of the problem.
126 | 4. If possible, include **screenshots and logs**.
127 |
128 | I will review and respond ASAP! 🚀
129 |
130 | ---
131 |
132 | ## ✨ Credits
133 |
134 | All credit goes to the original authors of the [Silero VAD](https://github.com/snakers4/silero-vad/) project,
135 | who worked hard on the architecture of the algorithm and trained the models!
136 |
137 | ## 📜 License
138 |
139 | VadSharp is licensed under the **MIT License**. Feel free to use, modify, and distribute it as you like!
140 |
141 | 📌 **Made with ❤️ by [GabryB03](https://github.com/GabryB03/)**
142 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 | ##
4 | ## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore
5 |
6 | # User-specific files
7 | *.rsuser
8 | *.suo
9 | *.user
10 | *.userosscache
11 | *.sln.docstates
12 |
13 | # User-specific files (MonoDevelop/Xamarin Studio)
14 | *.userprefs
15 |
16 | # Mono auto generated files
17 | mono_crash.*
18 |
19 | # Build results
20 | [Dd]ebug/
21 | [Dd]ebugPublic/
22 | [Rr]elease/
23 | [Rr]eleases/
24 | x64/
25 | x86/
26 | [Ww][Ii][Nn]32/
27 | [Aa][Rr][Mm]/
28 | [Aa][Rr][Mm]64/
29 | bld/
30 | [Bb]in/
31 | [Oo]bj/
32 | [Ll]og/
33 | [Ll]ogs/
34 |
35 | # Visual Studio 2015/2017 cache/options directory
36 | .vs/
37 | # Uncomment if you have tasks that create the project's static files in wwwroot
38 | #wwwroot/
39 |
40 | # Visual Studio 2017 auto generated files
41 | Generated\ Files/
42 |
43 | # MSTest test Results
44 | [Tt]est[Rr]esult*/
45 | [Bb]uild[Ll]og.*
46 |
47 | # NUnit
48 | *.VisualState.xml
49 | TestResult.xml
50 | nunit-*.xml
51 |
52 | # Build Results of an ATL Project
53 | [Dd]ebugPS/
54 | [Rr]eleasePS/
55 | dlldata.c
56 |
57 | # Benchmark Results
58 | BenchmarkDotNet.Artifacts/
59 |
60 | # .NET Core
61 | project.lock.json
62 | project.fragment.lock.json
63 | artifacts/
64 |
65 | # ASP.NET Scaffolding
66 | ScaffoldingReadMe.txt
67 |
68 | # StyleCop
69 | StyleCopReport.xml
70 |
71 | # Files built by Visual Studio
72 | *_i.c
73 | *_p.c
74 | *_h.h
75 | *.ilk
76 | *.meta
77 | *.obj
78 | *.iobj
79 | *.pch
80 | *.pdb
81 | *.ipdb
82 | *.pgc
83 | *.pgd
84 | *.rsp
85 | # but not Directory.Build.rsp, as it configures directory-level build defaults
86 | !Directory.Build.rsp
87 | *.sbr
88 | *.tlb
89 | *.tli
90 | *.tlh
91 | *.tmp
92 | *.tmp_proj
93 | *_wpftmp.csproj
94 | *.log
95 | *.tlog
96 | *.vspscc
97 | *.vssscc
98 | .builds
99 | *.pidb
100 | *.svclog
101 | *.scc
102 |
103 | # Chutzpah Test files
104 | _Chutzpah*
105 |
106 | # Visual C++ cache files
107 | ipch/
108 | *.aps
109 | *.ncb
110 | *.opendb
111 | *.opensdf
112 | *.sdf
113 | *.cachefile
114 | *.VC.db
115 | *.VC.VC.opendb
116 |
117 | # Visual Studio profiler
118 | *.psess
119 | *.vsp
120 | *.vspx
121 | *.sap
122 |
123 | # Visual Studio Trace Files
124 | *.e2e
125 |
126 | # TFS 2012 Local Workspace
127 | $tf/
128 |
129 | # Guidance Automation Toolkit
130 | *.gpState
131 |
132 | # ReSharper is a .NET coding add-in
133 | _ReSharper*/
134 | *.[Rr]e[Ss]harper
135 | *.DotSettings.user
136 |
137 | # TeamCity is a build add-in
138 | _TeamCity*
139 |
140 | # DotCover is a Code Coverage Tool
141 | *.dotCover
142 |
143 | # AxoCover is a Code Coverage Tool
144 | .axoCover/*
145 | !.axoCover/settings.json
146 |
147 | # Coverlet is a free, cross platform Code Coverage Tool
148 | coverage*.json
149 | coverage*.xml
150 | coverage*.info
151 |
152 | # Visual Studio code coverage results
153 | *.coverage
154 | *.coveragexml
155 |
156 | # NCrunch
157 | _NCrunch_*
158 | .*crunch*.local.xml
159 | nCrunchTemp_*
160 |
161 | # MightyMoose
162 | *.mm.*
163 | AutoTest.Net/
164 |
165 | # Web workbench (sass)
166 | .sass-cache/
167 |
168 | # Installshield output folder
169 | [Ee]xpress/
170 |
171 | # DocProject is a documentation generator add-in
172 | DocProject/buildhelp/
173 | DocProject/Help/*.HxT
174 | DocProject/Help/*.HxC
175 | DocProject/Help/*.hhc
176 | DocProject/Help/*.hhk
177 | DocProject/Help/*.hhp
178 | DocProject/Help/Html2
179 | DocProject/Help/html
180 |
181 | # Click-Once directory
182 | publish/
183 |
184 | # Publish Web Output
185 | *.[Pp]ublish.xml
186 | *.azurePubxml
187 | # Note: Comment the next line if you want to checkin your web deploy settings,
188 | # but database connection strings (with potential passwords) will be unencrypted
189 | *.pubxml
190 | *.publishproj
191 |
192 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
193 | # checkin your Azure Web App publish settings, but sensitive information contained
194 | # in these scripts will be unencrypted
195 | PublishScripts/
196 |
197 | # NuGet Packages
198 | *.nupkg
199 | # NuGet Symbol Packages
200 | *.snupkg
201 | # The packages folder can be ignored because of Package Restore
202 | **/[Pp]ackages/*
203 | # except build/, which is used as an MSBuild target.
204 | !**/[Pp]ackages/build/
205 | # Uncomment if necessary however generally it will be regenerated when needed
206 | #!**/[Pp]ackages/repositories.config
207 | # NuGet v3's project.json files produces more ignorable files
208 | *.nuget.props
209 | *.nuget.targets
210 |
211 | # Microsoft Azure Build Output
212 | csx/
213 | *.build.csdef
214 |
215 | # Microsoft Azure Emulator
216 | ecf/
217 | rcf/
218 |
219 | # Windows Store app package directories and files
220 | AppPackages/
221 | BundleArtifacts/
222 | Package.StoreAssociation.xml
223 | _pkginfo.txt
224 | *.appx
225 | *.appxbundle
226 | *.appxupload
227 |
228 | # Visual Studio cache files
229 | # files ending in .cache can be ignored
230 | *.[Cc]ache
231 | # but keep track of directories ending in .cache
232 | !?*.[Cc]ache/
233 |
234 | # Others
235 | ClientBin/
236 | ~$*
237 | *~
238 | *.dbmdl
239 | *.dbproj.schemaview
240 | *.jfm
241 | *.pfx
242 | *.publishsettings
243 | orleans.codegen.cs
244 |
245 | # Including strong name files can present a security risk
246 | # (https://github.com/github/gitignore/pull/2483#issue-259490424)
247 | #*.snk
248 |
249 | # Since there are multiple workflows, uncomment next line to ignore bower_components
250 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
251 | #bower_components/
252 |
253 | # RIA/Silverlight projects
254 | Generated_Code/
255 |
256 | # Backup & report files from converting an old project file
257 | # to a newer Visual Studio version. Backup files are not needed,
258 | # because we have git ;-)
259 | _UpgradeReport_Files/
260 | Backup*/
261 | UpgradeLog*.XML
262 | UpgradeLog*.htm
263 | ServiceFabricBackup/
264 | *.rptproj.bak
265 |
266 | # SQL Server files
267 | *.mdf
268 | *.ldf
269 | *.ndf
270 |
271 | # Business Intelligence projects
272 | *.rdl.data
273 | *.bim.layout
274 | *.bim_*.settings
275 | *.rptproj.rsuser
276 | *- [Bb]ackup.rdl
277 | *- [Bb]ackup ([0-9]).rdl
278 | *- [Bb]ackup ([0-9][0-9]).rdl
279 |
280 | # Microsoft Fakes
281 | FakesAssemblies/
282 |
283 | # GhostDoc plugin setting file
284 | *.GhostDoc.xml
285 |
286 | # Node.js Tools for Visual Studio
287 | .ntvs_analysis.dat
288 | node_modules/
289 |
290 | # Visual Studio 6 build log
291 | *.plg
292 |
293 | # Visual Studio 6 workspace options file
294 | *.opt
295 |
296 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
297 | *.vbw
298 |
299 | # Visual Studio 6 auto-generated project file (contains which files were open etc.)
300 | *.vbp
301 |
302 | # Visual Studio 6 workspace and project file (working project files containing files to include in project)
303 | *.dsw
304 | *.dsp
305 |
306 | # Visual Studio 6 technical files
307 | *.ncb
308 | *.aps
309 |
310 | # Visual Studio LightSwitch build output
311 | **/*.HTMLClient/GeneratedArtifacts
312 | **/*.DesktopClient/GeneratedArtifacts
313 | **/*.DesktopClient/ModelManifest.xml
314 | **/*.Server/GeneratedArtifacts
315 | **/*.Server/ModelManifest.xml
316 | _Pvt_Extensions
317 |
318 | # Paket dependency manager
319 | .paket/paket.exe
320 | paket-files/
321 |
322 | # FAKE - F# Make
323 | .fake/
324 |
325 | # CodeRush personal settings
326 | .cr/personal
327 |
328 | # Python Tools for Visual Studio (PTVS)
329 | __pycache__/
330 | *.pyc
331 |
332 | # Cake - Uncomment if you are using it
333 | # tools/**
334 | # !tools/packages.config
335 |
336 | # Tabs Studio
337 | *.tss
338 |
339 | # Telerik's JustMock configuration file
340 | *.jmconfig
341 |
342 | # BizTalk build output
343 | *.btp.cs
344 | *.btm.cs
345 | *.odx.cs
346 | *.xsd.cs
347 |
348 | # OpenCover UI analysis results
349 | OpenCover/
350 |
351 | # Azure Stream Analytics local run output
352 | ASALocalRun/
353 |
354 | # MSBuild Binary and Structured Log
355 | *.binlog
356 |
357 | # NVidia Nsight GPU debugger configuration file
358 | *.nvuser
359 |
360 | # MFractors (Xamarin productivity tool) working folder
361 | .mfractor/
362 |
363 | # Local History for Visual Studio
364 | .localhistory/
365 |
366 | # Visual Studio History (VSHistory) files
367 | .vshistory/
368 |
369 | # BeatPulse healthcheck temp database
370 | healthchecksdb
371 |
372 | # Backup folder for Package Reference Convert tool in Visual Studio 2017
373 | MigrationBackup/
374 |
375 | # Ionide (cross platform F# VS Code tools) working folder
376 | .ionide/
377 |
378 | # Fody - auto-generated XML schema
379 | FodyWeavers.xsd
380 |
381 | # VS Code files for those working on multiple tools
382 | .vscode/*
383 | !.vscode/settings.json
384 | !.vscode/tasks.json
385 | !.vscode/launch.json
386 | !.vscode/extensions.json
387 | *.code-workspace
388 |
389 | # Local History for Visual Studio Code
390 | .history/
391 |
392 | # Windows Installer files from build outputs
393 | *.cab
394 | *.msi
395 | *.msix
396 | *.msm
397 | *.msp
398 |
399 | # JetBrains Rider
400 | *.sln.iml
401 |
--------------------------------------------------------------------------------
/VadSharp/VadOnnxModel.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.ML.OnnxRuntime;
2 | using Microsoft.ML.OnnxRuntime.Tensors;
3 |
namespace VadSharp
{
    /// <summary>
    /// Stateful wrapper around an ONNX Runtime <see cref="InferenceSession"/> for the VAD model.
    /// Between calls it keeps the model's recurrent state and the trailing samples of the previous
    /// chunk (the "context"), which are prepended to the next chunk before inference.
    /// </summary>
    public class VadOnnxModel : IDisposable
    {
        private const int StateWidth = 128;
        private static readonly int[] SupportedRates = { 8000, 16000 };

        private readonly InferenceSession session;
        private float[][][] state = CreateZeroState(1);
        private float[][] context = Array.Empty<float[]>();
        private int lastSr, lastBatchSize;

        /// <summary>Loads the model and prepares an inference session for it.</summary>
        /// <param name="modelPath">Path of the ONNX model file.</param>
        /// <param name="useDirectML">
        /// When <c>true</c>, tries to register the DirectML execution provider; if that fails
        /// the session is still created without it.
        /// </param>
        public VadOnnxModel(string modelPath, bool useDirectML = true)
        {
            // The options are only needed while the session is being created.
            using var sessionOptions = new SessionOptions();

            if (useDirectML)
            {
                try
                {
                    sessionOptions.AppendExecutionProvider_DML();
                    // The DirectML provider requires memory-pattern optimisation to be off.
                    sessionOptions.EnableMemoryPattern = false;
                }
                catch (Exception)
                {
                    // Deliberate best effort: DirectML is optional, so a failure to register it
                    // (unsupported platform, missing native library, ...) falls back to the
                    // default execution provider instead of failing construction.
                }
            }

            sessionOptions.InterOpNumThreads = 1;
            sessionOptions.IntraOpNumThreads = 1;
            sessionOptions.EnableCpuMemArena = true;

            session = new InferenceSession(modelPath, sessionOptions);
            ResetStates();
        }

        /// <summary>
        /// Clears the recurrent state and the context for a single-row batch and forgets the
        /// last sample rate / batch size.
        /// </summary>
        public void ResetStates() => ResetStates(1);

        /// <summary>
        /// Clears the recurrent state and the context, sizing the state for
        /// <paramref name="batchSize"/> rows, and forgets the last sample rate / batch size.
        /// </summary>
        /// <param name="batchSize">Number of audio rows the next call will carry; at least 1.</param>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="batchSize"/> is below 1.</exception>
        public void ResetStates(int batchSize)
        {
            if (batchSize < 1)
            {
                throw new ArgumentOutOfRangeException(nameof(batchSize), batchSize, "Batch size must be at least 1.");
            }

            state = CreateZeroState(batchSize);
            context = Array.Empty<float[]>();
            lastSr = 0;
            lastBatchSize = 0;
        }

        /// <summary>Releases the underlying inference session.</summary>
        public void Dispose() => session?.Dispose();

        /// <summary>Audio rows and sample rate after validation (and optional downsampling).</summary>
        public class ValidationResult
        {
            public float[][] X { get; }
            public int Sr { get; }

            public ValidationResult(float[][] x, int sr)
            {
                X = x;
                Sr = sr;
            }
        }

        /// <summary>Builds an all-zero state of shape [2][batchSize][StateWidth].</summary>
        private static float[][][] CreateZeroState(int batchSize)
        {
            var zeroState = new float[2][][];

            for (int i = 0; i < zeroState.Length; i++)
            {
                zeroState[i] = new float[batchSize][];

                for (int j = 0; j < batchSize; j++)
                {
                    zeroState[i][j] = new float[StateWidth];
                }
            }

            return zeroState;
        }

        /// <summary>
        /// Checks the row count, decimates rates that are a multiple of 16000 down to 16000,
        /// and rejects unsupported sample rates and chunks that are too short.
        /// </summary>
        private ValidationResult ValidateInput(float[][] x, int sr)
        {
            if (x.Length > 2)
            {
                throw new ArgumentException($"Incorrect audio data dimension: {x[0].Length}");
            }

            if (sr != 16000 && sr % 16000 == 0)
            {
                int step = sr / 16000;

                // Decimate into a fresh outer array so the caller's array is left untouched.
                float[][] decimated = new float[x.Length][];

                for (int i = 0; i < x.Length; i++)
                {
                    float[] current = x[i];
                    int newLength = (current.Length + step - 1) / step;
                    float[] newArr = new float[newLength];

                    for (int j = 0, index = 0; j < current.Length; j += step, index++)
                    {
                        newArr[index] = current[j];
                    }

                    decimated[i] = newArr;
                }

                x = decimated;
                sr = 16000;
            }

            if (Array.IndexOf(SupportedRates, sr) < 0)
            {
                throw new ArgumentException($"Only supports sample rates {string.Join(", ", SupportedRates)} (or multiples of 16000)");
            }

            if (((float)sr) / x[0].Length > 31.25)
            {
                throw new ArgumentException("Input audio is too short");
            }

            return new ValidationResult(x, sr);
        }

        /// <summary>Joins the rows of <paramref name="a"/> and <paramref name="b"/> pairwise (a[i] followed by b[i]).</summary>
        private static float[][] Concatenate(float[][] a, float[][] b)
        {
            if (a.Length != b.Length)
            {
                throw new ArgumentException("The number of rows in both arrays must be the same.");
            }

            int rows = a.Length;
            float[][] result = new float[rows][];

            for (int i = 0; i < rows; i++)
            {
                int lenA = a[i].Length;
                int lenB = b[i].Length;
                float[] row = new float[lenA + lenB];
                Buffer.BlockCopy(a[i], 0, row, 0, lenA * sizeof(float));
                Buffer.BlockCopy(b[i], 0, row, lenA * sizeof(float), lenB * sizeof(float));
                result[i] = row;
            }

            return result;
        }

        /// <summary>Copies the last <paramref name="contextSize"/> values of every row into new arrays.</summary>
        private static float[][] GetLastColumns(float[][] array, int contextSize)
        {
            int rows = array.Length;
            int cols = array[0].Length;

            if (contextSize > cols)
            {
                throw new ArgumentException("contextSize cannot be greater than the number of columns in the array.");
            }

            float[][] result = new float[rows][];

            for (int i = 0; i < rows; i++)
            {
                float[] row = new float[contextSize];
                Buffer.BlockCopy(array[i], (cols - contextSize) * sizeof(float), row, 0, contextSize * sizeof(float));
                result[i] = row;
            }

            return result;
        }

        /// <summary>
        /// Runs the model on one chunk per row and returns the flattened "output" tensor.
        /// </summary>
        /// <param name="x">Audio rows; each must hold 512 samples at 16000 Hz or 256 at 8000 Hz (after validation).</param>
        /// <param name="sr">Sample rate of <paramref name="x"/>: 8000, 16000 or a multiple of 16000.</param>
        /// <returns>The values of the model's "output" tensor.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="x"/> is <c>null</c>.</exception>
        /// <exception cref="ArgumentException">The input is empty, has a null row, or fails validation.</exception>
        public float[] Call(float[][] x, int sr)
        {
            ArgumentNullException.ThrowIfNull(x);

            if (x.Length == 0 || Array.Exists(x, row => row is null))
            {
                throw new ArgumentException("Audio data must contain at least one row and no null rows.", nameof(x));
            }

            var validation = ValidateInput(x, sr);
            x = validation.X;
            sr = validation.Sr;

            int numberSamples = (sr == 16000) ? 512 : 256;

            // Every row is flattened with the first row's width below, so all rows must match.
            foreach (float[] row in x)
            {
                if (row.Length != numberSamples)
                {
                    throw new ArgumentException($"Provided number of samples is {row.Length} (Supported values: 256 for 8000 sample rate, 512 for 16000)");
                }
            }

            int batchSize = x.Length;
            int contextSize = (sr == 16000) ? 64 : 32;

            // A change of rate or batch size invalidates the carried state; size the fresh
            // state for the batch that is about to run.
            if (lastBatchSize == 0 || lastSr != sr || lastBatchSize != batchSize)
            {
                ResetStates(batchSize);
            }

            if (context.Length != batchSize)
            {
                context = new float[batchSize][];

                for (int i = 0; i < batchSize; i++)
                {
                    context[i] = new float[contextSize];
                }
            }

            x = Concatenate(context, x);
            int rows = x.Length;
            int cols = x[0].Length;

            float[] inputData = new float[rows * cols];

            for (int i = 0; i < rows; i++)
            {
                Array.Copy(x[i], 0, inputData, i * cols, cols);
            }

            var inputTensor = new DenseTensor<float>(inputData, new[] { rows, cols });
            var srTensor = new DenseTensor<long>(new long[] { sr }, new[] { 1 });

            int stateDim0 = state.Length;
            int stateDim1 = state[0].Length;
            int stateDim2 = state[0][0].Length;

            float[] stateData = new float[stateDim0 * stateDim1 * stateDim2];
            int index = 0;

            for (int i = 0; i < stateDim0; i++)
            {
                for (int j = 0; j < stateDim1; j++)
                {
                    Array.Copy(state[i][j], 0, stateData, index, stateDim2);
                    index += stateDim2;
                }
            }

            var stateTensor = new DenseTensor<float>(stateData, new[] { stateDim0, stateDim1, stateDim2 });

            var inputs = new List<NamedOnnxValue>
            {
                NamedOnnxValue.CreateFromTensor("input", inputTensor),
                NamedOnnxValue.CreateFromTensor("sr", srTensor),
                NamedOnnxValue.CreateFromTensor("state", stateTensor)
            };

            using var outputs = session.Run(inputs);
            var outputTensor = outputs.First(o => o.Name == "output").AsTensor<float>();
            var newStateTensor = outputs.First(o => o.Name == "stateN").AsTensor<float>();

            // Remember the tail of this chunk so it can be prepended to the next one.
            context = GetLastColumns(x, contextSize);
            lastSr = sr;
            lastBatchSize = batchSize;

            // Unflatten the returned state into the jagged layout used by this class.
            var dims = newStateTensor.Dimensions;
            int d0 = dims[0], d1 = dims[1], d2 = dims[2];
            state = new float[d0][][];
            float[] newStateFlat = newStateTensor.ToArray();
            index = 0;

            for (int i = 0; i < d0; i++)
            {
                state[i] = new float[d1][];

                for (int j = 0; j < d1; j++)
                {
                    float[] row = new float[d2];
                    Array.Copy(newStateFlat, index, row, 0, d2);
                    state[i][j] = row;
                    index += d2;
                }
            }

            return outputTensor.ToArray();
        }
    }
}
--------------------------------------------------------------------------------
/VadSharp/VadDetector.cs:
--------------------------------------------------------------------------------
1 | using System.Reflection.PortableExecutable;
2 | using System.Security.Cryptography;
3 | using NAudio.Wave;
4 | using NAudio.Wave.SampleProviders;
5 |
6 | namespace VadSharp
7 | {
8 | public class VadDetector
9 | {
10 | private readonly VadOnnxModel _model;
11 | private readonly float _threshold;
12 | private readonly float _negThreshold;
13 | private readonly int _samplingRate;
14 | private readonly int _windowSizeSample;
15 | private readonly float _minSpeechSamples;
16 | private readonly float _speechPadSamples;
17 | private readonly float _maxSpeechSamples;
18 | private readonly float _minSilenceSamples;
19 | private readonly float _minSilenceSamplesAtMaxSpeech;
20 | private int _audioLengthSamples;
21 | private const float THRESHOLD_GAP = 0.15f;
22 | private const int SAMPLING_RATE_8K = 8000;
23 | private const int SAMPLING_RATE_16K = 16000;
24 | private const string _modelHash = "47d6ceb95435caf8049e0ea17a4dd95580e8a5950976ec770962c5534e64f2ea71ce196957102104fe014a2bdfa766f323ab5e46b6beb1fc46db129622298913";
25 |
/// <summary>
/// Creates a detector: validates the sampling rate, verifies the model file against the
/// expected SHA-512 hash, loads the model and converts the durations into sample counts.
/// </summary>
/// <param name="onnxModelPath">Path of the ONNX model file.</param>
/// <param name="threshold">Speech probability threshold; the negative threshold is this value minus <c>THRESHOLD_GAP</c>.</param>
/// <param name="samplingRate">Sampling rate the detector works at: 8000 or 16000.</param>
/// <param name="minSpeechDurationMs">Minimum speech duration, in milliseconds.</param>
/// <param name="maxSpeechDurationSeconds">Maximum speech duration, in seconds.</param>
/// <param name="minSilenceDurationMs">Minimum silence duration, in milliseconds.</param>
/// <param name="speechPadMs">Speech padding, in milliseconds.</param>
/// <param name="useDirectML">Forwarded to <see cref="VadOnnxModel"/>.</param>
/// <exception cref="ArgumentException">The sampling rate is unsupported or the model hash does not match.</exception>
public VadDetector(string onnxModelPath, float threshold, int samplingRate, int minSpeechDurationMs, float maxSpeechDurationSeconds, int minSilenceDurationMs, int speechPadMs, bool useDirectML = true)
{
    if (samplingRate != SAMPLING_RATE_8K && samplingRate != SAMPLING_RATE_16K)
    {
        throw new ArgumentException("Sampling rate not supported, only available for [8000, 16000]");
    }

    string hashString;

    // Dispose the hash algorithm deterministically instead of leaving it to the finalizer.
    using (SHA512 sha2_512 = SHA512.Create())
    {
        byte[] loadedModelBytes = File.ReadAllBytes(onnxModelPath);
        byte[] hashedModel = sha2_512.ComputeHash(loadedModelBytes);
        hashString = BitConverter.ToString(hashedModel).Replace("-", "").ToLowerInvariant();
    }

    if (!string.Equals(hashString, _modelHash, StringComparison.Ordinal))
    {
        throw new ArgumentException("Model not supported");
    }

    _model = new VadOnnxModel(onnxModelPath, useDirectML);
    _samplingRate = samplingRate;
    _threshold = threshold;
    _negThreshold = threshold - THRESHOLD_GAP;
    _windowSizeSample = samplingRate == SAMPLING_RATE_16K ? 512 : 256;

    // Multiply as long so that long durations cannot overflow Int32 before the division.
    _minSpeechSamples = (long)samplingRate * minSpeechDurationMs / 1000f;
    _speechPadSamples = (long)samplingRate * speechPadMs / 1000f;
    _maxSpeechSamples = samplingRate * maxSpeechDurationSeconds - _windowSizeSample - 2 * _speechPadSamples;
    _minSilenceSamples = (long)samplingRate * minSilenceDurationMs / 1000f;
    _minSilenceSamplesAtMaxSpeech = samplingRate * 98 / 1000f;
    Reset();
}
55 |
56 | public void Reset() => _model.ResetStates();
57 |
58 | public List GetSpeechSegmentList(string wavFile) => GetSpeechSegmentList(new FileInfo(wavFile));
59 |
60 | public List GetSpeechSegmentList(FileInfo wavFile)
61 | {
62 | Reset();
63 | using var reader = new AudioFileReader(wavFile.FullName);
64 | var resampler = new WdlResamplingSampleProvider(reader, _samplingRate);
65 | var speechProbList = new List();
66 | _audioLengthSamples = (int)(reader.Length / 2);
67 | int window = _windowSizeSample;
68 | float[] buffer = new float[window];
69 |
70 | while (resampler.Read(buffer, 0, window) > 0)
71 | {
72 | float speechProb = _model.Call(new[] { buffer }, _samplingRate)[0];
73 | speechProbList.Add(speechProb);
74 | }
75 |
76 | return CalculateProb(speechProbList);
77 | }
78 |
79 | public List GetSpeechSegmentList(float[] audioBuffer, int originalSampleRate)
80 | {
81 | audioBuffer = ResampleAudioBuffer(audioBuffer, originalSampleRate, 16000);
82 | Reset();
83 |
84 | var speechProbList = new List();
85 | int totalSamples = audioBuffer.Length;
86 | _audioLengthSamples = totalSamples;
87 | int window = _windowSizeSample;
88 |
89 | for (int i = 0; i < totalSamples; i += window)
90 | {
91 | float[] buffer = new float[window];
92 | int remaining = totalSamples - i;
93 |
94 | if (remaining >= window)
95 | {
96 | Array.Copy(audioBuffer, i, buffer, 0, window);
97 | }
98 | else
99 | {
100 | Array.Copy(audioBuffer, i, buffer, 0, remaining);
101 | for (int j = remaining; j < window; j++)
102 | {
103 | buffer[j] = 0f;
104 | }
105 | }
106 |
107 | float speechProb = _model.Call(new[] { buffer }, _samplingRate)[0];
108 | speechProbList.Add(speechProb);
109 | }
110 |
111 | return CalculateProb(speechProbList);
112 | }
113 |
114 | private float[] ResampleAudioBuffer(float[] audioBuffer, int originalRate, int targetRate)
115 | {
116 | if (originalRate == targetRate)
117 | {
118 | return audioBuffer;
119 | }
120 |
121 | double ratio = (double)targetRate / originalRate;
122 | int newLength = (int)(audioBuffer.Length * ratio);
123 | float[] resampledBuffer = new float[newLength];
124 |
125 | for (int i = 0; i < newLength; i++)
126 | {
127 | double origPos = i / ratio;
128 | int index = (int)Math.Floor(origPos);
129 | double frac = origPos - index;
130 |
131 | if (index + 1 < audioBuffer.Length)
132 | {
133 | resampledBuffer[i] = (float)(audioBuffer[index] * (1 - frac) + audioBuffer[index + 1] * frac);
134 | }
135 | else
136 | {
137 | resampledBuffer[i] = audioBuffer[index];
138 | }
139 | }
140 |
141 | return resampledBuffer;
142 | }
143 |
144 | private List CalculateProb(List speechProbList)
145 | {
146 | var result = new List();
147 | bool triggered = false;
148 | int tempEnd = 0, prevEnd = 0, nextStart = 0;
149 | var segment = new VadSpeechSegment();
150 | int window = _windowSizeSample;
151 |
152 | for (int i = 0, count = speechProbList.Count; i < count; i++)
153 | {
154 | float prob = speechProbList[i];
155 | int currentOffset = window * i;
156 |
157 | if (prob >= _threshold && tempEnd != 0)
158 | {
159 | tempEnd = 0;
160 |
161 | if (nextStart < prevEnd)
162 | {
163 | nextStart = currentOffset;
164 | }
165 | }
166 |
167 | if (prob >= _threshold && !triggered)
168 | {
169 | triggered = true;
170 | segment.StartOffset = currentOffset;
171 | continue;
172 | }
173 |
174 | if (triggered && currentOffset - segment.StartOffset > _maxSpeechSamples)
175 | {
176 | if (prevEnd != 0)
177 | {
178 | segment.EndOffset = prevEnd;
179 | result.Add(segment);
180 | segment = new VadSpeechSegment();
181 |
182 | if (nextStart < prevEnd)
183 | {
184 | triggered = false;
185 | }
186 | else
187 | {
188 | segment.StartOffset = nextStart;
189 | }
190 |
191 | prevEnd = nextStart = tempEnd = 0;
192 | }
193 | else
194 | {
195 | segment.EndOffset = currentOffset;
196 | result.Add(segment);
197 | segment = new VadSpeechSegment();
198 | prevEnd = nextStart = tempEnd = 0;
199 | triggered = false;
200 | continue;
201 | }
202 | }
203 |
204 | if (prob < _negThreshold && triggered)
205 | {
206 | if (tempEnd == 0)
207 | {
208 | tempEnd = currentOffset;
209 | }
210 |
211 | if (currentOffset - tempEnd > _minSilenceSamplesAtMaxSpeech)
212 | {
213 | prevEnd = tempEnd;
214 | }
215 |
216 | if (currentOffset - tempEnd < _minSilenceSamples)
217 | {
218 | continue;
219 | }
220 | else
221 | {
222 | segment.EndOffset = tempEnd;
223 |
224 | if ((segment.EndOffset - segment.StartOffset) > _minSpeechSamples)
225 | {
226 | result.Add(segment);
227 | }
228 |
229 | segment = new VadSpeechSegment();
230 | prevEnd = nextStart = tempEnd = 0;
231 | triggered = false;
232 | continue;
233 | }
234 | }
235 | }
236 |
237 | if (segment.StartOffset != null && (_audioLengthSamples - segment.StartOffset) > _minSpeechSamples)
238 | {
239 | segment.EndOffset = _audioLengthSamples;
240 | result.Add(segment);
241 | }
242 |
243 | for (int i = 0, count = result.Count; i < count; i++)
244 | {
245 | var item = result[i];
246 |
247 | if (i == 0)
248 | {
249 | item.StartOffset = Math.Max(0, item.StartOffset.Value - (int)_speechPadSamples);
250 | }
251 |
252 | if (i != count - 1)
253 | {
254 | var nextItem = result[i + 1];
255 | int silenceDuration = nextItem.StartOffset.Value - item.EndOffset.Value;
256 |
257 | if (silenceDuration < 2 * _speechPadSamples)
258 | {
259 | int halfSilence = silenceDuration / 2;
260 | item.EndOffset += halfSilence;
261 | nextItem.StartOffset = Math.Max(0, nextItem.StartOffset.Value - halfSilence);
262 | }
263 | else
264 | {
265 | item.EndOffset = Math.Min(_audioLengthSamples, item.EndOffset.Value + (int)_speechPadSamples);
266 | nextItem.StartOffset = Math.Max(0, nextItem.StartOffset.Value - (int)_speechPadSamples);
267 | }
268 | }
269 | else
270 | {
271 | item.EndOffset = Math.Min(_audioLengthSamples, item.EndOffset.Value + (int)_speechPadSamples);
272 | }
273 | }
274 |
275 | return MergeListAndCalculateSecond(result, _samplingRate);
276 | }
277 |
278 | private List MergeListAndCalculateSecond(List segments, int samplingRate)
279 | {
280 | var merged = new List();
281 |
282 | if (segments == null || segments.Count == 0)
283 | {
284 | return merged;
285 | }
286 |
287 | segments.Sort((a, b) => a.StartOffset.Value.CompareTo(b.StartOffset.Value));
288 | int left = segments[0].StartOffset.Value;
289 | int right = segments[0].EndOffset.Value;
290 |
291 | for (int i = 1, count = segments.Count; i < count; i++)
292 | {
293 | var seg = segments[i];
294 |
295 | if (seg.StartOffset > right)
296 | {
297 | merged.Add(new VadSpeechSegment(left, right, CalculateSecondByOffset(left, samplingRate), CalculateSecondByOffset(right, samplingRate)));
298 | left = seg.StartOffset.Value;
299 | right = seg.EndOffset.Value;
300 | }
301 | else
302 | {
303 | right = Math.Max(right, seg.EndOffset.Value);
304 | }
305 | }
306 |
307 | merged.Add(new VadSpeechSegment(left, right, CalculateSecondByOffset(left, samplingRate), CalculateSecondByOffset(right, samplingRate)));
308 | return merged;
309 | }
310 |
311 | private float CalculateSecondByOffset(int offset, int samplingRate)
312 | {
313 | float seconds = offset / (float)samplingRate;
314 | return (float)(Math.Floor(seconds * 1000.0f) / 1000.0f);
315 | }
316 | }
317 | }
--------------------------------------------------------------------------------