├── .github
│   └── workflows
│       └── ci.yml
├── .gitignore
├── CHANGELOG.md
├── Cargo.toml
├── LICENSE
├── README.md
├── crates
│   ├── vosk-sys
│   │   ├── CHANGELOG.md
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   └── src
│   │       └── lib.rs
│   └── vosk
│       ├── CHANGELOG.md
│       ├── Cargo.toml
│       ├── README.md
│       ├── examples
│       │   ├── grammar.rs
│       │   ├── microphone.rs
│       │   ├── read_wav.rs
│       │   └── speaker_model.rs
│       └── src
│           ├── gpu.rs
│           ├── lib.rs
│           ├── log.rs
│           ├── models
│           │   ├── batch.rs
│           │   ├── mod.rs
│           │   └── sequential.rs
│           └── recognition
│               ├── batch.rs
│               ├── errors.rs
│               ├── mod.rs
│               ├── results.rs
│               └── sequential.rs
├── flake.lock
├── flake.nix
└── nix
    └── shells
        └── default
            └── default.nix

--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 | 
3 | on:
4 |   push:
5 |     branches:
6 |       - main
7 |   pull_request:
8 |     branches:
9 |       - main
10 | 
11 | env:
12 |   CARGO_TERM_COLOR: always
13 |   NIX_DEV: nix develop --command
14 |   CARGO_FLAGS: --all-targets --all-features
15 | 
16 | jobs:
17 |   rustfmt:
18 |     runs-on: ubuntu-latest
19 |     steps:
20 |       - name: Checkout repository
21 |         uses: actions/checkout@v4
22 |       - name: Install Nix
23 |         uses: DeterminateSystems/nix-installer-action@v14
24 |       - name: Cache Nix store
25 |         uses: DeterminateSystems/magic-nix-cache-action@v8
26 |       - name: Run rustfmt
27 |         run: $NIX_DEV cargo fmt --all -- --check
28 | 
29 |   clippy:
30 |     runs-on: ubuntu-latest
31 |     steps:
32 |       - name: Checkout repository
33 |         uses: actions/checkout@v4
34 |       - name: Install Nix
35 |         uses: DeterminateSystems/nix-installer-action@v14
36 |       - name: Cache Nix store
37 |         uses: DeterminateSystems/magic-nix-cache-action@v8
38 |       - name: Run clippy
39 |         run: $NIX_DEV cargo clippy --workspace $CARGO_FLAGS -- -D warnings
40 | 
41 |   build:
42 |     runs-on: ubuntu-latest
43 |     steps:
44 |       - name: Checkout repository
45 |         uses: actions/checkout@v4
46 |       - name: Install Nix
47 |         uses: DeterminateSystems/nix-installer-action@v14
48 |       - name: Cache Nix store
49 |         uses: DeterminateSystems/magic-nix-cache-action@v8
50 |       - name: Build
51 |         run: $NIX_DEV cargo build $CARGO_FLAGS
52 |       - name: Run tests
53 |         run: $NIX_DEV cargo test $CARGO_FLAGS
54 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | /Cargo.lock
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # 0.3.1
2 | * Fix flag-enabled items not showing up on [docs.rs](https://docs.rs/vosk/0.3.0/vosk/index.html).
3 | 
4 | # 0.3.0
5 | * Add support for Batch recognition ([PR](https://github.com/Bear-03/vosk-rs/pull/8)).
6 | * [BREAKING] Redesign `LogLevel` to adequately represent Kaldi log levels ([PR](https://github.com/Bear-03/vosk-rs/pull/9)).
7 | * [BREAKING] `Recognizer::accept_waveform` methods now return `Result<T, AcceptWaveformError>` (previously `T`).
8 |   Vosk takes the buffer length as an `i32` so the user should be able to handle errors that arise due to the
9 |   buffer being longer than `i32::MAX`.
10 | * [BREAKING] `Model::find_word` now returns `Option<u32>` (previously `Option<i32>`) to adjust it to the values
11 |   that Vosk can return.
12 | 
13 | # 0.2.0
14 | * Documentation fixes.
15 | * Loosen bounds for Recognizer::new_with_grammar.
16 | * [BREAKING] Extra double quotes are no longer needed for phrases in `Recognizer::new_with_grammar`.
17 | 
18 | # 0.1.0
19 | * First release. 
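To make the 0.3.0 `accept_waveform` change above concrete, here is a minimal sketch (the sample buffer is a hypothetical stand-in):

```rust
use vosk::{DecodingState, Model, Recognizer};

fn main() {
    let model = Model::new("/path/to/model").expect("Could not create the model");
    let mut recognizer =
        Recognizer::new(&model, 16000.0).expect("Could not create the recognizer");

    let samples: Vec<i16> = vec![0; 8000]; // stand-in audio
    // Since 0.3.0 this returns Result<DecodingState, AcceptWaveformError>,
    // because Vosk takes the buffer length as an i32: a buffer longer than
    // i32::MAX is reported as an error instead of being silently truncated.
    match recognizer.accept_waveform(&samples) {
        Ok(DecodingState::Finalized) => println!("{:#?}", recognizer.result()),
        Ok(_) => println!("{:#?}", recognizer.partial_result()),
        Err(e) => eprintln!("{e}"),
    }
}
```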
20 | 21 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "crates/*" 4 | ] 5 | resolver = "2" 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Bear_03 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Vosk 2 | 3 | [![Latest release](https://img.shields.io/crates/v/vosk.svg)](https://crates.io/crates/vosk) 4 | [![Documentation](https://docs.rs/vosk/badge.svg)](https://docs.rs/vosk) 5 | [![MIT](https://img.shields.io/github/license/Bear-03/vosk-rs)](https://github.com/Bear-03/vosk-rs) 6 | [![Build Status](https://github.com/Bear-03/vosk-rs/workflows/CI/badge.svg)](https://github.com/Bear-03/vosk-rs/actions?workflow=CI) 7 | 8 | Safe FFI bindings around the [Vosk API Speech Recognition Toolkit](https://github.com/alphacep/vosk-api). 9 | 10 | ## Usage 11 | ```rust 12 | // Simplified version of examples/read_wav.rs 13 | 14 | // Normally you would not want to hardcode the audio samples 15 | let samples = vec![100, -2, 700, 30, 4, 5]; 16 | let model_path = "/path/to/model"; 17 | 18 | let model = Model::new(model_path).unwrap(); 19 | let mut recognizer = Recognizer::new(&model, 16000.0).unwrap(); 20 | 21 | recognizer.set_max_alternatives(10); 22 | recognizer.set_words(true); 23 | recognizer.set_partial_words(true); 24 | 25 | for sample in samples.chunks(100) { 26 | recognizer.accept_waveform(sample); 27 | println!("{:#?}", recognizer.partial_result()); 28 | } 29 | 30 | println!("{:#?}", recognizer.final_result().multiple().unwrap()); 31 | ``` 32 | 33 | ## Setup 34 | 35 | ### Compilation 36 | 37 | The Vosk-API libraries have to be discoverable by the rust linker. Download the zip file containing the dynamic libraries for your platform [here](https://github.com/alphacep/vosk-api/releases). For iOS development you have to use static libraries. Get the static libraries from the [vosk-api][vosk-api-ios] team. 
38 | 
39 | #### Using dynamic libraries
40 | Do either of the following:
41 | 
42 | - **Recommended:** Create a [build script][build-script-explanation] and provide cargo with the path to the libraries
43 |   with `cargo:rustc-link-search` or `cargo:rustc-link-lib` (see the sketch at the end of this README).
44 | - Use the [`RUSTFLAGS` environment variable][rust-env-variables] to provide the path to the libraries like so:
45 |   `RUSTFLAGS=-L/path/to/the/libraries`
46 | - Make the vosk library accessible system or user-wide:
47 |     - Windows: Move the libraries to a directory in your `PATH` environment variable.
48 |     - Linux: Move them to `/usr/local/lib`, `/usr/lib` or set the `LIBRARY_PATH` environment variable to the directory containing the libraries.
49 | 
50 | Although the approaches are equivalent, using a build script is more convenient because it does not require
51 | the developer to remember a terminal command or change anything outside the project scope.
52 | 
53 | #### Using static libraries (macOS-only, targeting iOS)
54 | 
55 | - [Extract](https://llvm.org/docs/CommandGuide/llvm-lipo.html) the correct non-fat file (also called a thin file) from the static fat file (libvosk.a) for each architecture you would like to support.
56 | - [Mark your crate type as](https://doc.rust-lang.org/cargo/reference/cargo-targets.html#the-crate-type-field) `staticlib`.
57 | - Create a [build script][build-script-explanation] and provide cargo with the path to the libraries with `cargo:rustc-link-search=` and `cargo:rustc-link-lib=static=`.
58 | 
59 | ##### Troubleshooting
60 | In real-world scenarios, one will use Rust to cross-compile a library (e.g. for Android and iOS). Therefore, we need both `cdylib` and `staticlib` as crate types. If you compile as usual with cargo build (e.g. `cargo build --target aarch64-apple-ios --release`) it will not work, because cargo tries to build the dylib as well. Fortunately, since Rust 1.64, there is a new option for [rustc](https://github.com/rust-lang/cargo/issues/10083) in the stable channel. Because of this, the following will work: `cargo rustc --crate-type staticlib --lib --target aarch64-apple-ios --release`
61 | 
62 | ### Execution
63 | Executables compiled with a dynamic library must have access to the vosk library at runtime. Executables compiled with a statically linked library do not.
64 | 
65 | #### Using dynamic libraries
66 | Do either of the following:
67 | 
68 | - **Recommended:** Copy the libraries to the root of the executable
69 |   (`target/` by default). It is recommended that you use a tool such as
70 |   [cargo-make](https://sagiegurari.github.io/cargo-make/) to automate moving the libraries
71 |   from another, more practical, directory to the destination during build.
72 | - Make the vosk library accessible system or user-wide:
73 |     - Windows: Move the libraries to a directory in your `PATH` environment variable.
74 |     - Linux: Move them to `/usr/local/lib`, `/usr/lib` or set the `LD_LIBRARY_PATH` environment variable to the directory containing the libraries. Note: `LD_LIBRARY_PATH` is not the same as `LIBRARY_PATH` mentioned in the compilation step.
75 | 
76 | 
77 | #### Using static libraries (iOS-only)
78 | 
79 | - Add the compiled .a library (or libraries if you would like to support more than one architecture) to your iOS project
80 | - Set `Enable Bitcode` to **no** for your target
81 | - Add the `Accelerate Framework` from the iOS SDK to your project
82 | - Depending on your library and use case, you have to write some C -> Objective-C -> Swift glue code. 
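For reference, the build-script route recommended in the compilation section might look like the following minimal sketch. The `libs/` directory is an assumption; point it at wherever you unpacked the libraries:

```rust
// build.rs (at the crate root) — a sketch, not an official setup.
fn main() {
    // Hypothetical layout: <crate root>/libs contains the vosk library.
    let manifest_dir = std::env::var("CARGO_MANIFEST_DIR").unwrap();
    println!("cargo:rustc-link-search={manifest_dir}/libs");
    // Linking against `vosk` itself is already declared by the vosk-sys
    // bindings (`#[link(name = "vosk")]`), so a rustc-link-lib line is
    // usually only needed for the static iOS setup:
    // println!("cargo:rustc-link-lib=static=vosk");
}
```

This is equivalent to passing `RUSTFLAGS=-L/path/to/the/libraries`, but it lives inside the project scope.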
83 | 84 | [build-script-explanation]: https://doc.rust-lang.org/cargo/reference/build-scripts.html 85 | [rust-env-variables]: https://doc.rust-lang.org/cargo/reference/environment-variables.html 86 | [vosk-api-ios]: https://alphacephei.com/vosk/install#ios-build 87 | -------------------------------------------------------------------------------- /crates/vosk-sys/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 0.2.0 2 | * Add support for Batch recognition ([PR](https://github.com/Bear-03/vosk-rs/pull/8)). 3 | 4 | # 0.1.1 5 | * Documentation fixes. 6 | 7 | # 0.1.0 8 | * First release. -------------------------------------------------------------------------------- /crates/vosk-sys/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "vosk-sys" 3 | version = "0.2.0" 4 | edition = "2021" 5 | authors = ["Bear_03"] 6 | description = "Raw FFI bindings around the Vosk API Speech Recognition Toolkit" 7 | license = "MIT" 8 | repository = "https://github.com/Bear-03/vosk-rs" 9 | keywords = ["speech", "speech-to-text", "stt"] 10 | categories = ["api-bindings", "multimedia::audio"] 11 | 12 | [dependencies] 13 | -------------------------------------------------------------------------------- /crates/vosk-sys/README.md: -------------------------------------------------------------------------------- 1 | # Vosk-sys 2 | 3 | [![Latest release](https://img.shields.io/crates/v/vosk-sys.svg)](https://crates.io/crates/vosk-sys) 4 | [![Documentation](https://docs.rs/vosk-sys/badge.svg)](https://docs.rs/vosk-sys) 5 | [![MIT](https://img.shields.io/github/license/Bear-03/vosk-rs)](https://github.com/Bear-03/vosk-rs) 6 | [![Build Status](https://github.com/Bear-03/vosk-rs/workflows/CI/badge.svg)](https://github.com/Bear-03/vosk-rs/actions?workflow=CI) 7 | 8 | Raw FFI bindings around the [Vosk API Speech Recognition Toolkit](https://github.com/alphacep/vosk-api), 9 | autogenerated via [rust-bindgen](https://github.com/rust-lang/rust-bindgen). 10 | 11 | ## Setup and usage 12 | 13 | This crate needs the same setup as its safe counterpart. Read the steps [here](../../README.md). 
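Since the bindings are raw, every call is `unsafe` and results come back as JSON in C strings. A minimal end-to-end sketch using only functions declared in `src/lib.rs` (the model path and the silent samples are placeholders):

```rust
use std::ffi::{CStr, CString};
use vosk_sys::*;

fn main() {
    let model_path = CString::new("/path/to/model").unwrap();
    unsafe {
        let model = vosk_model_new(model_path.as_ptr());
        assert!(!model.is_null(), "the model could not be loaded");
        let recognizer = vosk_recognizer_new(model, 16000.0);

        // Feed PCM 16-bit mono audio; zeros here as stand-in data.
        let samples = vec![0i16; 16000];
        vosk_recognizer_accept_waveform_s(recognizer, samples.as_ptr(), samples.len() as i32);

        // The returned JSON string is owned by the recognizer; copy it out
        // before freeing.
        let result = CStr::from_ptr(vosk_recognizer_final_result(recognizer));
        println!("{}", result.to_string_lossy());

        vosk_recognizer_free(recognizer);
        vosk_model_free(model);
    }
}
```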
14 | 15 | -------------------------------------------------------------------------------- /crates/vosk-sys/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* automatically generated by rust-bindgen 0.60.1 */ 2 | 3 | #![allow(non_snake_case)] 4 | #![allow(non_camel_case_types)] 5 | #![allow(non_upper_case_globals)] 6 | 7 | #[repr(C)] 8 | #[derive(Debug, Copy, Clone)] 9 | pub struct VoskModel { 10 | _unused: [u8; 0], 11 | } 12 | 13 | #[repr(C)] 14 | #[derive(Debug, Copy, Clone)] 15 | pub struct VoskSpkModel { 16 | _unused: [u8; 0], 17 | } 18 | 19 | #[repr(C)] 20 | #[derive(Debug, Copy, Clone)] 21 | pub struct VoskRecognizer { 22 | _unused: [u8; 0], 23 | } 24 | 25 | #[repr(C)] 26 | #[derive(Debug, Copy, Clone)] 27 | pub struct VoskBatchModel { 28 | _unused: [u8; 0], 29 | } 30 | 31 | #[repr(C)] 32 | #[derive(Debug, Copy, Clone)] 33 | pub struct VoskBatchRecognizer { 34 | _unused: [u8; 0], 35 | } 36 | 37 | #[cfg_attr(not(target_os = "windows"), link(name = "vosk"))] 38 | #[cfg_attr(target_os = "windows", link(name = "libvosk"))] 39 | extern "C" { 40 | #[doc = " Loads model data from the file and returns the model object"] 41 | #[doc = ""] 42 | #[doc = " @param model_path: the path of the model on the filesystem"] 43 | #[doc = " @returns model object or NULL if problem occured"] 44 | pub fn vosk_model_new(model_path: *const ::std::os::raw::c_char) -> *mut VoskModel; 45 | 46 | #[doc = " Releases the model memory"] 47 | #[doc = ""] 48 | #[doc = " The model object is reference-counted so if some recognizer"] 49 | #[doc = " depends on this model, model might still stay alive. When"] 50 | #[doc = " last recognizer is released, model will be released too."] 51 | pub fn vosk_model_free(model: *mut VoskModel); 52 | 53 | #[doc = " Check if a word can be recognized by the model"] 54 | #[doc = " @param word: the word"] 55 | #[doc = " @returns the word symbol if @param word exists inside the model"] 56 | #[doc = " or -1 otherwise."] 57 | #[doc = " Reminding that word symbol 0 is for \\"] 58 | pub fn vosk_model_find_word( 59 | model: *mut VoskModel, 60 | word: *const ::std::os::raw::c_char, 61 | ) -> ::std::os::raw::c_int; 62 | 63 | #[doc = " Loads speaker model data from the file and returns the model object"] 64 | #[doc = ""] 65 | #[doc = " @param model_path: the path of the model on the filesystem"] 66 | #[doc = " @returns model object or NULL if problem occured"] 67 | pub fn vosk_spk_model_new(model_path: *const ::std::os::raw::c_char) -> *mut VoskSpkModel; 68 | 69 | #[doc = " Releases the model memory"] 70 | #[doc = ""] 71 | #[doc = " The model object is reference-counted so if some recognizer"] 72 | #[doc = " depends on this model, model might still stay alive. When"] 73 | #[doc = " last recognizer is released, model will be released too."] 74 | pub fn vosk_spk_model_free(model: *mut VoskSpkModel); 75 | 76 | #[doc = " Creates the recognizer object"] 77 | #[doc = ""] 78 | #[doc = " The recognizers process the speech and return text using shared model data"] 79 | #[doc = " @param model VoskModel containing static data for recognizer. 
Model can be"] 80 | #[doc = " shared across recognizers, even running in different threads."] 81 | #[doc = " @param sample_rate The sample rate of the audio you going to feed into the recognizer."] 82 | #[doc = " Make sure this rate matches the audio content, it is a common"] 83 | #[doc = " issue causing accuracy problems."] 84 | #[doc = " @returns recognizer object or NULL if problem occured"] 85 | pub fn vosk_recognizer_new(model: *mut VoskModel, sample_rate: f32) -> *mut VoskRecognizer; 86 | 87 | #[doc = " Creates the recognizer object with speaker recognition"] 88 | #[doc = ""] 89 | #[doc = " With the speaker recognition mode the recognizer not just recognize"] 90 | #[doc = " text but also return speaker vectors one can use for speaker identification"] 91 | #[doc = ""] 92 | #[doc = " @param model VoskModel containing static data for recognizer. Model can be"] 93 | #[doc = " shared across recognizers, even running in different threads."] 94 | #[doc = " @param sample_rate The sample rate of the audio you going to feed into the recognizer."] 95 | #[doc = " Make sure this rate matches the audio content, it is a common"] 96 | #[doc = " issue causing accuracy problems."] 97 | #[doc = " @param spk_model speaker model for speaker identification"] 98 | #[doc = " @returns recognizer object or NULL if problem occured"] 99 | pub fn vosk_recognizer_new_spk( 100 | model: *mut VoskModel, 101 | sample_rate: f32, 102 | spk_model: *mut VoskSpkModel, 103 | ) -> *mut VoskRecognizer; 104 | 105 | #[doc = " Creates the recognizer object with the phrase list"] 106 | #[doc = ""] 107 | #[doc = " Sometimes when you want to improve recognition accuracy and when you don't need"] 108 | #[doc = " to recognize large vocabulary you can specify a list of phrases to recognize. This"] 109 | #[doc = " will improve recognizer speed and accuracy but might return \\[unk\\] if user said"] 110 | #[doc = " something different."] 111 | #[doc = ""] 112 | #[doc = " Only recognizers with lookahead models support this type of quick configuration."] 113 | #[doc = " Precompiled HCLG graph models are not supported."] 114 | #[doc = ""] 115 | #[doc = " @param model VoskModel containing static data for recognizer. Model can be"] 116 | #[doc = " shared across recognizers, even running in different threads."] 117 | #[doc = " @param sample_rate The sample rate of the audio you going to feed into the recognizer."] 118 | #[doc = " Make sure this rate matches the audio content, it is a common"] 119 | #[doc = " issue causing accuracy problems."] 120 | #[doc = " @param grammar The string with the list of phrases to recognize as JSON array of strings,"] 121 | #[doc = " for example \"\\[\"one two three four five\", \"\\[unk\\]\"\\]\"."] 122 | #[doc = ""] 123 | #[doc = " @returns recognizer object or NULL if problem occured"] 124 | pub fn vosk_recognizer_new_grm( 125 | model: *mut VoskModel, 126 | sample_rate: f32, 127 | grammar: *const ::std::os::raw::c_char, 128 | ) -> *mut VoskRecognizer; 129 | 130 | #[doc = " Adds speaker model to already initialized recognizer"] 131 | #[doc = ""] 132 | #[doc = " Can add speaker recognition model to already created recognizer. 
Helps to initialize"]
133 |     #[doc = " speaker recognition for grammar-based recognizer."]
134 |     #[doc = ""]
135 |     #[doc = " @param spk_model Speaker recognition model"]
136 |     pub fn vosk_recognizer_set_spk_model(
137 |         recognizer: *mut VoskRecognizer,
138 |         spk_model: *mut VoskSpkModel,
139 |     );
140 | 
141 |     #[doc = " Configures recognizer to output n-best results"]
142 |     #[doc = ""]
143 |     #[doc = "<pre>"]
144 |     #[doc = "   {"]
145 |     #[doc = "      \"alternatives\": ["]
146 |     #[doc = "          { \"text\": \"one two three four five\", \"confidence\": 0.97 },"]
147 |     #[doc = "          { \"text\": \"one two three for five\", \"confidence\": 0.03 },"]
148 |     #[doc = "      ]"]
149 |     #[doc = "   }"]
150 |     #[doc = " </pre>"]
151 |     #[doc = ""]
152 |     #[doc = " @param max_alternatives - maximum alternatives to return from recognition results"]
153 |     pub fn vosk_recognizer_set_max_alternatives(
154 |         recognizer: *mut VoskRecognizer,
155 |         max_alternatives: ::std::os::raw::c_int,
156 |     );
157 | 
158 |     #[doc = " Enables words with times in the output"]
159 |     #[doc = ""]
160 |     #[doc = "<pre>"]
161 |     #[doc = "   \"result\" : [{"]
162 |     #[doc = "       \"conf\" : 1.000000,"]
163 |     #[doc = "       \"end\" : 1.110000,"]
164 |     #[doc = "       \"start\" : 0.870000,"]
165 |     #[doc = "       \"word\" : \"what\""]
166 |     #[doc = "     }, {"]
167 |     #[doc = "       \"conf\" : 1.000000,"]
168 |     #[doc = "       \"end\" : 1.530000,"]
169 |     #[doc = "       \"start\" : 1.110000,"]
170 |     #[doc = "       \"word\" : \"zero\""]
171 |     #[doc = "     }, {"]
172 |     #[doc = "       \"conf\" : 1.000000,"]
173 |     #[doc = "       \"end\" : 1.950000,"]
174 |     #[doc = "       \"start\" : 1.530000,"]
175 |     #[doc = "       \"word\" : \"zero\""]
176 |     #[doc = "     }, {"]
177 |     #[doc = "       \"conf\" : 1.000000,"]
178 |     #[doc = "       \"end\" : 2.340000,"]
179 |     #[doc = "       \"start\" : 1.950000,"]
180 |     #[doc = "       \"word\" : \"zero\""]
181 |     #[doc = "     }, {"]
182 |     #[doc = "       \"conf\" : 1.000000,"]
183 |     #[doc = "       \"end\" : 2.610000,"]
184 |     #[doc = "       \"start\" : 2.340000,"]
185 |     #[doc = "       \"word\" : \"one\""]
186 |     #[doc = "     }],"]
187 |     #[doc = " </pre>
"] 188 | #[doc = ""] 189 | #[doc = " @param words - boolean value"] 190 | pub fn vosk_recognizer_set_words(recognizer: *mut VoskRecognizer, words: ::std::os::raw::c_int); 191 | 192 | #[doc = " Like above return words and confidences in partial results"] 193 | #[doc = ""] 194 | #[doc = " @param partial_words - boolean value"] 195 | pub fn vosk_recognizer_set_partial_words( 196 | recognizer: *mut VoskRecognizer, 197 | partial_words: ::std::os::raw::c_int, 198 | ); 199 | 200 | #[doc = " Set NLSML output"] 201 | #[doc = " @param nlsml - boolean value"] 202 | pub fn vosk_recognizer_set_nlsml(recognizer: *mut VoskRecognizer, nlsml: ::std::os::raw::c_int); 203 | 204 | #[doc = " Accept voice data"] 205 | #[doc = ""] 206 | #[doc = " accept and process new chunk of voice data"] 207 | #[doc = ""] 208 | #[doc = " @param data - audio data in PCM 16-bit mono format"] 209 | #[doc = " @param length - length of the audio data"] 210 | #[doc = " @returns 1 if silence is occured and you can retrieve a new utterance with result method"] 211 | #[doc = " 0 if decoding continues"] 212 | #[doc = " -1 if exception occured"] 213 | pub fn vosk_recognizer_accept_waveform( 214 | recognizer: *mut VoskRecognizer, 215 | data: *const ::std::os::raw::c_char, 216 | length: ::std::os::raw::c_int, 217 | ) -> ::std::os::raw::c_int; 218 | 219 | #[doc = " Same as above but the version with the short data for language bindings where you have"] 220 | #[doc = " audio as array of shorts"] 221 | pub fn vosk_recognizer_accept_waveform_s( 222 | recognizer: *mut VoskRecognizer, 223 | data: *const ::std::os::raw::c_short, 224 | length: ::std::os::raw::c_int, 225 | ) -> ::std::os::raw::c_int; 226 | 227 | #[doc = " Same as above but the version with the float data for language bindings where you have"] 228 | #[doc = " audio as array of floats"] 229 | pub fn vosk_recognizer_accept_waveform_f( 230 | recognizer: *mut VoskRecognizer, 231 | data: *const f32, 232 | length: ::std::os::raw::c_int, 233 | ) -> ::std::os::raw::c_int; 234 | 235 | #[doc = " Returns speech recognition result"] 236 | #[doc = ""] 237 | #[doc = " @returns the result in JSON format which contains decoded line, decoded"] 238 | #[doc = " words, times in seconds and confidences. You can parse this result"] 239 | #[doc = " with any json parser"] 240 | #[doc = ""] 241 | #[doc = "
"]
242 |     #[doc = "  {"]
243 |     #[doc = "    \"text\" : \"what zero zero zero one\""]
244 |     #[doc = "  }"]
245 |     #[doc = " </pre>"]
246 |     #[doc = ""]
247 |     #[doc = " If alternatives enabled it returns result with alternatives, see also vosk_recognizer_set_alternatives()."]
248 |     #[doc = ""]
249 |     #[doc = " If word times enabled returns word time, see also vosk_recognizer_set_word_times()."]
250 |     pub fn vosk_recognizer_result(recognizer: *mut VoskRecognizer)
251 |         -> *const ::std::os::raw::c_char;
252 | 
253 |     #[doc = " Returns partial speech recognition"]
254 |     #[doc = ""]
255 |     #[doc = " @returns partial speech recognition text which is not yet finalized."]
256 |     #[doc = " result may change as recognizer process more data."]
257 |     #[doc = ""]
258 |     #[doc = "<pre>"]
259 |     #[doc = " {"]
260 |     #[doc = "    \"partial\" : \"cyril one eight zero\""]
261 |     #[doc = " }"]
262 |     #[doc = " </pre>
"] 263 | pub fn vosk_recognizer_partial_result( 264 | recognizer: *mut VoskRecognizer, 265 | ) -> *const ::std::os::raw::c_char; 266 | 267 | #[doc = " Returns speech recognition result. Same as result, but doesn't wait for silence"] 268 | #[doc = " You usually call it in the end of the stream to get final bits of audio. It"] 269 | #[doc = " flushes the feature pipeline, so all remaining audio chunks got processed."] 270 | #[doc = ""] 271 | #[doc = " @returns speech result in JSON format."] 272 | pub fn vosk_recognizer_final_result( 273 | recognizer: *mut VoskRecognizer, 274 | ) -> *const ::std::os::raw::c_char; 275 | 276 | #[doc = " Resets the recognizer"] 277 | #[doc = ""] 278 | #[doc = " Resets current results so the recognition can continue from scratch"] 279 | pub fn vosk_recognizer_reset(recognizer: *mut VoskRecognizer); 280 | 281 | #[doc = " Releases recognizer object"] 282 | #[doc = ""] 283 | #[doc = " Underlying model is also unreferenced and if needed released"] 284 | pub fn vosk_recognizer_free(recognizer: *mut VoskRecognizer); 285 | 286 | #[doc = " Set log level for Kaldi messages"] 287 | #[doc = ""] 288 | #[doc = " @param log_level the level"] 289 | #[doc = " 0 - default value to print info and error messages but no debug"] 290 | #[doc = " less than 0 - don't print info messages"] 291 | #[doc = " greather than 0 - more verbose mode"] 292 | pub fn vosk_set_log_level(log_level: ::std::os::raw::c_int); 293 | 294 | #[doc = " Init, automatically select a CUDA device and allow multithreading."] 295 | #[doc = " Must be called once from the main thread."] 296 | #[doc = " Has no effect if HAVE_CUDA flag is not set."] 297 | pub fn vosk_gpu_init(); 298 | 299 | #[doc = " Init CUDA device in a multi-threaded environment."] 300 | #[doc = " Must be called for each thread."] 301 | #[doc = " Has no effect if HAVE_CUDA flag is not set."] 302 | pub fn vosk_gpu_thread_init(); 303 | 304 | #[doc = " Creates the batch recognizer object"] 305 | #[doc = ""] 306 | #[doc = " @returns model object or NULL if problem occured"] 307 | pub fn vosk_batch_model_new(model_path: *const ::std::os::raw::c_char) -> *mut VoskBatchModel; 308 | 309 | #[doc = " Releases batch model object"] 310 | pub fn vosk_batch_model_free(model: *mut VoskBatchModel); 311 | 312 | #[doc = " Wait for the processing"] 313 | pub fn vosk_batch_model_wait(model: *mut VoskBatchModel); 314 | 315 | #[doc = " Creates batch recognizer object"] 316 | #[doc = " @returns recognizer object or NULL if problem occured"] 317 | pub fn vosk_batch_recognizer_new( 318 | model: *mut VoskBatchModel, 319 | sample_rate: f32, 320 | ) -> *mut VoskBatchRecognizer; 321 | 322 | #[doc = " Releases batch recognizer object"] 323 | pub fn vosk_batch_recognizer_free(recognizer: *mut VoskBatchRecognizer); 324 | 325 | #[doc = " Accept batch voice data"] 326 | pub fn vosk_batch_recognizer_accept_waveform( 327 | recognizer: *mut VoskBatchRecognizer, 328 | data: *const ::std::os::raw::c_char, 329 | length: ::std::os::raw::c_int, 330 | ); 331 | 332 | #[doc = " Set NLSML output"] 333 | #[doc = " @param nlsml - boolean value"] 334 | pub fn vosk_batch_recognizer_set_nlsml( 335 | recognizer: *mut VoskBatchRecognizer, 336 | nlsml: ::std::os::raw::c_int, 337 | ); 338 | 339 | #[doc = " Closes the stream"] 340 | pub fn vosk_batch_recognizer_finish_stream(recognizer: *mut VoskBatchRecognizer); 341 | 342 | #[doc = " Return results"] 343 | pub fn vosk_batch_recognizer_front_result( 344 | recognizer: *mut VoskBatchRecognizer, 345 | ) -> *const ::std::os::raw::c_char; 346 | 347 | #[doc = 
" Release and free first retrieved result"] 348 | pub fn vosk_batch_recognizer_pop(recognizer: *mut VoskBatchRecognizer); 349 | 350 | #[doc = " Get amount of pending chunks for more intelligent waiting"] 351 | pub fn vosk_batch_recognizer_get_pending_chunks( 352 | recognizer: *mut VoskBatchRecognizer, 353 | ) -> ::std::os::raw::c_int; 354 | } 355 | -------------------------------------------------------------------------------- /crates/vosk/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ../../CHANGELOG.md -------------------------------------------------------------------------------- /crates/vosk/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "vosk" 3 | version = "0.3.1" 4 | edition = "2021" 5 | authors = ["Bear_03"] 6 | description = "Safe wrapper around the Vosk API Speech Recognition Toolkit" 7 | license = "MIT" 8 | repository = "https://github.com/Bear-03/vosk-rs" 9 | keywords = ["speech", "speech-to-text", "stt"] 10 | categories = ["api-bindings", "multimedia::audio"] 11 | 12 | [dependencies] 13 | vosk-sys = { path = "../vosk-sys", version = "0.2" } 14 | serde_json = "1.0" 15 | serde = { version = "1.0", features = ["derive"] } 16 | thiserror = "1.0" 17 | 18 | [dev-dependencies] 19 | # Dependencies for examples 20 | cpal = "0.15" 21 | dasp = "0.11" 22 | hound = "3.5" 23 | 24 | [features] 25 | batch = [] 26 | 27 | [package.metadata.docs.rs] 28 | all-features = true 29 | cargo-args = ["-Zunstable-options", "-Zrustdoc-scrape-examples"] 30 | rustdoc-args = ["--cfg", "docsrs"] 31 | -------------------------------------------------------------------------------- /crates/vosk/README.md: -------------------------------------------------------------------------------- 1 | ../../README.md -------------------------------------------------------------------------------- /crates/vosk/examples/grammar.rs: -------------------------------------------------------------------------------- 1 | //! Run with: 2 | //! cargo run --example grammar 3 | //! e.g. "cargo run --example grammar /home/user/stt/model /home/user/stt/test.wav" 4 | //! (The WAV file must have signed 16-bit samples) 5 | //! 6 | //! Read the "Setup" section in the README to know how to link the vosk dynamic 7 | //! libaries to the examples 8 | 9 | use std::env; 10 | 11 | use hound::WavReader; 12 | use vosk::{DecodingState, Model, Recognizer}; 13 | 14 | fn main() { 15 | let mut args = env::args(); 16 | args.next(); 17 | 18 | let model_path = args.next().expect("A model path was not provided"); 19 | let wav_path = args 20 | .next() 21 | .expect("A path for the WAV file to be read was not provided"); 22 | 23 | let mut reader = WavReader::open(wav_path).expect("Could not create the WAV reader"); 24 | let samples = reader 25 | .samples() 26 | .collect::>>() 27 | .expect("Could not read WAV file"); 28 | 29 | let model = Model::new(model_path).expect("Could not create the model"); 30 | 31 | let mut recognizer = Recognizer::new_with_grammar( 32 | &model, 33 | reader.spec().sample_rate as f32, 34 | // Provide a list of phrases to be recognized. 35 | // 36 | // If "[unk]" is added, it will be the fallback for any word that could not be recognized. 37 | // Otherwise, the best match will be used in the result, even if it is most likely 38 | // incorrect. 
39 |         //
40 |         // Note that the words in a phrase can still be recognized separately
41 |         &["one two three four five six seven eight nine zero", "[unk]"],
42 |     )
43 |     .expect("Could not create the recognizer");
44 | 
45 |     for sample in samples.chunks(4000) {
46 |         let state = recognizer.accept_waveform(sample).unwrap();
47 |         match state {
48 |             DecodingState::Finalized => {
49 |                 println!("{:#?}", recognizer.result().single().unwrap());
50 |             }
51 |             DecodingState::Running => {
52 |                 println!("{:#?}", recognizer.partial_result());
53 |             }
54 |             DecodingState::Failed => {
55 |                 eprintln!("an error occurred")
56 |             }
57 |         }
58 |     }
59 | 
60 |     println!("{:#?}", recognizer.final_result().single().unwrap());
61 | }
62 | 
--------------------------------------------------------------------------------
/crates/vosk/examples/microphone.rs:
--------------------------------------------------------------------------------
1 | //! Run with:
2 | //! cargo run --example microphone <model path> <duration to record in seconds>
3 | //! e.g. "cargo run --example microphone /home/user/stt/model 10"
4 | //!
5 | //! Read the "Setup" section in the README to know how to link the vosk dynamic
6 | //! libraries to the examples
7 | 
8 | use std::{
9 |     env,
10 |     sync::{Arc, Mutex},
11 |     time::Duration,
12 | };
13 | 
14 | use cpal::{
15 |     traits::{DeviceTrait, HostTrait, StreamTrait},
16 |     ChannelCount, SampleFormat,
17 | };
18 | use dasp::{sample::ToSample, Sample};
19 | use vosk::{DecodingState, Model, Recognizer};
20 | 
21 | fn main() {
22 |     let mut args = env::args();
23 |     args.next();
24 | 
25 |     let model_path = args.next().expect("A model path was not provided");
26 |     let record_duration = Duration::from_secs(
27 |         args.next()
28 |             .expect("A recording duration was not provided")
29 |             .parse()
30 |             .expect("Invalid recording duration"),
31 |     );
32 | 
33 |     let audio_input_device = cpal::default_host()
34 |         .default_input_device()
35 |         .expect("No input device connected");
36 | 
37 |     let config = audio_input_device
38 |         .default_input_config()
39 |         .expect("Failed to load default input config");
40 |     let channels = config.channels();
41 | 
42 |     let model = Model::new(model_path).expect("Could not create the model");
43 |     let mut recognizer = Recognizer::new(&model, config.sample_rate().0 as f32)
44 |         .expect("Could not create the Recognizer");
45 | 
46 |     recognizer.set_max_alternatives(10);
47 |     recognizer.set_words(true);
48 |     recognizer.set_partial_words(true);
49 | 
50 |     let recognizer = Arc::new(Mutex::new(recognizer));
51 | 
52 |     let err_fn = move |err| {
53 |         eprintln!("an error occurred on stream: {}", err);
54 |     };
55 | 
56 |     let recognizer_clone = recognizer.clone();
57 |     let stream = match config.sample_format() {
58 |         SampleFormat::I8 => audio_input_device.build_input_stream(
59 |             &config.into(),
60 |             move |data: &[i8], _| recognize(&mut recognizer_clone.lock().unwrap(), data, channels),
61 |             err_fn,
62 |             None,
63 |         ),
64 |         SampleFormat::I16 => audio_input_device.build_input_stream(
65 |             &config.into(),
66 |             move |data: &[i16], _| recognize(&mut recognizer_clone.lock().unwrap(), data, channels),
67 |             err_fn,
68 |             None,
69 |         ),
70 |         SampleFormat::I32 => audio_input_device.build_input_stream(
71 |             &config.into(),
72 |             move |data: &[i32], _| recognize(&mut recognizer_clone.lock().unwrap(), data, channels),
73 |             err_fn,
74 |             None,
75 |         ),
76 |         SampleFormat::F32 => audio_input_device.build_input_stream(
77 |             &config.into(),
78 |             move |data: &[f32], _| recognize(&mut recognizer_clone.lock().unwrap(), data, channels),
79 |             err_fn,
80 |             None,
81 |         ),
82 |         sample_format => 
panic!("Unsupported sample format '{sample_format}'"), 83 | } 84 | .expect("Could not build stream"); 85 | 86 | stream.play().expect("Could not play stream"); 87 | println!("Recording..."); 88 | 89 | std::thread::sleep(record_duration); 90 | drop(stream); 91 | 92 | println!("{:#?}", recognizer.lock().unwrap().final_result()); 93 | } 94 | 95 | fn recognize>( 96 | recognizer: &mut Recognizer, 97 | data: &[T], 98 | channels: ChannelCount, 99 | ) { 100 | let data: Vec = data.iter().map(|v| v.to_sample()).collect(); 101 | let data = if channels != 1 { 102 | stereo_to_mono(&data) 103 | } else { 104 | data 105 | }; 106 | 107 | let state = recognizer.accept_waveform(&data).unwrap(); 108 | match state { 109 | DecodingState::Running => { 110 | println!("partial: {:#?}", recognizer.partial_result()); 111 | } 112 | DecodingState::Finalized => { 113 | // Result will always be multiple because we called set_max_alternatives 114 | println!("result: {:#?}", recognizer.result().multiple().unwrap()); 115 | } 116 | DecodingState::Failed => eprintln!("error"), 117 | } 118 | } 119 | 120 | pub fn stereo_to_mono(input_data: &[i16]) -> Vec { 121 | let mut result = Vec::with_capacity(input_data.len() / 2); 122 | result.extend( 123 | input_data 124 | .chunks_exact(2) 125 | .map(|chunk| chunk[0] / 2 + chunk[1] / 2), 126 | ); 127 | 128 | result 129 | } 130 | -------------------------------------------------------------------------------- /crates/vosk/examples/read_wav.rs: -------------------------------------------------------------------------------- 1 | //! Run with: 2 | //! cargo run --example read_wav 3 | //! e.g. "cargo run --example read_wav /home/user/stt/model /home/user/stt/test.wav" 4 | //! (The WAV file must have signed 16-bit samples) 5 | //! 6 | //! Read the "Setup" section in the README to know how to link the vosk dynamic 7 | //! libaries to the examples 8 | 9 | use std::env; 10 | 11 | use hound::WavReader; 12 | use vosk::{Model, Recognizer}; 13 | 14 | fn main() { 15 | let mut args = env::args(); 16 | args.next(); 17 | 18 | let model_path = args.next().expect("A model path was not provided"); 19 | let wav_path = args 20 | .next() 21 | .expect("A path for the wav file to be read was not provided"); 22 | 23 | let mut reader = WavReader::open(wav_path).expect("Could not create the WAV reader"); 24 | let samples = reader 25 | .samples() 26 | .collect::>>() 27 | .expect("Could not read WAV file"); 28 | 29 | let model = Model::new(model_path).expect("Could not create the model"); 30 | let mut recognizer = Recognizer::new(&model, reader.spec().sample_rate as f32) 31 | .expect("Could not create the recognizer"); 32 | 33 | recognizer.set_max_alternatives(10); 34 | recognizer.set_words(true); 35 | recognizer.set_partial_words(true); 36 | 37 | for sample in samples.chunks(100) { 38 | recognizer.accept_waveform(sample).unwrap(); 39 | println!("{:#?}", recognizer.partial_result()); 40 | } 41 | 42 | println!("{:#?}", recognizer.final_result().multiple().unwrap()); 43 | } 44 | -------------------------------------------------------------------------------- /crates/vosk/examples/speaker_model.rs: -------------------------------------------------------------------------------- 1 | //! Run with: 2 | //! cargo run --example read_wav 3 | //! e.g. "cargo run --example speaker_model /home/user/stt/model /home/user/stt/speaker_model /home/user/stt/test.wav" 4 | //! (The WAV file must have signed 16-bit sample) 5 | //! 6 | //! Read the "Setup" section in the README to know how to link the vosk dynamic 7 | //! 
libraries to the examples
8 | 
9 | use std::env;
10 | 
11 | use hound::WavReader;
12 | use vosk::{Model, Recognizer, SpeakerModel};
13 | 
14 | fn main() {
15 |     let mut args = env::args();
16 |     args.next();
17 | 
18 |     let model_path = args.next().expect("A model path was not provided");
19 |     let speaker_model_path = args.next().expect("A speaker model path was not provided");
20 |     let wav_path = args
21 |         .next()
22 |         .expect("A path for the WAV file to be read was not provided");
23 | 
24 |     let mut reader = WavReader::open(wav_path).expect("Could not create the WAV reader");
25 |     let samples = reader
26 |         .samples()
27 |         .collect::<hound::Result<Vec<i16>>>()
28 |         .expect("Could not read WAV file");
29 | 
30 |     let model = Model::new(model_path).expect("Could not create the model");
31 |     let spk_model =
32 |         SpeakerModel::new(speaker_model_path).expect("Could not create the speaker model");
33 |     let mut recognizer =
34 |         Recognizer::new_with_speaker(&model, reader.spec().sample_rate as f32, &spk_model)
35 |             .expect("Could not create the recognizer");
36 | 
37 |     // Alternatives cannot be enabled as the Alternative objects do not contain the speaker info
38 |     // recognizer.set_max_alternatives(10);
39 | 
40 |     // Words will remain disabled so the speaker data is more visible, though they could be enabled
41 |     // recognizer.set_words(true);
42 |     // recognizer.set_partial_words(true);
43 | 
44 |     for sample in samples.chunks(100) {
45 |         recognizer.accept_waveform(sample).unwrap();
46 |         println!("{:#?}", recognizer.partial_result());
47 |     }
48 | 
49 |     println!("{:#?}", recognizer.final_result().single().unwrap());
50 | }
51 | 
--------------------------------------------------------------------------------
/crates/vosk/src/gpu.rs:
--------------------------------------------------------------------------------
1 | /// Init, automatically select a CUDA device and allow multithreading.
2 | /// Must be called once from the main thread.
3 | pub fn gpu_init() {
4 |     unsafe { vosk_sys::vosk_gpu_init() }
5 | }
6 | 
7 | /// Init CUDA device in a multi-threaded environment.
8 | /// Must be called for each thread.
9 | pub fn gpu_thread_init() {
10 |     unsafe { vosk_sys::vosk_gpu_thread_init() }
11 | }
12 | 
--------------------------------------------------------------------------------
/crates/vosk/src/lib.rs:
--------------------------------------------------------------------------------
1 | #![deny(missing_docs)]
2 | #![cfg_attr(docsrs, feature(doc_auto_cfg))]
3 | 
4 | //! Safe FFI bindings around the [Vosk API Speech Recognition Toolkit](https://github.com/alphacep/vosk-api).
5 | //!
6 | //! **Basic usage:**
7 | //! * Create a [`Model`]
8 | //! * Create a [`Recognizer`] with that model
9 | //! * Feed audio to the recognizer with [`Recognizer::accept_waveform`]
10 | //! * Get the processed result with [`Recognizer::result`],
11 | //!   [`Recognizer::partial_result`] or [`Recognizer::final_result`]
12 | 
13 | #[cfg(feature = "batch")]
14 | mod gpu;
15 | mod log;
16 | mod models;
17 | mod recognition;
18 | 
19 | pub use crate::{log::*, models::*, recognition::*};
20 | #[cfg(feature = "batch")]
21 | pub use gpu::*;
22 | 
--------------------------------------------------------------------------------
/crates/vosk/src/log.rs:
--------------------------------------------------------------------------------
1 | use std::os::raw::c_int;
2 | use vosk_sys::*;
3 | 
4 | #[derive(Debug, Default, Clone, Copy)]
5 | /// Log level for Kaldi messages. 
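///
/// A minimal usage sketch, picking a quieter level at startup:
///
/// ```no_run
/// vosk::set_log_level(vosk::LogLevel::Error);
/// ```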
6 | pub enum LogLevel {
7 |     /// Print Error, Warn, and Info messages (default).
8 |     #[default]
9 |     Info,
10 | 
11 |     /// Print Error and Warn messages.
12 |     Warn,
13 | 
14 |     /// Only print Error messages.
15 |     Error,
16 | }
17 | 
18 | impl LogLevel {
19 |     pub(self) fn to_c_int(self) -> c_int {
20 |         match self {
21 |             Self::Info => 0,
22 |             Self::Warn => -1,
23 |             Self::Error => -2,
24 |         }
25 |     }
26 | }
27 | 
28 | /// Set log level for Kaldi messages.
29 | ///
30 | /// Default: [`LogLevel::Info`].
31 | pub fn set_log_level(log_level: LogLevel) {
32 |     unsafe { vosk_set_log_level(log_level.to_c_int()) }
33 | }
34 | 
--------------------------------------------------------------------------------
/crates/vosk/src/models/batch.rs:
--------------------------------------------------------------------------------
1 | use std::{ffi::CString, ptr::NonNull};
2 | use vosk_sys::*;
3 | 
4 | /// The same as [`Model`], but uses
5 | /// a CUDA enabled Nvidia GPU and dynamic batching to enable higher throughput.
6 | ///
7 | /// [`Model`]: super::sequential::Model
8 | pub struct BatchModel(pub(crate) NonNull<VoskBatchModel>);
9 | 
10 | impl BatchModel {
11 |     /// Loads model data from the file and returns the model object, or [`None`]
12 |     /// if a problem occured.
13 |     ///
14 |     /// * `model_path` - the path to the model directory.
15 |     #[must_use]
16 |     pub fn new(model_path: impl Into<String>) -> Option<Self> {
17 |         let model_path_c = CString::new(model_path.into()).ok()?;
18 |         let model_ptr = unsafe { vosk_batch_model_new(model_path_c.as_ptr()) };
19 | 
20 |         Some(Self(NonNull::new(model_ptr)?))
21 |     }
22 | 
23 |     /// Waits for inferencing to finish
24 |     pub fn wait(&self) {
25 |         unsafe { vosk_batch_model_wait(self.0.as_ptr()) };
26 |     }
27 | }
28 | 
29 | impl Drop for BatchModel {
30 |     fn drop(&mut self) {
31 |         unsafe { vosk_batch_model_free(self.0.as_ptr()) }
32 |     }
33 | }
34 | 
35 | unsafe impl Send for BatchModel {}
36 | unsafe impl Sync for BatchModel {}
37 | 
--------------------------------------------------------------------------------
/crates/vosk/src/models/mod.rs:
--------------------------------------------------------------------------------
1 | #[cfg(feature = "batch")]
2 | mod batch;
3 | mod sequential;
4 | 
5 | #[cfg(feature = "batch")]
6 | pub use batch::BatchModel;
7 | pub use sequential::{Model, SpeakerModel};
8 | 
--------------------------------------------------------------------------------
/crates/vosk/src/models/sequential.rs:
--------------------------------------------------------------------------------
1 | use std::{ffi::CString, ptr::NonNull};
2 | use vosk_sys::*;
3 | 
4 | // SAFETY:
5 | // All models can be safely shared across threads
6 | // They hold static data and they won't be mutated by Vosk, so it is safe
7 | // to pass their pointer to multiple Recognizers even from immutable references
8 | // https://github.com/alphacep/vosk-api/blob/a7bc5a22d411e22bebf4df1cc5554b473c7456d4/src/vosk_api.h
9 | 
10 | /// Model that stores all the data required for recognition.
11 | pub struct Model(pub(crate) NonNull<VoskModel>);
12 | 
13 | impl Model {
14 |     /// Loads model data from the file and returns the model object, or [`None`]
15 |     /// if a problem occured.
16 |     ///
17 |     /// * `model_path` - the path to the model directory.
18 |     #[must_use]
19 |     pub fn new(model_path: impl Into<String>) -> Option<Self> {
20 |         let model_path_c = CString::new(model_path.into()).ok()?;
21 |         let model_ptr = unsafe { vosk_model_new(model_path_c.as_ptr()) };
22 | 
23 |         Some(Self(NonNull::new(model_ptr)?))
24 |     }
25 | 
26 |     /// Check if a word can be recognized by the model. 
27 |     /// If it is, this returns Some with the index of the word in the model.
28 |     /// If it is not, this returns None.
29 |     ///
30 |     /// Word symbol `0` is for `<eps>`.
31 |     #[must_use]
32 |     pub fn find_word(&mut self, word: &str) -> Option<u32> {
33 |         let word_c = CString::new(word).ok()?;
34 | 
35 |         let symbol = unsafe { vosk_model_find_word(self.0.as_ptr(), word_c.as_ptr()) };
36 | 
37 |         if symbol == -1 {
38 |             None
39 |         } else {
40 |             // UNWRAP: the only possible negative symbol was -1
41 |             Some(u32::try_from(symbol).unwrap())
42 |         }
43 |     }
44 | }
45 | 
46 | impl Drop for Model {
47 |     fn drop(&mut self) {
48 |         unsafe { vosk_model_free(self.0.as_ptr()) }
49 |     }
50 | }
51 | 
52 | unsafe impl Send for Model {}
53 | unsafe impl Sync for Model {}
54 | 
55 | /// The same as [`Model`] but contains the data for speaker identification.
56 | pub struct SpeakerModel(pub(crate) NonNull<VoskSpkModel>);
57 | 
58 | impl SpeakerModel {
59 |     /// Loads speaker model data from the file and returns the model
60 |     /// object, or [`None`] if a problem occured.
61 |     ///
62 |     /// * `model_path` - the path to the model in the filesystem.
63 |     #[must_use]
64 |     pub fn new(model_path: impl Into<String>) -> Option<Self> {
65 |         let model_path_c = CString::new(model_path.into()).ok()?;
66 |         let model_ptr = unsafe { vosk_spk_model_new(model_path_c.as_ptr()) };
67 | 
68 |         Some(Self(NonNull::new(model_ptr)?))
69 |     }
70 | }
71 | 
72 | impl Drop for SpeakerModel {
73 |     fn drop(&mut self) {
74 |         unsafe { vosk_spk_model_free(self.0.as_ptr()) }
75 |     }
76 | }
77 | 
78 | unsafe impl Send for SpeakerModel {}
79 | unsafe impl Sync for SpeakerModel {}
80 | 
--------------------------------------------------------------------------------
/crates/vosk/src/recognition/batch.rs:
--------------------------------------------------------------------------------
1 | use super::{
2 |     errors::AcceptWaveformError,
3 |     results::{result_from_json_c_str, Word},
4 | };
5 | use crate::models::BatchModel;
6 | use vosk_sys::*;
7 | 
8 | use std::ptr::NonNull;
9 | 
10 | /// The main object which processes data using GPU inferencing.
11 | /// Takes audio as input and returns decoded information as words, confidences, times, and other metadata.
12 | pub struct BatchRecognizer(std::ptr::NonNull<VoskBatchRecognizer>);
13 | 
14 | impl BatchRecognizer {
15 |     /// Creates the recognizer object. Returns [`None`] if a problem occured.
16 |     ///
17 |     /// The recognizers process the speech and return text using shared model data.
18 |     ///
19 |     /// * `model` - [`BatchModel`] containing static data for recognizer. Model can be shared
20 |     /// across recognizers, even running in different threads.
21 |     ///
22 |     /// * `sample_rate` - The sample rate of the audio you are going to feed into the recognizer.
23 |     /// Make sure this rate matches the audio content, it is a common issue causing accuracy problems.
24 |     ///
25 |     /// [`BatchModel`]: crate::BatchModel
26 |     #[must_use]
27 |     pub fn new(model: &BatchModel, sample_rate: f32) -> Option<Self> {
28 |         let recognizer_ptr = unsafe { vosk_batch_recognizer_new(model.0.as_ptr(), sample_rate) };
29 |         Some(Self(NonNull::new(recognizer_ptr)?))
30 |     }
31 | 
32 |     /// Enables or disables Natural Language Semantics Markup Language (NLSML) in the output.
33 |     pub fn set_nlsml(&mut self, enable: bool) {
34 |         unsafe { vosk_batch_recognizer_set_nlsml(self.0.as_ptr(), i32::from(enable)) }
35 |     }
36 | 
37 |     /// Accept and process new chunk of voice data.
38 |     ///
39 |     /// * `data` - Audio data in PCM 16-bit mono format as an array of i8. 
40 | pub fn accept_waveform(&mut self, data: &[i8]) -> Result<(), AcceptWaveformError> { 41 | let len = data.len(); 42 | 43 | unsafe { 44 | vosk_batch_recognizer_accept_waveform( 45 | self.0.as_ptr(), 46 | data.as_ptr(), 47 | i32::try_from(len).map_err(|_| AcceptWaveformError::BufferTooLong(len))?, 48 | ) 49 | }; 50 | 51 | Ok(()) 52 | } 53 | 54 | /// Closes the stream to the model. 55 | pub fn finish_stream(&mut self) { 56 | unsafe { vosk_batch_recognizer_finish_stream(self.0.as_ptr()) }; 57 | } 58 | 59 | /// Gets the front of the result queue. 60 | pub fn front_result(&mut self) -> Word { 61 | unsafe { result_from_json_c_str(vosk_batch_recognizer_front_result(self.0.as_ptr())) } 62 | } 63 | 64 | /// Removes the front of the result queue. 65 | pub fn pop(&mut self) { 66 | unsafe { vosk_batch_recognizer_pop(self.0.as_ptr()) } 67 | } 68 | 69 | /// Gets the number of chunks that have yet to be processed. 70 | pub fn get_pending_chunks(&mut self) -> u32 { 71 | // UNWRAP: A "count" of chunks will never be negative 72 | u32::try_from(unsafe { vosk_batch_recognizer_get_pending_chunks(self.0.as_ptr()) }).unwrap() 73 | } 74 | } 75 | 76 | // SAFETY: Recognizer shares no state, so it is Send 77 | unsafe impl Send for BatchRecognizer {} 78 | // SAFETY: All methods that mutate Recognizer require a &mut to it, 79 | // which ensures exclusive access, so it is Sync 80 | unsafe impl Sync for BatchRecognizer {} 81 | 82 | impl Drop for BatchRecognizer { 83 | fn drop(&mut self) { 84 | unsafe { vosk_batch_recognizer_free(self.0.as_ptr()) } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /crates/vosk/src/recognition/errors.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error; 2 | 3 | /// Possible errors that accept_waveform methods might return. 4 | #[derive(Error, Debug)] 5 | pub enum AcceptWaveformError { 6 | /// Error returned if the user passes in a buffer of a length 7 | /// that exceeds the maximum supported buffer length. 8 | #[error( 9 | "the length of the provided audio buffer was {0} (expected < {})", 10 | i32::MAX 11 | )] 12 | BufferTooLong(usize), 13 | } 14 | -------------------------------------------------------------------------------- /crates/vosk/src/recognition/mod.rs: -------------------------------------------------------------------------------- 1 | use std::os::raw::c_int; 2 | 3 | #[cfg(feature = "batch")] 4 | mod batch; 5 | mod errors; 6 | mod results; 7 | mod sequential; 8 | 9 | #[cfg(feature = "batch")] 10 | pub use batch::BatchRecognizer; 11 | pub use errors::*; 12 | pub use results::*; 13 | pub use sequential::Recognizer; 14 | 15 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] 16 | /// State of the decodification after processing a chunk of data. 17 | pub enum DecodingState { 18 | /// Silence has occured and you can retrieve a new utterance with the [`Recognizer::result`]. 19 | Finalized, 20 | /// Decoding still continues. 21 | Running, 22 | /// Decoding failed in some way. 23 | Failed, 24 | } 25 | 26 | impl DecodingState { 27 | /// Returns the variant that corresponds to `value` in C. 28 | pub(self) fn from_c_int(value: c_int) -> Self { 29 | match value { 30 | 1 => Self::Finalized, 31 | 0 => Self::Running, 32 | _ => Self::Failed, 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /crates/vosk/src/recognition/results.rs: -------------------------------------------------------------------------------- 1 | //! 
Results of a recognition
2 | 
3 | use std::ffi::{c_char, CStr};
4 | 
5 | use serde::{Deserialize, Serialize};
6 | 
7 | /// A single word in a [`CompleteResultSingle`] and metadata about it.
8 | ///
9 | /// Unlike in [`WordInAlternative`], the confidence ([`conf`]) is part of each word,
10 | /// rather than part of an [`Alternative`].
11 | ///
12 | /// [`conf`]: Self::conf
13 | #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
14 | pub struct Word<'a> {
15 |     /// Confidence that this word is correct.
16 |     pub conf: f32,
17 | 
18 |     /// Time in seconds when the word starts.
19 |     pub start: f32,
20 | 
21 |     /// Time in seconds when the word ends.
22 |     pub end: f32,
23 | 
24 |     /// The transcribed word.
25 |     pub word: &'a str,
26 | }
27 | 
28 | /// A single word in an [`Alternative`] and metadata about it.
29 | ///
30 | /// Unlike [`Word`], it does not contain the confidence,
31 | /// as it is part of the [`Alternative`] itself.
32 | #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
33 | pub struct WordInAlternative<'a> {
34 |     /// Time in seconds when the word starts.
35 |     pub start: f32,
36 | 
37 |     /// Time in seconds when the word ends.
38 |     pub end: f32,
39 | 
40 |     /// The transcribed word.
41 |     pub word: &'a str,
42 | }
43 | 
44 | /// An alternative transcript in a [`CompleteResultMultiple`].
45 | #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
46 | pub struct Alternative<'a> {
47 |     /// Confidence of the recognizer that this is the correct alternative transcript.
48 |     pub confidence: f32,
49 | 
50 |     /// Collection of words present in [`text`] with metadata about them.
51 |     ///
52 |     /// Empty unless [`Recognizer::set_words`] is passed `true`.
53 |     ///
54 |     /// [`text`]: Self::text
55 |     /// [`Recognizer::set_words`]: crate::Recognizer::set_words
56 |     #[serde(default)]
57 |     pub result: Vec<WordInAlternative<'a>>,
58 | 
59 |     /// Full transcript text.
60 |     pub text: &'a str,
61 | }
62 | 
63 | /// Recognition result if [`Recognizer::set_max_alternatives`]
64 | /// is passed a non-zero value.
65 | ///
66 | /// Inner type of [`CompleteResult::Multiple`].
67 | ///
68 | /// [`Recognizer::set_max_alternatives`]: crate::Recognizer::set_max_alternatives
69 | #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
70 | pub struct CompleteResultMultiple<'a> {
71 |     /// All the possible results of the transcription, ordered from most to least likely.
72 |     #[serde(borrow)]
73 |     pub alternatives: Vec<Alternative<'a>>,
74 | }
75 | 
76 | #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
77 | /// Data useful for speaker identification.
78 | pub struct SpeakerInfo {
79 |     /// Speaker vector used for speaker identification.
80 |     #[serde(rename = "spk")]
81 |     pub vector: Vec<f32>,
82 | 
83 |     /// Data frames in which the speaker was not in silence.
84 |     #[serde(rename = "spk_frames")]
85 |     pub frames: u16,
86 | }
87 | 
88 | /// Recognition result if [`Recognizer::set_max_alternatives`]
89 | /// is passed a zero (default).
90 | ///
91 | /// Inner type of [`CompleteResult::Single`]. 
92 | ///
93 | /// [`Recognizer::set_max_alternatives`]: crate::Recognizer::set_max_alternatives
94 | #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
95 | pub struct CompleteResultSingle<'a> {
96 |     /// Information about the speaker, used for speaker identification
97 |     ///
98 |     /// Enabled if the [`Recognizer`] was passed a [`SpeakerModel`] with
99 |     /// [`Recognizer::new_with_speaker`] or [`Recognizer::set_speaker_model`],
100 |     /// [`None`] otherwise
101 |     ///
102 |     /// [`SpeakerModel`]: crate::SpeakerModel
103 |     /// [`Recognizer`]: crate::Recognizer
104 |     /// [`Recognizer::new_with_speaker`]: crate::Recognizer::new_with_speaker
105 |     /// [`Recognizer::set_speaker_model`]: crate::Recognizer::set_speaker_model
106 |     #[serde(flatten)]
107 |     pub speaker_info: Option<SpeakerInfo>,
108 | 
109 |     /// Collection of words present in [`text`] with metadata about them.
110 |     ///
111 |     /// Empty unless [`Recognizer::set_words`] is passed `true`.
112 |     ///
113 |     /// [`text`]: Self::text
114 |     /// [`Recognizer::set_words`]: crate::Recognizer::set_words
115 |     #[serde(default)]
116 |     pub result: Vec<Word<'a>>,
117 | 
118 |     /// Full text of the transcript.
119 |     pub text: &'a str,
120 | }
121 | 
122 | /// Different results that can be returned from
123 | /// [`Recognizer::result`] and [`Recognizer::final_result`].
124 | ///
125 | /// [`Recognizer::result`]: crate::Recognizer::result
126 | /// [`Recognizer::final_result`]: crate::Recognizer::final_result
127 | #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
128 | #[serde(untagged)]
129 | pub enum CompleteResult<'a> {
130 |     /// Result if [`Recognizer::set_max_alternatives`] is passed zero (default).
131 |     ///
132 |     /// [`Recognizer::set_max_alternatives`]: crate::Recognizer::set_max_alternatives
133 |     #[serde(borrow)]
134 |     Single(CompleteResultSingle<'a>),
135 | 
136 |     /// Result if [`Recognizer::set_max_alternatives`] is passed a non-zero value.
137 |     ///
138 |     /// [`Recognizer::set_max_alternatives`]: crate::Recognizer::set_max_alternatives
139 |     Multiple(CompleteResultMultiple<'a>),
140 | }
141 | 
142 | impl<'a> CompleteResult<'a> {
143 |     /// Returns the inner [`CompleteResultSingle`] if `self` was [`Single`], and [`None`] otherwise.
144 |     ///
145 |     /// [`Single`]: Self::Single
146 |     #[must_use]
147 |     pub fn single(self) -> Option<CompleteResultSingle<'a>> {
148 |         match self {
149 |             Self::Single(x) => Some(x),
150 |             Self::Multiple(_) => None,
151 |         }
152 |     }
153 | 
154 |     /// Returns the inner [`CompleteResultMultiple`] if `self` was [`Multiple`], and [`None`] otherwise.
155 |     ///
156 |     /// [`Multiple`]: Self::Multiple
157 |     #[must_use]
158 |     pub fn multiple(self) -> Option<CompleteResultMultiple<'a>> {
159 |         match self {
160 |             Self::Single(_) => None,
161 |             Self::Multiple(x) => Some(x),
162 |         }
163 |     }
164 | }
165 | 
166 | /// Result returned by [`Recognizer::partial_result`].
167 | /// The result may change after processing more data as decoding is not yet complete.
168 | ///
169 | /// [`Recognizer::partial_result`]: crate::Recognizer::partial_result
170 | #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
171 | pub struct PartialResult<'a> {
172 |     // The "partial" JSON key will not be present if partial_result is called when the recognizer isn't running (DecodingState::Running).
173 |     // It makes sense to return an empty string in that case
174 |     /// Full text of the partial transcript.
175 |     #[serde(default)]
176 |     pub partial: &'a str,
177 | 
178 |     /// Collection of words present in [`partial`] with metadata about them. 
179 |     ///
180 |     /// Empty unless [`Recognizer::set_partial_words`] is passed `true`.
181 |     ///
182 |     /// [`partial`]: Self::partial
183 |     /// [`Recognizer::set_partial_words`]: crate::Recognizer::set_partial_words
184 |     #[serde(default)]
185 |     pub partial_result: Vec<Word<'a>>,
186 | }
187 | 
188 | /// Generic function to retrieve a given type of result from the recognizer.
189 | pub(super) unsafe fn result_from_json_c_str<'de, T: Deserialize<'de>>(ptr: *const c_char) -> T {
190 |     // UNWRAP: Panics in here will never be the caller's fault, but rather some
191 |     // edge case that was not thought of and should be reported, so it does not
192 |     // make sense to return a Result.
193 |     let string = CStr::from_ptr(ptr).to_str().unwrap();
194 |     serde_json::from_str(string).unwrap_or_else(|_| panic!("Invalid JSON: {string:?}"))
195 | }
196 | 
--------------------------------------------------------------------------------
/crates/vosk/src/recognition/sequential.rs:
--------------------------------------------------------------------------------
1 | use super::{
2 |     result_from_json_c_str, AcceptWaveformError, CompleteResult, DecodingState, PartialResult,
3 | };
4 | use crate::models::{Model, SpeakerModel};
5 | 
6 | use std::{ffi::CString, os::raw::c_int, ptr::NonNull};
7 | use vosk_sys::*;
8 | 
9 | /// The main object which processes data.
10 | /// Takes audio as input and returns decoded information as words, confidences, times, and other metadata.
11 | pub struct Recognizer(NonNull<VoskRecognizer>);
12 | 
13 | impl Recognizer {
14 |     /// Creates the recognizer object. Returns [`None`] if a problem occurred.
15 |     ///
16 |     /// Recognizers process speech and return text using shared model data.
17 |     ///
18 |     /// * `model` - [`Model`] containing static data for the recognizer. A model can be shared
19 |     ///   across recognizers, even ones running in different threads.
20 |     ///
21 |     /// * `sample_rate` - The sample rate of the audio you are going to feed into the recognizer.
22 |     ///   Make sure this rate matches the audio content; a mismatch is a common cause of accuracy problems.
23 |     ///
24 |     /// [`Model`]: crate::Model
25 |     #[must_use]
26 |     pub fn new(model: &Model, sample_rate: f32) -> Option<Self> {
27 |         let recognizer_ptr = unsafe { vosk_recognizer_new(model.0.as_ptr(), sample_rate) };
28 |         Some(Self(NonNull::new(recognizer_ptr)?))
29 |     }
30 | 
31 |     /// Creates the recognizer object with speaker recognition. Returns [`None`] if a problem occurred.
32 |     ///
33 |     /// In speaker recognition mode the recognizer not only recognizes
34 |     /// text but also returns speaker vectors one can use for speaker identification.
35 |     ///
36 |     /// * `model` - [`Model`] containing the data for the recognizer. A model can be
37 |     ///   shared across recognizers, even ones running in different threads.
38 |     ///
39 |     /// * `sample_rate` - The sample rate of the audio you are going to feed into the recognizer.
40 |     ///   Make sure this rate matches the audio content; a mismatch is a common
41 |     ///   cause of accuracy problems.
42 |     ///
43 |     /// * `speaker_model` - Speaker model for speaker identification.
44 |     ///
45 |     /// [`Model`]: crate::Model
46 |     #[must_use]
47 |     pub fn new_with_speaker(
48 |         model: &Model,
49 |         sample_rate: f32,
50 |         speaker_model: &SpeakerModel,
51 |     ) -> Option<Self> {
52 |         let recognizer_ptr = unsafe {
53 |             vosk_recognizer_new_spk(model.0.as_ptr(), sample_rate, speaker_model.0.as_ptr())
54 |         };
55 | 
56 |         Some(Self(NonNull::new(recognizer_ptr)?))
57 |     }
58 | 
59 |     /// Creates a recognizer that only recognizes certain words.
60 |     /// Returns [`None`] if a problem occurred.
61 |     ///
62 |     /// When you don't need to recognize a large vocabulary, you can improve
63 |     /// accuracy by specifying a list of phrases to recognize. This
64 |     /// will improve recognizer speed and accuracy, but it might return \[unk\] if the user said
65 |     /// something outside the list.
66 |     ///
67 |     /// Only recognizers with lookahead models support this type of quick configuration.
68 |     /// Precompiled HCLG graph models are not supported.
69 |     ///
70 |     /// * `model` - [`Model`] containing the data for the recognizer. A model can be shared
71 |     ///   across recognizers, even ones running in different threads.
72 |     ///
73 |     /// * `sample_rate` - The sample rate of the audio you are going to feed into the recognizer.
74 |     ///   Make sure this rate matches the audio content; a mismatch is a common cause of accuracy problems.
75 |     ///
76 |     /// * `grammar` - The list of phrases to recognize.
77 |     ///
78 |     /// # Examples
79 |     ///
80 |     /// ```no_run
81 |     /// # use vosk::{Model, Recognizer};
82 |     /// #
83 |     /// let model = Model::new("/path/to/model").expect("Could not create a model");
84 |     /// let recognizer = Recognizer::new_with_grammar(
85 |     ///     &model,
86 |     ///     16000.0,
87 |     ///     &["one two three four five", "[unk]"],
88 |     /// )
89 |     /// .expect("Could not create a recognizer");
90 |     /// ```
91 |     ///
92 |     /// [`Model`]: crate::Model
93 |     #[must_use]
94 |     pub fn new_with_grammar(
95 |         model: &Model,
96 |         sample_rate: f32,
97 |         grammar: &[impl AsRef<str>],
98 |     ) -> Option<Self> {
99 |         let grammar_c = CString::new(format!(
100 |             "[{}]",
101 |             grammar
102 |                 .iter()
103 |                 .map(|phrase| format!("\"{}\"", phrase.as_ref()))
104 |                 .collect::<Vec<_>>()
105 |                 .join(", ")
106 |         ))
107 |         .ok()?;
108 |         let recognizer_ptr =
109 |             unsafe { vosk_recognizer_new_grm(model.0.as_ptr(), sample_rate, grammar_c.as_ptr()) };
110 | 
111 |         Some(Self(NonNull::new(recognizer_ptr)?))
112 |     }
113 | 
114 |     /// Adds a speaker model to an already initialized recognizer.
115 |     ///
116 |     /// Helps to initialize speaker recognition for a grammar-based recognizer,
117 |     /// which has no constructor that takes a speaker model.
118 |     pub fn set_speaker_model(&mut self, speaker_model: &SpeakerModel) {
119 |         unsafe { vosk_recognizer_set_spk_model(self.0.as_ptr(), speaker_model.0.as_ptr()) }
120 |     }
121 | 
122 |     /// Configures the recognizer to output n-best results in [`result`] and [`final_result`].
123 |     ///
124 |     /// The returned value from those methods will be a [`CompleteResult::Single`]
125 |     /// if `max_alternatives` is 0, and a [`CompleteResult::Multiple`] otherwise.
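    ///
    /// Below is a minimal sketch of how the returned shape changes once a non-zero
    /// value is set; the model path is a placeholder:
    ///
    /// ```no_run
    /// # use vosk::{CompleteResult, Model, Recognizer};
    /// let model = Model::new("/path/to/model").expect("Could not create a model");
    /// let mut recognizer = Recognizer::new(&model, 16000.0).expect("Could not create a recognizer");
    /// recognizer.set_max_alternatives(3);
    /// // ... feed audio with `accept_waveform` ...
    /// match recognizer.final_result() {
    ///     CompleteResult::Multiple(result) => {
    ///         for alternative in result.alternatives {
    ///             println!("{}: {}", alternative.confidence, alternative.text);
    ///         }
    ///     }
    ///     CompleteResult::Single(_) => unreachable!("max_alternatives is non-zero"),
    /// }
    /// ```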
126 |     ///
127 |     /// * `max_alternatives` - Maximum alternatives to return (may be fewer) (default: 0)
128 |     ///
129 |     /// [`result`]: Self::result
130 |     /// [`final_result`]: Self::final_result
131 |     /// [`CompleteResult::Single`]: crate::CompleteResult::Single
132 |     /// [`CompleteResult::Multiple`]: crate::CompleteResult::Multiple
133 |     pub fn set_max_alternatives(&mut self, max_alternatives: u16) {
134 |         unsafe { vosk_recognizer_set_max_alternatives(self.0.as_ptr(), max_alternatives as c_int) }
135 |     }
136 | 
137 |     /// Enables or disables words with metadata in the output, represented as:
138 |     ///
139 |     /// * [`WordInAlternative`] in a [`CompleteResult::Multiple`]
140 |     ///
141 |     /// * [`Word`] in a [`CompleteResult::Single`]
142 |     ///
143 |     /// [`WordInAlternative`]: crate::WordInAlternative
144 |     /// [`Word`]: crate::Word
145 |     /// [`CompleteResult::Multiple`]: crate::CompleteResult::Multiple
146 |     /// [`CompleteResult::Single`]: crate::CompleteResult::Single
147 |     pub fn set_words(&mut self, enable: bool) {
148 |         unsafe { vosk_recognizer_set_words(self.0.as_ptr(), i32::from(enable)) }
149 |     }
150 | 
151 |     /// Like [`set_words`], but for [`PartialResult`].
152 |     ///
153 |     /// Words will always be represented as [`Word`].
154 |     ///
155 |     /// [`set_words`]: Self::set_words
156 |     /// [`PartialResult`]: crate::PartialResult
157 |     /// [`Word`]: crate::Word
158 |     pub fn set_partial_words(&mut self, enable: bool) {
159 |         unsafe { vosk_recognizer_set_partial_words(self.0.as_ptr(), i32::from(enable)) }
160 |     }
161 | 
162 |     /// Enables or disables Natural Language Semantics Markup Language (NLSML) in the output.
163 |     pub fn set_nlsml(&mut self, enable: bool) {
164 |         unsafe { vosk_recognizer_set_nlsml(self.0.as_ptr(), i32::from(enable)) }
165 |     }
166 | 
167 |     /// Accepts and processes a new chunk of voice data.
168 |     ///
169 |     /// * `data` - Audio data in PCM 16-bit mono format.
170 |     ///
171 |     /// Returns a [`DecodingState`], which represents the state of the decoding
172 |     /// after this chunk of data has been processed, or an [`AcceptWaveformError`] if the buffer is longer than `i32::MAX` elements.
173 |     pub fn accept_waveform(&mut self, data: &[i16]) -> Result<DecodingState, AcceptWaveformError> {
174 |         // vosk_recognizer_accept_waveform and vosk_recognizer_accept_waveform_f are meant
175 |         // to be used by languages that do not have an i16 type (those functions also take PCM 16-bit audio,
176 |         // but represented as an f32 or i8). Those extra functions aren't needed in Rust, so they
177 |         // will not be wrapped.
178 | 
179 |         let len = data.len();
180 | 
181 |         let decoding_state = unsafe {
182 |             vosk_recognizer_accept_waveform_s(
183 |                 self.0.as_ptr(),
184 |                 data.as_ptr(),
185 |                 i32::try_from(len).map_err(|_| AcceptWaveformError::BufferTooLong(len))?,
186 |             )
187 |         };
188 | 
189 |         Ok(DecodingState::from_c_int(decoding_state))
190 |     }
191 | 
192 |     /// Returns the speech recognition result, waiting for silence (see [`DecodingState::Finalized`]) before giving one.
193 |     ///
194 |     /// The returned value will be a [`CompleteResult::Single`]
195 |     /// if [`set_max_alternatives`] was passed 0 (the default), and a
196 |     /// [`CompleteResult::Multiple`] otherwise.
197 |     ///
198 |     /// If words are enabled (see [`set_words`]), it also returns metadata about the words.
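    ///
    /// A minimal sketch of the feed/decode loop (the samples here are placeholder
    /// silence; real code would read them from a file or microphone):
    ///
    /// ```no_run
    /// # use vosk::{DecodingState, Model, Recognizer};
    /// # let model = Model::new("/path/to/model").unwrap();
    /// # let mut recognizer = Recognizer::new(&model, 16000.0).unwrap();
    /// let samples = vec![0i16; 3200];
    /// if let Ok(DecodingState::Finalized) = recognizer.accept_waveform(&samples) {
    ///     // Silence was detected, so a finalized utterance is available.
    ///     println!("{:#?}", recognizer.result());
    /// }
    /// ```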
199 |     ///
200 |     /// [`set_max_alternatives`]: Self::set_max_alternatives
201 |     /// [`set_words`]: Self::set_words
202 |     /// [`CompleteResult::Multiple`]: crate::CompleteResult::Multiple
203 |     /// [`CompleteResult::Single`]: crate::CompleteResult::Single
204 |     #[must_use]
205 |     pub fn result(&mut self) -> CompleteResult {
206 |         unsafe { result_from_json_c_str(vosk_recognizer_result(self.0.as_ptr())) }
207 |     }
208 | 
209 |     /// Returns the partial speech recognition result, which is not yet finalized and may change after
210 |     /// processing more data.
211 |     ///
212 |     /// If words are enabled (see [`set_partial_words`]), it also returns metadata about the words.
213 |     ///
214 |     /// [`set_partial_words`]: Self::set_partial_words
215 |     #[must_use]
216 |     pub fn partial_result(&mut self) -> PartialResult {
217 |         unsafe { result_from_json_c_str(vosk_recognizer_partial_result(self.0.as_ptr())) }
218 |     }
219 | 
220 |     /// Returns the speech recognition result. Like [`result`], but it does not
221 |     /// wait for silence and flushes the data so everything is processed.
222 |     ///
223 |     /// [`result`]: Self::result
224 |     #[must_use]
225 |     pub fn final_result(&mut self) -> CompleteResult {
226 |         unsafe { result_from_json_c_str(vosk_recognizer_final_result(self.0.as_ptr())) }
227 |     }
228 | 
229 |     /// Resets the current results and data so recognition can continue from scratch.
230 |     pub fn reset(&mut self) {
231 |         unsafe {
232 |             vosk_recognizer_reset(self.0.as_ptr());
233 |         }
234 |     }
235 | }
236 | 
237 | // SAFETY: Recognizer shares no state, so it is Send
238 | unsafe impl Send for Recognizer {}
239 | // SAFETY: All methods that mutate Recognizer require a &mut to it,
240 | // which ensures exclusive access, so it is Sync
241 | unsafe impl Sync for Recognizer {}
242 | 
243 | impl Drop for Recognizer {
244 |     fn drop(&mut self) {
245 |         unsafe { vosk_recognizer_free(self.0.as_ptr()) }
246 |     }
247 | }
248 | 
--------------------------------------------------------------------------------
/flake.lock:
--------------------------------------------------------------------------------
1 | {
2 |   "nodes": {
3 |     "flake-compat": {
4 |       "flake": false,
5 |       "locked": {
6 |         "lastModified": 1650374568,
7 |         "narHash": "sha256-Z+s0J8/r907g149rllvwhb4pKi8Wam5ij0st8PwAh+E=",
8 |         "owner": "edolstra",
9 |         "repo": "flake-compat",
10 |         "rev": "b4a34015c698c7793d592d66adbab377907a2be8",
11 |         "type": "github"
12 |       },
13 |       "original": {
14 |         "owner": "edolstra",
15 |         "repo": "flake-compat",
16 |         "type": "github"
17 |       }
18 |     },
19 |     "flake-utils": {
20 |       "inputs": {
21 |         "systems": "systems"
22 |       },
23 |       "locked": {
24 |         "lastModified": 1694529238,
25 |         "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
26 |         "owner": "numtide",
27 |         "repo": "flake-utils",
28 |         "rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
29 |         "type": "github"
30 |       },
31 |       "original": {
32 |         "owner": "numtide",
33 |         "repo": "flake-utils",
34 |         "type": "github"
35 |       }
36 |     },
37 |     "flake-utils-plus": {
38 |       "inputs": {
39 |         "flake-utils": "flake-utils"
40 |       },
41 |       "locked": {
42 |         "lastModified": 1715533576,
43 |         "narHash": "sha256-fT4ppWeCJ0uR300EH3i7kmgRZnAVxrH+XtK09jQWihk=",
44 |         "owner": "gytis-ivaskevicius",
45 |         "repo": "flake-utils-plus",
46 |         "rev": "3542fe9126dc492e53ddd252bb0260fe035f2c0f",
47 |         "type": "github"
48 |       },
49 |       "original": {
50 |         "owner": "gytis-ivaskevicius",
51 |         "repo": "flake-utils-plus",
52 |         "rev": "3542fe9126dc492e53ddd252bb0260fe035f2c0f",
53 |         "type": "github"
54 |       }
55 |     },
56 |     "nixpkgs": {
57 |       "locked": {
58 |         "lastModified": 
1729755165,
59 |         "narHash": "sha256-6IpnOHWsaSSjT3yvqlrWfHW6HVCT+wOAlUpcooGJ+FQ=",
60 |         "owner": "NixOS",
61 |         "repo": "nixpkgs",
62 |         "rev": "cabaf14d3e69c9921d7acedf5d7d60bb2b90be02",
63 |         "type": "github"
64 |       },
65 |       "original": {
66 |         "owner": "NixOS",
67 |         "ref": "nixpkgs-unstable",
68 |         "repo": "nixpkgs",
69 |         "type": "github"
70 |       }
71 |     },
72 |     "root": {
73 |       "inputs": {
74 |         "nixpkgs": "nixpkgs",
75 |         "rust-overlay": "rust-overlay",
76 |         "snowfall-lib": "snowfall-lib"
77 |       }
78 |     },
79 |     "rust-overlay": {
80 |       "inputs": {
81 |         "nixpkgs": [
82 |           "nixpkgs"
83 |         ]
84 |       },
85 |       "locked": {
86 |         "lastModified": 1729736953,
87 |         "narHash": "sha256-Rb6JUop7NRklg0uzcre+A+Ebrn/ZiQPkm4QdKg6/3pw=",
88 |         "owner": "oxalica",
89 |         "repo": "rust-overlay",
90 |         "rev": "29b1275740d9283467b8117499ec8cbb35250584",
91 |         "type": "github"
92 |       },
93 |       "original": {
94 |         "owner": "oxalica",
95 |         "repo": "rust-overlay",
96 |         "type": "github"
97 |       }
98 |     },
99 |     "snowfall-lib": {
100 |       "inputs": {
101 |         "flake-compat": "flake-compat",
102 |         "flake-utils-plus": "flake-utils-plus",
103 |         "nixpkgs": [
104 |           "nixpkgs"
105 |         ]
106 |       },
107 |       "locked": {
108 |         "lastModified": 1719005984,
109 |         "narHash": "sha256-mpFl3Jv4fKnn+5znYXG6SsBjfXHJdRG5FEqNSPx0GLA=",
110 |         "owner": "snowfallorg",
111 |         "repo": "lib",
112 |         "rev": "c6238c83de101729c5de3a29586ba166a9a65622",
113 |         "type": "github"
114 |       },
115 |       "original": {
116 |         "owner": "snowfallorg",
117 |         "repo": "lib",
118 |         "type": "github"
119 |       }
120 |     },
121 |     "systems": {
122 |       "locked": {
123 |         "lastModified": 1681028828,
124 |         "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
125 |         "owner": "nix-systems",
126 |         "repo": "default",
127 |         "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
128 |         "type": "github"
129 |       },
130 |       "original": {
131 |         "owner": "nix-systems",
132 |         "repo": "default",
133 |         "type": "github"
134 |       }
135 |     }
136 |   },
137 |   "root": "root",
138 |   "version": 7
139 | }
140 | 
--------------------------------------------------------------------------------
/flake.nix:
--------------------------------------------------------------------------------
1 | {
2 |   inputs = {
3 |     nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
4 |     snowfall-lib = {
5 |       url = "github:snowfallorg/lib";
6 |       inputs.nixpkgs.follows = "nixpkgs";
7 |     };
8 |     rust-overlay = {
9 |       url = "github:oxalica/rust-overlay";
10 |       inputs.nixpkgs.follows = "nixpkgs";
11 |     };
12 |   };
13 | 
14 |   outputs = { snowfall-lib, rust-overlay, ... } @ inputs:
15 |     snowfall-lib.mkFlake {
16 |       inherit inputs;
17 |       src = ./nix;
18 | 
19 |       overlays = [
20 |         (import rust-overlay)
21 |       ];
22 |     };
23 | }
--------------------------------------------------------------------------------
/nix/shells/default/default.nix:
--------------------------------------------------------------------------------
1 | { pkgs, lib, ...
}:
2 | let
3 |   # Libs and model files are needed to run the examples for testing purposes.
4 |   voskVersion = "0.3.45";
5 |   arch = builtins.elemAt (lib.strings.splitString "-" pkgs.system) 0;
6 |   voskLib = pkgs.fetchzip {
7 |     url = "https://github.com/alphacep/vosk-api/releases/download/v${voskVersion}/vosk-linux-${arch}-${voskVersion}.zip";
8 |     hash = "sha256-ToMDbD5ooFMHU0nNlfpLynF29kkfMknBluKO5PipLFY=";
9 |   };
10 |   model = pkgs.fetchzip {
11 |     url = "https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip";
12 |     hash = "sha256-CIoPZ/krX+UW2w7c84W3oc1n4zc9BBS/fc8rVYUthuY=";
13 |   };
14 |   speakerModel = pkgs.fetchzip {
15 |     url = "https://alphacephei.com/vosk/models/vosk-model-spk-0.4.zip";
16 |     hash = "sha256-wpTfZnEL1sCfpLhp+l62d8GcOinR15XnSHaLVASH4RA=";
17 |   };
18 | in
19 | pkgs.mkShell {
20 |   buildInputs = with pkgs; [
21 |     (rust-bin.stable.latest.default.override {
22 |       extensions = [ "rust-src" ];
23 |     })
24 |     alsa-lib
25 |   ];
26 |   nativeBuildInputs = with pkgs; [ pkg-config ];
27 | 
28 |   RUSTFLAGS = "-L${voskLib}";
29 |   LD_LIBRARY_PATH = lib.makeLibraryPath [
30 |     pkgs.stdenv.cc.cc
31 |     voskLib
32 |   ];
33 | 
34 |   # Model paths for running the examples, exposed to the shell as $MODEL and $SPEAKER_MODEL.
35 |   MODEL = model;
36 |   SPEAKER_MODEL = speakerModel;
37 | }
--------------------------------------------------------------------------------