├── .github
│   └── FUNDING.yml
├── .gitignore
├── LICENSE
├── README.md
├── cmd
│   └── main.v
├── v.mod
├── vpkg.json
└── vspeech.v

--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
# These are supported funding model platforms

github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon: # Replace with a single Patreon username
open_collective: # Replace with a single Open Collective username
ko_fi: thecodrr
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay: # Replace with a single Liberapay username
issuehunt: # Replace with a single IssueHunt username
otechie: # Replace with a single Otechie username
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
vave/
vave_test

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 Abdullah Atta

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# 📣 vSpeech 📜

V bindings for Mozilla's DeepSpeech TensorFlow based library for Speech-to-Text.

![demo](showb3037c75870403f5.gif)

## Installation:

Install using `vpkg`

```bash
vpkg get https://github.com/thecodrr/vspeech
```

Install using `V`'s built-in `vpm` (with this method of installation you will need to import the module with `import thecodrr.vspeech`):

```shell
v install thecodrr.vspeech
```

Install using `git`:

```bash
cd path/to/your/project
git clone https://github.com/thecodrr/vspeech
```

You can use [thecodrr.vave](https://github.com/thecodrr/vave) for reading WAV files.

Then, wherever you want to use it:

```v
import thecodrr.vspeech // OR simply vspeech, depending on how you installed
// Optional
import thecodrr.vave
```

### Manual:

**Perform the following steps:**

1. Download the latest `native_client.<your platform>.tar.xz` matching your system from [DeepSpeech's Releases](https://github.com/mozilla/DeepSpeech/releases/tag/v0.6.1).

2. Extract the `.tar.xz` into a `lib` folder in your project directory. **It MUST be in the `lib` folder. If you don't have one, create it and extract into it.**

3. Download the pre-trained model from [DeepSpeech's Releases](https://github.com/mozilla/DeepSpeech/releases/tag/v0.6.1) (the file named `deepspeech-0.6.1-models.tar.gz`). It's pretty big (1.1G) so make sure you have the space.

4. Extract the model anywhere you like on your system.

5. **Extra:** If you don't have any audio files for testing, you can download the samples from [DeepSpeech's Releases](https://github.com/mozilla/DeepSpeech/releases/tag/v0.6.1) (the file named `audio-0.6.1.tar.gz`).

6. When you are done, run this command in your project directory:

```
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/lib/
```

And done!

### Automatic:

_// TODO_

I will add a `bash` script for automating this process, including the downloading and extracting. PRs welcome.

## Usage

**There is a complete example of how to use this module in [`cmd/main.v`](https://github.com/thecodrr/vspeech/tree/master/cmd/main.v)**

```v
import thecodrr.vspeech
// specify values for use later
const (
	beam_width = 300
	lm_weight = 0.75
	valid_word_count_weight = 1.85
)
// create a new model
mut model := vspeech.new("/path/to/the/model.pbmm", beam_width)

lm := "/path/to/the/lm/file" // it's in the models archive
trie := "/path/to/the/trie/file" // it's in the models archive
// enable the decoder with the language model (optional)
model.enable_decoder_with_lm(lm, trie, lm_weight, valid_word_count_weight)

data := byteptr(0) // raw audio samples (use the thecodrr.vave module for this)
data_len := 0 // the number of samples in the buffer
// convert the audio to text
text := model.speech_to_text(data, data_len)
println(text)

// make sure to free everything
unsafe {
	model.free()
	model.free_string(text)
}
```

## API

#### `vspeech.new(model_path, beam_width)`

Creates a new `Model` with the specified `model_path` and `beam_width`.

`beam_width` decides the balance between accuracy and cost. The larger the `beam_width`, the more accurate the decoding will be, but at the cost of time and resources.

`model_path` is the path to the model file. It is the file with the `.pb` extension, but it is better to use the `.pbmm` file as it is memory-mapped (mmapped) and lighter on RAM.

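For example, a minimal sketch of loading a model and checking what it expects (the path and beam width below are placeholders, not recommendations; `get_model_sample_rate()` is described in the next section):

```v
import thecodrr.vspeech

fn main() {
	// prefer the mmapped .pbmm file from the models archive
	mut model := vspeech.new("/path/to/output_graph.pbmm", 300)
	// the pre-trained 0.6.1 English model expects 16000 Hz, 16-bit mono audio
	println("expected sample rate: ${model.get_model_sample_rate()}")
	unsafe { model.free() }
}
```
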
### Model `struct`

The main `struct` represents the interface to the underlying model. It has the following methods:

#### 1. `enable_decoder_with_lm(lm_path, trie_path, lm_weight, valid_word_count_weight)`

Load the language model and enable the decoder to use it. Read the method comments to know what each parameter does.

#### 2. `get_model_sample_rate()`

Use this to get the sample rate expected by the model. The audio samples you need to convert **MUST** match this sample rate.

#### 3. `speech_to_text(buffer, buffer_size)`

This is the method that you are looking for. It's where all the magic happens (and also all the bugs).

`buffer` is the audio data that needs to be decoded. Currently DeepSpeech supports a 16-bit, mono, raw PCM audio stream at the appropriate sample rate. You can use [thecodrr.vave](https://github.com/thecodrr/vave) to read audio samples from a WAV file.

`buffer_size` is the number of samples in the buffer.

#### 4. `speech_to_text_with_metadata(buffer, buffer_size)`

Same as `speech_to_text` except this returns a `Metadata` struct that you can use for output analysis etc.

#### 5. `create_stream()`

Create a stream for streaming audio data (from a microphone, for example) into the decoder. This, however, isn't an actual stream, i.e. there's no seek etc. This will initialize the `streaming_state` in your `Model` instance which you can use as mentioned below.

#### 6. `free()`

Free the `Model`.

#### 7. `free_string(text)`

Free the `string` output by the decoder in `speech_to_text`.

### StreamingState

The streaming state is used to handle pseudo-streaming of audio content into the decoder. It exposes the following methods (a short usage sketch follows the list):

#### 1. `feed_audio_content(buffer, buffer_size)`

Use this for feeding multiple chunks of data into the stream continuously.

#### 2. `intermediate_decode()`

You can use this to get the output of the current data in the stream. However, this is quite expensive because the decoder has no streaming capability and always starts again from the beginning of the audio. Use this only when necessary.

#### 3. `finish_stream()`

Call this when streaming is finished and you want the final output of the whole stream.

#### 4. `finish_stream_with_metadata()`

Same as `finish_stream` but returns a `Metadata` struct which you can use to analyze the output.

#### 5. `free()`

Call this when done to free the captured `StreamingState`.

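A minimal streaming sketch (the model path is a placeholder, and `chunk`/`chunk_size` stand in for real microphone or file data, just like the `byteptr(0)` placeholder in the usage example above):

```v
import thecodrr.vspeech

fn main() {
	mut model := vspeech.new("/path/to/output_graph.pbmm", 300)
	// initializes model.streaming_state
	model.create_stream()
	stream := model.streaming_state
	// feed chunks of 16-bit PCM samples as they arrive; call this repeatedly
	chunk := byteptr(0)
	chunk_size := 0 // number of samples in this chunk
	stream.feed_audio_content(chunk, chunk_size)
	// optional progress check (expensive: it re-decodes from the start)
	partial := stream.intermediate_decode()
	println(partial)
	// get the final transcription for the whole stream
	text := stream.finish_stream()
	println(text)
	unsafe {
		model.free_string(partial)
		model.free_string(text)
		model.free()
	}
}
```
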
### Metadata

**Fields:**

`items` - An array of `MetadataItem`s

`num_items` - Total number of items in the `items` array.

`confidence` - Approximated confidence value for this transcription.

**Methods:**

`get_items()` - Converts the C `MetadataItem` array pointer into a V array which you can iterate over normally.

`get_text()` - Helper method to get the combined text from all the `MetadataItem`s, outputting the result as one `string`.

`free()` - Free the `Metadata` instance.

### MetadataItem

**Fields:**

`character` - The character generated for transcription

`timestep` - Position of the character in units of 20ms

`start_time` - Position of the character in seconds

**Methods:**

`str()` - Combine and output all the data in the `MetadataItem` nicely into a `string` (see the sketch below).

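A small sketch that mirrors [`cmd/main.v`](https://github.com/thecodrr/vspeech/tree/master/cmd/main.v) and prints the per-character metadata (the paths are placeholders):

```v
import thecodrr.vave
import thecodrr.vspeech

fn main() {
	mut model := vspeech.new("/path/to/output_graph.pbmm", 300)
	mut w := vave.open("/path/to/audio.wav", "r")
	data := w.read_raw()
	output := model.speech_to_text_with_metadata(data, w.data_len())
	println("confidence: ${output.confidence}")
	for item in output.get_items() {
		print(item.str()) // character, timestep and start time
	}
	println(output.get_text()) // the combined transcription
	unsafe {
		output.free()
		model.free()
	}
	w.close()
}
```
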
### Find this library useful? :heart:

Support it by joining **[stargazers](https://github.com/thecodrr/vspeech/stargazers)** for this repository :star: or [buy me a cup of coffee](https://ko-fi.com/thecodrr).
And **[follow](https://github.com/thecodrr)** me for my next creations! 🤩

# License

```xml
MIT License

Copyright (c) 2019 Abdullah Atta

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
```

--------------------------------------------------------------------------------
/cmd/main.v:
--------------------------------------------------------------------------------
module main

import (
	thecodrr.vave
	thecodrr.vspeech
	os
	flag
)

const (
	beam_width = 300
	lm_weight = 0.75
	valid_word_count_weight = 1.85
)

fn main() {
	mut fp := flag.new_flag_parser(os.args)
	fp.application('vspeech')
	fp.version('v0.0.1')
	fp.description('A simple tool for converting speech to text using DeepSpeech.')
	fp.skip_executable()

	model := fp.string('model', '', "The path to the trained model file.")
	lm := fp.string('lm', '', "The path to the language model binary.")
	trie := fp.string('trie', '', "The path to the trie file.")
	audio := fp.string('audio', '', "The path to the audio file.")

	if os.args.len < 5 {
		println(fp.usage())
		return
	}

	mut w := vave.open(audio, "r")
	defer { w.close() }

	data := w.read_raw()

	mut m := vspeech.new(model, beam_width)

	m.enable_decoder_with_lm(lm, trie, lm_weight, valid_word_count_weight)

	output := m.speech_to_text_with_metadata(data, w.data_len())

	println(output.get_text())

	// free everything
	unsafe {
		free(data)
		m.free()
		output.free()
	}
}

--------------------------------------------------------------------------------
/v.mod:
--------------------------------------------------------------------------------
Module {
	name: 'vspeech'
	version: '0.0.1'
	deps: ['thecodrr.vave']
}

--------------------------------------------------------------------------------
/vpkg.json:
--------------------------------------------------------------------------------
{
  "name": "vspeech",
  "version": "0.0.1",
  "author": ["thecodrr "],
  "repo": "https://github.com/thecodrr/vspeech",
  "sources": ["https://v-pkg.github.io/registry/"],
  "dependencies": ["https://github.com/thecodrr/vave.git"]
}

--------------------------------------------------------------------------------
/vspeech.v:
--------------------------------------------------------------------------------
module vspeech

// NOTE: must call `export LD_LIBRARY_PATH=$PWD/lib/` before using this.
#flag -L $PWD/lib/
#flag -I $PWD/lib/
#flag -ldeepspeech
#include <deepspeech.h>

struct C.ModelState
struct C.StreamingState

// MetadataItem stores each individual character, along with its timing information
struct C.MetadataItem {
pub:
	character byteptr // The character generated for transcription
	timestep int // Position of the character in units of 20ms
	start_time f32 // Position of the character in seconds
}

// Metadata stores the entire CTC output as an array of character metadata objects
struct C.Metadata {
pub:
	items &MetadataItem // List of items
	num_items int // Size of the list of items
	/* Approximated confidence value for this transcription. This is roughly the
	 * sum of the acoustic model logit values for each timestep/character that
	 * contributed to the creation of this transcription.
	 */
	confidence f64
}

// primary
fn C.DS_CreateModel() int
fn C.DS_EnableDecoderWithLM() int
fn C.DS_GetModelSampleRate() int
fn C.DS_SpeechToText() byteptr
fn C.DS_SpeechToTextWithMetadata() &Metadata

// streaming
fn C.DS_CreateStream() int
fn C.DS_FeedAudioContent()
fn C.DS_IntermediateDecode() byteptr
fn C.DS_FinishStream() byteptr
fn C.DS_FinishStreamWithMetadata() &Metadata

// all functions related to freeing resources
fn C.DS_FreeModel()
fn C.DS_FreeMetadata()
fn C.DS_FreeString()
fn C.DS_FreeStream()

// Model represents a DeepSpeech model
struct Model {
	beam_width int
	model_path string
	model_state &ModelState
pub:
	streaming_state &StreamingState
}

// new creates a new Model
//
// model_path The path to the frozen model graph.
// beam_width The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time.
pub fn new(model_path string, beam_width int) &Model {
	mut model := &Model{
		beam_width: beam_width
		model_path: model_path
		model_state: C.NULL
		streaming_state: C.NULL
	}
	ret := C.DS_CreateModel(model_path.str, beam_width, &model.model_state)
	if ret > 0 {
		panic("Failed to create Model. Error code: ${ret.str()}")
	}
	return model
}

// free frees the model
pub fn (m &Model) free() {
	C.DS_FreeModel(m.model_state)
}

// free_string frees the speech-to-text string
//
// text the speech-to-text string gotten from DeepSpeech.
pub fn (m &Model) free_string(text string) {
	C.DS_FreeString(text.str)
}

// enable_decoder_with_lm enables decoding using beam scoring with a KenLM language model.
//
// lm_path The path to the language model binary file.
// trie_path The path to the trie file built from the same vocabulary as the language model binary.
// lm_weight The weight to give to language model results when scoring.
// valid_word_count_weight The weight (bonus) to give to beams when adding a new valid word to the decoding.
pub fn (m mut Model) enable_decoder_with_lm(lm_path, trie_path string, lm_weight, valid_word_count_weight f64) {
	result := C.DS_EnableDecoderWithLM(m.model_state, lm_path.str, trie_path.str, lm_weight, valid_word_count_weight)
	if result > 0 {
		panic("Failed to enable decoder with language model. Error code: ${result.str()}")
	}
}

// get_model_sample_rate reads the sample rate that was used to produce the model file.
pub fn (m &Model) get_model_sample_rate() int {
	return C.DS_GetModelSampleRate(m.model_state)
}

// speech_to_text uses the DeepSpeech model to perform Speech-To-Text.
// buffer A 16-bit, mono raw audio signal at the appropriate sample rate.
// buffer_size The number of samples in the audio signal.
pub fn (m &Model) speech_to_text(buffer byteptr, buffer_size int) string {
	str := C.DS_SpeechToText(m.model_state, buffer, buffer_size)
	if str == C.NULL {
		panic("speech_to_text: error converting audio to text.")
	}
	return string(str)
}

// speech_to_text_with_metadata uses the DeepSpeech model to perform Speech-To-Text and output metadata about the results.
//
// buffer A 16-bit, mono raw audio signal at the appropriate sample rate.
// buffer_size The number of samples in the audio signal.
pub fn (m &Model) speech_to_text_with_metadata(buffer byteptr, buffer_size int) &Metadata {
	metadata := C.DS_SpeechToTextWithMetadata(m.model_state, buffer, buffer_size)
	if metadata == C.NULL {
		panic("speech_to_text_with_metadata: error converting audio to text.")
	}
	return metadata
}

// create_stream creates a new streaming inference state. The streaming state returned
// by this function can then be passed to feed_audio_content()
// and finish_stream().
pub fn (m &Model) create_stream() {
	ret := C.DS_CreateStream(m.model_state, &m.streaming_state)
	if ret > 0 {
		panic("create_stream: error creating stream.")
	}
}

// feed_audio_content feeds audio samples to an ongoing streaming inference.
//
// buffer A 16-bit, mono raw audio signal at the appropriate sample rate.
// buffer_size The number of samples in the audio signal.
pub fn (s &StreamingState) feed_audio_content(buffer byteptr, buffer_size int) {
	C.DS_FeedAudioContent(s, buffer, buffer_size)
}

// intermediate_decode computes the intermediate decoding of an ongoing streaming inference.
// This is an expensive process as the decoder implementation isn't
// currently capable of streaming, so it always starts from the beginning
// of the audio.
pub fn (s &StreamingState) intermediate_decode() string {
	str := C.DS_IntermediateDecode(s)
	if str == C.NULL {
		panic("intermediate_decode: error computing the text from the stream.")
	}
	return string(str)
}

// finish_stream signals the end of an audio signal to an ongoing streaming
// inference, returns the STT result over the whole audio signal.
pub fn (s &StreamingState) finish_stream() string {
	str := C.DS_FinishStream(s)
	if str == C.NULL {
		panic("finish_stream: error finishing the stream.")
	}
	return string(str)
}

// finish_stream_with_metadata signals the end of an audio signal to an ongoing streaming
// inference, returns per-letter metadata.
pub fn (s &StreamingState) finish_stream_with_metadata() &Metadata {
	metadata := C.DS_FinishStreamWithMetadata(s)
	if metadata == C.NULL {
		panic("finish_stream_with_metadata: error finishing the stream.")
	}
	return metadata
}

// free frees the stream.
pub fn (s &StreamingState) free() {
	C.DS_FreeStream(s)
}

// get_items converts the C MetadataItem array to V MetadataItem array
pub fn (m &Metadata) get_items() []MetadataItem {
	mut arr := []MetadataItem
	for i in 0..m.num_items {
		arr << m.items[i]
	}
	return arr
}

// get_text joins all the characters in the Metadata into one string
pub fn (m &Metadata) get_text() string {
	mut str := [`0`].repeat(m.num_items)
	for i in 0..m.num_items {
		str[i] = *m.items[i].character
	}
	return string(byteptr(str.data))
}

// free frees the Metadata
pub fn (m &Metadata) free() {
	C.DS_FreeMetadata(m)
}

// str returns the string representation of the MetadataItem
pub fn (m &MetadataItem) str() string {
	return 'Character: ${m.character}\nTimestep: ${m.timestep}\nStart time: ${m.start_time}\n'
}

--------------------------------------------------------------------------------