├── .github
│   └── FUNDING.yml
├── .gitignore
├── LICENSE
├── README.md
├── cmd
│   └── main.v
├── v.mod
├── vpkg.json
└── vspeech.v
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
4 | patreon: # Replace with a single Patreon username
5 | open_collective: # Replace with a single Open Collective username
6 | ko_fi: thecodrr
7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
13 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | vave/
2 | vave_test
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Abdullah Atta
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
7 |
8 | ## Installation:
9 |
10 | Install using `vpkg`
11 |
12 | ```bash
13 | vpkg get https://github.com/thecodrr/vspeech
14 | ```
15 |
16 | Install using `V`'s builtin `vpm` (with this method of installation you will need to import the module as `import thecodrr.vspeech`):
17 |
18 | ```shell
19 | v install thecodrr.vspeech
20 | ```
21 |
22 | Install using `git`:
23 |
24 | ```bash
25 | cd path/to/your/project
26 | git clone https://github.com/thecodrr/vspeech
27 | ```
28 |
29 | You can use [thecodrr.vave](https://github.com/thecodrr/vave) for reading WAV files.
30 |
31 | Then, wherever you want to use it:
32 |
33 | ```v
34 | import thecodrr.vspeech // or simply `import vspeech`, depending on how you installed
35 | // Optional
36 | import thecodrr.vave
37 | ```
38 |
39 | ### Manual:
40 |
41 | **Perform the following steps:**
42 |
43 | 1. Download the latest `native_client.<platform>.tar.xz` matching your system from [DeepSpeech's Releases](https://github.com/mozilla/DeepSpeech/releases/tag/v0.6.1).
44 |
45 | 2. Extract the `.tar.xz` into a `lib` folder inside your project directory. **It MUST be in the `lib` folder (the same folder referenced by `LD_LIBRARY_PATH` below). If you don't have one, create it and extract into it.**
46 |
47 | 3. Download `pre-trained` model from [DeepSpeech's Releases](https://github.com/mozilla/DeepSpeech/releases/tag/v0.6.1) (the file named `deepspeech-0.6.1-models.tar.gz`). It's pretty big (1.1G) so make sure you have the space.
48 |
49 | 4. Extract the model anywhere you like on your system.
50 |
51 | 5. **Extra:** If you don't have any audio files for testing, you can download the samples from [DeepSpeech's Releases](https://github.com/mozilla/DeepSpeech/releases/tag/v0.6.1) (the file named `audio-0.6.1.tar.gz`).
52 |
53 | 6. When you are done, run this command in your project directory:
54 |
55 | ```bash
56 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/lib/
57 | ```
58 |
59 | And done!
60 |
61 | ### Automatic:
62 |
63 | _// TODO_
64 |
65 | I will add a `bash` script to automate this process, including the downloading and extracting. PRs welcome.
66 |
67 | ## Usage
68 |
69 | **There is a complete example of how to use this module in [`cmd/main.v`](https://github.com/thecodrr/vspeech/tree/master/cmd/main.v)**
70 |
71 | ```v
72 | import thecodrr.vspeech
73 | // specify values for use later
74 | const (
75 | beam_width = 300
76 | lm_weight = 0.75
77 | valid_word_count_weight = 1.85
78 | )
79 | // create a new model
80 | mut model := vspeech.new("/path/to/the/model.pbmm", beam_width)
81 |
82 | lm := "/path/to/the/lm/file" // it's in the models archive
83 | trie := "/path/to/the/trie/file" // it's in the models archive
84 | // enable the decoder with language model (optional)
85 | model.enable_decoder_with_lm(lm, trie, lm_weight, valid_word_count_weight)
86 |
87 | data := byteptr(0) // raw audio samples (use the thecodrr.vave module for this)
88 | data_len := 0 //the total length of the buffer
89 | // convert the audio to text
90 | text := model.speech_to_text(data, data_len)
91 | println(text)
92 |
93 | // make sure to free everything
94 | unsafe {
95 | model.free()
96 | model.free_string(text)
97 | }
98 | ```
99 |
100 | ## API
101 |
102 | #### `vspeech.new(model_path, beam_size)`
103 |
104 | Creates a new `Model` with the specified `model_path` and `beam_size`.
105 |
106 | `beam_size` decides the balance between accuracy and cost. The larger the `beam_size`, the more accurate the decoding, but at the cost of time and resources.
107 |
108 | `model_path` is the path to the model file. This is the file with the `.pb` extension, but it is better to use the `.pbmm` file as it is memory-mapped (mmapped) and lighter on RAM.
109 |
110 | ### Model `struct`
111 |
112 | The main `struct`, representing the interface to the underlying model. It has the following methods:
113 |
114 | #### 1. `enable_decoder_with_lm(lm_path, trie_path, lm_weight, valid_word_count_weight)`
115 |
116 | Loads the language model and enables the decoder to use it. Read the method comments to see what each parameter does.
117 |
118 | #### 2. `get_model_sample_rate()`
119 |
120 | Use this to get the sample rate expected by the model. The audio samples you want to convert **MUST** match this sample rate.
121 |
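A quick way to use this is to sanity-check your input before decoding. This is only a sketch: `wav_sample_rate` and its value are placeholders here; in real code read the rate from the WAV header (e.g. via [thecodrr.vave](https://github.com/thecodrr/vave)).

```v
import thecodrr.vspeech

mut model := vspeech.new("/path/to/the/model.pbmm", 300)
expected := model.get_model_sample_rate()
wav_sample_rate := 16000 // placeholder: read this from your WAV file's header
if wav_sample_rate != expected {
	panic("audio must be resampled to ${expected} Hz before decoding")
}
```
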
122 | #### 3. `speech_to_text(buffer, buffer_size)`
123 |
124 | This is the method that you are looking for. It's where all the magic happens (and also all the bugs).
125 |
126 | `buffer` is the audio data that needs to be decoded. Currently DeepSpeech supports 16-bit RAW PCM audio stream at the appropriate sample rate. You can use [thecodrr.vave](https://github.com/thecodrr/vave) to read audio samples from a WAV file.
127 |
128 | `buffer_size` is the number of samples in the buffer.
129 |
130 | #### 4. `speech_to_text_with_metadata(buffer, buffer_size)`
131 |
132 | Same as `speech_to_text` except this returns a `Metadata` struct that you can use for output analysis etc.
133 |
134 | #### 5. `create_stream()`
135 |
136 | Create a stream for streaming audio data (from a microphone, for example) into the decoder. This, however, isn't an actual stream, i.e. there's no seek etc. This will initialize the `streaming_state` field in your `Model` instance, which you can use as mentioned below.
137 |
138 | #### 6. `free()`
139 |
140 | Free the `Model`
141 |
142 | #### 7. `free_string(text)`
143 |
144 | Free the `string` returned by the decoder in `speech_to_text`.
145 |
146 | ### StreamingState
147 |
148 | The streaming state is used to handle pseudo-streaming of audio content into the decoder. It exposes the following methods:
149 |
150 | #### 1. `feed_audio_content(buffer, buffer_size)`
151 |
152 | Use this for feeding multiple chunks of data into the stream continuously.
153 |
154 | #### 2. `intermediate_decode()`
155 |
156 | You can use this to get the transcript of the data fed into the stream so far. However, it is quite expensive because the decoder itself has no streaming capability, so it re-decodes from the beginning of the audio each time. Use it only when necessary.
157 |
158 | #### 3. `finish_stream()`
159 |
160 | Call this when streaming is finished and you want the final output of the whole stream.
161 |
162 | #### 4. `finish_stream_with_metadata()`
163 |
164 | Same as `finish_stream` but returns a `Metadata` struct which you can use to analyze the output.
165 |
166 | #### 5. `free()`
167 |
168 | Call this when done to free the captured StreamingState.
169 |
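Putting the streaming methods together, here is a minimal sketch of the flow. The chunk-feeding loop is only indicated in comments; `chunk` and `chunk_len` are placeholder names for whatever buffers your audio capture produces:

```v
import thecodrr.vspeech

mut model := vspeech.new("/path/to/the/model.pbmm", 300)
model.create_stream() // initializes model.streaming_state
stream := model.streaming_state

// feed raw 16-bit PCM chunks as they arrive, e.g. from a microphone:
// stream.feed_audio_content(chunk, chunk_len)

// optionally peek at the transcript so far (expensive, see intermediate_decode above):
// partial := stream.intermediate_decode()

text := stream.finish_stream() // final transcript for the whole stream
println(text)

unsafe {
	model.free_string(text)
	model.free()
}
```
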
170 | ### Metadata
171 |
172 | **Fields:**
173 |
174 | `items` - An array of `MetadataItem`s
175 |
176 | `num_items` - Total number of items in the items array.
177 |
178 | `confidence` - Approximated confidence value for this transcription.
179 |
180 | **Methods:**
181 |
182 | `get_items()` - Converts the C `MetadataItem` pointer array into a V array which you can iterate over normally.
183 |
184 | `get_text()` - Helper method that combines the text from all the `MetadataItem`s and returns it as one `string`.
185 |
186 | `free()` - Free the `Metadata` instance
187 |
188 | ### MetadataItem
189 |
190 | **Fields:**
191 |
192 | `character` - The character generated for transcription
193 |
194 | `timestep` - Position of the character in units of 20ms
195 |
196 | `start_time` - Position of the character in seconds
197 |
198 | **Methods:**
199 |
200 | `str()` - Combines all the data in the `MetadataItem` into a nicely formatted `string`.
201 |
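As a rough sketch of how `Metadata` and `MetadataItem` fit together (assuming `model`, `data` and `data_len` are set up as in the Usage section above):

```v
output := model.speech_to_text_with_metadata(data, data_len)
println("confidence: ${output.confidence}")
println(output.get_text())
for item in output.get_items() {
	println(item.str()) // character, timestep and start_time of each item
}
unsafe {
	output.free()
}
```
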
202 | ### Find this library useful? :heart:
203 |
204 | Support it by joining **[stargazers](https://github.com/thecodrr/vspeech/stargazers)** for this repository :star: or [buy me a cup of coffee](https://ko-fi.com/thecodrr).
205 | And **[follow](https://github.com/thecodrr)** me for my next creations! 🤩
206 |
207 | # License
208 |
209 | ```
210 | MIT License
211 |
212 | Copyright (c) 2019 Abdullah Atta
213 |
214 | Permission is hereby granted, free of charge, to any person obtaining a copy
215 | of this software and associated documentation files (the "Software"), to deal
216 | in the Software without restriction, including without limitation the rights
217 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
218 | copies of the Software, and to permit persons to whom the Software is
219 | furnished to do so, subject to the following conditions:
220 |
221 | The above copyright notice and this permission notice shall be included in all
222 | copies or substantial portions of the Software.
223 |
224 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
225 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
226 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
227 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
228 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
229 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
230 | SOFTWARE.
231 | ```
232 |
--------------------------------------------------------------------------------
/cmd/main.v:
--------------------------------------------------------------------------------
1 | module main
2 |
3 | import (
4 | thecodrr.vave
5 | thecodrr.vspeech
6 | os
7 | flag
8 | )
9 |
10 | const (
11 | beam_width = 300
12 | lm_weight = 0.75
13 | valid_word_count_weight = 1.85
14 | )
15 |
16 | fn main(){
17 | mut fp := flag.new_flag_parser(os.args)
18 | fp.application('vspeech')
19 | fp.version('v0.0.1')
20 | fp.description('A simple tool for converting speech to text using DeepSpeech.')
21 | fp.skip_executable()
22 |
23 | model := fp.string('model', '', "The path to the trained model file.")
24 | lm := fp.string('lm', '', "The path to the language model binary.")
25 | trie := fp.string('trie', '', "The path to the trie file.")
26 | audio := fp.string('audio', '', "The path to the audio file.")
27 |
28 | if os.args.len < 5 {
29 | println(fp.usage())
30 | return
31 | }
32 |
33 | mut w := vave.open(audio, "r")
34 | defer {w.close()}
35 |
36 | data := w.read_raw()
37 |
38 | mut m := vspeech.new(model, beam_width)
39 |
40 | m.enable_decoder_with_lm(lm, trie, lm_weight, valid_word_count_weight)
41 |
42 | output := m.speech_to_text_with_metadata(data, w.data_len())
43 |
44 | println(output.get_text())
45 |
46 | //free everything
47 | unsafe {
48 | free(data)
49 | m.free()
50 | output.free()
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/v.mod:
--------------------------------------------------------------------------------
1 | Module {
2 | name: 'vspeech'
3 | version: '0.0.1'
4 | deps: ['thecodrr.vave']
5 | }
--------------------------------------------------------------------------------
/vpkg.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "vspeech",
3 | "version": "0.0.1",
4 | "author": ["thecodrr "],
5 | "repo": "https://github.com/thecodrr/vspeech",
6 | "sources": ["https://v-pkg.github.io/registry/"],
7 | "dependencies": ["https://github.com/thecodrr/vave.git"]
8 | }
9 |
--------------------------------------------------------------------------------
/vspeech.v:
--------------------------------------------------------------------------------
1 | module vspeech
2 |
3 | // NOTE: must call `export LD_LIBRARY_PATH=$PWD/lib/` before using this.
4 | #flag -L $PWD/lib/
5 | #flag -I $PWD/lib/
6 | #flag -ldeepspeech
7 | #include <deepspeech.h>
8 |
9 | struct C.ModelState
10 | struct C.StreamingState
11 |
12 | // MetadataItem stores each individual character, along with its timing information
13 | struct C.MetadataItem {
14 | pub:
15 | character byteptr // The character generated for transcription
16 | timestep int // Position of the character in units of 20ms
17 | start_time f32 // Position of the character in seconds
18 | }
19 |
20 | // Metadata stores the entire CTC output as an array of character metadata objects
21 | struct C.Metadata{
22 | pub:
23 | items &MetadataItem // List of items
24 | num_items int // Size of the list of items
25 | /* Approximated confidence value for this transcription. This is roughly the
26 | * sum of the acoustic model logit values for each timestep/character that
27 | * contributed to the creation of this transcription.
28 | */
29 | confidence f64
30 | }
31 |
32 | // primary
33 | fn C.DS_CreateModel() int
34 | fn C.DS_EnableDecoderWithLM() int
35 | fn C.DS_GetModelSampleRate() int
36 | fn C.DS_SpeechToText() byteptr
37 | fn C.DS_SpeechToTextWithMetadata() &Metadata
38 |
39 | // streaming
40 | fn C.DS_CreateStream() int
41 | fn C.DS_FeedAudioContent()
42 | fn C.DS_IntermediateDecode() byteptr
43 | fn C.DS_FinishStream() byteptr
44 | fn C.DS_FinishStreamWithMetadata() &Metadata
45 |
46 | // all functions related to freeing resources
47 | fn C.DS_FreeModel()
48 | fn C.DS_FreeMetadata()
49 | fn C.DS_FreeString()
50 | fn C.DS_FreeStream()
51 |
52 | // Model represents a DeepSpeech model
53 | struct Model {
54 | beam_width int
55 | model_path string
56 | model_state &ModelState
57 | pub:
58 | streaming_state &StreamingState
59 | }
60 |
61 | // new_model creates a new Model
62 | //
63 | // model_path The path to the frozen model graph.
64 | // beam_width The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time.
65 | pub fn new(model_path string, beam_width int) &Model {
66 | mut model := &Model{
67 | beam_width: beam_width
68 | model_path: model_path
69 | model_state: C.NULL
70 | streaming_state: C.NULL
71 | }
72 | ret := C.DS_CreateModel(model_path.str, beam_width, &model.model_state)
73 | if ret > 0 {
74 | panic("Failed to create Model. Error code: ${ret.str()}")
75 | }
76 | return model
77 | }
78 |
79 | // free frees the model
80 | pub fn (m &Model) free(){
81 | C.DS_FreeModel(m.model_state)
82 | }
83 |
84 | // free_string frees the speech-to-text string
85 | //
86 | // text the speech-to-text string gotten from DeepSpeech.
87 | pub fn (m &Model) free_string(text string){
88 | C.DS_FreeString(text.str)
89 | }
90 |
91 | // enable_decoder_with_lm enables decoding using beam scoring with a KenLM language model.
92 | //
93 | // lm_path The path to the language model binary file.
94 | // trie_path The path to the trie file build from the same vocabulary as the language model binary.
95 | // lm_weight The weight to give to language model results when scoring.
96 | // valid_word_count_weight The weight (bonus) to give to beams when adding a new valid word to the decoding.
97 | pub fn (m mut Model) enable_decoder_with_lm(lm_path, trie_path string, lm_weight, valid_word_count_weight f64) {
98 | result := C.DS_EnableDecoderWithLM(m.model_state, lm_path.str, trie_path.str, lm_weight, valid_word_count_weight)
99 | if result > 0 {
100 | panic("Failed to enable decoder with language model. Error code: ${result.str()}")
101 | }
102 | }
103 |
104 | // get_model_sample_rate reads the sample rate that was used to produce the model file.
105 | pub fn (m &Model) get_model_sample_rate() int {
106 | return C.DS_GetModelSampleRate(m.model_state)
107 | }
108 |
109 | // speech_to_text uses the DeepSpeech model to perform Speech-To-Text.
110 | // buffer A 16-bit, mono raw audio signal at the appropriate sample rate.
111 | // bufferSize The number of samples in the audio signal.
112 | pub fn (m &Model) speech_to_text(buffer byteptr, buffer_size int) string {
113 | str := C.DS_SpeechToText(m.model_state, buffer, buffer_size)
114 | if str == C.NULL {
115 | panic("speech_to_text: error converting audio to text.")
116 | }
117 | return string(str)
118 | }
119 |
120 | // speech_to_text_with_metadata uses the DeepSpeech model to perform Speech-To-Text and output metadata about the results.
121 | //
122 | // buffer A 16-bit, mono raw audio signal at the appropriate sample rate.
123 | // buffer_size The number of samples in the audio signal.
124 | pub fn (m &Model) speech_to_text_with_metadata(buffer byteptr, buffer_size int) &Metadata {
125 | metadata := C.DS_SpeechToTextWithMetadata(m.model_state, buffer, buffer_size)
126 | if metadata == C.NULL {
127 | panic("speech_to_text_with_metadata: error converting audio to text.")
128 | }
129 | return metadata
130 | }
131 |
132 | // create_stream creates a new streaming inference state. The streaming state returned
133 | // by this function can then be passed to feed_audio_content()
134 | // and finish_stream().
135 | pub fn (m &Model) create_stream() {
136 | ret := C.DS_CreateStream(m.model_state, &m.streaming_state)
137 | if ret > 0 {
138 | panic("create_stream: error creating stream.")
139 | }
140 | }
141 |
142 | // feed_audio_content feeds audio samples to an ongoing streaming inference.
143 | //
144 | // buffer A 16-bit, mono raw audio signal at the appropriate sample rate.
145 | // buffer_size The number of samples in the audio signal.
146 | pub fn (s &StreamingState) feed_audio_content(buffer byteptr, buffer_size int) {
147 | C.DS_FeedAudioContent(s, buffer, buffer_size)
148 | }
149 |
150 | // intermediate_decode computes the intermediate decoding of an ongoing streaming inference.
151 | // This is an expensive process as the decoder implementation isn't
152 | // currently capable of streaming, so it always starts from the beginning
153 | // of the audio.
154 | pub fn (s &StreamingState) intermediate_decode() string {
155 | str := C.DS_IntermediateDecode(s)
156 | if str == C.NULL {
157 | panic("intermediate_decode: error computing the text from the stream.")
158 | }
159 | return string(str)
160 | }
161 |
162 | // finish_stream signals the end of an audio signal to an ongoing streaming
163 | // inference, returns the STT result over the whole audio signal.
164 | pub fn (s &StreamingState) finish_stream() string {
165 | str := C.DS_FinishStream(s)
166 | if str == C.NULL {
167 | panic("finish_stream: error finishing the stream.")
168 | }
169 | return string(str)
170 | }
171 |
172 | // finish_stream_with_metadata signals the end of an audio signal to an ongoing streaming
173 | // inference, returns per-letter metadata.
174 | pub fn (s &StreamingState) finish_stream_with_metadata() &Metadata {
175 | metadata := C.DS_FinishStreamWithMetadata(s)
176 | if metadata == C.NULL {
177 | panic("finish_stream_with_metadata: error finishing the stream.")
178 | }
179 | return metadata
180 | }
181 |
182 | // free frees the stream.
183 | pub fn (s &StreamingState) free() {
184 | C.DS_FreeStream(s)
185 | }
186 |
187 | // get_items converts the C MetadataItem array to V MetadataItem array
188 | pub fn (m &Metadata) get_items() []MetadataItem {
189 | mut arr := []MetadataItem
190 | for i in 0..m.num_items {
191 | arr << m.items[i]
192 | }
193 | return arr
194 | }
195 |
196 | // get_text joins all the characters in the Metadata into one string
197 | pub fn (m &Metadata) get_text() string {
198 | mut str := [`0`].repeat(m.num_items)
199 | for i in 0..m.num_items {
200 | str[i] = *m.items[i].character
201 | }
202 | return string(byteptr(str.data))
203 | }
204 |
205 | // free frees the Metadata
206 | pub fn (m &Metadata) free() {
207 | C.DS_FreeMetadata(m)
208 | }
209 |
210 | // str returns the string representation of the MetadataItem
211 | pub fn (m &MetadataItem) str() string {
212 | return 'Character: ${m.character}\nTimestep: ${m.timestep}\nStart time: ${m.start_time}\n'
213 | }
--------------------------------------------------------------------------------