├── go.mod ├── LICENSE.md ├── README.md ├── go.sum └── main.go /go.mod: -------------------------------------------------------------------------------- 1 | module vosk-sound-test 2 | 3 | go 1.19 4 | 5 | require ( 6 | github.com/fatih/color v1.13.0 7 | github.com/gen2brain/malgo v0.10.35 8 | github.com/go-audio/wav v1.1.0 9 | github.com/gorilla/websocket v1.5.0 10 | ) 11 | 12 | require ( 13 | github.com/go-audio/audio v1.0.0 // indirect 14 | github.com/go-audio/riff v1.0.0 // indirect 15 | github.com/mattn/go-colorable v0.1.9 // indirect 16 | github.com/mattn/go-isatty v0.0.14 // indirect 17 | github.com/stretchr/testify v1.8.0 // indirect 18 | golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c // indirect 19 | ) 20 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright © 2022 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### A Golang demo of how to use: 2 | - [VOSK](https://alphacephei.com/vosk/ "VOSK") for speech recognition. 3 | - [miniaudio](https://github.com/mackron/miniaudio "miniaudio") for capturing audio input from a **microphone**. 4 | 5 | 6 | ### What it does: 7 | 1. Captures audio input and sends it to a local voice recognition engine 8 | 2. Plays back the captured sound 9 | 3. Records the voice to a ".wav" file 10 | 11 | 12 | ### Usage: 13 | 1. `docker run -d -p 2700:2700 alphacep/kaldi-en:latest` 14 | This runs the official VOSK-server Docker image. 15 | 2. `./vosk-sound-test` 16 | It starts capturing and displays the words you say; it also saves the audio to the file **out.wav** 17 | 3. Press `Enter` to stop capturing and play back 18 | 4. Press `Enter` again to exit. 19 | 20 | ### Troubleshooting 21 | 22 | 1. Low sound quality 23 | 24 | Maybe you're using Bluetooth earbuds such as "AirPods". On systems like Linux, the Bluetooth stack limits the input sample rate to 8000 Hz, while 16000 Hz is the minimum sample rate for VOSK to work well. A dedicated wired/wireless microphone should work. 25 | 26 | 2. It doesn't work at all 27 | 28 | Open the system sound manager and verify the recording device while this program is capturing. Sometimes the wrong device is chosen by default. 29 | 30 | ### Build from source 31 | 0. Install [Golang](https://go.dev/dl/ "Golang") 32 | 1. `git clone https://github.com/aj3423/vosk-sound-test` 33 | 2. `cd vosk-sound-test` 34 | 3. 
`go build .` 35 | 36 | ### License 37 | MIT 38 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 2 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 3 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 4 | github.com/fatih/color v1.13.0 h1:8LOYc1KYPPmyKMuN8QV2DNRWNbLo6LZ0iLs8+mlH53w= 5 | github.com/fatih/color v1.13.0/go.mod h1:kLAiJbzzSOZDVNGyDpeOxJ47H46qBXwg5ILebYFFOfk= 6 | github.com/gen2brain/malgo v0.10.35 h1:D6aNo/Q0SnzQLHomTydTXxj4AJFdGJcVoE7I8JxPoUo= 7 | github.com/gen2brain/malgo v0.10.35/go.mod h1:zHSUNZAXfCeNsZou0RtQ6Zk7gDYLIcKOrUWtAdksnEs= 8 | github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4= 9 | github.com/go-audio/audio v1.0.0/go.mod h1:6uAu0+H2lHkwdGsAY+j2wHPNPpPoeg5AaEFh9FlA+Zs= 10 | github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA= 11 | github.com/go-audio/riff v1.0.0/go.mod h1:l3cQwc85y79NQFCRB7TiPoNiaijp6q8Z0Uv38rVG498= 12 | github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g= 13 | github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE= 14 | github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc= 15 | github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= 16 | github.com/mattn/go-colorable v0.1.9 h1:sqDoxXbdeALODt0DAeJCVp38ps9ZogZEAXjus69YV3U= 17 | github.com/mattn/go-colorable v0.1.9/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= 18 | github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= 19 | github.com/mattn/go-isatty v0.0.14 h1:yVuAays6BHfxijgZPzw+3Zlu5yQgKGP2/hcQbHb7S9Y= 20 | github.com/mattn/go-isatty v0.0.14/go.mod 
h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94= 21 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 22 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 23 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 24 | github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= 25 | github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 26 | github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= 27 | github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= 28 | golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 29 | golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 30 | golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c h1:F1jZWGFhYfh0Ci55sIpILtKKK8p3i2/krTr0H1rg74I= 31 | golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 32 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 33 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 34 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 35 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 36 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "flag" 6 | "fmt" 7 | "net/url" 8 | "os" 9 | "strings" 10 | 11 | "github.com/fatih/color" 12 | "github.com/gen2brain/malgo" 13 | "github.com/go-audio/wav" 14 | "github.com/gorilla/websocket" 15 | ) 16 | 17 | const SampleRate uint32 = 16000 // 8000 
is too low 18 | const OutFileName = "out.wav" 19 | 20 | var ( 21 | // a VOSK server address 22 | host string 23 | 24 | // set phrase to a limited dictionary to increase accuracy 25 | // not work for the model "vosk-model-en-us-0.22" (1.8G) 26 | // only works for the dynamic model "vosk-model-en-us-0.22-lgraph" (128M) 27 | limitWords bool 28 | ) 29 | 30 | func init() { 31 | flag.StringVar(&host, "host", "127.0.0.1:2700", "") 32 | flag.BoolVar(&limitWords, "limitWords", false, "") 33 | } 34 | 35 | func main() { 36 | flag.Parse() 37 | 38 | u := url.URL{Scheme: "ws", Host: host, Path: ""} 39 | ws, _, err := websocket.DefaultDialer.DialContext(context.Background(), u.String(), nil) 40 | chk(err) 41 | 42 | defer func() { 43 | // send finial msg 44 | ws.WriteMessage(websocket.TextMessage, []byte(`{"eof" : 1}`)) 45 | // read final msg 46 | ws.ReadMessage() 47 | ws.WriteMessage(websocket.CloseMessage, websocket.FormatCloseMessage(websocket.CloseNormalClosure, "")) 48 | ws.Close() 49 | }() 50 | 51 | var bs string 52 | 53 | if !limitWords { 54 | bs = fmt.Sprintf(` 55 | { 56 | "config" : { 57 | "sample_rate" : %d, 58 | "words": 0 59 | } 60 | }`, SampleRate) 61 | } else { 62 | bs = fmt.Sprintf(` 63 | { 64 | "config" : { 65 | "sample_rate" : %d, 66 | "phrase_list" : [ 67 | "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", 68 | "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "zed", 69 | 70 | "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", 71 | "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", 72 | "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety", 73 | "hundred", "thousand", 74 | ], 75 | "words": 0 76 | } 77 | }`, SampleRate) 78 | } 79 | ws.WriteMessage(websocket.TextMessage, []byte(bs)) 80 | 81 | go func() { 82 | for { 83 | _, msg, err2 := ws.ReadMessage() 84 | if err2 != nil { 85 | color.Red(err2.Error()) 86 | break 87 | } 88 | 
89 | if strings.Contains(string(msg), "text") { 90 | fmt.Println(string(msg)) 91 | } 92 | } 93 | }() 94 | 95 | ctx, err := malgo.InitContext(nil, malgo.ContextConfig{}, func(message string) { 96 | fmt.Printf("LOG <%v>\n", message) 97 | }) 98 | chk(err) 99 | 100 | defer func() { 101 | _ = ctx.Uninit() 102 | ctx.Free() 103 | }() 104 | 105 | deviceConfig := malgo.DefaultDeviceConfig(malgo.Capture) 106 | deviceConfig.Capture.Format = malgo.FormatS16 107 | deviceConfig.Capture.Channels = 1 108 | deviceConfig.SampleRate = SampleRate 109 | 110 | var playbackSampleCount uint32 111 | var capturedSampleCount uint32 112 | pCapturedSamples := make([]byte, 0) 113 | 114 | sizeInBytes := uint32(malgo.SampleSizeInBytes(deviceConfig.Capture.Format)) // == 2 115 | 116 | // ---- write to file ---- 117 | wavFile, err := os.Create(OutFileName) 118 | if err != nil { 119 | panic(err) 120 | } 121 | enc := wav.NewEncoder(wavFile, 122 | int(SampleRate), // SampleRate 123 | 16, // BitDepth 124 | 1, // Channels 125 | 1) // 1 == PCM 126 | 127 | device, err := malgo.InitDevice(ctx.Context, deviceConfig, malgo.DeviceCallbacks{ 128 | Data: func(_, pSample []byte, framecount uint32) { 129 | 130 | sampleCount := framecount * deviceConfig.Capture.Channels * sizeInBytes 131 | 132 | capturedSampleCount += sampleCount 133 | 134 | pCapturedSamples = append(pCapturedSamples, pSample...) 
135 | 136 | // ws_.Write(pSample) 137 | ws.WriteMessage(websocket.BinaryMessage, pSample) 138 | 139 | single_frame_len := len(pSample) / int(framecount) 140 | 141 | for i := 0; i < int(framecount); i++ { 142 | enc.WriteFrame(pSample[i*single_frame_len : i*single_frame_len+single_frame_len]) 143 | } 144 | }, 145 | }) 146 | chk(err) 147 | 148 | err = device.Start() 149 | chk(err) 150 | 151 | fmt.Println("Press Enter to stop recording...") 152 | fmt.Scanln() 153 | 154 | device.Stop() 155 | device.Uninit() 156 | 157 | enc.Close() 158 | wavFile.Close() 159 | color.Yellow("wav saved to file: %s", OutFileName) 160 | 161 | // ---- playback ---- 162 | { 163 | deviceConfig = malgo.DefaultDeviceConfig(malgo.Playback) 164 | deviceConfig.Playback.Format = malgo.FormatS16 165 | deviceConfig.Playback.Channels = 1 166 | deviceConfig.SampleRate = SampleRate 167 | 168 | color.Blue("Playing...") 169 | 170 | device, err = malgo.InitDevice(ctx.Context, deviceConfig, malgo.DeviceCallbacks{ 171 | Data: func(pSample, _ []byte, framecount uint32) { 172 | samplesToRead := framecount * deviceConfig.Playback.Channels * sizeInBytes 173 | if samplesToRead > capturedSampleCount-playbackSampleCount { 174 | samplesToRead = capturedSampleCount - playbackSampleCount 175 | } 176 | 177 | copy(pSample, pCapturedSamples[playbackSampleCount:playbackSampleCount+samplesToRead]) 178 | 179 | playbackSampleCount += samplesToRead 180 | 181 | if playbackSampleCount == uint32(len(pCapturedSamples)) { 182 | playbackSampleCount = 0 183 | } 184 | }, 185 | }) 186 | chk(err) 187 | 188 | err = device.Start() 189 | chk(err) 190 | 191 | fmt.Println("Press Enter to quit...") 192 | fmt.Scanln() 193 | 194 | device.Stop() 195 | device.Uninit() 196 | } 197 | } 198 | 199 | func chk(err error) { 200 | if err != nil { 201 | panic(err) 202 | } 203 | } 204 | --------------------------------------------------------------------------------