├── .gitignore ├── LICENSE ├── README.md ├── cmd └── stt-translator │ └── main.go ├── files ├── LICENSE └── silero_vad.onnx ├── go.mod ├── go.sum ├── sound └── resample.go ├── vad ├── detector.go ├── silero.go └── vad.go └── whisper └── server_api.go /.gitignore: -------------------------------------------------------------------------------- 1 | .idea -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Xbozon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # go-whisper-cpp-server-example 2 | 3 | Example using Silero VAD and whisper.cpp for speech recognition using go. 
For the article [Local, all-in-one Go speech-to-text solution with Silero VAD and whisper.cpp server](https://medium.com/@etolkachev93/local-all-in-one-go-speech-to-text-solution-with-silero-vad-and-whisper-cpp-server-94a69fa51b04). 4 | 5 | ## Dependencies (for mac) 6 | 7 | * Install whisper.cpp 8 | * Download the whisper model converted to ggml format: [ggerganov/whisper.cpp](https://huggingface.co/ggerganov/whisper.cpp) 9 | * Install onnxruntime: `brew install onnxruntime` 10 | 11 | ## How to run 12 | 13 | ```bash 14 | export LIBRARY_PATH=/opt/homebrew/Cellar/onnxruntime/1.17.1/lib 15 | C_INCLUDE_PATH=/opt/homebrew/Cellar/onnxruntime/1.17.1/include/onnxruntime go run main.go 16 | ``` -------------------------------------------------------------------------------- /cmd/stt-translator/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "io" 7 | "log" 8 | "math" 9 | "os" 10 | "os/signal" 11 | "strconv" 12 | "syscall" 13 | "time" 14 | 15 | "github.com/go-audio/audio" 16 | "github.com/go-audio/wav" 17 | "github.com/gordonklaus/portaudio" 18 | "github.com/orcaman/writerseeker" 19 | 20 | "github.com/Xbozon/stt-translator/sound" 21 | vadlib "github.com/Xbozon/stt-translator/vad" 22 | "github.com/Xbozon/stt-translator/whisper" 23 | ) 24 | 25 | const ( 26 | whisperHost = "http://127.0.0.1:6001/inference" 27 | sileroFilePath = "../../files/silero_vad.onnx" 28 | 29 | minMicVolume = 450 30 | sendToVADDelay = time.Second 31 | maxWhisperSegmentDuration = time.Second * 25 32 | ) 33 | 34 | func main() { 35 | portaudio.Initialize() 36 | defer portaudio.Terminate() 37 | 38 | ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) 39 | defer stop() 40 | 41 | // If there is no selected device, print all of them and exit. 
42 | args := os.Args[1:] 43 | if len(args) == 0 { 44 | printAvailableDevices() 45 | return 46 | } 47 | 48 | selectedDevice, err := selectInputDevice(args) 49 | if err != nil { 50 | log.Fatalf("select input device %s", err) 51 | return 52 | } 53 | 54 | done := make(chan bool) 55 | audioCtx, audioCancel := context.WithCancel(ctx) 56 | 57 | // Set up the audio stream parameters for LINEAR16 PCM 58 | in := make([]int16, 512*9) // Use int16 to capture 16-bit samples. 59 | audioStream, err := portaudio.OpenDefaultStream( 60 | selectedDevice.MaxInputChannels, 0, selectedDevice.DefaultSampleRate, len(in), &in, 61 | ) 62 | if err != nil { 63 | log.Fatalf("opening stream: %v", err) 64 | return 65 | } 66 | 67 | // Start the audio stream 68 | if err := audioStream.Start(); err != nil { 69 | log.Fatalf("starting stream: %v", err) 70 | return 71 | } 72 | 73 | // Silero VAD - pre-trained Voice Activity Detector. See: https://github.com/snakers4/silero-vad 74 | sileroVAD, err := vadlib.NewSileroDetector(sileroFilePath) 75 | if err != nil { 76 | log.Fatalf("creating silero detector: %v", err) 77 | } 78 | 79 | log.Println("started") 80 | 81 | var ( 82 | startListening time.Time 83 | processChan = make(chan []int16, 10) 84 | whisperChan = make(chan audio.Buffer, 10) 85 | buffer = make([]int16, 512*9) 86 | ) 87 | go func() { 88 | for { 89 | select { 90 | case <-audioCtx.Done(): 91 | if err := audioStream.Close(); err != nil { 92 | log.Println(err) 93 | } 94 | log.Println("got audioCtx.Done exit gracefully...") 95 | return 96 | default: 97 | // Read from the microphone 98 | if err := audioStream.Read(); err != nil { 99 | log.Printf("reading from stream: %v\n", err) 100 | continue 101 | } 102 | 103 | volume := calculateRMS16(in) 104 | if volume > minMicVolume { 105 | startListening = time.Now() 106 | } 107 | 108 | if time.Since(startListening) < sendToVADDelay && time.Since(startListening) < maxWhisperSegmentDuration { 109 | buffer = append(buffer, in...) 
110 | 111 | log.Println("listening...", volume) 112 | } else if len(buffer) > 0 { 113 | // Whisper and Silero accept audio with SampleRate = 16000. 114 | // 115 | // Resample also copies the buffer to another slice. Potentially, using a channel instead of a 116 | // buffer can achieve better performance. 117 | processChan <- sound.ResampleInt16(buffer, int(selectedDevice.DefaultSampleRate), 16000) 118 | 119 | buffer = buffer[:0] 120 | } 121 | } 122 | } 123 | }() 124 | 125 | // Responsible for checking recorded sections for the presence of the user's voice. 126 | go vad(sileroVAD, processChan, whisperChan) 127 | // Encodes the final sound into wav and sends to whisper. 128 | go process(whisperChan) 129 | 130 | // Shutdown. 131 | go func() { 132 | <-ctx.Done() 133 | if err := ctx.Err(); err != nil { 134 | log.Println(fmt.Errorf("shutdown: %w", err)) 135 | } 136 | audioCancel() 137 | close(done) 138 | }() 139 | 140 | <-done 141 | log.Println("finished") 142 | } 143 | 144 | func vad(silero *vadlib.SileroDetector, input <-chan []int16, output chan audio.Buffer) { 145 | soundIntBuffer := &audio.IntBuffer{ 146 | Format: &audio.Format{SampleRate: 16000, NumChannels: 1}, 147 | } 148 | 149 | for { 150 | soundIntBuffer.Data = sound.ConvertInt16ToInt(<-input) 151 | 152 | start := time.Now() 153 | detected, err := silero.DetectVoice(soundIntBuffer) 154 | if err != nil { 155 | log.Println(fmt.Errorf("detect voice: %w", err)) 156 | continue 157 | } 158 | log.Println("voice detecting result", time.Since(start), detected) 159 | 160 | if detected { 161 | log.Println("sending to whisper...") 162 | output <- soundIntBuffer.Clone() 163 | } 164 | } 165 | } 166 | 167 | func process(in <-chan audio.Buffer) { 168 | api := whisper.NewServerApi(whisperHost, whisper.Config{ 169 | Temperature: 0, 170 | TemperatureInc: 0.2, 171 | Timeout: time.Second * 6, 172 | }) 173 | 174 | for { 175 | data := <-in 176 | 177 | // Emulate a file in RAM so that we don't have to create a real file. 
178 | file := &writerseeker.WriterSeeker{} 179 | encoder := wav.NewEncoder(file, 16000, 16, 1, 1) 180 | 181 | // Write the audio buffer to the WAV file using the encoder 182 | if err := encoder.Write(data.AsIntBuffer()); err != nil { 183 | log.Println(fmt.Errorf("encoder write buffer: %w", err)) 184 | return 185 | } 186 | 187 | // Close the encoder to finalize the WAV file headers 188 | if err := encoder.Close(); err != nil { 189 | log.Println(fmt.Errorf("encoder close: %w", err)) 190 | return 191 | } 192 | 193 | // Read all data from the reader into memory 194 | wavData, err := io.ReadAll(file.Reader()) 195 | if err != nil { 196 | log.Println(fmt.Errorf("reading file into memory: %w", err)) 197 | return 198 | } 199 | 200 | start := time.Now() 201 | res, err := api.SendMultiPartForm(context.TODO(), wavData) 202 | if err != nil { 203 | log.Println(fmt.Errorf("sending multipart form: %w", err)) 204 | return 205 | } 206 | 207 | log.Println(fmt.Sprintf("done in: %s, result: %s", time.Since(start), res.Text)) 208 | } 209 | } 210 | 211 | func printAvailableDevices() { 212 | devices, err := portaudio.Devices() 213 | if err != nil { 214 | log.Fatalf("portaudio.Devices %s", err) 215 | return 216 | } 217 | for i, device := range devices { 218 | fmt.Printf( 219 | "ID: %d, Name: %s, MaxInputChannels: %d, Sample rate: %f\n", 220 | i, 221 | device.Name, 222 | device.MaxInputChannels, 223 | device.DefaultSampleRate, 224 | ) 225 | } 226 | } 227 | 228 | func selectInputDevice(args []string) (*portaudio.DeviceInfo, error) { 229 | deviceID, err := strconv.Atoi(args[0]) 230 | if err != nil { 231 | return nil, fmt.Errorf("parce int %w", err) 232 | } 233 | 234 | devices, err := portaudio.Devices() 235 | if err != nil { 236 | return nil, fmt.Errorf("select input device %w", err) 237 | } 238 | 239 | selectedDevice, err := portaudio.DefaultInputDevice() 240 | if err != nil { 241 | return nil, fmt.Errorf("find default device %w", err) 242 | } 243 | 244 | // Set default device to device with 
// calculateRMS16 returns the root mean square of the int16 samples in buffer.
// The result is on the same scale as the samples (0 for silence, up to 32768
// for a full-scale signal) and is used as a rough loudness measure.
//
// An empty or nil buffer yields 0. Without that guard the mean would be 0/0,
// i.e. NaN, which compares false against every threshold.
func calculateRMS16(buffer []int16) float64 {
	if len(buffer) == 0 {
		return 0
	}

	var sumSquares float64
	for _, sample := range buffer {
		// Square in float64: an int16 product would overflow.
		val := float64(sample)
		sumSquares += val * val
	}

	return math.Sqrt(sumSquares / float64(len(buffer)))
}
-------------------------------------------------------------------------------- /files/silero_vad.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xbozon/go-whisper-cpp-server-example/5f0a28d201ab11ca31d4a5fd29d4c0ea15b0709d/files/silero_vad.onnx -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/Xbozon/stt-translator 2 | 3 | go 1.21.4 4 | 5 | toolchain go1.21.9 6 | 7 | require ( 8 | github.com/go-audio/audio v1.0.0 9 | github.com/go-audio/wav v1.1.0 10 | github.com/gordonklaus/portaudio v0.0.0-20230709114228-aafa478834f5 11 | github.com/mjibson/go-dsp v0.0.0-20180508042940-11479a337f12 12 | github.com/orcaman/writerseeker v0.0.0-20200621085525-1d3f536ff85e 13 | github.com/streamer45/silero-vad-go v0.1.3 14 | ) 15 | 16 | require github.com/go-audio/riff v1.0.0 // indirect 17 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 2 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 3 | github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4= 4 | github.com/go-audio/audio v1.0.0/go.mod h1:6uAu0+H2lHkwdGsAY+j2wHPNPpPoeg5AaEFh9FlA+Zs= 5 | github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA= 6 | github.com/go-audio/riff v1.0.0/go.mod h1:l3cQwc85y79NQFCRB7TiPoNiaijp6q8Z0Uv38rVG498= 7 | github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g= 8 | github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE= 9 | github.com/gordonklaus/portaudio v0.0.0-20230709114228-aafa478834f5 h1:5AlozfqaVjGYGhms2OsdUyfdJME76E6rx5MdGpjzZpc= 
// ResampleInt16 converts input from inputRate to outputRate (both in Hz)
// using linear interpolation and returns the result in a newly allocated
// slice; input is never modified.
//
// The output holds int(len(input) * outputRate / inputRate) samples. Its last
// sample is always the last input sample, so the end of the signal is kept
// exactly.
//
// nil is returned when there is nothing to produce: input is empty, either
// rate is not positive, or input is so short that the output length rounds
// down to zero. Without these guards the function would index position -1.
func ResampleInt16(input []int16, inputRate, outputRate int) []int16 {
	if len(input) == 0 || inputRate <= 0 || outputRate <= 0 {
		return nil
	}

	// Number of input samples consumed per output sample.
	ratio := float64(inputRate) / float64(outputRate)

	outputLength := int(float64(len(input)) / ratio)
	if outputLength <= 0 {
		return nil
	}

	output := make([]int16, outputLength)

	for i := 0; i < outputLength-1; i++ {
		// Position of this output sample on the input time axis.
		pos := float64(i) * ratio

		// Neighbouring input samples; the upper one is clamped so that
		// upsampling never reads past the end of input.
		indexBefore := int(pos)
		indexAfter := indexBefore + 1
		if indexAfter >= len(input) {
			indexAfter = len(input) - 1
		}

		// Weight of the upper neighbour.
		frac := pos - float64(indexBefore)

		output[i] = int16((1-frac)*float64(input[indexBefore]) + frac*float64(input[indexAfter]))
	}

	// The final sample is copied rather than interpolated.
	output[outputLength-1] = input[len(input)-1]

	return output
}
// bytesToInt16sLE reinterprets bytes as a sequence of little-endian signed
// 16-bit samples: each pair (low byte, high byte) becomes one int16.
//
// It panics if the slice holds an odd number of bytes, because the trailing
// byte could not be paired.
func bytesToInt16sLE(bytes []byte) []int16 {
	if len(bytes)&1 == 1 {
		panic("bytesToInt16sLE: input bytes slice has odd length, must be even")
	}

	samples := make([]int16, 0, len(bytes)/2)
	for off := 0; off < len(bytes); off += 2 {
		lo, hi := uint16(bytes[off]), uint16(bytes[off+1])
		// Combine as unsigned, then reinterpret the 16 bits as signed.
		samples = append(samples, int16(hi<<8|lo))
	}

	return samples
}
// VAD holds the working buffers for spectral-flux based voice activity
// detection (noticing that someone has started or stopped talking).
type VAD struct {
	samples      []complex128 // current frame, converted to complex values as FFT input
	fft          []complex128 // result of the most recent FFT; nil until Flux has run
	spectrum     []float64    // magnitude of each frequency bin for the current frame
	lastSpectrum []float64    // reference spectrum the current one is compared against
}

// NewVAD returns a VAD for frames of width samples. The two spectrum buffers
// get width/2+1 bins each; the FFT result buffer is left nil.
func NewVAD(width int) *VAD {
	bins := width/2 + 1

	v := new(VAD)
	v.samples = make([]complex128, width)
	v.spectrum = make([]float64, bins)
	v.lastSpectrum = make([]float64, bins)

	return v
}
28 | func (v *VAD) Flux(samples []int16) float64 { 29 | for i, s := range samples { 30 | v.samples[i] = complex(float64(s), 0) 31 | } 32 | 33 | v.fft = fft.FFT(v.samples) 34 | copy(v.spectrum, v.lastSpectrum) 35 | 36 | for i, _ := range v.spectrum { 37 | c := v.fft[i] 38 | v.spectrum[i] = math.Sqrt(real(c)*real(c) + imag(c)*imag(c)) 39 | } 40 | 41 | var flux float64 42 | 43 | for i, s := range v.spectrum { 44 | flux += s - v.lastSpectrum[i] 45 | } 46 | 47 | return flux 48 | } 49 | 50 | func (v *VAD) FFT() []complex128 { 51 | return v.fft 52 | } 53 | -------------------------------------------------------------------------------- /whisper/server_api.go: -------------------------------------------------------------------------------- 1 | package whisper 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "encoding/json" 7 | "fmt" 8 | "io" 9 | "mime/multipart" 10 | "net/http" 11 | "time" 12 | ) 13 | 14 | type Config struct { 15 | Temperature float32 16 | TemperatureInc float32 17 | Timeout time.Duration 18 | } 19 | 20 | type Response struct { 21 | Text string `json:"text"` 22 | } 23 | 24 | type ServerApi struct { 25 | url string 26 | config Config 27 | } 28 | 29 | func NewServerApi(url string, config Config) *ServerApi { 30 | return &ServerApi{url: url, config: config} 31 | } 32 | 33 | func (s *ServerApi) SendMultiPartForm(ctx context.Context, wavData []byte) (Response, error) { 34 | // Create a buffer to hold the multipart form data 35 | var b bytes.Buffer 36 | multipartWriter := multipart.NewWriter(&b) 37 | 38 | // Create a form file part 39 | part, err := multipartWriter.CreateFormFile("file", "example.wav") 40 | if err != nil { 41 | return Response{}, fmt.Errorf("creating multipart file form: %w", err) 42 | } 43 | 44 | // Write WAV data to the form file part 45 | _, err = part.Write(wavData) 46 | if err != nil { 47 | return Response{}, fmt.Errorf("write data to multipart writer: %w", err) 48 | } 49 | 50 | // Add a form field for the response format 51 | err = 
s.writeConfig(multipartWriter) 52 | if err != nil { 53 | return Response{}, fmt.Errorf("write whisper config: %w", err) 54 | } 55 | 56 | // Close the multipart writer to finalize the boundary 57 | err = multipartWriter.Close() 58 | if err != nil { 59 | return Response{}, fmt.Errorf("multipart writer close: %w", err) 60 | } 61 | 62 | // Create the HTTP send 63 | request, err := http.NewRequestWithContext(ctx, "POST", s.url, &b) 64 | if err != nil { 65 | return Response{}, fmt.Errorf("create request with context: %w", err) 66 | } 67 | request.Header.Set("Content-Type", multipartWriter.FormDataContentType()) 68 | 69 | return s.send(request) 70 | } 71 | 72 | func (s *ServerApi) send(request *http.Request) (Response, error) { 73 | // Perform the request 74 | client := &http.Client{Timeout: s.config.Timeout} 75 | 76 | response, err := client.Do(request) 77 | if err != nil { 78 | return Response{}, err // Handle the error appropriately 79 | } 80 | defer response.Body.Close() 81 | 82 | // Check response status 83 | if response.StatusCode != http.StatusOK { 84 | return Response{}, fmt.Errorf("server responded with status code: %d", response.StatusCode) 85 | } 86 | 87 | // read response body 88 | body, err := io.ReadAll(response.Body) 89 | if err != nil { 90 | return Response{}, fmt.Errorf("%w", err) 91 | } 92 | 93 | var result Response 94 | err = json.Unmarshal(body, &result) 95 | if err != nil { 96 | return Response{}, fmt.Errorf("body unmarshal: %w", err) 97 | } 98 | 99 | return result, nil 100 | } 101 | 102 | func (s *ServerApi) writeConfig(mw *multipart.Writer) error { 103 | // Add a form field for the response format 104 | err := mw.WriteField("response_format", "json") 105 | if err != nil { 106 | return fmt.Errorf("add response_format: %w", err) 107 | } 108 | 109 | err = mw.WriteField("temperature", fmt.Sprintf("%.2f", s.config.Temperature)) 110 | if err != nil { 111 | return fmt.Errorf("add temperature: %w", err) 112 | } 113 | 114 | err = 
mw.WriteField("temperature_inc", fmt.Sprintf("%.2f", s.config.TemperatureInc)) 115 | if err != nil { 116 | return fmt.Errorf("add temperature_inc: %w", err) 117 | } 118 | 119 | return nil 120 | } 121 | --------------------------------------------------------------------------------