├── examples ├── asrexample │ ├── test.pcm │ └── asrexample.go ├── flashexample │ ├── test.pcm │ └── flashexample.go ├── soeexample │ ├── english.wav │ └── main.go ├── virtual_number_example │ ├── test.pcm │ └── virtual_number_example.go └── ttsexample │ ├── ttsexample.go │ └── ttswsexample.go ├── tts ├── utils.go ├── speechsynthesizer.go └── speechwssynthesizer.go ├── xcheck-input.json ├── go.mod ├── CHANGELOG.md ├── README.md ├── common └── credential.go ├── asr ├── flashrecognizer.go ├── virtual_number_recogizer.go └── speechrecognizer.go ├── LICENSE └── soe └── speaking_assessment.go /examples/asrexample/test.pcm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TencentCloud/tencentcloud-speech-sdk-go/HEAD/examples/asrexample/test.pcm -------------------------------------------------------------------------------- /examples/flashexample/test.pcm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TencentCloud/tencentcloud-speech-sdk-go/HEAD/examples/flashexample/test.pcm -------------------------------------------------------------------------------- /examples/soeexample/english.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TencentCloud/tencentcloud-speech-sdk-go/HEAD/examples/soeexample/english.wav -------------------------------------------------------------------------------- /examples/virtual_number_example/test.pcm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TencentCloud/tencentcloud-speech-sdk-go/HEAD/examples/virtual_number_example/test.pcm -------------------------------------------------------------------------------- /tts/utils.go: -------------------------------------------------------------------------------- 1 | package tts 2 | 3 | import "os" 4 | 5 | func WriteFile(filename 
string, content []byte) error { 6 | fout, err := os.Create(filename) 7 | defer fout.Close() 8 | if err != nil { 9 | return err 10 | } 11 | 12 | _, err = fout.Write(content) 13 | if err != nil { 14 | return err 15 | } 16 | return nil 17 | } 18 | -------------------------------------------------------------------------------- /xcheck-input.json: -------------------------------------------------------------------------------- 1 | {"server": "http://xcheck.woa.com", "token": "1bbabf40-88ac-421b-9241-92498025832d", "proj-name": "", "proj-dir": "/data/__qci/root-workspaces/__qci-pipeline-10665501-1/tencentcloud-speech-sdk-go", "proj-url": "", "proj-lang": "", "result-type": "json", "output": "output.json", "timeout": 300, "excl-dirs": "", "excl-dir-ptns": "", "diff-files": []} -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/tencentcloud/tencentcloud-speech-sdk-go 2 | 3 | go 1.12 4 | 5 | require ( 6 | github.com/BurntSushi/toml v0.3.1 // indirect 7 | github.com/google/uuid v1.1.2 8 | github.com/gorilla/websocket v1.4.2 9 | golang.org/x/net v0.0.0-20200904194848-62affa334b73 10 | google.golang.org/genproto v0.0.0-20200925023002-c2d885f95484 // indirect 11 | ) 12 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## [Unreleased] 9 | 10 | ## [1.0.0] - 2020-10-16 11 | 12 | ### Added 13 | 14 | - Added asr and tts sdk. 15 | - Added sdk examples. 
16 | 17 | [1.0.0]: https://github.com/TencentCloud/tencentcloud-speech-sdk-go/releases/tag/1.0.0 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 简介 2 | 3 | 欢迎使用腾讯云语音SDK,腾讯云语音SDK为开发者提供了访问腾讯云语音识别、语音合成等语音服务的配套开发工具,简化腾讯云语音服务的接入流程。 4 | 5 | 本项目是腾讯云语音SDK的Go语言版本。 6 | 7 | # 依赖环境 8 | 9 | 1. Go 1.13 版本及以上,推荐使用go mod方式引用安装。 10 | 2. 使用相关产品前需要在腾讯云控制台已开通相关语音产品。 11 | 3. 在腾讯云控制台[账号信息](https://console.cloud.tencent.com/developer)页面查看账号APPID,[访问管理](https://console.cloud.tencent.com/cam/capi)页面获取 SecretID 和 SecretKey 。 12 | 13 | # 获取安装 14 | 15 | 推荐使用语言自带的工具安装 SDK : 16 | 17 | go get github.com/tencentcloud/tencentcloud-speech-sdk-go@latest 18 | 19 | # 示例 20 | 21 | 参见 [examples](https://github.com/TencentCloud/tencentcloud-speech-sdk-go/tree/master/examples) 目录,该目录下包含各语音服务的示例代码。 22 | -------------------------------------------------------------------------------- /common/credential.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | type Credential struct { 4 | SecretId string 5 | SecretKey string 6 | Token string 7 | } 8 | 9 | func NewCredential(secretId, secretKey string) *Credential { 10 | return &Credential{ 11 | SecretId: secretId, 12 | SecretKey: secretKey, 13 | } 14 | } 15 | 16 | func NewTokenCredential(secretId, secretKey, token string) *Credential { 17 | return &Credential{ 18 | SecretId: secretId, 19 | SecretKey: secretKey, 20 | Token: token, 21 | } 22 | } 23 | 24 | func (c *Credential) GetCredentialParams() map[string]string { 25 | p := map[string]string{ 26 | "SecretId": c.SecretId, 27 | } 28 | if c.Token != "" { 29 | p["Token"] = c.Token 30 | } 31 | return p 32 | } 33 | -------------------------------------------------------------------------------- /examples/flashexample/flashexample.go: -------------------------------------------------------------------------------- 1 | package 
main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "io/ioutil" 7 | "os" 8 | "sync" 9 | "time" 10 | 11 | "github.com/tencentcloud/tencentcloud-speech-sdk-go/asr" 12 | "github.com/tencentcloud/tencentcloud-speech-sdk-go/common" 13 | ) 14 | 15 | var ( 16 | // AppID AppID 17 | AppID = "AppID" 18 | // SecretID SecretID 19 | SecretID = "" 20 | // SecretKey SecretKey 21 | SecretKey = "" 22 | // EngineType EngineType 23 | EngineType = "16k_zh" 24 | ) 25 | 26 | func main() { 27 | var c = flag.Int("c", 1, "concurrency") 28 | var l = flag.Bool("l", false, "loop or not") 29 | var f = flag.String("f", "test.pcm", "audio file") 30 | flag.Parse() 31 | 32 | var wg sync.WaitGroup 33 | for i := 0; i < *c; i++ { 34 | fmt.Println("Main: Starting worker", i) 35 | wg.Add(1) 36 | if *l { 37 | go processLoop(i, &wg, *f) 38 | } else { 39 | go processOnce(i, &wg, *f) 40 | } 41 | } 42 | 43 | fmt.Println("Main: Waiting for workers to finish") 44 | wg.Wait() 45 | fmt.Println("Main: Completed") 46 | 47 | } 48 | 49 | func processLoop(id int, wg *sync.WaitGroup, file string) { 50 | defer wg.Done() 51 | for { 52 | process(id, file) 53 | } 54 | } 55 | 56 | func processOnce(id int, wg *sync.WaitGroup, file string) { 57 | defer wg.Done() 58 | process(id, file) 59 | } 60 | 61 | func process(id int, file string) { 62 | audio, err := os.Open(file) 63 | defer audio.Close() 64 | if err != nil { 65 | fmt.Printf("open file error: %v\n", err) 66 | return 67 | } 68 | credential := common.NewCredential(SecretID, SecretKey) 69 | recognizer := asr.NewFlashRecognizer(AppID, credential) 70 | data, err := ioutil.ReadAll(audio) 71 | if err != nil { 72 | fmt.Printf("%s|failed read data, error: %v\n", time.Now().Format("2006-01-02 15:04:05"), err) 73 | return 74 | } 75 | 76 | req := new(asr.FlashRecognitionRequest) 77 | req.EngineType = EngineType 78 | req.VoiceFormat = "pcm" 79 | req.SpeakerDiarization = 0 80 | req.FilterDirty = 0 81 | req.FilterModal = 0 82 | req.FilterPunc = 0 83 | req.ConvertNumMode = 1 84 | 
req.FirstChannelOnly = 1 85 | req.WordInfo = 0 86 | 87 | resp, err := recognizer.Recognize(req, data) 88 | if err != nil { 89 | fmt.Printf("%s|failed do recognize, error: %v\n", time.Now().Format("2006-01-02 15:04:05"), err) 90 | return 91 | } 92 | fmt.Printf("request_id: %s\n", resp.RequestId) 93 | 94 | for _, channelResult := range resp.FlashResult { 95 | fmt.Printf("channel_id: %d, result: %s\n", channelResult.ChannelId, channelResult.Text) 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /examples/ttsexample/ttsexample.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "sync" 7 | "time" 8 | 9 | "github.com/tencentcloud/tencentcloud-speech-sdk-go/common" 10 | "github.com/tencentcloud/tencentcloud-speech-sdk-go/tts" 11 | ) 12 | 13 | var ( 14 | // AppID AppID 15 | AppID = 0 16 | // SecretID SecretID 17 | SecretID = "SecretID" 18 | // SecretKey SecretKey 19 | SecretKey = "SecretKey" 20 | ) 21 | 22 | // MySpeechSynthesisListener implementation of SpeechSynthesisListener 23 | type MySpeechSynthesisListener struct { 24 | ID int 25 | } 26 | 27 | // OnMessage implementation of SpeechSynthesisListener 28 | func (listener *MySpeechSynthesisListener) OnMessage(response *tts.SpeechSynthesisResponse) { 29 | fmt.Printf("%s|%d|OnMessage, size: %d\n", time.Now().Format("2006-01-02 15:04:05"), listener.ID, len(response.Data)) 30 | } 31 | 32 | // OnComplete implementation of SpeechSynthesisListener 33 | func (listener *MySpeechSynthesisListener) OnComplete(response *tts.SpeechSynthesisResponse) { 34 | fmt.Printf("%s|%d|OnComplete: %v\n", time.Now().Format("2006-01-02 15:04:05"), listener.ID, response) 35 | } 36 | 37 | // OnCancel implementation of SpeechSynthesisListener 38 | func (listener *MySpeechSynthesisListener) OnCancel(response *tts.SpeechSynthesisResponse) { 39 | fmt.Printf("%s|%d|OnCancel: %v\n", time.Now().Format("2006-01-02 
15:04:05"), listener.ID, response) 40 | } 41 | 42 | // OnFail implementation of SpeechSynthesisListener 43 | func (listener *MySpeechSynthesisListener) OnFail(response *tts.SpeechSynthesisResponse, err error) { 44 | fmt.Printf("%s|%d|OnFail: %v, %v\n", time.Now().Format("2006-01-02 15:04:05"), listener.ID, response, err) 45 | } 46 | 47 | var proxyURL string 48 | 49 | func main() { 50 | var c = flag.Int("c", 1, "concurrency") 51 | var p = flag.String("p", "", "proxy url") 52 | flag.Parse() 53 | 54 | proxyURL = *p 55 | var wg sync.WaitGroup 56 | for i := 0; i < *c; i++ { 57 | fmt.Println("Main: Starting worker", i) 58 | wg.Add(1) 59 | go process(i, &wg) 60 | } 61 | 62 | fmt.Println("Main: Waiting for workers to finish") 63 | wg.Wait() 64 | fmt.Println("Main: Completed") 65 | 66 | } 67 | 68 | func process(id int, wg *sync.WaitGroup) { 69 | defer wg.Done() 70 | 71 | listener := &MySpeechSynthesisListener{ 72 | ID: id, 73 | } 74 | credential := common.NewCredential(SecretID, SecretKey) 75 | synthesizer := tts.NewSpeechSynthesizer(int64(AppID), credential, listener) 76 | synthesizer.VoiceType = 101000 77 | text := "语音合成可自定义音量和语速,让发音更自然、更专业、更符合场景需求。满足将文本转化成拟人化语音的需求,打通人机交互闭环。支持多种音色选择,语音合成可广泛应用于语音导航、有声读物、机器人、语音助手、自动新闻播报等场景,提升人机交互体验,提高语音类应用构建效率。" 78 | synthesizer.ProxyURL = proxyURL 79 | synthesizer.Synthesis(text) 80 | synthesizer.Wait() 81 | } 82 | -------------------------------------------------------------------------------- /examples/ttsexample/ttswsexample.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "github.com/google/uuid" 7 | "github.com/tencentcloud/tencentcloud-speech-sdk-go/common" 8 | "github.com/tencentcloud/tencentcloud-speech-sdk-go/tts" 9 | "path" 10 | "strconv" 11 | "sync" 12 | "time" 13 | ) 14 | 15 | type MySpeechWsSynthesisListener struct { 16 | SessionId string 17 | Data []byte 18 | Index int 19 | } 20 | 21 | func (l *MySpeechWsSynthesisListener) 
OnSynthesisStart(r *tts.SpeechWsSynthesisResponse) { 22 | fmt.Printf("%s|OnSynthesisStart,sessionId:%s response: %s\n", time.Now().Format("2006-01-02 15:04:05"), l.SessionId, r.ToString()) 23 | } 24 | 25 | func (l *MySpeechWsSynthesisListener) OnSynthesisEnd(r *tts.SpeechWsSynthesisResponse) { 26 | fileName := fmt.Sprintf("test.mp3") 27 | tts.WriteFile(path.Join("./", fileName), l.Data) 28 | fmt.Printf("%s|OnSynthesisEnd,sessionId:%s response: %s\n", time.Now().Format("2006-01-02 15:04:05"), l.SessionId, r.ToString()) 29 | } 30 | func (l *MySpeechWsSynthesisListener) OnAudioResult(data []byte) { 31 | fmt.Printf("%s|OnAudioResult,sessionId:%s index:%d\n", time.Now().Format("2006-01-02 15:04:05"), l.SessionId, l.Index) 32 | l.Index = l.Index + 1 33 | l.Data = append(l.Data, data...) 34 | } 35 | func (l *MySpeechWsSynthesisListener) OnTextResult(r *tts.SpeechWsSynthesisResponse) { 36 | fmt.Printf("%s|OnTextResult,sessionId:%s response: %s\n", time.Now().Format("2006-01-02 15:04:05"), l.SessionId, r.ToString()) 37 | } 38 | func (l *MySpeechWsSynthesisListener) OnSynthesisFail(r *tts.SpeechWsSynthesisResponse, err error) { 39 | fmt.Printf("%s|OnSynthesisFail,sessionId:%s response: %s err:%s\n", time.Now().Format("2006-01-02 15:04:05"), l.SessionId, r.ToString(), err.Error()) 40 | } 41 | 42 | func main() { 43 | var c = flag.Int("c", 1, "concurrency") 44 | flag.Parse() 45 | var wg sync.WaitGroup 46 | for i := 0; i < *c; i++ { 47 | fmt.Println("Main: Starting worker", i) 48 | wg.Add(1) 49 | go processWs(i, &wg) 50 | } 51 | 52 | fmt.Println("Main: Waiting for workers to finish") 53 | wg.Wait() 54 | fmt.Println("Main: Completed") 55 | 56 | } 57 | 58 | func processWs(id int, wg *sync.WaitGroup) { 59 | defer wg.Done() 60 | //在腾讯云控制台账号信息页面查看账号APPID,访问管理页面获取 SecretID 和 SecretKey 。 61 | secretId := "替换为自己的secretId" 62 | secretKey := "替换为自己的secretKey" 63 | AppId := 0 //替换为自己的appid 64 | 65 | sessionId := fmt.Sprintf("%s_%s", strconv.Itoa(id), uuid.New().String()) 66 | listener := 
&MySpeechWsSynthesisListener{Data: make([]byte, 0), SessionId: sessionId} 67 | credential := common.NewCredential(secretId, secretKey) 68 | synthesizer := tts.NewSpeechWsSynthesizer(int64(AppId), credential, listener) 69 | synthesizer.SessionId = sessionId 70 | synthesizer.VoiceType = 1001 71 | synthesizer.Codec = "mp3" 72 | synthesizer.Text = "\n现状是各地的经济水平是参差不齐的。需要缩小较弱地域和较强地域的差距。要做好这个差事可不容易啊。\n\n" 73 | synthesizer.EnableSubtitle = true 74 | //synthesizer.EmotionCategory = "happy" 75 | //synthesizer.EmotionIntensity = 200 76 | //synthesizer.Debug = true 77 | //synthesizer.DebugFunc = func(message string) { fmt.Println(message) } 78 | err := synthesizer.Synthesis() 79 | if err != nil { 80 | fmt.Println(err.Error()) 81 | return 82 | } 83 | synthesizer.Wait() 84 | } 85 | -------------------------------------------------------------------------------- /examples/virtual_number_example/virtual_number_example.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "github.com/tencentcloud/tencentcloud-speech-sdk-go/asr" 7 | "github.com/tencentcloud/tencentcloud-speech-sdk-go/common" 8 | "os" 9 | "sync" 10 | "time" 11 | ) 12 | 13 | var ( 14 | //TODO 补充信息 15 | AppID = "AppID" 16 | // SecretID SecretID 17 | SecretID = "" 18 | // SecretKey SecretKey 19 | SecretKey = "" 20 | // SliceSize SliceSize 21 | SliceSize = 3200 22 | ) 23 | 24 | // MyVNRecognitionListener implementation of SpeechRecognitionListener 25 | type MyVNRecognitionListener struct { 26 | ID int 27 | } 28 | 29 | // OnVNRecognitionStart implementation of SpeechRecognitionListener 30 | func (listener *MyVNRecognitionListener) OnVNRecognitionStart(response *asr.VNRecognitionResponse) { 31 | fmt.Printf("%s|%s|OnRecognitionStart\n", time.Now().Format("2006-01-02 15:04:05"), response.VoiceID) 32 | } 33 | 34 | // OnVNRecognitionComplete implementation of SpeechRecognitionListener 35 | func (listener *MyVNRecognitionListener) 
OnVNRecognitionComplete(response *asr.VNRecognitionResponse) { 36 | fmt.Printf("%s|%s|OnRecognitionComplete|result:%d\n", time.Now().Format("2006-01-02 15:04:05"), response.VoiceID, response.Result) 37 | } 38 | 39 | // OnVNFail implementation of SpeechRecognitionListener 40 | func (listener *MyVNRecognitionListener) OnVNFail(response *asr.VNRecognitionResponse, err error) { 41 | fmt.Printf("%s|%s|OnFail: %v\n", time.Now().Format("2006-01-02 15:04:05"), response.VoiceID, err) 42 | } 43 | 44 | var proxyURL string 45 | 46 | func main() { 47 | var c = flag.Int("c", 1, "concurrency") 48 | var l = flag.Bool("l", false, "loop or not") 49 | var f = flag.String("f", "test.pcm", "audio file") 50 | var p = flag.String("p", "", "proxy url") 51 | flag.Parse() 52 | 53 | proxyURL = *p 54 | var wg sync.WaitGroup 55 | for i := 0; i < *c; i++ { 56 | fmt.Println("Main: Starting worker", i) 57 | wg.Add(1) 58 | if *l { 59 | go processLoop(i, &wg, *f) 60 | } else { 61 | go processOnce(i, &wg, *f) 62 | } 63 | } 64 | 65 | fmt.Println("Main: Waiting for workers to finish") 66 | wg.Wait() 67 | fmt.Println("Main: Completed") 68 | 69 | } 70 | 71 | func processLoop(id int, wg *sync.WaitGroup, file string) { 72 | defer wg.Done() 73 | for { 74 | err := process(id, file) 75 | if err != nil { 76 | return 77 | } 78 | } 79 | } 80 | 81 | func processOnce(id int, wg *sync.WaitGroup, file string) { 82 | defer wg.Done() 83 | process(id, file) 84 | } 85 | 86 | func process(id int, file string) error { 87 | audio, err := os.Open(file) 88 | defer audio.Close() 89 | if err != nil { 90 | fmt.Printf("open file error: %v\n", err) 91 | return err 92 | } 93 | 94 | listener := &MyVNRecognitionListener{ 95 | ID: id, 96 | } 97 | credential := common.NewCredential(SecretID, SecretKey) 98 | recognizer := asr.NewVNRecognizer(AppID, credential, listener) 99 | recognizer.ProxyURL = proxyURL 100 | recognizer.VoiceFormat = asr.AudioFormatPCM 101 | recognizer.WaitTime = 30000 102 | //握手阶段 103 | err = recognizer.Start() 104 
| if err != nil { 105 | fmt.Printf("%s|recognizer start failed, error: %v\n", time.Now().Format("2006-01-02 15:04:05"), err) 106 | return err 107 | } 108 | for { 109 | data := make([]byte, SliceSize) 110 | n, err := audio.Read(data) 111 | if err != nil { 112 | if err.Error() == "EOF" { 113 | break 114 | } 115 | fmt.Printf("read file error: %v\n", err) 116 | break 117 | } 118 | if n <= 0 { 119 | break 120 | } 121 | err, end := recognizer.Write(data) 122 | if err != nil || end { 123 | break 124 | } 125 | //模拟真实场景,200ms产生200ms数据 126 | time.Sleep(200 * time.Millisecond) 127 | } 128 | recognizer.Stop() 129 | return nil 130 | } 131 | -------------------------------------------------------------------------------- /examples/asrexample/asrexample.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "os" 7 | "sync" 8 | "time" 9 | 10 | "github.com/tencentcloud/tencentcloud-speech-sdk-go/asr" 11 | "github.com/tencentcloud/tencentcloud-speech-sdk-go/common" 12 | ) 13 | 14 | var ( 15 | // AppID AppID 16 | AppID = "" 17 | // SecretID SecretID 18 | SecretID = "" 19 | // SecretKey SecretKey 20 | SecretKey = "" 21 | // EngineModelType EngineModelType 22 | EngineModelType = "16k_zh" 23 | // SliceSize SliceSize 24 | SliceSize = 6400 25 | ) 26 | 27 | // MySpeechRecognitionListener implementation of SpeechRecognitionListener 28 | type MySpeechRecognitionListener struct { 29 | ID int 30 | } 31 | 32 | // OnRecognitionStart implementation of SpeechRecognitionListener 33 | func (listener *MySpeechRecognitionListener) OnRecognitionStart(response *asr.SpeechRecognitionResponse) { 34 | fmt.Printf("%s|%s|OnRecognitionStart\n", time.Now().Format("2006-01-02 15:04:05"), response.VoiceID) 35 | } 36 | 37 | // OnSentenceBegin implementation of SpeechRecognitionListener 38 | func (listener *MySpeechRecognitionListener) OnSentenceBegin(response *asr.SpeechRecognitionResponse) { 39 | 
fmt.Printf("%s|%s|OnSentenceBegin: %v\n", time.Now().Format("2006-01-02 15:04:05"), response.VoiceID, response) 40 | } 41 | 42 | // OnRecognitionResultChange implementation of SpeechRecognitionListener 43 | func (listener *MySpeechRecognitionListener) OnRecognitionResultChange(response *asr.SpeechRecognitionResponse) { 44 | fmt.Printf("%s|%s|OnRecognitionResultChange: %v\n", time.Now().Format("2006-01-02 15:04:05"), response.VoiceID, response) 45 | } 46 | 47 | // OnSentenceEnd implementation of SpeechRecognitionListener 48 | func (listener *MySpeechRecognitionListener) OnSentenceEnd(response *asr.SpeechRecognitionResponse) { 49 | fmt.Printf("%s|%s|OnSentenceEnd: %v\n", time.Now().Format("2006-01-02 15:04:05"), response.VoiceID, response) 50 | } 51 | 52 | // OnRecognitionComplete implementation of SpeechRecognitionListener 53 | func (listener *MySpeechRecognitionListener) OnRecognitionComplete(response *asr.SpeechRecognitionResponse) { 54 | fmt.Printf("%s|%s|OnRecognitionComplete\n", time.Now().Format("2006-01-02 15:04:05"), response.VoiceID) 55 | } 56 | 57 | // OnFail implementation of SpeechRecognitionListener 58 | func (listener *MySpeechRecognitionListener) OnFail(response *asr.SpeechRecognitionResponse, err error) { 59 | fmt.Printf("%s|%s|OnFail: %v\n", time.Now().Format("2006-01-02 15:04:05"), response.VoiceID, err) 60 | } 61 | 62 | var proxyURL string 63 | 64 | func main() { 65 | var c = flag.Int("c", 1, "concurrency") 66 | var l = flag.Bool("l", false, "loop or not") 67 | var f = flag.String("f", "test.pcm", "audio file") 68 | var p = flag.String("p", "", "proxy url") 69 | flag.Parse() 70 | 71 | proxyURL = *p 72 | var wg sync.WaitGroup 73 | for i := 0; i < *c; i++ { 74 | fmt.Println("Main: Starting worker", i) 75 | wg.Add(1) 76 | if *l { 77 | go processLoop(i, &wg, *f) 78 | } else { 79 | go processOnce(i, &wg, *f) 80 | } 81 | } 82 | 83 | fmt.Println("Main: Waiting for workers to finish") 84 | wg.Wait() 85 | fmt.Println("Main: Completed") 86 | 87 | } 88 | 89 
| func processLoop(id int, wg *sync.WaitGroup, file string) { 90 | defer wg.Done() 91 | for { 92 | err := process(id, file) 93 | if err != nil { 94 | return 95 | } 96 | } 97 | } 98 | 99 | func processOnce(id int, wg *sync.WaitGroup, file string) { 100 | defer wg.Done() 101 | process(id, file) 102 | } 103 | 104 | func process(id int, file string) error { 105 | audio, err := os.Open(file) 106 | defer audio.Close() 107 | if err != nil { 108 | fmt.Printf("open file error: %v\n", err) 109 | return err 110 | } 111 | 112 | listener := &MySpeechRecognitionListener{ 113 | ID: id, 114 | } 115 | credential := common.NewCredential(SecretID, SecretKey) 116 | recognizer := asr.NewSpeechRecognizer(AppID, credential, EngineModelType, listener) 117 | recognizer.ProxyURL = proxyURL 118 | recognizer.VoiceFormat = asr.AudioFormatPCM 119 | err = recognizer.Start() 120 | if err != nil { 121 | fmt.Printf("%s|recognizer start failed, error: %v\n", time.Now().Format("2006-01-02 15:04:05"), err) 122 | return err 123 | } 124 | for { 125 | data := make([]byte, SliceSize) 126 | n, err := audio.Read(data) 127 | if err != nil { 128 | if err.Error() == "EOF" { 129 | break 130 | } 131 | fmt.Printf("read file error: %v\n", err) 132 | break 133 | } 134 | if n <= 0 { 135 | break 136 | } 137 | err = recognizer.Write(data[:n]) 138 | if err != nil { 139 | break 140 | } 141 | //模拟真实场景,200ms产生200ms数据 142 | //注意:该行sleep代码用于模拟实时音频流1:1产生音频数据(每200ms产生200ms音频),实际音频流场景建议删除该行代码,或业务根据自己的需求情况自行调整 143 | time.Sleep(200 * time.Millisecond) 144 | } 145 | recognizer.Stop() 146 | return nil 147 | } 148 | -------------------------------------------------------------------------------- /examples/soeexample/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "os" 7 | "sync" 8 | "time" 9 | 10 | "github.com/tencentcloud/tencentcloud-speech-sdk-go/common" 11 | "github.com/tencentcloud/tencentcloud-speech-sdk-go/soe" 12 | ) 13 | 14 
| var ( 15 | //TODO 补充信息 16 | // AppID AppID 17 | AppID = "" 18 | //SecretID SecretID 19 | SecretID = "" 20 | //SecretKey SecretKey 21 | SecretKey = "" 22 | // Token 只有临时秘钥鉴权需要 23 | Token = "" 24 | 25 | // SliceSize SliceSize 26 | SliceSize = 1600 27 | ) 28 | 29 | // MySpeakingAssessmentListener implementation of SpeakingAssessmentListener 30 | type MySpeakingAssessmentListener struct { 31 | ID int 32 | } 33 | 34 | // OnRecognitionStart implementation of SpeakingAssessmentListener 35 | func (listener *MySpeakingAssessmentListener) OnRecognitionStart(response *soe.SpeakingAssessmentResponse) { 36 | fmt.Printf("%s|%s|OnRecognitionStart\n", time.Now().Format("2006-01-02 15:04:05"), response.VoiceID) 37 | } 38 | 39 | // OnIntermediateResults implementation of SpeakingAssessmentListener 40 | func (listener *MySpeakingAssessmentListener) OnIntermediateResults(response *soe.SpeakingAssessmentResponse) { 41 | fmt.Printf("%s|%s|OnIntermediateResults|result:%+v\n", time.Now().Format("2006-01-02 15:04:05"), response.VoiceID, response.Result) 42 | } 43 | 44 | // OnRecognitionComplete implementation of SpeakingAssessmentListener 45 | func (listener *MySpeakingAssessmentListener) OnRecognitionComplete(response *soe.SpeakingAssessmentResponse) { 46 | fmt.Printf("%s|%s|OnRecognitionComplete|result:%+v\n", time.Now().Format("2006-01-02 15:04:05"), response.VoiceID, response.Result) 47 | } 48 | 49 | // OnFail implementation of SpeakingAssessmentListener 50 | func (listener *MySpeakingAssessmentListener) OnFail(response *soe.SpeakingAssessmentResponse, err error) { 51 | fmt.Printf("%s|%s|OnFail: %v\n", time.Now().Format("2006-01-02 15:04:05"), response.VoiceID, err) 52 | } 53 | 54 | var proxyURL string 55 | var recFlag = flag.Bool("rec", false, "enable rec mode") 56 | 57 | func main() { 58 | var c = flag.Int("c", 1, "concurrency") 59 | var l = flag.Bool("l", false, "loop or not") 60 | var f = flag.String("f", "english.wav", "audio file") 61 | var p = flag.String("p", "", "proxy url") 
62 | flag.Parse() 63 | 64 | proxyURL = *p 65 | var wg sync.WaitGroup 66 | for i := 0; i < *c; i++ { 67 | fmt.Println("Main: Starting worker", i) 68 | wg.Add(1) 69 | if *l { 70 | go processLoop(i, &wg, *f) 71 | } else { 72 | go processOnce(i, &wg, *f) 73 | } 74 | } 75 | 76 | fmt.Println("Main: Waiting for workers to finish") 77 | wg.Wait() 78 | fmt.Println("Main: Completed") 79 | 80 | } 81 | 82 | func processLoop(id int, wg *sync.WaitGroup, file string) { 83 | defer wg.Done() 84 | for { 85 | err := process(id, file) 86 | if err != nil { 87 | return 88 | } 89 | } 90 | } 91 | 92 | func processOnce(id int, wg *sync.WaitGroup, file string) { 93 | defer wg.Done() 94 | process(id, file) 95 | } 96 | 97 | func process(id int, file string) error { 98 | audio, err := os.Open(file) 99 | if err != nil { 100 | fmt.Printf("open file error: %v\n", err) 101 | return err 102 | } 103 | defer audio.Close() 104 | 105 | listener := &MySpeakingAssessmentListener{ 106 | ID: id, 107 | } 108 | // 临时秘钥鉴权需要使用带token的方式 credential := common.NewTokenCredential(SecretID, SecretKey, Token) 109 | credential := common.NewCredential(SecretID, SecretKey) 110 | recognizer := soe.NewSpeechRecognizer(AppID, credential, listener) 111 | recognizer.ProxyURL = proxyURL 112 | recognizer.VoiceFormat = soe.AudioFormatWav 113 | recognizer.RefText = "beautiful" 114 | recognizer.ServerEngineType = "16k_en" 115 | recognizer.ScoreCoeff = 1.1 116 | recognizer.EvalMode = 0 117 | recognizer.Keyword = "" 118 | recognizer.SentenceInfoEnabled = 0 119 | recognizer.TextMode = 0 120 | if *recFlag { 121 | // 录音识别模式下可发送单个大长度分片(上限300s) 122 | // 单次连接只能发一个分片,得到识别结果后需要关闭此条websocket连接,再次识别需要重新建立连接 123 | // 录音识别模式适合已经存在完整录音文件数据需要一次性返回最终结果的场景 124 | // 更推荐使用流式识别模式,流式识别可以相对更快的得到识别结果,有更可靠的实时率保障 125 | recognizer.RecMode = 1 126 | } 127 | //握手阶段 128 | err = recognizer.Start() 129 | if err != nil { 130 | fmt.Printf("%s|recognizer start failed, error: %v\n", time.Now().Format("2006-01-02 15:04:05"), err) 131 | return err 132 | } 133 | seq 
:= 0 134 | if *recFlag { 135 | // 录音识别模式可以一次性发送全部数据 136 | fileDataAll, err := os.ReadFile(file) 137 | if err != nil { 138 | fmt.Printf("read file error: %v\n", err) 139 | return err 140 | } 141 | if err = recognizer.Write(fileDataAll); err != nil { 142 | fmt.Printf("write data error: %v\n", err) 143 | return err 144 | } 145 | } else { 146 | // 流式识别模式,需要分片发送音频数据 147 | for { 148 | data := make([]byte, SliceSize) 149 | n, err := audio.Read(data) 150 | if err != nil { 151 | if err.Error() == "EOF" { 152 | break 153 | } 154 | fmt.Printf("read file error: %v\n", err) 155 | break 156 | } 157 | if n <= 0 { 158 | break 159 | } 160 | err = recognizer.Write(data) 161 | if err != nil { 162 | break 163 | } 164 | //模拟真实场景,200ms产生200ms数据 165 | time.Sleep(200 * time.Millisecond) 166 | seq++ 167 | } 168 | } 169 | 170 | recognizer.Stop() 171 | return nil 172 | } 173 | -------------------------------------------------------------------------------- /asr/flashrecognizer.go: -------------------------------------------------------------------------------- 1 | package asr 2 | 3 | import ( 4 | "bytes" 5 | "crypto/hmac" 6 | "crypto/sha1" 7 | "encoding/base64" 8 | "encoding/json" 9 | "fmt" 10 | "io/ioutil" 11 | "net" 12 | "net/http" 13 | "net/url" 14 | "sort" 15 | "strconv" 16 | "sync" 17 | "time" 18 | 19 | "github.com/tencentcloud/tencentcloud-speech-sdk-go/common" 20 | ) 21 | 22 | // FlashRecognitionRequest FlashRecognitionRequest 23 | type FlashRecognitionRequest struct { 24 | EngineType string `json:"engine_type"` 25 | VoiceFormat string `json:"voice_format"` 26 | SpeakerDiarization uint32 `json:"speaker_diarization"` 27 | HotwordId string `json:"hotword_id"` 28 | HotwordList string `json:"hotword_list"` 29 | CustomizationId string `json:"customization_id"` 30 | FilterDirty int32 `json:"filter_dirty"` 31 | FilterModal int32 `json:"filter_modal"` 32 | FilterPunc int32 `json:"filter_punc"` 33 | ConvertNumMode int32 `json:"convert_num_mode"` 34 | WordInfo int32 `json:"word_info"` 35 | 
FirstChannelOnly int32 `json:"first_channel_only"` 36 | ReinforceHotword int32 `json:"reinforce_hotword"` 37 | SentenceMaxLength int32 `json:"sentence_max_length"` 38 | } 39 | 40 | // FlashRecognitionResponse FlashRecognitionResponse 41 | type FlashRecognitionResponse struct { 42 | RequestId string `json:"request_id"` 43 | Code int `json:"code"` 44 | Message string `json:"message"` 45 | AudioDuration int64 `json:"audio_duration"` 46 | FlashResult []*FlashRecognitionResult `json:"flash_result,omitempty"` 47 | } 48 | 49 | // FlashRecognitionResult FlashRecognitionResult 50 | type FlashRecognitionResult struct { 51 | Text string `json:"text"` 52 | ChannelId int32 `json:"channel_id"` 53 | SentenceList []*FlashRecognitionSentence `json:"sentence_list,omitempty"` 54 | } 55 | 56 | // FlashRecognitionSentence FlashRecognitionSentence 57 | type FlashRecognitionSentence struct { 58 | Text string `json:"text"` 59 | StartTime uint32 `json:"start_time"` 60 | EndTime uint32 `json:"end_time"` 61 | SpeakerId int32 `json:"speaker_id"` 62 | WordList []*FlashWordData `json:"word_list,omitempty"` 63 | } 64 | 65 | // FlashWordData FlashWordData 66 | type FlashWordData struct { 67 | Word string `json:"word"` 68 | StartTime uint32 `json:"start_time"` 69 | EndTime uint32 `json:"end_time"` 70 | StableFlag uint32 `json:"stable_flag"` 71 | } 72 | 73 | // newFlashRecognitionResponse newFlashRecognitionResponse 74 | func newFlashRecognitionResponse(code int, message string) *FlashRecognitionResponse { 75 | return &FlashRecognitionResponse{ 76 | Code: code, 77 | Message: message, 78 | } 79 | } 80 | 81 | var ( 82 | flashHost = "asr.cloud.tencent.com" 83 | httpClient *http.Client 84 | 85 | connTimeout = 1 86 | rwTimeout = 600 87 | maxIdleConns = 100 88 | maxIdleConnsPerHost = 2 89 | idleConnTimeout = time.Duration(180) * time.Second 90 | 91 | //once : for once init 92 | once sync.Once 93 | ) 94 | 95 | // initHttpClient init http client 96 | func initHttpClient() { 97 | once.Do(func() { 98 | 
transport := &http.Transport{ 99 | Proxy: http.ProxyFromEnvironment, 100 | DialContext: (&net.Dialer{ 101 | Timeout: time.Duration(connTimeout) * time.Second, 102 | KeepAlive: time.Duration(rwTimeout*10) * time.Second, 103 | DualStack: true, 104 | }).DialContext, 105 | MaxIdleConns: maxIdleConns, 106 | IdleConnTimeout: idleConnTimeout, 107 | TLSHandshakeTimeout: time.Duration(connTimeout) * time.Second, 108 | ExpectContinueTimeout: 1 * time.Second, 109 | } 110 | httpClient = new(http.Client) 111 | httpClient.Transport = transport 112 | httpClient.Timeout = time.Duration(rwTimeout) * time.Second 113 | }) 114 | } 115 | 116 | // FlashRecognizer is the entry for ASR flash recognizer 117 | type FlashRecognizer struct { 118 | AppID string 119 | 120 | //for proxy 121 | ProxyURL string 122 | 123 | Credential *common.Credential 124 | } 125 | 126 | // NewFlashRecognizer creates instance of FlashRecognizer 127 | func NewFlashRecognizer(appID string, credential *common.Credential) *FlashRecognizer { 128 | initHttpClient() 129 | return &FlashRecognizer{ 130 | AppID: appID, 131 | Credential: credential, 132 | } 133 | } 134 | 135 | // Recognize Recognize 136 | func (recognizer *FlashRecognizer) Recognize(req *FlashRecognitionRequest, 137 | videoData []byte) (*FlashRecognitionResponse, error) { 138 | 139 | signStr, reqUrl := recognizer.buildURL(req) 140 | signature := recognizer.genSignature(signStr) 141 | 142 | headers := make(map[string]string) 143 | headers["Host"] = flashHost 144 | headers["Authorization"] = signature 145 | 146 | if len(recognizer.ProxyURL) > 0 { 147 | proxyURL, _ := url.Parse(recognizer.ProxyURL) 148 | httpClient.Transport.(*http.Transport).Proxy = http.ProxyURL(proxyURL) 149 | } 150 | 151 | httpReq, err := http.NewRequest("POST", reqUrl, bytes.NewReader(videoData)) 152 | if err != nil { 153 | return nil, fmt.Errorf("failed create http request, error: %s", err.Error()) 154 | } 155 | for k, v := range headers { 156 | httpReq.Header.Set(k, v) 157 | } 158 | 
httpResp, err := httpClient.Do(httpReq) 159 | if err != nil { 160 | return nil, fmt.Errorf("failed do request, error: %s", err.Error()) 161 | } 162 | defer httpResp.Body.Close() 163 | respData, err := ioutil.ReadAll(httpResp.Body) 164 | if err != nil { 165 | return nil, fmt.Errorf("failed read body, error: %s", err.Error()) 166 | } 167 | if httpResp.StatusCode != 200 { 168 | return nil, fmt.Errorf("http code not 200, respData: %s", string(respData)) 169 | } 170 | resp := &FlashRecognitionResponse{} 171 | err = json.Unmarshal(respData, &resp) 172 | if err != nil { 173 | return nil, fmt.Errorf("failed unmarshal, respData: %s, error: %s", respData, err.Error()) 174 | } 175 | if resp.Code != 0 { 176 | return resp, fmt.Errorf("request_id: %s, code: %d, message: %s", resp.RequestId, resp.Code, resp.Message) 177 | } 178 | return resp, nil 179 | } 180 | 181 | // buildURL buildURL 182 | func (recognizer *FlashRecognizer) buildURL(req *FlashRecognitionRequest) (string, string) { 183 | var queryMap = make(map[string]string) 184 | queryMap["secretid"] = recognizer.Credential.SecretId 185 | queryMap["engine_type"] = req.EngineType 186 | queryMap["voice_format"] = req.VoiceFormat 187 | queryMap["speaker_diarization"] = strconv.FormatInt(int64(req.SpeakerDiarization), 10) 188 | queryMap["hotword_id"] = req.HotwordId 189 | queryMap["hotword_list"] = req.HotwordList 190 | queryMap["customization_id"] = req.CustomizationId 191 | queryMap["filter_dirty"] = strconv.FormatInt(int64(req.FilterDirty), 10) 192 | queryMap["filter_modal"] = strconv.FormatInt(int64(req.FilterModal), 10) 193 | queryMap["filter_punc"] = strconv.FormatInt(int64(req.FilterPunc), 10) 194 | queryMap["convert_num_mode"] = strconv.FormatInt(int64(req.ConvertNumMode), 10) 195 | queryMap["word_info"] = strconv.FormatInt(int64(req.WordInfo), 10) 196 | queryMap["first_channel_only"] = strconv.FormatInt(int64(req.FirstChannelOnly), 10) 197 | queryMap["reinforce_hotword"] = strconv.FormatInt(int64(req.ReinforceHotword), 
	10)
	queryMap["sentence_max_length"] = strconv.FormatInt(int64(req.SentenceMaxLength), 10)
	var timestamp = time.Now().Unix()
	var timestampStr = strconv.FormatInt(timestamp, 10)
	queryMap["timestamp"] = timestampStr

	// Sort the keys so the query string — and therefore the HMAC signature
	// computed over it — is deterministic.
	var keys []string
	for k := range queryMap {
		keys = append(keys, k)
	}
	sort.Strings(keys)

	var queryStrBuffer bytes.Buffer
	for _, k := range keys {
		queryStrBuffer.WriteString(k)
		queryStrBuffer.WriteString("=")
		queryStrBuffer.WriteString(queryMap[k])
		queryStrBuffer.WriteString("&")
	}

	// Drop the trailing "&".
	// NOTE(review): values are not URL-escaped here; a parameter containing
	// '&', '=' or '%' (e.g. hotword_list) would corrupt the query string —
	// confirm inputs are restricted by the caller or the service.
	rs := []rune(queryStrBuffer.String())
	rsLen := len(rs)
	queryStr := string(rs[0 : rsLen-1])

	// signStr is what gets signed ("POST" + host + path + query, no scheme);
	// reqUrl is the actual https endpoint.
	url := fmt.Sprintf("%s/asr/flash/v1/%s?%s", flashHost, recognizer.AppID, queryStr)
	signStr := fmt.Sprintf("POST%s", url)
	reqUrl := fmt.Sprintf("https://%s", url)
	return signStr, reqUrl
}

// genSignature signs url with HMAC-SHA1 keyed by the credential's secret key
// and returns the base64-encoded digest for the Authorization header.
func (recognizer *FlashRecognizer) genSignature(url string) string {
	hmac := hmac.New(sha1.New, []byte(recognizer.Credential.SecretKey))
	signURL := url
	hmac.Write([]byte(signURL))
	encryptedStr := hmac.Sum([]byte(nil))
	var signature = base64.StdEncoding.EncodeToString(encryptedStr)
	return signature
}

package tts

import (
	"bytes"
	"crypto/hmac"
	"crypto/sha1"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"net"
	"net/http"
	"net/url"
	"sort"
	"strconv"
	"sync"
	"time"

	"github.com/google/uuid"

	"github.com/tencentcloud/tencentcloud-speech-sdk-go/common"
)

// SpeechSynthesisResponse carries one chunk of synthesized audio (or a
// completion/cancel/failure notification) for a session.
type SpeechSynthesisResponse struct {
	SessionID
string
	Data []byte
}

// SpeechSynthesisListener receives the callbacks of a synthesis session.
// All callbacks are invoked from the synthesizer's internal event goroutine.
type SpeechSynthesisListener interface {
	OnMessage(*SpeechSynthesisResponse)
	OnComplete(*SpeechSynthesisResponse)
	OnCancel(*SpeechSynthesisResponse)
	OnFail(*SpeechSynthesisResponse, error)
}

// SpeechSynthesizer is the entry for TTS service
type SpeechSynthesizer struct {
	AppID      int64
	Credential *common.Credential
	VoiceType  int64
	SampleRate int64
	Codec      string

	ProxyURL string

	// mutex serializes the public Synthesis/Cancel/Wait calls.
	mutex sync.Mutex

	eventChan chan speechSynthesisEvent
	eventEnd  chan int
	sessionID string
	listener  SpeechSynthesisListener

	// 0 - idle, 1 - running, 2 - cancelled
	status      int
	statusMutex sync.Mutex
}

// ttsRequest is the JSON body posted to the TextToStreamAudio endpoint.
type ttsRequest struct {
	Action     string `json:"Action"`
	AppID      int64  `json:"AppId"`
	SecretID   string `json:"SecretId"`
	Timestamp  int64  `json:"Timestamp"`
	Expired    int64  `json:"Expired"`
	Text       string `json:"Text"`
	SessionID  string `json:"SessionId"`
	ModelType  int64  `json:"ModelType"`
	VoiceType  int64  `json:"VoiceType"`
	SampleRate int64  `json:"SampleRate"`
	Codec      string `json:"Codec"`
}

// ttsErrorJSONResponseError is the Error object inside an error response.
type ttsErrorJSONResponseError struct {
	Code    string `json:"Code"`
	Message string `json:"Message"`
}

type ttsErrorJSONResponse struct {
	RequestID string                    `json:"RequestId"`
	Error     ttsErrorJSONResponseError `json:"Error"`
}

type ttsErrorJSON struct {
	Response ttsErrorJSONResponse `json:"Response"`
}

const (
	defaultVoiceType  = 0
	defaultSampleRate = 16000
	defaultCodec      = "pcm"
	defaultAction     = "TextToStreamAudio"

	// timeouts are in milliseconds
	httpConnectTimeout    = 2000
	httpReadHeaderTimeout = 2000

	// size of one audio chunk read from the streaming response body
	maxMessageSize = 10240

	protocol = "https"
	host     = "tts.cloud.tencent.com"
	path     = "/stream"
)

// event types dispatched to the listener
const (
	eventTypeMessage  = 0
	eventTypeComplete = 1
	eventTypeCancel   = 2
	eventTypeFail     = 3
)

type eventType int

// speechSynthesisEvent is the unit passed from the request goroutine to the
// dispatch goroutine.
type speechSynthesisEvent struct {
	t   eventType
	r   *SpeechSynthesisResponse
	err error
}

// NewSpeechSynthesizer creates instance of SpeechSynthesizer
func NewSpeechSynthesizer(appID int64, credential *common.Credential, listener SpeechSynthesisListener) *SpeechSynthesizer {
	return &SpeechSynthesizer{
		AppID:      appID,
		Credential: credential,
		VoiceType:  defaultVoiceType,
		SampleRate: defaultSampleRate,
		Codec:      defaultCodec,

		listener: listener,

		status: 0,
	}
}

// Synthesis starts an asynchronous synthesis of text; results arrive through
// the listener. Returns an error if a session is already running.
func (synthesizer *SpeechSynthesizer) Synthesis(text string) error {
	synthesizer.mutex.Lock()
	defer synthesizer.mutex.Unlock()
	if synthesizer.getStatus() != 0 {
		return fmt.Errorf("synthesizer already started")
	}

	synthesizer.eventChan = make(chan speechSynthesisEvent, 10)
	synthesizer.eventEnd = make(chan int)
	go synthesizer.sendRequest(text)
	go synthesizer.eventDispatch()
	synthesizer.setStatus(1)
	return nil
}

// Cancel asks the running session to stop and blocks until it has fully
// wound down.
// NOTE(review): calling Cancel (or Wait) before Synthesis blocks forever on
// the nil eventEnd channel — confirm callers always start a session first.
func (synthesizer *SpeechSynthesizer) Cancel() error {
	synthesizer.mutex.Lock()
	defer synthesizer.mutex.Unlock()
	synthesizer.setStatus(2)
	<-synthesizer.eventEnd
	return nil
}

// Wait blocks until the current session completes and all events have been
// dispatched to the listener.
func (synthesizer *SpeechSynthesizer) Wait() error {
	synthesizer.mutex.Lock()
	defer synthesizer.mutex.Unlock()
	<-synthesizer.eventEnd
	return nil
}

// getStatus returns the session state under statusMutex.
func (synthesizer *SpeechSynthesizer) getStatus() int {
	synthesizer.statusMutex.Lock()
	defer synthesizer.statusMutex.Unlock()
	status := synthesizer.status
	return status
}

// setStatus stores the session state under statusMutex.
func (synthesizer *SpeechSynthesizer)
setStatus(status int) {
	synthesizer.statusMutex.Lock()
	defer synthesizer.statusMutex.Unlock()
	synthesizer.status = status
}

// eventDispatch forwards queued events to the listener until eventChan is
// closed, then marks the synthesizer idle and releases Wait/Cancel.
func (synthesizer *SpeechSynthesizer) eventDispatch() {
	for e := range synthesizer.eventChan {
		switch e.t {
		case eventTypeMessage:
			synthesizer.listener.OnMessage(e.r)
		case eventTypeComplete:
			synthesizer.listener.OnComplete(e.r)
		case eventTypeCancel:
			synthesizer.listener.OnCancel(e.r)
		case eventTypeFail:
			synthesizer.listener.OnFail(e.r, e.err)
		}
	}
	synthesizer.setStatus(0)
	close(synthesizer.eventEnd)
}

// sendRequest performs the streaming TTS HTTP request for text and feeds the
// audio chunks (or a failure/cancel notification) into eventChan.
func (synthesizer *SpeechSynthesizer) sendRequest(text string) {
	defer func() {
		// Closing eventChan ends eventDispatch, which in turn releases Wait.
		close(synthesizer.eventChan)
	}()

	url := fmt.Sprintf("%s%s", host, path)
	var timestamp = time.Now().Unix()
	sessionID := uuid.New().String()
	req := ttsRequest{
		Action:     defaultAction,
		AppID:      synthesizer.AppID,
		SecretID:   synthesizer.Credential.SecretId,
		Timestamp:  timestamp,
		Expired:    timestamp + 24*60*60,
		Text:       text,
		SessionID:  sessionID,
		ModelType:  1,
		VoiceType:  synthesizer.VoiceType,
		SampleRate: synthesizer.SampleRate,
		Codec:      synthesizer.Codec,
	}
	signature := genSignature(url, &req, synthesizer.Credential.SecretKey)
	url = fmt.Sprintf("https://%s", url)
	postBody, err := json.Marshal(req)
	if err != nil {
		synthesizer.onError(err)
		return
	}
	httpReq, err := http.NewRequest("POST", url, bytes.NewReader(postBody))
	if err != nil {
		synthesizer.onError(err)
		return
	}
	synthesizer.sessionID = sessionID
	httpReq.Header.Add("Content-Type", "application/json; charset=UTF-8")
	httpReq.Header.Add("Authorization", signature)
	httpClient := synthesizer.createHTTPClient()
	rsp, err := httpClient.Do(httpReq)
	if err != nil {
		synthesizer.onError(err)
		return
	}
	defer rsp.Body.Close()
	if rsp.StatusCode != 200 {
		// Bug fix: this branch previously called onError(err) with a
		// guaranteed-nil err (Do succeeded), so listeners got OnFail with no
		// error at all. Build a real error from the status and body instead.
		rspBody, _ := ioutil.ReadAll(rsp.Body)
		synthesizer.onError(fmt.Errorf("http status code %d, body: %s", rsp.StatusCode, string(rspBody)))
		return
	}
	if len(rsp.Header["Content-Type"]) < 1 || rsp.Header["Content-Type"][0] != "application/octet-stream" {
		// A non-stream content type means the service returned an error
		// payload; surface its raw text. Uses the "%s" verb rather than
		// passing the body as the format string (go vet violation, and any
		// '%' in the payload would be misinterpreted).
		rspBody, _ := ioutil.ReadAll(rsp.Body)
		synthesizer.onError(fmt.Errorf("%s", string(rspBody)))
		return
	}
	buffer := make([]byte, maxMessageSize)
	for {
		// A cancelled session stops reading and reports OnCancel.
		if synthesizer.getStatus() == 2 {
			synthesizer.onCancel()
			return
		}
		n, err := rsp.Body.Read(buffer)
		if err != nil {
			// io.EOF stringifies to "EOF"; compared by text to avoid adding
			// a new import to this file.
			if err.Error() == "EOF" {
				break
			}
			synthesizer.onError(err)
			return
		}
		if n == 0 {
			continue
		}
		// Copy the chunk out: buffer is reused by the next Read.
		copyBuf := make([]byte, n)
		copy(copyBuf, buffer[:n])
		synthesizer.onMessage(copyBuf)
	}
	synthesizer.onComplete()
}

// onMessage queues one audio chunk for the listener.
func (synthesizer *SpeechSynthesizer) onMessage(data []byte) {
	r := &SpeechSynthesisResponse{
		SessionID: synthesizer.sessionID,
		Data:      data,
	}
	event := speechSynthesisEvent{
		t:   eventTypeMessage,
		r:   r,
		err: nil,
	}
	synthesizer.eventChan <- event
}

// onComplete queues the completion notification.
func (synthesizer *SpeechSynthesizer) onComplete() {
	r := &SpeechSynthesisResponse{
		SessionID: synthesizer.sessionID,
	}
	event := speechSynthesisEvent{
		t:   eventTypeComplete,
		r:   r,
		err: nil,
	}
	synthesizer.eventChan <- event
}

// onCancel queues the cancellation notification.
func (synthesizer *SpeechSynthesizer) onCancel() {
	r := &SpeechSynthesisResponse{
		SessionID: synthesizer.sessionID,
	}
	synthesizer.eventChan <- speechSynthesisEvent{
		t:   eventTypeCancel,
		r:   r,
		err: nil,
	}
}

// onError queues a failure notification carrying err.
func (synthesizer *SpeechSynthesizer) onError(err error) {
	r := &SpeechSynthesisResponse{
		SessionID: synthesizer.sessionID,
	}
	synthesizer.eventChan <- speechSynthesisEvent{
		t:   eventTypeFail,
		r:   r,
		err: err,
	}
}

// createHTTPClient builds a client with connect/response-header timeouts and
// the optional per-synthesizer proxy.
func (synthesizer *SpeechSynthesizer) createHTTPClient() *http.Client {
	httpTransport := &http.Transport{
		Dial: (&net.Dialer{
			Timeout: httpConnectTimeout * time.Millisecond,
		}).Dial,
		MaxIdleConns:          1,
		ResponseHeaderTimeout: httpReadHeaderTimeout * time.Millisecond,
	}
	if synthesizer.ProxyURL != "" {
		proxyURL, _ := url.Parse(synthesizer.ProxyURL)
		httpTransport.Proxy = http.ProxyURL(proxyURL)
	}
	return &http.Client{Transport: httpTransport}
}

// genSignature computes the base64-encoded HMAC-SHA1 signature over
// "POST" + url + sorted query parameters for the Authorization header.
func genSignature(url string, request *ttsRequest, secretKey string) string {
	var queryMap = make(map[string]string)
	queryMap["Action"] = request.Action
	queryMap["AppId"] = strconv.FormatInt(int64(request.AppID), 10)
	queryMap["SecretId"] = request.SecretID
	queryMap["Timestamp"] = strconv.FormatInt(int64(request.Timestamp), 10)
	queryMap["Expired"] = strconv.FormatInt(request.Expired, 10)
	queryMap["Text"] = request.Text
	queryMap["SessionId"] = request.SessionID
	queryMap["ModelType"] = strconv.FormatInt(int64(request.ModelType), 10)
	queryMap["VoiceType"] = strconv.FormatInt(int64(request.VoiceType), 10)
	queryMap["SampleRate"] = strconv.FormatInt(int64(request.SampleRate), 10)
	queryMap["Codec"] = request.Codec

	// Sorted keys keep the signed string deterministic.
	var keys []string
	for k := range queryMap {
		keys = append(keys, k)
	}
	sort.Strings(keys)

	var queryStrBuffer bytes.Buffer
	for _, k := range keys {
		queryStrBuffer.WriteString(k)
		queryStrBuffer.WriteString("=")
		queryStrBuffer.WriteString(queryMap[k])
		queryStrBuffer.WriteString("&")
	}

	// Drop the trailing "&".
	rs := []rune(queryStrBuffer.String())
	rsLen := len(rs)
	queryStr := string(rs[0 : rsLen-1])

	signURL := fmt.Sprintf("%s?%s", url, queryStr)

	hmac := hmac.New(sha1.New,
[]byte(secretKey)) 372 | signURL = "POST" + signURL 373 | hmac.Write([]byte(signURL)) 374 | encryptedStr := hmac.Sum([]byte(nil)) 375 | return base64.StdEncoding.EncodeToString(encryptedStr) 376 | } 377 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright (c) 2017-2018 Tencent Ltd. 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /asr/virtual_number_recogizer.go: -------------------------------------------------------------------------------- 1 | package asr 2 | 3 | import ( 4 | "bytes" 5 | "crypto/hmac" 6 | "crypto/sha1" 7 | "encoding/base64" 8 | "encoding/json" 9 | "fmt" 10 | "github.com/tencentcloud/tencentcloud-speech-sdk-go/common" 11 | "net/http" 12 | "net/url" 13 | "runtime/debug" 14 | "sort" 15 | "strconv" 16 | "sync" 17 | "time" 18 | 19 | "github.com/google/uuid" 20 | "github.com/gorilla/websocket" 21 | ) 22 | 23 | // VNRecognitionListener User must impletement it. 
Get recognition result
type VNRecognitionListener interface {
	OnVNRecognitionStart(*VNRecognitionResponse)
	OnVNRecognitionComplete(*VNRecognitionResponse)
	OnVNFail(*VNRecognitionResponse, error)
}

// VNRecognitionResponse is the response of asr service
type VNRecognitionResponse struct {
	Code      int    `json:"code"`
	Message   string `json:"message"`
	VoiceID   string `json:"voice_id,omitempty"`
	MessageID string `json:"message_id,omitempty"`
	Final     uint32 `json:"final,omitempty"`
	Result    uint32 `json:"result"`
}

// VNRecognizer is the entry for ASR service
type VNRecognizer struct {
	//request params
	AppID       string
	VoiceFormat int
	// WaitTime is in milliseconds; 0 selects the backend default of 30
	// seconds, and the maximum is 60 seconds.
	WaitTime uint32

	Credential *common.Credential
	//listener
	listener VNRecognitionListener
	//uuid for voice
	VoiceID string

	//for proxy
	ProxyURL string

	//for websocket connection
	conn *websocket.Conn

	//send data channel
	dataChan chan []byte
	//for listener get response message
	eventChan chan VNRecognitionEvent

	//used in stop function, waiting for stop all goroutines
	sendEnd    chan int
	receiveEnd chan int
	eventEnd   chan int

	mutex   sync.Mutex
	started bool
	hasEnd  bool
}

const (
	gDefaultVoiceFormat = 1

	gProtocol = "wss"
	gHost     = "asr.cloud.tencent.com"
	gPath     = ""
)

// event types dispatched to the listener
const (
	eventTypeVNRecognitionStart    = 1
	eventTypeVNRecognitionComplete = 2
	eventTypeVNFail                = 3
)

type eventTypeVN int

// VNRecognitionEvent pairs an event type with its response and error.
type VNRecognitionEvent struct {
	t   eventTypeVN
	r   *VNRecognitionResponse
	err error
}

// NewVNRecognizer creates instance of VNRecognizer
func NewVNRecognizer(appID string, credential *common.Credential,
	listener VNRecognitionListener) *VNRecognizer {

	reco := &VNRecognizer{
		AppID:       appID,
		Credential:  credential,
		VoiceFormat: gDefaultVoiceFormat,

		dataChan:  make(chan []byte, 6400),
		eventChan: make(chan VNRecognitionEvent, 10),

		sendEnd:    make(chan int),
		receiveEnd: make(chan int),
		eventEnd:   make(chan int),

		listener: listener,
		started:  false,
	}
	return reco
}

// Start connects to server and start a recognition session
func (recognizer *VNRecognizer) Start() error {
	recognizer.mutex.Lock()
	defer recognizer.mutex.Unlock()

	if recognizer.started {
		return fmt.Errorf("recognizer is already started")
	}
	// Generate a fresh voice id unless the caller supplied one.
	if recognizer.VoiceID == "" {
		voiceID := uuid.New().String()
		recognizer.VoiceID = voiceID
	}
	serverURL := recognizer.buildSignatureURL(recognizer.VoiceID)
	signature := recognizer.genSignature(recognizer.VoiceID)
	dialer := websocket.Dialer{}
	if len(recognizer.ProxyURL) > 0 {
		// NOTE(review): url.Parse error is silently ignored here — a bad
		// proxy URL falls back to a direct connection; confirm intended.
		proxyURL, _ := url.Parse(recognizer.ProxyURL)
		dialer.Proxy = http.ProxyURL(proxyURL)
	}

	header := http.Header(make(map[string][]string))
	urlStr := fmt.Sprintf("%s://%s&signature=%s", gProtocol, serverURL, url.QueryEscape(signature))
	conn, _, err := dialer.Dial(urlStr, header)
	if err != nil {
		return fmt.Errorf("voice_id: %s, error: %s", recognizer.VoiceID, err.Error())
	}
	// The first frame from the server acknowledges (or rejects) the session.
	_, data, err := conn.ReadMessage()
	if err != nil {
		conn.Close()
		return fmt.Errorf("voice_id: %s, error: %s", recognizer.VoiceID, err.Error())
	}
	msg := VNRecognitionResponse{}
	err = json.Unmarshal(data, &msg)
	if err != nil {
		conn.Close()
		return fmt.Errorf("voice_id: %s, error: %s", recognizer.VoiceID, err.Error())
	}
	if msg.Code != 0 {
		conn.Close()
		return fmt.Errorf("voice_id: %s, code: %d, message: %s",
			recognizer.VoiceID, msg.Code, msg.Message)
	}

	recognizer.conn = conn
162 | go recognizer.send() 163 | go recognizer.receive() 164 | go recognizer.eventDispatch() 165 | recognizer.started = true 166 | 167 | recognizer.eventChan <- VNRecognitionEvent{ 168 | t: eventTypeVNRecognitionStart, 169 | r: newVNRecognitionResponse(0, "sucess", recognizer.VoiceID, 170 | fmt.Sprintf("%s-RecognitionStart", recognizer.VoiceID), 0), 171 | err: nil, 172 | } 173 | return nil 174 | } 175 | 176 | // Write : write data in channel 177 | func (recognizer *VNRecognizer) Write(data []byte) (error, bool) { 178 | recognizer.mutex.Lock() 179 | defer recognizer.mutex.Unlock() 180 | if !recognizer.started { 181 | return fmt.Errorf("recognizer not running"), false 182 | } 183 | 184 | if recognizer.hasEnd { 185 | return nil, true 186 | } 187 | recognizer.dataChan <- data 188 | return nil, false 189 | } 190 | 191 | // Stop wait for the recognition process to complete 192 | func (recognizer *VNRecognizer) Stop() error { 193 | err := recognizer.stopInternal() 194 | if err != nil { 195 | return err 196 | } 197 | return nil 198 | } 199 | 200 | func (recognizer *VNRecognizer) stopInternal() error { 201 | recognizer.mutex.Lock() 202 | defer recognizer.mutex.Unlock() 203 | if !recognizer.started { 204 | return fmt.Errorf("recognizer is not running") 205 | } 206 | close(recognizer.dataChan) 207 | <-recognizer.receiveEnd 208 | <-recognizer.sendEnd 209 | <-recognizer.eventEnd 210 | recognizer.started = false 211 | err := recognizer.conn.Close() 212 | if err != nil { 213 | return err 214 | } 215 | return nil 216 | } 217 | 218 | func (recognizer *VNRecognizer) onError(code int, message string, err error) { 219 | //recognizer.mutex.Lock() 220 | if !recognizer.started { 221 | return 222 | } 223 | 224 | recognizer.listener.OnVNFail(newVNRecognitionResponse(code, message, recognizer.VoiceID, 225 | fmt.Sprintf("%s-Error", recognizer.VoiceID), 0), err) 226 | /* 227 | recognizer.eventChan <- VNRecognitionEvent{ 228 | t: eventTypeVNFail, 229 | r: newVNRecognitionResponse(code, 
message, recognizer.VoiceID,
				fmt.Sprintf("%s-Error", recognizer.VoiceID), 0),
			err: err,
		}
		recognizer.mutex.Unlock()
	*/
	go recognizer.stopInternal()
}

// send streams queued audio chunks to the server and, once the data channel
// is drained and closed, tells the server the audio is finished with an
// {"type":"end"} text frame.
func (recognizer *VNRecognizer) send() {
	defer func() {
		recognizer.genRecoverFunc()() // handle panic
		close(recognizer.sendEnd)
	}()
	for chunk := range recognizer.dataChan {
		if err := recognizer.conn.WriteMessage(websocket.BinaryMessage, chunk); err != nil {
			recognizer.onError(-1, "send error", fmt.Errorf("voice_id: %s, error: %s",
				recognizer.VoiceID, err.Error()))
			return
		}
	}
	if err := recognizer.conn.WriteMessage(websocket.TextMessage, []byte("{\"type\":\"end\"}")); err != nil {
		recognizer.onError(-1, "send error", fmt.Errorf("voice_id: %s, error: %s",
			recognizer.VoiceID, err.Error()))
	}
}

// eventDispatch forwards queued events to the user-supplied listener until
// the event channel is closed by the receive goroutine.
func (recognizer *VNRecognizer) eventDispatch() {
	defer func() {
		recognizer.genRecoverFunc()() // handle panic
		close(recognizer.eventEnd)
	}()
	for event := range recognizer.eventChan {
		switch event.t {
		case eventTypeVNRecognitionStart:
			recognizer.listener.OnVNRecognitionStart(event.r)
		case eventTypeVNRecognitionComplete:
			recognizer.listener.OnVNRecognitionComplete(event.r)
		case eventTypeVNFail:
			recognizer.listener.OnVNFail(event.r, event.err)
		}
	}
}

// receive reads server messages until an error occurs or the final result
// (final == 1) arrives, emitting a completion event in the latter case.
func (recognizer *VNRecognizer) receive() {
	defer func() {
		recognizer.genRecoverFunc()() // handle panic
		close(recognizer.eventChan)
		close(recognizer.receiveEnd)
	}()
	for {
		_, data, err := recognizer.conn.ReadMessage()
		if err != nil {
			recognizer.onError(-1, "receive error", fmt.Errorf("voice_id: %s, error: %s", recognizer.VoiceID, err.Error()))
			break
		}

		msg := VNRecognitionResponse{}
		if err = json.Unmarshal(data, &msg); err != nil {
			recognizer.onError(-1, "receive error",
				fmt.Errorf("voice_id: %s, error: %s",
					recognizer.VoiceID, err.Error()))
			break
		}
		if msg.Code != 0 {
			recognizer.onError(msg.Code, msg.Message,
				fmt.Errorf("VoiceID: %s, error code %d, message: %s",
					recognizer.VoiceID, msg.Code, msg.Message))
			break
		}
		// NOTE(review): unconditional stdout logging in library code; consider
		// removing or putting it behind a debug flag.
		fmt.Println("receive data:", msg)
		if msg.Final == 1 {
			recognizer.hasEnd = true
			recognizer.eventChan <- VNRecognitionEvent{
				t:   eventTypeVNRecognitionComplete,
				r:   &msg,
				err: nil,
			}
			break
		}
	}
}

// buildURL assembles the (unsigned) request URL of the virtual number
// transfer endpoint, with query keys in lexicographic order.
func (recognizer *VNRecognizer) buildURL(voiceID string) string {
	timestamp := time.Now().Unix()
	timestampStr := strconv.FormatInt(timestamp, 10)
	queryMap := map[string]string{
		"secretid":  recognizer.Credential.SecretId,
		"timestamp": timestampStr,
		"expired":   strconv.FormatInt(timestamp+24*60*60, 10),
		"nonce":     timestampStr,
		"appid":     recognizer.AppID,
		// request params
		"voice_id":     voiceID,
		"voice_format": strconv.FormatInt(int64(recognizer.VoiceFormat), 10),
		"wait_time":    strconv.FormatUint(uint64(recognizer.WaitTime), 10),
	}
	keys := make([]string, 0, len(queryMap))
	for k := range queryMap {
		keys = append(keys, k)
	}
	sort.Strings(keys)

	var queryStrBuffer bytes.Buffer
	for _, k := range keys {
		queryStrBuffer.WriteString(k)
		queryStrBuffer.WriteString("=")
		queryStrBuffer.WriteString(queryMap[k])
		queryStrBuffer.WriteString("&")
	}
	queryStr := queryStrBuffer.String()
	queryStr = queryStr[:len(queryStr)-1] // drop trailing '&'

	url := fmt.Sprintf("%s/VirtualNumberTransfer?%s", gHost, queryStr)
	//url := fmt.Sprintf("%s/VirtualNumberTransfer/%s?%s", gHost, recognizer.AppID, queryStr)
	return url
}

// buildSignatureURL assembles the URL string that is both signed and dialed
// (unlike buildURL it carries the appid in the path, not the query).
func (recognizer *VNRecognizer) buildSignatureURL(voiceID string) string {
	timestamp := time.Now().Unix()
	timestampStr := strconv.FormatInt(timestamp, 10)
	queryMap := map[string]string{
		"secretid":  recognizer.Credential.SecretId,
		"timestamp": timestampStr,
		"expired":   strconv.FormatInt(timestamp+24*60*60, 10),
		"nonce":     timestampStr,
		// request params
		"voice_id":     voiceID,
		"voice_format": strconv.FormatInt(int64(recognizer.VoiceFormat), 10),
	}
	keys := make([]string, 0, len(queryMap))
	for k := range queryMap {
		keys = append(keys, k)
	}
	sort.Strings(keys)

	var queryStrBuffer bytes.Buffer
	for _, k := range keys {
		queryStrBuffer.WriteString(k)
		queryStrBuffer.WriteString("=")
		queryStrBuffer.WriteString(queryMap[k])
		queryStrBuffer.WriteString("&")
	}
	queryStr := queryStrBuffer.String()
	queryStr = queryStr[:len(queryStr)-1] // drop trailing '&'

	return fmt.Sprintf("%s/asr/virtual_number/v1/%s?%s", gHost, recognizer.AppID, queryStr)
}

// genSignature computes the base64-encoded HMAC-SHA1 signature over the
// signature URL for the given voice id.
func (recognizer *VNRecognizer) genSignature(voiceID string) string {
	signURL := recognizer.buildSignatureURL(voiceID)
	mac := hmac.New(sha1.New, []byte(recognizer.Credential.SecretKey))
	mac.Write([]byte(signURL))
	return base64.StdEncoding.EncodeToString(mac.Sum(nil))
}

// newVNRecognitionResponse builds a response value for locally generated events.
func newVNRecognitionResponse(code int, message string, voiceID string,
	messageID string, final uint32) *VNRecognitionResponse {
	return &VNRecognitionResponse{
		Code:      code,
		Message:   message,
		VoiceID:   voiceID,
		MessageID: messageID,
		Final:     final,
	}
}

func (recognizer
*VNRecognizer) genRecoverFunc() func() { 412 | return func() { 413 | if r := recover(); r != nil { 414 | var err error 415 | switch r := r.(type) { 416 | case error: 417 | err = r 418 | default: 419 | err = fmt.Errorf("%v", r) 420 | } 421 | retErr := fmt.Errorf("panic error ocurred! [err: %s] [stack: %s]", 422 | err.Error(), string(debug.Stack())) 423 | recognizer.eventChan <- VNRecognitionEvent{ 424 | t: eventTypeVNFail, 425 | r: newVNRecognitionResponse(-1, "panic error", recognizer.VoiceID, 426 | fmt.Sprintf("%s-Error", recognizer.VoiceID), 0), 427 | err: retErr, 428 | } 429 | } 430 | } 431 | } 432 | -------------------------------------------------------------------------------- /tts/speechwssynthesizer.go: -------------------------------------------------------------------------------- 1 | package tts 2 | 3 | import ( 4 | "bytes" 5 | "crypto/hmac" 6 | "crypto/sha1" 7 | "encoding/base64" 8 | "encoding/json" 9 | "fmt" 10 | "github.com/gorilla/websocket" 11 | "net/http" 12 | "net/url" 13 | "runtime/debug" 14 | "sort" 15 | "strconv" 16 | "sync" 17 | "time" 18 | 19 | "github.com/google/uuid" 20 | "github.com/tencentcloud/tencentcloud-speech-sdk-go/common" 21 | ) 22 | 23 | // SpeechWsSynthesisResponse response 24 | type SpeechWsSynthesisResponse struct { 25 | SessionId string `json:"session_id"` //音频流唯一 id,由客户端在握手阶段生成并赋值在调用参数中 26 | RequestId string `json:"request_id"` //音频流唯一 id,由服务端在握手阶段自动生成 27 | MessageId string `json:"message_id"` //本 message 唯一 id 28 | Code int `json:"code"` //状态码,0代表正常,非0值表示发生错误 29 | Message string `json:"message"` //错误说明,发生错误时显示这个错误发生的具体原因,随着业务发展或体验优化,此文本可能会经常保持变更或更新 30 | Result SynthesisSubtitles `json:"result"` //最新语音合成文本结果 31 | Final int `json:"final"` //该字段返回1时表示文本全部合成结束,客户端收到后需主动关闭 websocket 连接 32 | } 33 | 34 | func (s *SpeechWsSynthesisResponse) ToString() string { 35 | d, _ := json.Marshal(s) 36 | return string(d) 37 | } 38 | 39 | // SynthesisSubtitles subtitles 40 | type SynthesisSubtitles struct { 41 | Subtitles []SynthesisSubtitle 
`json:"subtitles"`
}

// SynthesisSubtitle is one subtitle item of the synthesis result.
type SynthesisSubtitle struct {
	Text       string
	Phoneme    string
	BeginTime  int64
	EndTime    int64
	BeginIndex int
	EndIndex   int
}

// SpeechWsSynthesizer is the entry for the TTS websocket service.
// NOTE(review): json tags on unexported fields (action) are ignored by
// encoding/json.
type SpeechWsSynthesizer struct {
	Credential       *common.Credential
	action           string  `json:"Action"`
	AppID            int64   `json:"AppId"`
	Timestamp        int64   `json:"Timestamp"`
	Expired          int64   `json:"Expired"`
	SessionId        string  `json:"SessionId"`
	Text             string  `json:"Text"`
	ModelType        int64   `json:"ModelType"`
	VoiceType        int64   `json:"VoiceType"`
	SampleRate       int64   `json:"SampleRate"`
	Codec            string  `json:"Codec"`
	Speed            float64 `json:"Speed"`
	Volume           float64 `json:"Volume"`
	EnableSubtitle   bool    `json:"EnableSubtitle"`
	EmotionCategory  string  `json:"EmotionCategory"`
	EmotionIntensity int64   `json:"EmotionIntensity"`
	SegmentRate      int64   `json:"SegmentRate"`
	FastVoiceType    string  `json:"FastVoiceType"`
	ExtParam         map[string]string

	ProxyURL    string
	mutex       sync.Mutex
	receiveEnd  chan int
	eventChan   chan speechWsSynthesisEvent
	eventEnd    chan int
	listener    SpeechWsSynthesisListener
	status      int
	statusMutex sync.Mutex
	conn        *websocket.Conn // websocket connection
	started     bool

	Debug     bool                 // whether to emit debug logs
	DebugFunc func(message string) // sink for debug log messages
}

// SpeechWsSynthesisListener receives synthesis lifecycle callbacks.
type SpeechWsSynthesisListener interface {
	OnSynthesisStart(*SpeechWsSynthesisResponse)
	OnSynthesisEnd(*SpeechWsSynthesisResponse)
	OnAudioResult(data []byte)
	OnTextResult(*SpeechWsSynthesisResponse)
	OnSynthesisFail(*SpeechWsSynthesisResponse, error)
}

const (
	defaultWsVoiceType  = 0
	defaultWsSampleRate = 16000
	defaultWsCodec      = "pcm"
	defaultWsAction     = "TextToStreamAudioWS"
	wsConnectTimeout    = 2000
	wsReadHeaderTimeout = 2000
	maxWsMessageSize    = 10240
	wsProtocol          = "wss"
	wsHost              = "tts.cloud.tencent.com"
	wsPath              = "/stream_ws"
)

const (
	eventTypeWsStart = iota
	eventTypeWsEnd
	eventTypeWsAudioResult
	eventTypeWsTextResult
	eventTypeWsFail
)

type eventWsType int

// speechWsSynthesisEvent is an internal event handed to the dispatcher.
type speechWsSynthesisEvent struct {
	t   eventWsType
	r   *SpeechWsSynthesisResponse
	d   []byte
	err error
}

// NewSpeechWsSynthesizer creates an instance of SpeechWsSynthesizer with
// sensible defaults (pcm, 16 kHz, default voice).
func NewSpeechWsSynthesizer(appID int64, credential *common.Credential, listener SpeechWsSynthesisListener) *SpeechWsSynthesizer {
	return &SpeechWsSynthesizer{
		AppID:      appID,
		Credential: credential,
		action:     defaultWsAction,
		VoiceType:  defaultWsVoiceType,
		SampleRate: defaultWsSampleRate,
		Codec:      defaultWsCodec,
		listener:   listener,
		status:     0,
		receiveEnd: make(chan int),
		eventChan:  make(chan speechWsSynthesisEvent, 10),
		eventEnd:   make(chan int),
	}
}

// Synthesis connects to the server and starts a synthesizer session.
func (synthesizer *SpeechWsSynthesizer) Synthesis() error {
	synthesizer.mutex.Lock()
	defer synthesizer.mutex.Unlock()

	if synthesizer.started {
		return fmt.Errorf("synthesizer is already started")
	}
	if synthesizer.SessionId == "" {
		synthesizer.SessionId = uuid.New().String()
	}
	now := time.Now().Unix()
	synthesizer.Timestamp = now
	synthesizer.Expired = now + 24*60*60
	// The signature is computed over the unescaped URL ...
	serverURL := synthesizer.buildURL(false)
	signature := synthesizer.genWsSignature(serverURL, synthesizer.Credential.SecretKey)
	if synthesizer.Debug && synthesizer.DebugFunc != nil {
		synthesizer.DebugFunc(fmt.Sprintf("serverURL:%s , signature:%s", serverURL, signature))
	}
	dialer := websocket.Dialer{}
	if len(synthesizer.ProxyURL) > 0 {
		proxyURL, _ := url.Parse(synthesizer.ProxyURL)
		dialer.Proxy = http.ProxyURL(proxyURL)
	}
	// ... while the dialed URL carries the escaped text parameter.
	serverURL = synthesizer.buildURL(true)
	header := http.Header(make(map[string][]string))
	urlStr := fmt.Sprintf("%s://%s&Signature=%s", wsProtocol, serverURL, url.QueryEscape(signature))
	if synthesizer.Debug && synthesizer.DebugFunc != nil {
		synthesizer.DebugFunc(fmt.Sprintf("urlStr:%s ", urlStr))
	}
	conn, _, err := dialer.Dial(urlStr, header)
	if err != nil {
		return fmt.Errorf("session_id: %s, error: %s", synthesizer.SessionId, err.Error())
	}
	// The server acknowledges the handshake with a JSON message.
	_, data, err := conn.ReadMessage()
	if err != nil {
		conn.Close()
		return fmt.Errorf("session_id: %s, error: %s", synthesizer.SessionId, err.Error())
	}
	msg := SpeechWsSynthesisResponse{}
	if err = json.Unmarshal(data, &msg); err != nil {
		conn.Close()
		return fmt.Errorf("session_id: %s, error: %s", synthesizer.SessionId, err.Error())
	}
	if msg.Code != 0 {
		conn.Close()
		return fmt.Errorf("session_id: %s, code: %d, message: %s",
			synthesizer.SessionId, msg.Code, msg.Message)
	}
	msg.SessionId = synthesizer.SessionId
	synthesizer.conn = conn
	go synthesizer.receive()
	go synthesizer.eventDispatch()
	synthesizer.started = true
	synthesizer.setStatus(eventTypeWsStart)
	synthesizer.eventChan <- speechWsSynthesisEvent{
		t:   eventTypeWsStart,
		r:   &msg,
		err: nil,
	}
	return nil
}

// receive reads binary audio frames and JSON text messages until the final
// message or an error, translating them into dispatcher events.
func (synthesizer *SpeechWsSynthesizer) receive() {
	defer func() {
		synthesizer.genRecoverFunc()() // handle panic
		close(synthesizer.eventChan)
		close(synthesizer.receiveEnd)
	}()
	for {
		optCode, data, err := synthesizer.conn.ReadMessage()
		if err != nil {
224 | synthesizer.onError(fmt.Errorf("SessionId: %s, error: %s", synthesizer.SessionId, err.Error())) 225 | break 226 | } 227 | if optCode == websocket.BinaryMessage { 228 | if synthesizer.Debug && synthesizer.DebugFunc != nil { 229 | synthesizer.DebugFunc(fmt.Sprintf("[%s] receive binary message size: %d", synthesizer.SessionId, len(data))) 230 | } 231 | msg := SpeechWsSynthesisResponse{SessionId: synthesizer.SessionId} 232 | synthesizer.eventChan <- speechWsSynthesisEvent{ 233 | t: eventTypeWsAudioResult, 234 | r: &msg, 235 | d: data, 236 | err: nil, 237 | } 238 | } 239 | if optCode == websocket.TextMessage { 240 | if synthesizer.Debug && synthesizer.DebugFunc != nil { 241 | synthesizer.DebugFunc(fmt.Sprintf("[%s] receive text message: %s", synthesizer.SessionId, string(data))) 242 | } 243 | msg := SpeechWsSynthesisResponse{} 244 | err = json.Unmarshal(data, &msg) 245 | if err != nil { 246 | synthesizer.onError(fmt.Errorf("SessionId: %s, error: %s", 247 | synthesizer.SessionId, err.Error())) 248 | break 249 | } 250 | msg.SessionId = synthesizer.SessionId 251 | if msg.Code != 0 { 252 | synthesizer.onErrorResp(msg, fmt.Errorf("VoiceID: %s, error code %d, message: %s", 253 | synthesizer.SessionId, msg.Code, msg.Message)) 254 | break 255 | } 256 | if msg.Final == 1 { 257 | synthesizer.setStatus(eventTypeWsEnd) 258 | synthesizer.closeConn() 259 | synthesizer.eventChan <- speechWsSynthesisEvent{ 260 | t: eventTypeWsEnd, 261 | r: &msg, 262 | err: nil, 263 | } 264 | break 265 | } 266 | synthesizer.eventChan <- speechWsSynthesisEvent{ 267 | t: eventTypeWsTextResult, 268 | r: &msg, 269 | err: nil, 270 | } 271 | } 272 | } 273 | } 274 | 275 | func (synthesizer *SpeechWsSynthesizer) eventDispatch() { 276 | defer func() { 277 | // handle panic 278 | synthesizer.genRecoverFunc()() 279 | close(synthesizer.eventEnd) 280 | }() 281 | for e := range synthesizer.eventChan { 282 | switch e.t { 283 | case eventTypeWsStart: 284 | synthesizer.listener.OnSynthesisStart(e.r) 285 | case 
eventTypeWsEnd: 286 | synthesizer.listener.OnSynthesisEnd(e.r) 287 | case eventTypeWsAudioResult: 288 | synthesizer.listener.OnAudioResult(e.d) 289 | case eventTypeWsTextResult: 290 | synthesizer.listener.OnTextResult(e.r) 291 | case eventTypeWsFail: 292 | synthesizer.listener.OnSynthesisFail(e.r, e.err) 293 | } 294 | } 295 | } 296 | 297 | // Wait Wait 298 | func (synthesizer *SpeechWsSynthesizer) Wait() error { 299 | synthesizer.mutex.Lock() 300 | defer synthesizer.mutex.Unlock() 301 | <-synthesizer.eventEnd 302 | <-synthesizer.receiveEnd 303 | return nil 304 | } 305 | 306 | func (synthesizer *SpeechWsSynthesizer) getStatus() int { 307 | synthesizer.statusMutex.Lock() 308 | defer synthesizer.statusMutex.Unlock() 309 | status := synthesizer.status 310 | return status 311 | } 312 | 313 | func (synthesizer *SpeechWsSynthesizer) setStatus(status int) { 314 | synthesizer.statusMutex.Lock() 315 | defer synthesizer.statusMutex.Unlock() 316 | synthesizer.status = status 317 | } 318 | 319 | func (synthesizer *SpeechWsSynthesizer) onError(err error) { 320 | r := &SpeechWsSynthesisResponse{ 321 | SessionId: synthesizer.SessionId, 322 | } 323 | synthesizer.closeConn() 324 | synthesizer.eventChan <- speechWsSynthesisEvent{ 325 | t: eventTypeWsFail, 326 | r: r, 327 | err: err, 328 | } 329 | } 330 | 331 | func (synthesizer *SpeechWsSynthesizer) onErrorResp(resp SpeechWsSynthesisResponse, err error) { 332 | synthesizer.closeConn() 333 | synthesizer.eventChan <- speechWsSynthesisEvent{ 334 | t: eventTypeWsFail, 335 | r: &resp, 336 | err: err, 337 | } 338 | } 339 | 340 | func (synthesizer *SpeechWsSynthesizer) buildURL(escape bool) string { 341 | var queryMap = make(map[string]string) 342 | queryMap["Action"] = synthesizer.action 343 | queryMap["AppId"] = strconv.FormatInt(synthesizer.AppID, 10) 344 | queryMap["SecretId"] = synthesizer.Credential.SecretId 345 | queryMap["Timestamp"] = strconv.FormatInt(synthesizer.Timestamp, 10) 346 | queryMap["Expired"] = 
strconv.FormatInt(synthesizer.Expired, 10) 347 | if escape { 348 | //url escapes the string so it can be safely placed 349 | queryMap["Text"] = url.QueryEscape(synthesizer.Text) 350 | } else { 351 | queryMap["Text"] = synthesizer.Text 352 | } 353 | queryMap["FastVoiceType"] = synthesizer.FastVoiceType 354 | queryMap["SessionId"] = synthesizer.SessionId 355 | queryMap["ModelType"] = strconv.FormatInt(synthesizer.ModelType, 10) 356 | queryMap["VoiceType"] = strconv.FormatInt(synthesizer.VoiceType, 10) 357 | queryMap["SampleRate"] = strconv.FormatInt(synthesizer.SampleRate, 10) 358 | queryMap["Speed"] = strconv.FormatFloat(synthesizer.Speed, 'g', -1, 64) 359 | queryMap["Volume"] = strconv.FormatFloat(synthesizer.Volume, 'g', -1, 64) 360 | queryMap["Codec"] = synthesizer.Codec 361 | queryMap["EnableSubtitle"] = strconv.FormatBool(synthesizer.EnableSubtitle) 362 | queryMap["EmotionCategory"] = synthesizer.EmotionCategory 363 | queryMap["EmotionIntensity"] = strconv.FormatInt(synthesizer.EmotionIntensity, 10) 364 | queryMap["SegmentRate"] = strconv.FormatInt(synthesizer.SegmentRate, 10) 365 | for k, v := range synthesizer.ExtParam { 366 | queryMap[k] = v 367 | } 368 | var keys []string 369 | for k := range queryMap { 370 | keys = append(keys, k) 371 | } 372 | sort.Strings(keys) 373 | 374 | var queryStrBuffer bytes.Buffer 375 | for _, k := range keys { 376 | queryStrBuffer.WriteString(k) 377 | queryStrBuffer.WriteString("=") 378 | queryStrBuffer.WriteString(queryMap[k]) 379 | queryStrBuffer.WriteString("&") 380 | } 381 | rs := []rune(queryStrBuffer.String()) 382 | rsLen := len(rs) 383 | queryStr := string(rs[0 : rsLen-1]) 384 | serverURL := fmt.Sprintf("%s%s", wsHost, wsPath) 385 | signURL := fmt.Sprintf("%s?%s", serverURL, queryStr) 386 | return signURL 387 | } 388 | 389 | func (synthesizer *SpeechWsSynthesizer) genWsSignature(signURL string, secretKey string) string { 390 | hmac := hmac.New(sha1.New, []byte(secretKey)) 391 | signURL = "GET" + signURL 392 | 
hmac.Write([]byte(signURL)) 393 | encryptedStr := hmac.Sum([]byte(nil)) 394 | return base64.StdEncoding.EncodeToString(encryptedStr) 395 | } 396 | 397 | func (synthesizer *SpeechWsSynthesizer) genRecoverFunc() func() { 398 | return func() { 399 | if r := recover(); r != nil { 400 | var err error 401 | switch r := r.(type) { 402 | case error: 403 | err = r 404 | default: 405 | err = fmt.Errorf("%v", r) 406 | } 407 | retErr := fmt.Errorf("panic error ocurred! [err: %s] [stack: %s]", 408 | err.Error(), string(debug.Stack())) 409 | msg := SpeechWsSynthesisResponse{ 410 | SessionId: synthesizer.SessionId, 411 | } 412 | synthesizer.eventChan <- speechWsSynthesisEvent{ 413 | t: eventTypeWsFail, 414 | r: &msg, 415 | err: retErr, 416 | } 417 | } 418 | } 419 | } 420 | 421 | // CloseConn close connection 422 | func (synthesizer *SpeechWsSynthesizer) CloseConn() { 423 | synthesizer.closeConn() 424 | } 425 | 426 | func (synthesizer *SpeechWsSynthesizer) closeConn() { 427 | err := synthesizer.conn.Close() 428 | if err != nil && synthesizer.Debug && synthesizer.DebugFunc != nil { 429 | synthesizer.DebugFunc(fmt.Sprintf("%s %s", time.Now().String(), err.Error())) 430 | } 431 | } 432 | -------------------------------------------------------------------------------- /soe/speaking_assessment.go: -------------------------------------------------------------------------------- 1 | package soe 2 | 3 | import ( 4 | "bytes" 5 | "crypto/hmac" 6 | "crypto/sha1" 7 | "encoding/base64" 8 | "encoding/json" 9 | "fmt" 10 | "net/http" 11 | "net/url" 12 | "runtime/debug" 13 | "sort" 14 | "strconv" 15 | "strings" 16 | "sync" 17 | "time" 18 | 19 | "github.com/google/uuid" 20 | "github.com/gorilla/websocket" 21 | 22 | "github.com/tencentcloud/tencentcloud-speech-sdk-go/common" 23 | ) 24 | 25 | // SpeakingAssessmentListener User must impletement it. 
Get recognition result 26 | type SpeakingAssessmentListener interface { 27 | OnRecognitionStart(*SpeakingAssessmentResponse) 28 | OnRecognitionComplete(*SpeakingAssessmentResponse) 29 | OnIntermediateResults(*SpeakingAssessmentResponse) 30 | OnFail(*SpeakingAssessmentResponse, error) 31 | } 32 | 33 | // SpeakingAssessmentResponse is the reponse of asr service 34 | type SpeakingAssessmentResponse struct { 35 | Code int `json:"code"` 36 | Message string `json:"message"` 37 | VoiceID string `json:"voice_id,omitempty"` 38 | MessageID string `json:"message_id,omitempty"` 39 | Final uint32 `json:"final,omitempty"` 40 | Result SentenceInfo `json:"result"` 41 | } 42 | 43 | // SentenceInfo ... 44 | type SentenceInfo struct { 45 | SuggestedScore float64 `json:"SuggestedScore"` 46 | PronAccuracy float64 `json:"PronAccuracy"` 47 | PronFluency float64 `json:"PronFluency"` 48 | PronCompletion float64 `json:"PronCompletion"` 49 | Words []WordRsp `json:"Words"` 50 | SentenceId int64 `json:"SentenceId"` 51 | RefTextId int64 `json:"RefTextId"` 52 | KeyWordHits []float32 `json:"KeyWordHits"` 53 | UnKeyWordHits []float32 `json:"UnKeyWordHits"` 54 | } 55 | 56 | // PhoneInfoTypeRsp is a struct/interface 57 | type PhoneInfoTypeRsp struct { 58 | Mbtm int64 `json:"MemBeginTime"` 59 | Metm int64 `json:"MemEndTime"` 60 | PronAccuracy float64 `json:"PronAccuracy"` 61 | DetectedStress bool `json:"DetectedStress"` 62 | Phone string `json:"Phone"` 63 | ReferencePhone string `json:"ReferencePhone"` 64 | ReferenceLetter string `json:"ReferenceLetter"` 65 | Stress bool `json:"Stress"` 66 | Tag int64 `json:"MatchTag"` 67 | } 68 | 69 | // Tone 中文声调检测结果 70 | type Tone struct { 71 | Valid bool `json:"Valid"` 72 | RefTone int `json:"RefTone"` 73 | HypTone int `json:"HypothesisTone"` 74 | // Confidence float32 `json:"Confidence"` 75 | } 76 | 77 | // WordRsp is a struct/interface 78 | type WordRsp struct { 79 | Mbtm int64 `json:"MemBeginTime"` 80 | Metm int64 `json:"MemEndTime"` 81 | PronAccuracy float64 
`json:"PronAccuracy"` 82 | PronFluency float64 `json:"PronFluency"` 83 | ReferenceWord string `json:"ReferenceWord"` 84 | Word string `json:"Word"` 85 | Tag int64 `json:"MatchTag"` 86 | KeywordTag int64 `json:"KeywordTag"` 87 | PhoneInfo []PhoneInfoTypeRsp `json:"PhoneInfos"` 88 | Tone Tone `json:"Tone"` 89 | } 90 | 91 | // AudioFormat type 92 | const ( 93 | AudioFormatPCM = 0 94 | AudioFormatWav = 1 95 | AudioFormatMp3 = 2 96 | AudioFormatSilk = 3 97 | AudioFormatSpeex = 4 98 | ) 99 | 100 | // SpeechRecognizer is the entry for ASR service 101 | type SpeechRecognizer struct { 102 | //request params 103 | AppID string 104 | VoiceFormat int 105 | End int 106 | Timestamp int 107 | Nonce int 108 | Signature string 109 | VoiceData []byte 110 | Expired int 111 | TextMode int64 112 | RefText string 113 | Keyword string 114 | EvalMode int64 115 | ScoreCoeff float64 116 | ServerEngineType string 117 | SentenceInfoEnabled int64 118 | // 录音识别模式,0:实时识别 1:录音识别 119 | // 录音识别下可发送单个大长度分片,但是单次连接只能发一个分片,对音频的大小有限制,得到识别结果后需要重新建立连接 120 | // 推荐使用实时识别模式 121 | RecMode int 122 | 123 | Credential *common.Credential 124 | //listener 125 | listener SpeakingAssessmentListener 126 | //uuid for voice 127 | VoiceID string 128 | //for proxy 129 | ProxyURL string 130 | //for websocet connection 131 | conn *websocket.Conn 132 | //send data channel 133 | dataChan chan []byte 134 | //for listener get response message 135 | eventChan chan speechRecognitionEvent 136 | 137 | //used in stop function, waiting for stop all goroutines 138 | sendEnd chan int 139 | receiveEnd chan int 140 | eventEnd chan int 141 | 142 | mutex sync.Mutex 143 | started bool 144 | hasEnd bool 145 | } 146 | 147 | const ( 148 | defaultVoiceFormat = 1 149 | 150 | protocol = "wss" 151 | host = "soe.cloud.tencent.com" 152 | path = "soe/api" 153 | ) 154 | 155 | const ( 156 | eventTypeRecognitionStart = 1 157 | eventTypeIntermediateResults = 2 158 | eventTypeRecognitionComplete = 3 159 | eventTypeFail = 4 160 | ) 161 | 162 | type 
eventType int 163 | 164 | type speechRecognitionEvent struct { 165 | t eventType 166 | r *SpeakingAssessmentResponse 167 | err error 168 | } 169 | 170 | // NewSpeechRecognizer creates instance of SpeechRecognizer 171 | func NewSpeechRecognizer(appID string, credential *common.Credential, 172 | listener SpeakingAssessmentListener) *SpeechRecognizer { 173 | 174 | reco := &SpeechRecognizer{ 175 | AppID: appID, 176 | VoiceFormat: defaultVoiceFormat, 177 | End: 0, 178 | Timestamp: 0, 179 | Nonce: 0, 180 | Signature: "", 181 | VoiceData: nil, 182 | Expired: 0, 183 | TextMode: 0, 184 | RefText: "", 185 | Keyword: "", 186 | EvalMode: 0, 187 | ScoreCoeff: 1.0, 188 | RecMode: 0, 189 | ServerEngineType: "16k_en", 190 | SentenceInfoEnabled: 0, 191 | Credential: credential, 192 | listener: listener, 193 | VoiceID: "", 194 | ProxyURL: "", 195 | conn: nil, 196 | dataChan: make(chan []byte, 6400), 197 | eventChan: make(chan speechRecognitionEvent, 10), 198 | sendEnd: make(chan int), 199 | receiveEnd: make(chan int), 200 | eventEnd: make(chan int), 201 | mutex: sync.Mutex{}, 202 | started: false, 203 | hasEnd: false, 204 | } 205 | return reco 206 | } 207 | 208 | // Start connects to server and start a recognition session 209 | func (recognizer *SpeechRecognizer) Start() error { 210 | recognizer.mutex.Lock() 211 | defer recognizer.mutex.Unlock() 212 | 213 | if recognizer.started { 214 | return fmt.Errorf("recognizer is already started") 215 | } 216 | if recognizer.VoiceID == "" { 217 | voiceID := uuid.New().String() 218 | recognizer.VoiceID = voiceID 219 | } 220 | serverURL := recognizer.buildURL(recognizer.VoiceID) 221 | signature := recognizer.genSignature(serverURL) 222 | serverURL = serverURL[strings.Index(serverURL, "?")+1:] 223 | //请求参数进行转义 224 | serverURL = fmt.Sprintf("%s/%s/%s?%s", host, path, recognizer.AppID, url.PathEscape(serverURL)) 225 | dialer := websocket.Dialer{} 226 | if len(recognizer.ProxyURL) > 0 { 227 | proxyURL, _ := url.Parse(recognizer.ProxyURL) 228 | 
dialer.Proxy = http.ProxyURL(proxyURL) 229 | } 230 | 231 | header := http.Header(make(map[string][]string)) 232 | urlStr := fmt.Sprintf("%s://%s&signature=%s", protocol, serverURL, url.QueryEscape(signature)) 233 | conn, _, err := dialer.Dial(urlStr, header) 234 | if err != nil { 235 | return fmt.Errorf("voice_id: %s, error: %s", recognizer.VoiceID, err.Error()) 236 | } 237 | _, data, err := conn.ReadMessage() 238 | if err != nil { 239 | conn.Close() 240 | return fmt.Errorf("voice_id: %s, error: %s", recognizer.VoiceID, err.Error()) 241 | } 242 | msg := SpeakingAssessmentResponse{} 243 | err = json.Unmarshal(data, &msg) 244 | if err != nil { 245 | conn.Close() 246 | return fmt.Errorf("voice_id: %s, error: %s", recognizer.VoiceID, err.Error()) 247 | } 248 | if msg.Code != 0 { 249 | conn.Close() 250 | return fmt.Errorf("voice_id: %s, code: %d, message: %s", 251 | recognizer.VoiceID, msg.Code, msg.Message) 252 | } 253 | 254 | recognizer.conn = conn 255 | go recognizer.send() 256 | go recognizer.receive() 257 | go recognizer.eventDispatch() 258 | recognizer.started = true 259 | 260 | recognizer.eventChan <- speechRecognitionEvent{ 261 | t: eventTypeRecognitionStart, 262 | r: newSpeechRecognitionResponse(0, "success", recognizer.VoiceID, 263 | fmt.Sprintf("%s-RecognitionStart", recognizer.VoiceID), 0), 264 | err: nil, 265 | } 266 | return nil 267 | } 268 | 269 | // Write : write data in channel 270 | func (recognizer *SpeechRecognizer) Write(data []byte) error { 271 | recognizer.mutex.Lock() 272 | defer recognizer.mutex.Unlock() 273 | if !recognizer.started { 274 | return fmt.Errorf("recognizer not running") 275 | } 276 | recognizer.dataChan <- data 277 | return nil 278 | } 279 | 280 | // Stop wait for the recognition process to complete 281 | func (recognizer *SpeechRecognizer) Stop() error { 282 | err := recognizer.stopInternal() 283 | if err != nil { 284 | return err 285 | } 286 | return nil 287 | } 288 | 289 | func (recognizer *SpeechRecognizer) stopInternal() error 
{ 290 | recognizer.mutex.Lock() 291 | defer recognizer.mutex.Unlock() 292 | if !recognizer.started { 293 | return fmt.Errorf("recognizer is not running") 294 | } 295 | close(recognizer.dataChan) 296 | <-recognizer.receiveEnd 297 | <-recognizer.sendEnd 298 | <-recognizer.eventEnd 299 | recognizer.started = false 300 | err := recognizer.conn.Close() 301 | if err != nil { 302 | return err 303 | } 304 | return nil 305 | } 306 | 307 | func (recognizer *SpeechRecognizer) onError(code int, message string, err error) { 308 | if !recognizer.started { 309 | return 310 | } 311 | recognizer.listener.OnFail(newSpeechRecognitionResponse(code, message, recognizer.VoiceID, 312 | fmt.Sprintf("%s-Error", recognizer.VoiceID), 0), err) 313 | go recognizer.stopInternal() 314 | } 315 | 316 | func (recognizer *SpeechRecognizer) send() { 317 | defer func() { 318 | // handle panic 319 | recognizer.genRecoverFunc()() 320 | close(recognizer.sendEnd) 321 | }() 322 | //send data 323 | for data := range recognizer.dataChan { 324 | if err := recognizer.conn.WriteMessage(websocket.BinaryMessage, data); err != nil { 325 | recognizer.onError(-1, "send error", fmt.Errorf("voice_id: %s, error: %s", 326 | recognizer.VoiceID, err.Error())) 327 | return 328 | } 329 | } 330 | //send stop msg 331 | if err := recognizer.conn.WriteMessage(websocket.TextMessage, []byte("{\"type\":\"end\"}")); err != nil { 332 | recognizer.onError(-1, "send error", fmt.Errorf("voice_id: %s, error: %s", 333 | recognizer.VoiceID, err.Error())) 334 | } 335 | } 336 | 337 | func (recognizer *SpeechRecognizer) eventDispatch() { 338 | defer func() { 339 | // handle panic 340 | recognizer.genRecoverFunc()() 341 | close(recognizer.eventEnd) 342 | }() 343 | for e := range recognizer.eventChan { 344 | switch e.t { 345 | case eventTypeRecognitionStart: 346 | recognizer.listener.OnRecognitionStart(e.r) 347 | case eventTypeIntermediateResults: 348 | recognizer.listener.OnIntermediateResults(e.r) 349 | case eventTypeRecognitionComplete: 350 
| recognizer.listener.OnRecognitionComplete(e.r) 351 | case eventTypeFail: 352 | recognizer.listener.OnFail(e.r, e.err) 353 | } 354 | } 355 | } 356 | 357 | func (recognizer *SpeechRecognizer) receive() { 358 | defer func() { 359 | // handle panic 360 | recognizer.genRecoverFunc()() 361 | close(recognizer.eventChan) 362 | close(recognizer.receiveEnd) 363 | }() 364 | for { 365 | _, data, err := recognizer.conn.ReadMessage() 366 | if err != nil { 367 | recognizer.onError(-1, "receive error", fmt.Errorf("voice_id: %s, error: %s", recognizer.VoiceID, err.Error())) 368 | break 369 | } 370 | 371 | //fmt.Printf("%s", data) 372 | msg := SpeakingAssessmentResponse{} 373 | err = json.Unmarshal(data, &msg) 374 | if err != nil { 375 | recognizer.onError(-1, "receive error", 376 | fmt.Errorf("voice_id: %s, error: %s", 377 | recognizer.VoiceID, err.Error())) 378 | break 379 | } 380 | if msg.Code != 0 { 381 | recognizer.onError(msg.Code, msg.Message, 382 | fmt.Errorf("VoiceID: %s, error code %d, message: %s", 383 | recognizer.VoiceID, msg.Code, msg.Message)) 384 | break 385 | } 386 | if msg.Final == 1 { 387 | recognizer.hasEnd = true 388 | recognizer.eventChan <- speechRecognitionEvent{ 389 | t: eventTypeRecognitionComplete, 390 | r: &msg, 391 | err: nil, 392 | } 393 | break 394 | } else { 395 | recognizer.eventChan <- speechRecognitionEvent{ 396 | t: eventTypeIntermediateResults, 397 | r: &msg, 398 | err: nil, 399 | } 400 | } 401 | } 402 | } 403 | 404 | func (recognizer *SpeechRecognizer) buildURL(voiceID string) string { 405 | var queryMap = make(map[string]string) 406 | queryMap["secretid"] = recognizer.Credential.SecretId 407 | // token参数用于临时秘钥鉴权 408 | if recognizer.Credential.Token != "" { 409 | queryMap["token"] = recognizer.Credential.Token 410 | } 411 | var timestamp = time.Now().Unix() 412 | var timestampStr = strconv.FormatInt(timestamp, 10) 413 | queryMap["timestamp"] = timestampStr 414 | queryMap["expired"] = strconv.FormatInt(timestamp+24*60*60, 10) 415 | 
queryMap["nonce"] = timestampStr 416 | //params 417 | queryMap["voice_id"] = voiceID 418 | queryMap["voice_format"] = strconv.FormatInt(int64(recognizer.VoiceFormat), 10) 419 | queryMap["text_mode"] = strconv.FormatInt(recognizer.TextMode, 10) 420 | queryMap["ref_text"] = recognizer.RefText 421 | queryMap["keyword"] = recognizer.Keyword 422 | queryMap["eval_mode"] = strconv.FormatInt(recognizer.EvalMode, 10) 423 | queryMap["score_coeff"] = fmt.Sprintf("%1f", recognizer.ScoreCoeff) 424 | queryMap["server_engine_type"] = recognizer.ServerEngineType 425 | queryMap["sentence_info_enabled"] = strconv.FormatInt(int64(recognizer.SentenceInfoEnabled), 10) 426 | queryMap["rec_mode"] = strconv.FormatInt(int64(recognizer.RecMode), 10) 427 | 428 | var keys []string 429 | for k := range queryMap { 430 | keys = append(keys, k) 431 | } 432 | sort.Strings(keys) 433 | 434 | var queryStrBuffer bytes.Buffer 435 | for _, k := range keys { 436 | queryStrBuffer.WriteString(k) 437 | queryStrBuffer.WriteString("=") 438 | queryStrBuffer.WriteString(queryMap[k]) 439 | queryStrBuffer.WriteString("&") 440 | } 441 | 442 | rs := []rune(queryStrBuffer.String()) 443 | rsLen := len(rs) 444 | queryStr := string(rs[0 : rsLen-1]) 445 | 446 | //gen url 447 | url := fmt.Sprintf("%s/%s/%s?%s", host, path, recognizer.AppID, queryStr) 448 | return url 449 | } 450 | 451 | func (recognizer *SpeechRecognizer) genSignature(url string) string { 452 | hmac := hmac.New(sha1.New, []byte(recognizer.Credential.SecretKey)) 453 | signURL := url 454 | hmac.Write([]byte(signURL)) 455 | encryptedStr := hmac.Sum([]byte(nil)) 456 | var signature = base64.StdEncoding.EncodeToString(encryptedStr) 457 | 458 | return signature 459 | } 460 | 461 | func newSpeechRecognitionResponse(code int, message string, voiceID string, 462 | messageID string, final uint32) *SpeakingAssessmentResponse { 463 | return &SpeakingAssessmentResponse{ 464 | Code: code, 465 | Message: message, 466 | VoiceID: voiceID, 467 | MessageID: messageID, 468 
| Final: final, 469 | } 470 | } 471 | 472 | func (recognizer *SpeechRecognizer) genRecoverFunc() func() { 473 | return func() { 474 | if r := recover(); r != nil { 475 | var err error 476 | switch r := r.(type) { 477 | case error: 478 | err = r 479 | default: 480 | err = fmt.Errorf("%v", r) 481 | } 482 | retErr := fmt.Errorf("panic error ocurred! [err: %s] [stack: %s]", 483 | err.Error(), string(debug.Stack())) 484 | recognizer.eventChan <- speechRecognitionEvent{ 485 | t: eventTypeFail, 486 | r: newSpeechRecognitionResponse(-1, "panic error", recognizer.VoiceID, 487 | fmt.Sprintf("%s-Error", recognizer.VoiceID), 0), 488 | err: retErr, 489 | } 490 | } 491 | } 492 | } 493 | -------------------------------------------------------------------------------- /asr/speechrecognizer.go: -------------------------------------------------------------------------------- 1 | package asr 2 | 3 | import ( 4 | "bytes" 5 | "crypto/hmac" 6 | "crypto/sha1" 7 | "encoding/base64" 8 | "encoding/json" 9 | "fmt" 10 | "net/http" 11 | "net/url" 12 | "runtime/debug" 13 | "sort" 14 | "strconv" 15 | "sync" 16 | "time" 17 | 18 | "github.com/google/uuid" 19 | "github.com/gorilla/websocket" 20 | 21 | "github.com/tencentcloud/tencentcloud-speech-sdk-go/common" 22 | ) 23 | 24 | // SpeechRecognitionListener User must impletement it. 
Get recognition result 25 | type SpeechRecognitionListener interface { 26 | OnRecognitionStart(*SpeechRecognitionResponse) 27 | OnSentenceBegin(*SpeechRecognitionResponse) 28 | OnRecognitionResultChange(*SpeechRecognitionResponse) 29 | OnSentenceEnd(*SpeechRecognitionResponse) 30 | OnRecognitionComplete(*SpeechRecognitionResponse) 31 | OnFail(*SpeechRecognitionResponse, error) 32 | } 33 | 34 | // SpeechRecognitionResponse is the reponse of asr service 35 | type SpeechRecognitionResponse struct { 36 | Code int `json:"code"` 37 | Message string `json:"message"` 38 | VoiceID string `json:"voice_id,omitempty"` 39 | MessageID string `json:"message_id,omitempty"` 40 | Final uint32 `json:"final,omitempty"` 41 | Result SpeechRecognitionResponseResult `json:"result,omitempty"` 42 | } 43 | 44 | // SpeechRecognitionResponseResult SpeechRecognitionResponseResult 45 | type SpeechRecognitionResponseResult struct { 46 | SliceType uint32 `json:"slice_type"` 47 | Index int `json:"index"` 48 | StartTime uint32 `json:"start_time"` 49 | EndTime uint32 `json:"end_time"` 50 | VoiceTextStr string `json:"voice_text_str"` 51 | WordSize uint32 `json:"word_size"` 52 | WordList []SpeechRecognitionResponseResultWord `json:"word_list"` 53 | } 54 | 55 | // SpeechRecognitionResponseResultWord SpeechRecognitionResponseResultWord 56 | type SpeechRecognitionResponseResultWord struct { 57 | Word string `json:"word"` 58 | StartTime uint32 `json:"start_time"` 59 | EndTime uint32 `json:"end_time"` 60 | StableFlag uint32 `json:"stable_flag"` 61 | } 62 | 63 | // AudioFormat type 64 | const ( 65 | AudioFormatPCM = 1 66 | AudioFormatSpeex = 4 67 | AudioFormatSilk = 6 68 | AudioFormatMp3 = 8 69 | AudioFormatOpus = 10 70 | AudioFormatWav = 12 71 | AudioFormatM4A = 14 72 | AudioFormatAAC = 16 73 | ) 74 | 75 | // SpeechRecognizer is the entry for ASR service 76 | type SpeechRecognizer struct { 77 | //request params 78 | AppID string 79 | EngineModelType string 80 | VoiceFormat int 81 | NeedVad int 82 | 
HotwordId string 83 | HotwordList string 84 | CustomizationId string 85 | FilterDirty int 86 | FilterModal int 87 | FilterPunc int 88 | ConvertNumMode int 89 | WordInfo int 90 | VadSilenceTime int 91 | ReinforceHotword int 92 | NoiseThreshold float64 93 | FilterEmptyResult int 94 | MaxSpeakTime int 95 | ReplaceTextId string 96 | ChatVadEnable int 97 | 98 | Credential *common.Credential 99 | //listener 100 | listener SpeechRecognitionListener 101 | //uuid for voice 102 | VoiceID string 103 | 104 | //for proxy 105 | ProxyURL string 106 | 107 | //for websocet connection 108 | conn *websocket.Conn 109 | 110 | //send data channel 111 | dataChan chan []byte 112 | //for listener get response message 113 | eventChan chan speechRecognitionEvent 114 | 115 | //used in stop function, waiting for stop all goroutines 116 | sendEnd chan int 117 | receiveEnd chan int 118 | eventEnd chan int 119 | 120 | mutex sync.Mutex 121 | started bool 122 | } 123 | 124 | const ( 125 | defaultVoiceFormat = 1 126 | defaultNeedVad = 1 127 | defaultWordInfo = 0 128 | defaultFilterDirty = 0 129 | defaultFilterModal = 0 130 | defaultFilterPunc = 0 131 | defaultConvertNumMode = 1 132 | defaultReinforceHotword = 0 133 | defaultFilterEmptyResult = 1 134 | defaultMaxSpeakTime = 0 135 | 136 | protocol = "wss" 137 | host = "asr.cloud.tencent.com" 138 | path = "" 139 | ) 140 | 141 | const ( 142 | eventTypeRecognitionStart = 0 143 | eventTypeSentenceBegin = 1 144 | eventTypeRecognitionResultChange = 2 145 | eventTypeSentenceEnd = 3 146 | eventTypeRecognitionComplete = 4 147 | eventTypeFail = 5 148 | ) 149 | 150 | type eventType int 151 | 152 | type speechRecognitionEvent struct { 153 | t eventType 154 | r *SpeechRecognitionResponse 155 | err error 156 | } 157 | 158 | // NewSpeechRecognizer creates instance of SpeechRecognizer 159 | func NewSpeechRecognizer(appID string, credential *common.Credential, engineModelType string, 160 | listener SpeechRecognitionListener) *SpeechRecognizer { 161 | 162 | reco := 
&SpeechRecognizer{ 163 | AppID: appID, 164 | Credential: credential, 165 | EngineModelType: engineModelType, 166 | VoiceFormat: defaultVoiceFormat, 167 | NeedVad: defaultNeedVad, 168 | FilterDirty: defaultFilterDirty, 169 | FilterModal: defaultFilterModal, 170 | FilterPunc: defaultFilterPunc, 171 | ConvertNumMode: defaultConvertNumMode, 172 | WordInfo: defaultWordInfo, 173 | ReinforceHotword: defaultReinforceHotword, 174 | FilterEmptyResult: defaultFilterEmptyResult, 175 | MaxSpeakTime: defaultMaxSpeakTime, 176 | 177 | dataChan: make(chan []byte, 6400), 178 | eventChan: make(chan speechRecognitionEvent, 10), 179 | 180 | sendEnd: make(chan int), 181 | receiveEnd: make(chan int), 182 | eventEnd: make(chan int), 183 | 184 | listener: listener, 185 | started: false, 186 | } 187 | return reco 188 | } 189 | 190 | // Start connects to server and start a recognition session 191 | func (recognizer *SpeechRecognizer) Start() error { 192 | recognizer.mutex.Lock() 193 | defer recognizer.mutex.Unlock() 194 | 195 | if recognizer.started { 196 | return fmt.Errorf("recognizer is already started") 197 | } 198 | if recognizer.VoiceID == "" { 199 | voiceID := uuid.New().String() 200 | recognizer.VoiceID = voiceID 201 | } 202 | serverURL := recognizer.buildURL(recognizer.VoiceID) 203 | signature := recognizer.genSignature(serverURL) 204 | 205 | dialer := websocket.Dialer{} 206 | if len(recognizer.ProxyURL) > 0 { 207 | proxyURL, _ := url.Parse(recognizer.ProxyURL) 208 | dialer.Proxy = http.ProxyURL(proxyURL) 209 | } 210 | 211 | header := http.Header(make(map[string][]string)) 212 | urlStr := fmt.Sprintf("%s://%s&signature=%s", protocol, serverURL, url.QueryEscape(signature)) 213 | conn, _, err := dialer.Dial(urlStr, header) 214 | if err != nil { 215 | return fmt.Errorf("voice_id: %s, error: %s", recognizer.VoiceID, err.Error()) 216 | } 217 | _, data, err := conn.ReadMessage() 218 | if err != nil { 219 | conn.Close() 220 | return fmt.Errorf("voice_id: %s, error: %s", recognizer.VoiceID, 
err.Error()) 221 | } 222 | msg := SpeechRecognitionResponse{} 223 | err = json.Unmarshal(data, &msg) 224 | if err != nil { 225 | conn.Close() 226 | return fmt.Errorf("voice_id: %s, error: %s", recognizer.VoiceID, err.Error()) 227 | } 228 | if msg.Code != 0 { 229 | conn.Close() 230 | return fmt.Errorf("voice_id: %s, code: %d, message: %s", 231 | recognizer.VoiceID, msg.Code, msg.Message) 232 | } 233 | 234 | recognizer.conn = conn 235 | go recognizer.send() 236 | go recognizer.receive() 237 | go recognizer.eventDispatch() 238 | recognizer.started = true 239 | 240 | recognizer.eventChan <- speechRecognitionEvent{ 241 | t: eventTypeRecognitionStart, 242 | r: newSpeechRecognitionResponse(0, "sucess", recognizer.VoiceID, 243 | fmt.Sprintf("%s-RecognitionStart", recognizer.VoiceID), 0), 244 | err: nil, 245 | } 246 | return nil 247 | } 248 | 249 | // Write : write data in channel 250 | func (recognizer *SpeechRecognizer) Write(data []byte) error { 251 | recognizer.mutex.Lock() 252 | defer recognizer.mutex.Unlock() 253 | if !recognizer.started { 254 | return fmt.Errorf("recognizer not running") 255 | } 256 | 257 | recognizer.dataChan <- data 258 | return nil 259 | } 260 | 261 | // Stop wait for the recognition process to complete 262 | func (recognizer *SpeechRecognizer) Stop() error { 263 | err := recognizer.stopInternal() 264 | if err != nil { 265 | return err 266 | } 267 | return nil 268 | } 269 | 270 | func (recognizer *SpeechRecognizer) stopInternal() error { 271 | recognizer.mutex.Lock() 272 | defer recognizer.mutex.Unlock() 273 | if !recognizer.started { 274 | return fmt.Errorf("recognizer is not running") 275 | } 276 | close(recognizer.dataChan) 277 | <-recognizer.receiveEnd 278 | <-recognizer.sendEnd 279 | <-recognizer.eventEnd 280 | recognizer.started = false 281 | return nil 282 | } 283 | 284 | func (recognizer *SpeechRecognizer) onError(code int, message string, err error) { 285 | //recognizer.mutex.Lock() 286 | if !recognizer.started { 287 | return 288 | } 289 
| 290 | recognizer.listener.OnFail(newSpeechRecognitionResponse(code, message, recognizer.VoiceID, 291 | fmt.Sprintf("%s-Error", recognizer.VoiceID), 0), err) 292 | /* 293 | recognizer.eventChan <- speechRecognitionEvent{ 294 | t: eventTypeFail, 295 | r: newSpeechRecognitionResponse(code, message, recognizer.VoiceID, 296 | fmt.Sprintf("%s-Error", recognizer.VoiceID), 0), 297 | err: err, 298 | } 299 | recognizer.mutex.Unlock() 300 | */ 301 | go recognizer.stopInternal() 302 | } 303 | 304 | func (recognizer *SpeechRecognizer) send() { 305 | defer func() { 306 | // handle panic 307 | recognizer.genRecoverFunc()() 308 | close(recognizer.sendEnd) 309 | }() 310 | //send data 311 | for data := range recognizer.dataChan { 312 | if err := recognizer.conn.WriteMessage(websocket.BinaryMessage, data); err != nil { 313 | recognizer.onError(-1, "send error", fmt.Errorf("voice_id: %s, error: %s", 314 | recognizer.VoiceID, err.Error())) 315 | return 316 | } 317 | } 318 | //send stop msg 319 | if err := recognizer.conn.WriteMessage(websocket.TextMessage, []byte("{\"type\":\"end\"}")); err != nil { 320 | recognizer.onError(-1, "send error", fmt.Errorf("voice_id: %s, error: %s", 321 | recognizer.VoiceID, err.Error())) 322 | } 323 | } 324 | 325 | func (recognizer *SpeechRecognizer) eventDispatch() { 326 | defer func() { 327 | // handle panic 328 | recognizer.genRecoverFunc()() 329 | close(recognizer.eventEnd) 330 | }() 331 | for e := range recognizer.eventChan { 332 | switch e.t { 333 | case eventTypeRecognitionStart: 334 | recognizer.listener.OnRecognitionStart(e.r) 335 | case eventTypeSentenceBegin: 336 | recognizer.listener.OnSentenceBegin(e.r) 337 | case eventTypeRecognitionResultChange: 338 | recognizer.listener.OnRecognitionResultChange(e.r) 339 | case eventTypeSentenceEnd: 340 | recognizer.listener.OnSentenceEnd(e.r) 341 | case eventTypeRecognitionComplete: 342 | recognizer.listener.OnRecognitionComplete(e.r) 343 | case eventTypeFail: 344 | recognizer.listener.OnFail(e.r, 
e.err) 345 | } 346 | } 347 | } 348 | 349 | func (recognizer *SpeechRecognizer) receive() { 350 | defer func() { 351 | // handle panic 352 | recognizer.genRecoverFunc()() 353 | close(recognizer.eventChan) 354 | close(recognizer.receiveEnd) 355 | }() 356 | index := -1 357 | for { 358 | _, data, err := recognizer.conn.ReadMessage() 359 | if err != nil { 360 | recognizer.onError(-1, "receive error", fmt.Errorf("voice_id: %s, error: %s", recognizer.VoiceID, err.Error())) 361 | break 362 | } 363 | 364 | //fmt.Printf("%s", data) 365 | msg := SpeechRecognitionResponse{} 366 | err = json.Unmarshal(data, &msg) 367 | if err != nil { 368 | recognizer.onError(-1, "receive error", 369 | fmt.Errorf("voice_id: %s, error: %s", 370 | recognizer.VoiceID, err.Error())) 371 | break 372 | } 373 | if msg.Code != 0 { 374 | recognizer.onError(msg.Code, msg.Message, 375 | fmt.Errorf("VoiceID: %s, error code %d, message: %s", 376 | recognizer.VoiceID, msg.Code, msg.Message)) 377 | break 378 | } 379 | 380 | if msg.Final == 1 { 381 | recognizer.eventChan <- speechRecognitionEvent{ 382 | t: eventTypeRecognitionComplete, 383 | r: &msg, 384 | err: nil, 385 | } 386 | break 387 | } 388 | 389 | beginOrEnd := false 390 | if msg.Result.Index != index || msg.Result.SliceType == 0 { 391 | index = msg.Result.Index 392 | recognizer.eventChan <- speechRecognitionEvent{ 393 | t: eventTypeSentenceBegin, 394 | r: &msg, 395 | err: nil, 396 | } 397 | beginOrEnd = true 398 | } 399 | if msg.Result.SliceType == 2 { 400 | recognizer.eventChan <- speechRecognitionEvent{ 401 | t: eventTypeSentenceEnd, 402 | r: &msg, 403 | err: nil, 404 | } 405 | beginOrEnd = true 406 | } 407 | if !beginOrEnd { 408 | recognizer.eventChan <- speechRecognitionEvent{ 409 | t: eventTypeRecognitionResultChange, 410 | r: &msg, 411 | err: nil, 412 | } 413 | } 414 | } 415 | } 416 | 417 | func (recognizer *SpeechRecognizer) buildURL(voiceID string) string { 418 | var queryMap = make(map[string]string) 419 | queryMap["secretid"] = 
recognizer.Credential.SecretId 420 | var timestamp = time.Now().Unix() 421 | var timestampStr = strconv.FormatInt(timestamp, 10) 422 | queryMap["timestamp"] = timestampStr 423 | queryMap["expired"] = strconv.FormatInt(timestamp+24*60*60, 10) 424 | queryMap["nonce"] = timestampStr 425 | 426 | //params 427 | queryMap["engine_model_type"] = recognizer.EngineModelType 428 | queryMap["voice_id"] = voiceID 429 | queryMap["voice_format"] = strconv.FormatInt(int64(recognizer.VoiceFormat), 10) 430 | queryMap["needvad"] = strconv.FormatInt(int64(recognizer.NeedVad), 10) 431 | if recognizer.HotwordId != "" { 432 | queryMap["hotword_id"] = recognizer.HotwordId 433 | } 434 | if recognizer.HotwordList != "" { 435 | queryMap["hotword_list"] = recognizer.HotwordList 436 | } 437 | if recognizer.CustomizationId != "" { 438 | queryMap["customization_id"] = recognizer.CustomizationId 439 | } 440 | 441 | if recognizer.ReplaceTextId != "" { 442 | queryMap["replace_text_id"] = recognizer.ReplaceTextId 443 | } 444 | 445 | queryMap["filter_dirty"] = strconv.FormatInt(int64(recognizer.FilterDirty), 10) 446 | queryMap["filter_modal"] = strconv.FormatInt(int64(recognizer.FilterModal), 10) 447 | queryMap["filter_punc"] = strconv.FormatInt(int64(recognizer.FilterPunc), 10) 448 | queryMap["filter_empty_result"] = strconv.FormatInt(int64(recognizer.FilterEmptyResult), 10) 449 | queryMap["convert_num_mode"] = strconv.FormatInt(int64(recognizer.ConvertNumMode), 10) 450 | queryMap["word_info"] = strconv.FormatInt(int64(recognizer.WordInfo), 10) 451 | queryMap["reinforce_hotword"] = strconv.FormatInt(int64(recognizer.ReinforceHotword), 10) 452 | queryMap["max_speak_time"] = strconv.FormatInt(int64(recognizer.MaxSpeakTime), 10) 453 | if recognizer.VadSilenceTime > 0 { 454 | queryMap["vad_silence_time"] = strconv.FormatInt(int64(recognizer.VadSilenceTime), 10) 455 | } 456 | if recognizer.NoiseThreshold != 0 { 457 | queryMap["noise_threshold"] = strconv.FormatFloat(recognizer.NoiseThreshold, 'f', 3, 64) 
458 | } 459 | if recognizer.ChatVadEnable > 0 { 460 | queryMap["chat_vad_enable"] = strconv.FormatInt(int64(recognizer.ChatVadEnable), 10) 461 | } 462 | 463 | var keys []string 464 | for k := range queryMap { 465 | keys = append(keys, k) 466 | } 467 | sort.Strings(keys) 468 | 469 | var queryStrBuffer bytes.Buffer 470 | for _, k := range keys { 471 | queryStrBuffer.WriteString(k) 472 | queryStrBuffer.WriteString("=") 473 | queryStrBuffer.WriteString(queryMap[k]) 474 | queryStrBuffer.WriteString("&") 475 | } 476 | 477 | rs := []rune(queryStrBuffer.String()) 478 | rsLen := len(rs) 479 | queryStr := string(rs[0 : rsLen-1]) 480 | 481 | //gen url 482 | url := fmt.Sprintf("%s/asr/v2/%s?%s", host, recognizer.AppID, queryStr) 483 | return url 484 | } 485 | 486 | func (recognizer *SpeechRecognizer) genSignature(url string) string { 487 | hmac := hmac.New(sha1.New, []byte(recognizer.Credential.SecretKey)) 488 | signURL := url 489 | hmac.Write([]byte(signURL)) 490 | encryptedStr := hmac.Sum([]byte(nil)) 491 | var signature = base64.StdEncoding.EncodeToString(encryptedStr) 492 | 493 | return signature 494 | } 495 | 496 | func newSpeechRecognitionResponse(code int, message string, voiceID string, 497 | messageID string, final uint32) *SpeechRecognitionResponse { 498 | return &SpeechRecognitionResponse{ 499 | Code: code, 500 | Message: message, 501 | VoiceID: voiceID, 502 | MessageID: messageID, 503 | Final: final, 504 | } 505 | } 506 | 507 | func (recognizer *SpeechRecognizer) genRecoverFunc() func() { 508 | return func() { 509 | if r := recover(); r != nil { 510 | var err error 511 | switch r := r.(type) { 512 | case error: 513 | err = r 514 | default: 515 | err = fmt.Errorf("%v", r) 516 | } 517 | retErr := fmt.Errorf("panic error ocurred! 
[err: %s] [stack: %s]", 518 | err.Error(), string(debug.Stack())) 519 | recognizer.eventChan <- speechRecognitionEvent{ 520 | t: eventTypeFail, 521 | r: newSpeechRecognitionResponse(-1, "panic error", recognizer.VoiceID, 522 | fmt.Sprintf("%s-Error", recognizer.VoiceID), 0), 523 | err: retErr, 524 | } 525 | } 526 | } 527 | } 528 | --------------------------------------------------------------------------------