├── LICENSE
├── README.md
├── cpp
│   ├── .gitignore
│   ├── CMakeLists.txt
│   ├── asr.cpp
│   ├── build.sh
│   ├── test.sh
│   └── tts.cpp
├── go
│   ├── BasicProtobuf
│   │   └── basic.pb.go
│   ├── VoiceProxyProtobuf
│   │   └── voiceproxy.pb.go
│   ├── asr-client.go
│   └── asr-client.osx
├── php
│   └── req.php
├── proto
│   ├── basic.proto
│   ├── tts.proto
│   ├── ttsbackend.proto
│   └── voiceproxy.proto
├── python
│   ├── .gitignore
│   ├── README.txt
│   ├── advanced_callback_example.py
│   ├── advanced_callback_splitter.py
│   ├── asrclient-cli.py
│   ├── asrclient
│   │   ├── __init__.py
│   │   ├── basic.proto
│   │   ├── client.py
│   │   ├── transport.py
│   │   ├── tts.proto
│   │   ├── ttsbackend.proto
│   │   ├── ttsclient.py
│   │   └── voiceproxy.proto
│   ├── setup.py
│   └── ttsclient-cli.py
└── webspeechkit
    ├── README.md
    └── src
        ├── equalizer.js
        ├── recognizer.js
        ├── recorder.js
        ├── recorderWorker.js
        ├── speechrecognition.js
        ├── textline.js
        └── tts.js

/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 | 
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 | 
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 | 
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 | 
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 | 
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 | 
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 | 
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 | 
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 | 
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 | 
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 | 
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 | 
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 | 
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 | 
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 | 
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 | 
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 | 
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 | 
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 | 
176 | END OF TERMS AND CONDITIONS
177 | 
178 | APPENDIX: How to apply the Apache License to your work.
179 | 
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 | 
189 | Copyright {yyyy} {name of copyright owner}
190 | 
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 | 
195 | http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yandex/speechkitcloud/817e2bc2f090a17b8d3a9180848d5174d804bc3b/README.md
--------------------------------------------------------------------------------
/cpp/.gitignore:
--------------------------------------------------------------------------------
1 | build
2 | 
--------------------------------------------------------------------------------
/cpp/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 2.6)
2 | 
3 | project(asr-curl-sample)
4 | 
5 | add_executable(asr-curl-sample asr.cpp)
6 | target_link_libraries(asr-curl-sample curl)
7 | 
8 | add_executable(tts-curl-sample tts.cpp)
9 | target_link_libraries(tts-curl-sample curl)
10 | 
--------------------------------------------------------------------------------
/cpp/asr.cpp:
--------------------------------------------------------------------------------
1 | #include <curl/curl.h>
2 | #include <string>
3 | #include <iostream>
4 | #include <fstream>
5 | #include <sstream>
6 | 
7 | size_t write_response_data(char *ptr, size_t size, size_t nmemb, void *userdata)
8 | {
9 |     std::stringstream* s = (std::stringstream*)userdata;
10 |     size_t n = size * nmemb;
11 |     s->write(ptr, n);
12 |     return n;
13 | }
14 | 
15 | size_t read_request_data(char *ptr, size_t size, size_t nmemb, void *userdata)
16 | {
17 |     std::ifstream* f = (std::ifstream*)userdata;
18 |     size_t n = size * nmemb;
19 |     f->read(ptr, n);
20 |     size_t result = f->gcount();
21 |     return result;
22 | }
23 | 
24 | int main(int argc, char** argv)
25 | {
26 |     std::string filename;
27 |     std::string key;
28 | 
29 |     std::cout << "argc=" << argc << std::endl;
30 |     while (argc > 0)
31 |     {
32 |         int n = argc - 1;
33 |         const char* val = argv[n];
34 | 
35 |         if (n == 2) key = val;
36 |         if (n == 1) filename = val;
37 | 
38 |         std::cout << "argv[" << n << "]=" << val << std::endl;
39 |         argc--;
40 |     }
41 | 
42 |     std::stringstream usage;
43 |     usage << "Usage: " << argv[0] << " <filename> <key>";
44 | 
45 |     if (filename.empty() || key.empty())
46 |     {
47 |         std::cout << usage.str();
48 |         return -1;
49 |     }
50 | 
51 |     CURL *curl = NULL;
52 |     curl = curl_easy_init();
53 | 
54 |     if (curl)
55 |     {
56 |         curl_easy_setopt(curl, CURLOPT_HEADER, 1);
57 |         curl_easy_setopt(curl, CURLOPT_POST, 1);
58 |         curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
59 |         curl_easy_setopt(curl, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V4);
60 | 
61 |         struct curl_slist *headers = NULL;
62 | 
63 |         headers = curl_slist_append(headers, "Content-Type: audio/x-wav");
64 |         curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
65 | 
66 |         std::stringstream url;
67 |         url << "asr.yandex.net/asr_xml?uuid=12345678123456781234567812345678&topic=general&lang=ru-RU&key="
68 |             << key;
69 | 
70 |         curl_easy_setopt(curl, CURLOPT_URL, url.str().c_str());
71 | 
72 |         std::ifstream fileStream(filename, std::ifstream::binary);
73 |         fileStream.seekg(0, fileStream.end);
74 |         int length = fileStream.tellg();
75 |         fileStream.seekg(0, fileStream.beg);
76 | 
77 |         curl_easy_setopt(curl, CURLOPT_READFUNCTION, &read_request_data);
78 |         curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, length);
79 |         curl_easy_setopt(curl, CURLOPT_READDATA, &fileStream);
80 | 
81 |         std::stringstream contentStream;
82 | 
83 |         curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &write_response_data);
84 |         curl_easy_setopt(curl, CURLOPT_WRITEDATA, &contentStream);
85 | 
86 |         CURLcode code = curl_easy_perform(curl);
87 | 
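        // Added for illustration (not part of the original sample):
        // curl_easy_perform returns CURLE_OK only when the transfer itself
        // succeeded, so checking it before reading the response is worthwhile.
        if (code != CURLE_OK)
            std::cerr << "curl_easy_perform failed: " << curl_easy_strerror(code) << std::endl;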
88 |         long httpCode;
89 |         curl_easy_getinfo(curl, CURLINFO_HTTP_CODE, &httpCode);
90 |         std::stringstream msg;
91 |         msg << "Http code is " << httpCode;
92 |         std::cout << msg.str() << std::endl;
93 |         std::cout << contentStream.str();
94 | 
95 |         curl_slist_free_all(headers);
96 |         curl_easy_cleanup(curl);
97 |     }
98 | 
99 |     return 0;
100 | }
--------------------------------------------------------------------------------
/cpp/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # building with cmake
3 | 
4 | mkdir -p build && cd build && cmake ../ && make -j
5 | 
--------------------------------------------------------------------------------
/cpp/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | # demo key is 6372dda5-9674-4413-85ff-e9d0eb2f99a7
4 | 
5 | ./build.sh
6 | ./build/tts-curl-sample 123 6372dda5-9674-4413-85ff-e9d0eb2f99a7 > build/123.wav
7 | ./build/asr-curl-sample build/123.wav 6372dda5-9674-4413-85ff-e9d0eb2f99a7
8 | 
--------------------------------------------------------------------------------
/cpp/tts.cpp:
--------------------------------------------------------------------------------
1 | #include <curl/curl.h>
2 | #include <string>
3 | #include <iostream>
4 | #include <fstream>
5 | #include <sstream>
6 | 
7 | // ./build/tts-curl-sample 123 6372dda5-9674-4413-85ff-e9d0eb2f99a7 | play -t wav -
8 | // same as
9 | // curl "tts.voicetech.yandex.net/generate?lang=ru_RU&format=wav&speaker=ermil&text=123&key=6372dda5-9674-4413-85ff-e9d0eb2f99a7" | play -t wav -
10 | 
11 | using namespace std;
12 | 
13 | const char* DEFAULT_HOST = "tts.voicetech.yandex.net";
14 | const char* DEFAULT_LANG = "ru_RU";
15 | const char* DEFAULT_FORMAT = "wav";
16 | const char* DEFAULT_VOICE = "ermil";
17 | const char* DEFAULT_TEXT = "123";
18 | bool VERBOSE = false;
19 | 
20 | int debug_callback(CURL *handle,
21 |                    curl_infotype type,
22 |                    char *data,
23 |                    size_t size,
24 |                    void *userdata)
25 | {
26 |     if (type == CURLINFO_HEADER_OUT)
27 |     {
28 |         stringstream* s = (stringstream*)userdata;
29 |         s->write(data, size);
30 |     }
31 |     return CURLE_OK;
32 | }
33 | 
34 | size_t write_callback(void *ptr, size_t size, size_t nmemb, void *userdata)
35 | {
36 |     stringstream* s = (stringstream*)userdata;
37 |     size_t fullSize = size * nmemb;
38 |     s->write(static_cast<char*>(ptr), fullSize);
39 |     return fullSize;
40 | }
41 | 
42 | size_t make_request(CURL* curl, const string& host, const string& text, const string& key)
43 | {
44 |     if (curl)
45 |     {
46 |         curl_easy_setopt(curl, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V4);
47 | 
48 |         stringstream urlStream;
49 |         urlStream << host
50 |                   << "/generate?lang=" << DEFAULT_LANG
51 |                   << "&format=" << DEFAULT_FORMAT
52 |                   << "&speaker=" << DEFAULT_VOICE
53 |                   << "&text=" << text
54 |                   << "&key=" << key;
55 | 
56 |         if (VERBOSE) cout << urlStream.str() << endl;
57 | 
58 |         curl_easy_setopt(curl, CURLOPT_URL, urlStream.str().c_str());
59 | 
60 |         stringstream responseBodyStream;
61 |         curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback);
62 |         curl_easy_setopt(curl, CURLOPT_WRITEDATA, &responseBodyStream);
63 | 
64 |         stringstream requestStream;
65 | 
66 |         curl_easy_setopt(curl, CURLOPT_DEBUGFUNCTION, debug_callback);
67 |         curl_easy_setopt(curl, CURLOPT_DEBUGDATA, &requestStream);
68 |         curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
69 | 
70 |         CURLcode code = curl_easy_perform(curl);
71 | 
72 |         string request = requestStream.str();
73 | 
74 |         if (VERBOSE) cout << request.size() << endl << request << endl;
75 | 
76 |         long httpCode;
77 |         curl_easy_getinfo(curl, CURLINFO_HTTP_CODE, &httpCode);
78 |         if (httpCode != 200)
79 |         {
80 |             if (VERBOSE) cout << "response code is " << httpCode << endl;
81 |         }
82 | 
83 |         cout << responseBodyStream.str();
84 |     }
85 |     return 0;
86 | }
87 | 
88 | int main(int argc, char* argv[])
89 | {
90 |     CURL *curl = NULL;
91 |     curl = curl_easy_init();
92 | 
93 |     string text;
94 |     string key;
95 | 
96 |     if (VERBOSE) cout << "argc=" << argc << endl;
97 |     while (argc > 0)
98 |     {
99 |         int n = argc - 1;
100 |         const char* val = argv[n];
101 | 
102 |         if (n == 2) key = val;
103 |         if (n == 1) text = val;
104 | 
105 |         if (VERBOSE) cout << "argv[" << n << "]=" << val << endl;
106 |         argc--;
107 |     }
108 | 
109 |     if (text.empty() || key.empty())
110 |     {
111 |         cout << "Usage: tts-curl-sample <text> <key>" << endl;
112 |         return -1;
113 |     }
114 | 
115 |     make_request(curl, DEFAULT_HOST, text, key);
116 | 
117 |     curl_easy_cleanup(curl);
118 |     return 0;
119 | }
120 | 
--------------------------------------------------------------------------------
/go/BasicProtobuf/basic.pb.go:
--------------------------------------------------------------------------------
1 | // Code generated by protoc-gen-go.
2 | // source: basic.proto
3 | // DO NOT EDIT!
4 | 
5 | /*
6 | Package BasicProtobuf is a generated protocol buffer package.
7 | 
8 | It is generated from these files:
9 | 	basic.proto
10 | 
11 | It has these top-level messages:
12 | 	ConnectionResponse
13 | */
14 | package BasicProtobuf
15 | 
16 | import proto "github.com/golang/protobuf/proto"
17 | import fmt "fmt"
18 | import math "math"
19 | 
20 | // Reference imports to suppress errors if they are not otherwise used.
21 | var _ = proto.Marshal
22 | var _ = fmt.Errorf
23 | var _ = math.Inf
24 | 
25 | // This is a compile-time assertion to ensure that this generated file
26 | // is compatible with the proto package it is being compiled against.
27 | const _ = proto.ProtoPackageIsVersion1 28 | 29 | type ConnectionResponse_ResponseCode int32 30 | 31 | const ( 32 | ConnectionResponse_OK ConnectionResponse_ResponseCode = 200 33 | ConnectionResponse_BadMessageFormatting ConnectionResponse_ResponseCode = 400 34 | ConnectionResponse_UnknownService ConnectionResponse_ResponseCode = 404 35 | ConnectionResponse_NotSupportedVersion ConnectionResponse_ResponseCode = 405 36 | ConnectionResponse_Timeout ConnectionResponse_ResponseCode = 408 37 | ConnectionResponse_ProtocolError ConnectionResponse_ResponseCode = 410 38 | ConnectionResponse_InternalError ConnectionResponse_ResponseCode = 500 39 | ConnectionResponse_InvalidKey ConnectionResponse_ResponseCode = 429 40 | ConnectionResponse_InvalidRequestParams ConnectionResponse_ResponseCode = 406 41 | ) 42 | 43 | var ConnectionResponse_ResponseCode_name = map[int32]string{ 44 | 200: "OK", 45 | 400: "BadMessageFormatting", 46 | 404: "UnknownService", 47 | 405: "NotSupportedVersion", 48 | 408: "Timeout", 49 | 410: "ProtocolError", 50 | 500: "InternalError", 51 | 429: "InvalidKey", 52 | 406: "InvalidRequestParams", 53 | } 54 | var ConnectionResponse_ResponseCode_value = map[string]int32{ 55 | "OK": 200, 56 | "BadMessageFormatting": 400, 57 | "UnknownService": 404, 58 | "NotSupportedVersion": 405, 59 | "Timeout": 408, 60 | "ProtocolError": 410, 61 | "InternalError": 500, 62 | "InvalidKey": 429, 63 | "InvalidRequestParams": 406, 64 | } 65 | 66 | func (x ConnectionResponse_ResponseCode) Enum() *ConnectionResponse_ResponseCode { 67 | p := new(ConnectionResponse_ResponseCode) 68 | *p = x 69 | return p 70 | } 71 | func (x ConnectionResponse_ResponseCode) String() string { 72 | return proto.EnumName(ConnectionResponse_ResponseCode_name, int32(x)) 73 | } 74 | func (x *ConnectionResponse_ResponseCode) UnmarshalJSON(data []byte) error { 75 | value, err := proto.UnmarshalJSONEnum(ConnectionResponse_ResponseCode_value, data, "ConnectionResponse_ResponseCode") 76 | if err != nil { 77 | return err 78 | } 79 | *x = ConnectionResponse_ResponseCode(value) 80 | return nil 81 | } 82 | func (ConnectionResponse_ResponseCode) EnumDescriptor() ([]byte, []int) { 83 | return fileDescriptor0, []int{0, 0} 84 | } 85 | 86 | type ConnectionResponse struct { 87 | ResponseCode *ConnectionResponse_ResponseCode `protobuf:"varint,1,req,name=responseCode,enum=BasicProtobuf.ConnectionResponse_ResponseCode" json:"responseCode,omitempty"` 88 | SessionId *string `protobuf:"bytes,2,req,name=sessionId" json:"sessionId,omitempty"` 89 | Message *string `protobuf:"bytes,3,opt,name=message" json:"message,omitempty"` 90 | XXX_unrecognized []byte `json:"-"` 91 | } 92 | 93 | func (m *ConnectionResponse) Reset() { *m = ConnectionResponse{} } 94 | func (m *ConnectionResponse) String() string { return proto.CompactTextString(m) } 95 | func (*ConnectionResponse) ProtoMessage() {} 96 | func (*ConnectionResponse) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{0} } 97 | 98 | func (m *ConnectionResponse) GetResponseCode() ConnectionResponse_ResponseCode { 99 | if m != nil && m.ResponseCode != nil { 100 | return *m.ResponseCode 101 | } 102 | return ConnectionResponse_OK 103 | } 104 | 105 | func (m *ConnectionResponse) GetSessionId() string { 106 | if m != nil && m.SessionId != nil { 107 | return *m.SessionId 108 | } 109 | return "" 110 | } 111 | 112 | func (m *ConnectionResponse) GetMessage() string { 113 | if m != nil && m.Message != nil { 114 | return *m.Message 115 | } 116 | return "" 117 | } 118 | 119 | func init() { 120 | 
proto.RegisterType((*ConnectionResponse)(nil), "BasicProtobuf.ConnectionResponse") 121 | proto.RegisterEnum("BasicProtobuf.ConnectionResponse_ResponseCode", ConnectionResponse_ResponseCode_name, ConnectionResponse_ResponseCode_value) 122 | } 123 | 124 | var fileDescriptor0 = []byte{ 125 | // 286 bytes of a gzipped FileDescriptorProto 126 | 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x09, 0x6e, 0x88, 0x02, 0xff, 0x64, 0x8f, 0x31, 0x4e, 0xf3, 0x30, 127 | 0x14, 0xc7, 0x95, 0x78, 0x88, 0xf2, 0xbe, 0xb4, 0x9f, 0x71, 0x19, 0xc2, 0x56, 0x75, 0xea, 0x94, 128 | 0x81, 0x23, 0xb4, 0x80, 0x14, 0x55, 0x40, 0x95, 0x02, 0xbb, 0x9b, 0x3c, 0x2a, 0x8b, 0xc6, 0x2f, 129 | 0xd8, 0x4e, 0x11, 0xb7, 0x60, 0x00, 0x84, 0xb8, 0x03, 0xa7, 0x80, 0x81, 0x03, 0x71, 0x00, 0x92, 130 | 0x92, 0x81, 0x8a, 0xed, 0xef, 0x9f, 0xfd, 0xf3, 0xfb, 0x3f, 0xf8, 0xb7, 0x94, 0x56, 0xe5, 0x49, 131 | 0x65, 0xc8, 0x91, 0xe8, 0x4d, 0xda, 0xc3, 0xbc, 0xcd, 0xcb, 0xfa, 0x7a, 0xf4, 0xe1, 0x83, 0x98, 132 | 0x92, 0xd6, 0x98, 0x3b, 0x45, 0x3a, 0x43, 0x5b, 0x91, 0xb6, 0x28, 0x8e, 0x20, 0x32, 0x5d, 0x9e, 133 | 0x52, 0x81, 0xb1, 0x37, 0xf4, 0xc7, 0xfd, 0xc3, 0x24, 0xd9, 0x91, 0x93, 0xbf, 0x62, 0x92, 0xfd, 134 | 0xb2, 0xc4, 0x1e, 0x84, 0x16, 0xad, 0x6d, 0xee, 0xd3, 0x22, 0xf6, 0x9b, 0x2f, 0x42, 0xf1, 0x1f, 135 | 0x82, 0xb2, 0x41, 0x72, 0x85, 0x31, 0x1b, 0x7a, 0xe3, 0x70, 0xf4, 0xee, 0x41, 0xb4, 0x23, 0x05, 136 | 0xe0, 0x9f, 0xcf, 0xf8, 0xa7, 0x27, 0x0e, 0x60, 0x7f, 0x22, 0x8b, 0xd3, 0x9f, 0xd7, 0x27, 0x64, 137 | 0x4a, 0xe9, 0x9c, 0xd2, 0x2b, 0xfe, 0xc0, 0xc4, 0x00, 0xfa, 0x97, 0xfa, 0x46, 0xd3, 0x9d, 0x5e, 138 | 0xa0, 0xd9, 0xa8, 0x1c, 0xf9, 0x23, 0x13, 0x31, 0x0c, 0xce, 0xc8, 0x2d, 0xea, 0xaa, 0x22, 0xe3, 139 | 0xb0, 0xb8, 0x42, 0xd3, 0x4e, 0xe6, 0x4f, 0x4c, 0x44, 0x10, 0x5c, 0xa8, 0x12, 0xa9, 0x76, 0xfc, 140 | 0x85, 0x09, 0x01, 0xbd, 0xed, 0x06, 0x39, 0xad, 0x8f, 0x8d, 0x21, 0xc3, 0x5f, 0xb7, 0x2c, 0xd5, 141 | 0x0e, 0x8d, 0x96, 0x1d, 0xfb, 0x62, 0x4d, 0x55, 0x48, 0xf5, 0x46, 0xae, 0x55, 0x31, 0xc3, 0x7b, 142 | 0xfe, 0xc6, 0xda, 0x42, 0x1d, 0xc8, 0xf0, 0xb6, 0x46, 0xeb, 0xe6, 0xd2, 0xc8, 0xd2, 0xf2, 0x67, 143 | 0xf6, 0x1d, 0x00, 0x00, 0xff, 0xff, 0xe0, 0x49, 0xa0, 0xc3, 0x63, 0x01, 0x00, 0x00, 144 | } 145 | -------------------------------------------------------------------------------- /go/asr-client.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "errors" 7 | "flag" 8 | "fmt" 9 | "github.com/golang/protobuf/proto" 10 | "github.com/yandex/speechkitcloud/go/BasicProtobuf" 11 | "github.com/yandex/speechkitcloud/go/VoiceProxyProtobuf" 12 | "io" 13 | "log" 14 | "net" 15 | "os" 16 | "strconv" 17 | ) 18 | 19 | type Debug bool 20 | 21 | func (d Debug) Printf(s string, a ...interface{}) { 22 | if d { 23 | log.Printf(s, a...) 
24 | 	}
25 | }
26 | 
27 | var dbg Debug
28 | 
29 | func sendData(conn io.Writer, data []byte) (int, error) {
30 | 	written1, err := fmt.Fprintf(conn, "%x\r\n", len(data))
31 | 	written2, err := conn.Write(data)
32 | 	return written1 + written2, err
33 | }
34 | 
35 | func sendProtoMessage(conn io.Writer, message proto.Message) (int, error) {
36 | 	data, err := proto.Marshal(message)
37 | 	check("sendProtoMessage / proto.Marshal", err)
38 | 	written, err := sendData(conn, data)
39 | 	return written, err
40 | }
41 | 
42 | func recvData(connReader *bufio.Reader) ([]byte, error) {
43 | 	resp, err := connReader.ReadString('\n')
44 | 	if len(resp) < 2 {
45 | 		return nil, errors.New("recvData / no length line found")
46 | 	}
47 | 	connRespProtoLength, err := strconv.ParseInt(resp[:len(resp)-2], 16, 64)
48 | 	check("recvData / strconv.ParseInt", err)
49 | 
50 | 	dbg.Printf(">> 0x%s -> %d\n", resp[:len(resp)-2], int(connRespProtoLength))
51 | 
52 | 	buffer := make([]byte, int(connRespProtoLength))
53 | 	_, err = io.ReadFull(connReader, buffer)
54 | 	check("recvData / io.ReadFull", err)
55 | 	return buffer, err
56 | }
57 | 
58 | func recvProtoMessage(connReader *bufio.Reader, message proto.Message) error {
59 | 	buffer, err := recvData(connReader)
60 | 	check("recvProtoMessage / recvData", err)
61 | 
62 | 	err = proto.Unmarshal(buffer, message)
63 | 	check("recvProtoMessage / proto.Unmarshal ", err)
64 | 	return err
65 | }
66 | 
67 | func check(id interface{}, err error) {
68 | 	if err != nil {
69 | 		log.Fatal(id, err)
70 | 	}
71 | }
72 | 
73 | func main() {
74 | 	serverPtr := flag.String("s", "asr.yandex.net", "ASR server to connect.")
75 | 	portPtr := flag.Int("p", 80, "Server port.")
76 | 	apiKeyPtr := flag.String("k", "069b6659-984b-4c5f-880e-aaedcfd84102",
77 | 		"Speechkit Cloud api key. You should get your own at https://developer.tech.yandex.ru.\n\r\tDefault is limited demo key.")
78 | 	topicPtr := flag.String("topic", "freeform", "Recognition model topic (aka \"model\").")
79 | 	formatPtr := flag.String("format", "audio/x-pcm;bit=16;rate=16000", "Input file format.")
80 | 	langPtr := flag.String("lang", "ru-RU", "Recognition language. ru-RU | en-EN | tr-TR | uk-UA.")
81 | 	verbosePtr := flag.Bool("verbose", false, "Print more debug output.")
82 | 	chunkSizePtr := flag.Int("chunk-size", 32768, "Client chops input file into chunks and sends it one-by-one in a streaming manner.\n\rDefault value roughly equals to one second of audio in default format.")
83 | 
84 | 	flag.Parse()
85 | 
86 | 	dbg = Debug(*verbosePtr)
87 | 
88 | 	if len(flag.Args()) == 0 {
89 | 		log.Fatal("No input file!")
90 | 	}
91 | 	fileName := flag.Args()[0]
92 | 
93 | 	connectionString := fmt.Sprintf("%v:%v", *serverPtr, *portPtr)
94 | 	dbg.Printf(connectionString)
95 | 
96 | 	conn, err := net.Dial("tcp", connectionString)
97 | 	check(1, err)
98 | 	defer conn.Close()
99 | 
100 | 	var upgradeRequest bytes.Buffer
101 | 	upgradeRequest.WriteString("GET /asr_partial HTTP/1.1\r\n")
102 | 	upgradeRequest.WriteString("Upgrade: dictation\r\n\r\n")
103 | 
104 | 	dbg.Printf("%s", upgradeRequest.String())
105 | 	_, err = upgradeRequest.WriteTo(conn)
106 | 	check(3, err)
107 | 
108 | 	reader := bufio.NewReader(conn)
109 | 
110 | 	resp, err := reader.ReadString('\n')
111 | 	for resp != "" {
112 | 		check(4, err)
113 | 		dbg.Printf(resp)
114 | 		if resp == "\r\n" {
115 | 			break
116 | 		}
117 | 		resp, err = reader.ReadString('\n')
118 | 	}
119 | 
120 | 	dbg.Printf(">> done reading upgrade response, trying to send protobuf\n")
121 | 
122 | 	initProto := &VoiceProxyProtobuf.ConnectionRequest{
123 | 		ApiKey:           proto.String(*apiKeyPtr),
124 | 		SpeechkitVersion: proto.String(""),
125 | 		ServiceName:      proto.String(""),
126 | 		Device:           proto.String("desktop"),
127 | 		Coords:           proto.String("0, 0"),
128 | 		Uuid:             proto.String("12345678123456788765432187654321"),
129 | 		ApplicationName:  proto.String("golang-client"),
130 | 		Topic:            proto.String(*topicPtr),
131 | 		Lang:             proto.String(*langPtr),
132 | 		Format:           proto.String(*formatPtr),
133 | 	}
134 | 
135 | 	_, err = sendProtoMessage(conn, initProto)
136 | 	check(5, err)
137 | 
138 | 	connRespProto := &BasicProtobuf.ConnectionResponse{}
139 | 	err = recvProtoMessage(reader, connRespProto)
140 | 	check(9, err)
141 | 
142 | 	dbg.Printf(">> done reading connection response proto\n")
143 | 	dbg.Printf(">> connRespProto { %v}\n", connRespProto)
144 | 
145 | 	f, err := os.Open(fileName)
146 | 	check(10, err)
147 | 	defer f.Close()
148 | 	fileInfo, err := f.Stat()
149 | 	check(12, err)
150 | 
151 | 	var chunkSize int64 = int64(*chunkSizePtr)
152 | 	expectedChunksCount := int32(fileInfo.Size() / chunkSize)
153 | 	if fileInfo.Size()%chunkSize != 0 {
154 | 		expectedChunksCount++ // last chunk is probably < chunkSize
155 | 	}
156 | 	expectedChunksCount++ // final empty chunk
157 | 
158 | 	go func() {
159 | 		var chunkCount int
160 | 		chunkBuff := make([]byte, chunkSize)
161 | 		for err == nil {
162 | 			var readCount int
163 | 			readCount, err = f.Read(chunkBuff)
164 | 			dbg.Printf(">> read chunk %d\n", readCount)
165 | 			if readCount > 0 {
166 | 				dbg.Printf(">> sending chunk %d\n", chunkCount)
167 | 				// send only the bytes actually read (the last chunk is usually short)
168 | 				addDataProto := &VoiceProxyProtobuf.AddData{LastChunk: proto.Bool(false), AudioData: chunkBuff[:readCount]}
169 | 				_, err = sendProtoMessage(conn, addDataProto)
170 | 				check(11, err)
171 | 				chunkCount++
172 | 			}
173 | 		}
174 | 		lastChunkProto := &VoiceProxyProtobuf.AddData{LastChunk: proto.Bool(true)}
175 | 		_, err = sendProtoMessage(conn, lastChunkProto)
176 | 		check(13, err)
177 | 	}()
178 | 
179 | 	var loopCounter int32
180 | 	for err == nil && loopCounter < expectedChunksCount {
181 | 		dbg.Printf(">> recv proto loop %v/%v\n", loopCounter, expectedChunksCount)
182 | 		addDataRespProto := &VoiceProxyProtobuf.AddDataResponse{}
183 | 		err = recvProtoMessage(reader, addDataRespProto)
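		// The server may merge several AddData requests into one response;
		// messagesCount (see proto/voiceproxy.proto) reports how many were
		// merged, so the loop below advances by that amount rather than by 1.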
		dbg.Printf(">> addDataRespProto { %v}\n", addDataRespProto)

		if err == nil {
			loopCounter += addDataRespProto.GetMessagesCount()
			dbg.Printf(">> loopCounter increased, now %v/%v\n", loopCounter, expectedChunksCount)
			recognitions := addDataRespProto.GetRecognition()
			if recognitions != nil && len(recognitions) > 0 {
				fmt.Printf("got result: %v; endOfUtt: %v\n", addDataRespProto.GetRecognition()[0].GetNormalized(), addDataRespProto.GetEndOfUtt())
			}
		}
	}

	check(14, err)

	fmt.Printf("Done, all fine!\n")
}
--------------------------------------------------------------------------------
/go/asr-client.osx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yandex/speechkitcloud/817e2bc2f090a17b8d3a9180848d5174d804bc3b/go/asr-client.osx
--------------------------------------------------------------------------------
/php/req.php:
--------------------------------------------------------------------------------
1 | <?php
2 | 
3 | function generateRandomSelection($min, $max, $count) {
4 |     $result = array();
5 |     if ($min > $max) return $result;
6 |     $count = min(max($count, 0), $max - $min + 1);
7 |     while (count($result) < $count) {
8 |         $value = rand($min, $max - count($result));
9 |         foreach ($result as $used) if ($used <= $value) $value++; else break;
10 |         $result[] = dechex($value);
11 |         sort($result);
12 |     }
13 |     shuffle($result);
14 |     return $result;
15 | }
16 | 
17 | function recognize($file, $key) {
18 |     $uuid = generateRandomSelection(0, 30, 64);
19 |     $uuid = implode($uuid);
20 |     $uuid = substr($uuid, 1, 32);
21 |     $curl = curl_init();
22 |     $url = 'https://asr.yandex.net/asr_xml?' . http_build_query(array(
23 |         'key' => $key,
24 |         'uuid' => $uuid,
25 |         'topic' => 'notes',
26 |         'lang' => 'ru-RU'
27 |     ));
28 |     curl_setopt($curl, CURLOPT_URL, $url);
29 | 
30 |     $data = file_get_contents(realpath($file));
31 | 
32 |     curl_setopt($curl, CURLOPT_POST, true);
33 |     curl_setopt($curl, CURLOPT_POSTFIELDS, $data);
34 |     curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
35 |     curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, 0);
36 |     curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
37 |     curl_setopt($curl, CURLOPT_HTTPHEADER, array('Content-Type: audio/x-wav'));
38 |     curl_setopt($curl, CURLOPT_VERBOSE, true);
39 |     $response = curl_exec($curl);
40 |     $err = curl_errno($curl);
41 |     curl_close($curl);
42 |     if ($err)
43 |         throw new Exception("curl err $err");
44 |     echo $response;
45 | }
46 | 
47 | print_r($argv);
48 | $filename = $argv[1];
49 | $key = $argv[2];
50 | 
51 | recognize($filename, $key);
52 | print("done\n");
53 | 
54 | ?>
--------------------------------------------------------------------------------
/proto/basic.proto:
--------------------------------------------------------------------------------
1 | // Yandex ASR dictation api (draft):
2 | // The client initiates a session with an http upgrade request, for example:
3 | //     GET /asr_partial HTTP/1.1\r\n
4 | //     User-Agent: KeepAliveClient\r\n
5 | //     Host: voice-stream.voicetech.yandex.net:80\r\n
6 | //     Upgrade: dictation\r\n\r\n
7 | // and receives an HTTP 101 Switching Protocols response.
8 | // Next, send/receive protobuf messages in the format:
9 | //     [hex size]\r\n[message body serialized with protobuf]
10 | 
11 | // Send ConnectionRequest, read ConnectionResponse, etc.
12 | // Send AddData, read AddDataResponse, and so on.
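//
// A minimal sketch of this framing in Go (a hedged illustration assuming an
// established net.Conn; see go/asr-client.go in this repo for the full client):
//
//     func sendFramed(conn net.Conn, msg proto.Message) error {
//         body, err := proto.Marshal(msg)
//         if err != nil {
//             return err
//         }
//         // hex-encoded body length, CRLF, then the serialized message
//         if _, err := fmt.Fprintf(conn, "%x\r\n", len(body)); err != nil {
//             return err
//         }
//         _, err = conn.Write(body)
//         return err
//     }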
13 | syntax = "proto2";
14 | 
15 | package BasicProtobuf;
16 | 
17 | message ConnectionResponse
18 | {
19 |     required ResponseCode responseCode = 1;
20 | 
21 |     required string sessionId = 2;
22 | 
23 |     optional string message = 3;
24 | 
25 |     enum ResponseCode {
26 |         OK = 200;
27 |         BadMessageFormatting = 400;
28 |         UnknownService = 404;
29 |         NotSupportedVersion = 405;
30 |         Timeout = 408;
31 |         ProtocolError = 410;
32 |         InternalError = 500;
33 |         InvalidKey = 429;
34 |         InvalidRequestParams = 406;
35 |     }
36 | }
37 | 
38 | 
--------------------------------------------------------------------------------
/proto/tts.proto:
--------------------------------------------------------------------------------
1 | syntax = "proto2";
2 | 
3 | import "ttsbackend.proto";
4 | 
5 | package TTS;
6 | 
7 | ///////////////////////////////////////////////////////////////////////////
8 | // Usage:
9 | // (1) send ConnectionRequest, receive ConnectionResponse
10 | // (2) send ParamsRequest, receive ParamsResponse
11 | // (3) send GenerateRequest, receive GenerateResponse
12 | //
13 | // The TTS proxy requires all these steps in this fixed order;
14 | // none can be skipped.
15 | 
16 | message ConnectionRequest
17 | {
18 |     required string serviceName = 1; // "tts", "asr", "asr_dictation", etc.
19 | 
20 |     required string uuid = 2;
21 | 
22 |     optional int32 protocolVersion = 3 [default = 1];
23 | 
24 |     optional string deviceName = 4;
25 | 
26 |     // new field v2
27 | 
28 |     optional string speechkitVersion = 5;
29 | 
30 |     // warning: the apiKey option is only temporary (for the /tcp handler); for /ytcp its absence will result in ConnectionResponse::KeyInvalid
31 |     optional string apiKey = 6;
32 | 
33 |     optional string applicationName = 7;
34 | 
35 |     optional string coords = 8;
36 | }
37 | 
38 | ///////////////////////////////////////////////////////////////////////////
39 | 
40 | message ParamsRequest
41 | {
42 |     optional bool listVoices = 1;
43 | }
44 | 
45 | message ParamsResponse
46 | {
47 |     message Voice
48 |     {
49 |         // use as "voice" in GenerateRequest
50 |         required string name = 1;
51 |         // 1 female, 2 male
52 |         required int32 gender = 2;
53 |         // 0x809 english
54 |         // 0x419 russian
55 |         // 0 for an "international" voice that can be used with any language
56 |         required int32 languageId = 3;
57 | 
58 |         required int32 initialSampleFreq = 4;
59 | 
60 |         // display name suitable for menus and GUIs
61 |         optional string displayName = 5;
62 | 
63 |         // this voice can be used in lowLevelGenerateRequest for mixing
64 |         optional bool coreVoice = 6;
65 |     }
66 | 
67 |     repeated Voice voiceList = 1;
68 | }
69 | 
70 | ///////////////////////////////////////////////////////////////////////////
71 | 
72 | message GenerateRequest
73 | {
74 |     required string lang = 1;
75 |     required string text = 2;
76 |     required string application = 3;
77 |     required string platform = 4;
78 |     required string voice = 6;
79 |     optional float speed = 31;
80 |     optional string emotion = 10;
81 | 
82 |     enum Quality {
83 |         High = 0; Low = 1; UltraHigh = 2;
84 |         // Low means resample to 8000!
85 |         // High means resample to 16000.
86 |         // UltraHigh means 48000 (or 32000 for SPEEX)
87 |     }
88 | 
89 |     enum Format {
90 |         Wav = 0; Pcm = 1; Spx = 2; Opus = 3;
91 |     }
92 | 
93 |     optional Quality quality = 7 [default = High];
94 |     optional Format format = 8 [default = Spx];
95 |     optional bool requireMetainfo = 5 [default = false];
96 | 
97 |     optional Generate lowLevelGenerateRequest = 30;
98 | 
99 |     // keep calm, and do not use '9' slot again
100 |     optional string speed_obsolete = 9;
101 | 
102 |     optional float volume = 32 [default = 1.0];
103 | 
104 |     optional bool chunked = 33 [default = false];
105 | }
106 | 
107 | message Feedback
108 | {
109 |     required int32 elapsed = 1;
110 |     required string event = 2;
111 | }
112 | 
113 | 
--------------------------------------------------------------------------------
/proto/ttsbackend.proto:
--------------------------------------------------------------------------------
1 | syntax = "proto2";
2 | 
3 | import "basic.proto";
4 | 
5 | package TTS;
6 | 
7 | message Generate
8 | {
9 |     optional string sessionId = 30;
10 | 
11 |     // Language code, e.g.: ru, en
12 |     required string lang = 1;
13 | 
14 |     message WeightedParam
15 |     {
16 |         required string name = 1;
17 |         required float weight = 2;
18 |     }
19 | 
20 |     // Text for synthesis
21 |     required string text = 2;
22 | 
23 |     // Speed of speech: <1.0 - slower, >1.0 - faster
24 |     optional float speed = 3 [default = 1.0];
25 | 
26 |     // Supported voices are: zhar, omazh, jane, ermil, oksana
27 |     repeated WeightedParam voices = 11;
28 | 
29 |     // Supported emotions are: good, neutral, evil
30 |     repeated WeightedParam emotions = 12;
31 | 
32 |     // Supported genders are: male, female
33 |     repeated WeightedParam genders = 13;
34 | 
35 |     optional bool requireMetainfo = 5 [default = false];
36 | 
37 |     optional float msd_threshold = 14;
38 |     optional float mgc_recurrence = 15;
39 |     optional float subtract_durations_sigmas = 17;
40 |     optional float lf0_postfilter = 18;
41 |     optional float mgcGVWeight = 19;
42 |     optional float lf0GVWeight = 20;
43 |     optional float mvfGVWeight = 21;
44 |     optional float mgc_postfilter1 = 22;
45 |     optional float mgc_postfilter2 = 23;
46 | 
47 |     optional bool chunked = 24; // ex-fast
48 | }
49 | 
50 | message GenerateResponse
51 | {
52 |     message WordEvent
53 |     {
54 |         required int32 firstCharPositionInText = 1;
55 |         required int32 bytesLengthInSignal = 2;
56 |         optional string text = 3;
57 |         optional string postag = 4;
58 |         optional string homographTag = 5;
59 |     }
60 |     message Phoneme
61 |     {
62 |         required string ttsPhoneme = 1;
63 |         required string IPAPhoneme = 2;
64 |         required int32 viseme = 5;
65 |         required int32 durationMs = 3;
66 |         required int32 positionInBytesStream = 4;
67 |     }
68 | 
69 |     // words and phonemes will be empty unless requireMetainfo is set in GenerateRequest
70 |     repeated WordEvent words = 1;
71 |     repeated Phoneme phonemes = 2;
72 |     optional bytes audioData = 3;
73 |     required bool completed = 4;
74 | 
75 |     optional BasicProtobuf.ConnectionResponse.ResponseCode responseCode = 5;
76 |     // Error message
77 |     optional string message = 6;
78 | 
79 |     // Lingware information
80 |     optional string lang = 7;
81 |     optional string version = 8;
82 | }
83 | 
84 | message StopGeneration
85 | {
86 | }
87 | 
--------------------------------------------------------------------------------
/proto/voiceproxy.proto:
--------------------------------------------------------------------------------
1 | syntax = "proto2";
2 | 
3 | import "basic.proto";
4 | 
5 | package VoiceProxyProtobuf;
6 | 
7 | // use this part of ConnectionRequest to specify additional options for the decoder/proxy
8 | message AdvancedASROptions
9 | {
10 |     // send back partial results; if disabled, only results with endOfUtt == true will be sent
11 |     optional bool partial_results = 1 [default = true];
12 | 
13 |     // beam, lattice_beam, lattice_nbest are low-level decoder options
14 |     optional float beam = 2 [default = -1];
15 | 
16 |     optional float lattice_beam = 3 [default = -1];
17 | 
18 |     optional int32 lattice_nbest = 4 [default = -1];
19 | 
20 |     // specifies the interval of silence/noise (in units of 10 ms) which separates sentences, i.e. how often you will receive endOfUtt == true
21 |     optional int32 utterance_silence = 5 [default = 120];
22 | 
23 |     // if disabled, all partial and "endOfUtt" results are suppressed - the server responds only when AddData with "lastChunk" is received
24 |     optional bool allow_multi_utt = 16 [default = true];
25 | 
26 |     // if the client sends too many chunks (more than the server can process), this limit specifies how many sound buffers should be read
27 |     // before sending to the decoder; this may affect how often partial_results are sent
28 |     optional int32 chunk_process_limit = 17 [default = 100];
29 | 
30 |     // cmn is an internal feature of the decoder
31 |     optional int32 cmn_window = 18 [default = 600];
32 | 
33 |     optional int32 cmn_latency = 19 [default = 150];
34 | 
35 |     // capitalize and expected_num_count are features of the "normalized" field of the AddDataResponse recognition result
36 | 
37 |     // specifies whether "normalized" results will be capitalized
38 |     optional bool capitalize = 20 [default = false];
39 | 
40 |     // if specified, the normalizer will try to fit this count; for example, "twenty two" will normalize to "20 2" if "3" is set as expected; by default it will be "22"
41 |     optional int32 expected_num_count = 21 [default = 0];
42 | 
43 |     // list of phrases for an on-the-fly grammar, for example "yes", "no" in case of en-US
44 |     // this field makes "topic" in ConnectionRequest irrelevant; instead, this list is used to build the "language model" on the fly
45 |     repeated string grammar = 22;
46 | 
47 |     // the same as the previous one, but with partial srgs support; you can specify items, tags and simple rules, for example:
48 |     //
49 |     // <?xml version="1.0"?>
50 |     // <grammar xmlns="http://www.w3.org/2001/06/grammar" xml:lang="ru-RU">
51 |     // <rule>
52 |     // <one-of>
53 |     // <item><tag>оплатил</tag>оплатил</item>
54 |     // <item><tag>оплатил</tag>да</item>
55 |     // <item><tag>не оплатил</tag>не оплатил</item>
56 |     // <item><tag>не оплатил</tag>нет</item>
57 |     // </one-of>
58 |     // </rule>
59 |     // </grammar>
60 |     optional string srgs = 23;
61 | 
62 |     // currently supports "gender", "age", "group", "language", "children", "emotion" and combinations with ",", like "age,gender"
63 |     // check out BiometryResult
64 |     optional string biometry = 24;
65 | 
66 |     // turn on the confidence rescoring procedure
67 |     optional bool use_snr = 25 [default = false];
68 | 
69 |     // flags for the confidence rescoring procedure
70 |     repeated SnrFlag snr_flags = 26;
71 | 
72 |     // used to distinguish between biometry groups (devices)
73 |     optional string biometry_group = 27;
74 | 
75 |     // enable special normalizers for "manual punctuation", i.e. replace "привет запятая как дела вопросительный знак" with "привет, как дела?"
76 |     optional bool manual_punctuation = 28 [default = false];
77 | }
78 | 
79 | message ConnectionRequest
80 | {
81 |     optional int32 protocolVersion = 1 [default = 1];
82 | 
83 |     // leave empty if you are not speechkit
84 |     required string speechkitVersion = 2;
85 | 
86 |     required string serviceName = 3; // "asr_dictation", etc.
87 | 
88 |     required string uuid = 4;
89 | 
90 |     optional string yandexuid = 21;
91 | 
92 |     required string apiKey = 5;
93 | 
94 |     required string applicationName = 6;
95 | 
96 |     // vendor:model:type... user defined
97 |     required string device = 7;
98 | 
99 |     // lat.lat,lon.lon
100 |     required string coords = 8;
101 | 
102 |     // "general", "mapsyari", "freeform", "music"
103 |     // topic is ignored if grammar or srgs from advancedASROptions are set
104 |     required string topic = 9;
105 | 
106 |     // "ru-RU"
107 |     required string lang = 10;
108 | 
109 |     // "audio/x-speex", "audio/x-pcm;bit=16;rate=8000", etc.
110 |     required string format = 11;
111 | 
112 |     // enable punctuation mode for the "freeform" topic (other topics may support punctuation in the future)
113 |     optional bool punctuation = 12 [default = true];
114 | 
115 |     optional bool disableAntimatNormalizer = 18 [default = false];
116 | 
117 |     optional AdvancedASROptions advancedASROptions = 19;
118 | 
119 |     optional bool skipAudioFromLogging = 20 [default = false];
120 | 
121 |     // deprecated
122 |     optional MusicRequest musicRequest = 17;
123 | }
124 | 
125 | ///////////////////////////////////////////////////////////////////////////
126 | 
127 | message AddData
128 | {
129 |     optional bytes audioData = 1;
130 | 
131 |     required bool lastChunk = 2;
132 | }
133 | 
134 | ///////////////////////////////////////////////////////////////////////////
135 | 
136 | message AlignInfo
137 | {
138 |     optional float start_time = 1;
139 | 
140 |     optional float end_time = 2;
141 | 
142 |     optional float acoustic_score = 3;
143 | 
144 |     optional float graph_score = 4;
145 | 
146 |     optional float lm_score = 5;
147 | 
148 |     optional float total_score = 6;
149 | }
150 | 
151 | message Word
152 | {
153 |     required float confidence = 1;
154 | 
155 |     required string value = 2;
156 | 
157 |     optional VoiceProxyProtobuf.AlignInfo align_info = 3;
158 | }
159 | 
160 | message Result
161 | {
162 |     // notice: confidence is valid only when endOfUtt is true, otherwise always "1"
163 |     required float confidence = 1;
164 | 
165 |     repeated Word words = 2;
166 | 
167 |     optional string normalized = 3;
168 | 
169 |     optional VoiceProxyProtobuf.AlignInfo align_info = 4;
170 | }
171 | 
172 | message BiometryResult
173 | {
174 |     required string classname = 1;
175 | 
176 |     required float confidence = 2;
177 | 
178 |     optional string tag = 3;
179 | }
180 | 
181 | message SnrFlag
182 | {
183 |     required string name = 1;
184 |     required string value = 2;
185 | }
186 | 
187 | message SnrFeature
188 | {
189 |     optional string name = 1;
190 | 
191 |     optional float value = 2;
192 | }
193 | 
194 | message SnrInfo
195 | {
196 |     optional string normalizedText = 1;
197 | 
198 |     optional float snrValue = 2;
199 | 
200 |     repeated SnrFeature features = 3;
201 | 
202 |     optional string featureSlices = 4;
203 | 
204 |     optional int32 originalCandidateIndex = 5;
205 | 
206 |     optional string candidateSource = 6;
207 | }
208 | 
209 | message SnrMetainfo
210 | {
211 |     optional string name = 1;
212 | }
213 | 
214 | message Metainfo
215 | {
216 |     required float minBeam = 1;
217 | 
218 |     required float maxBeam = 2;
219 | 
220 |     repeated SnrInfo snrInfos = 3;
221 | 
222 |     optional string topic = 4;
223 | 
224 |     optional string lang = 5;
225 | 
226 |     optional string version = 6;
227 | 
228 |     optional string load_timestamp = 7;
229 | 
230 |     optional int32 snrResponseCode = 8;
231 | 
232 |     optional bool snr_performed_rescoring = 9;
233 | 
234 |     optional SnrMetainfo snrMetainfo = 10;
235 | }
236 | 
237 | message AddDataResponse
238 | {
239 |     required BasicProtobuf.ConnectionResponse.ResponseCode responseCode = 1;
240 | 
241 |     repeated Result recognition = 2;
242 | 
243 |     // if true: recognition contains a fully parsed N-best list (n results with n words)
244 |     // otherwise recognition contains just 1 result with 1 word holding the current "partial result"
245 |     optional bool endOfUtt = 3 [default = false];
246 | 
247 |     // how many AddData requests were merged for this response
248 |     optional int32 messagesCount = 4 [default = 1];
249 | 
250 |     // if not empty, messagesCount should be 0
251 |     optional string musicProxyResponse = 5;
252 | 
253 |     repeated BiometryResult bioResult = 6;
254 | 
255 |     optional Metainfo metainfo = 7;
256 | }
257 | 
258 | // deprecated
259 | message MusicRequest
260 | {
261 |     message MusicParam
262 |     {
263 |         required string name = 1;
264 | 
265 |         required string value = 2;
266 |     }
267 | 
268 |     // default options are "uid", "OAuth", "widget"
269 |     repeated MusicParam musicProxyOptions = 1;
270 | }
271 | 
--------------------------------------------------------------------------------
/python/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *~
3 | MANIFEST
4 | dist/
5 | build/
6 | asrclient.egg-info/
7 | venv/
8 | */*.sound
9 | */*.txt
10 | 
--------------------------------------------------------------------------------
/python/README.txt:
--------------------------------------------------------------------------------
1 | Description:
2 | 
3 | This is a streaming client for the Yandex speech recognition service (aka Yandex ASR).
4 | Compared to the http-api, it provides much more info about the recognized text and the recognition process itself.
5 | It also has no limit on the input file length.
6 | 
7 | Install on Mac OS:
8 | 
9 | Install python pip & python protobuf using, for example, MacPorts
10 | (an open-source software package manager; installation instructions are here: https://www.macports.org/install.php):
11 | bash-3.2$ sudo port install git py27-pip py27-protobuf
12 | ...
13 | Continue? [Y/n]: Y
14 | ...
15 | 
16 | After installing pip & the protobuf compiler, you can check out the speechkit client:
17 | 
18 | bash-3.2$ git clone https://github.com/yandex/speechkitcloud
19 | ...
20 | bash-3.2$ cd speechkitcloud/python
21 | bash-3.2$ protoc -I=asrclient --python_out=asrclient asrclient/*.proto
22 | bash-3.2$ python ./setup.py sdist
23 | ...
24 | bash-3.2$ cd dist
25 | bash-3.2$ ls
26 | asrclient-0.5.0.tar.gz
27 | 
28 | The resulting filename may differ (a fresher version, etc.); use it for the pip install:
29 | 
30 | bash-3.2$ sudo pip install asrclient-0.5.0.tar.gz
31 | ...
32 | Successfully installed asrclient-0.5.0 futures-3.1.1
33 | 
34 | If the system macOS python is used by default, then asrclient-cli.py & ttsclient-cli.py are installed into the folder
35 | /Library/Frameworks/Python.framework/Versions/2.7/bin/
36 | otherwise (python from MacPorts is used by default) look for them inside the folder
37 | /opt/local/Library/Frameworks/Python.framework/Versions/2.7/bin/
38 | 
39 | Short help is available via the --help option:
40 | bash-3.2$ /Library/Frameworks/Python.framework/Versions/2.7/bin/asrclient-cli.py --help
41 | bash-3.2$ /Library/Frameworks/Python.framework/Versions/2.7/bin/ttsclient-cli.py --help
42 | or for a MacPorts python installation:
43 | bash-3.2$ /opt/local/Library/Frameworks/Python.framework/Versions/2.7/bin/asrclient-cli.py --help
44 | bash-3.2$ /opt/local/Library/Frameworks/Python.framework/Versions/2.7/bin/ttsclient-cli.py --help
45 | 
46 | XCode TROUBLESHOOTING:
47 | If after installing MacPorts you get the error:
48 | Warning: xcodebuild exists but failed to execute
49 | Warning: Xcode does not appear to be installed; most ports will likely fail to build.
50 | 
51 | use the following commands to fix it:
52 | sudo xcode-select -s /Applications/Xcode.app/Contents/Developer
53 | xcodebuild -license
54 | 
55 | 
56 | Install on Ubuntu/Debian:
57 | 
58 | You need to provide some python dependencies. We suggest something like this...
59 | 
60 | sudo apt-get install python2.7 python-setuptools python-pip git protobuf-compiler
61 | git clone https://github.com/yandex/speechkitcloud
62 | cd speechkitcloud/python
63 | protoc -I=asrclient --python_out=asrclient asrclient/*.proto
64 | python ./setup.py sdist
65 | 
66 | cd dist
67 | sudo pip install <generated asrclient-*.tar.gz>
68 | 
69 | ...or you can provide the dependencies manually and run ./asrclient-cli.py directly (without install).
70 | 
71 | 1. asrclient-cli.py
72 | 
73 | Usage:
74 | 
75 | asrclient-cli.py [OPTIONS] [FILES]...
76 | 
77 | Options:
78 |   -k, --key TEXT              You could get it at
79 |                               https://developer.tech.yandex.ru/. Default
80 |                               is "paste-your-own-key".
81 |                               Use "internal" with Speechkit Box.
82 |   -s, --server TEXT           Default is asr.yandex.net.
83 |   -p, --port INTEGER          Default is 80.
84 |   --format TEXT               Input file format. Default is
85 |                               audio/x-pcm;bit=16;rate=16000.
86 |   --model TEXT                Recognition model: freeform, maps, general, etc.
87 |                               Use the last one if your sound comes from a
88 |                               phone call. It's just a model name, sound
89 |                               format may be different. Default is
90 |                               freeform.
91 |   --lang TEXT                 Recognition language. ru-RU | en-EN | tr-TR
92 |                               | uk-UA. Default is ru-RU.
93 |   --chunk-size INTEGER        Default value 65536 bytes roughly equals to
94 |                               one second of audio in default format.
95 |   --start-with-chunk INTEGER  Use it to send only some part of the input
96 |                               file. Default is 0.
97 |   --max-chunks-count INTEGER  Use it to send only some part of the input
98 |                               file. Default means no limit is set.
99 |   --reconnect-delay FLOAT     Take a pause in case of network problems.
100 |                               Default value is 0.5 seconds.
101 |   --inter-utt-silence FLOAT   A pause between phrases finalization.
102 |                               Default value is 1.2 seconds.
103 |   --cmn-latency INTEGER       CMN latency parameter. Default value is 50.
104 |   --reconnect-retry-count INTEGER
105 |                               Sequential reconnects before giving up.
106 |                               Default is 5.
107 |   --silent                    Don't print debug messages, only recognized
108 |                               text.
109 |   --record                    Grab audio from system audio input instead
110 |                               of files.
111 |   --nopunctuation             Disable punctuation.
112 |   --uuid TEXT                 UUID of your request. It can be helpful for
113 |                               further logs analysis. Default is random.
114 |   --ipv4                      Use ipv4 only connection.
115 |   --realtime                  Emulate realtime record recognition.
116 |   --callback-module TEXT      Python module name which should implement
117 |                               advanced_callback(AddDataResponse).
118 |                               It takes the
119 |                               corresponding protobuf message as a
120 |                               parameter. See advanced_callback_example.py
121 |                               for details.
122 |   --help                      Show this message and exit.
123 | 
124 | 
125 | Examples:
126 | 
127 | asrclient-cli.py --help
128 | 
129 | asrclient-cli.py --key=active-key-from-your-account sound.wav
130 | 
131 | asrclient-cli.py --key=active-key-from-your-account --silent sound.wav
132 | 
133 | asrclient-cli.py --key=active-key-from-your-account --silent --callback-module advanced_callback_example sound.wav
134 | 
135 | More:
136 | 
137 | We expect incoming sound in the specific format audio/x-pcm;bit=16;rate=16000 (single channel).
138 | To convert some random sound file to this, we suggest:
139 | 
140 | sox sound.mp3 -t wav -c 1 --rate 16000 -b 16 -e signed-integer sound.wav
141 | 
142 | 2. ttsclient-cli.py
143 | 
144 | Usage: ttsclient-cli.py [OPTIONS] [FILE] [TEXTS]...
145 | 146 | Options: 147 | -k, --key TEXT You could get it at https://developer.tech.yandex.ru/. 148 | Default is "paste-your-own-key". 149 | -s, --server TEXT Default is tts.voicetech.yandex.net. 150 | -p, --port INTEGER Default is 80. 151 | --lang TEXT Synthesis language. ru-RU | en-EN | tr-TR | uk-UA. 152 | Default is ru-RU. 153 | --speaker TEXT Speaker for speech synthesis. Call this script with 154 | --list-speakers flag to get speakers list. 155 | --emotion TEXT Emotion for speech synthesis. Available values: good, 156 | neutral, evil. Default value depends on speaker's 157 | original emotion. 158 | --gender TEXT Speaker's gender for speech synthesis. Available 159 | values: male, female. Default value depends on 160 | speaker's original gender. 161 | --textfile FILENAME Read text from this file instead of command line 162 | arguments. 163 | --uuid TEXT UUID of your request. It can be helpful for further 164 | logs analysis. Default is random. 165 | --ipv4 Use ipv4 only connection. 166 | --list-speakers Only list available speakers, don't try to generate 167 | anything. 168 | --silent Don't print debug messages. 169 | --help Show this message and exit. 170 | 171 | Examples: 172 | 173 | ttsclient-cli.py --help 174 | 175 | ttsclient-cli.py --key=active-key-from-your-account --list-speakers 176 | 177 | ttsclient-cli.py --key=active-key-from-your-account --speaker jane --lang en-EN out.wav "Hello!" 178 | 179 | ttsclient-cli.py --key=active-key-from-your-account --speaker jane --textfile request.txt out.wav 180 | 181 | More: 182 | 183 | We generate sound in format audio/x-wav, single channel, 16000Hz, 16-bit signed integer PCM encoding. 184 | 185 | Useful links: 186 | 187 | http://sox.sourceforge.net/ - sound conversion library and utility. 188 | https://pypi.python.org/pypi/pip - python package manager. 189 | https://developer.tech.yandex.ru - obtain your key. 190 | https://tech.yandex.ru/speechkit/cloud/ - more about Yandex ASR. 
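Using the library from your own code:

Besides the CLI wrappers, the package can be driven directly from Python.
A minimal sketch (the callback signature mirrors default_callback in
asrclient-cli.py; "sound.wav" stands for your own input file):

    from asrclient import client

    def on_utterance(utterance, start_time=0.0, end_time=0.0, data=None):
        print(utterance)

    chunks = client.read_chunks_from_files([open("sound.wav", "rb")],
                                           client.DEFAULT_CHUNK_SIZE_VALUE)
    client.recognize(chunks,
                     callback=on_utterance,
                     key="active-key-from-your-account")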
191 | 
--------------------------------------------------------------------------------
/python/advanced_callback_example.py:
--------------------------------------------------------------------------------
1 | from asrclient.voiceproxy_pb2 import AddDataResponse as AsrResponse
2 | 
3 | """
4 | use it like
5 | ./asrclient-cli.py -k --callback-module advanced_callback_example --silent
6 | """
7 | 
8 | session_id = "not-set"
9 | 
10 | def advanced_callback(asr_response, correction = 0):
11 | print("Got response:")
12 | print("end-of-utterance = {}".format(asr_response.endOfUtt))
13 | r_count = 0
14 | for r in asr_response.recognition:
15 | print("recognition[{}] = {}; confidence = {}".format(r_count, r.normalized.encode("utf-8"), r.confidence))
16 | print("utterance timings: from {} to {}".format(r.align_info.start_time+correction,r.align_info.end_time+correction))
17 | w_count = 0
18 | for w in r.words:
19 | print("word[{}] = {}; confidence = {}".format(w_count, w.value.encode("utf-8"), w.confidence))
20 | print("word timings: from {} to {}".format(w.align_info.start_time+correction,w.align_info.end_time+correction))
21 | w_count += 1
22 | r_count += 1
23 | 
24 | 
25 | def advanced_utterance_callback(asr_response, data_chunks):
26 | data_length = 0
27 | for chunk in data_chunks:
28 | data_length += len(chunk) if chunk else 0
29 | print("Got complete utterance, for {0} data_chunks, session_id = {1}".format(len(data_chunks), session_id))
30 | print("Metainfo", asr_response.metainfo.minBeam, asr_response.metainfo.maxBeam)
31 | print("Data length = {0}".format(data_length))
--------------------------------------------------------------------------------
/python/advanced_callback_splitter.py:
--------------------------------------------------------------------------------
1 | import os, errno  # errno was missing but is used by mkdir_p below
2 | import datetime
3 | from asrclient.voiceproxy_pb2 import AddDataResponse as AsrResponse
4 | from asrclient.ttsclient import generateWavHeader
5 | 
6 | """
7 | use it like
8 | ./asrclient-cli.py -k --callback-module advanced_callback_splitter --silent
9 | """
10 | 
11 | session_id = "not-set"
12 | start_timestamp = datetime.datetime.now().strftime("%d-%m-%Y_%H%M%S")
13 | 
14 | def mkdir_p(path):
15 | try:
16 | os.makedirs(path)
17 | except OSError as exc: # Python >2.5
18 | if exc.errno == errno.EEXIST and os.path.isdir(path):
19 | pass
20 | else:
21 | raise
22 | 
23 | utterance_count = 0
24 | 
25 | def advanced_callback(asr_response, correction = 0):
26 | print("Got response:")
27 | print("end-of-utterance = {}".format(asr_response.endOfUtt))
28 | r_count = 0
29 | for r in asr_response.recognition:
30 | print("recognition[{}] = {}; confidence = {}".format(r_count, r.normalized.encode("utf-8"), r.confidence))
31 | print("utterance timings: from {} to {}".format(r.align_info.start_time+correction,r.align_info.end_time+correction))
32 | w_count = 0
33 | for w in r.words:
34 | print("word[{}] = {}; confidence = {}".format(w_count, w.value.encode("utf-8"), w.confidence))
35 | print("word timings: from {} to {}".format(w.align_info.start_time+correction,w.align_info.end_time+correction))
36 | w_count += 1
37 | r_count += 1
38 | 
39 | leftover = None
40 | leftmargin = 0
41 | def advanced_utterance_callback(asr_response, data_chunks):
42 | global utterance_count, leftover, leftmargin
43 | 
44 | dirname = "./{0}_{1}/".format(start_timestamp, session_id)
45 | if not os.path.isdir(dirname):
46 | mkdir_p(dirname)
47 | print("Got complete utterance, for {0} data_chunks, session_id = {1}".format(len(data_chunks), session_id))
48 | 
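    # The 32000 factors below convert align_info times (seconds) into byte
    # offsets: 16000 samples/sec * 2 bytes per sample for 16-bit mono PCM -
    # the same constant bytes_in_sec() in asrclient/client.py returns for
    # the default 16 kHz format.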
49 | with open("{0}/{1}_{2}.wav".format(dirname, session_id, utterance_count), "wb") as sound_file: 50 | left = 0 51 | right = sum(map(len, filter(lambda x: x, data_chunks))) 52 | if asr_response.recognition: 53 | if asr_response.recognition[0].words: 54 | left = asr_response.recognition[0].words[0].align_info.start_time * 32000 55 | right = asr_response.recognition[0].words[-1].align_info.end_time * 32000 56 | else: 57 | left = asr_response.recognition[0].align_info.start_time * 32000 58 | right = asr_response.recognition[0].align_info.end_time * 32000 59 | 60 | result = "" 61 | print(left, right) 62 | chunks = [leftover] + data_chunks 63 | leftover = None 64 | for chunk in chunks: 65 | if not chunk: 66 | continue 67 | if chunk.startswith("RIFF"): 68 | chunk = chunk[44:] 69 | if len(result): 70 | result += chunk 71 | else: 72 | print(left, leftmargin, len(chunk)) 73 | if left - leftmargin < len(chunk): 74 | cutat = int(left - leftmargin) 75 | print(cutat) 76 | if cutat % 2: 77 | cutat -= 1 78 | result += chunk[cutat:] 79 | leftmargin += len(chunk) 80 | 81 | right = int(right) 82 | if right%2: 83 | right-=1 84 | 85 | if right < leftmargin: 86 | offset = leftmargin - right 87 | leftover = result[-offset:] 88 | result = result[:-offset] 89 | leftmargin = right 90 | 91 | data_size = len(result) 92 | sound_file.write(generateWavHeader(16000, True, data_size)) 93 | sound_file.write(result) 94 | 95 | with open("{0}/{1}_{2}.txt".format(dirname, session_id, utterance_count), "w") as txt_file: 96 | text = asr_response.recognition[0].normalized.encode("utf-8") 97 | if text is not None: 98 | txt_file.write(text) 99 | 100 | utterance_count += 1 101 | -------------------------------------------------------------------------------- /python/asrclient-cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Yandex ASR streaming client.""" 4 | 5 | from __future__ import absolute_import 6 | import logging 7 | import click 8 | import sys 9 | 10 | import importlib 11 | from asrclient import client 12 | 13 | try: 14 | import pyaudio 15 | is_pyaudio = True 16 | except ImportError: 17 | is_pyaudio = False 18 | 19 | 20 | @click.command() 21 | @click.option('-k', '--key', 22 | help='You could get it at https://developer.tech.yandex.ru/. Default is "{0}".'.format(client.DEFAULT_KEY_VALUE), 23 | default=client.DEFAULT_KEY_VALUE) 24 | @click.option('-s', '--server', 25 | help='Default is {0}.'.format(client.DEFAULT_SERVER_VALUE), 26 | default=client.DEFAULT_SERVER_VALUE) 27 | @click.option('-p', '--port', 28 | help='Default is {0}.'.format(client.DEFAULT_PORT_VALUE), 29 | default=client.DEFAULT_PORT_VALUE) 30 | @click.option('--format', 31 | help='Input file format. Default is {0}.'.format(client.DEFAULT_FORMAT_VALUE), 32 | default=client.DEFAULT_FORMAT_VALUE) 33 | @click.option('--model', 34 | help='Recognition model. freeform | freeform8alaw. Use the last one if your sound comes from a phone call. It\'s just a model name, sound format may be different. Default is {0}.'.format(client.DEFAULT_MODEL_VALUE), 35 | default=client.DEFAULT_MODEL_VALUE) 36 | @click.option('--lang', 37 | help='Recognition language. ru-RU | en-EN | tr-TR | uk-UA. Default is {0}.'.format(client.DEFAULT_LANG_VALUE), 38 | default=client.DEFAULT_LANG_VALUE) 39 | @click.option('--app', 40 | help='Application. 
Default is local.',
41 | default="local")
42 | @click.option('--chunk-size',
43 | default=client.DEFAULT_CHUNK_SIZE_VALUE,
44 | help='The default value of {0} bytes roughly equals one second of audio in the default format.'.format(client.DEFAULT_CHUNK_SIZE_VALUE))
45 | @click.option('--start-with-chunk',
46 | default=0,
47 | help='Use it to send only some part of the input file. Default is 0.')
48 | @click.option('--max-chunks-count',
49 | default=None,
50 | type=int,
51 | help='Use it to send only some part of the input file. Default means no limit is set.')
52 | @click.option('--reconnect-delay',
53 | default=client.DEFAULT_RECONNECT_DELAY,
54 | help='Take a pause in case of network problems. Default value is {0} seconds.'.format(client.DEFAULT_RECONNECT_DELAY))
55 | @click.option('--inter-utt-silence',
56 | default=client.DEFAULT_INTER_UTT_SILENCE,
57 | type=float,
58 | help='A silence pause that finalizes a phrase. Default value is {0} seconds.'.format(client.DEFAULT_INTER_UTT_SILENCE/100.0))
59 | @click.option('--cmn-latency',
60 | default=client.DEFAULT_CMN_LATENCY,
61 | help='CMN latency parameter. Default value is {0}.'.format(client.DEFAULT_CMN_LATENCY))
62 | @click.option('--reconnect-retry-count',
63 | default=client.DEFAULT_RECONNECT_RETRY_COUNT,
64 | help='Sequential reconnects before giving up. Default is {0}.'.format(client.DEFAULT_RECONNECT_RETRY_COUNT))
65 | @click.option('--silent',
66 | is_flag=True,
67 | help='Don\'t print debug messages, only recognized text.')
68 | @click.option('--record',
69 | is_flag=True,
70 | help='Grab audio from system audio input instead of files.')
71 | @click.option('--nopunctuation',
72 | is_flag=True,
73 | help='Disable punctuation.')
74 | @click.option('--uuid',
75 | default=client.DEFAULT_UUID_VALUE,
76 | help='UUID of your request. It can be helpful for further logs analysis. Default is random.')
77 | @click.option('--ipv4',
78 | is_flag=True,
79 | help='Use ipv4 only connection.')
80 | @click.option('--realtime',
81 | is_flag=True,
82 | help='Emulate realtime record recognition.')
83 | @click.option('--callback-module',
84 | help='Python module name which should implement advanced_callback(AddDataResponse).\nIt takes the corresponding protobuf message as a parameter. See advanced_callback_example.py for details.',
85 | default=None)
86 | @click.argument('files',
87 | nargs=-1,
88 | type=click.File('rb'))
89 | @click.option('--capitalize',
90 | is_flag=True,
91 | help='Should each utterance start with a capital letter?')
92 | @click.option('--expected-num-count',
93 | default=0,
94 | type=int,
95 | help='How many digits should be in the answer? 
Special option, you don\'t need it!')
96 | @click.option('--biometry',
97 | help='Enable biometry: "gender", "age", "group", "language", or combine with "," like "age,gender"',
98 | default="")
99 | @click.option('--snr',
100 | default=False,
101 | is_flag=True,
102 | help='Deprecated')
103 | @click.option('--snr_flags',
104 | default="",
105 | type=str,
106 | help='Deprecated')
107 | @click.option('--grammar-file',
108 | default="",
109 | help='Custom grammar; can be a list of lines or an xml file description')
110 | def main(chunk_size, start_with_chunk, max_chunks_count, record, files, silent, **kwars):
111 | if not silent:
112 | logging.basicConfig(level=logging.INFO)
113 | 
114 | chunks = []
115 | if files:
116 | chunks = client.read_chunks_from_files(files,
117 | chunk_size,
118 | start_with_chunk,
119 | max_chunks_count)
120 | else:
121 | if record:
122 | if is_pyaudio:
123 | chunks = client.read_chunks_from_pyaudio(chunk_size)
124 | else:
125 | click.echo('Please install the pyaudio module for system audio recording.')
126 | sys.exit(-2)
127 | 
128 | def default_callback(utterance, start_time = 0.0, end_time = 0.0, data = None):
129 | click.echo(utterance)
130 | if (end_time > start_time):
131 | click.echo("from {0} to {1}".format(start_time, end_time))
132 | 
133 | if not chunks:
134 | click.echo('Please specify one or more input filenames.')
135 | else:
136 | client.recognize(chunks,
137 | callback=default_callback,
138 | **kwars)
139 | 
140 | if __name__ == "__main__":
141 | main()
142 | 
--------------------------------------------------------------------------------
/python/asrclient/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yandex/speechkitcloud/817e2bc2f090a17b8d3a9180848d5174d804bc3b/python/asrclient/__init__.py
--------------------------------------------------------------------------------
/python/asrclient/basic.proto:
--------------------------------------------------------------------------------
1 | // Yandex ASR dictation api (draft):
2 | // Client initiates a session with an http upgrade request, for example:
3 | // GET /asr_partial HTTP/1.1\r\n
4 | // User-Agent:KeepAliveClient\r\n
5 | // Host: voice-stream.voicetech.yandex.net:80\r\n
6 | // Upgrade: dictation\r\n\r\n
7 | // Receive an HTTP 101 Switching Protocols response.
8 | // Next, send/receive protobuf messages in the format
9 | // [hex size]\r\n[message body serialized with protobuf]
10 | 
11 | // send ConnectionRequest, read ConnectionResponse... etc
12 | // send AddData, read AddDataResponse and so on.
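// For example, a 26-byte serialized message is framed on the wire as the
// ASCII characters "1a" + "\r\n" followed by the 26 raw protobuf bytes
// (see sendMessage()/recvMessage() in python/asrclient/transport.py).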
13 | syntax = "proto2"; 14 | 15 | package BasicProtobuf; 16 | 17 | message ConnectionResponse 18 | { 19 | required ResponseCode responseCode = 1; 20 | 21 | required string sessionId = 2; 22 | 23 | optional string message = 3; 24 | 25 | enum ResponseCode { 26 | OK = 200; 27 | BadMessageFormatting = 400; 28 | UnknownService = 404; 29 | NotSupportedVersion = 405; 30 | Timeout = 408; 31 | ProtocolError = 410; 32 | InternalError = 500; 33 | InvalidKey = 429; 34 | InvalidRequestParams = 406; 35 | UnsupportedMediaType = 415; 36 | } 37 | } 38 | 39 | -------------------------------------------------------------------------------- /python/asrclient/client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Yandex ASR streaming library.""" 4 | 5 | import os 6 | import logging 7 | import sys 8 | import time 9 | import codecs 10 | import importlib 11 | 12 | from uuid import uuid4 as randomUuid 13 | from socket import error as SocketError 14 | from google.protobuf.message import DecodeError as DecodeProtobufError 15 | if sys.version_info >= (3, 0): 16 | from .basic_pb2 import ConnectionResponse 17 | from .voiceproxy_pb2 import ConnectionRequest, AddData, AddDataResponse, AdvancedASROptions, SnrFlag 18 | from .transport import Transport, TransportError 19 | else: 20 | from basic_pb2 import ConnectionResponse 21 | from voiceproxy_pb2 import ConnectionRequest, AddData, AddDataResponse, AdvancedASROptions, SnrFlag 22 | from transport import Transport, TransportError 23 | from concurrent.futures import ThreadPoolExecutor, Future 24 | 25 | 26 | DEFAULT_KEY_VALUE = 'paste-your-own-key' 27 | DEFAULT_SERVER_VALUE = 'asr.yandex.net' 28 | DEFAULT_PORT_VALUE = 80 29 | 30 | DEFAULT_FORMAT_VALUE = 'audio/x-pcm;bit=16;rate=16000' 31 | # 'audio/x-pcm;bit=16;rate=8000' # use this format for 8k bitrate wav and pcm 32 | 33 | DEFAULT_MODEL_VALUE = 'freeform' 34 | DEFAULT_LANG_VALUE = 'ru-RU' 35 | 36 | DEFAULT_UUID_VALUE = randomUuid().hex 37 | 38 | DEFAULT_CHUNK_SIZE_VALUE = 1024*32*2 39 | DEFAULT_RECONNECT_DELAY = 0.5 40 | DEFAULT_RECONNECT_RETRY_COUNT = 5 41 | DEFAULT_PENDING_LIMIT = 50 42 | 43 | DEFAULT_INTER_UTT_SILENCE = 120 44 | DEFAULT_CMN_LATENCY = 50 45 | 46 | def bytes_in_sec(format): 47 | if "8000" in format: 48 | return 16000 49 | else: 50 | return 32000 51 | 52 | 53 | def read_chunks_from_pyaudio(chunk_size = DEFAULT_CHUNK_SIZE_VALUE): 54 | import pyaudio 55 | p = pyaudio.PyAudio() 56 | stream = p.open(format=pyaudio.paInt16, 57 | channels=1, 58 | rate=16000, 59 | input=True, 60 | frames_per_buffer=1024) 61 | while True: 62 | yield stream.read(chunk_size) 63 | 64 | 65 | def read_chunks_from_files(files, chunksize, start_from = 0, max_count = None): 66 | count = 0 67 | for f in files: 68 | chunk = f.read(chunksize) 69 | while chunk: 70 | if start_from <= count: 71 | if max_count is None or count < start_from + max_count: 72 | yield chunk 73 | count += 1 74 | chunk = f.read(chunksize) 75 | f.close() 76 | 77 | 78 | class ServerError(RuntimeError): 79 | def __init__(self, message): 80 | RuntimeError.__init__(self, message) 81 | 82 | 83 | class ServerConnection(object): 84 | 85 | def __init__(self, host, port, key, app, service, topic, lang, format, uuid, inter_utt_silence, cmn_latency, biometry, logger=None, punctuation=True, ipv4=False, capitalize=False, expected_num_count=0, snr=False, snr_flags=None, grammar_file=""): 86 | self.host = host 87 | self.port = port 88 | self.key = key 89 | self.app = app 90 | self.topic = 
topic 91 | self.service = service 92 | self.lang = lang 93 | self.format = format 94 | self.uuid = uuid 95 | self.logger = logger 96 | self.biometry = biometry 97 | self.punctuation = punctuation 98 | self.inter_utt_silence = inter_utt_silence 99 | self.cmn_latency = cmn_latency 100 | self.ipv4 = ipv4 101 | self.capitalize = capitalize 102 | self.expected_num_count = expected_num_count 103 | self.snr = snr 104 | 105 | if not snr_flags: 106 | self.snr_flags = [] 107 | elif isinstance(snr_flags, str) or isinstance(snr_flags, unicode): 108 | self.snr_flags = [a.split("=") for a in snr_flags.split(",")] 109 | else: 110 | self.snr_flags = snr_flags 111 | 112 | self.grammar_file = grammar_file 113 | 114 | self.log("uuid={0}".format(self.uuid)) 115 | 116 | self.session_id = "not-set" 117 | self.connect() 118 | 119 | 120 | def log(self, message): 121 | if self.logger is not None: 122 | self.logger.info(message) 123 | 124 | def connect(self): 125 | self.t = Transport(self.host, self.port, timeout=None, verbose=False, enable_ssl=(self.port==443), ipv4=self.ipv4) 126 | if not self.upgrade_connection(): 127 | raise ServerError('Unable to upgrade connection') 128 | self.log("Connected to {0}:{1}.".format(self.host, self.port)) 129 | 130 | response = self.send_init_request() 131 | if response.responseCode != 200: 132 | error_text = 'Wrong response from server, status_code={0}'.format( 133 | response.responseCode) 134 | if response.HasField("message"): 135 | error_text += ', message is "{0}"'.format(response.message) 136 | raise ServerError(error_text) 137 | 138 | self.session_id = response.sessionId 139 | self.log("session_id={0}".format(self.session_id)) 140 | 141 | return self.session_id 142 | 143 | def send_init_request(self): 144 | advancedASROptions = AdvancedASROptions( 145 | utterance_silence=int(self.inter_utt_silence), 146 | cmn_latency=self.cmn_latency, 147 | capitalize=self.capitalize, 148 | expected_num_count=self.expected_num_count, 149 | biometry=self.biometry, 150 | use_snr=self.snr, 151 | snr_flags=[SnrFlag(name=a[0], value=a[1]) for a in self.snr_flags], 152 | ) 153 | if len(self.grammar_file) > 0: 154 | with codecs.open(self.grammar_file, encoding='utf-8') as grammar: 155 | advancedASROptions.srgs = grammar.read() 156 | request = ConnectionRequest( 157 | speechkitVersion='', 158 | serviceName=self.service, 159 | uuid=self.uuid, 160 | apiKey=self.key, 161 | applicationName=self.app, 162 | device='desktop', 163 | coords='0, 0', 164 | topic=self.topic, 165 | lang=self.lang, 166 | format=self.format, 167 | punctuation=self.punctuation, 168 | advancedASROptions=advancedASROptions 169 | ) 170 | 171 | self.t.sendProtobuf(request) 172 | return self.t.recvProtobuf(ConnectionResponse) 173 | 174 | def upgrade_connection(self): 175 | logger = logging.getLogger('arslib') 176 | request = ('GET /asr_partial_checked HTTP/1.1\r\n' 177 | 'User-Agent: {user_agent}\r\n' 178 | 'Host: {host}:{port}\r\n' 179 | 'Upgrade: {service}\r\n\r\n').format( 180 | user_agent=self.app, 181 | host=self.host, 182 | port=self.port, 183 | service=self.service) 184 | 185 | self.t.send(request) 186 | check = 'HTTP/1.1 101 Switching Protocols' 187 | buffer = '' 188 | 189 | # possible infinite loop here? 
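        # In practice this is bounded by the 300-byte check below: anything
        # other than the expected 101 response makes us bail out. It can still
        # block indefinitely, though, because the Transport above was created
        # with timeout=None, so recv(1) has no deadline.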
190 | while True:
191 | buffer += self.t.recv(1)
192 | if buffer.startswith(check) and buffer.endswith('\r\n\r\n'):
193 | return True
194 | if len(buffer) > 300:
195 | logger.warning(buffer)
196 | return False
197 | 
198 | def close(self):
199 | self.session_id = ""
200 | self.t.close()
201 | 
202 | def reconnect(self, delay=None):
203 | self.log('Reconnecting!')
204 | self.close()
205 | if delay is not None:
206 | self.log('Going to sleep for {0} seconds'.format(delay))
207 | time.sleep(delay)
208 | self.connect()
209 | 
210 | def add_data(self, chunk):
211 | if chunk is None:
212 | self.t.sendProtobuf(AddData(lastChunk=True))
213 | else:
214 | self.t.sendProtobuf(AddData(lastChunk=False, audioData=chunk))
215 | 
216 | 
217 | def get_response_if_ready(self):
218 | response = self.t.recvProtobufIfAny(AddDataResponse, ConnectionResponse)
219 | 
220 | if isinstance(response, ConnectionResponse):
221 | raise ServerError("Bad AddData response: %s %s" % (response.responseCode, response.message))
222 | 
223 | if response is not None:
224 | if response.responseCode != 200:
225 | error_text = 'Wrong response from server, status_code={0}'.format(
226 | response.responseCode)
227 | if response.HasField("message"):
228 | error_text += ', message is "{0}"'.format(response.message)
229 | raise ServerError(error_text)
230 | 
231 | return response
232 | 
233 | def recognize(chunks,
234 | callback=None,
235 | advanced_callback=None,
236 | callback_module=None,
237 | format=DEFAULT_FORMAT_VALUE,
238 | server=DEFAULT_SERVER_VALUE,
239 | port=DEFAULT_PORT_VALUE,
240 | key=DEFAULT_KEY_VALUE,
241 | app='local',
242 | service='dictation',
243 | model=DEFAULT_MODEL_VALUE,
244 | lang=DEFAULT_LANG_VALUE,
245 | inter_utt_silence=DEFAULT_INTER_UTT_SILENCE,
246 | cmn_latency=DEFAULT_CMN_LATENCY,
247 | biometry="",
248 | uuid=DEFAULT_UUID_VALUE,
249 | reconnect_delay=DEFAULT_RECONNECT_DELAY,
250 | reconnect_retry_count=DEFAULT_RECONNECT_RETRY_COUNT,
251 | pending_limit=DEFAULT_PENDING_LIMIT,
252 | ipv4=False,
253 | nopunctuation=False,
254 | realtime=False,
255 | capitalize=False,
256 | expected_num_count=0,
257 | snr=False,
258 | snr_flags=None,
259 | grammar_file=""):
260 | 
261 | advanced_utterance_callback = None
262 | imported_module = None
263 | 
264 | if callback_module is not None:
265 | imported_module = importlib.import_module(callback_module)
266 | 
267 | try:
268 | advanced_callback = imported_module.advanced_callback
269 | except AttributeError:
270 | print("No advanced callback in the imported module!")
271 | 
272 | try:
273 | advanced_utterance_callback = imported_module.advanced_utterance_callback
274 | except AttributeError:
275 | print("No advanced utterance callback in the imported module!")
276 | 
277 | 
278 | class PendingRecognition(object):
279 | def __init__(self):
280 | self.logger = logging.getLogger('asrclient')
281 | 
282 | self.server = ServerConnection(server, port, key, app, service, model, lang, format, uuid, inter_utt_silence, cmn_latency, biometry, self.logger, not nopunctuation, ipv4, capitalize, expected_num_count, snr, snr_flags, grammar_file)
283 | self.unrecognized_chunks = []
284 | self.retry_count = 0
285 | self.pending_answers = 0
286 | self.chunks_answered = 0
287 | self.utterance_start_index = 0
288 | self.executor = ThreadPoolExecutor(max_workers=1)
289 | self.future = None
290 | self.last_end_time = 0
291 | self.correction_delta = 0
292 | self.last_chunk_sent = False
293 | 
294 | def check_result(self):
295 | while True:
296 | try:
297 | response = self.server.get_response_if_ready()
298 
| if response is not None: 299 | self.on_response(response) 300 | if self.last_chunk_sent and self.pending_answers <= 0: 301 | return 302 | else: 303 | time.sleep(0.01) 304 | except Exception as e: 305 | if self.pending_answers > 0: 306 | print("check result exception") 307 | print(type(e)) 308 | print(e) 309 | raise e 310 | else: 311 | return 312 | 313 | def on_response(self, response): 314 | 315 | messages_count = response.messagesCount 316 | self.chunks_answered += messages_count 317 | self.pending_answers -= messages_count 318 | 319 | self.logger.info("got response: endOfUtt={0}; len(recognition)={1}; messages_count={2}".format(response.endOfUtt, len(response.recognition), messages_count)) 320 | 321 | if response.endOfUtt: 322 | if (len(response.recognition) > 0): 323 | start_time = response.recognition[0].align_info.start_time + self.correction_delta 324 | end_time = response.recognition[0].align_info.end_time + self.correction_delta 325 | 326 | if start_time < self.last_end_time: 327 | self.correction_delta = self.last_end_time 328 | 329 | self.last_end_time = end_time 330 | 331 | if advanced_callback is not None: 332 | try: 333 | advanced_callback(response, self.correction_delta) 334 | except Exception as e: 335 | print("Exception in advanced_callback: ", e) 336 | else: 337 | if advanced_callback is not None: 338 | try: 339 | advanced_callback(response) 340 | except Exception as e: 341 | print("Exception in advanced_callback: ", e) 342 | return 343 | 344 | 345 | self.logger.info('Chunks from {0} to {1}.'.format(self.utterance_start_index, self.utterance_start_index + self.chunks_answered)) 346 | 347 | if advanced_utterance_callback is not None: 348 | try: 349 | advanced_utterance_callback(response, self.unrecognized_chunks[:self.chunks_answered]) 350 | except Exception as e: 351 | print("Exception in advanced_utterance_callback: ", e) 352 | elif callback is not None: 353 | if (len(response.recognition) > 0): 354 | start_time = response.recognition[0].align_info.start_time + self.correction_delta 355 | end_time = response.recognition[0].align_info.end_time + self.correction_delta 356 | utterance = response.recognition[0].normalized.encode('utf-8') 357 | callback(utterance, start_time, end_time, self.unrecognized_chunks[:self.chunks_answered]) 358 | 359 | del self.unrecognized_chunks[:self.chunks_answered] 360 | self.utterance_start_index += self.chunks_answered 361 | self.chunks_answered = 0 362 | self.retry_count = 0 363 | 364 | def send(self, chunk): 365 | self.logger.info("entering send() :start index {0}, pending answers {1}, chunks answered {2}".format(self.utterance_start_index, self.pending_answers, self.chunks_answered)) 366 | try: 367 | self.server.add_data(chunk) 368 | self.pending_answers += 1 369 | if chunk is None: 370 | self.last_chunk_sent = True 371 | except (DecodeProtobufError, ServerError, TransportError, SocketError) as e: 372 | self.logger.exception("Something bad happened, waiting for reconnect!") 373 | time.sleep(1) 374 | self.resendOnError() 375 | except Exception as e: 376 | self.logger.info("dbg send") 377 | print(type(e)) 378 | print(e) 379 | 380 | def reconnectOnError(self): 381 | global retry_count 382 | if self.retry_count < reconnect_retry_count: 383 | self.retry_count += 1 384 | self.server.reconnect(reconnect_delay) 385 | if imported_module is not None: 386 | imported_module.session_id = self.server.session_id 387 | else: 388 | raise RuntimeError("Gave up reconnecting!") 389 | 390 | def resendOnError(self): 391 | self.logger.info('Resending current 
utterance (chunks {0}-{1})...'.format(self.utterance_start_index, self.utterance_start_index + len(self.unrecognized_chunks)))
392 | self.pending_answers = 0
393 | self.chunks_answered = 0
394 | for i, chunk in enumerate(self.unrecognized_chunks):
395 | 
396 | while state.pending_answers > pending_limit:
397 | time.sleep(0.01)
398 | 
399 | if chunk is not None:
400 | self.logger.info('About to send chunk {0} ({1} bytes)'.format(self.utterance_start_index + i, len(chunk)))
401 | else:
402 | self.logger.info('No more chunks. Finalizing recognition.')
403 | 
404 | self.send(chunk)
405 | 
406 | 
407 | start_at = time.time()
408 | 
409 | state = PendingRecognition()
410 | if imported_module is not None:
411 | imported_module.session_id = state.server.session_id
412 | 
413 | state.logger.info('Recognition was started.')
414 | chunks_count = 0
415 | 
416 | state.future = state.executor.submit(state.check_result)
417 | 
418 | sent_length = 0
419 | for index, chunk in enumerate(chunks):
420 | 
421 | def check_future():
422 | if not state.future.running():
423 | state.logger.info("future not running!")
424 | state.logger.info(state.future.exception())
425 | return False
426 | return True
427 | 
428 | def onError(exception):
429 | state.logger.info('Connection lost! ({0})'.format(type(exception)))
430 | state.logger.info(exception.message)
431 | state.future.cancel()
432 | state.reconnectOnError()
433 | state.future = state.executor.submit(state.check_result)
434 | state.resendOnError()
435 | 
436 | while realtime and (sent_length / bytes_in_sec(format) > time.time() - start_at):
437 | time.sleep(0.01)
438 | if not check_future():
439 | onError(state.future.exception())
440 | 
441 | 
442 | while state.pending_answers > pending_limit:
443 | time.sleep(0.01)
444 | if not check_future():
445 | onError(state.future.exception())
446 | 
447 | state.logger.info('About to send chunk {0} ({1} bytes)'.format(index, len(chunk)))
448 | state.unrecognized_chunks.append(chunk)
449 | state.send(chunk)
450 | chunks_count = index + 1
451 | sent_length += len(chunk)
452 | 
453 | state.logger.info('No more chunks. Finalizing recognition.')
454 | state.unrecognized_chunks.append(None)
455 | state.send(None)
456 | 
457 | state.future.result()
458 | 
459 | state.logger.info('Recognition is done.')
460 | 
461 | fin_at = time.time()
462 | seconds_elapsed = fin_at - start_at
463 | 
464 | state.logger.info("Start at {0}, finish at {1}, took {2} seconds".format(time.strftime("[%d.%m.%Y %H:%M:%S]", time.localtime(start_at)),
465 | time.strftime("[%d.%m.%Y %H:%M:%S]", time.localtime(fin_at)),
466 | seconds_elapsed))
467 | chunks_per_second = chunks_count / seconds_elapsed
468 | state.logger.info("Avg. 
{0} chunks per second".format(chunks_per_second)) 469 | state.server.close() 470 | -------------------------------------------------------------------------------- /python/asrclient/transport.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import select 3 | import sys 4 | import time 5 | import ssl 6 | import pprint 7 | 8 | 9 | class TransportError(RuntimeError): 10 | def __init__(self, message): 11 | RuntimeError.__init__(self, message) 12 | 13 | 14 | class Transport: 15 | def __init__(self, ip, port, timeout=8, verbose=True, enable_ssl=False, ipv4=False, max_faults=0): 16 | self.verbose = verbose 17 | self.max_faults = max_faults 18 | tries = 5 19 | while tries > 0: 20 | try: 21 | if self.verbose: 22 | print('Trying to connect %s:%s' % (ip, port)) 23 | print("Tries left: %s" % (tries,)) 24 | if enable_ssl: 25 | s = socket.socket(socket.AF_INET if ipv4 else socket.AF_INET6, socket.SOCK_STREAM) 26 | ssl_sock = ssl.wrap_socket(s) 27 | ssl_sock.connect((ip, port)) 28 | print(repr(ssl_sock.getpeername())) 29 | print(ssl_sock.cipher()) 30 | print(pprint.pformat(ssl_sock.getpeercert())) 31 | self.socket = ssl_sock 32 | else: 33 | self.socket = socket.create_connection((ip, port), timeout) 34 | self.socket.settimeout(timeout) 35 | return None 36 | except Exception as ex: 37 | tries -= 1 38 | time.sleep(1) 39 | if (tries == 0): 40 | raise ex 41 | 42 | def __enter__(self): 43 | return self 44 | 45 | def send(self, data): 46 | faults = 0 47 | 48 | while True: 49 | try: 50 | rlist, wlist, xlist = select.select([], [self.socket], [self.socket], 0.1) 51 | if len(xlist): 52 | raise TransportError("send unavailable!") 53 | if len(wlist): 54 | break 55 | except Exception as e: 56 | if self.verbose: 57 | print("Exception on pre-send select: ", e) 58 | faults += 1 59 | if faults > self.max_faults: 60 | raise e 61 | while True: 62 | try: 63 | self.socket.send(data.encode("utf-8") if (sys.version_info[0] == 3 and type(data) == str) else data) 64 | break 65 | except Exception as e: 66 | if self.verbose: 67 | print("Exception on send: ", e) 68 | faults += 1 69 | if faults > self.max_faults: 70 | raise e 71 | if self.verbose: 72 | print("Send " + str(len(data))) 73 | 74 | def recv(self, length, decode=(sys.version_info[0] == 3)): 75 | res = b"" 76 | faults = 0 77 | while True: 78 | try: 79 | res += self.socket.recv(length - len(res)) 80 | if len(res) < length: 81 | rlist, _, xlist = select.select([self.socket], [], [self.socket], 0.1) 82 | else: 83 | if decode: 84 | return res.decode("utf-8") 85 | else: 86 | return res 87 | except Exception as e: 88 | if self.verbose: 89 | print("Exception on recv: ", e) 90 | faults += 1 91 | if faults > self.max_faults: 92 | raise e 93 | 94 | def sendFull(self, message): 95 | begin = 0 96 | while begin < len(message): 97 | begin += self.socket.send(message[begin:]) 98 | 99 | def sendMessage(self, message): 100 | self.socket.send(hex(len(message))[2:].encode("utf-8")) 101 | self.socket.send(b'\r\n') 102 | self.sendFull(message) 103 | if self.verbose: 104 | print("Send message size: ", len(message)) 105 | 106 | def recvMessage(self): 107 | size = b'' 108 | while True: 109 | symbol = self.socket.recv(1) 110 | 111 | if len(symbol) == 0: 112 | raise TransportError('Backend closed connection') 113 | 114 | assert(len(symbol) == 1), 'Bad symbol len from socket ' + str(len(symbol)) 115 | 116 | if symbol == b'\r': 117 | self.socket.recv(1) 118 | break 119 | else: 120 | size += symbol 121 | sizeInt = int(b'0x' + size, 0) 122 | if 
self.verbose:
123 | print("Got message. Expecting {0} bytes length.".format(sizeInt))
124 | if (sizeInt > 0):
125 | result = b''
126 | while len(result) < sizeInt:
127 | result += self.socket.recv(sizeInt - len(result), False)
128 | result = result
129 | assert (len(result) == sizeInt), 'Invalid message size'
130 | return result
131 | return ''
132 | 
133 | def sendProtobuf(self, protobuf):
134 | self.sendMessage(protobuf.SerializeToString())
135 | 
136 | def recvProtobuf(self, *protobufTypes):
137 | savedException = None
138 | 
139 | message = self.recvMessage()
140 | for protoType in protobufTypes:
141 | response = protoType()
142 | try:
143 | response.ParseFromString(message)
144 | return response
145 | except Exception as exc:
146 | savedException = exc
147 | 
148 | raise savedException
149 | 
150 | def recvProtobufIfAny(self, *protobuf):
151 | rlist, wlist, xlist = select.select([self.socket], [], [self.socket], 0)
152 | if (len(rlist)):
153 | return self.recvProtobuf(*protobuf)
154 | else:
155 | return None
156 | 
157 | def transfer(self, sendProtobuf, receiveType):
158 | self.sendProtobuf(sendProtobuf)
159 | return self.recvProtobuf(receiveType)
160 | 
161 | def close(self):
162 | if self.verbose:
163 | print('Close socket ' + str(self.socket))
164 | self.socket.close()
165 | 
166 | def __exit__(self, type, value, traceback):
167 | self.close()
168 | 
169 | server = "127.0.0.1"
170 | port = 8089
171 | 
172 | 
173 | def defaultHost():
174 | return "{0}:{1}".format(server, port)
175 | 
176 | 
177 | def defaultTransport():
178 | return Transport(server, port, verbose=False)
179 | 
--------------------------------------------------------------------------------
/python/asrclient/tts.proto:
--------------------------------------------------------------------------------
1 | syntax = "proto2";
2 | 
3 | import "ttsbackend.proto";
4 | 
5 | package TTS;
6 | 
7 | ///////////////////////////////////////////////////////////////////////////
8 | // Usage:
9 | //(1) send ConnectionRequest, receive ConnectionResponse
10 | //(2) send ParamsRequest, receive ParamsResponse
11 | //(3) send GenerateRequest, receive GenerateResponse
12 | //
13 | // The TTS proxy requires all these steps in a fixed order,
14 | // and none can be skipped
15 | 
16 | message ConnectionRequest
17 | {
18 | required string serviceName = 1; // "tts", "asr", "asr_dictation", etc. 
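// (this python client always sends "tts" here - see the ConnectionRequest
// construction in list_speakers()/generate() in python/asrclient/ttsclient.py)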
19 | 
20 | required string uuid = 2;
21 | 
22 | optional int32 protocolVersion = 3 [default = 1];
23 | 
24 | optional string deviceName = 4;
25 | 
26 | // new field v2
27 | 
28 | optional string speechkitVersion = 5;
29 | 
30 | // warning: the apiKey option is only temporary (for the /tcp handler); for /ytcp its absence will result in ConnectionResponse::KeyInvalid
31 | optional string apiKey = 6;
32 | 
33 | optional string applicationName = 7;
34 | 
35 | optional string coords = 8;
36 | }
37 | 
38 | ///////////////////////////////////////////////////////////////////////////
39 | 
40 | message ParamsRequest
41 | {
42 | optional bool listVoices = 1;
43 | }
44 | 
45 | message ParamsResponse
46 | {
47 | message Voice
48 | {
49 | // use as "voice" in GenerateRequest
50 | required string name = 1;
51 | // 1 female, 2 male
52 | required int32 gender = 2;
53 | // 0x809 english
54 | // 0x419 ru
55 | // 0 for an "international" voice that can be used with any language
56 | required int32 languageId = 3;
57 | 
58 | required int32 initialSampleFreq = 4;
59 | 
60 | // human-friendly display name for menus and gui
61 | optional string displayName = 5;
62 | 
63 | // this voice can be used in lowLevelGenerateRequest for mixing
64 | optional bool coreVoice = 6;
65 | }
66 | 
67 | repeated Voice voiceList = 1;
68 | }
69 | 
70 | ///////////////////////////////////////////////////////////////////////////
71 | 
72 | message GenerateRequest
73 | {
74 | required string lang = 1;
75 | required string text = 2;
76 | required string application = 3;
77 | required string platform = 4;
78 | required string voice = 6;
79 | optional float speed = 31;
80 | optional string emotion = 10;
81 | 
82 | enum Quality {
83 | High = 0; Low = 1; UltraHigh = 2;
84 | // Low means resample to 8000!
85 | // High means resample to 16000.
86 | // UltraHigh means 48000 (or 32000 for SPEEX)
87 | }
88 | 
89 | enum Format {
90 | Wav = 0; Pcm = 1; Spx = 2; Opus = 3;
91 | }
92 | 
93 | optional Quality quality = 7 [default = High];
94 | optional Format format = 8 [default = Spx];
95 | optional bool requireMetainfo = 5 [default = false];
96 | 
97 | optional Generate lowLevelGenerateRequest = 30;
98 | 
99 | // keep calm, and do not use slot '9' again
100 | optional string speed_obsolete = 9;
101 | 
102 | optional float volume = 32 [default = 1.0];
103 | 
104 | optional bool chunked = 33 [default = false];
105 | }
106 | 
107 | message Feedback
108 | {
109 | required int32 elapsed = 1;
110 | required string event = 2;
111 | }
112 | 
--------------------------------------------------------------------------------
/python/asrclient/ttsbackend.proto:
--------------------------------------------------------------------------------
1 | syntax = "proto2";
2 | 
3 | import "basic.proto";
4 | 
5 | package TTS;
6 | 
7 | message Generate
8 | {
9 | optional string sessionId = 30;
10 | 
11 | //Language code, ex.: ru, en
12 | required string lang = 1;
13 | 
14 | message WeightedParam
15 | {
16 | required string name = 1;
17 | required float weight = 2;
18 | }
19 | 
20 | //Text for synthesis
21 | required string text = 2;
22 | 
23 | //Speed of speech: <1.0 - slower, >1.0 - faster
24 | optional float speed = 3 [default = 1.0];
25 | 
26 | //Supported voices are: zhar, omazh, jane, ermil, oksana
27 | repeated WeightedParam voices = 11;
28 | 
29 | //Supported emotions are: good, neutral, evil
30 | repeated WeightedParam emotions = 12;
31 | 
32 | //Supported genders are: male, female
33 | repeated WeightedParam genders = 13;
34 | 
35 | optional bool requireMetainfo = 5 [default = false];
36 | 
37 | optional float msd_threshold 
= 14;
38 | optional float mgc_recurrence = 15;
39 | optional float subtract_durations_sigmas = 17;
40 | optional float lf0_postfilter = 18;
41 | optional float mgcGVWeight = 19;
42 | optional float lf0GVWeight = 20;
43 | optional float mvfGVWeight = 21;
44 | optional float mgc_postfilter1 = 22;
45 | optional float mgc_postfilter2 = 23;
46 | 
47 | optional bool chunked = 24; //ex-fast
48 | }
49 | 
50 | message GenerateResponse
51 | {
52 | message WordEvent
53 | {
54 | required int32 firstCharPositionInText = 1;
55 | required int32 bytesLengthInSignal = 2;
56 | optional string text = 3;
57 | optional string postag = 4;
58 | optional string homographTag = 5;
59 | }
60 | message Phoneme
61 | {
62 | required string ttsPhoneme = 1;
63 | required string IPAPhoneme = 2;
64 | required int32 viseme = 5;
65 | required int32 durationMs = 3;
66 | required int32 positionInBytesStream = 4;
67 | }
68 | 
69 | // words and phonemes will be empty unless requireMetainfo is set in GenerateRequest
70 | repeated WordEvent words = 1;
71 | repeated Phoneme phonemes = 2;
72 | optional bytes audioData = 3;
73 | required bool completed = 4;
74 | 
75 | optional BasicProtobuf.ConnectionResponse.ResponseCode responseCode = 5;
76 | //Error message
77 | optional string message = 6;
78 | 
79 | //Lingware information
80 | optional string lang = 7;
81 | optional string version = 8;
82 | }
83 | 
84 | message StopGeneration
85 | {
86 | }
87 | 
--------------------------------------------------------------------------------
/python/asrclient/ttsclient.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import sys, os
5 | import requests
6 | from datetime import datetime
7 | import time
8 | import random
9 | import logging
10 | 
11 | if sys.version_info >= (3, 0):
12 | from .transport import Transport
13 | from .basic_pb2 import ConnectionResponse
14 | from .ttsbackend_pb2 import Generate, GenerateResponse
15 | from .tts_pb2 import GenerateRequest, ConnectionRequest, ParamsRequest, ParamsResponse
16 | else:
17 | from transport import Transport
18 | from basic_pb2 import ConnectionResponse
19 | from ttsbackend_pb2 import Generate, GenerateResponse
20 | from tts_pb2 import GenerateRequest, ConnectionRequest, ParamsRequest, ParamsResponse
21 | 
22 | from uuid import uuid4 as randomUuid
23 | 
24 | DEFAULT_KEY_VALUE = 'paste-your-own-key'
25 | DEFAULT_SERVER_VALUE = 'tts.voicetech.yandex.net'
26 | DEFAULT_PORT_VALUE = 80
27 | 
28 | DEFAULT_LANG_VALUE = 'ru-RU'
29 | 
30 | DEFAULT_UUID_VALUE = randomUuid().hex
31 | 
32 | DEFAULT_FORMAT_VALUE = 'wav'
33 | DEFAULT_QUALITY_VALUE = 'high'
34 | 
35 | def generateWavHeader(sample_rate, mono=True, data_size=0):
36 | gWavHeader = "RIFF\xff\xff\xff\xffWAVEfmt \x10\x00\x00\x00\x01\x00" + ("\x01" if mono else "\x02") + "\x00"
37 | wav_rate = ""
38 | wav_rate_align = ""
39 | sample_rate_align = sample_rate * 2
40 | for i in xrange(0, 4):
41 | wav_rate += chr(sample_rate % (256 if mono else 512)) # sample rate as int32
42 | wav_rate_align += chr(sample_rate_align % 256) # byte rate = sample_rate * block_align (2 for mono) as int32
43 | sample_rate /= 256
44 | sample_rate_align /= 256
45 | gWavHeader += wav_rate
46 | gWavHeader += wav_rate_align
47 | gWavHeader += "\x02" if mono else "\x04"
48 | gWavHeader += "\x00\x10\x00data\xff\xff\xff\xff"
49 | 
50 | if data_size > 0:
51 | size_of_wav = data_size + 36
52 | hexWavSize = ""
53 | hexDataSize = ""
54 | for i in xrange(0,4):
55 | hexWavSize += chr(size_of_wav % 256)
56 | size_of_wav /= 
256 57 | hexDataSize += chr(data_size % 256) 58 | data_size /= 256 59 | gWavHeader = gWavHeader[:4] + hexWavSize + gWavHeader[8:40] + hexDataSize 60 | 61 | return gWavHeader 62 | 63 | def upgradeToProtobuf(transport, server, port): 64 | transport.verbose = False 65 | transport.send("GET /ytcp2 HTTP/1.1\r\n" + 66 | "User-Agent:KeepAliveClient\r\n" + 67 | "Host: %s:%s\r\n" % (server, port) + 68 | "Upgrade: websocket\r\n\r\n"); 69 | check = "HTTP/1.1 101" 70 | checkRecv = "" 71 | while True: 72 | checkRecv += transport.recv(1) 73 | if checkRecv.startswith(check) and checkRecv.endswith("\r\n\r\n"): 74 | break 75 | if len(checkRecv) > 300: 76 | return False 77 | return True 78 | 79 | def list_speakers(server=DEFAULT_SERVER_VALUE, port=DEFAULT_PORT_VALUE, key=DEFAULT_KEY_VALUE, uuid=DEFAULT_UUID_VALUE, ipv4=False, **kwars): 80 | logger = logging.getLogger('asrclient') 81 | with Transport(server, port, timeout=None, verbose=False, enable_ssl=(port==443), ipv4=ipv4) as t: 82 | if not upgradeToProtobuf(t, server, port): 83 | logger.info("Wrong response on upgrade request. Exiting.") 84 | sys.exit(1) 85 | logger.info("Upgraded to protobuf, sending connect request.") 86 | 87 | t.sendProtobuf(ConnectionRequest( 88 | serviceName="tts", 89 | speechkitVersion="ttsclient", 90 | uuid=uuid, 91 | apiKey=key 92 | )) 93 | 94 | connectionResponse = t.recvProtobuf(ConnectionResponse) 95 | 96 | if connectionResponse.responseCode != 200: 97 | logger.info("Bad response code %s: %s" % (connectionResponse.responseCode, connectionResponse.message)) 98 | sys.exit(1) 99 | 100 | logger.info("Connected, getting speakers list.") 101 | 102 | t.sendProtobuf(ParamsRequest( 103 | listVoices=True 104 | )) 105 | 106 | res = t.recvProtobuf(ParamsResponse) 107 | 108 | print(", ".join([v.name for v in res.voiceList if v.coreVoice])) 109 | 110 | def generate(file, text, speaker, server=DEFAULT_SERVER_VALUE, port=DEFAULT_PORT_VALUE, key=DEFAULT_KEY_VALUE, uuid=DEFAULT_UUID_VALUE, lang=DEFAULT_LANG_VALUE, emotion=None, gender=None, ipv4=False, format=DEFAULT_FORMAT_VALUE, quality=DEFAULT_QUALITY_VALUE): 111 | logger = logging.getLogger('asrclient') 112 | with Transport(server, port, timeout=None, verbose=False, enable_ssl=(port==443), ipv4=ipv4) as t: 113 | if not upgradeToProtobuf(t, server, port): 114 | logger.info("Wrong response on upgrade request. 
Exiting.") 115 | sys.exit(1) 116 | logger.info("Upgraded to protobuf, sending connect request") 117 | 118 | t.sendProtobuf(ConnectionRequest( 119 | serviceName="tts", 120 | speechkitVersion="ttsclient", 121 | uuid=uuid, 122 | apiKey=key 123 | )) 124 | 125 | connectionResponse = t.recvProtobuf(ConnectionResponse) 126 | 127 | if connectionResponse.responseCode != 200: 128 | logger.info("Bad response code %s: %s" % (connectionResponse.responseCode, connectionResponse.message)) 129 | sys.exit(1) 130 | 131 | t.sendProtobuf(ParamsRequest( 132 | listVoices=True 133 | )) 134 | 135 | res = t.recvProtobuf(ParamsResponse) 136 | 137 | request = GenerateRequest( 138 | lang=lang, 139 | text=text, 140 | application="ttsclient", 141 | platform="local", 142 | voice=speaker, 143 | requireMetainfo=False, 144 | format={'wav': GenerateRequest.Pcm, 'pcm': GenerateRequest.Pcm, 'speex': GenerateRequest.Spx, 'opus': GenerateRequest.Opus}.get(format, GenerateRequest.Pcm), 145 | quality=({'low': GenerateRequest.Low, 'high': GenerateRequest.High, 'ultra': GenerateRequest.UltraHigh}[quality]), 146 | chunked=True 147 | ) 148 | 149 | if emotion or gender: 150 | request.lowLevelGenerateRequest.CopyFrom(Generate( 151 | voices=[Generate.WeightedParam(name=speaker, weight=1.0)], 152 | emotions=[Generate.WeightedParam(name=emotion, weight=1.0)] if emotion else [], 153 | genders=[Generate.WeightedParam(name=gender, weight=1.0)] if gender else [], 154 | lang=lang[:2], 155 | text=text, 156 | fast=False, 157 | requireMetainfo=False 158 | )) 159 | 160 | t.sendProtobuf(request) 161 | if format == 'wav': 162 | file.write(generateWavHeader({'ultra': 48000, 163 | 'high': 16000, 164 | 'low': 8000}[quality])) 165 | while True: 166 | ttsResponse = t.recvProtobuf(GenerateResponse) 167 | if ttsResponse.message: 168 | logger.info("Error on synthesis: %s" % (ttsResponse.message,)) 169 | sys.exit(2) 170 | 171 | if not ttsResponse.completed: 172 | file.write(ttsResponse.audioData) 173 | else: 174 | file.close() 175 | break 176 | logger.info("Request complete") 177 | -------------------------------------------------------------------------------- /python/asrclient/voiceproxy.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | import "basic.proto"; 4 | 5 | package VoiceProxyProtobuf; 6 | 7 | // use this part of ConnectionRequest to specify additional options for decoder/proxy 8 | message AdvancedASROptions 9 | { 10 | // send back partial results, if disabled only results with endOfUtt == true will be send 11 | optional bool partial_results = 1 [default = true]; 12 | 13 | // beam, lattice_beam, lattice_nbest - are low level decoder options 14 | optional float beam = 2 [default = -1]; 15 | 16 | optional float lattice_beam = 3 [default = -1]; 17 | 18 | optional int32 lattice_nbest = 4 [default = -1]; 19 | 20 | // specify interval in 10mc of silence/noice which will separe sentences, defines how often you will receive endOfUtt == true 21 | optional int32 utterance_silence = 5 [default = 120]; 22 | 23 | // disable all partial and results with "endOfUtt" - will response only when AddData with "lastChunk" is received 24 | optional bool allow_multi_utt = 16 [default = true]; 25 | 26 | // if client sends too many chunks (more then server could process) - if timeout (in mc) specify how many buffers sound should be read 27 | // before sending to decoder, this may vary how often partial_results are sending 28 | optional int32 chunk_process_limit = 17 [default = 100]; 29 | 30 | // cmn is a 
internal feature of the decoder
31 | optional int32 cmn_window = 18 [default = 600];
32 | 
33 | optional int32 cmn_latency = 19 [default = 150];
34 | 
35 | // capitalize and expected_num_count are features of the "normalized" field of the AddDataResponse recognition result
36 | 
37 | // specify if "normalized" results should be capitalized
38 | optional bool capitalize = 20 [default = false];
39 | 
40 | // if specified, the normalizer will try to fit this count; for example, "twenty two" will normalize to "20 2" if "3" is set as expected, by default it will be "22"
41 | optional int32 expected_num_count = 21 [default = 0];
42 | 
43 | // list of phrases for an on-the-fly grammar, for example "yes", "no" in case of en-US
44 | // this field makes "topic" in ConnectionRequest irrelevant; instead, this list is used to build a "language model" on the fly
45 | repeated string grammar = 22;
46 | 
47 | // the same as the previous field, but with partial srgs support; you can specify items, tags and simple rules, for example:
48 | //
49 | // <?xml version="1.0"?>
50 | // <grammar>
51 | //   <rule>
52 | //     <one-of>
53 | //       <item><tag>оплатил</tag>оплатил</item>
54 | //       <item><tag>оплатил</tag>да</item>
55 | //       <item><tag>не оплатил</tag>не оплатил</item>
56 | //       <item><tag>не оплатил</tag>нет</item>
57 | //     </one-of>
58 | //   </rule>
59 | // </grammar>
60 | optional string srgs = 23;
61 | 
62 | // currently supports "gender", "age", "group", "language", "children", "emotion" and combinations with ",", like "age,gender"
63 | // check out BiometryResult
64 | optional string biometry = 24;
65 | 
66 | // turn on the confidence rescoring procedure
67 | optional bool use_snr = 25 [default = false];
68 | 
69 | // flags for the confidence rescoring procedure
70 | repeated SnrFlag snr_flags = 26;
71 | 
72 | // used to distinguish between biometry groups (devices)
73 | optional string biometry_group = 27;
74 | 
75 | // enable special normalizers for "manual punctuation", i.e. replace "привет запятая как дела воспросительный знак" with "привет, как дела?"
76 | optional bool manual_punctuation = 28 [default = false];
77 | }
78 | 
79 | message ConnectionRequest
80 | {
81 | optional int32 protocolVersion = 1 [default = 1];
82 | 
83 | // leave empty if you are not speechkit
84 | required string speechkitVersion = 2;
85 | 
86 | required string serviceName = 3; // "asr_dictation", etc.
87 | 
88 | required string uuid = 4;
89 | 
90 | optional string yandexuid = 21;
91 | 
92 | required string apiKey = 5;
93 | 
94 | required string applicationName = 6;
95 | 
96 | // vendor:model:type... user defined
97 | required string device = 7;
98 | 
99 | // lat.lat,lon.lon
100 | required string coords = 8;
101 | 
102 | // "general", "mapsyari", "freeform", "music"
103 | // topic is ignored if grammar or srgs from advancedASROptions are set
104 | required string topic = 9;
105 | 
106 | // "ru-RU"
107 | required string lang = 10;
108 | 
109 | // "audio/x-speex", "audio/x-pcm;bit=16;rate=8000", etc. 
110 | required string format = 11;
111 | 
112 | // enable punctuation mode for the "freeform" topic (other topics may support punctuation in the future)
113 | optional bool punctuation = 12 [default = true];
114 | 
115 | optional bool disableAntimatNormalizer = 18 [default = false];
116 | 
117 | optional AdvancedASROptions advancedASROptions = 19;
118 | 
119 | optional bool skipAudioFromLogging = 20 [default = false];
120 | 
121 | // deprecated
122 | optional MusicRequest musicRequest = 17;
123 | }
124 | 
125 | ///////////////////////////////////////////////////////////////////////////
126 | 
127 | message AddData
128 | {
129 | optional bytes audioData = 1;
130 | 
131 | required bool lastChunk = 2;
132 | }
133 | 
134 | ///////////////////////////////////////////////////////////////////////////
135 | 
136 | message AlignInfo
137 | {
138 | optional float start_time = 1;
139 | 
140 | optional float end_time = 2;
141 | 
142 | optional float acoustic_score = 3;
143 | 
144 | optional float graph_score = 4;
145 | 
146 | optional float lm_score = 5;
147 | 
148 | optional float total_score = 6;
149 | }
150 | 
151 | message Word
152 | {
153 | required float confidence = 1;
154 | 
155 | required string value = 2;
156 | 
157 | optional VoiceProxyProtobuf.AlignInfo align_info = 3;
158 | }
159 | 
160 | message Result
161 | {
162 | // notice: confidence is valid only when endOfUtt is true; otherwise it is always "1"
163 | required float confidence = 1;
164 | 
165 | repeated Word words = 2;
166 | 
167 | optional string normalized = 3;
168 | 
169 | optional VoiceProxyProtobuf.AlignInfo align_info = 4;
170 | }
171 | 
172 | message BiometryResult
173 | {
174 | required string classname = 1;
175 | 
176 | required float confidence = 2;
177 | 
178 | optional string tag = 3;
179 | }
180 | 
181 | message SnrFlag
182 | {
183 | required string name = 1;
184 | required string value = 2;
185 | }
186 | 
187 | message SnrFeature
188 | {
189 | optional string name = 1;
190 | 
191 | optional float value = 2;
192 | }
193 | 
194 | message SnrInfo
195 | {
196 | optional string normalizedText = 1;
197 | 
198 | optional float snrValue = 2;
199 | 
200 | repeated SnrFeature features = 3;
201 | 
202 | optional string featureSlices = 4;
203 | 
204 | optional int32 originalCandidateIndex = 5;
205 | 
206 | optional string candidateSource = 6;
207 | }
208 | 
209 | message SnrMetainfo
210 | {
211 | optional string name = 1;
212 | }
213 | 
214 | message Metainfo
215 | {
216 | required float minBeam = 1;
217 | 
218 | required float maxBeam = 2;
219 | 
220 | repeated SnrInfo snrInfos = 3;
221 | 
222 | optional string topic = 4;
223 | 
224 | optional string lang = 5;
225 | 
226 | optional string version = 6;
227 | 
228 | optional string load_timestamp = 7;
229 | 
230 | optional int32 snrResponseCode = 8;
231 | 
232 | optional bool snr_performed_rescoring = 9;
233 | 
234 | optional SnrMetainfo snrMetainfo = 10;
235 | }
236 | 
237 | message AddDataResponse
238 | {
239 | required BasicProtobuf.ConnectionResponse.ResponseCode responseCode = 1;
240 | 
241 | repeated Result recognition = 2;
242 | 
243 | // if true: recognition contains a fully parsed N-best list (n results with n words)
244 | // otherwise recognition contains just 1 result, 1 word with the current "partial result"
245 | optional bool endOfUtt = 3 [default = false];
246 | 
247 | // how many AddData requests were merged for this response
248 | optional int32 messagesCount = 4 [default = 1];
249 | 
250 | // if not empty, messagesCount should be 0
251 | optional string musicProxyResponse = 5;
252 | 
253 | repeated BiometryResult bioResult = 6;
254 | 255 | optional Metainfo metainfo = 7; 256 | } 257 | 258 | // deprecated 259 | message MusicRequest 260 | { 261 | message MusicParam 262 | { 263 | required string name = 1; 264 | 265 | required string value = 2; 266 | } 267 | 268 | // default options are "uid", "OAuth", "widget" 269 | repeated MusicParam musicProxyOptions = 1; 270 | } 271 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='asrclient', 5 | version='0.5.0', 6 | author='Andrey Pichugin, Alexander Artemenko, Andrey Semenov', 7 | author_email='voice@support.yandex.ru', 8 | description='Yandex ASR streaming client.', 9 | long_description=open('README.txt', 'r').read(), 10 | url='http://api.yandex.ru/speechkit/cloud-api/', 11 | platforms=['Any'], 12 | license='GNU GPLv3', 13 | packages=['asrclient'], 14 | install_requires=['protobuf', 'click', 'futures'], 15 | scripts=['asrclient-cli.py', 'ttsclient-cli.py'], 16 | package_data={'asrclient': ['*.proto']}, 17 | ) 18 | -------------------------------------------------------------------------------- /python/ttsclient-cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Yandex TTS streaming client.""" 4 | 5 | import logging 6 | import click 7 | import sys 8 | 9 | import asrclient.ttsclient as client 10 | 11 | 12 | @click.command() 13 | @click.option('-k', '--key', 14 | help='You could get it at https://developer.tech.yandex.ru/. Default is "{0}".'.format(client.DEFAULT_KEY_VALUE), 15 | default=client.DEFAULT_KEY_VALUE) 16 | @click.option('-s', '--server', 17 | help='Default is {0}.'.format(client.DEFAULT_SERVER_VALUE), 18 | default=client.DEFAULT_SERVER_VALUE) 19 | @click.option('-p', '--port', 20 | help='Default is {0}.'.format(client.DEFAULT_PORT_VALUE), 21 | default=client.DEFAULT_PORT_VALUE) 22 | @click.option('--lang', 23 | help='Synthesis language. ru-RU | en-EN | tr-TR | uk-UA. Default is {0}.'.format(client.DEFAULT_LANG_VALUE), 24 | default=client.DEFAULT_LANG_VALUE) 25 | @click.option('--speaker', 26 | help='Speaker for speech synthesis. Call this script with --list-speakers flag to get speakers list.', 27 | default='') 28 | @click.option('--emotion', 29 | help='Emotion for speech synthesis. Available values: good, neutral, evil. Default value depends on speaker\'s original emotion.', 30 | default=None) 31 | @click.option('--gender', 32 | help='Speaker\'s gender for speech synthesis. Available values: male, female. Default value depends on speaker\'s original gender.', 33 | default=None) 34 | @click.option('--textfile', 35 | help='Read text from this file instead of command line arguments.', 36 | type=click.File('r'), 37 | default=None) 38 | @click.option('--uuid', 39 | default=client.DEFAULT_UUID_VALUE, 40 | help='UUID of your request. It can be helpful for further logs analysis. Default is random.') 41 | @click.option('--ipv4', 42 | is_flag=True, 43 | help='Use ipv4 only connection.') 44 | @click.option('--list-speakers', 45 | is_flag=True, 46 | default=False, 47 | help='Only list available speakers, don\'t try to generate anything.') 48 | @click.option('--silent', 49 | is_flag=True, 50 | help='Don\'t print debug messages.') 51 | @click.option('--format', 52 | default=client.DEFAULT_FORMAT_VALUE, 53 | help='Format of output audio file. wav | pcm | speex | opus. 
Default is {0}.'.format(client.DEFAULT_FORMAT_VALUE))
54 | @click.option('--quality',
55 | default=client.DEFAULT_QUALITY_VALUE,
56 | help='Quality of the output audio file. ultra | high | low. Default is {0}.'.format(client.DEFAULT_QUALITY_VALUE))
57 | @click.argument('file',
58 | required=False,
59 | type=click.File('wb'))
60 | @click.argument('texts',
61 | nargs=-1)
62 |
63 | def main(silent, speaker, texts, textfile=None, list_speakers=False, **kwargs):
64 | if not silent:
65 | logging.basicConfig(level=logging.INFO)
66 | if list_speakers:
67 | client.list_speakers(**kwargs)
68 | sys.exit(0)
69 | if not speaker:
70 | print("Speaker is required. Please call this script with the --list-speakers flag to get the list of available speakers.")
71 | sys.exit(1)
72 | if textfile:
73 | texts = map(str.strip, textfile.readlines())
74 | client.generate(text=" ".join(texts).decode('utf8'), speaker=speaker, **kwargs)
75 |
76 | if __name__ == "__main__":
77 | main()
78 |
-------------------------------------------------------------------------------- /webspeechkit/README.md: --------------------------------------------------------------------------------
1 | ### Quickstart
2 | #### Get an API key
3 | First of all, you will need to get an API key for Yandex.SpeechKit.
4 | To do this, go [here](https://developer.tech.yandex.ru) and get an API key for Yandex SpeechKit.
5 |
6 | #### Add dependencies to your web page
7 | Add Yandex.SpeechKit Web scripts from Yandex CDN to your web page:
8 |
9 | `<script type="text/javascript" src="https://webasr.yandex.net/jsapi/v1/webspeechkit.js"></script>`
10 | `<script type="text/javascript" src="https://webasr.yandex.net/jsapi/v1/webspeechkit-settings.js"></script>`
11 |
12 | #### Use the API to create voice interfaces
13 | Write some code for speech recognition logic.
14 | For example, if you simply need to recognize short voice requests, you'll write something like this:
15 |
16 | ```
17 | window.onload = function() {
18 |     ya.speechkit.recognize({
19 |         doneCallback: function (text) {
20 |             console.log("You've said: " + text);
21 |         },
22 |         initCallback: function () {
23 |             console.log("You may speak now");
24 |         },
25 |         errorCallback: function (err) {
26 |             console.log("Something went wrong: " + err);
27 |         },
28 |         model: 'freeform', // Model name for recognition process
29 |         lang: 'ru-RU', // Language for recognition process
30 |         apiKey: PUT_YOUR_API_KEY_HERE
31 |     });
32 | };
33 | ```
34 |
35 | Simple synthesis:
36 |
37 | ```
38 | window.onload = function() {
39 |     var tts = ya.speechkit.Tts(
40 |         {
41 |             speaker: 'jane',
42 |             emotion: 'good',
43 |             gender: 'female'
44 |         });
45 |     tts.speak('1 2 3');
46 | };
47 | ```
48 |
-------------------------------------------------------------------------------- /webspeechkit/src/equalizer.js: --------------------------------------------------------------------------------
1 | (function (namespace) {
2 | 'use strict';
3 |
4 | if (typeof namespace.ya === 'undefined') {
5 | namespace.ya = {};
6 | }
7 | if (typeof namespace.ya.speechkit === 'undefined') {
8 | namespace.ya.speechkit = {};
9 | }
10 |
11 | namespace.ya.speechkit.Equalizer = function (target, recorder) {
12 | this.recorder = recorder;
13 | this.element = document.getElementById(target);
14 | this.element.style.textAlign = 'center';
15 | this.element.innerText = '';
16 | this.graf = document.createElement('canvas');
17 | this.graf.style.width = '100%';
18 | this.graf.style.height = '100%';
19 | this.graf.width = 1000;
20 |
21 | this.element.appendChild(this.graf);
22 |
23 | if (!window.cancelAnimationFrame) {
24 | window.cancelAnimationFrame = window.webkitCancelAnimationFrame ||
25 | window.mozCancelAnimationFrame;
26 | }
27 | if (!window.requestAnimationFrame) {
28 |
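// requestAnimationFrame drives the equalizer's redraw loop in startDrawRealtime() below;
// older WebKit/Gecko builds expose it only under a vendor prefix, hence this fallback chain.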
window.requestAnimationFrame = window.webkitRequestAnimationFrame ||
29 | window.mozRequestAnimationFrame;
30 | }
31 |
32 | this.rafID = null;
33 |
34 | this.startDrawRealtime();
35 | };
36 |
37 | namespace.ya.speechkit.Equalizer.prototype = {
38 | destroy: function () {
39 | this.stopDrawRealtime();
40 | this.element.removeChild(this.graf);
41 | },
42 | stopDrawRealtime: function () {
43 | window.cancelAnimationFrame(this.rafID);
44 | this.rafID = null;
45 | },
46 | startDrawRealtime: function () {
47 | var _this = this;
48 | function updateAnalysers(time) {
49 | if (!_this.analyserNode) {
50 | if (_this.recorder) {
51 | _this.analyserNode = _this.recorder.getAnalyserNode();
52 | _this.context = _this.recorder.context;
53 | } else {
54 | return;
55 | }
56 | }
57 |
58 | var canvasWidth = _this.graf.width;
59 | var canvasHeight = _this.graf.height;
60 | var analyserContext = _this.graf.getContext('2d');
61 |
62 | var SPACING = 2;
63 | var BAR_WIDTH = 1;
64 | var numBars = Math.round(canvasWidth / SPACING);
65 | var freqByteData = new Uint8Array(_this.analyserNode.frequencyBinCount);
66 |
67 | _this.analyserNode.getByteFrequencyData(freqByteData);
68 |
69 | analyserContext.clearRect(0, 0, canvasWidth, canvasHeight);
70 | analyserContext.fillStyle = '#F6D565';
71 | analyserContext.lineCap = 'round';
72 | var multiplier = _this.analyserNode.frequencyBinCount / numBars;
73 |
74 | for (var i = 0; i < numBars; ++i) {
75 | var magnitude = 0;
76 | var offset = Math.floor(i * multiplier);
77 | for (var j = 0; j < multiplier; j++) {
78 | magnitude += freqByteData[offset + j];
79 | }
80 | magnitude = magnitude / multiplier / 2;
81 | analyserContext.fillStyle = 'hsl( ' + Math.round(i * 60 / numBars) + ', 100%, 50%)';
82 | analyserContext.fillRect(i * SPACING, canvasHeight, BAR_WIDTH, -magnitude);
83 | }
84 | _this.rafID = window.requestAnimationFrame(updateAnalysers);
85 | }
86 |
87 | this.rafID = window.requestAnimationFrame(updateAnalysers);
88 | }
89 | };
90 | }(this));
91 |
-------------------------------------------------------------------------------- /webspeechkit/src/recognizer.js: --------------------------------------------------------------------------------
1 | (function (namespace) {
2 | 'use strict';
3 |
4 | if (typeof namespace.ya === 'undefined') {
5 | namespace.ya = {};
6 | }
7 | if (typeof namespace.ya.speechkit === 'undefined') {
8 | namespace.ya.speechkit = {};
9 | }
10 |
11 | /**
12 | * Создает новый объект типа Recognizer.
13 | * @class Создает сессию и отправляет запрос на сервер для распознавания речи.
14 | * @name Recognizer
15 | * @param {Object} [options] Опции.
16 | * @param {callback:initCallback} [options.onInit] Функция-обработчик, которая будет вызвана после успешной инициализации
17 | * сессии.
18 | * @param {callback:dataCallback} [options.onResult] Функция-обработчик, которая будет вызвана после завершения распознавания речи.
19 | * @param {callback:errorCallback} [options.onError] Функция-обработчик, которая будет вызвана в случае возникновения ошибки.
20 | * @param {String} [options.uuid=см. описание] UUID сессии. По умолчанию принимает значение, указанное
21 | * в настройках ya.speechkit.settings.uuid.
22 | * @param {String} [options.apikey] API-ключ. Если не задан, то используется ключ, указанный
23 | * в настройках ya.speechkit.settings.apikey.
24 | * @param {ya.speechkit.FORMAT} [options.format=ya.speechkit.FORMAT.PCM16] Формат аудиопотока.
25 | * @param {String} [options.url=см. описание] URL сервера, на котором будет производиться распознавание.
26 | * Если параметр не указан, то берется значение, заданное в настройках ya.speechkit.settings.asrUrl. По умолчанию оно равно
27 | * 'webasr.yandex.net/asrsocket.ws'.
28 | * @param {Boolean} [options.punctuation=true] Использовать ли пунктуацию.
29 | * @param {Boolean} [options.allowStrongLanguage=false] Отключить фильтрацию обсценной лексики.
30 | * @param {String} [options.model='notes'] Языковая модель, которая должна быть использована при распознавании.
31 | * Если параметр не указан, то используется значение, заданное в настройках ya.speechkit.model. Если в настройках значение не задано, то
32 | * используется модель 'notes'.
33 | * @param {String} [options.lang='ru-RU'] Язык распознавания. Возможные значения: 'ru-RU', 'en-US', 'tr-TR', 'uk-UA'.
34 | * Если параметр не указан, то используется
35 | * значение, заданное в настройках ya.speechkit.lang. Если в настройках значение не задано, то по умолчанию
36 | * выбирается русский язык: 'ru-RU'.
37 | * @param {String} [options.applicationName] Название приложения. Для некоторых приложений мы поддерживаем специальную логику. Пример - sandbox. 38 | */ 39 | var Recognizer = function (options) { 40 | if (!(this instanceof namespace.ya.speechkit.Recognizer)) { 41 | return new namespace.ya.speechkit.Recognizer(options); 42 | } 43 | this.options = namespace.ya.speechkit._extend( 44 | {apikey: namespace.ya.speechkit.settings.apikey, 45 | uuid: namespace.ya.speechkit.settings.uuid, 46 | applicationName: namespace.ya.speechkit.settings.applicationName, 47 | url: namespace.ya.speechkit.settings.websocketProtocol + 48 | namespace.ya.speechkit.settings.asrUrl, 49 | onInit: function () {}, 50 | onResult: function () {}, 51 | onError: function () {}, 52 | punctuation: true, 53 | allowStrongLanguage: false 54 | }, 55 | options); 56 | 57 | // Backward compatibility 58 | this.options.key = this.options.apikey; 59 | this.options.format = this.options.format.mime; 60 | 61 | this.sessionId = null; 62 | this.socket = null; 63 | 64 | this.buffered = []; 65 | this.totaldata = 0; 66 | }; 67 | 68 | Recognizer.prototype = /** @lends Recognizer.prototype */{ 69 | /** 70 | * Send raw data to websocket. 71 | * @param data Any data to send to websocket (json string, raw audio data). 72 | * @private 73 | */ 74 | _sendRaw: function (data) { 75 | if (this.socket) { 76 | this.socket.send(data); 77 | } 78 | }, 79 | /** 80 | * Stringify JSON and send it to websocket. 81 | * @param {Object} json Object needed to be send to websocket. 82 | * @private 83 | */ 84 | _sendJson: function (json) { 85 | this._sendRaw(JSON.stringify({type: 'message', data: json})); 86 | }, 87 | /** 88 | * Запускает процесс распознавания. 89 | */ 90 | start: function () { 91 | this.sessionId = null; 92 | try { 93 | this.socket = new WebSocket(this.options.url); 94 | } catch (e) { 95 | this.options.onError('Error on socket creation: ' + e); 96 | this.options.stopCallback(); 97 | return; 98 | } 99 | 100 | this.socket.onopen = function () { 101 | // {uuid: uuid, key: key, format: audioFormat, punctuation: punctuation ... 102 | // console.log('Initial request: ' + JSON.stringify(this.options)); 103 | this._sendJson(this.options); 104 | }.bind(this); 105 | 106 | this.socket.onmessage = function (e) { 107 | var message = JSON.parse(e.data); 108 | 109 | if (message.type == 'InitResponse'){ 110 | this.sessionId = message.data.sessionId; 111 | this.options.onInit(message.data.sessionId, message.data.code); 112 | } else if (message.type == 'AddDataResponse'){ 113 | this.options.onResult(message.data.text, message.data.uttr, message.data.merge, message.data.words); 114 | if (typeof message.data.close !== 'undefined' && message.data.close) { 115 | this.close(); 116 | } 117 | } else if (message.type == 'Error'){ 118 | this.options.onError('Session ' + this.sessionId + ': ' + message.data); 119 | this.close(); 120 | } else { 121 | this.options.onError('Session ' + this.sessionId + ': ' + message); 122 | this.close(); 123 | } 124 | }.bind(this); 125 | 126 | this.socket.onerror = function (error) { 127 | this.options.onError('Socket error: ' + error.message); 128 | }.bind(this); 129 | 130 | this.socket.onclose = function (event) { 131 | }.bind(this); 132 | }, 133 | /** 134 | * Добавляет данные с аудио к потоку для распознавания речи. 135 | * Если сессия распознавания еще не была создана, то данные будут буферизованы и отправятся на сервер 136 | * по факту установления соединения. 137 | * @param {ArrayBuffer} data Буфер с аудио сигналом в формате PCM 16bit. 
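 * @example
 * // sketch: `recognizer` is a started ya.speechkit.Recognizer and `pcmChunk` a
 * // hypothetical ArrayBuffer with PCM 16bit audio (e.g. produced by Recorder):
 * recognizer.addData(pcmChunk); // chunks are buffered until the session is initialized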
138 | */
139 | addData: function (data) {
140 | this.totaldata += data.byteLength;
141 |
142 | if (!this.sessionId) {
143 | this.buffered.push(data);
144 | return;
145 | }
146 |
147 | for (var i = 0; i < this.buffered.length; i++){
148 | this._sendRaw(new Blob([this.buffered[i]], {type: this.options.format}));
149 | // (each buffered chunk's byteLength was already counted when it arrived, so it is not added again here)
150 | }
151 |
152 | this.buffered = [];
153 | this._sendRaw(new Blob([data], {type: this.options.format}));
154 | },
155 | /**
156 | * Принудительно завершает запись звука и отсылает запрос (не закрывает сессию распознавания, пока не получит от сервера последний ответ).
157 | */
158 | finish: function () {
159 | this._sendJson({command: 'finish'});
160 | },
161 | /**
162 | * Завершает сессию распознавания речи, закрывая соединение с сервером.
163 | */
164 | close: function () {
165 | this.options.onInit = function () {};
166 | this.options.onResult = this.options.onInit;
167 | this.options.onError = this.options.onInit;
168 |
169 | if (this.socket) {
170 | this.socket.close();
171 | this.options.stopCallback();
172 | }
173 | this.socket = null;
174 | }
175 | };
176 |
177 | namespace.ya.speechkit.Recognizer = Recognizer;
178 |
179 | /**
180 | * Функция-обработчик, которая будет вызвана после успешной инициализации
181 | * сессии.
182 | * @callback
183 | * @name initCallback
184 | * @param {String} sessionId Идентификатор сессии.
185 | * @param {Number} code HTTP-статус, который будет содержаться в ответе сервера после инициализации сессии (200).
186 | * @memberOf Recognizer
187 | */
188 |
189 | /**
190 | * Функция-обработчик, которая будет вызвана в случае возникновения ошибки.
191 | * @callback
192 | * @name errorCallback
193 | * @param {String} message Текст сообщения об ошибке.
194 | * @memberOf Recognizer
195 | */
196 |
197 | /**
198 | * Функция-обработчик, которая будет вызвана после завершения распознавания речи.
199 | * @callback
200 | * @name dataCallback
201 | * @param {String} text Распознанный текст.
202 | * @param {Boolean} utterance Является ли данный текст финальным результатом распознавания.
203 | * @param {Number} merge Число обработанных запросов, по которым выдан ответ (сколько пакетов с данными было соединено в этот результат).
204 | * @memberOf Recognizer
205 | */
206 | }(this));
207 |
-------------------------------------------------------------------------------- /webspeechkit/src/recorder.js: --------------------------------------------------------------------------------
1 | (function (namespace) {
2 | 'use strict';
3 |
4 | /**
5 | * Пространство имен для классов и методов библиотеки Yandex.Speechkit JS
6 | * @namespace ya.speechkit
7 | */
8 | if (typeof namespace.ya === 'undefined') {
9 | namespace.ya = {};
10 | }
11 | if (typeof namespace.ya.speechkit === 'undefined') {
12 | namespace.ya.speechkit = {};
13 | }
14 |
15 | namespace.ya.speechkit.AudioContext = window.AudioContext || window.webkitAudioContext;
16 |
17 | if (typeof namespace.ya.speechkit.settings === 'undefined') {
18 | var js = document.createElement('script');
19 |
20 | js.type = 'text/javascript';
21 | js.src = 'https://webasr.yandex.net/jsapi/v1/webspeechkit-settings.js?seed=' + Math.random();
22 |
23 | document.head.appendChild(js);
24 | }
25 |
26 | /** Набор поддерживаемых форматов аудио.
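 * Каждый формат фиксирует частоту дискретизации, MIME-тип и размер буфера записи.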
27 | * @readonly
28 | * @enum
29 | * @memberof ya.speechkit
30 | */
31 | namespace.ya.speechkit.FORMAT = {
32 | /** PCM 8KHz дает плохое качество распознавания, но малый объем передаваемых на сервер данных */
33 | PCM8: {format: 'pcm', sampleRate: 8000, mime: 'audio/x-pcm;bit=16;rate=8000', bufferSize: 1024},
34 | /** PCM 16 KHz наилучшее качество распознавания при среднем объеме данных */
35 | PCM16: {format: 'pcm', sampleRate: 16000, mime: 'audio/x-pcm;bit=16;rate=16000', bufferSize: 2048},
36 | /** PCM 44 KHz большой размер передаваемых данных, возможны задержки на узком канале */
37 | PCM44: {format: 'pcm', sampleRate: 44100, mime: 'audio/x-pcm;bit=16;rate=44100', bufferSize: 4096},
38 | };
39 |
40 | /** Media stream used by SpeechKit
41 | * @private
42 | * @memberof ya.speechkit
43 | */
44 | namespace.ya.speechkit._stream = null;
45 |
46 | /**
47 | * Deep copies fields from object 'from' to object 'to'
48 | * @param {Object} from Source object
49 | * @param {Object} to Destination object
50 | * @private
51 | */
52 | namespace.ya.speechkit._extend = function (to, from) {
53 | var i;
54 | var toStr = Object.prototype.toString;
55 | var astr = '[object Array]';
56 | to = to || {};
57 |
58 | for (i in from) {
59 | if (from.hasOwnProperty(i)) {
60 | if (typeof from[i] === 'object') {
61 | to[i] = (toStr.call(from[i]) === astr) ? [] : {};
62 | namespace.ya.speechkit._extend(to[i], from[i]);
63 | } else if (typeof from[i] !== 'undefined' || typeof to[i] === 'undefined') {
64 | to[i] = from[i];
65 | }
66 | }
67 | }
68 | return to;
69 | };
70 |
71 | /**
72 | * Создает объект для записи аудио-сигнала с микрофона.
73 | * @class Класс, управляющий записью звука с микрофона.
74 | * @name Recorder
75 | */
76 | var Recorder = function ()
77 | {
78 | if (!namespace.ya.speechkit._stream) {
79 | return null;
80 | }
81 |
82 | if (!(this instanceof Recorder)) {
83 | return new Recorder();
84 | }
85 |
86 | this.worker = namespace.ya.speechkit.newWorker();
87 |
88 | this.recording = false;
89 |
90 | this.paused = false;
91 | this.lastDataOnPause = 0;
92 |
93 | this.nullsArray = [];
94 |
95 | this.currCallback = null;
96 | this.buffCallback = null;
97 | this.startCallback = null;
98 |
99 | this.worker.onmessage = function (e) {
100 | if (e.data.command == 'int16stream')
101 | {
102 | var data = e.data.buffer;
103 |
104 | if (this.startCallback) {
105 | this.startCallback(data);
106 | }
107 | } else if (e.data.command == 'getBuffers' && this.buffCallback) {
108 | this.buffCallback(e.data.blob);
109 | } else if (e.data.command == 'clear' && this.currCallback) {
110 | this.currCallback();
111 | } else if (this.currCallback) {
112 | this.currCallback(e.data.blob);
113 | }
114 | }.bind(this);
115 |
116 | };
117 |
118 | Recorder.prototype = /** @lends Recorder.prototype */ {
119 | /**
120 | * Creates an input point for a given audio format (sets sample rate and buffer size)
121 | * @param {ya.speechkit.FORMAT} format audio format (its sampleRate and bufferSize are used)
122 | * @private
123 | */
124 | _createNode: function (format) {
125 | if (!namespace.ya.speechkit.audiocontext) {
126 | namespace.ya.speechkit.audiocontext = new namespace.ya.speechkit.AudioContext();
127 | }
128 |
129 | this.audioInput = namespace.ya.speechkit.audiocontext.createMediaStreamSource(
130 | namespace.ya.speechkit._stream);
131 |
132 | if (!namespace.ya.speechkit.audiocontext.createScriptProcessor) {
133 | this.node = namespace.ya.speechkit.audiocontext.createJavaScriptNode(format.bufferSize, 2, 2);
134 | } else {
135 |
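// createScriptProcessor is the standardized name of createJavaScriptNode used above:
// same arguments, the FORMAT-defined buffer size plus 2 input and 2 output channels.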
this.node = namespace.ya.speechkit.audiocontext.createScriptProcessor(format.bufferSize, 2, 2);
136 | }
137 |
138 | this.audioInput.connect(this.node);
139 | this.node.onaudioprocess = function (e) {
140 | if (!this.recording) {return;}
141 |
142 | if (this.paused) {
143 | if (Number(new Date()) - this.lastDataOnPause > 2000) {
144 | this.lastDataOnPause = Number(new Date());
145 | this.worker.postMessage({
146 | command: 'record',
147 | buffer: [
148 | this.nullsArray,
149 | this.nullsArray
150 | ]
151 | });
152 | }
153 | } else {
154 | this.worker.postMessage({
155 | command: 'record',
156 | buffer: [
157 | e.inputBuffer.getChannelData(0),
158 | e.inputBuffer.getChannelData(1)
159 | ]
160 | });
161 | }
162 | }.bind(this);
163 |
164 | this.node.connect(namespace.ya.speechkit.audiocontext.destination);
165 | },
166 | /**
167 | * Ставит запись звука на паузу.
168 | * Во время паузы на сервер будут отправляться периодически запросы с пустым звуком, чтобы сервер не обрывал сессию.
169 | */
170 | pause: function () {
171 | this.paused = true;
172 | this.lastDataOnPause = Number(new Date());
173 | },
174 | /**
175 | * @returns {AudioContext} Текущий
176 | * AudioContext,
177 | * с которого записывается звук.
178 | */
179 | getAudioContext: function () {
180 | return namespace.ya.speechkit.audiocontext;
181 | },
182 | /**
183 | * @returns {AnalyserNode}
184 | * AnalyserNode — объект,
185 | * предназначенный для анализа аудио-сигнала в реальном времени.
186 | */
187 | getAnalyserNode: function () {
188 | if (!namespace.ya.speechkit.audiocontext) {
189 | namespace.ya.speechkit.audiocontext = new namespace.ya.speechkit.AudioContext();
190 | }
191 | var analyserNode = namespace.ya.speechkit.audiocontext.createAnalyser();
192 | analyserNode.fftSize = 2048;
193 | this.audioInput.connect(analyserNode);
194 | return analyserNode;
195 | },
196 | /**
197 | * @returns {Boolean} true, если запись звука стоит на паузе, false — в противном случае.
198 | */
199 | isPaused: function () {
200 | return this.paused;
201 | },
202 | /**
203 | * Начинает запись звука с микрофона.
204 | * @param {callback:streamCallback} cb Функция-обработчик, в которую будет передаваться записанный аудио-поток.
205 | * @param {ya.speechkit.FORMAT} [format=PCM16] Формат для записи аудио-сигнала. Доступные значения:
206 | *
207 | * - PCM8 — плохое качество распознавания, но малый объем передаваемых на сервер данных;
208 | * - PCM16 — наилучшее качество распознавания при среднем объеме данных;
209 | * - PCM44 — большой размер передаваемых данных, возможны задержки на узком канале.
210 | *
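 * @example
 * // sketch, assuming initRecorder() has already granted access to the microphone:
 * var rec = new ya.speechkit.Recorder();
 * rec.start(function (pcm) {
 *     // `pcm` is an ArrayBuffer with a PCM 16bit chunk; forward it, e.g., to Recognizer.addData()
 * }, ya.speechkit.FORMAT.PCM16);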
211 | */
212 | start: function (cb, format) {
213 | var backref = this;
214 | if (!namespace.ya.speechkit._stream) {
215 | return namespace.ya.speechkit.initRecorder(function () {backref.start(cb, format);}, console.log);
216 | }
217 |
218 | if (!this.node) {
219 | this._createNode(format || namespace.ya.speechkit.FORMAT.PCM16);
220 | }
221 |
222 | if (this.isPaused()) {
223 | this.paused = false;
224 | return;
225 | }
226 | if (typeof cb !== 'undefined') {
227 | this.startCallback = cb;
228 | } else {
229 | this.startCallback = null;
230 | }
231 | this.worker.postMessage({
232 | command: 'init',
233 | config: {
234 | sampleRate: namespace.ya.speechkit.audiocontext.sampleRate,
235 | format: format || namespace.ya.speechkit.FORMAT.PCM16,
236 | channels: this.channelCount,
237 | }
238 | });
239 |
240 | this.nullsArray = [];
241 | var bufferLen = (format || namespace.ya.speechkit.FORMAT.PCM16).bufferSize;
242 | for (var i = 0; i < bufferLen; i++) {
243 | this.nullsArray.push(0);
244 | }
245 |
246 | this.clear(function () {this.recording = true;}.bind(this));
247 | },
248 | /**
249 | * Останавливает запись звука.
250 | * @param {callback:wavCallback} cb Функция-обработчик, в которую будет передан объект Blob
251 | * с записанным аудио в формате wav.
252 | * @param {Number} [channelCount=2] Сколько каналов должно быть в wav-файле: 1 — mono, 2 — stereo.
253 | */
254 | stop: function (cb, channelCount) {
255 | this.recording = false;
256 | if (this.node) {
257 | this.node.disconnect();
258 | }
259 |
260 | this.node = null;
261 | if (namespace.ya.speechkit._stream &&
262 | namespace.ya.speechkit._stream.getAudioTracks) {
263 | namespace.ya.speechkit._stream.getAudioTracks()[0].stop();
264 | } else if (namespace.ya.speechkit._stream &&
265 | typeof namespace.ya.speechkit._stream.stop !== 'undefined') {
266 | namespace.ya.speechkit._stream.stop();
267 | }
268 | namespace.ya.speechkit._stream = null;
269 | if (typeof namespace.ya.speechkit.audiocontext !== 'undefined' &&
270 | namespace.ya.speechkit.audiocontext !== null &&
271 | typeof namespace.ya.speechkit.audiocontext.close !== 'undefined') {
272 | namespace.ya.speechkit.audiocontext.close();
273 | namespace.ya.speechkit.audiocontext = null;
274 | }
275 |
276 | if (typeof cb !== 'undefined') {
277 | this.exportWav(function (blob) {
278 | cb(blob);
279 | }, channelCount || 2);
280 | }
281 | },
282 | /**
283 | * @returns {Boolean} true, если идет запись звука, false — если запись стоит в режиме паузы.
284 | */
285 | isRecording: function () {
286 | return this.recording;
287 | },
288 | /**
289 | * Очищает буферы с записанным аудио-сигналом.
290 | * @param {callback:clearCallback} cb Функция-обработчик, которая будет вызвана, когда произойдет очистка.
291 | */
292 | clear: function (cb) {
293 | if (typeof cb !== 'undefined') {
294 | this.currCallback = cb;
295 | } else {
296 | this.currCallback = null;
297 | }
298 | this.worker.postMessage({command: 'clear'});
299 | },
300 | /**
301 | * Метод для получения буферов с записанным аудио-сигналом.
302 | * @param {callback:buffersCallback} cb Функция, в которую будут переданы буферы с аудио-сигналом.
303 | */
304 | getBuffers: function (cb) {
305 | if (typeof cb !== 'undefined') {
306 | this.buffCallback = cb;
307 | } else {
308 | this.buffCallback = null;
309 | }
310 | this.worker.postMessage({command: 'getBuffers'});
311 | },
312 | /**
313 | * Экспортирует записанный звук в wav-файл.
314 | * @param {callback:wavCallback} cb Функция, в которую будет передан объект Blob с файлом.
315 | * @param {Number} [channelCount=1] Количество каналов в wav-файле: 1 — mono, 2 — stereo.
316 | */
317 | exportWav: function (cb, channelCount) {
318 | if (typeof cb !== 'undefined') {
319 | this.currCallback = cb;
320 | } else {
321 | this.currCallback = null;
322 | }
323 | var type = 'audio/wav';
324 |
325 | if (!this.currCallback) {throw new Error('Callback not set');}
326 |
327 | var exportCommand = 'export' + (channelCount != 2 && 'Mono' || '') + 'WAV';
328 |
329 | this.worker.postMessage({
330 | command: exportCommand,
331 | type: type
332 | });
333 | }
334 | };
335 |
336 | namespace.ya.speechkit.Recorder = Recorder;
337 |
338 | namespace.ya.speechkit.getUserMedia = navigator.getUserMedia ||
339 | navigator.mozGetUserMedia ||
340 | navigator.msGetUserMedia ||
341 | navigator.webkitGetUserMedia;
342 |
343 | namespace.ya.speechkit.mediaDevices = (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) ?
344 | navigator.mediaDevices :
345 | (namespace.ya.speechkit.getUserMedia ? {
346 | getUserMedia: function (c) {
347 | return new Promise(function (y, n) {
348 | namespace.ya.speechkit.getUserMedia.call(navigator, c, y, n);
349 | });
350 | }
351 | } : null);
352 |
353 | namespace.ya.speechkit._stream = null;
354 | namespace.ya.speechkit.audiocontext = null;
355 |
356 | /**
357 | * Запрашивает у пользователя права для записи звука с микрофона.
358 | * @param {callback:initSuccessCallback} initSuccess Функция-обработчик, которая будет вызвана при успешном подключении к микрофону.
359 | * @param {callback:initFailCallback} initFail Функция-обработчик, в которую будет передано сообщение об ошибке, в случае неуспеха.
360 | */
361 | namespace.ya.speechkit.initRecorder = function (initSuccess, initFail)
362 | {
363 | var badInitialization = function (err) {
364 | namespace.ya.speechkit._stream = null;
365 | if (typeof initFail !== 'undefined') {
366 | initFail(err);
367 | }
368 | };
369 |
370 | if (namespace.ya.speechkit.mediaDevices)
371 | {
372 | namespace.ya.speechkit.mediaDevices.getUserMedia(
373 | {audio: true}).then(
374 | function (stream) {
375 | namespace.ya.speechkit._stream = stream;
376 | if (typeof initSuccess !== 'undefined') {
377 | initSuccess();
378 | }
379 | }).catch(
380 | function (err) {
381 | badInitialization(err.message || err.name || err);
382 | });
383 | } else {
384 | badInitialization('Your browser doesn\'t support Web Audio API. ' +
385 | 'Please, use Yandex.Browser: https://browser.yandex.ru');
386 | }
387 | };
388 |
389 | /**
390 | * Поддерживается ли распознавание заданного языка.
391 | * @return true, если язык поддерживается, false — иначе.
392 | */
393 | namespace.ya.speechkit.isLanguageSupported = function (lang)
394 | {
395 | if (namespace.ya.speechkit.settings.langWhitelist.indexOf(lang) >= 0) {
396 | return namespace.ya.speechkit.isSupported();
397 | } else {
398 | return namespace.ya.speechkit.isWebAudioSupported();
399 | }
400 | };
401 |
402 | /**
403 | * Поддерживаются ли технологии распознавания Яндекса.
404 | * @return true, если поддерживаются, false — иначе.
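 * @example
 * // sketch: guard speech features behind the support check ('YOUR_API_KEY' is a placeholder)
 * if (ya.speechkit.isSupported()) {
 *     ya.speechkit.recognize({
 *         apikey: 'YOUR_API_KEY',
 *         doneCallback: function (text) { console.log(text); }
 *     });
 * }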
405 | */
406 | namespace.ya.speechkit.isSupported = function ()
407 | {
408 | var userAgent = navigator.userAgent.toLowerCase();
409 | // Yandex recognition is 100% supported on mobile devices only in firefox
410 | return ((namespace.ya.speechkit.mediaDevices !== null) &&
411 | ((/mozilla|firefox/.test(userAgent) && !/yabrowser/.test(userAgent)) ||
412 | !/iphone|ipod|ipad|android|blackberry/.test(userAgent)));
413 | };
414 |
415 | /**
416 | * Поддерживается ли распознавание с помощью WebAudio API.
417 | * @return true, если поддерживается, false — иначе.
418 | */
419 | namespace.ya.speechkit.isWebAudioSupported = function ()
420 | {
421 | var userAgent = navigator.userAgent.toLowerCase();
422 | var SpeechRecognition = namespace.SpeechRecognition || namespace.webkitSpeechRecognition;
423 | // Native recognition is only supported in original chrome and chromium
424 | return (typeof SpeechRecognition !== 'undefined' && !/yabrowser|opera|opr/.test(userAgent));
425 | };
426 |
427 |
428 | /**
429 | * Функция, которая будет вызвана по факту успешного получения прав на доступ к микрофону.
430 | * @callback
431 | * @name initSuccessCallback
432 | * @memberof Recorder
433 | */
434 |
435 | /**
436 | * Функция-обработчик, которая будет вызвана при возникновении ошибки при получении доступа к микрофону.
437 | * @callback initFailCallback
438 | * @param {String} error Сообщение об ошибке.
439 | * @memberof Recorder
440 | */
441 |
442 | /**
443 | * Функция-обработчик, в которую будет передан Blob с wav-файлом.
444 | * @callback
445 | * @name wavCallback
446 | * @param {Blob с MIME типом audio/wav} data wav-файл.
447 | * @memberof Recorder
448 | */
449 |
450 | /**
451 | * Функция-обработчик, в которую будут переданы буферы записанного аудио.
452 | * @callback
453 | * @name buffersCallback
454 | * @param {Float32Array[]} buffers Буферы записанного аудио для двух каналов (левого и правого).
455 | * @memberof Recorder
456 | */
457 |
458 | /**
459 | * Функция, которая будет вызвана после очистки буферов (это сигнал готовности к повторному запуску).
460 | * @callback
461 | * @name clearCallback
462 | * @memberof Recorder
463 | */
464 |
465 | /**
466 | * Функция-обработчик, в которую будет передаваться записанный аудио-поток.
467 | * @callback
468 | * @name streamCallback
469 | * @param {ArrayBuffer} stream Записанный PCM поток 16-bit.
470 | * @memberof Recorder 471 | */ 472 | 473 | }(this)); 474 | -------------------------------------------------------------------------------- /webspeechkit/src/recorderWorker.js: -------------------------------------------------------------------------------- 1 | (function (namespace) { 2 | 'use strict'; 3 | 4 | if (typeof namespace.ya === 'undefined') { 5 | namespace.ya = {}; 6 | } 7 | if (typeof namespace.ya.speechkit === 'undefined') { 8 | namespace.ya.speechkit = {}; 9 | } 10 | 11 | function _makeWorker(script) { 12 | var URL = window.URL || window.webkitURL; 13 | var Blob = window.Blob; 14 | var Worker = window.Worker; 15 | 16 | if (!URL || !Blob || !Worker || !script) { 17 | return null; 18 | } 19 | 20 | var blob = new Blob([script], {type: 'application/javascript'}); 21 | var worker = new Worker(URL.createObjectURL(blob)); 22 | return worker; 23 | } 24 | 25 | var inline_worker = 26 | "function iirFilter (sampleRate, cutoff, resonance, type) {" + 27 | "" + 28 | " var self = this," + 29 | " f = [0.0, 0.0, 0.0, 0.0]," + 30 | " freq, damp," + 31 | " prevCut, prevReso," + 32 | "" + 33 | " sin = Math.sin," + 34 | " min = Math.min," + 35 | " pow = Math.pow;" + 36 | "" + 37 | " self.cutoff = cutoff || 20000;" + 38 | " self.resonance = resonance || 0.1;" + 39 | " self.samplerate = sampleRate || 44100;" + 40 | " self.type = type || 0;" + 41 | "" + 42 | " function calcCoeff () {" + 43 | " freq = 2 * sin(Math.PI * min(0.25, self.cutoff / (self.samplerate * 2)));" + 44 | " damp = min(2 * (1 - pow(self.resonance, 0.25)), min(2, 2 / freq - freq * 0.5));" + 45 | " }" + 46 | "" + 47 | " self.pushSample = function (sample) {" + 48 | " if (prevCut !== self.cutoff || prevReso !== self.resonance){" + 49 | " calcCoeff();" + 50 | " prevCut = self.cutoff;" + 51 | " prevReso = self.resonance;" + 52 | " }" + 53 | "" + 54 | " f[3] = sample - damp * f[2];" + 55 | " f[0] = f[0] + freq * f[2];" + 56 | " f[1] = f[3] - f[0];" + 57 | " f[2] = freq * f[1] + f[2];" + 58 | "" + 59 | " f[3] = sample - damp * f[2];" + 60 | " f[0] = f[0] + freq * f[2];" + 61 | " f[1] = f[3] - f[0];" + 62 | " f[2] = freq * f[1] + f[2];" + 63 | "" + 64 | " return f[self.type];" + 65 | " };" + 66 | "" + 67 | " self.getMix = function (type) {" + 68 | " return f[type || self.type];" + 69 | " };" + 70 | "}" + 71 | "" + 72 | "var speex_loaded = false;" + 73 | "var recLength = 0;" + 74 | "var recBuffersL = [];" + 75 | "var recBuffersR = [];" + 76 | "var sampleRate;" + 77 | "var outSampleRate;" + 78 | "var tmp_buf = 0;" + 79 | "var need_buf_size = 4096;" + 80 | "var speex_converter = null;" + 81 | " " + 82 | "this.onmessage = function (e) {" + 83 | " switch (e.data.command) {" + 84 | " case 'init':" + 85 | " init(e.data.config);" + 86 | " break;" + 87 | " case 'record':" + 88 | " record(e.data.buffer);" + 89 | " break;" + 90 | " case 'exportWAV':" + 91 | " exportWAV(e.data.type);" + 92 | " break;" + 93 | " case 'exportMonoWAV':" + 94 | " exportMonoWAV(e.data.type);" + 95 | " break;" + 96 | " case 'getBuffers':" + 97 | " getBuffers();" + 98 | " break;" + 99 | " case 'clear':" + 100 | " clear();" + 101 | " break;" + 102 | " }" + 103 | "};" + 104 | " " + 105 | "function init(config) {" + 106 | " sampleRate = config.sampleRate;" + 107 | " outSampleRate = config.format.sampleRate || sampleRate;" + 108 | " need_buf_size = config.format.bufferSize || 4096;" + 109 | " speex_converter = null;" + 110 | " /*if (config.format.format == \'speex\') {" + 111 | " if (!speex_loaded) {" + 112 | " importScripts(\'./speex.min.js\');" + 113 | " speex_loaded = 
true;" + 114 | " }" + 115 | " need_buf_size /= 16;" + 116 | " speex_converter = new SpeexConverter(outSampleRate);" + 117 | " }*/" + 118 | "}" + 119 | "" + 120 | "var resample = function (inbuf) {" + 121 | " var speed = 1.0 * sampleRate / outSampleRate;" + 122 | " var l = Math.ceil(inbuf.length / speed);" + 123 | " var result = new Float32Array(l);" + 124 | " var bin = 0;" + 125 | " var num = 0;" + 126 | " var indexIn = 0;" + 127 | " var indexOut = 0;" + 128 | " for (indexOut = 1, indexIn = speed; indexOut < l - 1; indexIn += speed, indexOut++) {" + 129 | " var pos = Math.floor(indexIn);" + 130 | " var dt = indexIn - pos;" + 131 | " var second = (pos + 1 < inbuf.length) ? pos + 1 : inbuf.length - 1; " + 132 | " result[indexOut] = inbuf[pos] * (1 - dt) + inbuf[second] * dt;" + 133 | " }" + 134 | " result[0] = inbuf[0];" + 135 | " result[l - 1] = inbuf[inbuf.length - 1];" + 136 | " return result;" + 137 | "};" + 138 | " " + 139 | "function record(inputBuffer) {" + 140 | " if (outSampleRate == sampleRate) {" + 141 | " recBuffersL.push(inputBuffer[0]);" + 142 | " recBuffersR.push(inputBuffer[1]);" + 143 | " recLength += inputBuffer[0].length;" + 144 | " " + 145 | " var samples = inputBuffer[0];" + 146 | " var buffer = new ArrayBuffer(samples.length * 2);" + 147 | " var view = new DataView(buffer);" + 148 | " floatTo16BitPCM(view, 0, samples);" + 149 | " this.postMessage({command: 'int16stream', buffer: buffer});" + 150 | " } else {" + 151 | " var filter0 = new iirFilter(outSampleRate, outSampleRate * 0.125, 0.0); " + 152 | " var filter1 = new iirFilter(outSampleRate, outSampleRate * 0.125, 0.0); " + 153 | "" + 154 | " for (var i =0; i < inputBuffer[0].length; i++) { " + 155 | " inputBuffer[0][i] = filter0.pushSample(inputBuffer[0][i]); " + 156 | " inputBuffer[1][i] = filter1.pushSample(inputBuffer[1][i]); " + 157 | " }" + 158 | "" + 159 | " var resin0 = resample(inputBuffer[0], outSampleRate, sampleRate);" + 160 | " var resin1 = resample(inputBuffer[1], outSampleRate, sampleRate);" + 161 | " " + 162 | " recBuffersL.push(resin0);" + 163 | " recBuffersR.push(resin1);" + 164 | " recLength += resin0.length;" + 165 | " " + 166 | " var result = new Int16Array(resin0.length);" + 167 | " " + 168 | " for (var i = 0 ; i < resin0.length ; i++) {" + 169 | " result[i] = Math.floor(Math.min(Math.max((resin0[i] + resin1[i]) * 0.5, -1.0), 1.0) * 16383);" + 170 | " }" + 171 | " " + 172 | " if (speex_converter) {" + 173 | " result = speex_converter.convert(result);" + 174 | " } else {" + 175 | " result = result.buffer;" + 176 | " }" + 177 | " " + 178 | " if (!tmp_buf) {" + 179 | " tmp_buf = result;" + 180 | " } else {" + 181 | " var tmp = new DataView(new ArrayBuffer(tmp_buf.byteLength + result.byteLength));" + 182 | " tmp_buf = new DataView(tmp_buf);" + 183 | " result = new DataView(result);" + 184 | " " + 185 | " for (i = 0; i < tmp_buf.byteLength; i++) {" + 186 | " tmp.setUint8(i, tmp_buf.getUint8(i));" + 187 | " }" + 188 | " " + 189 | " for (i = 0; i < result.byteLength; i++) {" + 190 | " tmp.setUint8(i + tmp_buf.byteLength, result.getUint8(i));" + 191 | " }" + 192 | " " + 193 | " tmp_buf = tmp.buffer;" + 194 | " }" + 195 | " " + 196 | " if (tmp_buf.byteLength >= need_buf_size) {" + 197 | " this.postMessage({command: 'int16stream', buffer: tmp_buf});" + 198 | " tmp_buf = false;" + 199 | " }" + 200 | " }" + 201 | "}" + 202 | " " + 203 | "function exportWAV(type) {" + 204 | " var bufferL = mergeBuffers(recBuffersL, recLength);" + 205 | " var bufferR = mergeBuffers(recBuffersR, recLength);" + 206 | " var 
interleaved = interleave(bufferL, bufferR);" + 207 | " var dataview = encodeWAV(interleaved);" + 208 | " var audioBlob = new Blob([dataview], {type: type});" + 209 | " " + 210 | " this.postMessage({command: 'exportWAV', blob: audioBlob});" + 211 | "}" + 212 | " " + 213 | "function exportMonoWAV(type) {" + 214 | " var bufferL = mergeBuffers(recBuffersL, recLength);" + 215 | " var dataview = encodeWAV(bufferL, true);" + 216 | " var audioBlob = new Blob([dataview], {type: type});" + 217 | " " + 218 | " this.postMessage({command: 'exportMonoWAV', blob: audioBlob});" + 219 | "}" + 220 | " " + 221 | "function getBuffers() {" + 222 | " var buffers = [];" + 223 | " buffers.push(mergeBuffers(recBuffersL, recLength));" + 224 | " buffers.push(mergeBuffers(recBuffersR, recLength));" + 225 | " this.postMessage({command: 'getBuffers', blob: buffers});" + 226 | "}" + 227 | " " + 228 | "function clear() {" + 229 | " recLength = 0;" + 230 | " recBuffersL = [];" + 231 | " recBuffersR = [];" + 232 | " if (speex_converter) {" + 233 | " speex_converter.clear();" + 234 | " }" + 235 | " this.postMessage({command: 'clear'});" + 236 | "}" + 237 | " " + 238 | "function mergeBuffers(recBuffers, recLength) {" + 239 | " var result = new Float32Array(recLength);" + 240 | " var offset = 0;" + 241 | " for (var i = 0; i < recBuffers.length; i++){" + 242 | " result.set(recBuffers[i], offset);" + 243 | " offset += recBuffers[i].length;" + 244 | " }" + 245 | " return result;" + 246 | "}" + 247 | " " + 248 | "function interleave(inputL, inputR) {" + 249 | " var length = inputL.length + inputR.length;" + 250 | " var result = new Float32Array(length);" + 251 | " " + 252 | " var index = 0;" + 253 | " var inputIndex = 0;" + 254 | " " + 255 | " while (index < length){" + 256 | " result[index++] = inputL[inputIndex];" + 257 | " result[index++] = inputR[inputIndex];" + 258 | " inputIndex++;" + 259 | " }" + 260 | " return result;" + 261 | "}" + 262 | " " + 263 | "function floatTo16BitPCM(output, offset, input) {" + 264 | " for (var i = 0; i < input.length; i++, offset += 2){" + 265 | " var s = Math.max(-1, Math.min(1, input[i]));" + 266 | " output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);" + 267 | " }" + 268 | "}" + 269 | " " + 270 | "function writeString(view, offset, string) {" + 271 | " for (var i = 0; i < string.length; i++){" + 272 | " view.setUint8(offset + i, string.charCodeAt(i));" + 273 | " }" + 274 | "}" + 275 | " " + 276 | "function encodeWAV(samples, mono) {" + 277 | " var buffer = new ArrayBuffer(44 + samples.length * 2);" + 278 | " var view = new DataView(buffer);" + 279 | " " + 280 | " /* RIFF identifier */" + 281 | " writeString(view, 0, 'RIFF');" + 282 | " /* file length */" + 283 | " view.setUint32(4, 32 + samples.length * 2, true);" + 284 | " /* RIFF type */" + 285 | " writeString(view, 8, 'WAVE');" + 286 | " /* format chunk identifier */" + 287 | " writeString(view, 12, 'fmt ');" + 288 | " /* format chunk length */" + 289 | " view.setUint32(16, 16, true);" + 290 | " /* sample format (raw) */" + 291 | " view.setUint16(20, 1, true);" + 292 | " /* channel count */" + 293 | " view.setUint16(22, mono ? 1 : 2, true);" + 294 | " /* sample rate */" + 295 | " view.setUint32(24, outSampleRate, true);" + 296 | " /* block align (channel count * bytes per sample) */" + 297 | " var block_align = mono ? 
2 : 4;" +
298 | "    /* byte rate (sample rate * block align) */" +
299 | "    view.setUint32(28, outSampleRate * block_align, true);" +
300 | "    /* block align (channel count * bytes per sample) */" +
301 | "    view.setUint16(32, block_align, true);" +
302 | "    /* bits per sample */" +
303 | "    view.setUint16(34, 16, true);" +
304 | "    /* data chunk identifier */" +
305 | "    writeString(view, 36, 'data');" +
306 | "    /* data chunk length */" +
307 | "    view.setUint32(40, samples.length * 2, true);" +
308 | "    " +
309 | "    floatTo16BitPCM(view, 44, samples);" +
310 | "    " +
311 | "    return view;" +
312 | "}" +
313 | " ";
314 |
315 | namespace.ya.speechkit.newWorker = function () {
316 | return _makeWorker(inline_worker);
317 | };
318 | }(this));
319 |
320 |
-------------------------------------------------------------------------------- /webspeechkit/src/speechrecognition.js: --------------------------------------------------------------------------------
1 | (function (namespace) {
2 | 'use strict';
3 |
4 | if (typeof namespace.ya === 'undefined') {
5 | namespace.ya = {};
6 | }
7 | if (typeof namespace.ya.speechkit === 'undefined') {
8 | namespace.ya.speechkit = {};
9 | }
10 |
11 | function noop() {}
12 |
13 | /**
14 | * Параметры по умолчанию для SpeechRecognition
15 | * @private
16 | */
17 | namespace.ya.speechkit._defaultOptions = function () {
18 | /**
19 | * @typedef {Object} SpeechRecognitionOptions
20 | * @property {SpeechRecognition~initCallback} initCallback - Функция, которая будет вызвана по факту инициализации сессии распознавания
21 | * @property {SpeechRecognition~errorCallback} errorCallback - Функция, которая будет вызвана по факту ошибки (все ошибки критичны и приводят к порче сессии)
22 | * @property {SpeechRecognition~dataCallback} dataCallback - Функция, в которую будут приходить результаты распознавания
23 | * @property {SpeechRecognition~infoCallback} infoCallback - Функция для технической информации
24 | * @property {SpeechRecognition~stopCallback} stopCallback - Функция, которая будет вызвана в момент остановки сессии распознавания
25 | * @property {Boolean} punctuation - Следует ли пытаться расставлять знаки препинания
26 | * @property {Boolean} allowStrongLanguage - Следует ли отключить фильтрацию обсценной лексики
27 | * @property {String} model - Языковая модель для распознавания речи
28 | * @property {String} lang - Язык, речь на котором следует распознавать
29 | * @property {ya.speechkit.FORMAT} format - Формат передачи аудио сигнала
30 | * @property {String} [options.applicationName] Название приложения. Для некоторых приложений мы поддерживаем специальную логику. Пример - sandbox.
31 | */
32 | return {
33 | initCallback: noop,
34 | errorCallback: noop,
35 | dataCallback: noop,
36 | infoCallback: noop,
37 | stopCallback: noop,
38 | punctuation: false,
39 | allowStrongLanguage: false,
40 | model: namespace.ya.speechkit.settings.model,
41 | applicationName: namespace.ya.speechkit.settings.applicationName,
42 | lang: namespace.ya.speechkit.settings.lang,
43 | format: namespace.ya.speechkit.FORMAT.PCM16,
44 | url: namespace.ya.speechkit.settings.websocketProtocol +
45 | namespace.ya.speechkit.settings.asrUrl,
46 | vad: false,
47 | speechStart: noop,
48 | speechEnd: noop,
49 | };
50 | };
51 |
52 | /**
53 | * Создает новый объект типа SpeechRecognition.
54 | * @class Класс для распознавания большого потока аудио-сигнала.
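 * @example
 * // sketch: continuous recognition with explicit callbacks ('YOUR_API_KEY' is a placeholder)
 * var rec = new ya.speechkit.SpeechRecognition();
 * rec.start({
 *     apikey: 'YOUR_API_KEY',
 *     dataCallback: function (text, uttr, merge) { console.log(text); },
 *     errorCallback: function (err) { console.log(err); }
 * });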
55 | * @name SpeechRecognition 56 | */ 57 | var SpeechRecognition = function () { 58 | if (!(this instanceof namespace.ya.speechkit.SpeechRecognition)) { 59 | return new namespace.ya.speechkit.SpeechRecognition(); 60 | } 61 | this.send = 0; 62 | this.send_bytes = 0; 63 | this.proc = 0; 64 | this.recorder = null; 65 | this.recognizer = null; 66 | this.vad = null; 67 | }; 68 | 69 | SpeechRecognition.prototype = /** @lends SpeechRecognition.prototype */ { 70 | /** 71 | * Запускает процесс распознавания речи. 72 | * @param {Object} [options] Параметры, которые будут использоваться во время сессии. 73 | * @param {callback:initCallback} [options.initCallback] Функция-обработчик, которая будет вызвана по факту инициализации сессии распознавания. 74 | * @param {callback:errorCallback} [options.errorCallback] Функция-обработчик, которая будет вызвана по факту ошибки (все ошибки критичны и приводят к завершению сессии). 75 | * @param {callback:dataCallback} [options.dataCallback] Функция-обработчик, которая будет вызвана после успешного завершения 76 | * распознавания. В качестве аргумента ей передаются результаты распознавания. 77 | * @param {callback:infoCallback} [options.infoCallback] Функция для получения технической информации. 78 | * @param {callback:stopCallback} [options.stopCallback] Функция-обработчик, которая будет вызвана в момент остановки сессии распознавания. 79 | * @param {String} [options.apikey] API-ключ. Если не задан, то используется ключ, указанный 80 | * в глобальных настройках {@link settings}. 81 | * @param {Boolean} [options.punctuation=false] Следует ли использовать пунктуацию. 82 | * @param {Boolean} [options.allowStrongLanguage=false] Следует ли отключить фильтрацию обсценной лексики. 83 | * @param {String} [options.model='notes'] Языковая модель для распознавания речи. Список доступных значений: 84 | *
85 | * - 'notes' (по умолчанию) — общая лексика;
86 | * - 'queries' — короткие запросы;
87 | * - 'names' — имена;
88 | * - 'dates' — даты;
89 | * - 'maps' — топонимы;
90 | * - 'notes' — тексты;
91 | * - 'numbers' — числа.
92 | *
93 | * Если параметр не указан, то используется
94 | * значение, заданное в глобальных настройках {@link settings}. Если в настройках значение не задано, то
95 | * используется модель по умолчанию — 'notes'.
96 | * @param {String} [options.applicationName] Название приложения. Для некоторых приложений мы поддерживаем специальную логику. Пример - sandbox.
97 | * @param {String} [options.lang='ru-RU'] Язык, речь на котором следует распознавать. Возможные значения: 'ru-RU', 'en-US', 'tr-TR'.
98 | * Если параметр не указан, то используется
99 | * значение, заданное в глобальных настройках {@link settings}. Если в настройках значение не задано, то по умолчанию
100 | * выбирается русский язык: 'ru-RU'.
101 | * @param {ya.speechkit.FORMAT} [options.format=ya.speechkit.FORMAT.PCM16] Формат передачи аудио-сигнала. 102 | * @param {Boolean} [options.partialResults=true] Отправлять ли на сервер промежуточные результаты. 103 | * @param {Number} [options.utteranceSilence=120] Длительность промежутка тишины во время записи речи (в десятках миллисекунд). Как только встречается 104 | * такой перерыв в речи, запись звука останавливается, и записанный фрагмент речи отправляется на сервер. 105 | */ 106 | start: function (options) { 107 | this.options = namespace.ya.speechkit._extend( 108 | namespace.ya.speechkit._extend( 109 | {}, 110 | namespace.ya.speechkit._defaultOptions() 111 | ), 112 | options); 113 | if (namespace.ya.speechkit.settings.langWhitelist.indexOf(this.options.lang) >= 0) { 114 | if (namespace.ya.speechkit._stream !== null) { 115 | this._onstart(); 116 | } else { 117 | namespace.ya.speechkit.initRecorder( 118 | this._onstart.bind(this), 119 | this.options.errorCallback 120 | ); 121 | } 122 | } else { 123 | var old_error_callback = this.options.errorCallback; 124 | this.recorder = namespace.ya.speechkit.WebAudioRecognition( 125 | namespace.ya.speechkit._extend( 126 | this.options, 127 | { 128 | errorCallback: function (e) { 129 | this.recorder = null; 130 | old_error_callback(e); 131 | }.bind(this) 132 | } 133 | )); 134 | this.recorder.start(); 135 | } 136 | }, 137 | /** 138 | * Will be called after successful call of initRecorder 139 | * @private 140 | */ 141 | _onstart: function () { 142 | if (this.recorder && this.recorder.isPaused()) { 143 | this.recorder.start(); 144 | } 145 | 146 | if (this.recognizer) { 147 | return; 148 | } 149 | 150 | this.send = 0; 151 | this.send_bytes = 0; 152 | this.proc = 0; 153 | 154 | if (!this.recorder) { 155 | this.recorder = new namespace.ya.speechkit.Recorder(); 156 | if (this.options.vad) { 157 | this.vad = new namespace.ya.speechkit.Vad({recorder: this.recorder, 158 | speechStart: this.options.speechStart, 159 | speechEnd: this.options.speechEnd}); 160 | } 161 | } 162 | 163 | this.recognizer = new namespace.ya.speechkit.Recognizer( 164 | namespace.ya.speechkit._extend(this.options, 165 | { 166 | onInit: function (sessionId, code) { 167 | this.recorder.start(function (data) { 168 | if (this.options.vad && this.vad) { 169 | this.vad.update(); 170 | } 171 | this.send++; 172 | this.send_bytes += data.byteLength; 173 | this.options.infoCallback({ 174 | send_bytes: this.send_bytes, 175 | format: this.options.format, 176 | send_packages: this.send, 177 | processed: this.proc 178 | }); 179 | this.recognizer.addData(data); 180 | }.bind(this), this.options.format); 181 | 182 | this.options.initCallback(sessionId, code, 'yandex'); 183 | }.bind(this), 184 | onResult: function (text, uttr, merge, words) { 185 | this.proc += merge; 186 | this.options.infoCallback({ 187 | send_bytes: this.send_bytes, 188 | format: this.options.format, 189 | send_packages: this.send, 190 | processed: this.proc 191 | }); 192 | this.options.dataCallback(text, uttr, merge, words); 193 | }.bind(this), 194 | onError: function (msg) { 195 | if (this.recorder) { 196 | this.recorder.stop(function () { this.recorder = null; }.bind(this)); 197 | } 198 | if (this.recognizer) { 199 | this.recognizer.close(); 200 | this.recognizer = null; 201 | } 202 | this.options.errorCallback(msg); 203 | }.bind(this), 204 | })); 205 | this.recognizer.start(); 206 | }, 207 | /** 208 | * Завершает сессию распознавания речи. 209 | * По завершении сессии будет вызвана функция-обработчик stopCallback. 
210 | */
211 | stop: function () {
212 | if (this.recognizer) {
213 | this.recognizer.finish();
214 | }
215 |
216 | if (this.recorder) {
217 | this.recorder.stop(
218 | function () {
219 | this.recognizer = null;
220 | this.recorder = null;
221 | }.bind(this)
222 | );
223 | }
224 | },
225 | /**
226 | * Прерывает сессию распознавания речи (не дожидается финального результата распознавания).
227 | * По завершении сессии будет вызвана функция-обработчик stopCallback.
228 | */
229 | abort: function () {
230 | if (this.recognizer) {
231 | this.recognizer.close();
232 | }
233 | if (this.recorder) {
234 | this.recorder.stop(
235 | function () {
236 | this.recognizer = null;
237 | this.recorder = null;
238 | }.bind(this)
239 | );
240 | }
241 | },
242 | /**
243 | * Ставит сессию распознавания на паузу.
244 | * Чтобы соединение с сервером не прерывалось и можно было моментально возобновить распознавание,
245 | * на сервер периодически посылаются небольшие куски данных.
246 | */
247 | pause: function () {
248 | if (this.recorder) {
249 | this.recorder.pause();
250 | }
251 | },
252 | /**
253 | * Определяет, стоит ли на паузе сессия распознавания.
254 | * @returns {Boolean} true, если сессия распознавания речи стоит на паузе, false — иначе.
255 | */
256 | isPaused: function () {
257 | return (!this.recorder || this.recorder.isPaused());
258 | }
259 | };
260 |
261 | namespace.ya.speechkit.SpeechRecognition = SpeechRecognition;
262 |
263 | /**
264 | * Функция для распознавания коротких фрагментов речи.
265 | * При вызове функции recognize() начинается запись звука с микрофона.
266 | * Как только наступает тишина более чем на одну секунду, запись
267 | * прекращается, и функция отправляет запрос на сервер для распознавания записанного фрагмента.
268 | * Приемлемое качество распознавания обеспечивается на фрагментах длительностью не более 10 секунд.
269 | * При более длительном фрагменте качество распознавания ухудшается.
270 | * @static
271 | * @function
272 | * @name recognize
273 | * @param {Object} [options] Параметры распознавания речи.
274 | * @param {callback:SpeechRecognition.initCallback} [options.initCallback] Функция-обработчик, которая будет вызвана по факту
275 | * инициализации сессии распознавания.
276 | * @param {callback:SpeechRecognition.errorCallback} [options.errorCallback] Функция-обработчик, которая будет вызвана при возникновении ошибки
277 | * (все ошибки критичны и приводят к завершению сессии).
278 | * @param {callback:SpeechRecognition.recognitionDoneCallback} [options.doneCallback] Функция-обработчик, в которую будет отправлен результат распознавания речи.
279 | * @param {String} [options.apikey] API-ключ. По умолчанию принимает значение, указанное
280 | * в глобальных настройках {@link settings}.
281 | * @param {String} [options.model='notes'] Список доступных значений:
282 | *
283 | * - 'notes' (по умолчанию) — текст;
284 | * - 'queries' — короткие запросы;
285 | * - 'names' — имена;
286 | * - 'dates' — даты;
287 | * - 'maps' — топонимы;
288 | * - 'notes' — тексты;
289 | * - 'numbers' — числа.
290 | *
291 | * Если параметр не указан, то используется
292 | * значение, заданное в глобальных настройках {@link settings}. Если в настройках значение не задано, то
293 | * используется модель по умолчанию — 'notes'.
294 | * @param {String} [options.applicationName] Название приложения. Для некоторых приложений мы поддерживаем специальную логику. Пример — sandbox.
295 | * @param {String} [options.lang='ru-RU'] Язык, речь на котором следует распознавать. Возможные значения: 'ru-RU', 'en-US', 'tr-TR'.
296 | * Если параметр не указан, то используется
297 | * значение, заданное в глобальных настройках {@link settings}. Если в настройках значение не задано, то по умолчанию
298 | * выбирается русский язык: 'ru-RU'.
299 | * @param {Boolean} [options.partialResults=true] Отправлять ли на сервер промежуточные результаты. 300 | * @param {Number} [options.utteranceSilence=120] Длительность промежутка тишины во время записи речи (в десятках миллисекунд). Как только встречается 301 | * такой перерыв в речи, запись звука останавливается, и записанный фрагмент речи отправляется на сервер. 302 | */ 303 | 304 | namespace.ya.speechkit.recognize = function (options) { 305 | var dict = new namespace.ya.speechkit.SpeechRecognition(); 306 | 307 | var opts = namespace.ya.speechkit._extend( 308 | namespace.ya.speechkit._extend( 309 | {}, 310 | namespace.ya.speechkit._defaultOptions() 311 | ), 312 | options); 313 | 314 | opts.doneCallback = options.doneCallback; 315 | 316 | opts.dataCallback = function (text, uttr, merge) { 317 | if (uttr) { 318 | if (opts.doneCallback) { 319 | opts.doneCallback(text); 320 | } 321 | dict.stop(); 322 | } 323 | }; 324 | 325 | opts.stopCallback = function () { 326 | dict = null; 327 | }; 328 | 329 | dict.start(opts); 330 | }; 331 | 332 | /** 333 | * Функция, в которую передается полностью распознанный фрагмент текста. 334 | * @param {String} text Распознанная речь. 335 | * @callback 336 | * @name recognitionDoneCallback 337 | * @memberOf SpeechRecognition 338 | */ 339 | 340 | /** 341 | * Функция, которая будет вызвана после успешной инициализации сессии распознавания речи. 342 | * @callback 343 | * @name initCallback 344 | * @memberOf SpeechRecognition 345 | * @param {String} sessionId Идентификатор сессии. 346 | * @param {Number} code HTTP-статус, который будет содержаться в ответе сервера (200 в случае успеха). 347 | */ 348 | 349 | /** 350 | * Функция, в которую будут переданы сообщения об ошибках. 351 | * @callback 352 | * @name errorCallback 353 | * @memberOf SpeechRecognition 354 | * @param {String} message Текст сообщения об ошибке. 355 | */ 356 | 357 | /** 358 | * Функция для результатов распознавания речи. 359 | * @callback 360 | * @name dataCallback 361 | * @memberOf SpeechRecognition 362 | * @param {String} text Распознанный текст. 363 | * @param {Boolean} utterance Является ли данный текст финальным результатом распознавания. 364 | * @param {Number} merge Число обработанных запросов, по которым выдан ответ от сервера. 365 | */ 366 | 367 | /** 368 | * В эту функцию будет передаваться техническая информация. 369 | * @callback 370 | * @name infoCallback 371 | * @memberOf SpeechRecognition. 372 | * @param {Number} send_bytes Сколько байт аудио-данных было передано на сервер. 373 | * @param {Number} send_packages Сколько пакетов аудио-данных было передано на сервер. 374 | * @param {Number} processed Количество пакетов, на которые ответил сервер. 375 | * @param {ya.speechkit.FORMAT} format Какой формат аудио используется. 376 | */ 377 | 378 | /** 379 | * Функция, которая будет вызвана после остановки сессии распознавания речи. 
380 | * @callback 381 | * @name stopCallback 382 | * @memberOf SpeechRecognition 383 | */ 384 | }(this)); 385 | -------------------------------------------------------------------------------- /webspeechkit/src/textline.js: -------------------------------------------------------------------------------- 1 | (function (namespace) { 2 | 'use strict'; 3 | 4 | if (typeof namespace.ya === 'undefined') { 5 | namespace.ya = {}; 6 | } 7 | if (typeof namespace.ya.speechkit === 'undefined') { 8 | namespace.ya.speechkit = {}; 9 | } 10 | 11 | namespace.ya.speechkit._mic_on = '' + 16 | ' ' + 17 | ' ' + 18 | ' ' + 19 | ' ' + 20 | ' ' + 21 | ' ' + 23 | ' ' + 25 | ' ' + 27 | ' ' + 28 | ' ' + 29 | ' ' + 30 | ' ' + 31 | ' '; 32 | 33 | namespace.ya.speechkit._mic_off = '' + 38 | ' ' + 39 | ' ' + 40 | ' ' + 42 | ' ' + 44 | ' ' + 46 | ' ' + 47 | ' ' + 48 | ' ' + 49 | ' ' + 50 | ' '; 51 | 52 | /** 53 | * @name Textline 54 | * @class Класс для добавления элемента управления "Поле для голосового ввода". 55 | * @param {String} target Идентификатор div-контейрена, в котором будет размещен элемент управления. 56 | * @param {Object} [options] Опции распознавания. 57 | * @param {Object} [options.onInputFinished] Функция, которая будет вызвана после завершения распознавания. В качесве ее 58 | * аргументов передается финальный распознанный текст. 59 | * @param {String} [options.apikey] API-ключ. Если не задан, то используется ключ, указанный 60 | * в глобальных настройках {@link settings}. 61 | * @param {Boolean} [options.allowStrongLanguage=false] Следует ли отключить фильтрацию обсценной лексики. 62 | * @param {String} [options.model='notes'] Языковая модель для распознавания речи. Список доступных значений: 63 | *
     * <ul>
     * <li>'notes' (default) - plain text;</li>
     * <li>'queries' - short queries;</li>
     * <li>'names' - names;</li>
     * <li>'dates' - dates;</li>
     * <li>'maps' - place names;</li>
     * <li>'numbers' - numbers.</li>
     * </ul>
     * <br/>
     * If the parameter is omitted, the value specified in the global {@link settings} is used.
     * If no value is set there either, the default model 'notes' is used.
     * @param {String} [options.lang='ru-RU'] Language to recognize. Possible values: 'ru-RU', 'en-US', 'tr-TR', 'uk-UA'.
     * <br/>
     * If the parameter is omitted, the value specified in the global {@link settings} is used.
     * If no value is set there either, Russian ('ru-RU') is selected by default.
     * @param {ya.speechkit.FORMAT} [options.format=ya.speechkit.FORMAT.PCM16] Format of the audio signal.
     */
    namespace.ya.speechkit.Textline = function (target, options) {
        if (!(this instanceof namespace.ya.speechkit.Textline)) {
            return new namespace.ya.speechkit.Textline(target, options);
        }

        var el = document.getElementById(target);
        if (el.tagName != 'INPUT') {
            this.element = el;
            this.textinput = document.createElement('input');
            this.textinput.style.height = '100%';
            this.textinput.style.width = '100%';
        } else {
            this.textinput = el;
            this.element = null;
        }
        this.textinput.style.backgroundImage = 'url(\'data:image/svg+xml;utf8,' +
            namespace.ya.speechkit._mic_off + '\')';
        this.textinput.style.backgroundRepeat = 'no-repeat';
        this.textinput.style.backgroundPosition = 'right center';
        if (this.element) {
            this.element.appendChild(this.textinput);
        }

        this.dict = null;

        this.final_result = '';

        var _this = this;

        this.textinput.onmousemove = function (event) {
            var rect = _this.textinput.getBoundingClientRect();
            // Show a pointer cursor over the microphone icon in the right corner
            // of the field, and a text cursor elsewhere. rect.left is used rather
            // than rect.x for wider browser support.
            if (event.clientX - rect.left > rect.width - rect.height) {
                _this.textinput.style.cursor = 'pointer';
            } else {
                _this.textinput.style.cursor = 'text';
            }
        };

        options = options || {};

        options.dataCallback = function (text, uttr, merge) {
            _this.textinput.value = text;
            if (uttr) {
                if (options.onInputFinished) {
                    _this.final_result = text;
                    options.onInputFinished(text);
                }
                _this.dict.abort();
            }
        };

        options.initCallback = function () {
            _this.textinput.style.backgroundImage = 'url(\'data:image/svg+xml;utf8,' +
                namespace.ya.speechkit._mic_on + '\')';
        };

        options.stopCallback = function () {
            _this.textinput.style.backgroundImage = 'url(\'data:image/svg+xml;utf8,' +
                namespace.ya.speechkit._mic_off + '\')';
            _this.dict = null;
        };

        this.textinput.onmousedown = function (event) {
            var rect = _this.textinput.getBoundingClientRect();

            // Ignore clicks outside of the microphone icon. As in onmousemove,
            // the click position is taken relative to the field's left edge.
            if (event.clientX - rect.left <= rect.width - rect.height) {
                return;
            }

            if (!_this.dict) {
                _this.dict = new namespace.ya.speechkit.SpeechRecognition();
            }
            if (_this.dict.isPaused()) {
                _this.dict.start(options);
            } else {
                _this.dict.stop();
            }
        };

        return {
            /**
             * Removes the control.
             * @name Textline.destroy
             * @function
             */
            destroy: function () {
                if (_this.dict) {
                    _this.dict.stop();
                }
                _this.textinput.style.backgroundImage = '';
                _this.textinput.onmousedown = function () {};
                _this.textinput.onmousemove = function () {};

                if (_this.element) {
                    _this.element.removeChild(_this.textinput);
                }
            },
            /**
             * Returns the final recognition result synchronously.
             * @name Textline.value
             * @function
             * @returns {string} The recognition result.
             *
             * @example
             * var textline = new ya.speechkit.Textline('myDiv');
             *
             * setTimeout(function () {
             *     console.log('Recognition result: ' + textline.value());
             * }, 5000);
             */
            value: function () {
                return _this.final_result;
            }
        };
    };
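    /*
     * A minimal usage sketch (illustrative only; it assumes an element with
     * id="myDiv" exists on the page and an API key is set in
     * ya.speechkit.settings):
     *
     *     var textline = new ya.speechkit.Textline('myDiv', {
     *         onInputFinished: function (text) {
     *             console.log('Final text: ' + text);
     *         },
     *         model: 'queries'
     *     });
     */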
}(this));
--------------------------------------------------------------------------------
/webspeechkit/src/tts.js:
--------------------------------------------------------------------------------
(function (namespace) {
    'use strict';

    if (typeof namespace.ya === 'undefined') {
        namespace.ya = {};
    }
    if (typeof namespace.ya.speechkit === 'undefined') {
        namespace.ya.speechkit = {};
    }

    var speakersCache = null;

    /**
     * Plays an audio file.
     * @function
     * @static
     * @param {String} url URL at which the audio is available: either the address of an
     * audio file, or an object URL (e.g. created with URL.createObjectURL) for a Blob
     * with sound in a format supported by the browser.
     * @param {Function} [cb] Callback invoked when playback has finished.
     * @name play
     */
    namespace.ya.speechkit.play = function (url, cb) {
        var audio = new Audio(url);
        audio.volume = 1.0;
        audio.onended = cb || function () {};
        audio.play();
    };
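    /*
     * A minimal sketch of playing a Blob through ya.speechkit.play
     * (illustrative only; someAudioBlob stands for any Blob holding audio
     * in a format the browser can decode):
     *
     *     var url = URL.createObjectURL(someAudioBlob);
     *     ya.speechkit.play(url, function () {
     *         console.log('Playback finished');
     *     });
     */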
    /**
     * @class A class for using the speech synthesis (text-to-speech) technology.
     * @name Tts
     * @param {TtsOptions} [options] Options.
     * @param {String} [options.apikey] API key (must be specified here if it was not set in the global settings).
     * @param {String} [options.emotion='neutral'] Emotional coloring of the voice. Available values:
     * <ul>
     * <li>'neutral' (default);</li>
     * <li>'good' - friendly;</li>
     * <li>'evil' - angry.</li>
     * </ul>
     * @param {Array} [options.emotions] An array of emotions of the form [['emotion1', weight1], ['emotion2', weight2]],
     * used for weighted blending of emotions.
     * @param {String} [options.speaker='omazh'] Voice used for synthesis. The list of available values
     * can be obtained by calling Tts.speakers:
     * <ul>
     * <li>female voices: 'omazh' (default) and 'jane';</li>
     * <li>male voices: 'zahar' and 'ermil'.</li>
     * </ul>
     * @param {Array} [options.speakers] An array of voices of the form [['speaker1', weight1], ['speaker2', weight2]],
     * used for weighted blending of voices. weight can take values from 1.0 to 3.0.
     * For example, [['omazh', 1.5], ['zahar', 2.2]].
     * @param {Array} [options.genders] An array of genders of the form [['gender1', weight1], ['gender2', weight2]],
     * used for weighted blending of the speaker's gender. weight can take values from 1.0 to 3.0.
     * @param {Boolean} [options.fast=false] Use "fast" synthesis, which speeds up sound generation by lowering its quality.
     * @param {String} [options.lang='ru-RU'] Language of the text to be spoken. Available values: 'ru-RU', 'en-US', 'tr-TR', 'uk-UA'.
     * @param {Number} [options.speed=1.0] Speech synthesis speed, from 0.0 (slow) to 2.0 (fast).
     */
    var Tts = function (options) {
        if (!(this instanceof namespace.ya.speechkit.Tts)) {
            return new namespace.ya.speechkit.Tts(options);
        }
        var _this = this;
        /**
         * Text-to-speech options.
         * @type TtsOptions
         * @name Tts.options
         * @field
         */
        this.options = namespace.ya.speechkit._extend(
            {
                apikey: namespace.ya.speechkit.settings.apikey,
                uuid: namespace.ya.speechkit.settings.uuid,
                url: namespace.ya.speechkit.settings.websocketProtocol +
                     namespace.ya.speechkit.settings.ttsStreamUrl,
                infoCallback: function () {},
                errorCallback: function (msg) {
                    console.log(msg);
                }
            },
            options);
        this.sessionId = null;
        this.socket = null;

        this.buffered = [];
    };

    Tts.prototype = /** @lends Tts.prototype */ {
        /**
         * Sends raw data to the websocket.
         * @param data Any data to send to the websocket (a JSON string, raw audio data).
         * @private
         */
        _sendRaw: function (data) {
            if (this.socket) {
                this.socket.send(data);
            }
        },
        /**
         * Stringifies an object as JSON and sends it to the websocket.
         * @param {Object} json Object to be sent to the websocket.
         * @private
         */
        _sendJson: function (json) {
            this._sendRaw(JSON.stringify({type: 'message', data: json}));
        },
        /**
         * Speaks the given text aloud (synthesizes it and plays back the result).
         * @param {String} text The text.
         * @param {Function} [cb] Callback invoked when playback has finished.
         * @param {TtsOptions} [options] Options.
         */
        say: function (text, cb, options) {
            this.speak(
                text,
                namespace.ya.speechkit._extend(
                    this.options,
                    namespace.ya.speechkit._extend(
                        {
                            dataCallback: function (blob) {
                                var url = URL.createObjectURL(blob);
                                namespace.ya.speechkit.play(url, cb);
                            }
                        },
                        options)
                )
            );
        },
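        /*
         * A minimal usage sketch for say() (illustrative only; it assumes an
         * API key is set in ya.speechkit.settings):
         *
         *     var tts = new ya.speechkit.Tts({speaker: 'jane', lang: 'ru-RU'});
         *     tts.say('Привет!', function () {
         *         console.log('Done speaking');
         *     });
         */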
        /**
         * Synthesizes speech for the given text. If a dataCallback is provided in the
         * options, it receives the synthesized audio as a Blob; otherwise the audio is
         * played back immediately.
         * @param {String} text The text to speak.
         * @param {TtsOptions} [options] Options.
         */
        speak: function (text, options) {
            var opts = namespace.ya.speechkit._extend(
                namespace.ya.speechkit._extend(
                    {text: text},
                    this.options),
                options);
            try {
                this.socket = new WebSocket(opts.url);
            } catch (e) {
                opts.errorCallback('Error on socket creation: ' + e);
                return;
            }

            var context = namespace.ya.speechkit.audiocontext || new namespace.ya.speechkit.AudioContext();
            namespace.ya.speechkit.audiocontext = context;

            this.socket.onopen = function () {
                this._sendJson(opts);
            }.bind(this);

            var play_queue = [];

            this.socket.binaryType = 'arraybuffer';

            this.socket.onmessage = function (e) {
                // Text frames starting with '{' carry control messages; everything
                // else is treated as a chunk of synthesized audio.
                var message = {};
                if (e.data && e.data[0] == '{') {
                    try {
                        message = JSON.parse(e.data);
                    } catch (ex) {
                        message = {type: 'Audio', data: e.data};
                    }
                } else {
                    message = {type: 'Audio', data: e.data};
                }
                if (message.type == 'InitResponse') {
                    this.sessionId = message.data.sessionId;
                } else if (message.type == 'Error') {
                    opts.errorCallback('Session ' + this.sessionId + ': ' + message.data);
                    this.socket.onclose = function () {};
                    this.socket.close();
                } else if (message.type == 'Phonemes') {
                    opts.infoCallback(message.data);
                } else if (message.type == 'Audio') {
                    play_queue.push(message.data);
                } else {
                    opts.errorCallback('Session ' + this.sessionId + ': ' + message);
                    this.socket.onclose = function () {};
                    this.socket.close();
                }
            }.bind(this);

            this.socket.onerror = function (error) {
                opts.errorCallback('Socket error: ' + error.message);
            }.bind(this);

            this.socket.onclose = function (event) {
                // Combine all received audio chunks into a single Blob and hand it
                // to dataCallback if one was provided; otherwise play it back.
                var res = Array.prototype.concat.apply([], play_queue);
                var blob = new Blob(res, {type: 'audio/x-wav'});
                if (typeof opts.dataCallback !== 'undefined') {
                    opts.dataCallback(blob);
                } else {
                    var url = URL.createObjectURL(blob);
                    namespace.ya.speechkit.play(url, opts.stopCallback);
                }
            }.bind(this);
        },
        /**
         * Returns the list of available voices and emotions.
         * @param {String} [lang] Language for which the list of available voices should be returned.
         * @returns {Promise} A Promise that resolves to the list of available voices and emotions.
         */
        speakers: function (lang) {
            return new Promise(function (resolve, reject) {
                if (speakersCache) {
                    resolve(speakersCache);
                } else {
                    var xhr = new XMLHttpRequest();
                    xhr.open('GET', this.options.url.replace('wss://', 'https://')
                        .replace('ws://', 'http://')
                        .replace('ttssocket.ws', 'speakers?engine=ytcp&lang=' + (lang || '')));

                    xhr.onreadystatechange = function () {
                        if (this.readyState == 4) {
                            if (this.status == 200) {
                                try {
                                    speakersCache = JSON.parse(this.responseText);
                                    resolve(speakersCache);
                                } catch (ex) {
                                    reject(ex.message);
                                }
                            } else {
                                reject('Can\'t get speakers list!');
                            }
                        }
                    };

                    xhr.send();
                }
            }.bind(this));
        }
    };

    namespace.ya.speechkit.Tts = Tts;
}(this));
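
/*
 * A minimal usage sketch for Tts.speakers() (illustrative only):
 *
 *     var tts = new ya.speechkit.Tts();
 *     tts.speakers('ru-RU').then(function (list) {
 *         console.log(list);
 *     }, function (err) {
 *         console.log('Failed to get the speakers list: ' + err);
 *     });
 */
--------------------------------------------------------------------------------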