├── LICENSE
├── README.md
├── cpp
│   ├── .gitignore
│   ├── CMakeLists.txt
│   ├── asr.cpp
│   ├── build.sh
│   ├── test.sh
│   └── tts.cpp
├── go
│   ├── BasicProtobuf
│   │   └── basic.pb.go
│   ├── VoiceProxyProtobuf
│   │   └── voiceproxy.pb.go
│   ├── asr-client.go
│   └── asr-client.osx
├── php
│   └── req.php
├── proto
│   ├── basic.proto
│   ├── tts.proto
│   ├── ttsbackend.proto
│   └── voiceproxy.proto
├── python
│   ├── .gitignore
│   ├── README.txt
│   ├── advanced_callback_example.py
│   ├── advanced_callback_splitter.py
│   ├── asrclient-cli.py
│   ├── asrclient
│   │   ├── __init__.py
│   │   ├── basic.proto
│   │   ├── client.py
│   │   ├── transport.py
│   │   ├── tts.proto
│   │   ├── ttsbackend.proto
│   │   ├── ttsclient.py
│   │   └── voiceproxy.proto
│   ├── setup.py
│   └── ttsclient-cli.py
└── webspeechkit
    ├── README.md
    └── src
        ├── equalizer.js
        ├── recognizer.js
        ├── recorder.js
        ├── recorderWorker.js
        ├── speechrecognition.js
        ├── textline.js
        └── tts.js

/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 | 
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 | 
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 | 
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 | 
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 | 
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 | 
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 | 
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 | 
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 | 
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 | 
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 | 
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 | 
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 | 
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 | 
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 | 
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 | 
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 | 
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 | 
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 | 
176 | END OF TERMS AND CONDITIONS
177 | 
178 | APPENDIX: How to apply the Apache License to your work.
179 | 
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 | 
189 | Copyright {yyyy} {name of copyright owner}
190 | 
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 | 
195 | http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yandex/speechkitcloud/817e2bc2f090a17b8d3a9180848d5174d804bc3b/README.md
--------------------------------------------------------------------------------
/cpp/.gitignore:
--------------------------------------------------------------------------------
1 | build
2 | 
--------------------------------------------------------------------------------
/cpp/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 2.6)
2 | 
3 | project(asr-curl-sample)
4 | 
5 | add_executable(asr-curl-sample asr.cpp)
6 | target_link_libraries(asr-curl-sample curl)
7 | 
8 | add_executable(tts-curl-sample tts.cpp)
9 | target_link_libraries(tts-curl-sample curl)
10 | 
--------------------------------------------------------------------------------
/cpp/asr.cpp:
--------------------------------------------------------------------------------
1 | #include <curl/curl.h>
2 | #include <string>
3 | #include <iostream>
4 | #include <fstream>
5 | #include <sstream>
6 | 
7 | size_t write_response_data(char *ptr, size_t size, size_t nmemb, void *userdata)
8 | {
9 |     std::stringstream* s = (std::stringstream*)userdata;
10 |     size_t n = size * nmemb;
11 |     s->write(ptr, n);
12 |     return n;
13 | }
14 | 
15 | size_t read_request_data(char *ptr, size_t size, size_t nmemb, void *userdata)
16 | {
17 |     std::ifstream* f = (std::ifstream*)userdata;
18 |     size_t n = size * nmemb;
19 |     f->read(ptr, n);
20 |     size_t result = f->gcount();
21 |     return result;
22 | }
23 | 
24 | int main(int argc, char** argv)
25 | {
26 |     std::string filename;
27 |     std::string key;
28 | 
29 |     std::cout << "argc=" << argc << std::endl;
30 |     while (argc > 0)
31 |     {
32 |         int n = argc - 1;
33 |         const char* val = argv[n];
34 | 
35 |         if (n == 2) key = val;
36 |         if (n == 1) filename = val;
37 | 
38 |         std::cout << "argv[" << n << "]=" << val << std::endl;
39 |         argc--;
40 |     }
41 | 
42 |     std::stringstream usage;
43 |     usage << "Usage: " << argv[0] << " <filename> <key>";
44 | 
45 |     if (filename.empty() || key.empty())
46 |     {
47 |         std::cout << usage.str();
48 |         return -1;
49 |     }
50 | 
51 |     CURL *curl = NULL;
52 |     curl = curl_easy_init();
53 | 
54 |     if (curl)
55 |     {
56 |         curl_easy_setopt(curl, CURLOPT_HEADER, 1);
57 |         curl_easy_setopt(curl, CURLOPT_POST, 1);
58 |         curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
59 |         curl_easy_setopt(curl, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V4);
60 | 
61 |         struct curl_slist *headers = NULL;
62 | 
63 |         headers = curl_slist_append(headers, "Content-Type: audio/x-wav");
64 |         curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
65 | 
66 |         std::stringstream url;
67 |         url << "asr.yandex.net/asr_xml?uuid=12345678123456781234567812345678&topic=general&lang=ru-RU&key="
68 |             << key;
69 | 
70 |         curl_easy_setopt(curl, CURLOPT_URL, url.str().c_str());
71 | 
72 |         std::ifstream fileStream(filename, std::ifstream::binary);
73 |         fileStream.seekg(0, fileStream.end);
74 |         int length = fileStream.tellg();
75 |         fileStream.seekg(0, fileStream.beg);
76 | 
77 |         curl_easy_setopt(curl, CURLOPT_READFUNCTION, &read_request_data);
78 |         curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, length);
79 |         curl_easy_setopt(curl, CURLOPT_READDATA, &fileStream);
80 | 
81 |         std::stringstream contentStream;
82 | 
83 |         curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &write_response_data);
84 |         curl_easy_setopt(curl, CURLOPT_WRITEDATA, &contentStream);
85 | 
86 |         CURLcode code = curl_easy_perform(curl);
87 | 
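        // Added for illustration (not part of the original sample):
        // curl_easy_perform returns CURLE_OK only when the transfer itself
        // succeeded, so checking it before reading the response is worthwhile.
        if (code != CURLE_OK)
            std::cerr << "curl_easy_perform failed: " << curl_easy_strerror(code) << std::endl;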
88 |         long httpCode;
89 |         curl_easy_getinfo(curl, CURLINFO_HTTP_CODE, &httpCode);
90 |         std::stringstream msg;
91 |         msg << "Http code is " << httpCode;
92 |         std::cout << msg.str() << std::endl;
93 |         std::cout << contentStream.str();
94 | 
95 |         curl_slist_free_all(headers);
96 |         curl_easy_cleanup(curl);
97 |     }
98 | 
99 |     return 0;
100 | }
--------------------------------------------------------------------------------
/cpp/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # building with cmake
3 | 
4 | mkdir -p build && cd build && cmake ../ && make -j
5 | 
--------------------------------------------------------------------------------
/cpp/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | # demo key is 6372dda5-9674-4413-85ff-e9d0eb2f99a7
4 | 
5 | ./build.sh
6 | ./build/tts-curl-sample 123 6372dda5-9674-4413-85ff-e9d0eb2f99a7 > build/123.wav
7 | ./build/asr-curl-sample build/123.wav 6372dda5-9674-4413-85ff-e9d0eb2f99a7
8 | 
--------------------------------------------------------------------------------
/cpp/tts.cpp:
--------------------------------------------------------------------------------
1 | #include <curl/curl.h>
2 | #include <string>
3 | #include <iostream>
4 | #include <fstream>
5 | #include <sstream>
6 | 
7 | // ./build/tts-curl-sample 123 6372dda5-9674-4413-85ff-e9d0eb2f99a7 | play -t wav -
8 | // same as
9 | // curl "tts.voicetech.yandex.net/generate?lang=ru_RU&format=wav&speaker=ermil&text=123&key=6372dda5-9674-4413-85ff-e9d0eb2f99a7" | play -t wav -
10 | 
11 | using namespace std;
12 | 
13 | const char* DEFAULT_HOST = "tts.voicetech.yandex.net";
14 | const char* DEFAULT_LANG = "ru_RU";
15 | const char* DEFAULT_FORMAT = "wav";
16 | const char* DEFAULT_VOICE = "ermil";
17 | const char* DEFAULT_TEXT = "123";
18 | bool VERBOSE = false;
19 | 
20 | int debug_callback(CURL *handle,
21 |                    curl_infotype type,
22 |                    char *data,
23 |                    size_t size,
24 |                    void *userdata)
25 | {
26 |     if (type == CURLINFO_HEADER_OUT)
27 |     {
28 |         stringstream* s = (stringstream*)userdata;
29 |         s->write(data, size);
30 |     }
31 |     return CURLE_OK;
32 | }
33 | 
34 | size_t write_callback(void *ptr, size_t size, size_t nmemb, void *userdata)
35 | {
36 |     stringstream* s = (stringstream*)userdata;
37 |     size_t fullSize = size * nmemb;
38 |     s->write(static_cast<char*>(ptr), fullSize);
39 |     return fullSize;
40 | }
41 | 
42 | size_t make_request(CURL* curl, const string& host, const string& text, const string& key)
43 | {
44 |     if (curl)
45 |     {
46 |         curl_easy_setopt(curl, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V4);
47 | 
48 |         stringstream urlStream;
49 |         urlStream << host
50 |                   << "/generate?lang=" << DEFAULT_LANG
51 |                   << "&format=" << DEFAULT_FORMAT
52 |                   << "&speaker=" << DEFAULT_VOICE
53 |                   << "&text=" << text
54 |                   << "&key=" << key;
55 | 
56 |         if (VERBOSE) cout << urlStream.str() << endl;
57 | 
58 |         curl_easy_setopt(curl, CURLOPT_URL, urlStream.str().c_str());
59 | 
60 |         stringstream responseBodyStream;
61 |         curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback);
62 |         curl_easy_setopt(curl, CURLOPT_WRITEDATA, &responseBodyStream);
63 | 
64 |         stringstream requestStream;
65 | 
66 |         curl_easy_setopt(curl, CURLOPT_DEBUGFUNCTION, debug_callback);
67 |         curl_easy_setopt(curl, CURLOPT_DEBUGDATA, &requestStream);
68 |         curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
69 | 
70 |         CURLcode code = curl_easy_perform(curl);
71 | 
72 |         string request = requestStream.str();
73 | 
74 |         if (VERBOSE) cout << request.size() << endl << request << endl;
75 | 
76 |         long httpCode;
77 |         curl_easy_getinfo(curl, CURLINFO_HTTP_CODE, &httpCode);
78 |         if (httpCode != 200)
79 |         {
80 |             if (VERBOSE) cout << "response code is " << httpCode << endl;
81 |         }
82 | 
83 |         cout << responseBodyStream.str();
84 |     }
85 |     return 0;
86 | }
87 | 
88 | int main(int argc, char* argv[])
89 | {
90 |     CURL *curl = NULL;
91 |     curl = curl_easy_init();
92 | 
93 |     string text;
94 |     string key;
95 | 
96 |     if (VERBOSE) cout << "argc=" << argc << endl;
97 |     while (argc > 0)
98 |     {
99 |         int n = argc - 1;
100 |         const char* val = argv[n];
101 | 
102 |         if (n == 2) key = val;
103 |         if (n == 1) text = val;
104 | 
105 |         if (VERBOSE) cout << "argv[" << n << "]=" << val << endl;
106 |         argc--;
107 |     }
108 | 
109 |     if (text.empty() || key.empty())
110 |     {
111 |         cout << "Usage: tts-curl-sample <text> <key>" << endl;
112 |         return -1;
113 |     }
114 | 
115 |     make_request(curl, DEFAULT_HOST, text, key);
116 | 
117 |     curl_easy_cleanup(curl);
118 |     return 0;
119 | }
120 | 
--------------------------------------------------------------------------------
/go/BasicProtobuf/basic.pb.go:
--------------------------------------------------------------------------------
1 | // Code generated by protoc-gen-go.
2 | // source: basic.proto
3 | // DO NOT EDIT!
4 | 
5 | /*
6 | Package BasicProtobuf is a generated protocol buffer package.
7 | 
8 | It is generated from these files:
9 | 	basic.proto
10 | 
11 | It has these top-level messages:
12 | 	ConnectionResponse
13 | */
14 | package BasicProtobuf
15 | 
16 | import proto "github.com/golang/protobuf/proto"
17 | import fmt "fmt"
18 | import math "math"
19 | 
20 | // Reference imports to suppress errors if they are not otherwise used.
21 | var _ = proto.Marshal
22 | var _ = fmt.Errorf
23 | var _ = math.Inf
24 | 
25 | // This is a compile-time assertion to ensure that this generated file
26 | // is compatible with the proto package it is being compiled against.
27 | const _ = proto.ProtoPackageIsVersion1 28 | 29 | type ConnectionResponse_ResponseCode int32 30 | 31 | const ( 32 | ConnectionResponse_OK ConnectionResponse_ResponseCode = 200 33 | ConnectionResponse_BadMessageFormatting ConnectionResponse_ResponseCode = 400 34 | ConnectionResponse_UnknownService ConnectionResponse_ResponseCode = 404 35 | ConnectionResponse_NotSupportedVersion ConnectionResponse_ResponseCode = 405 36 | ConnectionResponse_Timeout ConnectionResponse_ResponseCode = 408 37 | ConnectionResponse_ProtocolError ConnectionResponse_ResponseCode = 410 38 | ConnectionResponse_InternalError ConnectionResponse_ResponseCode = 500 39 | ConnectionResponse_InvalidKey ConnectionResponse_ResponseCode = 429 40 | ConnectionResponse_InvalidRequestParams ConnectionResponse_ResponseCode = 406 41 | ) 42 | 43 | var ConnectionResponse_ResponseCode_name = map[int32]string{ 44 | 200: "OK", 45 | 400: "BadMessageFormatting", 46 | 404: "UnknownService", 47 | 405: "NotSupportedVersion", 48 | 408: "Timeout", 49 | 410: "ProtocolError", 50 | 500: "InternalError", 51 | 429: "InvalidKey", 52 | 406: "InvalidRequestParams", 53 | } 54 | var ConnectionResponse_ResponseCode_value = map[string]int32{ 55 | "OK": 200, 56 | "BadMessageFormatting": 400, 57 | "UnknownService": 404, 58 | "NotSupportedVersion": 405, 59 | "Timeout": 408, 60 | "ProtocolError": 410, 61 | "InternalError": 500, 62 | "InvalidKey": 429, 63 | "InvalidRequestParams": 406, 64 | } 65 | 66 | func (x ConnectionResponse_ResponseCode) Enum() *ConnectionResponse_ResponseCode { 67 | p := new(ConnectionResponse_ResponseCode) 68 | *p = x 69 | return p 70 | } 71 | func (x ConnectionResponse_ResponseCode) String() string { 72 | return proto.EnumName(ConnectionResponse_ResponseCode_name, int32(x)) 73 | } 74 | func (x *ConnectionResponse_ResponseCode) UnmarshalJSON(data []byte) error { 75 | value, err := proto.UnmarshalJSONEnum(ConnectionResponse_ResponseCode_value, data, "ConnectionResponse_ResponseCode") 76 | if err != nil { 77 | return err 78 | } 79 | *x = ConnectionResponse_ResponseCode(value) 80 | return nil 81 | } 82 | func (ConnectionResponse_ResponseCode) EnumDescriptor() ([]byte, []int) { 83 | return fileDescriptor0, []int{0, 0} 84 | } 85 | 86 | type ConnectionResponse struct { 87 | ResponseCode *ConnectionResponse_ResponseCode `protobuf:"varint,1,req,name=responseCode,enum=BasicProtobuf.ConnectionResponse_ResponseCode" json:"responseCode,omitempty"` 88 | SessionId *string `protobuf:"bytes,2,req,name=sessionId" json:"sessionId,omitempty"` 89 | Message *string `protobuf:"bytes,3,opt,name=message" json:"message,omitempty"` 90 | XXX_unrecognized []byte `json:"-"` 91 | } 92 | 93 | func (m *ConnectionResponse) Reset() { *m = ConnectionResponse{} } 94 | func (m *ConnectionResponse) String() string { return proto.CompactTextString(m) } 95 | func (*ConnectionResponse) ProtoMessage() {} 96 | func (*ConnectionResponse) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{0} } 97 | 98 | func (m *ConnectionResponse) GetResponseCode() ConnectionResponse_ResponseCode { 99 | if m != nil && m.ResponseCode != nil { 100 | return *m.ResponseCode 101 | } 102 | return ConnectionResponse_OK 103 | } 104 | 105 | func (m *ConnectionResponse) GetSessionId() string { 106 | if m != nil && m.SessionId != nil { 107 | return *m.SessionId 108 | } 109 | return "" 110 | } 111 | 112 | func (m *ConnectionResponse) GetMessage() string { 113 | if m != nil && m.Message != nil { 114 | return *m.Message 115 | } 116 | return "" 117 | } 118 | 119 | func init() { 120 | 
proto.RegisterType((*ConnectionResponse)(nil), "BasicProtobuf.ConnectionResponse") 121 | proto.RegisterEnum("BasicProtobuf.ConnectionResponse_ResponseCode", ConnectionResponse_ResponseCode_name, ConnectionResponse_ResponseCode_value) 122 | } 123 | 124 | var fileDescriptor0 = []byte{ 125 | // 286 bytes of a gzipped FileDescriptorProto 126 | 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x09, 0x6e, 0x88, 0x02, 0xff, 0x64, 0x8f, 0x31, 0x4e, 0xf3, 0x30, 127 | 0x14, 0xc7, 0x95, 0x78, 0x88, 0xf2, 0xbe, 0xb4, 0x9f, 0x71, 0x19, 0xc2, 0x56, 0x75, 0xea, 0x94, 128 | 0x81, 0x23, 0xb4, 0x80, 0x14, 0x55, 0x40, 0x95, 0x02, 0xbb, 0x9b, 0x3c, 0x2a, 0x8b, 0xc6, 0x2f, 129 | 0xd8, 0x4e, 0x11, 0xb7, 0x60, 0x00, 0x84, 0xb8, 0x03, 0xa7, 0x80, 0x81, 0x03, 0x71, 0x00, 0x92, 130 | 0x92, 0x81, 0x8a, 0xed, 0xef, 0x9f, 0xfd, 0xf3, 0xfb, 0x3f, 0xf8, 0xb7, 0x94, 0x56, 0xe5, 0x49, 131 | 0x65, 0xc8, 0x91, 0xe8, 0x4d, 0xda, 0xc3, 0xbc, 0xcd, 0xcb, 0xfa, 0x7a, 0xf4, 0xe1, 0x83, 0x98, 132 | 0x92, 0xd6, 0x98, 0x3b, 0x45, 0x3a, 0x43, 0x5b, 0x91, 0xb6, 0x28, 0x8e, 0x20, 0x32, 0x5d, 0x9e, 133 | 0x52, 0x81, 0xb1, 0x37, 0xf4, 0xc7, 0xfd, 0xc3, 0x24, 0xd9, 0x91, 0x93, 0xbf, 0x62, 0x92, 0xfd, 134 | 0xb2, 0xc4, 0x1e, 0x84, 0x16, 0xad, 0x6d, 0xee, 0xd3, 0x22, 0xf6, 0x9b, 0x2f, 0x42, 0xf1, 0x1f, 135 | 0x82, 0xb2, 0x41, 0x72, 0x85, 0x31, 0x1b, 0x7a, 0xe3, 0x70, 0xf4, 0xee, 0x41, 0xb4, 0x23, 0x05, 136 | 0xe0, 0x9f, 0xcf, 0xf8, 0xa7, 0x27, 0x0e, 0x60, 0x7f, 0x22, 0x8b, 0xd3, 0x9f, 0xd7, 0x27, 0x64, 137 | 0x4a, 0xe9, 0x9c, 0xd2, 0x2b, 0xfe, 0xc0, 0xc4, 0x00, 0xfa, 0x97, 0xfa, 0x46, 0xd3, 0x9d, 0x5e, 138 | 0xa0, 0xd9, 0xa8, 0x1c, 0xf9, 0x23, 0x13, 0x31, 0x0c, 0xce, 0xc8, 0x2d, 0xea, 0xaa, 0x22, 0xe3, 139 | 0xb0, 0xb8, 0x42, 0xd3, 0x4e, 0xe6, 0x4f, 0x4c, 0x44, 0x10, 0x5c, 0xa8, 0x12, 0xa9, 0x76, 0xfc, 140 | 0x85, 0x09, 0x01, 0xbd, 0xed, 0x06, 0x39, 0xad, 0x8f, 0x8d, 0x21, 0xc3, 0x5f, 0xb7, 0x2c, 0xd5, 141 | 0x0e, 0x8d, 0x96, 0x1d, 0xfb, 0x62, 0x4d, 0x55, 0x48, 0xf5, 0x46, 0xae, 0x55, 0x31, 0xc3, 0x7b, 142 | 0xfe, 0xc6, 0xda, 0x42, 0x1d, 0xc8, 0xf0, 0xb6, 0x46, 0xeb, 0xe6, 0xd2, 0xc8, 0xd2, 0xf2, 0x67, 143 | 0xf6, 0x1d, 0x00, 0x00, 0xff, 0xff, 0xe0, 0x49, 0xa0, 0xc3, 0x63, 0x01, 0x00, 0x00, 144 | } 145 | -------------------------------------------------------------------------------- /go/asr-client.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "errors" 7 | "flag" 8 | "fmt" 9 | "github.com/golang/protobuf/proto" 10 | "github.com/yandex/speechkitcloud/go/BasicProtobuf" 11 | "github.com/yandex/speechkitcloud/go/VoiceProxyProtobuf" 12 | "io" 13 | "log" 14 | "net" 15 | "os" 16 | "strconv" 17 | ) 18 | 19 | type Debug bool 20 | 21 | func (d Debug) Printf(s string, a ...interface{}) { 22 | if d { 23 | log.Printf(s, a...) 
24 | 	}
25 | }
26 | 
27 | var dbg Debug
28 | 
29 | func sendData(conn io.Writer, data []byte) (int, error) {
30 | 	written1, err := fmt.Fprintf(conn, "%x\r\n", len(data))
31 | 	written2, err := conn.Write(data)
32 | 	return written1 + written2, err
33 | }
34 | 
35 | func sendProtoMessage(conn io.Writer, message proto.Message) (int, error) {
36 | 	data, err := proto.Marshal(message)
37 | 	check("sendProtoMessage / proto.Marshal", err)
38 | 	written, err := sendData(conn, data)
39 | 	return written, err
40 | }
41 | 
42 | func recvData(connReader *bufio.Reader) ([]byte, error) {
43 | 	resp, err := connReader.ReadString('\n')
44 | 	if len(resp) < 2 {
45 | 		return nil, errors.New("recvData / no length line found")
46 | 	}
47 | 	connRespProtoLength, err := strconv.ParseInt(resp[:len(resp)-2], 16, 64)
48 | 	check("recvData / strconv.ParseInt", err)
49 | 
50 | 	dbg.Printf(">> 0x%s -> %d\n", resp[:len(resp)-2], int(connRespProtoLength))
51 | 
52 | 	buffer := make([]byte, int(connRespProtoLength))
53 | 	_, err = io.ReadFull(connReader, buffer)
54 | 	check("recvData / io.ReadFull", err)
55 | 	return buffer, err
56 | }
57 | 
58 | func recvProtoMessage(connReader *bufio.Reader, message proto.Message) error {
59 | 	buffer, err := recvData(connReader)
60 | 	check("recvProtoMessage / recvData", err)
61 | 
62 | 	err = proto.Unmarshal(buffer, message)
63 | 	check("recvProtoMessage / proto.Unmarshal ", err)
64 | 	return err
65 | }
66 | 
67 | func check(id interface{}, err error) {
68 | 	if err != nil {
69 | 		log.Fatal(id, err)
70 | 	}
71 | }
72 | 
73 | func main() {
74 | 	serverPtr := flag.String("s", "asr.yandex.net", "ASR server to connect.")
75 | 	portPtr := flag.Int("p", 80, "Server port.")
76 | 	apiKeyPtr := flag.String("k", "069b6659-984b-4c5f-880e-aaedcfd84102",
77 | 		"Speechkit Cloud api key. You should get your own at https://developer.tech.yandex.ru.\n\r\tDefault is limited demo key.")
78 | 	topicPtr := flag.String("topic", "freeform", "Recognition model topic (aka \"model\").")
79 | 	formatPtr := flag.String("format", "audio/x-pcm;bit=16;rate=16000", "Input file format.")
80 | 	langPtr := flag.String("lang", "ru-RU", "Recognition language. ru-RU | en-EN | tr-TR | uk-UA.")
81 | 	verbosePtr := flag.Bool("verbose", false, "Print more debug output.")
82 | 	chunkSizePtr := flag.Int("chunk-size", 32768, "Client chops input file into chunks and sends it one-by-one in a streaming manner.\n\rDefault value roughly equals to one second of audio in default format.")
83 | 
84 | 	flag.Parse()
85 | 
86 | 	dbg = Debug(*verbosePtr)
87 | 
88 | 	if len(flag.Args()) == 0 {
89 | 		log.Fatal("No input file!")
90 | 	}
91 | 	fileName := flag.Args()[0]
92 | 
93 | 	connectionString := fmt.Sprintf("%v:%v", *serverPtr, *portPtr)
94 | 	dbg.Printf(connectionString)
95 | 
96 | 	conn, err := net.Dial("tcp", connectionString)
97 | 	check(1, err)
98 | 	defer conn.Close()
99 | 
100 | 	var upgradeRequest bytes.Buffer
101 | 	upgradeRequest.WriteString("GET /asr_partial HTTP/1.1\r\n")
102 | 	upgradeRequest.WriteString("Upgrade: dictation\r\n\r\n")
103 | 
104 | 	dbg.Printf("%s", upgradeRequest.String())
105 | 	_, err = upgradeRequest.WriteTo(conn)
106 | 	check(3, err)
107 | 
108 | 	reader := bufio.NewReader(conn)
109 | 
110 | 	resp, err := reader.ReadString('\n')
111 | 	for resp != "" {
112 | 		check(4, err)
113 | 		dbg.Printf(resp)
114 | 		if resp == "\r\n" {
115 | 			break
116 | 		}
117 | 		resp, err = reader.ReadString('\n')
118 | 	}
119 | 
120 | 	dbg.Printf(">> done reading upgrade response, trying to send protobuf\n")
121 | 
122 | 	initProto := &VoiceProxyProtobuf.ConnectionRequest{
123 | 		ApiKey:           proto.String(*apiKeyPtr),
124 | 		SpeechkitVersion: proto.String(""),
125 | 		ServiceName:      proto.String(""),
126 | 		Device:           proto.String("desktop"),
127 | 		Coords:           proto.String("0, 0"),
128 | 		Uuid:             proto.String("12345678123456788765432187654321"),
129 | 		ApplicationName:  proto.String("golang-client"),
130 | 		Topic:            proto.String(*topicPtr),
131 | 		Lang:             proto.String(*langPtr),
132 | 		Format:           proto.String(*formatPtr),
133 | 	}
134 | 
135 | 	_, err = sendProtoMessage(conn, initProto)
136 | 	check(5, err)
137 | 
138 | 	connRespProto := &BasicProtobuf.ConnectionResponse{}
139 | 	err = recvProtoMessage(reader, connRespProto)
140 | 	check(9, err)
141 | 
142 | 	dbg.Printf(">> done reading connection response proto\n")
143 | 	dbg.Printf(">> connRespProto { %v}\n", connRespProto)
144 | 
145 | 	f, err := os.Open(fileName)
146 | 	check(10, err)
147 | 	defer f.Close()
148 | 	fileInfo, err := f.Stat()
149 | 	check(12, err)
150 | 
151 | 	var chunkSize int64 = int64(*chunkSizePtr)
152 | 	expectedChunksCount := int32(fileInfo.Size() / chunkSize)
153 | 	if fileInfo.Size()%chunkSize != 0 {
154 | 		expectedChunksCount++ // last chunk is probably < chunkSize
155 | 	}
156 | 	expectedChunksCount++ // final empty chunk
157 | 
158 | 	go func() {
159 | 		var chunkCount int
160 | 		chunkBuff := make([]byte, chunkSize)
161 | 		for err == nil {
162 | 			var readCount int
163 | 			readCount, err = f.Read(chunkBuff)
164 | 			dbg.Printf(">> read chunk %d\n", readCount)
165 | 			if readCount > 0 {
166 | 				dbg.Printf(">> sending chunk %d\n", chunkCount)
167 | 				// send only the bytes actually read (the last chunk is usually short)
168 | 				addDataProto := &VoiceProxyProtobuf.AddData{LastChunk: proto.Bool(false), AudioData: chunkBuff[:readCount]}
169 | 				_, err = sendProtoMessage(conn, addDataProto)
170 | 				check(11, err)
171 | 				chunkCount++
172 | 			}
173 | 		}
174 | 		lastChunkProto := &VoiceProxyProtobuf.AddData{LastChunk: proto.Bool(true)}
175 | 		_, err = sendProtoMessage(conn, lastChunkProto)
176 | 		check(13, err)
177 | 	}()
178 | 
179 | 	var loopCounter int32
180 | 	for err == nil && loopCounter < expectedChunksCount {
181 | 		dbg.Printf(">> recv proto loop %v/%v\n", loopCounter, expectedChunksCount)
182 | 		addDataRespProto := &VoiceProxyProtobuf.AddDataResponse{}
183 | 		err = recvProtoMessage(reader, addDataRespProto)
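		// The server may merge several AddData requests into one response;
		// messagesCount (see proto/voiceproxy.proto) reports how many were
		// merged, so the loop below advances by that amount rather than by 1.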
		dbg.Printf(">> addDataRespProto { %v}\n", addDataRespProto)

		if err == nil {
			loopCounter += addDataRespProto.GetMessagesCount()
			dbg.Printf(">> loopCounter increased, now %v/%v\n", loopCounter, expectedChunksCount)
			recognitions := addDataRespProto.GetRecognition()
			if recognitions != nil && len(recognitions) > 0 {
				fmt.Printf("got result: %v; endOfUtt: %v\n", addDataRespProto.GetRecognition()[0].GetNormalized(), addDataRespProto.GetEndOfUtt())
			}
		}
	}

	check(14, err)

	fmt.Printf("Done, all fine!\n")
}
--------------------------------------------------------------------------------
/go/asr-client.osx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yandex/speechkitcloud/817e2bc2f090a17b8d3a9180848d5174d804bc3b/go/asr-client.osx
--------------------------------------------------------------------------------
/php/req.php:
--------------------------------------------------------------------------------
1 | <?php
2 | 
3 | function generateRandomSelection($min, $max, $count) {
4 |     $result = array();
5 |     if ($min > $max) return $result;
6 |     $count = min(max($count, 0), $max - $min + 1);
7 |     while (count($result) < $count) {
8 |         $value = rand($min, $max - count($result));
9 |         foreach ($result as $used) if ($used <= $value) $value++; else break;
10 |         $result[] = dechex($value);
11 |         sort($result);
12 |     }
13 |     shuffle($result);
14 |     return $result;
15 | }
16 | 
17 | function recognize($file, $key) {
18 |     $uuid = generateRandomSelection(0, 30, 64);
19 |     $uuid = implode($uuid);
20 |     $uuid = substr($uuid, 1, 32);
21 |     $curl = curl_init();
22 |     $url = 'https://asr.yandex.net/asr_xml?' . http_build_query(array(
23 |         'key' => $key,
24 |         'uuid' => $uuid,
25 |         'topic' => 'notes',
26 |         'lang' => 'ru-RU'
27 |     ));
28 |     curl_setopt($curl, CURLOPT_URL, $url);
29 | 
30 |     $data = file_get_contents(realpath($file));
31 | 
32 |     curl_setopt($curl, CURLOPT_POST, true);
33 |     curl_setopt($curl, CURLOPT_POSTFIELDS, $data);
34 |     curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
35 |     curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, 0);
36 |     curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
37 |     curl_setopt($curl, CURLOPT_HTTPHEADER, array('Content-Type: audio/x-wav'));
38 |     curl_setopt($curl, CURLOPT_VERBOSE, true);
39 |     $response = curl_exec($curl);
40 |     $err = curl_errno($curl);
41 |     curl_close($curl);
42 |     if ($err)
43 |         throw new Exception("curl err $err");
44 |     echo $response;
45 | }
46 | 
47 | print_r($argv);
48 | $filename = $argv[1];
49 | $key = $argv[2];
50 | 
51 | recognize($filename, $key);
52 | print("done\n");
53 | 
54 | ?>
--------------------------------------------------------------------------------
/proto/basic.proto:
--------------------------------------------------------------------------------
1 | // Yandex ASR dictation api (draft):
2 | // The client initiates a session with an http upgrade request, for example:
3 | //     GET /asr_partial HTTP/1.1\r\n
4 | //     User-Agent: KeepAliveClient\r\n
5 | //     Host: voice-stream.voicetech.yandex.net:80\r\n
6 | //     Upgrade: dictation\r\n\r\n
7 | // and receives an HTTP 101 Switching Protocols response.
8 | // Next, send/receive protobuf messages in the format:
9 | //     [hex size]\r\n[message body serialized with protobuf]
10 | 
11 | // Send ConnectionRequest, read ConnectionResponse, etc.
12 | // Send AddData, read AddDataResponse, and so on.
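//
// A minimal sketch of this framing in Go (a hedged illustration assuming an
// established net.Conn; see go/asr-client.go in this repo for the full client):
//
//     func sendFramed(conn net.Conn, msg proto.Message) error {
//         body, err := proto.Marshal(msg)
//         if err != nil {
//             return err
//         }
//         // hex-encoded body length, CRLF, then the serialized message
//         if _, err := fmt.Fprintf(conn, "%x\r\n", len(body)); err != nil {
//             return err
//         }
//         _, err = conn.Write(body)
//         return err
//     }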
13 | syntax = "proto2";
14 | 
15 | package BasicProtobuf;
16 | 
17 | message ConnectionResponse
18 | {
19 |     required ResponseCode responseCode = 1;
20 | 
21 |     required string sessionId = 2;
22 | 
23 |     optional string message = 3;
24 | 
25 |     enum ResponseCode {
26 |         OK = 200;
27 |         BadMessageFormatting = 400;
28 |         UnknownService = 404;
29 |         NotSupportedVersion = 405;
30 |         Timeout = 408;
31 |         ProtocolError = 410;
32 |         InternalError = 500;
33 |         InvalidKey = 429;
34 |         InvalidRequestParams = 406;
35 |     }
36 | }
37 | 
38 | 
--------------------------------------------------------------------------------
/proto/tts.proto:
--------------------------------------------------------------------------------
1 | syntax = "proto2";
2 | 
3 | import "ttsbackend.proto";
4 | 
5 | package TTS;
6 | 
7 | ///////////////////////////////////////////////////////////////////////////
8 | // Usage:
9 | // (1) send ConnectionRequest, receive ConnectionResponse
10 | // (2) send ParamsRequest, receive ParamsResponse
11 | // (3) send GenerateRequest, receive GenerateResponse
12 | //
13 | // The TTS proxy requires all these steps in this fixed order;
14 | // none can be skipped.
15 | 
16 | message ConnectionRequest
17 | {
18 |     required string serviceName = 1; // "tts", "asr", "asr_dictation", etc.
19 | 
20 |     required string uuid = 2;
21 | 
22 |     optional int32 protocolVersion = 3 [default = 1];
23 | 
24 |     optional string deviceName = 4;
25 | 
26 |     // new field v2
27 | 
28 |     optional string speechkitVersion = 5;
29 | 
30 |     // warning: the apiKey option is only temporary (for the /tcp handler); for /ytcp its absence will result in ConnectionResponse::KeyInvalid
31 |     optional string apiKey = 6;
32 | 
33 |     optional string applicationName = 7;
34 | 
35 |     optional string coords = 8;
36 | }
37 | 
38 | ///////////////////////////////////////////////////////////////////////////
39 | 
40 | message ParamsRequest
41 | {
42 |     optional bool listVoices = 1;
43 | }
44 | 
45 | message ParamsResponse
46 | {
47 |     message Voice
48 |     {
49 |         // use as "voice" in GenerateRequest
50 |         required string name = 1;
51 |         // 1 female, 2 male
52 |         required int32 gender = 2;
53 |         // 0x809 english
54 |         // 0x419 russian
55 |         // 0 for an "international" voice that can be used with any language
56 |         required int32 languageId = 3;
57 | 
58 |         required int32 initialSampleFreq = 4;
59 | 
60 |         // display name suitable for menus and GUIs
61 |         optional string displayName = 5;
62 | 
63 |         // this voice can be used in lowLevelGenerateRequest for mixing
64 |         optional bool coreVoice = 6;
65 |     }
66 | 
67 |     repeated Voice voiceList = 1;
68 | }
69 | 
70 | ///////////////////////////////////////////////////////////////////////////
71 | 
72 | message GenerateRequest
73 | {
74 |     required string lang = 1;
75 |     required string text = 2;
76 |     required string application = 3;
77 |     required string platform = 4;
78 |     required string voice = 6;
79 |     optional float speed = 31;
80 |     optional string emotion = 10;
81 | 
82 |     enum Quality {
83 |         High = 0; Low = 1; UltraHigh = 2;
84 |         // Low means resample to 8000!
85 |         // High means resample to 16000.
86 |         // UltraHigh means 48000 (or 32000 for SPEEX)
87 |     }
88 | 
89 |     enum Format {
90 |         Wav = 0; Pcm = 1; Spx = 2; Opus = 3;
91 |     }
92 | 
93 |     optional Quality quality = 7 [default = High];
94 |     optional Format format = 8 [default = Spx];
95 |     optional bool requireMetainfo = 5 [default = false];
96 | 
97 |     optional Generate lowLevelGenerateRequest = 30;
98 | 
99 |     // keep calm, and do not use '9' slot again
100 |     optional string speed_obsolete = 9;
101 | 
102 |     optional float volume = 32 [default = 1.0];
103 | 
104 |     optional bool chunked = 33 [default = false];
105 | }
106 | 
107 | message Feedback
108 | {
109 |     required int32 elapsed = 1;
110 |     required string event = 2;
111 | }
112 | 
113 | 
--------------------------------------------------------------------------------
/proto/ttsbackend.proto:
--------------------------------------------------------------------------------
1 | syntax = "proto2";
2 | 
3 | import "basic.proto";
4 | 
5 | package TTS;
6 | 
7 | message Generate
8 | {
9 |     optional string sessionId = 30;
10 | 
11 |     // Language code, e.g.: ru, en
12 |     required string lang = 1;
13 | 
14 |     message WeightedParam
15 |     {
16 |         required string name = 1;
17 |         required float weight = 2;
18 |     }
19 | 
20 |     // Text for synthesis
21 |     required string text = 2;
22 | 
23 |     // Speed of speech: <1.0 - slower, >1.0 - faster
24 |     optional float speed = 3 [default = 1.0];
25 | 
26 |     // Supported voices are: zhar, omazh, jane, ermil, oksana
27 |     repeated WeightedParam voices = 11;
28 | 
29 |     // Supported emotions are: good, neutral, evil
30 |     repeated WeightedParam emotions = 12;
31 | 
32 |     // Supported genders are: male, female
33 |     repeated WeightedParam genders = 13;
34 | 
35 |     optional bool requireMetainfo = 5 [default = false];
36 | 
37 |     optional float msd_threshold = 14;
38 |     optional float mgc_recurrence = 15;
39 |     optional float subtract_durations_sigmas = 17;
40 |     optional float lf0_postfilter = 18;
41 |     optional float mgcGVWeight = 19;
42 |     optional float lf0GVWeight = 20;
43 |     optional float mvfGVWeight = 21;
44 |     optional float mgc_postfilter1 = 22;
45 |     optional float mgc_postfilter2 = 23;
46 | 
47 |     optional bool chunked = 24; // ex-fast
48 | }
49 | 
50 | message GenerateResponse
51 | {
52 |     message WordEvent
53 |     {
54 |         required int32 firstCharPositionInText = 1;
55 |         required int32 bytesLengthInSignal = 2;
56 |         optional string text = 3;
57 |         optional string postag = 4;
58 |         optional string homographTag = 5;
59 |     }
60 |     message Phoneme
61 |     {
62 |         required string ttsPhoneme = 1;
63 |         required string IPAPhoneme = 2;
64 |         required int32 viseme = 5;
65 |         required int32 durationMs = 3;
66 |         required int32 positionInBytesStream = 4;
67 |     }
68 | 
69 |     // words and phonemes will be empty unless requireMetainfo is set in GenerateRequest
70 |     repeated WordEvent words = 1;
71 |     repeated Phoneme phonemes = 2;
72 |     optional bytes audioData = 3;
73 |     required bool completed = 4;
74 | 
75 |     optional BasicProtobuf.ConnectionResponse.ResponseCode responseCode = 5;
76 |     // Error message
77 |     optional string message = 6;
78 | 
79 |     // Lingware information
80 |     optional string lang = 7;
81 |     optional string version = 8;
82 | }
83 | 
84 | message StopGeneration
85 | {
86 | }
87 | 
--------------------------------------------------------------------------------
/proto/voiceproxy.proto:
--------------------------------------------------------------------------------
1 | syntax = "proto2";
2 | 
3 | import "basic.proto";
4 | 
5 | package VoiceProxyProtobuf;
6 | 
7 | // use this part of ConnectionRequest to specify additional options for the decoder/proxy
8 | message AdvancedASROptions
9 | {
10 |     // send back partial results; if disabled, only results with endOfUtt == true will be sent
11 |     optional bool partial_results = 1 [default = true];
12 | 
13 |     // beam, lattice_beam, lattice_nbest are low-level decoder options
14 |     optional float beam = 2 [default = -1];
15 | 
16 |     optional float lattice_beam = 3 [default = -1];
17 | 
18 |     optional int32 lattice_nbest = 4 [default = -1];
19 | 
20 |     // specifies the interval of silence/noise (in units of 10 ms) which separates sentences, i.e. how often you will receive endOfUtt == true
21 |     optional int32 utterance_silence = 5 [default = 120];
22 | 
23 |     // if disabled, all partial and "endOfUtt" results are suppressed - the server responds only when AddData with "lastChunk" is received
24 |     optional bool allow_multi_utt = 16 [default = true];
25 | 
26 |     // if the client sends too many chunks (more than the server can process), this limit specifies how many sound buffers should be read
27 |     // before sending to the decoder; this may affect how often partial_results are sent
28 |     optional int32 chunk_process_limit = 17 [default = 100];
29 | 
30 |     // cmn is an internal feature of the decoder
31 |     optional int32 cmn_window = 18 [default = 600];
32 | 
33 |     optional int32 cmn_latency = 19 [default = 150];
34 | 
35 |     // capitalize and expected_num_count are features of the "normalized" field of the AddDataResponse recognition result
36 | 
37 |     // specifies whether "normalized" results will be capitalized
38 |     optional bool capitalize = 20 [default = false];
39 | 
40 |     // if specified, the normalizer will try to fit this count; for example, "twenty two" will normalize to "20 2" if "3" is set as expected; by default it will be "22"
41 |     optional int32 expected_num_count = 21 [default = 0];
42 | 
43 |     // list of phrases for an on-the-fly grammar, for example "yes", "no" in case of en-US
44 |     // this field makes "topic" in ConnectionRequest irrelevant; instead, this list is used to build the "language model" on the fly
45 |     repeated string grammar = 22;
46 | 
47 |     // the same as the previous one, but with partial srgs support; you can specify items, tags and simple rules, for example:
48 |     //
49 |     // <?xml version="1.0"?>
50 |     // <grammar xmlns="http://www.w3.org/2001/06/grammar" xml:lang="ru-RU">
51 |     // <rule>
52 |     // <one-of>
53 |     // <item><tag>оплатил</tag>оплатил</item>
54 |     // <item><tag>оплатил</tag>да</item>
55 |     // <item><tag>не оплатил</tag>не оплатил</item>
56 |     // <item><tag>не оплатил</tag>нет</item>
57 |     // </one-of>
58 |     // </rule>
59 |     // </grammar>
60 |     optional string srgs = 23;
61 | 
62 |     // currently supports "gender", "age", "group", "language", "children", "emotion" and combinations with ",", like "age,gender"
63 |     // check out BiometryResult
64 |     optional string biometry = 24;
65 | 
66 |     // turn on the confidence rescoring procedure
67 |     optional bool use_snr = 25 [default = false];
68 | 
69 |     // flags for the confidence rescoring procedure
70 |     repeated SnrFlag snr_flags = 26;
71 | 
72 |     // used to distinguish between biometry groups (devices)
73 |     optional string biometry_group = 27;
74 | 
75 |     // enable special normalizers for "manual punctuation", i.e. replace "привет запятая как дела вопросительный знак" with "привет, как дела?"
76 |     optional bool manual_punctuation = 28 [default = false];
77 | }
78 | 
79 | message ConnectionRequest
80 | {
81 |     optional int32 protocolVersion = 1 [default = 1];
82 | 
83 |     // leave empty if you are not speechkit
84 |     required string speechkitVersion = 2;
85 | 
86 |     required string serviceName = 3; // "asr_dictation", etc.
87 | 
88 |     required string uuid = 4;
89 | 
90 |     optional string yandexuid = 21;
91 | 
92 |     required string apiKey = 5;
93 | 
94 |     required string applicationName = 6;
95 | 
96 |     // vendor:model:type... user defined
97 |     required string device = 7;
98 | 
99 |     // lat.lat,lon.lon
100 |     required string coords = 8;
101 | 
102 |     // "general", "mapsyari", "freeform", "music"
103 |     // topic is ignored if grammar or srgs from advancedASROptions are set
104 |     required string topic = 9;
105 | 
106 |     // "ru-RU"
107 |     required string lang = 10;
108 | 
109 |     // "audio/x-speex", "audio/x-pcm;bit=16;rate=8000", etc.
110 |     required string format = 11;
111 | 
112 |     // enable punctuation mode for the "freeform" topic (other topics may support punctuation in the future)
113 |     optional bool punctuation = 12 [default = true];
114 | 
115 |     optional bool disableAntimatNormalizer = 18 [default = false];
116 | 
117 |     optional AdvancedASROptions advancedASROptions = 19;
118 | 
119 |     optional bool skipAudioFromLogging = 20 [default = false];
120 | 
121 |     // deprecated
122 |     optional MusicRequest musicRequest = 17;
123 | }
124 | 
125 | ///////////////////////////////////////////////////////////////////////////
126 | 
127 | message AddData
128 | {
129 |     optional bytes audioData = 1;
130 | 
131 |     required bool lastChunk = 2;
132 | }
133 | 
134 | ///////////////////////////////////////////////////////////////////////////
135 | 
136 | message AlignInfo
137 | {
138 |     optional float start_time = 1;
139 | 
140 |     optional float end_time = 2;
141 | 
142 |     optional float acoustic_score = 3;
143 | 
144 |     optional float graph_score = 4;
145 | 
146 |     optional float lm_score = 5;
147 | 
148 |     optional float total_score = 6;
149 | }
150 | 
151 | message Word
152 | {
153 |     required float confidence = 1;
154 | 
155 |     required string value = 2;
156 | 
157 |     optional VoiceProxyProtobuf.AlignInfo align_info = 3;
158 | }
159 | 
160 | message Result
161 | {
162 |     // notice: confidence is valid only when endOfUtt is true, otherwise always "1"
163 |     required float confidence = 1;
164 | 
165 |     repeated Word words = 2;
166 | 
167 |     optional string normalized = 3;
168 | 
169 |     optional VoiceProxyProtobuf.AlignInfo align_info = 4;
170 | }
171 | 
172 | message BiometryResult
173 | {
174 |     required string classname = 1;
175 | 
176 |     required float confidence = 2;
177 | 
178 |     optional string tag = 3;
179 | }
180 | 
181 | message SnrFlag
182 | {
183 |     required string name = 1;
184 |     required string value = 2;
185 | }
186 | 
187 | message SnrFeature
188 | {
189 |     optional string name = 1;
190 | 
191 |     optional float value = 2;
192 | }
193 | 
194 | message SnrInfo
195 | {
196 |     optional string normalizedText = 1;
197 | 
198 |     optional float snrValue = 2;
199 | 
200 |     repeated SnrFeature features = 3;
201 | 
202 |     optional string featureSlices = 4;
203 | 
204 |     optional int32 originalCandidateIndex = 5;
205 | 
206 |     optional string candidateSource = 6;
207 | }
208 | 
209 | message SnrMetainfo
210 | {
211 |     optional string name = 1;
212 | }
213 | 
214 | message Metainfo
215 | {
216 |     required float minBeam = 1;
217 | 
218 |     required float maxBeam = 2;
219 | 
220 |     repeated SnrInfo snrInfos = 3;
221 | 
222 |     optional string topic = 4;
223 | 
224 |     optional string lang = 5;
225 | 
226 |     optional string version = 6;
227 | 
228 |     optional string load_timestamp = 7;
229 | 
230 |     optional int32 snrResponseCode = 8;
231 | 
232 |     optional bool snr_performed_rescoring = 9;
233 | 
234 |     optional SnrMetainfo snrMetainfo = 10;
235 | }
236 | 
237 | message AddDataResponse
238 | {
239 |     required BasicProtobuf.ConnectionResponse.ResponseCode responseCode = 1;
240 | 
241 |     repeated Result recognition = 2;
242 | 
243 |     // if true: recognition contains a fully parsed N-best list (n results with n words)
244 |     // otherwise recognition contains just 1 result with 1 word holding the current "partial result"
245 |     optional bool endOfUtt = 3 [default = false];
246 | 
247 |     // how many AddData requests were merged for this response
248 |     optional int32 messagesCount = 4 [default = 1];
249 | 
250 |     // if not empty, messagesCount should be 0
251 |     optional string musicProxyResponse = 5;
252 | 
253 |     repeated BiometryResult bioResult = 6;
254 | 
255 |     optional Metainfo metainfo = 7;
256 | }
257 | 
258 | // deprecated
259 | message MusicRequest
260 | {
261 |     message MusicParam
262 |     {
263 |         required string name = 1;
264 | 
265 |         required string value = 2;
266 |     }
267 | 
268 |     // default options are "uid", "OAuth", "widget"
269 |     repeated MusicParam musicProxyOptions = 1;
270 | }
271 | 
--------------------------------------------------------------------------------
/python/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *~
3 | MANIFEST
4 | dist/
5 | build/
6 | asrclient.egg-info/
7 | venv/
8 | */*.sound
9 | */*.txt
10 | 
--------------------------------------------------------------------------------
/python/README.txt:
--------------------------------------------------------------------------------
1 | Description:
2 | 
3 | This is a streaming client for the Yandex speech recognition service (aka Yandex ASR).
4 | Compared to the http-api, it provides much more info about the recognized text and the recognition process itself.
5 | It also has no limit on the input file length.
6 | 
7 | Install on Mac OS:
8 | 
9 | Install python pip & python protobuf using, for example, MacPorts
10 | (an open-source software package manager; installation instructions are here: https://www.macports.org/install.php):
11 | bash-3.2$ sudo port install git py27-pip py27-protobuf
12 | ...
13 | Continue? [Y/n]: Y
14 | ...
15 | 
16 | After installing pip & the protobuf compiler, you can check out the speechkit client:
17 | 
18 | bash-3.2$ git clone https://github.com/yandex/speechkitcloud
19 | ...
20 | bash-3.2$ cd speechkitcloud/python
21 | bash-3.2$ protoc -I=asrclient --python_out=asrclient asrclient/*.proto
22 | bash-3.2$ python ./setup.py sdist
23 | ...
24 | bash-3.2$ cd dist
25 | bash-3.2$ ls
26 | asrclient-0.5.0.tar.gz
27 | 
28 | The resulting filename may differ (a fresher version, etc.); use it for the pip install:
29 | 
30 | bash-3.2$ sudo pip install asrclient-0.5.0.tar.gz
31 | ...
32 | Successfully installed asrclient-0.5.0 futures-3.1.1
33 | 
34 | If the system macOS python is used by default, then asrclient-cli.py & ttsclient-cli.py are installed into the folder
35 | /Library/Frameworks/Python.framework/Versions/2.7/bin/
36 | otherwise (python from MacPorts is used by default) look for them inside the folder
37 | /opt/local/Library/Frameworks/Python.framework/Versions/2.7/bin/
38 | 
39 | Short help is available via the --help option:
40 | bash-3.2$ /Library/Frameworks/Python.framework/Versions/2.7/bin/asrclient-cli.py --help
41 | bash-3.2$ /Library/Frameworks/Python.framework/Versions/2.7/bin/ttsclient-cli.py --help
42 | or for a MacPorts python installation:
43 | bash-3.2$ /opt/local/Library/Frameworks/Python.framework/Versions/2.7/bin/asrclient-cli.py --help
44 | bash-3.2$ /opt/local/Library/Frameworks/Python.framework/Versions/2.7/bin/ttsclient-cli.py --help
45 | 
46 | XCode TROUBLESHOOTING:
47 | If after installing MacPorts you get the error:
48 | Warning: xcodebuild exists but failed to execute
49 | Warning: Xcode does not appear to be installed; most ports will likely fail to build.
50 | 
51 | use the following commands to fix it:
52 | sudo xcode-select -s /Applications/Xcode.app/Contents/Developer
53 | xcodebuild -license
54 | 
55 | 
56 | Install on Ubuntu/Debian:
57 | 
58 | You need to provide some python dependencies. We suggest something like this...
59 | 
60 | sudo apt-get install python2.7 python-setuptools python-pip git protobuf-compiler
61 | git clone https://github.com/yandex/speechkitcloud
62 | cd speechkitcloud/python
63 | protoc -I=asrclient --python_out=asrclient asrclient/*.proto
64 | python ./setup.py sdist
65 | 
66 | cd dist
67 | sudo pip install <generated asrclient-*.tar.gz>
68 | 
69 | ...or you can provide the dependencies manually and run ./asrclient-cli.py directly (without install).
70 | 
71 | 1. asrclient-cli.py
72 | 
73 | Usage:
74 | 
75 | asrclient-cli.py [OPTIONS] [FILES]...
76 | 
77 | Options:
78 |   -k, --key TEXT              You could get it at
79 |                               https://developer.tech.yandex.ru/. Default
80 |                               is "paste-your-own-key".
81 |                               Use "internal" with Speechkit Box.
82 |   -s, --server TEXT           Default is asr.yandex.net.
83 |   -p, --port INTEGER          Default is 80.
84 |   --format TEXT               Input file format. Default is
85 |                               audio/x-pcm;bit=16;rate=16000.
86 |   --model TEXT                Recognition model: freeform, maps, general, etc.
87 |                               Use the last one if your sound comes from a
88 |                               phone call. It's just a model name, sound
89 |                               format may be different. Default is
90 |                               freeform.
91 |   --lang TEXT                 Recognition language. ru-RU | en-EN | tr-TR
92 |                               | uk-UA. Default is ru-RU.
93 |   --chunk-size INTEGER        Default value 65536 bytes roughly equals to
94 |                               one second of audio in default format.
95 |   --start-with-chunk INTEGER  Use it to send only some part of the input
96 |                               file. Default is 0.
97 |   --max-chunks-count INTEGER  Use it to send only some part of the input
98 |                               file. Default means no limit is set.
99 |   --reconnect-delay FLOAT     Take a pause in case of network problems.
100 |                               Default value is 0.5 seconds.
101 |   --inter-utt-silence FLOAT   A pause between phrases finalization.
102 |                               Default value is 1.2 seconds.
103 |   --cmn-latency INTEGER       CMN latency parameter. Default value is 50.
104 |   --reconnect-retry-count INTEGER
105 |                               Sequential reconnects before giving up.
106 |                               Default is 5.
107 |   --silent                    Don't print debug messages, only recognized
108 |                               text.
109 |   --record                    Grab audio from system audio input instead
110 |                               of files.
111 |   --nopunctuation             Disable punctuation.
112 |   --uuid TEXT                 UUID of your request. It can be helpful for
113 |                               further logs analysis. Default is random.
114 |   --ipv4                      Use ipv4 only connection.
115 |   --realtime                  Emulate realtime record recognition.
116 |   --callback-module TEXT      Python module name which should implement
117 |                               advanced_callback(AddDataResponse).
118 |                               It takes the
119 |                               corresponding protobuf message as a
120 |                               parameter. See advanced_callback_example.py
121 |                               for details.
122 |   --help                      Show this message and exit.
123 | 
124 | 
125 | Examples:
126 | 
127 | asrclient-cli.py --help
128 | 
129 | asrclient-cli.py --key=active-key-from-your-account sound.wav
130 | 
131 | asrclient-cli.py --key=active-key-from-your-account --silent sound.wav
132 | 
133 | asrclient-cli.py --key=active-key-from-your-account --silent --callback-module advanced_callback_example sound.wav
134 | 
135 | More:
136 | 
137 | We expect incoming sound in the specific format audio/x-pcm;bit=16;rate=16000 (single channel).
138 | To convert some random sound file to this, we suggest:
139 | 
140 | sox sound.mp3 -t wav -c 1 --rate 16000 -b 16 -e signed-integer sound.wav
141 | 
142 | 2. ttsclient-cli.py
143 | 
144 | Usage: ttsclient-cli.py [OPTIONS] [FILE] [TEXTS]...
145 | 146 | Options: 147 | -k, --key TEXT You could get it at https://developer.tech.yandex.ru/. 148 | Default is "paste-your-own-key". 149 | -s, --server TEXT Default is tts.voicetech.yandex.net. 150 | -p, --port INTEGER Default is 80. 151 | --lang TEXT Synthesis language. ru-RU | en-EN | tr-TR | uk-UA. 152 | Default is ru-RU. 153 | --speaker TEXT Speaker for speech synthesis. Call this script with 154 | --list-speakers flag to get speakers list. 155 | --emotion TEXT Emotion for speech synthesis. Available values: good, 156 | neutral, evil. Default value depends on speaker's 157 | original emotion. 158 | --gender TEXT Speaker's gender for speech synthesis. Available 159 | values: male, female. Default value depends on 160 | speaker's original gender. 161 | --textfile FILENAME Read text from this file instead of command line 162 | arguments. 163 | --uuid TEXT UUID of your request. It can be helpful for further 164 | logs analysis. Default is random. 165 | --ipv4 Use ipv4 only connection. 166 | --list-speakers Only list available speakers, don't try to generate 167 | anything. 168 | --silent Don't print debug messages. 169 | --help Show this message and exit. 170 | 171 | Examples: 172 | 173 | ttsclient-cli.py --help 174 | 175 | ttsclient-cli.py --key=active-key-from-your-account --list-speakers 176 | 177 | ttsclient-cli.py --key=active-key-from-your-account --speaker jane --lang en-EN out.wav "Hello!" 178 | 179 | ttsclient-cli.py --key=active-key-from-your-account --speaker jane --textfile request.txt out.wav 180 | 181 | More: 182 | 183 | We generate sound in format audio/x-wav, single channel, 16000Hz, 16-bit signed integer PCM encoding. 184 | 185 | Useful links: 186 | 187 | http://sox.sourceforge.net/ - sound conversion library and utility. 188 | https://pypi.python.org/pypi/pip - python package manager. 189 | https://developer.tech.yandex.ru - obtain your key. 190 | https://tech.yandex.ru/speechkit/cloud/ - more about Yandex ASR. 
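Using the library from your own code:

Besides the CLI wrappers, the package can be driven directly from Python.
A minimal sketch (the callback signature mirrors default_callback in
asrclient-cli.py; "sound.wav" stands for your own input file):

    from asrclient import client

    def on_utterance(utterance, start_time=0.0, end_time=0.0, data=None):
        print(utterance)

    chunks = client.read_chunks_from_files([open("sound.wav", "rb")],
                                           client.DEFAULT_CHUNK_SIZE_VALUE)
    client.recognize(chunks,
                     callback=on_utterance,
                     key="active-key-from-your-account")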
191 | 
--------------------------------------------------------------------------------
/python/advanced_callback_example.py:
--------------------------------------------------------------------------------
1 | from asrclient.voiceproxy_pb2 import AddDataResponse as AsrResponse
2 | 
3 | """
4 | use it like
5 | ./asrclient-cli.py -k --callback-module advanced_callback_example --silent
6 | """
7 | 
8 | session_id = "not-set"
9 | 
10 | def advanced_callback(asr_response, correction = 0):
11 | print("Got response:")
12 | print("end-of-utterance = {}".format(asr_response.endOfUtt))
13 | r_count = 0
14 | for r in asr_response.recognition:
15 | print("recognition[{}] = {}; confidence = {}".format(r_count, r.normalized.encode("utf-8"), r.confidence))
16 | print("utterance timings: from {} to {}".format(r.align_info.start_time+correction,r.align_info.end_time+correction))
17 | w_count = 0
18 | for w in r.words:
19 | print("word[{}] = {}; confidence = {}".format(w_count, w.value.encode("utf-8"), w.confidence))
20 | print("word timings: from {} to {}".format(w.align_info.start_time+correction,w.align_info.end_time+correction))
21 | w_count += 1
22 | r_count += 1
23 | 
24 | 
25 | def advanced_utterance_callback(asr_response, data_chunks):
26 | data_length = 0
27 | for chunk in data_chunks:
28 | data_length += len(chunk) if chunk else 0
29 | print("Got complete utterance, for {0} data_chunks, session_id = {1}".format(len(data_chunks), session_id))
30 | print("Metainfo", asr_response.metainfo.minBeam, asr_response.metainfo.maxBeam)
31 | print("Data length = {0}".format(data_length))
--------------------------------------------------------------------------------
/python/advanced_callback_splitter.py:
--------------------------------------------------------------------------------
1 | import os, errno  # errno was missing but is used by mkdir_p below
2 | import datetime
3 | from asrclient.voiceproxy_pb2 import AddDataResponse as AsrResponse
4 | from asrclient.ttsclient import generateWavHeader
5 | 
6 | """
7 | use it like
8 | ./asrclient-cli.py -k --callback-module advanced_callback_splitter --silent
9 | """
10 | 
11 | session_id = "not-set"
12 | start_timestamp = datetime.datetime.now().strftime("%d-%m-%Y_%H%M%S")
13 | 
14 | def mkdir_p(path):
15 | try:
16 | os.makedirs(path)
17 | except OSError as exc: # Python >2.5
18 | if exc.errno == errno.EEXIST and os.path.isdir(path):
19 | pass
20 | else:
21 | raise
22 | 
23 | utterance_count = 0
24 | 
25 | def advanced_callback(asr_response, correction = 0):
26 | print("Got response:")
27 | print("end-of-utterance = {}".format(asr_response.endOfUtt))
28 | r_count = 0
29 | for r in asr_response.recognition:
30 | print("recognition[{}] = {}; confidence = {}".format(r_count, r.normalized.encode("utf-8"), r.confidence))
31 | print("utterance timings: from {} to {}".format(r.align_info.start_time+correction,r.align_info.end_time+correction))
32 | w_count = 0
33 | for w in r.words:
34 | print("word[{}] = {}; confidence = {}".format(w_count, w.value.encode("utf-8"), w.confidence))
35 | print("word timings: from {} to {}".format(w.align_info.start_time+correction,w.align_info.end_time+correction))
36 | w_count += 1
37 | r_count += 1
38 | 
39 | leftover = None
40 | leftmargin = 0
41 | def advanced_utterance_callback(asr_response, data_chunks):
42 | global utterance_count, leftover, leftmargin
43 | 
44 | dirname = "./{0}_{1}/".format(start_timestamp, session_id)
45 | if not os.path.isdir(dirname):
46 | mkdir_p(dirname)
47 | print("Got complete utterance, for {0} data_chunks, session_id = {1}".format(len(data_chunks), session_id))
48 | 
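    # The 32000 factors below convert align_info times (seconds) into byte
    # offsets: 16000 samples/sec * 2 bytes per sample for 16-bit mono PCM -
    # the same constant bytes_in_sec() in asrclient/client.py returns for
    # the default 16 kHz format.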
49 | with open("{0}/{1}_{2}.wav".format(dirname, session_id, utterance_count), "wb") as sound_file: 50 | left = 0 51 | right = sum(map(len, filter(lambda x: x, data_chunks))) 52 | if asr_response.recognition: 53 | if asr_response.recognition[0].words: 54 | left = asr_response.recognition[0].words[0].align_info.start_time * 32000 55 | right = asr_response.recognition[0].words[-1].align_info.end_time * 32000 56 | else: 57 | left = asr_response.recognition[0].align_info.start_time * 32000 58 | right = asr_response.recognition[0].align_info.end_time * 32000 59 | 60 | result = "" 61 | print(left, right) 62 | chunks = [leftover] + data_chunks 63 | leftover = None 64 | for chunk in chunks: 65 | if not chunk: 66 | continue 67 | if chunk.startswith("RIFF"): 68 | chunk = chunk[44:] 69 | if len(result): 70 | result += chunk 71 | else: 72 | print(left, leftmargin, len(chunk)) 73 | if left - leftmargin < len(chunk): 74 | cutat = int(left - leftmargin) 75 | print(cutat) 76 | if cutat % 2: 77 | cutat -= 1 78 | result += chunk[cutat:] 79 | leftmargin += len(chunk) 80 | 81 | right = int(right) 82 | if right%2: 83 | right-=1 84 | 85 | if right < leftmargin: 86 | offset = leftmargin - right 87 | leftover = result[-offset:] 88 | result = result[:-offset] 89 | leftmargin = right 90 | 91 | data_size = len(result) 92 | sound_file.write(generateWavHeader(16000, True, data_size)) 93 | sound_file.write(result) 94 | 95 | with open("{0}/{1}_{2}.txt".format(dirname, session_id, utterance_count), "w") as txt_file: 96 | text = asr_response.recognition[0].normalized.encode("utf-8") 97 | if text is not None: 98 | txt_file.write(text) 99 | 100 | utterance_count += 1 101 | -------------------------------------------------------------------------------- /python/asrclient-cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Yandex ASR streaming client.""" 4 | 5 | from __future__ import absolute_import 6 | import logging 7 | import click 8 | import sys 9 | 10 | import importlib 11 | from asrclient import client 12 | 13 | try: 14 | import pyaudio 15 | is_pyaudio = True 16 | except ImportError: 17 | is_pyaudio = False 18 | 19 | 20 | @click.command() 21 | @click.option('-k', '--key', 22 | help='You could get it at https://developer.tech.yandex.ru/. Default is "{0}".'.format(client.DEFAULT_KEY_VALUE), 23 | default=client.DEFAULT_KEY_VALUE) 24 | @click.option('-s', '--server', 25 | help='Default is {0}.'.format(client.DEFAULT_SERVER_VALUE), 26 | default=client.DEFAULT_SERVER_VALUE) 27 | @click.option('-p', '--port', 28 | help='Default is {0}.'.format(client.DEFAULT_PORT_VALUE), 29 | default=client.DEFAULT_PORT_VALUE) 30 | @click.option('--format', 31 | help='Input file format. Default is {0}.'.format(client.DEFAULT_FORMAT_VALUE), 32 | default=client.DEFAULT_FORMAT_VALUE) 33 | @click.option('--model', 34 | help='Recognition model. freeform | freeform8alaw. Use the last one if your sound comes from a phone call. It\'s just a model name, sound format may be different. Default is {0}.'.format(client.DEFAULT_MODEL_VALUE), 35 | default=client.DEFAULT_MODEL_VALUE) 36 | @click.option('--lang', 37 | help='Recognition language. ru-RU | en-EN | tr-TR | uk-UA. Default is {0}.'.format(client.DEFAULT_LANG_VALUE), 38 | default=client.DEFAULT_LANG_VALUE) 39 | @click.option('--app', 40 | help='Application. 
Default is local.',
41 | default="local")
42 | @click.option('--chunk-size',
43 | default=client.DEFAULT_CHUNK_SIZE_VALUE,
44 | help='The default value of {0} bytes roughly equals one second of audio in the default format.'.format(client.DEFAULT_CHUNK_SIZE_VALUE))
45 | @click.option('--start-with-chunk',
46 | default=0,
47 | help='Use it to send only some part of the input file. Default is 0.')
48 | @click.option('--max-chunks-count',
49 | default=None,
50 | type=int,
51 | help='Use it to send only some part of the input file. Default means no limit is set.')
52 | @click.option('--reconnect-delay',
53 | default=client.DEFAULT_RECONNECT_DELAY,
54 | help='Take a pause in case of network problems. Default value is {0} seconds.'.format(client.DEFAULT_RECONNECT_DELAY))
55 | @click.option('--inter-utt-silence',
56 | default=client.DEFAULT_INTER_UTT_SILENCE,
57 | type=float,
58 | help='A silence pause that finalizes a phrase. Default value is {0} seconds.'.format(client.DEFAULT_INTER_UTT_SILENCE/100.0))
59 | @click.option('--cmn-latency',
60 | default=client.DEFAULT_CMN_LATENCY,
61 | help='CMN latency parameter. Default value is {0}.'.format(client.DEFAULT_CMN_LATENCY))
62 | @click.option('--reconnect-retry-count',
63 | default=client.DEFAULT_RECONNECT_RETRY_COUNT,
64 | help='Sequential reconnects before giving up. Default is {0}.'.format(client.DEFAULT_RECONNECT_RETRY_COUNT))
65 | @click.option('--silent',
66 | is_flag=True,
67 | help='Don\'t print debug messages, only recognized text.')
68 | @click.option('--record',
69 | is_flag=True,
70 | help='Grab audio from system audio input instead of files.')
71 | @click.option('--nopunctuation',
72 | is_flag=True,
73 | help='Disable punctuation.')
74 | @click.option('--uuid',
75 | default=client.DEFAULT_UUID_VALUE,
76 | help='UUID of your request. It can be helpful for further logs analysis. Default is random.')
77 | @click.option('--ipv4',
78 | is_flag=True,
79 | help='Use ipv4 only connection.')
80 | @click.option('--realtime',
81 | is_flag=True,
82 | help='Emulate realtime record recognition.')
83 | @click.option('--callback-module',
84 | help='Python module name which should implement advanced_callback(AddDataResponse).\nIt takes the corresponding protobuf message as a parameter. See advanced_callback_example.py for details.',
85 | default=None)
86 | @click.argument('files',
87 | nargs=-1,
88 | type=click.File('rb'))
89 | @click.option('--capitalize',
90 | is_flag=True,
91 | help='Should each utterance start with a capital letter?')
92 | @click.option('--expected-num-count',
93 | default=0,
94 | type=int,
95 | help='How many digits should be in the answer? 
Special option, you don\'t need it!')
96 | @click.option('--biometry',
97 | help='Enable biometry: "gender", "age", "group", "language", or combine with "," like "age,gender"',
98 | default="")
99 | @click.option('--snr',
100 | default=False,
101 | is_flag=True,
102 | help='Deprecated')
103 | @click.option('--snr_flags',
104 | default="",
105 | type=str,
106 | help='Deprecated')
107 | @click.option('--grammar-file',
108 | default="",
109 | help='Custom grammar; can be a list of lines or an xml file description')
110 | def main(chunk_size, start_with_chunk, max_chunks_count, record, files, silent, **kwars):
111 | if not silent:
112 | logging.basicConfig(level=logging.INFO)
113 | 
114 | chunks = []
115 | if files:
116 | chunks = client.read_chunks_from_files(files,
117 | chunk_size,
118 | start_with_chunk,
119 | max_chunks_count)
120 | else:
121 | if record:
122 | if is_pyaudio:
123 | chunks = client.read_chunks_from_pyaudio(chunk_size)
124 | else:
125 | click.echo('Please install the pyaudio module for system audio recording.')
126 | sys.exit(-2)
127 | 
128 | def default_callback(utterance, start_time = 0.0, end_time = 0.0, data = None):
129 | click.echo(utterance)
130 | if (end_time > start_time):
131 | click.echo("from {0} to {1}".format(start_time, end_time))
132 | 
133 | if not chunks:
134 | click.echo('Please specify one or more input filenames.')
135 | else:
136 | client.recognize(chunks,
137 | callback=default_callback,
138 | **kwars)
139 | 
140 | if __name__ == "__main__":
141 | main()
142 | 
--------------------------------------------------------------------------------
/python/asrclient/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yandex/speechkitcloud/817e2bc2f090a17b8d3a9180848d5174d804bc3b/python/asrclient/__init__.py
--------------------------------------------------------------------------------
/python/asrclient/basic.proto:
--------------------------------------------------------------------------------
1 | // Yandex ASR dictation api (draft):
2 | // Client initiates a session with an http upgrade request, for example:
3 | // GET /asr_partial HTTP/1.1\r\n
4 | // User-Agent:KeepAliveClient\r\n
5 | // Host: voice-stream.voicetech.yandex.net:80\r\n
6 | // Upgrade: dictation\r\n\r\n
7 | // Receive an HTTP 101 Switching Protocols response.
8 | // Next, send/receive protobuf messages in the format
9 | // [hex size]\r\n[message body serialized with protobuf]
10 | 
11 | // send ConnectionRequest, read ConnectionResponse... etc
12 | // send AddData, read AddDataResponse and so on.
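// For example, a 26-byte serialized message is framed on the wire as the
// ASCII characters "1a" + "\r\n" followed by the 26 raw protobuf bytes
// (see sendMessage()/recvMessage() in python/asrclient/transport.py).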
13 | syntax = "proto2"; 14 | 15 | package BasicProtobuf; 16 | 17 | message ConnectionResponse 18 | { 19 | required ResponseCode responseCode = 1; 20 | 21 | required string sessionId = 2; 22 | 23 | optional string message = 3; 24 | 25 | enum ResponseCode { 26 | OK = 200; 27 | BadMessageFormatting = 400; 28 | UnknownService = 404; 29 | NotSupportedVersion = 405; 30 | Timeout = 408; 31 | ProtocolError = 410; 32 | InternalError = 500; 33 | InvalidKey = 429; 34 | InvalidRequestParams = 406; 35 | UnsupportedMediaType = 415; 36 | } 37 | } 38 | 39 | -------------------------------------------------------------------------------- /python/asrclient/client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Yandex ASR streaming library.""" 4 | 5 | import os 6 | import logging 7 | import sys 8 | import time 9 | import codecs 10 | import importlib 11 | 12 | from uuid import uuid4 as randomUuid 13 | from socket import error as SocketError 14 | from google.protobuf.message import DecodeError as DecodeProtobufError 15 | if sys.version_info >= (3, 0): 16 | from .basic_pb2 import ConnectionResponse 17 | from .voiceproxy_pb2 import ConnectionRequest, AddData, AddDataResponse, AdvancedASROptions, SnrFlag 18 | from .transport import Transport, TransportError 19 | else: 20 | from basic_pb2 import ConnectionResponse 21 | from voiceproxy_pb2 import ConnectionRequest, AddData, AddDataResponse, AdvancedASROptions, SnrFlag 22 | from transport import Transport, TransportError 23 | from concurrent.futures import ThreadPoolExecutor, Future 24 | 25 | 26 | DEFAULT_KEY_VALUE = 'paste-your-own-key' 27 | DEFAULT_SERVER_VALUE = 'asr.yandex.net' 28 | DEFAULT_PORT_VALUE = 80 29 | 30 | DEFAULT_FORMAT_VALUE = 'audio/x-pcm;bit=16;rate=16000' 31 | # 'audio/x-pcm;bit=16;rate=8000' # use this format for 8k bitrate wav and pcm 32 | 33 | DEFAULT_MODEL_VALUE = 'freeform' 34 | DEFAULT_LANG_VALUE = 'ru-RU' 35 | 36 | DEFAULT_UUID_VALUE = randomUuid().hex 37 | 38 | DEFAULT_CHUNK_SIZE_VALUE = 1024*32*2 39 | DEFAULT_RECONNECT_DELAY = 0.5 40 | DEFAULT_RECONNECT_RETRY_COUNT = 5 41 | DEFAULT_PENDING_LIMIT = 50 42 | 43 | DEFAULT_INTER_UTT_SILENCE = 120 44 | DEFAULT_CMN_LATENCY = 50 45 | 46 | def bytes_in_sec(format): 47 | if "8000" in format: 48 | return 16000 49 | else: 50 | return 32000 51 | 52 | 53 | def read_chunks_from_pyaudio(chunk_size = DEFAULT_CHUNK_SIZE_VALUE): 54 | import pyaudio 55 | p = pyaudio.PyAudio() 56 | stream = p.open(format=pyaudio.paInt16, 57 | channels=1, 58 | rate=16000, 59 | input=True, 60 | frames_per_buffer=1024) 61 | while True: 62 | yield stream.read(chunk_size) 63 | 64 | 65 | def read_chunks_from_files(files, chunksize, start_from = 0, max_count = None): 66 | count = 0 67 | for f in files: 68 | chunk = f.read(chunksize) 69 | while chunk: 70 | if start_from <= count: 71 | if max_count is None or count < start_from + max_count: 72 | yield chunk 73 | count += 1 74 | chunk = f.read(chunksize) 75 | f.close() 76 | 77 | 78 | class ServerError(RuntimeError): 79 | def __init__(self, message): 80 | RuntimeError.__init__(self, message) 81 | 82 | 83 | class ServerConnection(object): 84 | 85 | def __init__(self, host, port, key, app, service, topic, lang, format, uuid, inter_utt_silence, cmn_latency, biometry, logger=None, punctuation=True, ipv4=False, capitalize=False, expected_num_count=0, snr=False, snr_flags=None, grammar_file=""): 86 | self.host = host 87 | self.port = port 88 | self.key = key 89 | self.app = app 90 | self.topic = 
topic 91 | self.service = service 92 | self.lang = lang 93 | self.format = format 94 | self.uuid = uuid 95 | self.logger = logger 96 | self.biometry = biometry 97 | self.punctuation = punctuation 98 | self.inter_utt_silence = inter_utt_silence 99 | self.cmn_latency = cmn_latency 100 | self.ipv4 = ipv4 101 | self.capitalize = capitalize 102 | self.expected_num_count = expected_num_count 103 | self.snr = snr 104 | 105 | if not snr_flags: 106 | self.snr_flags = [] 107 | elif isinstance(snr_flags, str) or isinstance(snr_flags, unicode): 108 | self.snr_flags = [a.split("=") for a in snr_flags.split(",")] 109 | else: 110 | self.snr_flags = snr_flags 111 | 112 | self.grammar_file = grammar_file 113 | 114 | self.log("uuid={0}".format(self.uuid)) 115 | 116 | self.session_id = "not-set" 117 | self.connect() 118 | 119 | 120 | def log(self, message): 121 | if self.logger is not None: 122 | self.logger.info(message) 123 | 124 | def connect(self): 125 | self.t = Transport(self.host, self.port, timeout=None, verbose=False, enable_ssl=(self.port==443), ipv4=self.ipv4) 126 | if not self.upgrade_connection(): 127 | raise ServerError('Unable to upgrade connection') 128 | self.log("Connected to {0}:{1}.".format(self.host, self.port)) 129 | 130 | response = self.send_init_request() 131 | if response.responseCode != 200: 132 | error_text = 'Wrong response from server, status_code={0}'.format( 133 | response.responseCode) 134 | if response.HasField("message"): 135 | error_text += ', message is "{0}"'.format(response.message) 136 | raise ServerError(error_text) 137 | 138 | self.session_id = response.sessionId 139 | self.log("session_id={0}".format(self.session_id)) 140 | 141 | return self.session_id 142 | 143 | def send_init_request(self): 144 | advancedASROptions = AdvancedASROptions( 145 | utterance_silence=int(self.inter_utt_silence), 146 | cmn_latency=self.cmn_latency, 147 | capitalize=self.capitalize, 148 | expected_num_count=self.expected_num_count, 149 | biometry=self.biometry, 150 | use_snr=self.snr, 151 | snr_flags=[SnrFlag(name=a[0], value=a[1]) for a in self.snr_flags], 152 | ) 153 | if len(self.grammar_file) > 0: 154 | with codecs.open(self.grammar_file, encoding='utf-8') as grammar: 155 | advancedASROptions.srgs = grammar.read() 156 | request = ConnectionRequest( 157 | speechkitVersion='', 158 | serviceName=self.service, 159 | uuid=self.uuid, 160 | apiKey=self.key, 161 | applicationName=self.app, 162 | device='desktop', 163 | coords='0, 0', 164 | topic=self.topic, 165 | lang=self.lang, 166 | format=self.format, 167 | punctuation=self.punctuation, 168 | advancedASROptions=advancedASROptions 169 | ) 170 | 171 | self.t.sendProtobuf(request) 172 | return self.t.recvProtobuf(ConnectionResponse) 173 | 174 | def upgrade_connection(self): 175 | logger = logging.getLogger('arslib') 176 | request = ('GET /asr_partial_checked HTTP/1.1\r\n' 177 | 'User-Agent: {user_agent}\r\n' 178 | 'Host: {host}:{port}\r\n' 179 | 'Upgrade: {service}\r\n\r\n').format( 180 | user_agent=self.app, 181 | host=self.host, 182 | port=self.port, 183 | service=self.service) 184 | 185 | self.t.send(request) 186 | check = 'HTTP/1.1 101 Switching Protocols' 187 | buffer = '' 188 | 189 | # possible infinite loop here? 
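        # In practice this is bounded by the 300-byte check below: anything
        # other than the expected 101 response makes us bail out. It can still
        # block indefinitely, though, because the Transport above was created
        # with timeout=None, so recv(1) has no deadline.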
190 | while True:
191 | buffer += self.t.recv(1)
192 | if buffer.startswith(check) and buffer.endswith('\r\n\r\n'):
193 | return True
194 | if len(buffer) > 300:
195 | logger.warning(buffer)
196 | return False
197 | 
198 | def close(self):
199 | self.session_id = ""
200 | self.t.close()
201 | 
202 | def reconnect(self, delay=None):
203 | self.log('Reconnecting!')
204 | self.close()
205 | if delay is not None:
206 | self.log('Going to sleep for {0} seconds'.format(delay))
207 | time.sleep(delay)
208 | self.connect()
209 | 
210 | def add_data(self, chunk):
211 | if chunk is None:
212 | self.t.sendProtobuf(AddData(lastChunk=True))
213 | else:
214 | self.t.sendProtobuf(AddData(lastChunk=False, audioData=chunk))
215 | 
216 | 
217 | def get_response_if_ready(self):
218 | response = self.t.recvProtobufIfAny(AddDataResponse, ConnectionResponse)
219 | 
220 | if isinstance(response, ConnectionResponse):
221 | raise ServerError("Bad AddData response: %s %s" % (response.responseCode, response.message))
222 | 
223 | if response is not None:
224 | if response.responseCode != 200:
225 | error_text = 'Wrong response from server, status_code={0}'.format(
226 | response.responseCode)
227 | if response.HasField("message"):
228 | error_text += ', message is "{0}"'.format(response.message)
229 | raise ServerError(error_text)
230 | 
231 | return response
232 | 
233 | def recognize(chunks,
234 | callback=None,
235 | advanced_callback=None,
236 | callback_module=None,
237 | format=DEFAULT_FORMAT_VALUE,
238 | server=DEFAULT_SERVER_VALUE,
239 | port=DEFAULT_PORT_VALUE,
240 | key=DEFAULT_KEY_VALUE,
241 | app='local',
242 | service='dictation',
243 | model=DEFAULT_MODEL_VALUE,
244 | lang=DEFAULT_LANG_VALUE,
245 | inter_utt_silence=DEFAULT_INTER_UTT_SILENCE,
246 | cmn_latency=DEFAULT_CMN_LATENCY,
247 | biometry="",
248 | uuid=DEFAULT_UUID_VALUE,
249 | reconnect_delay=DEFAULT_RECONNECT_DELAY,
250 | reconnect_retry_count=DEFAULT_RECONNECT_RETRY_COUNT,
251 | pending_limit=DEFAULT_PENDING_LIMIT,
252 | ipv4=False,
253 | nopunctuation=False,
254 | realtime=False,
255 | capitalize=False,
256 | expected_num_count=0,
257 | snr=False,
258 | snr_flags=None,
259 | grammar_file=""):
260 | 
261 | advanced_utterance_callback = None
262 | imported_module = None
263 | 
264 | if callback_module is not None:
265 | imported_module = importlib.import_module(callback_module)
266 | 
267 | try:
268 | advanced_callback = imported_module.advanced_callback
269 | except AttributeError:
270 | print("No advanced callback in the imported module!")
271 | 
272 | try:
273 | advanced_utterance_callback = imported_module.advanced_utterance_callback
274 | except AttributeError:
275 | print("No advanced utterance callback in the imported module!")
276 | 
277 | 
278 | class PendingRecognition(object):
279 | def __init__(self):
280 | self.logger = logging.getLogger('asrclient')
281 | 
282 | self.server = ServerConnection(server, port, key, app, service, model, lang, format, uuid, inter_utt_silence, cmn_latency, biometry, self.logger, not nopunctuation, ipv4, capitalize, expected_num_count, snr, snr_flags, grammar_file)
283 | self.unrecognized_chunks = []
284 | self.retry_count = 0
285 | self.pending_answers = 0
286 | self.chunks_answered = 0
287 | self.utterance_start_index = 0
288 | self.executor = ThreadPoolExecutor(max_workers=1)
289 | self.future = None
290 | self.last_end_time = 0
291 | self.correction_delta = 0
292 | self.last_chunk_sent = False
293 | 
294 | def check_result(self):
295 | while True:
296 | try:
297 | response = self.server.get_response_if_ready()
298 
| if response is not None: 299 | self.on_response(response) 300 | if self.last_chunk_sent and self.pending_answers <= 0: 301 | return 302 | else: 303 | time.sleep(0.01) 304 | except Exception as e: 305 | if self.pending_answers > 0: 306 | print("check result exception") 307 | print(type(e)) 308 | print(e) 309 | raise e 310 | else: 311 | return 312 | 313 | def on_response(self, response): 314 | 315 | messages_count = response.messagesCount 316 | self.chunks_answered += messages_count 317 | self.pending_answers -= messages_count 318 | 319 | self.logger.info("got response: endOfUtt={0}; len(recognition)={1}; messages_count={2}".format(response.endOfUtt, len(response.recognition), messages_count)) 320 | 321 | if response.endOfUtt: 322 | if (len(response.recognition) > 0): 323 | start_time = response.recognition[0].align_info.start_time + self.correction_delta 324 | end_time = response.recognition[0].align_info.end_time + self.correction_delta 325 | 326 | if start_time < self.last_end_time: 327 | self.correction_delta = self.last_end_time 328 | 329 | self.last_end_time = end_time 330 | 331 | if advanced_callback is not None: 332 | try: 333 | advanced_callback(response, self.correction_delta) 334 | except Exception as e: 335 | print("Exception in advanced_callback: ", e) 336 | else: 337 | if advanced_callback is not None: 338 | try: 339 | advanced_callback(response) 340 | except Exception as e: 341 | print("Exception in advanced_callback: ", e) 342 | return 343 | 344 | 345 | self.logger.info('Chunks from {0} to {1}.'.format(self.utterance_start_index, self.utterance_start_index + self.chunks_answered)) 346 | 347 | if advanced_utterance_callback is not None: 348 | try: 349 | advanced_utterance_callback(response, self.unrecognized_chunks[:self.chunks_answered]) 350 | except Exception as e: 351 | print("Exception in advanced_utterance_callback: ", e) 352 | elif callback is not None: 353 | if (len(response.recognition) > 0): 354 | start_time = response.recognition[0].align_info.start_time + self.correction_delta 355 | end_time = response.recognition[0].align_info.end_time + self.correction_delta 356 | utterance = response.recognition[0].normalized.encode('utf-8') 357 | callback(utterance, start_time, end_time, self.unrecognized_chunks[:self.chunks_answered]) 358 | 359 | del self.unrecognized_chunks[:self.chunks_answered] 360 | self.utterance_start_index += self.chunks_answered 361 | self.chunks_answered = 0 362 | self.retry_count = 0 363 | 364 | def send(self, chunk): 365 | self.logger.info("entering send() :start index {0}, pending answers {1}, chunks answered {2}".format(self.utterance_start_index, self.pending_answers, self.chunks_answered)) 366 | try: 367 | self.server.add_data(chunk) 368 | self.pending_answers += 1 369 | if chunk is None: 370 | self.last_chunk_sent = True 371 | except (DecodeProtobufError, ServerError, TransportError, SocketError) as e: 372 | self.logger.exception("Something bad happened, waiting for reconnect!") 373 | time.sleep(1) 374 | self.resendOnError() 375 | except Exception as e: 376 | self.logger.info("dbg send") 377 | print(type(e)) 378 | print(e) 379 | 380 | def reconnectOnError(self): 381 | global retry_count 382 | if self.retry_count < reconnect_retry_count: 383 | self.retry_count += 1 384 | self.server.reconnect(reconnect_delay) 385 | if imported_module is not None: 386 | imported_module.session_id = self.server.session_id 387 | else: 388 | raise RuntimeError("Gave up reconnecting!") 389 | 390 | def resendOnError(self): 391 | self.logger.info('Resending current 
utterance (chunks {0}-{1})...'.format(self.utterance_start_index, self.utterance_start_index + len(self.unrecognized_chunks)))
392 | self.pending_answers = 0
393 | self.chunks_answered = 0
394 | for i, chunk in enumerate(self.unrecognized_chunks):
395 | 
396 | while state.pending_answers > pending_limit:
397 | time.sleep(0.01)
398 | 
399 | if chunk is not None:
400 | self.logger.info('About to send chunk {0} ({1} bytes)'.format(self.utterance_start_index + i, len(chunk)))
401 | else:
402 | self.logger.info('No more chunks. Finalizing recognition.')
403 | 
404 | self.send(chunk)
405 | 
406 | 
407 | start_at = time.time()
408 | 
409 | state = PendingRecognition()
410 | if imported_module is not None:
411 | imported_module.session_id = state.server.session_id
412 | 
413 | state.logger.info('Recognition was started.')
414 | chunks_count = 0
415 | 
416 | state.future = state.executor.submit(state.check_result)
417 | 
418 | sent_length = 0
419 | for index, chunk in enumerate(chunks):
420 | 
421 | def check_future():
422 | if not state.future.running():
423 | state.logger.info("future not running!")
424 | state.logger.info(state.future.exception())
425 | return False
426 | return True
427 | 
428 | def onError(exception):
429 | state.logger.info('Connection lost! ({0})'.format(type(exception)))
430 | state.logger.info(exception.message)
431 | state.future.cancel()
432 | state.reconnectOnError()
433 | state.future = state.executor.submit(state.check_result)
434 | state.resendOnError()
435 | 
436 | while realtime and (sent_length / bytes_in_sec(format) > time.time() - start_at):
437 | time.sleep(0.01)
438 | if not check_future():
439 | onError(state.future.exception())
440 | 
441 | 
442 | while state.pending_answers > pending_limit:
443 | time.sleep(0.01)
444 | if not check_future():
445 | onError(state.future.exception())
446 | 
447 | state.logger.info('About to send chunk {0} ({1} bytes)'.format(index, len(chunk)))
448 | state.unrecognized_chunks.append(chunk)
449 | state.send(chunk)
450 | chunks_count = index + 1
451 | sent_length += len(chunk)
452 | 
453 | state.logger.info('No more chunks. Finalizing recognition.')
454 | state.unrecognized_chunks.append(None)
455 | state.send(None)
456 | 
457 | state.future.result()
458 | 
459 | state.logger.info('Recognition is done.')
460 | 
461 | fin_at = time.time()
462 | seconds_elapsed = fin_at - start_at
463 | 
464 | state.logger.info("Start at {0}, finish at {1}, took {2} seconds".format(time.strftime("[%d.%m.%Y %H:%M:%S]", time.localtime(start_at)),
465 | time.strftime("[%d.%m.%Y %H:%M:%S]", time.localtime(fin_at)),
466 | seconds_elapsed))
467 | chunks_per_second = chunks_count / seconds_elapsed
468 | state.logger.info("Avg. 
{0} chunks per second".format(chunks_per_second)) 469 | state.server.close() 470 | -------------------------------------------------------------------------------- /python/asrclient/transport.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import select 3 | import sys 4 | import time 5 | import ssl 6 | import pprint 7 | 8 | 9 | class TransportError(RuntimeError): 10 | def __init__(self, message): 11 | RuntimeError.__init__(self, message) 12 | 13 | 14 | class Transport: 15 | def __init__(self, ip, port, timeout=8, verbose=True, enable_ssl=False, ipv4=False, max_faults=0): 16 | self.verbose = verbose 17 | self.max_faults = max_faults 18 | tries = 5 19 | while tries > 0: 20 | try: 21 | if self.verbose: 22 | print('Trying to connect %s:%s' % (ip, port)) 23 | print("Tries left: %s" % (tries,)) 24 | if enable_ssl: 25 | s = socket.socket(socket.AF_INET if ipv4 else socket.AF_INET6, socket.SOCK_STREAM) 26 | ssl_sock = ssl.wrap_socket(s) 27 | ssl_sock.connect((ip, port)) 28 | print(repr(ssl_sock.getpeername())) 29 | print(ssl_sock.cipher()) 30 | print(pprint.pformat(ssl_sock.getpeercert())) 31 | self.socket = ssl_sock 32 | else: 33 | self.socket = socket.create_connection((ip, port), timeout) 34 | self.socket.settimeout(timeout) 35 | return None 36 | except Exception as ex: 37 | tries -= 1 38 | time.sleep(1) 39 | if (tries == 0): 40 | raise ex 41 | 42 | def __enter__(self): 43 | return self 44 | 45 | def send(self, data): 46 | faults = 0 47 | 48 | while True: 49 | try: 50 | rlist, wlist, xlist = select.select([], [self.socket], [self.socket], 0.1) 51 | if len(xlist): 52 | raise TransportError("send unavailable!") 53 | if len(wlist): 54 | break 55 | except Exception as e: 56 | if self.verbose: 57 | print("Exception on pre-send select: ", e) 58 | faults += 1 59 | if faults > self.max_faults: 60 | raise e 61 | while True: 62 | try: 63 | self.socket.send(data.encode("utf-8") if (sys.version_info[0] == 3 and type(data) == str) else data) 64 | break 65 | except Exception as e: 66 | if self.verbose: 67 | print("Exception on send: ", e) 68 | faults += 1 69 | if faults > self.max_faults: 70 | raise e 71 | if self.verbose: 72 | print("Send " + str(len(data))) 73 | 74 | def recv(self, length, decode=(sys.version_info[0] == 3)): 75 | res = b"" 76 | faults = 0 77 | while True: 78 | try: 79 | res += self.socket.recv(length - len(res)) 80 | if len(res) < length: 81 | rlist, _, xlist = select.select([self.socket], [], [self.socket], 0.1) 82 | else: 83 | if decode: 84 | return res.decode("utf-8") 85 | else: 86 | return res 87 | except Exception as e: 88 | if self.verbose: 89 | print("Exception on recv: ", e) 90 | faults += 1 91 | if faults > self.max_faults: 92 | raise e 93 | 94 | def sendFull(self, message): 95 | begin = 0 96 | while begin < len(message): 97 | begin += self.socket.send(message[begin:]) 98 | 99 | def sendMessage(self, message): 100 | self.socket.send(hex(len(message))[2:].encode("utf-8")) 101 | self.socket.send(b'\r\n') 102 | self.sendFull(message) 103 | if self.verbose: 104 | print("Send message size: ", len(message)) 105 | 106 | def recvMessage(self): 107 | size = b'' 108 | while True: 109 | symbol = self.socket.recv(1) 110 | 111 | if len(symbol) == 0: 112 | raise TransportError('Backend closed connection') 113 | 114 | assert(len(symbol) == 1), 'Bad symbol len from socket ' + str(len(symbol)) 115 | 116 | if symbol == b'\r': 117 | self.socket.recv(1) 118 | break 119 | else: 120 | size += symbol 121 | sizeInt = int(b'0x' + size, 0) 122 | if 
self.verbose:
123 | print("Got message. Expecting {0} bytes length.".format(sizeInt))
124 | if (sizeInt > 0):
125 | result = b''
126 | while len(result) < sizeInt:
127 | result += self.socket.recv(sizeInt - len(result), False)
128 | result = result
129 | assert (len(result) == sizeInt), 'Invalid message size'
130 | return result
131 | return ''
132 | 
133 | def sendProtobuf(self, protobuf):
134 | self.sendMessage(protobuf.SerializeToString())
135 | 
136 | def recvProtobuf(self, *protobufTypes):
137 | savedException = None
138 | 
139 | message = self.recvMessage()
140 | for protoType in protobufTypes:
141 | response = protoType()
142 | try:
143 | response.ParseFromString(message)
144 | return response
145 | except Exception as exc:
146 | savedException = exc
147 | 
148 | raise savedException
149 | 
150 | def recvProtobufIfAny(self, *protobuf):
151 | rlist, wlist, xlist = select.select([self.socket], [], [self.socket], 0)
152 | if (len(rlist)):
153 | return self.recvProtobuf(*protobuf)
154 | else:
155 | return None
156 | 
157 | def transfer(self, sendProtobuf, receiveType):
158 | self.sendProtobuf(sendProtobuf)
159 | return self.recvProtobuf(receiveType)
160 | 
161 | def close(self):
162 | if self.verbose:
163 | print('Close socket ' + str(self.socket))
164 | self.socket.close()
165 | 
166 | def __exit__(self, type, value, traceback):
167 | self.close()
168 | 
169 | server = "127.0.0.1"
170 | port = 8089
171 | 
172 | 
173 | def defaultHost():
174 | return "{0}:{1}".format(server, port)
175 | 
176 | 
177 | def defaultTransport():
178 | return Transport(server, port, verbose=False)
179 | 
--------------------------------------------------------------------------------
/python/asrclient/tts.proto:
--------------------------------------------------------------------------------
1 | syntax = "proto2";
2 | 
3 | import "ttsbackend.proto";
4 | 
5 | package TTS;
6 | 
7 | ///////////////////////////////////////////////////////////////////////////
8 | // Usage:
9 | //(1) send ConnectionRequest, receive ConnectionResponse
10 | //(2) send ParamsRequest, receive ParamsResponse
11 | //(3) send GenerateRequest, receive GenerateResponse
12 | //
13 | // The TTS proxy requires all these steps in a fixed order,
14 | // and none can be skipped
15 | 
16 | message ConnectionRequest
17 | {
18 | required string serviceName = 1; // "tts", "asr", "asr_dictation", etc. 
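// (this python client always sends "tts" here - see the ConnectionRequest
// construction in list_speakers()/generate() in python/asrclient/ttsclient.py)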
19 | 
20 | required string uuid = 2;
21 | 
22 | optional int32 protocolVersion = 3 [default = 1];
23 | 
24 | optional string deviceName = 4;
25 | 
26 | // new field v2
27 | 
28 | optional string speechkitVersion = 5;
29 | 
30 | // warning: the apiKey option is only temporary (for the /tcp handler); for /ytcp its absence will result in ConnectionResponse::KeyInvalid
31 | optional string apiKey = 6;
32 | 
33 | optional string applicationName = 7;
34 | 
35 | optional string coords = 8;
36 | }
37 | 
38 | ///////////////////////////////////////////////////////////////////////////
39 | 
40 | message ParamsRequest
41 | {
42 | optional bool listVoices = 1;
43 | }
44 | 
45 | message ParamsResponse
46 | {
47 | message Voice
48 | {
49 | // use as "voice" in GenerateRequest
50 | required string name = 1;
51 | // 1 female, 2 male
52 | required int32 gender = 2;
53 | // 0x809 english
54 | // 0x419 ru
55 | // 0 for an "international" voice that can be used with any language
56 | required int32 languageId = 3;
57 | 
58 | required int32 initialSampleFreq = 4;
59 | 
60 | // human-friendly display name for menus and gui
61 | optional string displayName = 5;
62 | 
63 | // this voice can be used in lowLevelGenerateRequest for mixing
64 | optional bool coreVoice = 6;
65 | }
66 | 
67 | repeated Voice voiceList = 1;
68 | }
69 | 
70 | ///////////////////////////////////////////////////////////////////////////
71 | 
72 | message GenerateRequest
73 | {
74 | required string lang = 1;
75 | required string text = 2;
76 | required string application = 3;
77 | required string platform = 4;
78 | required string voice = 6;
79 | optional float speed = 31;
80 | optional string emotion = 10;
81 | 
82 | enum Quality {
83 | High = 0; Low = 1; UltraHigh = 2;
84 | // Low means resample to 8000!
85 | // High means resample to 16000.
86 | // UltraHigh means 48000 (or 32000 for SPEEX)
87 | }
88 | 
89 | enum Format {
90 | Wav = 0; Pcm = 1; Spx = 2; Opus = 3;
91 | }
92 | 
93 | optional Quality quality = 7 [default = High];
94 | optional Format format = 8 [default = Spx];
95 | optional bool requireMetainfo = 5 [default = false];
96 | 
97 | optional Generate lowLevelGenerateRequest = 30;
98 | 
99 | // keep calm, and do not use slot '9' again
100 | optional string speed_obsolete = 9;
101 | 
102 | optional float volume = 32 [default = 1.0];
103 | 
104 | optional bool chunked = 33 [default = false];
105 | }
106 | 
107 | message Feedback
108 | {
109 | required int32 elapsed = 1;
110 | required string event = 2;
111 | }
112 | 
--------------------------------------------------------------------------------
/python/asrclient/ttsbackend.proto:
--------------------------------------------------------------------------------
1 | syntax = "proto2";
2 | 
3 | import "basic.proto";
4 | 
5 | package TTS;
6 | 
7 | message Generate
8 | {
9 | optional string sessionId = 30;
10 | 
11 | //Language code, ex.: ru, en
12 | required string lang = 1;
13 | 
14 | message WeightedParam
15 | {
16 | required string name = 1;
17 | required float weight = 2;
18 | }
19 | 
20 | //Text for synthesis
21 | required string text = 2;
22 | 
23 | //Speed of speech: <1.0 - slower, >1.0 - faster
24 | optional float speed = 3 [default = 1.0];
25 | 
26 | //Supported voices are: zhar, omazh, jane, ermil, oksana
27 | repeated WeightedParam voices = 11;
28 | 
29 | //Supported emotions are: good, neutral, evil
30 | repeated WeightedParam emotions = 12;
31 | 
32 | //Supported genders are: male, female
33 | repeated WeightedParam genders = 13;
34 | 
35 | optional bool requireMetainfo = 5 [default = false];
36 | 
37 | optional float msd_threshold 
= 14;
38 | optional float mgc_recurrence = 15;
39 | optional float subtract_durations_sigmas = 17;
40 | optional float lf0_postfilter = 18;
41 | optional float mgcGVWeight = 19;
42 | optional float lf0GVWeight = 20;
43 | optional float mvfGVWeight = 21;
44 | optional float mgc_postfilter1 = 22;
45 | optional float mgc_postfilter2 = 23;
46 | 
47 | optional bool chunked = 24; //ex-fast
48 | }
49 | 
50 | message GenerateResponse
51 | {
52 | message WordEvent
53 | {
54 | required int32 firstCharPositionInText = 1;
55 | required int32 bytesLengthInSignal = 2;
56 | optional string text = 3;
57 | optional string postag = 4;
58 | optional string homographTag = 5;
59 | }
60 | message Phoneme
61 | {
62 | required string ttsPhoneme = 1;
63 | required string IPAPhoneme = 2;
64 | required int32 viseme = 5;
65 | required int32 durationMs = 3;
66 | required int32 positionInBytesStream = 4;
67 | }
68 | 
69 | // words and phonemes will be empty unless requireMetainfo is set in GenerateRequest
70 | repeated WordEvent words = 1;
71 | repeated Phoneme phonemes = 2;
72 | optional bytes audioData = 3;
73 | required bool completed = 4;
74 | 
75 | optional BasicProtobuf.ConnectionResponse.ResponseCode responseCode = 5;
76 | //Error message
77 | optional string message = 6;
78 | 
79 | //Lingware information
80 | optional string lang = 7;
81 | optional string version = 8;
82 | }
83 | 
84 | message StopGeneration
85 | {
86 | }
87 | 
--------------------------------------------------------------------------------
/python/asrclient/ttsclient.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import sys, os
5 | import requests
6 | from datetime import datetime
7 | import time
8 | import random
9 | import logging
10 | 
11 | if sys.version_info >= (3, 0):
12 | from .transport import Transport
13 | from .basic_pb2 import ConnectionResponse
14 | from .ttsbackend_pb2 import Generate, GenerateResponse
15 | from .tts_pb2 import GenerateRequest, ConnectionRequest, ParamsRequest, ParamsResponse
16 | else:
17 | from transport import Transport
18 | from basic_pb2 import ConnectionResponse
19 | from ttsbackend_pb2 import Generate, GenerateResponse
20 | from tts_pb2 import GenerateRequest, ConnectionRequest, ParamsRequest, ParamsResponse
21 | 
22 | from uuid import uuid4 as randomUuid
23 | 
24 | DEFAULT_KEY_VALUE = 'paste-your-own-key'
25 | DEFAULT_SERVER_VALUE = 'tts.voicetech.yandex.net'
26 | DEFAULT_PORT_VALUE = 80
27 | 
28 | DEFAULT_LANG_VALUE = 'ru-RU'
29 | 
30 | DEFAULT_UUID_VALUE = randomUuid().hex
31 | 
32 | DEFAULT_FORMAT_VALUE = 'wav'
33 | DEFAULT_QUALITY_VALUE = 'high'
34 | 
35 | def generateWavHeader(sample_rate, mono=True, data_size=0):
36 | gWavHeader = "RIFF\xff\xff\xff\xffWAVEfmt \x10\x00\x00\x00\x01\x00" + ("\x01" if mono else "\x02") + "\x00"
37 | wav_rate = ""
38 | wav_rate_align = ""
39 | sample_rate_align = sample_rate * 2
40 | for i in xrange(0, 4):
41 | wav_rate += chr(sample_rate % (256 if mono else 512)) # sample rate as int32
42 | wav_rate_align += chr(sample_rate_align % 256) # byte rate = sample_rate * block_align (2 for mono) as int32
43 | sample_rate /= 256
44 | sample_rate_align /= 256
45 | gWavHeader += wav_rate
46 | gWavHeader += wav_rate_align
47 | gWavHeader += "\x02" if mono else "\x04"
48 | gWavHeader += "\x00\x10\x00data\xff\xff\xff\xff"
49 | 
50 | if data_size > 0:
51 | size_of_wav = data_size + 36
52 | hexWavSize = ""
53 | hexDataSize = ""
54 | for i in xrange(0,4):
55 | hexWavSize += chr(size_of_wav % 256)
56 | size_of_wav /= 
256 57 | hexDataSize += chr(data_size % 256) 58 | data_size /= 256 59 | gWavHeader = gWavHeader[:4] + hexWavSize + gWavHeader[8:40] + hexDataSize 60 | 61 | return gWavHeader 62 | 63 | def upgradeToProtobuf(transport, server, port): 64 | transport.verbose = False 65 | transport.send("GET /ytcp2 HTTP/1.1\r\n" + 66 | "User-Agent:KeepAliveClient\r\n" + 67 | "Host: %s:%s\r\n" % (server, port) + 68 | "Upgrade: websocket\r\n\r\n"); 69 | check = "HTTP/1.1 101" 70 | checkRecv = "" 71 | while True: 72 | checkRecv += transport.recv(1) 73 | if checkRecv.startswith(check) and checkRecv.endswith("\r\n\r\n"): 74 | break 75 | if len(checkRecv) > 300: 76 | return False 77 | return True 78 | 79 | def list_speakers(server=DEFAULT_SERVER_VALUE, port=DEFAULT_PORT_VALUE, key=DEFAULT_KEY_VALUE, uuid=DEFAULT_UUID_VALUE, ipv4=False, **kwars): 80 | logger = logging.getLogger('asrclient') 81 | with Transport(server, port, timeout=None, verbose=False, enable_ssl=(port==443), ipv4=ipv4) as t: 82 | if not upgradeToProtobuf(t, server, port): 83 | logger.info("Wrong response on upgrade request. Exiting.") 84 | sys.exit(1) 85 | logger.info("Upgraded to protobuf, sending connect request.") 86 | 87 | t.sendProtobuf(ConnectionRequest( 88 | serviceName="tts", 89 | speechkitVersion="ttsclient", 90 | uuid=uuid, 91 | apiKey=key 92 | )) 93 | 94 | connectionResponse = t.recvProtobuf(ConnectionResponse) 95 | 96 | if connectionResponse.responseCode != 200: 97 | logger.info("Bad response code %s: %s" % (connectionResponse.responseCode, connectionResponse.message)) 98 | sys.exit(1) 99 | 100 | logger.info("Connected, getting speakers list.") 101 | 102 | t.sendProtobuf(ParamsRequest( 103 | listVoices=True 104 | )) 105 | 106 | res = t.recvProtobuf(ParamsResponse) 107 | 108 | print(", ".join([v.name for v in res.voiceList if v.coreVoice])) 109 | 110 | def generate(file, text, speaker, server=DEFAULT_SERVER_VALUE, port=DEFAULT_PORT_VALUE, key=DEFAULT_KEY_VALUE, uuid=DEFAULT_UUID_VALUE, lang=DEFAULT_LANG_VALUE, emotion=None, gender=None, ipv4=False, format=DEFAULT_FORMAT_VALUE, quality=DEFAULT_QUALITY_VALUE): 111 | logger = logging.getLogger('asrclient') 112 | with Transport(server, port, timeout=None, verbose=False, enable_ssl=(port==443), ipv4=ipv4) as t: 113 | if not upgradeToProtobuf(t, server, port): 114 | logger.info("Wrong response on upgrade request. 
Exiting.") 115 | sys.exit(1) 116 | logger.info("Upgraded to protobuf, sending connect request") 117 | 118 | t.sendProtobuf(ConnectionRequest( 119 | serviceName="tts", 120 | speechkitVersion="ttsclient", 121 | uuid=uuid, 122 | apiKey=key 123 | )) 124 | 125 | connectionResponse = t.recvProtobuf(ConnectionResponse) 126 | 127 | if connectionResponse.responseCode != 200: 128 | logger.info("Bad response code %s: %s" % (connectionResponse.responseCode, connectionResponse.message)) 129 | sys.exit(1) 130 | 131 | t.sendProtobuf(ParamsRequest( 132 | listVoices=True 133 | )) 134 | 135 | res = t.recvProtobuf(ParamsResponse) 136 | 137 | request = GenerateRequest( 138 | lang=lang, 139 | text=text, 140 | application="ttsclient", 141 | platform="local", 142 | voice=speaker, 143 | requireMetainfo=False, 144 | format={'wav': GenerateRequest.Pcm, 'pcm': GenerateRequest.Pcm, 'speex': GenerateRequest.Spx, 'opus': GenerateRequest.Opus}.get(format, GenerateRequest.Pcm), 145 | quality=({'low': GenerateRequest.Low, 'high': GenerateRequest.High, 'ultra': GenerateRequest.UltraHigh}[quality]), 146 | chunked=True 147 | ) 148 | 149 | if emotion or gender: 150 | request.lowLevelGenerateRequest.CopyFrom(Generate( 151 | voices=[Generate.WeightedParam(name=speaker, weight=1.0)], 152 | emotions=[Generate.WeightedParam(name=emotion, weight=1.0)] if emotion else [], 153 | genders=[Generate.WeightedParam(name=gender, weight=1.0)] if gender else [], 154 | lang=lang[:2], 155 | text=text, 156 | fast=False, 157 | requireMetainfo=False 158 | )) 159 | 160 | t.sendProtobuf(request) 161 | if format == 'wav': 162 | file.write(generateWavHeader({'ultra': 48000, 163 | 'high': 16000, 164 | 'low': 8000}[quality])) 165 | while True: 166 | ttsResponse = t.recvProtobuf(GenerateResponse) 167 | if ttsResponse.message: 168 | logger.info("Error on synthesis: %s" % (ttsResponse.message,)) 169 | sys.exit(2) 170 | 171 | if not ttsResponse.completed: 172 | file.write(ttsResponse.audioData) 173 | else: 174 | file.close() 175 | break 176 | logger.info("Request complete") 177 | -------------------------------------------------------------------------------- /python/asrclient/voiceproxy.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | import "basic.proto"; 4 | 5 | package VoiceProxyProtobuf; 6 | 7 | // use this part of ConnectionRequest to specify additional options for decoder/proxy 8 | message AdvancedASROptions 9 | { 10 | // send back partial results, if disabled only results with endOfUtt == true will be send 11 | optional bool partial_results = 1 [default = true]; 12 | 13 | // beam, lattice_beam, lattice_nbest - are low level decoder options 14 | optional float beam = 2 [default = -1]; 15 | 16 | optional float lattice_beam = 3 [default = -1]; 17 | 18 | optional int32 lattice_nbest = 4 [default = -1]; 19 | 20 | // specify interval in 10mc of silence/noice which will separe sentences, defines how often you will receive endOfUtt == true 21 | optional int32 utterance_silence = 5 [default = 120]; 22 | 23 | // disable all partial and results with "endOfUtt" - will response only when AddData with "lastChunk" is received 24 | optional bool allow_multi_utt = 16 [default = true]; 25 | 26 | // if client sends too many chunks (more then server could process) - if timeout (in mc) specify how many buffers sound should be read 27 | // before sending to decoder, this may vary how often partial_results are sending 28 | optional int32 chunk_process_limit = 17 [default = 100]; 29 | 30 | // cmn is a 
internal feature of the decoder
31 | optional int32 cmn_window = 18 [default = 600];
32 | 
33 | optional int32 cmn_latency = 19 [default = 150];
34 | 
35 | // capitalize and expected_num_count are features of the "normalized" field of the AddDataResponse recognition result
36 | 
37 | // specify if "normalized" results should be capitalized
38 | optional bool capitalize = 20 [default = false];
39 | 
40 | // if specified, the normalizer will try to fit this count; for example, "twenty two" will normalize to "20 2" if "3" is set as expected, by default it will be "22"
41 | optional int32 expected_num_count = 21 [default = 0];
42 | 
43 | // list of phrases for an on-the-fly grammar, for example "yes", "no" in case of en-US
44 | // this field makes "topic" in ConnectionRequest irrelevant; instead, this list is used to build a "language model" on the fly
45 | repeated string grammar = 22;
46 | 
47 | // the same as the previous field, but with partial srgs support; you can specify items, tags and simple rules, for example:
48 | //
49 | // <?xml version="1.0"?>
50 | // <grammar>
51 | //   <rule>
52 | //     <one-of>
53 | //       <item><tag>оплатил</tag>оплатил</item>
54 | //       <item><tag>оплатил</tag>да</item>
55 | //       <item><tag>не оплатил</tag>не оплатил</item>
56 | //       <item><tag>не оплатил</tag>нет</item>
57 | //     </one-of>
58 | //   </rule>
59 | // </grammar>
60 | optional string srgs = 23;
61 | 
62 | // currently supports "gender", "age", "group", "language", "children", "emotion" and combinations with ",", like "age,gender"
63 | // check out BiometryResult
64 | optional string biometry = 24;
65 | 
66 | // turn on the confidence rescoring procedure
67 | optional bool use_snr = 25 [default = false];
68 | 
69 | // flags for the confidence rescoring procedure
70 | repeated SnrFlag snr_flags = 26;
71 | 
72 | // used to distinguish between biometry groups (devices)
73 | optional string biometry_group = 27;
74 | 
75 | // enable special normalizers for "manual punctuation", i.e. replace "привет запятая как дела воспросительный знак" with "привет, как дела?"
76 | optional bool manual_punctuation = 28 [default = false];
77 | }
78 | 
79 | message ConnectionRequest
80 | {
81 | optional int32 protocolVersion = 1 [default = 1];
82 | 
83 | // leave empty if you are not speechkit
84 | required string speechkitVersion = 2;
85 | 
86 | required string serviceName = 3; // "asr_dictation", etc.
87 | 
88 | required string uuid = 4;
89 | 
90 | optional string yandexuid = 21;
91 | 
92 | required string apiKey = 5;
93 | 
94 | required string applicationName = 6;
95 | 
96 | // vendor:model:type... user defined
97 | required string device = 7;
98 | 
99 | // lat.lat,lon.lon
100 | required string coords = 8;
101 | 
102 | // "general", "mapsyari", "freeform", "music"
103 | // topic is ignored if grammar or srgs from advancedASROptions are set
104 | required string topic = 9;
105 | 
106 | // "ru-RU"
107 | required string lang = 10;
108 | 
109 | // "audio/x-speex", "audio/x-pcm;bit=16;rate=8000", etc. 
110 | required string format = 11;
111 | 
112 | // enable punctuation mode for the "freeform" topic (other topics may support punctuation in the future)
113 | optional bool punctuation = 12 [default = true];
114 | 
115 | optional bool disableAntimatNormalizer = 18 [default = false];
116 | 
117 | optional AdvancedASROptions advancedASROptions = 19;
118 | 
119 | optional bool skipAudioFromLogging = 20 [default = false];
120 | 
121 | // deprecated
122 | optional MusicRequest musicRequest = 17;
123 | }
124 | 
125 | ///////////////////////////////////////////////////////////////////////////
126 | 
127 | message AddData
128 | {
129 | optional bytes audioData = 1;
130 | 
131 | required bool lastChunk = 2;
132 | }
133 | 
134 | ///////////////////////////////////////////////////////////////////////////
135 | 
136 | message AlignInfo
137 | {
138 | optional float start_time = 1;
139 | 
140 | optional float end_time = 2;
141 | 
142 | optional float acoustic_score = 3;
143 | 
144 | optional float graph_score = 4;
145 | 
146 | optional float lm_score = 5;
147 | 
148 | optional float total_score = 6;
149 | }
150 | 
151 | message Word
152 | {
153 | required float confidence = 1;
154 | 
155 | required string value = 2;
156 | 
157 | optional VoiceProxyProtobuf.AlignInfo align_info = 3;
158 | }
159 | 
160 | message Result
161 | {
162 | // notice: confidence is valid only when endOfUtt is true; otherwise it is always "1"
163 | required float confidence = 1;
164 | 
165 | repeated Word words = 2;
166 | 
167 | optional string normalized = 3;
168 | 
169 | optional VoiceProxyProtobuf.AlignInfo align_info = 4;
170 | }
171 | 
172 | message BiometryResult
173 | {
174 | required string classname = 1;
175 | 
176 | required float confidence = 2;
177 | 
178 | optional string tag = 3;
179 | }
180 | 
181 | message SnrFlag
182 | {
183 | required string name = 1;
184 | required string value = 2;
185 | }
186 | 
187 | message SnrFeature
188 | {
189 | optional string name = 1;
190 | 
191 | optional float value = 2;
192 | }
193 | 
194 | message SnrInfo
195 | {
196 | optional string normalizedText = 1;
197 | 
198 | optional float snrValue = 2;
199 | 
200 | repeated SnrFeature features = 3;
201 | 
202 | optional string featureSlices = 4;
203 | 
204 | optional int32 originalCandidateIndex = 5;
205 | 
206 | optional string candidateSource = 6;
207 | }
208 | 
209 | message SnrMetainfo
210 | {
211 | optional string name = 1;
212 | }
213 | 
214 | message Metainfo
215 | {
216 | required float minBeam = 1;
217 | 
218 | required float maxBeam = 2;
219 | 
220 | repeated SnrInfo snrInfos = 3;
221 | 
222 | optional string topic = 4;
223 | 
224 | optional string lang = 5;
225 | 
226 | optional string version = 6;
227 | 
228 | optional string load_timestamp = 7;
229 | 
230 | optional int32 snrResponseCode = 8;
231 | 
232 | optional bool snr_performed_rescoring = 9;
233 | 
234 | optional SnrMetainfo snrMetainfo = 10;
235 | }
236 | 
237 | message AddDataResponse
238 | {
239 | required BasicProtobuf.ConnectionResponse.ResponseCode responseCode = 1;
240 | 
241 | repeated Result recognition = 2;
242 | 
243 | // if true: recognition contains a fully parsed N-best list (n results with n words)
244 | // otherwise recognition contains just 1 result, 1 word with the current "partial result"
245 | optional bool endOfUtt = 3 [default = false];
246 | 
247 | // how many AddData requests were merged for this response
248 | optional int32 messagesCount = 4 [default = 1];
249 | 
250 | // if not empty, messagesCount should be 0
251 | optional string musicProxyResponse = 5;
252 | 
253 | repeated BiometryResult bioResult = 6;
254 | 255 | optional Metainfo metainfo = 7; 256 | } 257 | 258 | // deprecated 259 | message MusicRequest 260 | { 261 | message MusicParam 262 | { 263 | required string name = 1; 264 | 265 | required string value = 2; 266 | } 267 | 268 | // default options are "uid", "OAuth", "widget" 269 | repeated MusicParam musicProxyOptions = 1; 270 | } 271 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='asrclient', 5 | version='0.5.0', 6 | author='Andrey Pichugin, Alexander Artemenko, Andrey Semenov', 7 | author_email='voice@support.yandex.ru', 8 | description='Yandex ASR streaming client.', 9 | long_description=open('README.txt', 'r').read(), 10 | url='http://api.yandex.ru/speechkit/cloud-api/', 11 | platforms=['Any'], 12 | license='GNU GPLv3', 13 | packages=['asrclient'], 14 | install_requires=['protobuf', 'click', 'futures'], 15 | scripts=['asrclient-cli.py', 'ttsclient-cli.py'], 16 | package_data={'asrclient': ['*.proto']}, 17 | ) 18 | -------------------------------------------------------------------------------- /python/ttsclient-cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Yandex TTS streaming client.""" 4 | 5 | import logging 6 | import click 7 | import sys 8 | 9 | import asrclient.ttsclient as client 10 | 11 | 12 | @click.command() 13 | @click.option('-k', '--key', 14 | help='You could get it at https://developer.tech.yandex.ru/. Default is "{0}".'.format(client.DEFAULT_KEY_VALUE), 15 | default=client.DEFAULT_KEY_VALUE) 16 | @click.option('-s', '--server', 17 | help='Default is {0}.'.format(client.DEFAULT_SERVER_VALUE), 18 | default=client.DEFAULT_SERVER_VALUE) 19 | @click.option('-p', '--port', 20 | help='Default is {0}.'.format(client.DEFAULT_PORT_VALUE), 21 | default=client.DEFAULT_PORT_VALUE) 22 | @click.option('--lang', 23 | help='Synthesis language. ru-RU | en-EN | tr-TR | uk-UA. Default is {0}.'.format(client.DEFAULT_LANG_VALUE), 24 | default=client.DEFAULT_LANG_VALUE) 25 | @click.option('--speaker', 26 | help='Speaker for speech synthesis. Call this script with --list-speakers flag to get speakers list.', 27 | default='') 28 | @click.option('--emotion', 29 | help='Emotion for speech synthesis. Available values: good, neutral, evil. Default value depends on speaker\'s original emotion.', 30 | default=None) 31 | @click.option('--gender', 32 | help='Speaker\'s gender for speech synthesis. Available values: male, female. Default value depends on speaker\'s original gender.', 33 | default=None) 34 | @click.option('--textfile', 35 | help='Read text from this file instead of command line arguments.', 36 | type=click.File('r'), 37 | default=None) 38 | @click.option('--uuid', 39 | default=client.DEFAULT_UUID_VALUE, 40 | help='UUID of your request. It can be helpful for further logs analysis. Default is random.') 41 | @click.option('--ipv4', 42 | is_flag=True, 43 | help='Use ipv4 only connection.') 44 | @click.option('--list-speakers', 45 | is_flag=True, 46 | default=False, 47 | help='Only list available speakers, don\'t try to generate anything.') 48 | @click.option('--silent', 49 | is_flag=True, 50 | help='Don\'t print debug messages.') 51 | @click.option('--format', 52 | default=client.DEFAULT_FORMAT_VALUE, 53 | help='Format of output audio file. wav | pcm | speex | opus. 
Default is {0}.'.format(client.DEFAULT_FORMAT_VALUE))
54 | @click.option('--quality',
55 | default=client.DEFAULT_QUALITY_VALUE,
56 | help='Quality of the output audio file. ultra | high | low. Default is {0}.'.format(client.DEFAULT_QUALITY_VALUE))
57 | @click.argument('file',
58 | required=False,
59 | type=click.File('wb'))
60 | @click.argument('texts',
61 | nargs=-1)
62 |
63 | def main(silent, speaker, texts, textfile=None, list_speakers=False, **kwargs):
64 | if not silent:
65 | logging.basicConfig(level=logging.INFO)
66 | if list_speakers:
67 | client.list_speakers(**kwargs)
68 | sys.exit(0)
69 | if not speaker:
70 | print("Speaker is required. Please call this script with the --list-speakers flag to get the list of available speakers.")
71 | sys.exit(1)
72 | if textfile:
73 | texts = map(str.strip, textfile.readlines())
74 | client.generate(text=" ".join(texts).decode('utf8'), speaker=speaker, **kwargs)
75 |
76 | if __name__ == "__main__":
77 | main()
78 |
-------------------------------------------------------------------------------- /webspeechkit/README.md: --------------------------------------------------------------------------------
1 | ### Quickstart
2 | #### Get an API key
3 | First of all, you will need to get an API key for Yandex.SpeechKit.
4 | To do this, go [here](https://developer.tech.yandex.ru) and get an API key for Yandex SpeechKit.
5 |
6 | #### Add dependencies to your web page
7 | Add Yandex.SpeechKit Web scripts from Yandex CDN to your web page:
8 |
9 | `<script type="text/javascript" src="https://webasr.yandex.net/jsapi/v1/webspeechkit.js"></script>`
10 | `<script type="text/javascript" src="https://webasr.yandex.net/jsapi/v1/webspeechkit-settings.js"></script>`
11 |
12 | #### Use the API to create voice interfaces
13 | Write some code for speech recognition logic.
14 | For example, if you simply need to recognize short voice requests, you'll write something like this:
15 |
16 | ```
17 | window.onload = function() {
18 |     ya.speechkit.recognize({
19 |         doneCallback: function (text) {
20 |             console.log("You've said: " + text);
21 |         },
22 |         initCallback: function () {
23 |             console.log("You may speak now");
24 |         },
25 |         errorCallback: function (err) {
26 |             console.log("Something went wrong: " + err);
27 |         },
28 |         model: 'freeform', // Model name for recognition process
29 |         lang: 'ru-RU', // Language for recognition process
30 |         apiKey: PUT_YOUR_API_KEY_HERE
31 |     });
32 | };
33 | ```
34 |
35 | Simple synthesis:
36 |
37 | ```
38 | window.onload = function() {
39 |     var tts = ya.speechkit.Tts(
40 |         {
41 |             speaker: 'jane',
42 |             emotion: 'good',
43 |             gender: 'female'
44 |         });
45 |     tts.speak('1 2 3');
46 | };
47 | ```
48 |
-------------------------------------------------------------------------------- /webspeechkit/src/equalizer.js: --------------------------------------------------------------------------------
1 | (function (namespace) {
2 | 'use strict';
3 |
4 | if (typeof namespace.ya === 'undefined') {
5 | namespace.ya = {};
6 | }
7 | if (typeof namespace.ya.speechkit === 'undefined') {
8 | namespace.ya.speechkit = {};
9 | }
10 |
11 | namespace.ya.speechkit.Equalizer = function (target, recorder) {
12 | this.recorder = recorder;
13 | this.element = document.getElementById(target);
14 | this.element.style.textAlign = 'center';
15 | this.element.innerText = '';
16 | this.graf = document.createElement('canvas');
17 | this.graf.style.width = '100%';
18 | this.graf.style.height = '100%';
19 | this.graf.width = 1000;
20 |
21 | this.element.appendChild(this.graf);
22 |
23 | if (!window.cancelAnimationFrame) {
24 | window.cancelAnimationFrame = window.webkitCancelAnimationFrame ||
25 | window.mozCancelAnimationFrame;
26 | }
27 | if (!window.requestAnimationFrame) {
28 |
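// requestAnimationFrame drives the equalizer's redraw loop in startDrawRealtime() below;
// older WebKit/Gecko builds expose it only under a vendor prefix, hence this fallback chain.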
window.requestAnimationFrame = window.webkitRequestAnimationFrame ||
29 | window.mozRequestAnimationFrame;
30 | }
31 |
32 | this.rafID = null;
33 |
34 | this.startDrawRealtime();
35 | };
36 |
37 | namespace.ya.speechkit.Equalizer.prototype = {
38 | destroy: function () {
39 | this.stopDrawRealtime();
40 | this.element.removeChild(this.graf);
41 | },
42 | stopDrawRealtime: function () {
43 | window.cancelAnimationFrame(this.rafID);
44 | this.rafID = null;
45 | },
46 | startDrawRealtime: function () {
47 | var _this = this;
48 | function updateAnalysers(time) {
49 | if (!_this.analyserNode) {
50 | if (_this.recorder) {
51 | _this.analyserNode = _this.recorder.getAnalyserNode();
52 | _this.context = _this.recorder.context;
53 | } else {
54 | return;
55 | }
56 | }
57 |
58 | var canvasWidth = _this.graf.width;
59 | var canvasHeight = _this.graf.height;
60 | var analyserContext = _this.graf.getContext('2d');
61 |
62 | var SPACING = 2;
63 | var BAR_WIDTH = 1;
64 | var numBars = Math.round(canvasWidth / SPACING);
65 | var freqByteData = new Uint8Array(_this.analyserNode.frequencyBinCount);
66 |
67 | _this.analyserNode.getByteFrequencyData(freqByteData);
68 |
69 | analyserContext.clearRect(0, 0, canvasWidth, canvasHeight);
70 | analyserContext.fillStyle = '#F6D565';
71 | analyserContext.lineCap = 'round';
72 | var multiplier = _this.analyserNode.frequencyBinCount / numBars;
73 |
74 | for (var i = 0; i < numBars; ++i) {
75 | var magnitude = 0;
76 | var offset = Math.floor(i * multiplier);
77 | for (var j = 0; j < multiplier; j++) {
78 | magnitude += freqByteData[offset + j];
79 | }
80 | magnitude = magnitude / multiplier / 2;
81 | analyserContext.fillStyle = 'hsl( ' + Math.round(i * 60 / numBars) + ', 100%, 50%)';
82 | analyserContext.fillRect(i * SPACING, canvasHeight, BAR_WIDTH, -magnitude);
83 | }
84 | _this.rafID = window.requestAnimationFrame(updateAnalysers);
85 | }
86 |
87 | this.rafID = window.requestAnimationFrame(updateAnalysers);
88 | }
89 | };
90 | }(this));
91 |
-------------------------------------------------------------------------------- /webspeechkit/src/recognizer.js: --------------------------------------------------------------------------------
1 | (function (namespace) {
2 | 'use strict';
3 |
4 | if (typeof namespace.ya === 'undefined') {
5 | namespace.ya = {};
6 | }
7 | if (typeof namespace.ya.speechkit === 'undefined') {
8 | namespace.ya.speechkit = {};
9 | }
10 |
11 | /**
12 | * Создает новый объект типа Recognizer.
13 | * @class Создает сессию и отправляет запрос на сервер для распознавания речи.
14 | * @name Recognizer
15 | * @param {Object} [options] Опции.
16 | * @param {callback:initCallback} [options.onInit] Функция-обработчик, которая будет вызвана после успешной инициализации
17 | * сессии.
18 | * @param {callback:dataCallback} [options.onResult] Функция-обработчик, которая будет вызвана после завершения распознавания речи.
19 | * @param {callback:errorCallback} [options.onError] Функция-обработчик, которая будет вызвана в случае возникновения ошибки.
20 | * @param {String} [options.uuid=см. описание] UUID сессии. По умолчанию принимает значение, указанное
21 | * в настройках ya.speechkit.settings.uuid.
22 | * @param {String} [options.apikey] API-ключ. Если не задан, то используется ключ, указанный
23 | * в настройках ya.speechkit.settings.apikey.
24 | * @param {ya.speechkit.FORMAT} [options.format=ya.speechkit.FORMAT.PCM16] Формат аудиопотока.
25 | * @param {String} [options.url=см. описание] URL сервера, на котором будет производиться распознавание.
26 | * Если параметр не указан, то берется значение, заданное в настройках ya.speechkit.settings.asrUrl. По умолчанию оно равно
27 | * 'webasr.yandex.net/asrsocket.ws'.
28 | * @param {Boolean} [options.punctuation=true] Использовать ли пунктуацию.
29 | * @param {Boolean} [options.allowStrongLanguage=false] Отключить фильтрацию обсценной лексики.
30 | * @param {String} [options.model='notes'] Языковая модель, которая должна быть использована при распознавании.
31 | * Если параметр не указан, то используется значение, заданное в настройках ya.speechkit.model. Если в настройках значение не задано, то
32 | * используется модель 'notes'.
33 | * @param {String} [options.lang='ru-RU'] Язык распознавания. Возможные значения: 'ru-RU', 'en-US', 'tr-TR', 'uk-UA'.
34 | * Если параметр не указан, то используется
35 | * значение, заданное в настройках ya.speechkit.lang. Если в настройках значение не задано, то по умолчанию
36 | * выбирается русский язык: 'ru-RU'.
37 | * @param {String} [options.applicationName] Название приложения. Для некоторых приложений мы поддерживаем специальную логику. Пример - sandbox. 38 | */ 39 | var Recognizer = function (options) { 40 | if (!(this instanceof namespace.ya.speechkit.Recognizer)) { 41 | return new namespace.ya.speechkit.Recognizer(options); 42 | } 43 | this.options = namespace.ya.speechkit._extend( 44 | {apikey: namespace.ya.speechkit.settings.apikey, 45 | uuid: namespace.ya.speechkit.settings.uuid, 46 | applicationName: namespace.ya.speechkit.settings.applicationName, 47 | url: namespace.ya.speechkit.settings.websocketProtocol + 48 | namespace.ya.speechkit.settings.asrUrl, 49 | onInit: function () {}, 50 | onResult: function () {}, 51 | onError: function () {}, 52 | punctuation: true, 53 | allowStrongLanguage: false 54 | }, 55 | options); 56 | 57 | // Backward compatibility 58 | this.options.key = this.options.apikey; 59 | this.options.format = this.options.format.mime; 60 | 61 | this.sessionId = null; 62 | this.socket = null; 63 | 64 | this.buffered = []; 65 | this.totaldata = 0; 66 | }; 67 | 68 | Recognizer.prototype = /** @lends Recognizer.prototype */{ 69 | /** 70 | * Send raw data to websocket. 71 | * @param data Any data to send to websocket (json string, raw audio data). 72 | * @private 73 | */ 74 | _sendRaw: function (data) { 75 | if (this.socket) { 76 | this.socket.send(data); 77 | } 78 | }, 79 | /** 80 | * Stringify JSON and send it to websocket. 81 | * @param {Object} json Object needed to be send to websocket. 82 | * @private 83 | */ 84 | _sendJson: function (json) { 85 | this._sendRaw(JSON.stringify({type: 'message', data: json})); 86 | }, 87 | /** 88 | * Запускает процесс распознавания. 89 | */ 90 | start: function () { 91 | this.sessionId = null; 92 | try { 93 | this.socket = new WebSocket(this.options.url); 94 | } catch (e) { 95 | this.options.onError('Error on socket creation: ' + e); 96 | this.options.stopCallback(); 97 | return; 98 | } 99 | 100 | this.socket.onopen = function () { 101 | // {uuid: uuid, key: key, format: audioFormat, punctuation: punctuation ... 102 | // console.log('Initial request: ' + JSON.stringify(this.options)); 103 | this._sendJson(this.options); 104 | }.bind(this); 105 | 106 | this.socket.onmessage = function (e) { 107 | var message = JSON.parse(e.data); 108 | 109 | if (message.type == 'InitResponse'){ 110 | this.sessionId = message.data.sessionId; 111 | this.options.onInit(message.data.sessionId, message.data.code); 112 | } else if (message.type == 'AddDataResponse'){ 113 | this.options.onResult(message.data.text, message.data.uttr, message.data.merge, message.data.words); 114 | if (typeof message.data.close !== 'undefined' && message.data.close) { 115 | this.close(); 116 | } 117 | } else if (message.type == 'Error'){ 118 | this.options.onError('Session ' + this.sessionId + ': ' + message.data); 119 | this.close(); 120 | } else { 121 | this.options.onError('Session ' + this.sessionId + ': ' + message); 122 | this.close(); 123 | } 124 | }.bind(this); 125 | 126 | this.socket.onerror = function (error) { 127 | this.options.onError('Socket error: ' + error.message); 128 | }.bind(this); 129 | 130 | this.socket.onclose = function (event) { 131 | }.bind(this); 132 | }, 133 | /** 134 | * Добавляет данные с аудио к потоку для распознавания речи. 135 | * Если сессия распознавания еще не была создана, то данные будут буферизованы и отправятся на сервер 136 | * по факту установления соединения. 137 | * @param {ArrayBuffer} data Буфер с аудио сигналом в формате PCM 16bit. 
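 * @example
 * // sketch: `recognizer` is a started ya.speechkit.Recognizer and `pcmChunk` a
 * // hypothetical ArrayBuffer with PCM 16bit audio (e.g. produced by Recorder):
 * recognizer.addData(pcmChunk); // chunks are buffered until the session is initialized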
138 | */
139 | addData: function (data) {
140 | this.totaldata += data.byteLength;
141 |
142 | if (!this.sessionId) {
143 | this.buffered.push(data);
144 | return;
145 | }
146 |
147 | for (var i = 0; i < this.buffered.length; i++){
148 | this._sendRaw(new Blob([this.buffered[i]], {type: this.options.format}));
149 | // (each buffered chunk's byteLength was already counted when it arrived, so it is not added again here)
150 | }
151 |
152 | this.buffered = [];
153 | this._sendRaw(new Blob([data], {type: this.options.format}));
154 | },
155 | /**
156 | * Принудительно завершает запись звука и отсылает запрос (не закрывает сессию распознавания, пока не получит от сервера последний ответ).
157 | */
158 | finish: function () {
159 | this._sendJson({command: 'finish'});
160 | },
161 | /**
162 | * Завершает сессию распознавания речи, закрывая соединение с сервером.
163 | */
164 | close: function () {
165 | this.options.onInit = function () {};
166 | this.options.onResult = this.options.onInit;
167 | this.options.onError = this.options.onInit;
168 |
169 | if (this.socket) {
170 | this.socket.close();
171 | this.options.stopCallback();
172 | }
173 | this.socket = null;
174 | }
175 | };
176 |
177 | namespace.ya.speechkit.Recognizer = Recognizer;
178 |
179 | /**
180 | * Функция-обработчик, которая будет вызвана после успешной инициализации
181 | * сессии.
182 | * @callback
183 | * @name initCallback
184 | * @param {String} sessionId Идентификатор сессии.
185 | * @param {Number} code HTTP-статус, который будет содержаться в ответе сервера после инициализации сессии (200).
186 | * @memberOf Recognizer
187 | */
188 |
189 | /**
190 | * Функция-обработчик, которая будет вызвана в случае возникновения ошибки.
191 | * @callback
192 | * @name errorCallback
193 | * @param {String} message Текст сообщения об ошибке.
194 | * @memberOf Recognizer
195 | */
196 |
197 | /**
198 | * Функция-обработчик, которая будет вызвана после завершения распознавания речи.
199 | * @callback
200 | * @name dataCallback
201 | * @param {String} text Распознанный текст.
202 | * @param {Boolean} utterance Является ли данный текст финальным результатом распознавания.
203 | * @param {Number} merge Число обработанных запросов, по которым выдан ответ (сколько пакетов с данными было соединено в этот результат).
204 | * @memberOf Recognizer
205 | */
206 | }(this));
207 |
-------------------------------------------------------------------------------- /webspeechkit/src/recorder.js: --------------------------------------------------------------------------------
1 | (function (namespace) {
2 | 'use strict';
3 |
4 | /**
5 | * Пространство имен для классов и методов библиотеки Yandex.Speechkit JS
6 | * @namespace ya.speechkit
7 | */
8 | if (typeof namespace.ya === 'undefined') {
9 | namespace.ya = {};
10 | }
11 | if (typeof namespace.ya.speechkit === 'undefined') {
12 | namespace.ya.speechkit = {};
13 | }
14 |
15 | namespace.ya.speechkit.AudioContext = window.AudioContext || window.webkitAudioContext;
16 |
17 | if (typeof namespace.ya.speechkit.settings === 'undefined') {
18 | var js = document.createElement('script');
19 |
20 | js.type = 'text/javascript';
21 | js.src = 'https://webasr.yandex.net/jsapi/v1/webspeechkit-settings.js?seed=' + Math.random();
22 |
23 | document.head.appendChild(js);
24 | }
25 |
26 | /** Набор поддерживаемых форматов аудио.
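 * Каждый формат фиксирует частоту дискретизации, MIME-тип и размер буфера записи.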
27 | * @readonly
28 | * @enum
29 | * @memberof ya.speechkit
30 | */
31 | namespace.ya.speechkit.FORMAT = {
32 | /** PCM 8KHz дает плохое качество распознавания, но малый объем передаваемых на сервер данных */
33 | PCM8: {format: 'pcm', sampleRate: 8000, mime: 'audio/x-pcm;bit=16;rate=8000', bufferSize: 1024},
34 | /** PCM 16 KHz наилучшее качество распознавания при среднем объеме данных */
35 | PCM16: {format: 'pcm', sampleRate: 16000, mime: 'audio/x-pcm;bit=16;rate=16000', bufferSize: 2048},
36 | /** PCM 44 KHz большой размер передаваемых данных, возможны задержки на узком канале */
37 | PCM44: {format: 'pcm', sampleRate: 44100, mime: 'audio/x-pcm;bit=16;rate=44100', bufferSize: 4096},
38 | };
39 |
40 | /** Media stream used by SpeechKit
41 | * @private
42 | * @memberof ya.speechkit
43 | */
44 | namespace.ya.speechkit._stream = null;
45 |
46 | /**
47 | * Deep copies fields from object 'from' to object 'to'
48 | * @param {Object} from Source object
49 | * @param {Object} to Destination object
50 | * @private
51 | */
52 | namespace.ya.speechkit._extend = function (to, from) {
53 | var i;
54 | var toStr = Object.prototype.toString;
55 | var astr = '[object Array]';
56 | to = to || {};
57 |
58 | for (i in from) {
59 | if (from.hasOwnProperty(i)) {
60 | if (typeof from[i] === 'object') {
61 | to[i] = (toStr.call(from[i]) === astr) ? [] : {};
62 | namespace.ya.speechkit._extend(to[i], from[i]);
63 | } else if (typeof from[i] !== 'undefined' || typeof to[i] === 'undefined') {
64 | to[i] = from[i];
65 | }
66 | }
67 | }
68 | return to;
69 | };
70 |
71 | /**
72 | * Создает объект для записи аудио-сигнала с микрофона.
73 | * @class Класс, управляющий записью звука с микрофона.
74 | * @name Recorder
75 | */
76 | var Recorder = function ()
77 | {
78 | if (!namespace.ya.speechkit._stream) {
79 | return null;
80 | }
81 |
82 | if (!(this instanceof Recorder)) {
83 | return new Recorder();
84 | }
85 |
86 | this.worker = namespace.ya.speechkit.newWorker();
87 |
88 | this.recording = false;
89 |
90 | this.paused = false;
91 | this.lastDataOnPause = 0;
92 |
93 | this.nullsArray = [];
94 |
95 | this.currCallback = null;
96 | this.buffCallback = null;
97 | this.startCallback = null;
98 |
99 | this.worker.onmessage = function (e) {
100 | if (e.data.command == 'int16stream')
101 | {
102 | var data = e.data.buffer;
103 |
104 | if (this.startCallback) {
105 | this.startCallback(data);
106 | }
107 | } else if (e.data.command == 'getBuffers' && this.buffCallback) {
108 | this.buffCallback(e.data.blob);
109 | } else if (e.data.command == 'clear' && this.currCallback) {
110 | this.currCallback();
111 | } else if (this.currCallback) {
112 | this.currCallback(e.data.blob);
113 | }
114 | }.bind(this);
115 |
116 | };
117 |
118 | Recorder.prototype = /** @lends Recorder.prototype */ {
119 | /**
120 | * Creates an input point for a given audio format (sets sample rate and buffer size)
121 | * @param {ya.speechkit.FORMAT} format audio format (its sampleRate and bufferSize are used)
122 | * @private
123 | */
124 | _createNode: function (format) {
125 | if (!namespace.ya.speechkit.audiocontext) {
126 | namespace.ya.speechkit.audiocontext = new namespace.ya.speechkit.AudioContext();
127 | }
128 |
129 | this.audioInput = namespace.ya.speechkit.audiocontext.createMediaStreamSource(
130 | namespace.ya.speechkit._stream);
131 |
132 | if (!namespace.ya.speechkit.audiocontext.createScriptProcessor) {
133 | this.node = namespace.ya.speechkit.audiocontext.createJavaScriptNode(format.bufferSize, 2, 2);
134 | } else {
135 |
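// createScriptProcessor is the standardized name of createJavaScriptNode used above:
// same arguments, the FORMAT-defined buffer size plus 2 input and 2 output channels.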
this.node = namespace.ya.speechkit.audiocontext.createScriptProcessor(format.bufferSize, 2, 2);
136 | }
137 |
138 | this.audioInput.connect(this.node);
139 | this.node.onaudioprocess = function (e) {
140 | if (!this.recording) {return;}
141 |
142 | if (this.paused) {
143 | if (Number(new Date()) - this.lastDataOnPause > 2000) {
144 | this.lastDataOnPause = Number(new Date());
145 | this.worker.postMessage({
146 | command: 'record',
147 | buffer: [
148 | this.nullsArray,
149 | this.nullsArray
150 | ]
151 | });
152 | }
153 | } else {
154 | this.worker.postMessage({
155 | command: 'record',
156 | buffer: [
157 | e.inputBuffer.getChannelData(0),
158 | e.inputBuffer.getChannelData(1)
159 | ]
160 | });
161 | }
162 | }.bind(this);
163 |
164 | this.node.connect(namespace.ya.speechkit.audiocontext.destination);
165 | },
166 | /**
167 | * Ставит запись звука на паузу.
168 | * Во время паузы на сервер будут отправляться периодически запросы с пустым звуком, чтобы сервер не обрывал сессию.
169 | */
170 | pause: function () {
171 | this.paused = true;
172 | this.lastDataOnPause = Number(new Date());
173 | },
174 | /**
175 | * @returns {AudioContext} Текущий
176 | * AudioContext,
177 | * с которого записывается звук.
178 | */
179 | getAudioContext: function () {
180 | return namespace.ya.speechkit.audiocontext;
181 | },
182 | /**
183 | * @returns {AnalyserNode}
184 | * AnalyserNode — объект,
185 | * предназначенный для анализа аудио-сигнала в реальном времени.
186 | */
187 | getAnalyserNode: function () {
188 | if (!namespace.ya.speechkit.audiocontext) {
189 | namespace.ya.speechkit.audiocontext = new namespace.ya.speechkit.AudioContext();
190 | }
191 | var analyserNode = namespace.ya.speechkit.audiocontext.createAnalyser();
192 | analyserNode.fftSize = 2048;
193 | this.audioInput.connect(analyserNode);
194 | return analyserNode;
195 | },
196 | /**
197 | * @returns {Boolean} true, если запись звука стоит на паузе, false — в противном случае.
198 | */
199 | isPaused: function () {
200 | return this.paused;
201 | },
202 | /**
203 | * Начинает запись звука с микрофона.
204 | * @param {callback:streamCallback} cb Функция-обработчик, в которую будет передаваться записанный аудио-поток.
205 | * @param {ya.speechkit.FORMAT} [format=PCM16] Формат для записи аудио-сигнала. Доступные значения:
206 | *
207 | * - PCM8 — плохое качество распознавания, но малый объем передаваемых на сервер данных;
208 | * - PCM16 — наилучшее качество распознавания при среднем объеме данных;
209 | * - PCM44 — большой размер передаваемых данных, возможны задержки на узком канале.
210 | *
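 * @example
 * // sketch, assuming initRecorder() has already granted access to the microphone:
 * var rec = new ya.speechkit.Recorder();
 * rec.start(function (pcm) {
 *     // `pcm` is an ArrayBuffer with a PCM 16bit chunk; forward it, e.g., to Recognizer.addData()
 * }, ya.speechkit.FORMAT.PCM16);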
211 | */
212 | start: function (cb, format) {
213 | var backref = this;
214 | if (!namespace.ya.speechkit._stream) {
215 | return namespace.ya.speechkit.initRecorder(function () {backref.start(cb, format);}, console.log);
216 | }
217 |
218 | if (!this.node) {
219 | this._createNode(format || namespace.ya.speechkit.FORMAT.PCM16);
220 | }
221 |
222 | if (this.isPaused()) {
223 | this.paused = false;
224 | return;
225 | }
226 | if (typeof cb !== 'undefined') {
227 | this.startCallback = cb;
228 | } else {
229 | this.startCallback = null;
230 | }
231 | this.worker.postMessage({
232 | command: 'init',
233 | config: {
234 | sampleRate: namespace.ya.speechkit.audiocontext.sampleRate,
235 | format: format || namespace.ya.speechkit.FORMAT.PCM16,
236 | channels: this.channelCount,
237 | }
238 | });
239 |
240 | this.nullsArray = [];
241 | var bufferLen = (format || namespace.ya.speechkit.FORMAT.PCM16).bufferSize;
242 | for (var i = 0; i < bufferLen; i++) {
243 | this.nullsArray.push(0);
244 | }
245 |
246 | this.clear(function () {this.recording = true;}.bind(this));
247 | },
248 | /**
249 | * Останавливает запись звука.
250 | * @param {callback:wavCallback} cb Функция-обработчик, в которую будет передан объект Blob
251 | * с записанным аудио в формате wav.
252 | * @param {Number} [channelCount=2] Сколько каналов должно быть в wav-файле: 1 — mono, 2 — stereo.
253 | */
254 | stop: function (cb, channelCount) {
255 | this.recording = false;
256 | if (this.node) {
257 | this.node.disconnect();
258 | }
259 |
260 | this.node = null;
261 | if (namespace.ya.speechkit._stream &&
262 | namespace.ya.speechkit._stream.getAudioTracks) {
263 | namespace.ya.speechkit._stream.getAudioTracks()[0].stop();
264 | } else if (namespace.ya.speechkit._stream &&
265 | typeof namespace.ya.speechkit._stream.stop !== 'undefined') {
266 | namespace.ya.speechkit._stream.stop();
267 | }
268 | namespace.ya.speechkit._stream = null;
269 | if (typeof namespace.ya.speechkit.audiocontext !== 'undefined' &&
270 | namespace.ya.speechkit.audiocontext !== null &&
271 | typeof namespace.ya.speechkit.audiocontext.close !== 'undefined') {
272 | namespace.ya.speechkit.audiocontext.close();
273 | namespace.ya.speechkit.audiocontext = null;
274 | }
275 |
276 | if (typeof cb !== 'undefined') {
277 | this.exportWav(function (blob) {
278 | cb(blob);
279 | }, channelCount || 2);
280 | }
281 | },
282 | /**
283 | * @returns {Boolean} true, если идет запись звука, false — если запись стоит в режиме паузы.
284 | */
285 | isRecording: function () {
286 | return this.recording;
287 | },
288 | /**
289 | * Очищает буферы с записанным аудио-сигналом.
290 | * @param {callback:clearCallback} cb Функция-обработчик, которая будет вызвана, когда произойдет очистка.
291 | */
292 | clear: function (cb) {
293 | if (typeof cb !== 'undefined') {
294 | this.currCallback = cb;
295 | } else {
296 | this.currCallback = null;
297 | }
298 | this.worker.postMessage({command: 'clear'});
299 | },
300 | /**
301 | * Метод для получения буферов с записанным аудио-сигналом.
302 | * @param {callback:buffersCallback} cb Функция, в которую будут переданы буферы с аудио-сигналом.
303 | */
304 | getBuffers: function (cb) {
305 | if (typeof cb !== 'undefined') {
306 | this.buffCallback = cb;
307 | } else {
308 | this.buffCallback = null;
309 | }
310 | this.worker.postMessage({command: 'getBuffers'});
311 | },
312 | /**
313 | * Экспортирует записанный звук в wav-файл.
314 | * @param {callback:wavCallback} cb Функция, в которую будет передан объект Blob с файлом.
315 | * @param {Number} [channelCount=1] Количество каналов в wav-файле: 1 — mono, 2 — stereo.
316 | */
317 | exportWav: function (cb, channelCount) {
318 | if (typeof cb !== 'undefined') {
319 | this.currCallback = cb;
320 | } else {
321 | this.currCallback = null;
322 | }
323 | var type = 'audio/wav';
324 |
325 | if (!this.currCallback) {throw new Error('Callback not set');}
326 |
327 | var exportCommand = 'export' + (channelCount != 2 && 'Mono' || '') + 'WAV';
328 |
329 | this.worker.postMessage({
330 | command: exportCommand,
331 | type: type
332 | });
333 | }
334 | };
335 |
336 | namespace.ya.speechkit.Recorder = Recorder;
337 |
338 | namespace.ya.speechkit.getUserMedia = navigator.getUserMedia ||
339 | navigator.mozGetUserMedia ||
340 | navigator.msGetUserMedia ||
341 | navigator.webkitGetUserMedia;
342 |
343 | namespace.ya.speechkit.mediaDevices = (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) ?
344 | navigator.mediaDevices :
345 | (namespace.ya.speechkit.getUserMedia ? {
346 | getUserMedia: function (c) {
347 | return new Promise(function (y, n) {
348 | namespace.ya.speechkit.getUserMedia.call(navigator, c, y, n);
349 | });
350 | }
351 | } : null);
352 |
353 | namespace.ya.speechkit._stream = null;
354 | namespace.ya.speechkit.audiocontext = null;
355 |
356 | /**
357 | * Запрашивает у пользователя права для записи звука с микрофона.
358 | * @param {callback:initSuccessCallback} initSuccess Функция-обработчик, которая будет вызвана при успешном подключении к микрофону.
359 | * @param {callback:initFailCallback} initFail Функция-обработчик, в которую будет передано сообщение об ошибке, в случае неуспеха.
360 | */
361 | namespace.ya.speechkit.initRecorder = function (initSuccess, initFail)
362 | {
363 | var badInitialization = function (err) {
364 | namespace.ya.speechkit._stream = null;
365 | if (typeof initFail !== 'undefined') {
366 | initFail(err);
367 | }
368 | };
369 |
370 | if (namespace.ya.speechkit.mediaDevices)
371 | {
372 | namespace.ya.speechkit.mediaDevices.getUserMedia(
373 | {audio: true}).then(
374 | function (stream) {
375 | namespace.ya.speechkit._stream = stream;
376 | if (typeof initSuccess !== 'undefined') {
377 | initSuccess();
378 | }
379 | }).catch(
380 | function (err) {
381 | badInitialization(err.message || err.name || err);
382 | });
383 | } else {
384 | badInitialization('Your browser doesn\'t support Web Audio API. ' +
385 | 'Please, use Yandex.Browser: https://browser.yandex.ru');
386 | }
387 | };
388 |
389 | /**
390 | * Поддерживается ли распознавание заданного языка.
391 | * @return true, если язык поддерживается, false — иначе.
392 | */
393 | namespace.ya.speechkit.isLanguageSupported = function (lang)
394 | {
395 | if (namespace.ya.speechkit.settings.langWhitelist.indexOf(lang) >= 0) {
396 | return namespace.ya.speechkit.isSupported();
397 | } else {
398 | return namespace.ya.speechkit.isWebAudioSupported();
399 | }
400 | };
401 |
402 | /**
403 | * Поддерживаются ли технологии распознавания Яндекса.
404 | * @return true, если поддерживаются, false — иначе.
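 * @example
 * // sketch: guard speech features behind the support check ('YOUR_API_KEY' is a placeholder)
 * if (ya.speechkit.isSupported()) {
 *     ya.speechkit.recognize({
 *         apikey: 'YOUR_API_KEY',
 *         doneCallback: function (text) { console.log(text); }
 *     });
 * }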
405 | */
406 | namespace.ya.speechkit.isSupported = function ()
407 | {
408 | var userAgent = navigator.userAgent.toLowerCase();
409 | // Yandex recognition is 100% supported on mobile devices only in firefox
410 | return ((namespace.ya.speechkit.mediaDevices !== null) &&
411 | ((/mozilla|firefox/.test(userAgent) && !/yabrowser/.test(userAgent)) ||
412 | !/iphone|ipod|ipad|android|blackberry/.test(userAgent)));
413 | };
414 |
415 | /**
416 | * Поддерживается ли распознавание с помощью WebAudio API.
417 | * @return true, если поддерживается, false — иначе.
418 | */
419 | namespace.ya.speechkit.isWebAudioSupported = function ()
420 | {
421 | var userAgent = navigator.userAgent.toLowerCase();
422 | var SpeechRecognition = namespace.SpeechRecognition || namespace.webkitSpeechRecognition;
423 | // Native recognition is only supported in original chrome and chromium
424 | return (typeof SpeechRecognition !== 'undefined' && !/yabrowser|opera|opr/.test(userAgent));
425 | };
426 |
427 |
428 | /**
429 | * Функция, которая будет вызвана по факту успешного получения прав на доступ к микрофону.
430 | * @callback
431 | * @name initSuccessCallback
432 | * @memberof Recorder
433 | */
434 |
435 | /**
436 | * Функция-обработчик, которая будет вызвана при возникновении ошибки при получении доступа к микрофону.
437 | * @callback initFailCallback
438 | * @param {String} error Сообщение об ошибке.
439 | * @memberof Recorder
440 | */
441 |
442 | /**
443 | * Функция-обработчик, в которую будет передан Blob с wav-файлом.
444 | * @callback
445 | * @name wavCallback
446 | * @param {Blob с MIME типом audio/wav} data wav-файл.
447 | * @memberof Recorder
448 | */
449 |
450 | /**
451 | * Функция-обработчик, в которую будут переданы буферы записанного аудио.
452 | * @callback
453 | * @name buffersCallback
454 | * @param {Float32Array[]} buffers Буферы записанного аудио для двух каналов (левого и правого).
455 | * @memberof Recorder
456 | */
457 |
458 | /**
459 | * Функция, которая будет вызвана после очистки буферов (это сигнал готовности к повторному запуску).
460 | * @callback
461 | * @name clearCallback
462 | * @memberof Recorder
463 | */
464 |
465 | /**
466 | * Функция-обработчик, в которую будет передаваться записанный аудио-поток.
467 | * @callback
468 | * @name streamCallback
469 | * @param {ArrayBuffer} stream Записанный PCM поток 16-bit.
470 | * @memberof Recorder 471 | */ 472 | 473 | }(this)); 474 | -------------------------------------------------------------------------------- /webspeechkit/src/recorderWorker.js: -------------------------------------------------------------------------------- 1 | (function (namespace) { 2 | 'use strict'; 3 | 4 | if (typeof namespace.ya === 'undefined') { 5 | namespace.ya = {}; 6 | } 7 | if (typeof namespace.ya.speechkit === 'undefined') { 8 | namespace.ya.speechkit = {}; 9 | } 10 | 11 | function _makeWorker(script) { 12 | var URL = window.URL || window.webkitURL; 13 | var Blob = window.Blob; 14 | var Worker = window.Worker; 15 | 16 | if (!URL || !Blob || !Worker || !script) { 17 | return null; 18 | } 19 | 20 | var blob = new Blob([script], {type: 'application/javascript'}); 21 | var worker = new Worker(URL.createObjectURL(blob)); 22 | return worker; 23 | } 24 | 25 | var inline_worker = 26 | "function iirFilter (sampleRate, cutoff, resonance, type) {" + 27 | "" + 28 | " var self = this," + 29 | " f = [0.0, 0.0, 0.0, 0.0]," + 30 | " freq, damp," + 31 | " prevCut, prevReso," + 32 | "" + 33 | " sin = Math.sin," + 34 | " min = Math.min," + 35 | " pow = Math.pow;" + 36 | "" + 37 | " self.cutoff = cutoff || 20000;" + 38 | " self.resonance = resonance || 0.1;" + 39 | " self.samplerate = sampleRate || 44100;" + 40 | " self.type = type || 0;" + 41 | "" + 42 | " function calcCoeff () {" + 43 | " freq = 2 * sin(Math.PI * min(0.25, self.cutoff / (self.samplerate * 2)));" + 44 | " damp = min(2 * (1 - pow(self.resonance, 0.25)), min(2, 2 / freq - freq * 0.5));" + 45 | " }" + 46 | "" + 47 | " self.pushSample = function (sample) {" + 48 | " if (prevCut !== self.cutoff || prevReso !== self.resonance){" + 49 | " calcCoeff();" + 50 | " prevCut = self.cutoff;" + 51 | " prevReso = self.resonance;" + 52 | " }" + 53 | "" + 54 | " f[3] = sample - damp * f[2];" + 55 | " f[0] = f[0] + freq * f[2];" + 56 | " f[1] = f[3] - f[0];" + 57 | " f[2] = freq * f[1] + f[2];" + 58 | "" + 59 | " f[3] = sample - damp * f[2];" + 60 | " f[0] = f[0] + freq * f[2];" + 61 | " f[1] = f[3] - f[0];" + 62 | " f[2] = freq * f[1] + f[2];" + 63 | "" + 64 | " return f[self.type];" + 65 | " };" + 66 | "" + 67 | " self.getMix = function (type) {" + 68 | " return f[type || self.type];" + 69 | " };" + 70 | "}" + 71 | "" + 72 | "var speex_loaded = false;" + 73 | "var recLength = 0;" + 74 | "var recBuffersL = [];" + 75 | "var recBuffersR = [];" + 76 | "var sampleRate;" + 77 | "var outSampleRate;" + 78 | "var tmp_buf = 0;" + 79 | "var need_buf_size = 4096;" + 80 | "var speex_converter = null;" + 81 | " " + 82 | "this.onmessage = function (e) {" + 83 | " switch (e.data.command) {" + 84 | " case 'init':" + 85 | " init(e.data.config);" + 86 | " break;" + 87 | " case 'record':" + 88 | " record(e.data.buffer);" + 89 | " break;" + 90 | " case 'exportWAV':" + 91 | " exportWAV(e.data.type);" + 92 | " break;" + 93 | " case 'exportMonoWAV':" + 94 | " exportMonoWAV(e.data.type);" + 95 | " break;" + 96 | " case 'getBuffers':" + 97 | " getBuffers();" + 98 | " break;" + 99 | " case 'clear':" + 100 | " clear();" + 101 | " break;" + 102 | " }" + 103 | "};" + 104 | " " + 105 | "function init(config) {" + 106 | " sampleRate = config.sampleRate;" + 107 | " outSampleRate = config.format.sampleRate || sampleRate;" + 108 | " need_buf_size = config.format.bufferSize || 4096;" + 109 | " speex_converter = null;" + 110 | " /*if (config.format.format == \'speex\') {" + 111 | " if (!speex_loaded) {" + 112 | " importScripts(\'./speex.min.js\');" + 113 | " speex_loaded = 
true;" + 114 | " }" + 115 | " need_buf_size /= 16;" + 116 | " speex_converter = new SpeexConverter(outSampleRate);" + 117 | " }*/" + 118 | "}" + 119 | "" + 120 | "var resample = function (inbuf) {" + 121 | " var speed = 1.0 * sampleRate / outSampleRate;" + 122 | " var l = Math.ceil(inbuf.length / speed);" + 123 | " var result = new Float32Array(l);" + 124 | " var bin = 0;" + 125 | " var num = 0;" + 126 | " var indexIn = 0;" + 127 | " var indexOut = 0;" + 128 | " for (indexOut = 1, indexIn = speed; indexOut < l - 1; indexIn += speed, indexOut++) {" + 129 | " var pos = Math.floor(indexIn);" + 130 | " var dt = indexIn - pos;" + 131 | " var second = (pos + 1 < inbuf.length) ? pos + 1 : inbuf.length - 1; " + 132 | " result[indexOut] = inbuf[pos] * (1 - dt) + inbuf[second] * dt;" + 133 | " }" + 134 | " result[0] = inbuf[0];" + 135 | " result[l - 1] = inbuf[inbuf.length - 1];" + 136 | " return result;" + 137 | "};" + 138 | " " + 139 | "function record(inputBuffer) {" + 140 | " if (outSampleRate == sampleRate) {" + 141 | " recBuffersL.push(inputBuffer[0]);" + 142 | " recBuffersR.push(inputBuffer[1]);" + 143 | " recLength += inputBuffer[0].length;" + 144 | " " + 145 | " var samples = inputBuffer[0];" + 146 | " var buffer = new ArrayBuffer(samples.length * 2);" + 147 | " var view = new DataView(buffer);" + 148 | " floatTo16BitPCM(view, 0, samples);" + 149 | " this.postMessage({command: 'int16stream', buffer: buffer});" + 150 | " } else {" + 151 | " var filter0 = new iirFilter(outSampleRate, outSampleRate * 0.125, 0.0); " + 152 | " var filter1 = new iirFilter(outSampleRate, outSampleRate * 0.125, 0.0); " + 153 | "" + 154 | " for (var i =0; i < inputBuffer[0].length; i++) { " + 155 | " inputBuffer[0][i] = filter0.pushSample(inputBuffer[0][i]); " + 156 | " inputBuffer[1][i] = filter1.pushSample(inputBuffer[1][i]); " + 157 | " }" + 158 | "" + 159 | " var resin0 = resample(inputBuffer[0], outSampleRate, sampleRate);" + 160 | " var resin1 = resample(inputBuffer[1], outSampleRate, sampleRate);" + 161 | " " + 162 | " recBuffersL.push(resin0);" + 163 | " recBuffersR.push(resin1);" + 164 | " recLength += resin0.length;" + 165 | " " + 166 | " var result = new Int16Array(resin0.length);" + 167 | " " + 168 | " for (var i = 0 ; i < resin0.length ; i++) {" + 169 | " result[i] = Math.floor(Math.min(Math.max((resin0[i] + resin1[i]) * 0.5, -1.0), 1.0) * 16383);" + 170 | " }" + 171 | " " + 172 | " if (speex_converter) {" + 173 | " result = speex_converter.convert(result);" + 174 | " } else {" + 175 | " result = result.buffer;" + 176 | " }" + 177 | " " + 178 | " if (!tmp_buf) {" + 179 | " tmp_buf = result;" + 180 | " } else {" + 181 | " var tmp = new DataView(new ArrayBuffer(tmp_buf.byteLength + result.byteLength));" + 182 | " tmp_buf = new DataView(tmp_buf);" + 183 | " result = new DataView(result);" + 184 | " " + 185 | " for (i = 0; i < tmp_buf.byteLength; i++) {" + 186 | " tmp.setUint8(i, tmp_buf.getUint8(i));" + 187 | " }" + 188 | " " + 189 | " for (i = 0; i < result.byteLength; i++) {" + 190 | " tmp.setUint8(i + tmp_buf.byteLength, result.getUint8(i));" + 191 | " }" + 192 | " " + 193 | " tmp_buf = tmp.buffer;" + 194 | " }" + 195 | " " + 196 | " if (tmp_buf.byteLength >= need_buf_size) {" + 197 | " this.postMessage({command: 'int16stream', buffer: tmp_buf});" + 198 | " tmp_buf = false;" + 199 | " }" + 200 | " }" + 201 | "}" + 202 | " " + 203 | "function exportWAV(type) {" + 204 | " var bufferL = mergeBuffers(recBuffersL, recLength);" + 205 | " var bufferR = mergeBuffers(recBuffersR, recLength);" + 206 | " var 
interleaved = interleave(bufferL, bufferR);" + 207 | " var dataview = encodeWAV(interleaved);" + 208 | " var audioBlob = new Blob([dataview], {type: type});" + 209 | " " + 210 | " this.postMessage({command: 'exportWAV', blob: audioBlob});" + 211 | "}" + 212 | " " + 213 | "function exportMonoWAV(type) {" + 214 | " var bufferL = mergeBuffers(recBuffersL, recLength);" + 215 | " var dataview = encodeWAV(bufferL, true);" + 216 | " var audioBlob = new Blob([dataview], {type: type});" + 217 | " " + 218 | " this.postMessage({command: 'exportMonoWAV', blob: audioBlob});" + 219 | "}" + 220 | " " + 221 | "function getBuffers() {" + 222 | " var buffers = [];" + 223 | " buffers.push(mergeBuffers(recBuffersL, recLength));" + 224 | " buffers.push(mergeBuffers(recBuffersR, recLength));" + 225 | " this.postMessage({command: 'getBuffers', blob: buffers});" + 226 | "}" + 227 | " " + 228 | "function clear() {" + 229 | " recLength = 0;" + 230 | " recBuffersL = [];" + 231 | " recBuffersR = [];" + 232 | " if (speex_converter) {" + 233 | " speex_converter.clear();" + 234 | " }" + 235 | " this.postMessage({command: 'clear'});" + 236 | "}" + 237 | " " + 238 | "function mergeBuffers(recBuffers, recLength) {" + 239 | " var result = new Float32Array(recLength);" + 240 | " var offset = 0;" + 241 | " for (var i = 0; i < recBuffers.length; i++){" + 242 | " result.set(recBuffers[i], offset);" + 243 | " offset += recBuffers[i].length;" + 244 | " }" + 245 | " return result;" + 246 | "}" + 247 | " " + 248 | "function interleave(inputL, inputR) {" + 249 | " var length = inputL.length + inputR.length;" + 250 | " var result = new Float32Array(length);" + 251 | " " + 252 | " var index = 0;" + 253 | " var inputIndex = 0;" + 254 | " " + 255 | " while (index < length){" + 256 | " result[index++] = inputL[inputIndex];" + 257 | " result[index++] = inputR[inputIndex];" + 258 | " inputIndex++;" + 259 | " }" + 260 | " return result;" + 261 | "}" + 262 | " " + 263 | "function floatTo16BitPCM(output, offset, input) {" + 264 | " for (var i = 0; i < input.length; i++, offset += 2){" + 265 | " var s = Math.max(-1, Math.min(1, input[i]));" + 266 | " output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);" + 267 | " }" + 268 | "}" + 269 | " " + 270 | "function writeString(view, offset, string) {" + 271 | " for (var i = 0; i < string.length; i++){" + 272 | " view.setUint8(offset + i, string.charCodeAt(i));" + 273 | " }" + 274 | "}" + 275 | " " + 276 | "function encodeWAV(samples, mono) {" + 277 | " var buffer = new ArrayBuffer(44 + samples.length * 2);" + 278 | " var view = new DataView(buffer);" + 279 | " " + 280 | " /* RIFF identifier */" + 281 | " writeString(view, 0, 'RIFF');" + 282 | " /* file length */" + 283 | " view.setUint32(4, 32 + samples.length * 2, true);" + 284 | " /* RIFF type */" + 285 | " writeString(view, 8, 'WAVE');" + 286 | " /* format chunk identifier */" + 287 | " writeString(view, 12, 'fmt ');" + 288 | " /* format chunk length */" + 289 | " view.setUint32(16, 16, true);" + 290 | " /* sample format (raw) */" + 291 | " view.setUint16(20, 1, true);" + 292 | " /* channel count */" + 293 | " view.setUint16(22, mono ? 1 : 2, true);" + 294 | " /* sample rate */" + 295 | " view.setUint32(24, outSampleRate, true);" + 296 | " /* block align (channel count * bytes per sample) */" + 297 | " var block_align = mono ? 
2 : 4;" +
298 | "    /* byte rate (sample rate * block align) */" +
299 | "    view.setUint32(28, outSampleRate * block_align, true);" +
300 | "    /* block align (channel count * bytes per sample) */" +
301 | "    view.setUint16(32, block_align, true);" +
302 | "    /* bits per sample */" +
303 | "    view.setUint16(34, 16, true);" +
304 | "    /* data chunk identifier */" +
305 | "    writeString(view, 36, 'data');" +
306 | "    /* data chunk length */" +
307 | "    view.setUint32(40, samples.length * 2, true);" +
308 | "    " +
309 | "    floatTo16BitPCM(view, 44, samples);" +
310 | "    " +
311 | "    return view;" +
312 | "}" +
313 | " ";
314 |
315 | namespace.ya.speechkit.newWorker = function () {
316 | return _makeWorker(inline_worker);
317 | };
318 | }(this));
319 |
320 |
-------------------------------------------------------------------------------- /webspeechkit/src/speechrecognition.js: --------------------------------------------------------------------------------
1 | (function (namespace) {
2 | 'use strict';
3 |
4 | if (typeof namespace.ya === 'undefined') {
5 | namespace.ya = {};
6 | }
7 | if (typeof namespace.ya.speechkit === 'undefined') {
8 | namespace.ya.speechkit = {};
9 | }
10 |
11 | function noop() {}
12 |
13 | /**
14 | * Параметры по умолчанию для SpeechRecognition
15 | * @private
16 | */
17 | namespace.ya.speechkit._defaultOptions = function () {
18 | /**
19 | * @typedef {Object} SpeechRecognitionOptions
20 | * @property {SpeechRecognition~initCallback} initCallback - Функция, которая будет вызвана по факту инициализации сессии распознавания
21 | * @property {SpeechRecognition~errorCallback} errorCallback - Функция, которая будет вызвана по факту ошибки (все ошибки критичны и приводят к порче сессии)
22 | * @property {SpeechRecognition~dataCallback} dataCallback - Функция, в которую будут приходить результаты распознавания
23 | * @property {SpeechRecognition~infoCallback} infoCallback - Функция для технической информации
24 | * @property {SpeechRecognition~stopCallback} stopCallback - Функция, которая будет вызвана в момент остановки сессии распознавания
25 | * @property {Boolean} punctuation - Следует ли пытаться расставлять знаки препинания
26 | * @property {Boolean} allowStrongLanguage - Следует ли отключить фильтрацию обсценной лексики
27 | * @property {String} model - Языковая модель для распознавания речи
28 | * @property {String} lang - Язык, речь на котором следует распознавать
29 | * @property {ya.speechkit.FORMAT} format - Формат передачи аудио сигнала
30 | * @property {String} [options.applicationName] Название приложения. Для некоторых приложений мы поддерживаем специальную логику. Пример - sandbox.
31 | */
32 | return {
33 | initCallback: noop,
34 | errorCallback: noop,
35 | dataCallback: noop,
36 | infoCallback: noop,
37 | stopCallback: noop,
38 | punctuation: false,
39 | allowStrongLanguage: false,
40 | model: namespace.ya.speechkit.settings.model,
41 | applicationName: namespace.ya.speechkit.settings.applicationName,
42 | lang: namespace.ya.speechkit.settings.lang,
43 | format: namespace.ya.speechkit.FORMAT.PCM16,
44 | url: namespace.ya.speechkit.settings.websocketProtocol +
45 | namespace.ya.speechkit.settings.asrUrl,
46 | vad: false,
47 | speechStart: noop,
48 | speechEnd: noop,
49 | };
50 | };
51 |
52 | /**
53 | * Создает новый объект типа SpeechRecognition.
54 | * @class Класс для распознавания большого потока аудио-сигнала.
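 * @example
 * // sketch: continuous recognition with explicit callbacks ('YOUR_API_KEY' is a placeholder)
 * var rec = new ya.speechkit.SpeechRecognition();
 * rec.start({
 *     apikey: 'YOUR_API_KEY',
 *     dataCallback: function (text, uttr, merge) { console.log(text); },
 *     errorCallback: function (err) { console.log(err); }
 * });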
55 | * @name SpeechRecognition 56 | */ 57 | var SpeechRecognition = function () { 58 | if (!(this instanceof namespace.ya.speechkit.SpeechRecognition)) { 59 | return new namespace.ya.speechkit.SpeechRecognition(); 60 | } 61 | this.send = 0; 62 | this.send_bytes = 0; 63 | this.proc = 0; 64 | this.recorder = null; 65 | this.recognizer = null; 66 | this.vad = null; 67 | }; 68 | 69 | SpeechRecognition.prototype = /** @lends SpeechRecognition.prototype */ { 70 | /** 71 | * Запускает процесс распознавания речи. 72 | * @param {Object} [options] Параметры, которые будут использоваться во время сессии. 73 | * @param {callback:initCallback} [options.initCallback] Функция-обработчик, которая будет вызвана по факту инициализации сессии распознавания. 74 | * @param {callback:errorCallback} [options.errorCallback] Функция-обработчик, которая будет вызвана по факту ошибки (все ошибки критичны и приводят к завершению сессии). 75 | * @param {callback:dataCallback} [options.dataCallback] Функция-обработчик, которая будет вызвана после успешного завершения 76 | * распознавания. В качестве аргумента ей передаются результаты распознавания. 77 | * @param {callback:infoCallback} [options.infoCallback] Функция для получения технической информации. 78 | * @param {callback:stopCallback} [options.stopCallback] Функция-обработчик, которая будет вызвана в момент остановки сессии распознавания. 79 | * @param {String} [options.apikey] API-ключ. Если не задан, то используется ключ, указанный 80 | * в глобальных настройках {@link settings}. 81 | * @param {Boolean} [options.punctuation=false] Следует ли использовать пунктуацию. 82 | * @param {Boolean} [options.allowStrongLanguage=false] Следует ли отключить фильтрацию обсценной лексики. 83 | * @param {String} [options.model='notes'] Языковая модель для распознавания речи. Список доступных значений: 84 | *
85 | * - 'notes' (по умолчанию) — общая лексика;
86 | * - 'queries' — короткие запросы;
87 | * - 'names' — имена;
88 | * - 'dates' — даты;
89 | * - 'maps' — топонимы;
90 | * - 'notes' — тексты;
91 | * - 'numbers' — числа.
92 | *
93 | * Если параметр не указан, то используется
94 | * значение, заданное в глобальных настройках {@link settings}. Если в настройках значение не задано, то
95 | * используется модель по умолчанию — 'notes'.
96 | * @param {String} [options.applicationName] Название приложения. Для некоторых приложений мы поддерживаем специальную логику. Пример - sandbox.
97 | * @param {String} [options.lang='ru-RU'] Язык, речь на котором следует распознавать. Возможные значения: 'ru-RU', 'en-US', 'tr-TR'.
98 | * Если параметр не указан, то используется
99 | * значение, заданное в глобальных настройках {@link settings}. Если в настройках значение не задано, то по умолчанию
100 | * выбирается русский язык: 'ru-RU'.
101 | * @param {ya.speechkit.FORMAT} [options.format=ya.speechkit.FORMAT.PCM16] Формат передачи аудио-сигнала. 102 | * @param {Boolean} [options.partialResults=true] Отправлять ли на сервер промежуточные результаты. 103 | * @param {Number} [options.utteranceSilence=120] Длительность промежутка тишины во время записи речи (в десятках миллисекунд). Как только встречается 104 | * такой перерыв в речи, запись звука останавливается, и записанный фрагмент речи отправляется на сервер. 105 | */ 106 | start: function (options) { 107 | this.options = namespace.ya.speechkit._extend( 108 | namespace.ya.speechkit._extend( 109 | {}, 110 | namespace.ya.speechkit._defaultOptions() 111 | ), 112 | options); 113 | if (namespace.ya.speechkit.settings.langWhitelist.indexOf(this.options.lang) >= 0) { 114 | if (namespace.ya.speechkit._stream !== null) { 115 | this._onstart(); 116 | } else { 117 | namespace.ya.speechkit.initRecorder( 118 | this._onstart.bind(this), 119 | this.options.errorCallback 120 | ); 121 | } 122 | } else { 123 | var old_error_callback = this.options.errorCallback; 124 | this.recorder = namespace.ya.speechkit.WebAudioRecognition( 125 | namespace.ya.speechkit._extend( 126 | this.options, 127 | { 128 | errorCallback: function (e) { 129 | this.recorder = null; 130 | old_error_callback(e); 131 | }.bind(this) 132 | } 133 | )); 134 | this.recorder.start(); 135 | } 136 | }, 137 | /** 138 | * Will be called after successful call of initRecorder 139 | * @private 140 | */ 141 | _onstart: function () { 142 | if (this.recorder && this.recorder.isPaused()) { 143 | this.recorder.start(); 144 | } 145 | 146 | if (this.recognizer) { 147 | return; 148 | } 149 | 150 | this.send = 0; 151 | this.send_bytes = 0; 152 | this.proc = 0; 153 | 154 | if (!this.recorder) { 155 | this.recorder = new namespace.ya.speechkit.Recorder(); 156 | if (this.options.vad) { 157 | this.vad = new namespace.ya.speechkit.Vad({recorder: this.recorder, 158 | speechStart: this.options.speechStart, 159 | speechEnd: this.options.speechEnd}); 160 | } 161 | } 162 | 163 | this.recognizer = new namespace.ya.speechkit.Recognizer( 164 | namespace.ya.speechkit._extend(this.options, 165 | { 166 | onInit: function (sessionId, code) { 167 | this.recorder.start(function (data) { 168 | if (this.options.vad && this.vad) { 169 | this.vad.update(); 170 | } 171 | this.send++; 172 | this.send_bytes += data.byteLength; 173 | this.options.infoCallback({ 174 | send_bytes: this.send_bytes, 175 | format: this.options.format, 176 | send_packages: this.send, 177 | processed: this.proc 178 | }); 179 | this.recognizer.addData(data); 180 | }.bind(this), this.options.format); 181 | 182 | this.options.initCallback(sessionId, code, 'yandex'); 183 | }.bind(this), 184 | onResult: function (text, uttr, merge, words) { 185 | this.proc += merge; 186 | this.options.infoCallback({ 187 | send_bytes: this.send_bytes, 188 | format: this.options.format, 189 | send_packages: this.send, 190 | processed: this.proc 191 | }); 192 | this.options.dataCallback(text, uttr, merge, words); 193 | }.bind(this), 194 | onError: function (msg) { 195 | if (this.recorder) { 196 | this.recorder.stop(function () { this.recorder = null; }.bind(this)); 197 | } 198 | if (this.recognizer) { 199 | this.recognizer.close(); 200 | this.recognizer = null; 201 | } 202 | this.options.errorCallback(msg); 203 | }.bind(this), 204 | })); 205 | this.recognizer.start(); 206 | }, 207 | /** 208 | * Завершает сессию распознавания речи. 209 | * По завершении сессии будет вызвана функция-обработчик stopCallback. 
210 | */
211 | stop: function () {
212 | if (this.recognizer) {
213 | this.recognizer.finish();
214 | }
215 |
216 | if (this.recorder) {
217 | this.recorder.stop(
218 | function () {
219 | this.recognizer = null;
220 | this.recorder = null;
221 | }.bind(this)
222 | );
223 | }
224 | },
225 | /**
226 | * Прерывает сессию распознавания речи (не дожидается финального результата распознавания).
227 | * По завершении сессии будет вызвана функция-обработчик stopCallback.
228 | */
229 | abort: function () {
230 | if (this.recognizer) {
231 | this.recognizer.close();
232 | }
233 | if (this.recorder) {
234 | this.recorder.stop(
235 | function () {
236 | this.recognizer = null;
237 | this.recorder = null;
238 | }.bind(this)
239 | );
240 | }
241 | },
242 | /**
243 | * Ставит сессию распознавания на паузу.
244 | * Чтобы соединение с сервером не прерывалось и можно было моментально возобновить распознавание,
245 | * на сервер периодически посылаются небольшие куски данных.
246 | */
247 | pause: function () {
248 | if (this.recorder) {
249 | this.recorder.pause();
250 | }
251 | },
252 | /**
253 | * Определяет, стоит ли на паузе сессия распознавания.
254 | * @returns {Boolean} true, если сессия распознавания речи стоит на паузе, false — иначе.
255 | */
256 | isPaused: function () {
257 | return (!this.recorder || this.recorder.isPaused());
258 | }
259 | };
260 |
261 | namespace.ya.speechkit.SpeechRecognition = SpeechRecognition;
262 |
263 | /**
264 | * Функция для распознавания коротких фрагментов речи.
265 | * При вызове функции recognize() начинается запись звука с микрофона.
266 | * Как только наступает тишина более чем на одну секунду, запись
267 | * прекращается, и функция отправляет запрос на сервер для распознавания записанного фрагмента.
268 | * Приемлемое качество распознавания обеспечивается на фрагментах длительностью не более 10 секунд.
269 | * При более длительном фрагменте качество распознавания ухудшается.
270 | * @static
271 | * @function
272 | * @name recognize
273 | * @param {Object} [options] Параметры распознавания речи.
274 | * @param {callback:SpeechRecognition.initCallback} [options.initCallback] Функция-обработчик, которая будет вызвана по факту
275 | * инициализации сессии распознавания.
276 | * @param {callback:SpeechRecognition.errorCallback} [options.errorCallback] Функция-обработчик, которая будет вызвана при возникновении ошибки
277 | * (все ошибки критичны и приводят к завершению сессии).
278 | * @param {callback:SpeechRecognition.recognitionDoneCallback} [options.doneCallback] Функция-обработчик, в которую будет отправлен результат распознавания речи.
279 | * @param {String} [options.apikey] API-ключ. По умолчанию принимает значение, указанное
280 | * в глобальных настройках {@link settings}.
281 | * @param {String} [options.model='notes'] Список доступных значений:
282 | *
283 | * - 'notes' (по умолчанию) — текст;
284 | * - 'queries' — короткие запросы;
285 | * - 'names' — имена;
286 | * - 'dates' — даты;
287 | * - 'maps' — топонимы;
288 | * - 'notes' — тексты;
289 | * - 'numbers' — числа.
290 | *
291 | * Если параметр не указан, то используется
292 | * значение, заданное в глобальных настройках {@link settings}. Если в настройках значение не задано, то
293 | * используется модель по умолчанию — 'notes'.
294 | * @param {String} [options.applicationName] Название приложения. Для некоторых приложений мы поддерживаем специальную логику. Пример — sandbox.
295 | * @param {String} [options.lang='ru-RU'] Язык, речь на котором следует распознавать. Возможные значения: 'ru-RU', 'en-US', 'tr-TR'.
296 | * Если параметр не указан, то используется
297 | * значение, заданное в глобальных настройках {@link settings}. Если в настройках значение не задано, то по умолчанию
298 | * выбирается русский язык: 'ru-RU'.
299 | * @param {Boolean} [options.partialResults=true] Отправлять ли на сервер промежуточные результаты. 300 | * @param {Number} [options.utteranceSilence=120] Длительность промежутка тишины во время записи речи (в десятках миллисекунд). Как только встречается 301 | * такой перерыв в речи, запись звука останавливается, и записанный фрагмент речи отправляется на сервер. 302 | */ 303 | 304 | namespace.ya.speechkit.recognize = function (options) { 305 | var dict = new namespace.ya.speechkit.SpeechRecognition(); 306 | 307 | var opts = namespace.ya.speechkit._extend( 308 | namespace.ya.speechkit._extend( 309 | {}, 310 | namespace.ya.speechkit._defaultOptions() 311 | ), 312 | options); 313 | 314 | opts.doneCallback = options.doneCallback; 315 | 316 | opts.dataCallback = function (text, uttr, merge) { 317 | if (uttr) { 318 | if (opts.doneCallback) { 319 | opts.doneCallback(text); 320 | } 321 | dict.stop(); 322 | } 323 | }; 324 | 325 | opts.stopCallback = function () { 326 | dict = null; 327 | }; 328 | 329 | dict.start(opts); 330 | }; 331 | 332 | /** 333 | * Функция, в которую передается полностью распознанный фрагмент текста. 334 | * @param {String} text Распознанная речь. 335 | * @callback 336 | * @name recognitionDoneCallback 337 | * @memberOf SpeechRecognition 338 | */ 339 | 340 | /** 341 | * Функция, которая будет вызвана после успешной инициализации сессии распознавания речи. 342 | * @callback 343 | * @name initCallback 344 | * @memberOf SpeechRecognition 345 | * @param {String} sessionId Идентификатор сессии. 346 | * @param {Number} code HTTP-статус, который будет содержаться в ответе сервера (200 в случае успеха). 347 | */ 348 | 349 | /** 350 | * Функция, в которую будут переданы сообщения об ошибках. 351 | * @callback 352 | * @name errorCallback 353 | * @memberOf SpeechRecognition 354 | * @param {String} message Текст сообщения об ошибке. 355 | */ 356 | 357 | /** 358 | * Функция для результатов распознавания речи. 359 | * @callback 360 | * @name dataCallback 361 | * @memberOf SpeechRecognition 362 | * @param {String} text Распознанный текст. 363 | * @param {Boolean} utterance Является ли данный текст финальным результатом распознавания. 364 | * @param {Number} merge Число обработанных запросов, по которым выдан ответ от сервера. 365 | */ 366 | 367 | /** 368 | * В эту функцию будет передаваться техническая информация. 369 | * @callback 370 | * @name infoCallback 371 | * @memberOf SpeechRecognition. 372 | * @param {Number} send_bytes Сколько байт аудио-данных было передано на сервер. 373 | * @param {Number} send_packages Сколько пакетов аудио-данных было передано на сервер. 374 | * @param {Number} processed Количество пакетов, на которые ответил сервер. 375 | * @param {ya.speechkit.FORMAT} format Какой формат аудио используется. 376 | */ 377 | 378 | /** 379 | * Функция, которая будет вызвана после остановки сессии распознавания речи. 
380 | * @callback 381 | * @name stopCallback 382 | * @memberOf SpeechRecognition 383 | */ 384 | }(this)); 385 | -------------------------------------------------------------------------------- /webspeechkit/src/textline.js: -------------------------------------------------------------------------------- 1 | (function (namespace) { 2 | 'use strict'; 3 | 4 | if (typeof namespace.ya === 'undefined') { 5 | namespace.ya = {}; 6 | } 7 | if (typeof namespace.ya.speechkit === 'undefined') { 8 | namespace.ya.speechkit = {}; 9 | } 10 | 11 | namespace.ya.speechkit._mic_on = '' + 16 | ' ' + 17 | ' ' + 18 | ' ' + 19 | ' ' + 20 | ' ' + 21 | ' ' + 23 | ' ' + 25 | ' ' + 27 | ' ' + 28 | ' ' + 29 | ' ' + 30 | ' ' + 31 | ' '; 32 | 33 | namespace.ya.speechkit._mic_off = '' + 38 | ' ' + 39 | ' ' + 40 | ' ' + 42 | ' ' + 44 | ' ' + 46 | ' ' + 47 | ' ' + 48 | ' ' + 49 | ' ' + 50 | ' '; 51 | 52 | /** 53 | * @name Textline 54 | * @class Класс для добавления элемента управления "Поле для голосового ввода". 55 | * @param {String} target Идентификатор div-контейрена, в котором будет размещен элемент управления. 56 | * @param {Object} [options] Опции распознавания. 57 | * @param {Object} [options.onInputFinished] Функция, которая будет вызвана после завершения распознавания. В качесве ее 58 | * аргументов передается финальный распознанный текст. 59 | * @param {String} [options.apikey] API-ключ. Если не задан, то используется ключ, указанный 60 | * в глобальных настройках {@link settings}. 61 | * @param {Boolean} [options.allowStrongLanguage=false] Следует ли отключить фильтрацию обсценной лексики. 62 | * @param {String} [options.model='notes'] Языковая модель для распознавания речи. Список доступных значений: 63 | *
     * <ul>
     * <li>'notes' (default) - plain text;</li>
     * <li>'queries' - short queries;</li>
     * <li>'names' - names;</li>
     * <li>'dates' - dates;</li>
     * <li>'maps' - place names;</li>
     * <li>'numbers' - numbers.</li>
     * </ul>
     * <br/>
     * If the parameter is omitted, the value specified in the global {@link settings} is used.
     * If no value is set there either, the default model 'notes' is used.
     * @param {String} [options.lang='ru-RU'] Language to recognize. Possible values: 'ru-RU', 'en-US', 'tr-TR', 'uk-UA'.
     * <br/>
     * If the parameter is omitted, the value specified in the global {@link settings} is used.
     * If no value is set there either, Russian ('ru-RU') is selected by default.
     * @param {ya.speechkit.FORMAT} [options.format=ya.speechkit.FORMAT.PCM16] Format of the audio signal.
     */
    namespace.ya.speechkit.Textline = function (target, options) {
        if (!(this instanceof namespace.ya.speechkit.Textline)) {
            return new namespace.ya.speechkit.Textline(target, options);
        }

        var el = document.getElementById(target);
        if (el.tagName != 'INPUT') {
            this.element = el;
            this.textinput = document.createElement('input');
            this.textinput.style.height = '100%';
            this.textinput.style.width = '100%';
        } else {
            this.textinput = el;
            this.element = null;
        }
        this.textinput.style.backgroundImage = 'url(\'data:image/svg+xml;utf8,' +
            namespace.ya.speechkit._mic_off + '\')';
        this.textinput.style.backgroundRepeat = 'no-repeat';
        this.textinput.style.backgroundPosition = 'right center';
        if (this.element) {
            this.element.appendChild(this.textinput);
        }

        this.dict = null;

        this.final_result = '';

        var _this = this;

        this.textinput.onmousemove = function (event) {
            var rect = _this.textinput.getBoundingClientRect();
            // Show a pointer cursor over the microphone icon in the right corner
            // of the field, and a text cursor elsewhere. rect.left is used rather
            // than rect.x for wider browser support.
            if (event.clientX - rect.left > rect.width - rect.height) {
                _this.textinput.style.cursor = 'pointer';
            } else {
                _this.textinput.style.cursor = 'text';
            }
        };

        options = options || {};

        options.dataCallback = function (text, uttr, merge) {
            _this.textinput.value = text;
            if (uttr) {
                if (options.onInputFinished) {
                    _this.final_result = text;
                    options.onInputFinished(text);
                }
                _this.dict.abort();
            }
        };

        options.initCallback = function () {
            _this.textinput.style.backgroundImage = 'url(\'data:image/svg+xml;utf8,' +
                namespace.ya.speechkit._mic_on + '\')';
        };

        options.stopCallback = function () {
            _this.textinput.style.backgroundImage = 'url(\'data:image/svg+xml;utf8,' +
                namespace.ya.speechkit._mic_off + '\')';
            _this.dict = null;
        };

        this.textinput.onmousedown = function (event) {
            var rect = _this.textinput.getBoundingClientRect();

            // Ignore clicks outside of the microphone icon. As in onmousemove,
            // the click position is taken relative to the field's left edge.
            if (event.clientX - rect.left <= rect.width - rect.height) {
                return;
            }

            if (!_this.dict) {
                _this.dict = new namespace.ya.speechkit.SpeechRecognition();
            }
            if (_this.dict.isPaused()) {
                _this.dict.start(options);
            } else {
                _this.dict.stop();
            }
        };

        return {
            /**
             * Removes the control.
             * @name Textline.destroy
             * @function
             */
            destroy: function () {
                if (_this.dict) {
                    _this.dict.stop();
                }
                _this.textinput.style.backgroundImage = '';
                _this.textinput.onmousedown = function () {};
                _this.textinput.onmousemove = function () {};

                if (_this.element) {
                    _this.element.removeChild(_this.textinput);
                }
            },
            /**
             * Returns the final recognition result synchronously.
             * @name Textline.value
             * @function
             * @returns {string} The recognition result.
             *
             * @example
             * var textline = new ya.speechkit.Textline('myDiv');
             *
             * setTimeout(function () {
             *     console.log('Recognition result: ' + textline.value());
             * }, 5000);
             */
            value: function () {
                return _this.final_result;
            }
        };
    };
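    /*
     * A minimal usage sketch (illustrative only; it assumes an element with
     * id="myDiv" exists on the page and an API key is set in
     * ya.speechkit.settings):
     *
     *     var textline = new ya.speechkit.Textline('myDiv', {
     *         onInputFinished: function (text) {
     *             console.log('Final text: ' + text);
     *         },
     *         model: 'queries'
     *     });
     */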
}(this));
--------------------------------------------------------------------------------
/webspeechkit/src/tts.js:
--------------------------------------------------------------------------------
(function (namespace) {
    'use strict';

    if (typeof namespace.ya === 'undefined') {
        namespace.ya = {};
    }
    if (typeof namespace.ya.speechkit === 'undefined') {
        namespace.ya.speechkit = {};
    }

    var speakersCache = null;

    /**
     * Plays an audio file.
     * @function
     * @static
     * @param {String} url URL at which the audio is available: either the address of an
     * audio file, or an object URL (e.g. created with URL.createObjectURL) for a Blob
     * with sound in a format supported by the browser.
     * @param {Function} [cb] Callback invoked when playback has finished.
     * @name play
     */
    namespace.ya.speechkit.play = function (url, cb) {
        var audio = new Audio(url);
        audio.volume = 1.0;
        audio.onended = cb || function () {};
        audio.play();
    };
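    /*
     * A minimal sketch of playing a Blob through ya.speechkit.play
     * (illustrative only; someAudioBlob stands for any Blob holding audio
     * in a format the browser can decode):
     *
     *     var url = URL.createObjectURL(someAudioBlob);
     *     ya.speechkit.play(url, function () {
     *         console.log('Playback finished');
     *     });
     */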
    /**
     * @class A class for using the speech synthesis (text-to-speech) technology.
     * @name Tts
     * @param {TtsOptions} [options] Options.
     * @param {String} [options.apikey] API key (must be specified here if it was not set in the global settings).
     * @param {String} [options.emotion='neutral'] Emotional coloring of the voice. Available values:
     * <ul>
     * <li>'neutral' (default);</li>
     * <li>'good' - friendly;</li>
     * <li>'evil' - angry.</li>
     * </ul>
     * @param {Array} [options.emotions] An array of emotions of the form [['emotion1', weight1], ['emotion2', weight2]],
     * used for weighted blending of emotions.
     * @param {String} [options.speaker='omazh'] Voice used for synthesis. The list of available values
     * can be obtained by calling Tts.speakers:
     * <ul>
     * <li>female voices: 'omazh' (default) and 'jane';</li>
     * <li>male voices: 'zahar' and 'ermil'.</li>
     * </ul>
     * @param {Array} [options.speakers] An array of voices of the form [['speaker1', weight1], ['speaker2', weight2]],
     * used for weighted blending of voices. weight can take values from 1.0 to 3.0.
     * For example, [['omazh', 1.5], ['zahar', 2.2]].
     * @param {Array} [options.genders] An array of genders of the form [['gender1', weight1], ['gender2', weight2]],
     * used for weighted blending of the speaker's gender. weight can take values from 1.0 to 3.0.
     * @param {Boolean} [options.fast=false] Use "fast" synthesis, which speeds up sound generation by lowering its quality.
     * @param {String} [options.lang='ru-RU'] Language of the text to be spoken. Available values: 'ru-RU', 'en-US', 'tr-TR', 'uk-UA'.
     * @param {Number} [options.speed=1.0] Speech synthesis speed, from 0.0 (slow) to 2.0 (fast).
     */
    var Tts = function (options) {
        if (!(this instanceof namespace.ya.speechkit.Tts)) {
            return new namespace.ya.speechkit.Tts(options);
        }
        var _this = this;
        /**
         * Text-to-speech options.
         * @type TtsOptions
         * @name Tts.options
         * @field
         */
        this.options = namespace.ya.speechkit._extend(
            {
                apikey: namespace.ya.speechkit.settings.apikey,
                uuid: namespace.ya.speechkit.settings.uuid,
                url: namespace.ya.speechkit.settings.websocketProtocol +
                     namespace.ya.speechkit.settings.ttsStreamUrl,
                infoCallback: function () {},
                errorCallback: function (msg) {
                    console.log(msg);
                }
            },
            options);
        this.sessionId = null;
        this.socket = null;

        this.buffered = [];
    };

    Tts.prototype = /** @lends Tts.prototype */ {
        /**
         * Sends raw data to the websocket.
         * @param data Any data to send to the websocket (a JSON string, raw audio data).
         * @private
         */
        _sendRaw: function (data) {
            if (this.socket) {
                this.socket.send(data);
            }
        },
        /**
         * Stringifies an object as JSON and sends it to the websocket.
         * @param {Object} json Object to be sent to the websocket.
         * @private
         */
        _sendJson: function (json) {
            this._sendRaw(JSON.stringify({type: 'message', data: json}));
        },
        /**
         * Speaks the given text aloud (synthesizes it and plays back the result).
         * @param {String} text The text.
         * @param {Function} [cb] Callback invoked when playback has finished.
         * @param {TtsOptions} [options] Options.
         */
        say: function (text, cb, options) {
            this.speak(
                text,
                namespace.ya.speechkit._extend(
                    this.options,
                    namespace.ya.speechkit._extend(
                        {
                            dataCallback: function (blob) {
                                var url = URL.createObjectURL(blob);
                                namespace.ya.speechkit.play(url, cb);
                            }
                        },
                        options)
                )
            );
        },
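        /*
         * A minimal usage sketch for say() (illustrative only; it assumes an
         * API key is set in ya.speechkit.settings):
         *
         *     var tts = new ya.speechkit.Tts({speaker: 'jane', lang: 'ru-RU'});
         *     tts.say('Привет!', function () {
         *         console.log('Done speaking');
         *     });
         */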
        /**
         * Synthesizes speech for the given text. If a dataCallback is provided in the
         * options, it receives the synthesized audio as a Blob; otherwise the audio is
         * played back immediately.
         * @param {String} text The text to speak.
         * @param {TtsOptions} [options] Options.
         */
        speak: function (text, options) {
            var opts = namespace.ya.speechkit._extend(
                namespace.ya.speechkit._extend(
                    {text: text},
                    this.options),
                options);
            try {
                this.socket = new WebSocket(opts.url);
            } catch (e) {
                opts.errorCallback('Error on socket creation: ' + e);
                return;
            }

            var context = namespace.ya.speechkit.audiocontext || new namespace.ya.speechkit.AudioContext();
            namespace.ya.speechkit.audiocontext = context;

            this.socket.onopen = function () {
                this._sendJson(opts);
            }.bind(this);

            var play_queue = [];

            this.socket.binaryType = 'arraybuffer';

            this.socket.onmessage = function (e) {
                // Text frames starting with '{' carry control messages; everything
                // else is treated as a chunk of synthesized audio.
                var message = {};
                if (e.data && e.data[0] == '{') {
                    try {
                        message = JSON.parse(e.data);
                    } catch (ex) {
                        message = {type: 'Audio', data: e.data};
                    }
                } else {
                    message = {type: 'Audio', data: e.data};
                }
                if (message.type == 'InitResponse') {
                    this.sessionId = message.data.sessionId;
                } else if (message.type == 'Error') {
                    opts.errorCallback('Session ' + this.sessionId + ': ' + message.data);
                    this.socket.onclose = function () {};
                    this.socket.close();
                } else if (message.type == 'Phonemes') {
                    opts.infoCallback(message.data);
                } else if (message.type == 'Audio') {
                    play_queue.push(message.data);
                } else {
                    opts.errorCallback('Session ' + this.sessionId + ': ' + message);
                    this.socket.onclose = function () {};
                    this.socket.close();
                }
            }.bind(this);

            this.socket.onerror = function (error) {
                opts.errorCallback('Socket error: ' + error.message);
            }.bind(this);

            this.socket.onclose = function (event) {
                // Combine all received audio chunks into a single Blob and hand it
                // to dataCallback if one was provided; otherwise play it back.
                var res = Array.prototype.concat.apply([], play_queue);
                var blob = new Blob(res, {type: 'audio/x-wav'});
                if (typeof opts.dataCallback !== 'undefined') {
                    opts.dataCallback(blob);
                } else {
                    var url = URL.createObjectURL(blob);
                    namespace.ya.speechkit.play(url, opts.stopCallback);
                }
            }.bind(this);
        },
        /**
         * Returns the list of available voices and emotions.
         * @param {String} [lang] Language for which the list of available voices should be returned.
         * @returns {Promise} A Promise that resolves to the list of available voices and emotions.
         */
        speakers: function (lang) {
            return new Promise(function (resolve, reject) {
                if (speakersCache) {
                    resolve(speakersCache);
                } else {
                    var xhr = new XMLHttpRequest();
                    xhr.open('GET', this.options.url.replace('wss://', 'https://')
                        .replace('ws://', 'http://')
                        .replace('ttssocket.ws', 'speakers?engine=ytcp&lang=' + (lang || '')));

                    xhr.onreadystatechange = function () {
                        if (this.readyState == 4) {
                            if (this.status == 200) {
                                try {
                                    speakersCache = JSON.parse(this.responseText);
                                    resolve(speakersCache);
                                } catch (ex) {
                                    reject(ex.message);
                                }
                            } else {
                                reject('Can\'t get speakers list!');
                            }
                        }
                    };

                    xhr.send();
                }
            }.bind(this));
        }
    };

    namespace.ya.speechkit.Tts = Tts;
}(this));
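
/*
 * A minimal usage sketch for Tts.speakers() (illustrative only):
 *
 *     var tts = new ya.speechkit.Tts();
 *     tts.speakers('ru-RU').then(function (list) {
 *         console.log(list);
 *     }, function (err) {
 *         console.log('Failed to get the speakers list: ' + err);
 *     });
 */
--------------------------------------------------------------------------------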