├── tts.py
├── README.md
├── bing_stt_with_vad.py
└── bing_voice.py


/tts.py:
--------------------------------------------------------------------------------
 1 | from bing_voice import *
 2 | import pyaudio
 3 | import sys
 4 | import wave
 5 | 
 6 | # get a key from https://www.microsoft.com/cognitive-services/en-us/speech-api
 7 | BING_KEY = 'Enter your key here'
 8 | CHUNK_SIZE = 2048
 9 | 
10 | if len(sys.argv) < 2:
11 |     print('Usage: python %s text_to_convert' % sys.argv[0])
12 |     sys.exit(-1)
13 | 
14 | bing = BingVoice(BING_KEY)
15 | data = bing.synthesize(sys.argv[1])
16 | 
17 | pa = pyaudio.PyAudio()
18 | stream = pa.open(format=pyaudio.paInt16,
19 |                  channels=1,
20 |                  rate=16000,
21 |                  output=True,
22 |                  # output_device_index=1,
23 |                  frames_per_buffer=CHUNK_SIZE)
24 | 
25 | stream.write(data)
26 | stream.close()
27 | 
28 | if len(sys.argv) >= 3:
29 |     wf = wave.open(sys.argv[2], 'wb')
30 |     wf.setframerate(16000)
31 |     wf.setnchannels(1)
32 |     wf.setsampwidth(2)
33 | 
34 |     wf.writeframes(data)
35 |     wf.close()
36 | 
37 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Please use [bing_speech_api.py](https://github.com/respeaker/respeaker_python_library/blob/master/respeaker/bing_speech_api.py) instead
 2 | 
 3 | 
 4 | Python scripts to use microsoft cognitive services
 5 | ==================================================
 6 | 
 7 | 1. Get an API key from [Microsoft](https://www.microsoft.com/cognitive-services/en-us/speech-api)
 8 | and add it to the Python scripts.
 9 | 2. Install the required package `pip install monotonic`
10 | 3. Change the default language in the Python scripts from `language='zh-CN'` to your preferred language, for example `language='en-GB'`. A [complete list of supported languages is available from Microsoft](https://www.microsoft.com/cognitive-services/en-us/speech-api/documentation/overview).
11 | 
12 | ### Speech To Text
13 | 
14 | + bing_voice.py
15 | 
16 |   `python bing_voice.py sample.wav` recognizes audio from file (16000 sample rate, 1 channel)
17 | 
18 | + bing_stt_with_vad.py
19 | 
20 |   Reads audio from the microphone, pre-processes it with a voice activity detector (VAD) and then recognizes the speech
21 | 
22 | ### Text To Speech
23 | + tts.py
24 | 
25 |   `python tts.py 'hello, Respeaker is being actively developed. Stay tuned'`
26 | 


--------------------------------------------------------------------------------
/bing_stt_with_vad.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Bing Speech To Text (STT) with Voice Activity Detect (VAD)
  3 | 
  4 | Requirements:
  5 | + pyaudio - `pip install pyaudio`
  6 | + py-webrtcvad - `pip install webrtcvad`
  7 | 
  8 | '''
  9 | 
 10 | from bing_voice import *
 11 | import webrtcvad
 12 | import collections
 13 | import sys
 14 | import signal
 15 | import pyaudio
 16 | 
 17 | # get a key from https://www.microsoft.com/cognitive-services/en-us/speech-api
 18 | BING_KEY = ''
 19 | 
 20 | FORMAT = pyaudio.paInt16
 21 | CHANNELS = 1
 22 | RATE = 16000
 23 | CHUNK_DURATION_MS = 30  # supports 10, 20 and 30 (ms)
 24 | PADDING_DURATION_MS = 1000
 25 | CHUNK_SIZE = int(RATE * CHUNK_DURATION_MS / 1000)
 26 | CHUNK_BYTES = CHUNK_SIZE * 2
 27 | NUM_PADDING_CHUNKS = int(PADDING_DURATION_MS / CHUNK_DURATION_MS)
 28 | NUM_WINDOW_CHUNKS = int(240 / CHUNK_DURATION_MS)
 29 | 
 30 | vad = webrtcvad.Vad(2)
 31 | bing = BingVoice(BING_KEY)
 32 | 
 33 | pa = pyaudio.PyAudio()
 34 | stream = pa.open(format=FORMAT,
 35 |                            channels=CHANNELS,
 36 |                            rate=RATE,
 37 |                            input=True,
 38 |                            start=False,
 39 |                            # input_device_index=2,
 40 |                            frames_per_buffer=CHUNK_SIZE)
 41 | 
 42 | 
 43 | got_a_sentence = False
 44 | leave = False
 45 | 
 46 | 
 47 | def handle_int(sig, chunk):
 48 |     global leave, got_a_sentence
 49 |     
 50 |     leave = True
 51 |     got_a_sentence = True
 52 |     
 53 | signal.signal(signal.SIGINT, handle_int)
 54 | 
 55 | while not leave:
 56 |     ring_buffer = collections.deque(maxlen=NUM_PADDING_CHUNKS)
 57 |     triggered = False
 58 |     voiced_frames = []
 59 |     ring_buffer_flags = [0] * NUM_WINDOW_CHUNKS
 60 |     ring_buffer_index = 0
 61 |     buffer_in = ''
 62 |     
 63 |     print("* recording")
 64 |     stream.start_stream()
 65 |     while not got_a_sentence and not leave:
 66 |         chunk = stream.read(CHUNK_SIZE)
 67 |         active = vad.is_speech(chunk, RATE)
 68 |         sys.stdout.write('1' if active else '0')
 69 |         ring_buffer_flags[ring_buffer_index] = 1 if active else 0
 70 |         ring_buffer_index += 1
 71 |         ring_buffer_index %= NUM_WINDOW_CHUNKS
 72 |         if not triggered:
 73 |             ring_buffer.append(chunk)
 74 |             num_voiced = sum(ring_buffer_flags)
 75 |             if num_voiced > 0.5 * NUM_WINDOW_CHUNKS:
 76 |                 sys.stdout.write('+')
 77 |                 triggered = True
 78 |                 voiced_frames.extend(ring_buffer)
 79 |                 ring_buffer.clear()
 80 |         else:
 81 |             voiced_frames.append(chunk)
 82 |             ring_buffer.append(chunk)
 83 |             num_unvoiced = NUM_WINDOW_CHUNKS - sum(ring_buffer_flags)
 84 |             if num_unvoiced > 0.9 * NUM_WINDOW_CHUNKS:
 85 |                 sys.stdout.write('-')
 86 |                 triggered = False
 87 |                 got_a_sentence = True
 88 | 
 89 |         sys.stdout.flush()
 90 | 
 91 |     sys.stdout.write('\n')
 92 |     data = b''.join(voiced_frames)
 93 |     
 94 |     stream.stop_stream()
 95 |     print("* done recording")
 96 | 
 97 |     # recognize speech using Microsoft Bing Voice Recognition
 98 |     try:
 99 |         text = bing.recognize(data, language='zh-CN')
100 |         print('Bing:' + text.encode('utf-8'))
101 |     except UnknownValueError:
102 |         print("Microsoft Bing Voice Recognition could not understand audio")
103 |     except RequestError as e:
104 |         print("Could not request results from Microsoft Bing Voice Recognition service; {0}".format(e))
105 |         
106 |     got_a_sentence = False
107 |         
108 | stream.close()


--------------------------------------------------------------------------------
/bing_voice.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Bing Speech To Text (STT) and Text To Speech (TTS)
  3 | 
  4 | based on https://github.com/Uberi/speech_recognition
  5 | '''
  6 | 
  7 | import json
  8 | import uuid
  9 | import wave
 10 | import io
 11 | from monotonic import monotonic
 12 | from urllib import urlencode
 13 | from urllib2 import Request, urlopen, URLError, HTTPError
 14 | 
 15 | # get a key from https://www.microsoft.com/cognitive-services/en-us/speech-api
 16 | BING_KEY = ''
 17 | 
 18 | 
 19 | class RequestError(Exception):
 20 |     pass
 21 | 
 22 | 
 23 | class UnknownValueError(Exception):
 24 |     pass
 25 | 
 26 | 
 27 | class LocaleError(Exception):
 28 |     pass
 29 | 
 30 | 
 31 | class BingVoice():
 32 |     def __init__(self, key):
 33 |         self.key = key
 34 |         self.access_token = None
 35 |         self.expire_time = None
 36 |         self.locales = {
 37 |             "ar-eg": {"Female": "Microsoft Server Speech Text to Speech Voice (ar-EG, Hoda)"},
 38 |             "de-DE": {"Female": "Microsoft Server Speech Text to Speech Voice (de-DE, Hedda)",
 39 |                       "Male": "Microsoft Server Speech Text to Speech Voice (de-DE, Stefan, Apollo)"},
 40 |             "en-AU": {"Female": "Microsoft Server Speech Text to Speech Voice (en-AU, Catherine)"},
 41 |             "en-CA": {"Female": "Microsoft Server Speech Text to Speech Voice (en-CA, Linda)"},
 42 |             "en-GB": {"Female": "Microsoft Server Speech Text to Speech Voice (en-GB, Susan, Apollo)",
 43 |                       "Male": "Microsoft Server Speech Text to Speech Voice (en-GB, George, Apollo)"},
 44 |             "en-IN": {"Male": "Microsoft Server Speech Text to Speech Voice (en-IN, Ravi, Apollo)"},
 45 |             "en-US": {"Female": "Microsoft Server Speech Text to Speech Voice (en-US, ZiraRUS)",
 46 |                       "Male": "Microsoft Server Speech Text to Speech Voice (en-US, BenjaminRUS)"},
 47 |             "es-ES": {"Female": "Microsoft Server Speech Text to Speech Voice (es-ES, Laura, Apollo)",
 48 |                       "Male": "Microsoft Server Speech Text to Speech Voice (es-ES, Pablo, Apollo)"},
 49 |             "es-MX": {"Male": "Microsoft Server Speech Text to Speech Voice (es-MX, Raul, Apollo)"},
 50 |             "fr-CA": {"Female": "Microsoft Server Speech Text to Speech Voice (fr-CA, Caroline)"},
 51 |             "fr-FR": {"Female": "Microsoft Server Speech Text to Speech Voice (fr-FR, Julie, Apollo)",
 52 |                       "Male": "Microsoft Server Speech Text to Speech Voice (fr-FR, Paul, Apollo)"},
 53 |             "it-IT": {"Male": "Microsoft Server Speech Text to Speech Voice (it-IT, Cosimo, Apollo)"},
 54 |             "ja-JP": {"Female": "Microsoft Server Speech Text to Speech Voice (ja-JP, Ayumi, Apollo)",
 55 |                       "Male": "Microsoft Server Speech Text to Speech Voice (ja-JP, Ichiro, Apollo)"},
 56 |             "pt-BR": {"Male": "Microsoft Server Speech Text to Speech Voice (pt-BR, Daniel, Apollo)"},
 57 |             "ru-RU": {"Female": "Microsoft Server Speech Text to Speech Voice (pt-BR, Daniel, Apollo)",
 58 |                       "Male": "Microsoft Server Speech Text to Speech Voice (ru-RU, Pavel, Apollo)"},
 59 |             "zh-CN": {"Female": "Microsoft Server Speech Text to Speech Voice (zh-CN, HuihuiRUS)",
 60 |                       "Female2": "Microsoft Server Speech Text to Speech Voice (zh-CN, Yaoyao, Apollo)",
 61 |                       "Male": "Microsoft Server Speech Text to Speech Voice (zh-CN, Kangkang, Apollo)"},
 62 |             "zh-HK": {"Female": "Microsoft Server Speech Text to Speech Voice (zh-HK, Tracy, Apollo)",
 63 |                       "Male": "Microsoft Server Speech Text to Speech Voice (zh-HK, Danny, Apollo)"},
 64 |             "zh-TW": {"Female": "Microsoft Server Speech Text to Speech Voice (zh-TW, Yating, Apollo)",
 65 |                       "Male": "Microsoft Server Speech Text to Speech Voice (zh-TW, Zhiwei, Apollo)"}
 66 |         }
 67 | 
 68 |     def auth(self):
 69 |         if self.expire_time is None or monotonic() > self.expire_time:  # first credential request, or the access token from the previous one expired
 70 |             # get an access token using OAuth
 71 |             credential_url = "https://oxford-speech.cloudapp.net/token/issueToken"
 72 |             credential_request = Request(credential_url, data=urlencode({
 73 |                 "grant_type": "client_credentials",
 74 |                 "client_id": "python",
 75 |                 "client_secret": self.key,
 76 |                 "scope": "https://speech.platform.bing.com"
 77 |             }).encode("utf-8"))
 78 |             start_time = monotonic()
 79 |             try:
 80 |                 credential_response = urlopen(credential_request)
 81 |             except HTTPError as e:
 82 |                 raise RequestError("recognition request failed: {0}".format(
 83 |                     getattr(e, "reason", "status {0}".format(e.code))))  # use getattr to be compatible with Python 2.6
 84 |             except URLError as e:
 85 |                 raise RequestError("recognition connection failed: {0}".format(e.reason))
 86 |             credential_text = credential_response.read().decode("utf-8")
 87 |             credentials = json.loads(credential_text)
 88 |             self.access_token, expiry_seconds = credentials["access_token"], float(credentials["expires_in"])
 89 | 
 90 |             self.expire_time = start_time + expiry_seconds
 91 | 
 92 |     def recognize(self, audio_data, language="en-US", show_all=False):
 93 |         self.auth()
 94 |         wav_data = self.to_wav(audio_data)
 95 |         url = "https://speech.platform.bing.com/recognize/query?{0}".format(urlencode({
 96 |             "version": "3.0",
 97 |             "requestid": uuid.uuid4(),
 98 |             "appID": "D4D52672-91D7-4C74-8AD8-42B1D98141A5",
 99 |             "format": "json",
100 |             "locale": language,
101 |             "device.os": "wp7",
102 |             "scenarios": "ulm",
103 |             "instanceid": uuid.uuid4(),
104 |             "result.profanitymarkup": "0",
105 |         }))
106 |         request = Request(url, data=wav_data, headers={
107 |             "Authorization": "Bearer {0}".format(self.access_token),
108 |             "Content-Type": "audio/wav; samplerate=16000; sourcerate={0}; trustsourcerate=true".format(16000),
109 |         })
110 |         try:
111 |             response = urlopen(request)
112 |         except HTTPError as e:
113 |             raise RequestError("recognition request failed: {0}".format(
114 |                 getattr(e, "reason", "status {0}".format(e.code))))  # use getattr to be compatible with Python 2.6
115 |         except URLError as e:
116 |             raise RequestError("recognition connection failed: {0}".format(e.reason))
117 |         response_text = response.read().decode("utf-8")
118 |         result = json.loads(response_text)
119 | 
120 |         # return results
121 |         if show_all: return result
122 |         if "header" not in result or "lexical" not in result["header"]: raise UnknownValueError()
123 |         return result["header"]["lexical"]
124 | 
125 |     def synthesize(self, text, language="en-US", gender="Female"):
126 |         self.auth()
127 | 
128 |         if language not in self.locales.keys():
129 |             raise LocaleError("language locale not supported.")
130 | 
131 |         lang = self.locales.get(language)
132 | 
133 |         if gender not in ["Female", "Male", "Female2"]:
134 |             gender = "Female"
135 | 
136 |         if len(lang) == 1:
137 |             gender = lang.keys()[0]
138 | 
139 |         service_name = lang[gender]
140 | 
141 |         body = "<speak version='1.0' xml:lang='en-us'>\
142 |                 <voice xml:lang='%s' xml:gender='%s' name='%s'>%s</voice>\
143 |                 </speak>" % (language, gender, service_name, text)
144 | 
145 |         headers = {"Content-type": "application/ssml+xml",
146 |                    "X-Microsoft-OutputFormat": "raw-16khz-16bit-mono-pcm",
147 |                    "Authorization": "Bearer " + self.access_token,
148 |                    "X-Search-AppId": "07D3234E49CE426DAA29772419F436CA",
149 |                    "X-Search-ClientID": str(uuid.uuid1()).replace('-', ''),
150 |                    "User-Agent": "TTSForPython"}
151 | 
152 |         url = "https://speech.platform.bing.com/synthesize"
153 |         request = Request(url, data=body, headers=headers)
154 |         try:
155 |             response = urlopen(request)
156 |         except HTTPError as e:
157 |             raise RequestError("tts request failed: {0}".format(
158 |                 getattr(e, "reason", "status {0}".format(e.code))))  # use getattr to be compatible with Python 2.6
159 |         except URLError as e:
160 |             raise RequestError("tts connection failed: {0}".format(e.reason))
161 | 
162 |         data = response.read()
163 | 
164 |         return data
165 | 
166 |     @staticmethod
167 |     def to_wav(raw_data):
168 |         # generate the WAV file contents
169 |         with io.BytesIO() as wav_file:
170 |             wav_writer = wave.open(wav_file, "wb")
171 |             try:  # note that we can't use context manager, since that was only added in Python 3.4
172 |                 wav_writer.setframerate(16000)
173 |                 wav_writer.setsampwidth(2)
174 |                 wav_writer.setnchannels(1)
175 |                 wav_writer.writeframes(raw_data)
176 |                 wav_data = wav_file.getvalue()
177 |             finally:  # make sure resources are cleaned up
178 |                 wav_writer.close()
179 |         return wav_data
180 | 
181 | 
if __name__ == '__main__':
    # Command-line demo: recognize speech from a 16 kHz, mono, 16-bit WAV file.
    import sys

    if len(sys.argv) != 2:
        print('Usage: %s 16k_mono.wav' % sys.argv[0])
        sys.exit(-1)

    wf = wave.open(sys.argv[1], 'rb')
    try:
        if wf.getframerate() != 16000 or wf.getnchannels() != 1 or wf.getsampwidth() != 2:
            print('only support 16000 sample rate, 1 channel and 2 bytes sample width')
            sys.exit(-2)

        # read at most 10 seconds of audio data
        n = min(wf.getnframes(), 16000 * 10)
        frames = wf.readframes(n)
    finally:
        # Close the file on every path, including the sys.exit above.
        wf.close()

    bing = BingVoice(BING_KEY)

    # recognize speech using Microsoft Bing Voice Recognition
    try:
        text = bing.recognize(frames, language='en-US')
        # Encode only when the text is not already a native str, so the
        # concatenation works on both Python 2 and Python 3.
        if not isinstance(text, str):
            text = text.encode('utf-8')
        print('Bing:' + text)
    except UnknownValueError:
        print("Microsoft Bing Voice Recognition could not understand audio")
    except RequestError as e:
        print("Could not request results from Microsoft Bing Voice Recognition service; {0}".format(e))
211 | 


--------------------------------------------------------------------------------