"""Speak a text with Bing Text To Speech and optionally save it as a WAV file.

Usage: python tts.py text_to_convert [output.wav]
"""

import contextlib
import sys
import wave

# get a key from https://www.microsoft.com/cognitive-services/en-us/speech-api
BING_KEY = 'Enter your key here'
CHUNK_SIZE = 2048

# Format of the audio handled by this script: 16 kHz, mono, 16-bit PCM.
SAMPLE_RATE = 16000
CHANNELS = 1
SAMPLE_WIDTH = 2


def play(data):
    """Play raw PCM `data` on the default output device.

    pyaudio is imported here so that the rest of the module (for example
    `save_wav`) stays usable on machines without audio support.
    """
    import pyaudio

    pa = pyaudio.PyAudio()
    try:
        stream = pa.open(format=pyaudio.paInt16,
                         channels=CHANNELS,
                         rate=SAMPLE_RATE,
                         output=True,
                         # output_device_index=1,
                         frames_per_buffer=CHUNK_SIZE)
        try:
            stream.write(data)
            # Stop before closing so that buffered audio is played out
            # instead of being discarded by close().
            stream.stop_stream()
        finally:
            stream.close()
    finally:
        # Release the PortAudio resources even when playback failed.
        pa.terminate()


def save_wav(path, data):
    """Write raw PCM `data` to `path` as a 16 kHz mono 16-bit WAV file."""
    # contextlib.closing guarantees the file is closed on error; wave objects
    # are not context managers on older Python versions.
    with contextlib.closing(wave.open(path, 'wb')) as wf:
        wf.setframerate(SAMPLE_RATE)
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(SAMPLE_WIDTH)
        wf.writeframes(data)


def main(argv=None):
    """Synthesize argv[1], play it, and save it to argv[2] when given.

    Returns the process exit status: 0 on success, -1 on wrong usage.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        print('Usage: python %s text_to_convert' % argv[0])
        return -1

    from bing_voice import BingVoice

    data = BingVoice(BING_KEY).synthesize(argv[1])
    play(data)

    if len(argv) >= 3:
        save_wav(argv[2], data)
    return 0


if __name__ == '__main__':
    sys.exit(main())
'''
Bing Speech To Text (STT) with Voice Activity Detect (VAD)

Requirements:
+ pyaudio - `pip install pyaudio`
+ py-webrtcvad - `pip install webrtcvad`

'''

import collections
import signal
import sys

# get a key from https://www.microsoft.com/cognitive-services/en-us/speech-api
BING_KEY = ''

CHANNELS = 1
RATE = 16000
CHUNK_DURATION_MS = 30  # supports 10, 20 and 30 (ms)
PADDING_DURATION_MS = 1000
CHUNK_SIZE = int(RATE * CHUNK_DURATION_MS / 1000)
CHUNK_BYTES = CHUNK_SIZE * 2
NUM_PADDING_CHUNKS = int(PADDING_DURATION_MS / CHUNK_DURATION_MS)
NUM_WINDOW_CHUNKS = int(240 / CHUNK_DURATION_MS)

# A sentence starts when more than this fraction of the decision window is
# voiced, and ends when more than END_RATIO of the window is unvoiced.
START_RATIO = 0.5
END_RATIO = 0.9


def collect_sentence(read_chunk, is_speech, should_stop=None, progress=None):
    """Read audio chunks until one spoken sentence has been captured.

    read_chunk  -- callable returning the next chunk of audio bytes
    is_speech   -- callable(chunk) telling whether the chunk contains voice
    should_stop -- optional callable; checked before every read, a true
                   result aborts the capture early
    progress    -- optional file-like object that receives '1'/'0' per
                   chunk, '+' when a sentence starts and '-' when it ends

    Returns the captured audio as bytes: up to NUM_PADDING_CHUNKS chunks of
    lead-in followed by everything up to the end of the sentence. The result
    is empty when the capture was aborted before any speech was detected.
    """
    padding = collections.deque(maxlen=NUM_PADDING_CHUNKS)
    # Voiced/unvoiced flags of the most recent NUM_WINDOW_CHUNKS chunks.
    window = collections.deque([0] * NUM_WINDOW_CHUNKS,
                               maxlen=NUM_WINDOW_CHUNKS)
    voiced_frames = []
    triggered = False
    finished = False

    while not finished and not (should_stop is not None and should_stop()):
        chunk = read_chunk()
        active = is_speech(chunk)
        window.append(1 if active else 0)
        num_voiced = sum(window)
        marks = '1' if active else '0'

        if not triggered:
            padding.append(chunk)
            if num_voiced > START_RATIO * NUM_WINDOW_CHUNKS:
                marks += '+'
                triggered = True
                # Keep the lead-in so the first syllable is not cut off.
                voiced_frames.extend(padding)
                padding.clear()
        else:
            voiced_frames.append(chunk)
            if NUM_WINDOW_CHUNKS - num_voiced > END_RATIO * NUM_WINDOW_CHUNKS:
                marks += '-'
                finished = True

        if progress is not None:
            progress.write(marks)
            progress.flush()

    return b''.join(voiced_frames)


def _console_text(text):
    """Return `text` in a form that can be concatenated to a str and printed."""
    # On Python 2 the recognizer returns unicode, which is encoded explicitly
    # so that printing does not depend on the terminal encoding.
    if sys.version_info[0] < 3 and not isinstance(text, str):
        return text.encode('utf-8')
    return text


def main():
    """Record sentences from the microphone and print their transcription.

    Runs until interrupted with Ctrl-C (SIGINT).
    """
    # Imported here so that the VAD helper above stays importable without
    # the audio and network dependencies.
    import pyaudio
    import webrtcvad
    from bing_voice import BingVoice, RequestError, UnknownValueError

    vad = webrtcvad.Vad(2)
    bing = BingVoice(BING_KEY)

    pa = pyaudio.PyAudio()
    stream = pa.open(format=pyaudio.paInt16,
                     channels=CHANNELS,
                     rate=RATE,
                     input=True,
                     start=False,
                     # input_device_index=2,
                     frames_per_buffer=CHUNK_SIZE)

    # A mutable holder is used because the handler cannot rebind a local of
    # main() on Python 2.
    state = {'leave': False}

    def handle_int(sig, frame):
        state['leave'] = True

    signal.signal(signal.SIGINT, handle_int)

    try:
        while not state['leave']:
            print("* recording")
            stream.start_stream()
            try:
                data = collect_sentence(
                    lambda: stream.read(CHUNK_SIZE),
                    lambda chunk: vad.is_speech(chunk, RATE),
                    should_stop=lambda: state['leave'],
                    progress=sys.stdout)
            finally:
                stream.stop_stream()
            sys.stdout.write('\n')
            print("* done recording")

            if not data:
                # Interrupted before any speech was captured: there is
                # nothing to send to the service.
                continue

            # recognize speech using Microsoft Bing Voice Recognition
            try:
                text = bing.recognize(data, language='zh-CN')
                print('Bing:' + _console_text(text))
            except UnknownValueError:
                print("Microsoft Bing Voice Recognition could not understand audio")
            except RequestError as e:
                print("Could not request results from Microsoft Bing Voice Recognition service; {0}".format(e))
    finally:
        stream.close()
        pa.terminate()


if __name__ == '__main__':
    main()
https://github.com/Uberi/speech_recognition 5 | ''' 6 | 7 | import json 8 | import uuid 9 | import wave 10 | import io 11 | from monotonic import monotonic 12 | from urllib import urlencode 13 | from urllib2 import Request, urlopen, URLError, HTTPError 14 | 15 | # get a key from https://www.microsoft.com/cognitive-services/en-us/speech-api 16 | BING_KEY = '' 17 | 18 | 19 | class RequestError(Exception): 20 | pass 21 | 22 | 23 | class UnknownValueError(Exception): 24 | pass 25 | 26 | 27 | class LocaleError(Exception): 28 | pass 29 | 30 | 31 | class BingVoice(): 32 | def __init__(self, key): 33 | self.key = key 34 | self.access_token = None 35 | self.expire_time = None 36 | self.locales = { 37 | "ar-eg": {"Female": "Microsoft Server Speech Text to Speech Voice (ar-EG, Hoda)"}, 38 | "de-DE": {"Female": "Microsoft Server Speech Text to Speech Voice (de-DE, Hedda)", 39 | "Male": "Microsoft Server Speech Text to Speech Voice (de-DE, Stefan, Apollo)"}, 40 | "en-AU": {"Female": "Microsoft Server Speech Text to Speech Voice (en-AU, Catherine)"}, 41 | "en-CA": {"Female": "Microsoft Server Speech Text to Speech Voice (en-CA, Linda)"}, 42 | "en-GB": {"Female": "Microsoft Server Speech Text to Speech Voice (en-GB, Susan, Apollo)", 43 | "Male": "Microsoft Server Speech Text to Speech Voice (en-GB, George, Apollo)"}, 44 | "en-IN": {"Male": "Microsoft Server Speech Text to Speech Voice (en-IN, Ravi, Apollo)"}, 45 | "en-US": {"Female": "Microsoft Server Speech Text to Speech Voice (en-US, ZiraRUS)", 46 | "Male": "Microsoft Server Speech Text to Speech Voice (en-US, BenjaminRUS)"}, 47 | "es-ES": {"Female": "Microsoft Server Speech Text to Speech Voice (es-ES, Laura, Apollo)", 48 | "Male": "Microsoft Server Speech Text to Speech Voice (es-ES, Pablo, Apollo)"}, 49 | "es-MX": {"Male": "Microsoft Server Speech Text to Speech Voice (es-MX, Raul, Apollo)"}, 50 | "fr-CA": {"Female": "Microsoft Server Speech Text to Speech Voice (fr-CA, Caroline)"}, 51 | "fr-FR": {"Female": "Microsoft 
Server Speech Text to Speech Voice (fr-FR, Julie, Apollo)", 52 | "Male": "Microsoft Server Speech Text to Speech Voice (fr-FR, Paul, Apollo)"}, 53 | "it-IT": {"Male": "Microsoft Server Speech Text to Speech Voice (it-IT, Cosimo, Apollo)"}, 54 | "ja-JP": {"Female": "Microsoft Server Speech Text to Speech Voice (ja-JP, Ayumi, Apollo)", 55 | "Male": "Microsoft Server Speech Text to Speech Voice (ja-JP, Ichiro, Apollo)"}, 56 | "pt-BR": {"Male": "Microsoft Server Speech Text to Speech Voice (pt-BR, Daniel, Apollo)"}, 57 | "ru-RU": {"Female": "Microsoft Server Speech Text to Speech Voice (pt-BR, Daniel, Apollo)", 58 | "Male": "Microsoft Server Speech Text to Speech Voice (ru-RU, Pavel, Apollo)"}, 59 | "zh-CN": {"Female": "Microsoft Server Speech Text to Speech Voice (zh-CN, HuihuiRUS)", 60 | "Female2": "Microsoft Server Speech Text to Speech Voice (zh-CN, Yaoyao, Apollo)", 61 | "Male": "Microsoft Server Speech Text to Speech Voice (zh-CN, Kangkang, Apollo)"}, 62 | "zh-HK": {"Female": "Microsoft Server Speech Text to Speech Voice (zh-HK, Tracy, Apollo)", 63 | "Male": "Microsoft Server Speech Text to Speech Voice (zh-HK, Danny, Apollo)"}, 64 | "zh-TW": {"Female": "Microsoft Server Speech Text to Speech Voice (zh-TW, Yating, Apollo)", 65 | "Male": "Microsoft Server Speech Text to Speech Voice (zh-TW, Zhiwei, Apollo)"} 66 | } 67 | 68 | def auth(self): 69 | if self.expire_time is None or monotonic() > self.expire_time: # first credential request, or the access token from the previous one expired 70 | # get an access token using OAuth 71 | credential_url = "https://oxford-speech.cloudapp.net/token/issueToken" 72 | credential_request = Request(credential_url, data=urlencode({ 73 | "grant_type": "client_credentials", 74 | "client_id": "python", 75 | "client_secret": self.key, 76 | "scope": "https://speech.platform.bing.com" 77 | }).encode("utf-8")) 78 | start_time = monotonic() 79 | try: 80 | credential_response = urlopen(credential_request) 81 | except HTTPError as e: 82 | 
raise RequestError("recognition request failed: {0}".format( 83 | getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6 84 | except URLError as e: 85 | raise RequestError("recognition connection failed: {0}".format(e.reason)) 86 | credential_text = credential_response.read().decode("utf-8") 87 | credentials = json.loads(credential_text) 88 | self.access_token, expiry_seconds = credentials["access_token"], float(credentials["expires_in"]) 89 | 90 | self.expire_time = start_time + expiry_seconds 91 | 92 | def recognize(self, audio_data, language="en-US", show_all=False): 93 | self.auth() 94 | wav_data = self.to_wav(audio_data) 95 | url = "https://speech.platform.bing.com/recognize/query?{0}".format(urlencode({ 96 | "version": "3.0", 97 | "requestid": uuid.uuid4(), 98 | "appID": "D4D52672-91D7-4C74-8AD8-42B1D98141A5", 99 | "format": "json", 100 | "locale": language, 101 | "device.os": "wp7", 102 | "scenarios": "ulm", 103 | "instanceid": uuid.uuid4(), 104 | "result.profanitymarkup": "0", 105 | })) 106 | request = Request(url, data=wav_data, headers={ 107 | "Authorization": "Bearer {0}".format(self.access_token), 108 | "Content-Type": "audio/wav; samplerate=16000; sourcerate={0}; trustsourcerate=true".format(16000), 109 | }) 110 | try: 111 | response = urlopen(request) 112 | except HTTPError as e: 113 | raise RequestError("recognition request failed: {0}".format( 114 | getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6 115 | except URLError as e: 116 | raise RequestError("recognition connection failed: {0}".format(e.reason)) 117 | response_text = response.read().decode("utf-8") 118 | result = json.loads(response_text) 119 | 120 | # return results 121 | if show_all: return result 122 | if "header" not in result or "lexical" not in result["header"]: raise UnknownValueError() 123 | return result["header"]["lexical"] 124 | 125 | def synthesize(self, text, language="en-US", 
gender="Female"): 126 | self.auth() 127 | 128 | if language not in self.locales.keys(): 129 | raise LocaleError("language locale not supported.") 130 | 131 | lang = self.locales.get(language) 132 | 133 | if gender not in ["Female", "Male", "Female2"]: 134 | gender = "Female" 135 | 136 | if len(lang) == 1: 137 | gender = lang.keys()[0] 138 | 139 | service_name = lang[gender] 140 | 141 | body = "\ 142 | %s\ 143 | " % (language, gender, service_name, text) 144 | 145 | headers = {"Content-type": "application/ssml+xml", 146 | "X-Microsoft-OutputFormat": "raw-16khz-16bit-mono-pcm", 147 | "Authorization": "Bearer " + self.access_token, 148 | "X-Search-AppId": "07D3234E49CE426DAA29772419F436CA", 149 | "X-Search-ClientID": str(uuid.uuid1()).replace('-', ''), 150 | "User-Agent": "TTSForPython"} 151 | 152 | url = "https://speech.platform.bing.com/synthesize" 153 | request = Request(url, data=body, headers=headers) 154 | try: 155 | response = urlopen(request) 156 | except HTTPError as e: 157 | raise RequestError("tts request failed: {0}".format( 158 | getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6 159 | except URLError as e: 160 | raise RequestError("tts connection failed: {0}".format(e.reason)) 161 | 162 | data = response.read() 163 | 164 | return data 165 | 166 | @staticmethod 167 | def to_wav(raw_data): 168 | # generate the WAV file contents 169 | with io.BytesIO() as wav_file: 170 | wav_writer = wave.open(wav_file, "wb") 171 | try: # note that we can't use context manager, since that was only added in Python 3.4 172 | wav_writer.setframerate(16000) 173 | wav_writer.setsampwidth(2) 174 | wav_writer.setnchannels(1) 175 | wav_writer.writeframes(raw_data) 176 | wav_data = wav_file.getvalue() 177 | finally: # make sure resources are cleaned up 178 | wav_writer.close() 179 | return wav_data 180 | 181 | 182 | if __name__ == '__main__': 183 | import sys 184 | 185 | if len(sys.argv) != 2: 186 | print('Usage: %s 16k_mono.wav' % 
sys.argv[0]) 187 | sys.exit(-1) 188 | 189 | wf = wave.open(sys.argv[1]) 190 | if wf.getframerate() != 16000 or wf.getnchannels() != 1 or wf.getsampwidth() != 2: 191 | print('only support 16000 sample rate, 1 channel and 2 bytes sample width') 192 | sys.exit(-2) 193 | 194 | # read less than 10 seconds audio data 195 | n = wf.getnframes() 196 | if (n / 16000.0) > 10.0: 197 | n = 16000 * 10 198 | 199 | frames = wf.readframes(n) 200 | 201 | bing = BingVoice(BING_KEY) 202 | 203 | # recognize speech using Microsoft Bing Voice Recognition 204 | try: 205 | text = bing.recognize(frames, language='en-US') 206 | print('Bing:' + text.encode('utf-8')) 207 | except UnknownValueError: 208 | print("Microsoft Bing Voice Recognition could not understand audio") 209 | except RequestError as e: 210 | print("Could not request results from Microsoft Bing Voice Recognition service; {0}".format(e)) 211 | --------------------------------------------------------------------------------