├── .gitignore ├── LICENSE ├── hello.flac ├── README.md ├── tts_google.py └── stt_google.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | TODO Add MIT License 2 | -------------------------------------------------------------------------------- /hello.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeysonm82/python-google-speech-scripts/HEAD/hello.flac -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Simple scripts to interact with Google speech services. 2 | 3 | tts_google.py : Text to speech. 4 | 5 | stt_google.py: Speech to text. 6 | 7 | I only tested it on Linux. 8 | 9 | REQUIREMENTS: 10 | 11 | - pyaudio 12 | 13 | - flac (FLAC converter for Linux) 14 | -------------------------------------------------------------------------------- /tts_google.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import urllib 3 | import time 4 | import os 5 | 6 | 7 | def speak(text='hello', lang='en', fname='result.wav', player=None): 8 | """ Sends text to Google's text to speech service 9 | and returns created speech (wav file). """ 10 | 11 | limit = min(100, len(text))#100 characters is the current limit. 12 | text = text[0:limit] 13 | print "Text to speech:", text 14 | url = "http://translate.google.com/translate_tts" 15 | values = urllib.urlencode({"q": text, "textlen": len(text), "tl": lang}) 16 | hrs = {"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.7"} 17 | #TODO catch exceptions 18 | req = urllib2.Request(url, data=values, headers=hrs) 19 | p = urllib2.urlopen(req) 20 | f = open(fname, 'wb') 21 | f.write(p.read()) 22 | f.close() 23 | print "Speech saved to:", fname 24 | if player is not None: 25 | play_wav(fname, player) 26 | 27 | 28 | def play_wav(filep, player='mplayer'): 29 | ''' Plays filep using player ''' 30 | print "Playing %s file using %s" % (filep, player) 31 | try: 32 | os.system(player + " " + filep) 33 | except: 34 | print "Couldn't use %s to play file" % (player) 35 | 36 | 37 | if(__name__ == '__main__'): 38 | speak("Hello world. The time is %s" % (time.strftime('%H %M')), 'mplayer') 39 | -------------------------------------------------------------------------------- /stt_google.py: -------------------------------------------------------------------------------- 1 | import pyaudio 2 | import wave 3 | import audioop 4 | from collections import deque 5 | import os 6 | import urllib2 7 | import urllib 8 | import time 9 | import math 10 | 11 | LANG_CODE = 'en-US' # Language to use 12 | 13 | GOOGLE_SPEECH_URL = 'https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&pfilter=2&lang=%s&maxresults=6' % (LANG_CODE) 14 | 15 | FLAC_CONV = 'flac -f' # We need a WAV to FLAC converter. flac is available 16 | # on Linux 17 | 18 | # Microphone stream config. 19 | CHUNK = 1024 # CHUNKS of bytes to read each time from mic 20 | FORMAT = pyaudio.paInt16 21 | CHANNELS = 1 22 | RATE = 16000 23 | THRESHOLD = 2500 # The threshold intensity that defines silence 24 | # and noise signal (an int. lower than THRESHOLD is silence). 25 | 26 | SILENCE_LIMIT = 1 # Silence limit in seconds. The max ammount of seconds where 27 | # only silence is recorded. When this time passes the 28 | # recording finishes and the file is delivered. 29 | 30 | PREV_AUDIO = 0.5 # Previous audio (in seconds) to prepend. When noise 31 | # is detected, how much of previously recorded audio is 32 | # prepended. This helps to prevent chopping the beggining 33 | # of the phrase. 34 | 35 | 36 | def audio_int(num_samples=50): 37 | """ Gets average audio intensity of your mic sound. You can use it to get 38 | average intensities while you're talking and/or silent. The average 39 | is the avg of the 20% largest intensities recorded. 40 | """ 41 | 42 | print "Getting intensity values from mic." 43 | p = pyaudio.PyAudio() 44 | 45 | stream = p.open(format=FORMAT, 46 | channels=CHANNELS, 47 | rate=RATE, 48 | input=True, 49 | frames_per_buffer=CHUNK) 50 | 51 | values = [math.sqrt(abs(audioop.avg(stream.read(CHUNK), 4))) 52 | for x in range(num_samples)] 53 | values = sorted(values, reverse=True) 54 | r = sum(values[:int(num_samples * 0.2)]) / int(num_samples * 0.2) 55 | print " Finished " 56 | print " Average audio intensity is ", r 57 | stream.close() 58 | p.terminate() 59 | return r 60 | 61 | 62 | def listen_for_speech(threshold=THRESHOLD, num_phrases=-1): 63 | """ 64 | Listens to Microphone, extracts phrases from it and sends it to 65 | Google's TTS service and returns response. a "phrase" is sound 66 | surrounded by silence (according to threshold). num_phrases controls 67 | how many phrases to process before finishing the listening process 68 | (-1 for infinite). 69 | """ 70 | 71 | #Open stream 72 | p = pyaudio.PyAudio() 73 | 74 | stream = p.open(format=FORMAT, 75 | channels=CHANNELS, 76 | rate=RATE, 77 | input=True, 78 | frames_per_buffer=CHUNK) 79 | 80 | print "* Listening mic. " 81 | audio2send = [] 82 | cur_data = '' # current chunk of audio data 83 | rel = RATE/CHUNK 84 | slid_win = deque(maxlen=SILENCE_LIMIT * rel) 85 | #Prepend audio from 0.5 seconds before noise was detected 86 | prev_audio = deque(maxlen=PREV_AUDIO * rel) 87 | started = False 88 | n = num_phrases 89 | response = [] 90 | 91 | while (num_phrases == -1 or n > 0): 92 | cur_data = stream.read(CHUNK) 93 | slid_win.append(math.sqrt(abs(audioop.avg(cur_data, 4)))) 94 | #print slid_win[-1] 95 | if(sum([x > THRESHOLD for x in slid_win]) > 0): 96 | if(not started): 97 | print "Starting record of phrase" 98 | started = True 99 | audio2send.append(cur_data) 100 | elif (started is True): 101 | print "Finished" 102 | # The limit was reached, finish capture and deliver. 103 | filename = save_speech(list(prev_audio) + audio2send, p) 104 | # Send file to Google and get response 105 | r = stt_google_wav(filename) 106 | if num_phrases == -1: 107 | print "Response", r 108 | else: 109 | response.append(r) 110 | # Remove temp file. Comment line to review. 111 | os.remove(filename) 112 | # Reset all 113 | started = False 114 | slid_win = deque(maxlen=SILENCE_LIMIT * rel) 115 | prev_audio = deque(maxlen=0.5 * rel) 116 | audio2send = [] 117 | n -= 1 118 | print "Listening ..." 119 | else: 120 | prev_audio.append(cur_data) 121 | 122 | print "* Done recording" 123 | stream.close() 124 | p.terminate() 125 | 126 | return response 127 | 128 | 129 | def save_speech(data, p): 130 | """ Saves mic data to temporary WAV file. Returns filename of saved 131 | file """ 132 | 133 | filename = 'output_'+str(int(time.time())) 134 | # writes data to WAV file 135 | data = ''.join(data) 136 | wf = wave.open(filename + '.wav', 'wb') 137 | wf.setnchannels(1) 138 | wf.setsampwidth(p.get_sample_size(pyaudio.paInt16)) 139 | wf.setframerate(16000) # TODO make this value a function parameter? 140 | wf.writeframes(data) 141 | wf.close() 142 | return filename + '.wav' 143 | 144 | 145 | def stt_google_wav(audio_fname): 146 | """ Sends audio file (audio_fname) to Google's text to speech 147 | service and returns service's response. We need a FLAC 148 | converter if audio is not FLAC (check FLAC_CONV). """ 149 | 150 | print "Sending ", audio_fname 151 | #Convert to flac first 152 | filename = audio_fname 153 | del_flac = False 154 | if 'flac' not in filename: 155 | del_flac = True 156 | print "Converting to flac" 157 | print FLAC_CONV + filename 158 | os.system(FLAC_CONV + ' ' + filename) 159 | filename = filename.split('.')[0] + '.flac' 160 | 161 | f = open(filename, 'rb') 162 | flac_cont = f.read() 163 | f.close() 164 | 165 | # Headers. A common Chromium (Linux) User-Agent 166 | hrs = {"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.7", 167 | 'Content-type': 'audio/x-flac; rate=16000'} 168 | 169 | req = urllib2.Request(GOOGLE_SPEECH_URL, data=flac_cont, headers=hrs) 170 | print "Sending request to Google TTS" 171 | #print "response", response 172 | try: 173 | p = urllib2.urlopen(req) 174 | response = p.read() 175 | res = eval(response)['hypotheses'] 176 | except: 177 | print "Couldn't parse service response" 178 | res = None 179 | 180 | if del_flac: 181 | os.remove(filename) # Remove temp file 182 | 183 | return res 184 | 185 | 186 | if(__name__ == '__main__'): 187 | listen_for_speech() # listen to mic. 188 | #print stt_google_wav('hello.flac') # translate audio file 189 | #audio_int() # To measure your mic levels 190 | --------------------------------------------------------------------------------