├── .gitignore
├── LICENSE
├── hello.flac
├── README.md
├── tts_google.py
└── stt_google.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | TODO Add MIT License
2 | 


--------------------------------------------------------------------------------
/hello.flac:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeysonm82/python-google-speech-scripts/HEAD/hello.flac


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Simple scripts to interact with Google speech services.
 2 | 
 3 | tts_google.py: Text to speech.
 4 | 
 5 | stt_google.py: Speech to text.
 6 | 
 7 | I only tested it on Linux.
 8 | 
 9 | REQUIREMENTS:
10 | 
11 | - pyaudio
12 | 
13 | - flac (FLAC converter for Linux)
14 | 


--------------------------------------------------------------------------------
/tts_google.py:
--------------------------------------------------------------------------------
 1 | import urllib2
 2 | import urllib
 3 | import time
 4 | import os
 5 | 
 6 | 
 7 | def speak(text='hello', lang='en', fname='result.wav', player=None):
 8 |     """ Sends text to Google's text to speech service
 9 |     and returns created speech (wav file). """
10 | 
11 |     limit = min(100, len(text))#100 characters is the current limit.
12 |     text = text[0:limit]
13 |     print "Text to speech:", text
14 |     url = "http://translate.google.com/translate_tts"
15 |     values = urllib.urlencode({"q": text, "textlen": len(text), "tl": lang})
16 |     hrs = {"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.7"}
17 |     #TODO catch exceptions
18 |     req = urllib2.Request(url, data=values, headers=hrs)
19 |     p = urllib2.urlopen(req)
20 |     f = open(fname, 'wb')
21 |     f.write(p.read())
22 |     f.close()
23 |     print "Speech saved to:", fname
24 |     if player is not None:
25 |         play_wav(fname, player)
26 | 
27 | 
def play_wav(filep, player='mplayer'):
    """Play the file ``filep`` with the external command ``player``.

    Args:
        filep: Path of the audio file; passed to the player as one argument.
        player: Player command, optionally with its own arguments
            (e.g. 'mplayer -really-quiet').

    A message is printed when the player cannot be started or exits with a
    non-zero status; no exception is raised for those cases.
    """
    # Local imports: the module's top-level import block does not provide them.
    import shlex
    import subprocess

    print("Playing %s file using %s" % (filep, player))
    try:
        # An argument list (no shell) keeps file names with spaces or shell
        # metacharacters intact.
        status = subprocess.call(shlex.split(player) + [filep])
    except OSError:
        # The player executable is missing or cannot be executed.
        status = None
    if status != 0:
        print("Couldn't use %s to play file" % (player))
35 | 
36 | 
if __name__ == '__main__':
    # The player must be passed by keyword: the second positional parameter
    # of speak() is lang, so a positional 'mplayer' would be sent as the
    # language code and nothing would be played.
    speak("Hello world. The time is %s" % (time.strftime('%H %M')),
          player='mplayer')
39 | 


--------------------------------------------------------------------------------
/stt_google.py:
--------------------------------------------------------------------------------
  1 | import pyaudio
  2 | import wave
  3 | import audioop
  4 | from collections import deque
  5 | import os
  6 | import urllib2
  7 | import urllib
  8 | import time
  9 | import math
 10 | 
 11 | LANG_CODE = 'en-US'  # Language code sent to the service (lang= parameter)
 12 | 
 13 | GOOGLE_SPEECH_URL = 'https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&pfilter=2&lang=%s&maxresults=6' % (LANG_CODE)
 14 | 
 15 | FLAC_CONV = 'flac -f'  # WAV to FLAC converter command. flac is available
 16 |                        # on Linux.
 17 | 
 18 | # Microphone stream config.
 19 | CHUNK = 1024  # Frames per buffer; also the amount read from the mic each time
 20 | FORMAT = pyaudio.paInt16
 21 | CHANNELS = 1
 22 | RATE = 16000
 23 | THRESHOLD = 2500  # The intensity threshold that separates silence from
 24 |                   # noise signal (an intensity lower than THRESHOLD is silence).
 25 | 
 26 | SILENCE_LIMIT = 1  # Silence limit in seconds. The max amount of seconds where
 27 |                    # only silence is recorded. When this time passes the
 28 |                    # recording finishes and the file is delivered.
 29 | 
 30 | PREV_AUDIO = 0.5  # Previous audio (in seconds) to prepend. When noise
 31 |                   # is detected, how much of previously recorded audio is
 32 |                   # prepended. This helps to prevent chopping the beginning
 33 |                   # of the phrase.
 34 | 
 35 | 
 36 | def audio_int(num_samples=50):
 37 |     """ Gets average audio intensity of your mic sound. You can use it to get
 38 |         average intensities while you're talking and/or silent. The average
 39 |         is the avg of the 20% largest intensities recorded.
 40 |     """
 41 | 
 42 |     print "Getting intensity values from mic."
 43 |     p = pyaudio.PyAudio()
 44 | 
 45 |     stream = p.open(format=FORMAT,
 46 |                     channels=CHANNELS,
 47 |                     rate=RATE,
 48 |                     input=True,
 49 |                     frames_per_buffer=CHUNK)
 50 | 
 51 |     values = [math.sqrt(abs(audioop.avg(stream.read(CHUNK), 4))) 
 52 |               for x in range(num_samples)] 
 53 |     values = sorted(values, reverse=True)
 54 |     r = sum(values[:int(num_samples * 0.2)]) / int(num_samples * 0.2)
 55 |     print " Finished "
 56 |     print " Average audio intensity is ", r
 57 |     stream.close()
 58 |     p.terminate()
 59 |     return r
 60 | 
 61 | 
 62 | def listen_for_speech(threshold=THRESHOLD, num_phrases=-1):
 63 |     """
 64 |     Listens to Microphone, extracts phrases from it and sends it to 
 65 |     Google's TTS service and returns response. a "phrase" is sound 
 66 |     surrounded by silence (according to threshold). num_phrases controls
 67 |     how many phrases to process before finishing the listening process 
 68 |     (-1 for infinite). 
 69 |     """
 70 | 
 71 |     #Open stream
 72 |     p = pyaudio.PyAudio()
 73 | 
 74 |     stream = p.open(format=FORMAT,
 75 |                     channels=CHANNELS,
 76 |                     rate=RATE,
 77 |                     input=True,
 78 |                     frames_per_buffer=CHUNK)
 79 | 
 80 |     print "* Listening mic. "
 81 |     audio2send = []
 82 |     cur_data = ''  # current chunk  of audio data
 83 |     rel = RATE/CHUNK
 84 |     slid_win = deque(maxlen=SILENCE_LIMIT * rel)
 85 |     #Prepend audio from 0.5 seconds before noise was detected
 86 |     prev_audio = deque(maxlen=PREV_AUDIO * rel) 
 87 |     started = False
 88 |     n = num_phrases
 89 |     response = []
 90 | 
 91 |     while (num_phrases == -1 or n > 0):
 92 |         cur_data = stream.read(CHUNK)
 93 |         slid_win.append(math.sqrt(abs(audioop.avg(cur_data, 4))))
 94 |         #print slid_win[-1]
 95 |         if(sum([x > THRESHOLD for x in slid_win]) > 0):
 96 |             if(not started):
 97 |                 print "Starting record of phrase"
 98 |                 started = True
 99 |             audio2send.append(cur_data)
100 |         elif (started is True):
101 |             print "Finished"
102 |             # The limit was reached, finish capture and deliver.
103 |             filename = save_speech(list(prev_audio) + audio2send, p)
104 |             # Send file to Google and get response
105 |             r = stt_google_wav(filename) 
106 |             if num_phrases == -1:
107 |                 print "Response", r
108 |             else:
109 |                 response.append(r)
110 |             # Remove temp file. Comment line to review.
111 |             os.remove(filename)
112 |             # Reset all
113 |             started = False
114 |             slid_win = deque(maxlen=SILENCE_LIMIT * rel)
115 |             prev_audio = deque(maxlen=0.5 * rel) 
116 |             audio2send = []
117 |             n -= 1
118 |             print "Listening ..."
119 |         else:
120 |             prev_audio.append(cur_data)
121 | 
122 |     print "* Done recording"
123 |     stream.close()
124 |     p.terminate()
125 | 
126 |     return response
127 | 
128 | 
def save_speech(data, p, rate=RATE, channels=CHANNELS):
    """Save mic data to a temporary WAV file and return its filename.

    Args:
        data: Sequence of raw audio chunks as read from the stream.
        p: The pyaudio.PyAudio instance, used to look up the sample width
            for FORMAT.
        rate: Frame rate written to the WAV header (defaults to RATE).
        channels: Channel count written to the WAV header (defaults to
            CHANNELS).

    Returns:
        The name of the written file, 'output_<unix time>.wav', created in
        the current working directory.
    """
    filename = 'output_%d.wav' % int(time.time())
    wf = wave.open(filename, 'wb')
    try:
        wf.setnchannels(channels)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(rate)
        wf.writeframes(b''.join(data))
    finally:
        # Close the file even when writing the header or frames fails.
        wf.close()
    return filename
143 | 
144 | 
def stt_google_wav(audio_fname):
    """Send an audio file to Google's speech to text service.

    If ``audio_fname`` does not end in '.flac' it is first converted with
    the FLAC_CONV command; the converted file is expected next to the input
    with a '.flac' extension and is removed afterwards.

    Args:
        audio_fname: Path of the audio file to send.

    Returns:
        The 'hypotheses' entry of the decoded service response, or None if
        the request failed or the response could not be parsed.

    Raises:
        IOError: If the (converted) FLAC file cannot be opened.
    """
    # Local imports: the module's top-level import block does not provide them.
    import json
    import shlex
    import subprocess

    print("Sending  %s" % audio_fname)
    # Convert to flac first
    filename = audio_fname
    del_flac = False
    # Check the extension; a substring test misfires on names that merely
    # contain 'flac'.
    if not filename.lower().endswith('.flac'):
        del_flac = True
        print("Converting to flac")
        # splitext keeps dots in directory names or in the base name intact.
        filename = os.path.splitext(audio_fname)[0] + '.flac'
        cmd = shlex.split(FLAC_CONV) + [audio_fname]
        print(' '.join(cmd))
        try:
            # An argument list (no shell) is safe for unusual file names.
            subprocess.call(cmd)
        except OSError as e:
            # The converter is missing; opening the file below reports it.
            print("Couldn't run FLAC converter: %s" % e)

    try:
        with open(filename, 'rb') as f:
            flac_cont = f.read()

        # Headers. A common Chromium (Linux) User-Agent
        hrs = {"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.7",
               'Content-type': 'audio/x-flac; rate=16000'}

        req = urllib2.Request(GOOGLE_SPEECH_URL, data=flac_cont, headers=hrs)
        print("Sending request to Google TTS")
        try:
            p = urllib2.urlopen(req)
            try:
                response = p.read()
            finally:
                p.close()
            # SECURITY: the body comes from the network, so it is decoded as
            # JSON rather than passed to eval(), which would execute it.
            res = json.loads(response)['hypotheses']
        except Exception:
            # Network failure or a malformed/unexpected response body.
            print("Couldn't parse service response")
            res = None
    finally:
        if del_flac and os.path.exists(filename):
            os.remove(filename)  # Remove temp file

    return res
184 | 
185 | 
if __name__ == '__main__':
    # Default action when run as a script: listen to the mic.
    # Alternatives to try instead:
    #   stt_google_wav('hello.flac') -- translate an audio file (print result)
    #   audio_int()                  -- measure your mic levels
    listen_for_speech()
190 | 


--------------------------------------------------------------------------------