├── README.md ├── requirements.txt ├── websocket_client.py └── websocket_server.py /README.md: -------------------------------------------------------------------------------- 1 | # Google-speech-to-text-python-websocket-server-using-microphone-stream 2 | Python WebSocket server which converts input audio stream from microphone to text using Google speech to text 3 | 4 | Setup 5 | 6 | 1) Clone the repo 7 | 8 | $ git clone https://github.com/dawntcherian/Google-speech-to-text-python-websocket-server-using-microphone-stream.git 9 | 2) Install pip and virtualenv if you do not already have them. 10 | 3) Create a virtualenv with Python 3.6.4 11 | 4) Install the dependencies 12 | 13 | $ pip install -r requirements.txt 14 | 5) Provide authentication credentials to your application code by setting the environment variable GOOGLE_APPLICATION_CREDENTIALS. (Follow https://cloud.google.com/docs/authentication/getting-started) 15 | 6) Run 16 | 17 | $ python websocket_server.py 18 | 19 | $ python websocket_client.py 20 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cachetools==3.0.0 2 | certifi==2018.10.15 3 | chardet==3.0.4 4 | google-api-core==1.5.2 5 | google-auth==1.6.1 6 | google-cloud-speech==0.36.0 7 | googleapis-common-protos==1.5.5 8 | grpcio==1.16.0 9 | idna==2.7 10 | protobuf==3.6.1 11 | pyasn1==0.4.4 12 | pyasn1-modules==0.2.2 13 | PyAudio==0.2.11 14 | pytz==2018.7 15 | requests==2.20.1 16 | rsa==4.0 17 | six==1.11.0 18 | urllib3==1.24.1 19 | websockets==6.0 20 | -------------------------------------------------------------------------------- /websocket_client.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import asyncio 3 | import websockets 4 | import json 5 | import pyaudio 6 | from google.cloud.speech import enums 7 | 8 | FORMAT = pyaudio.paInt16 9 | CHANNELS = 1 10 | RATE = 16000 11 | CHUNK = int(RATE / 10) 12 | 13 | audio = pyaudio.PyAudio() 14 | 15 | stream = audio.open(format=FORMAT, 16 | channels=CHANNELS, 17 | rate=RATE, 18 | input=True, 19 | frames_per_buffer=CHUNK) 20 | 21 | 22 | async def microphone_client(): 23 | async with websockets.connect( 24 | 'ws://0.0.0.0:8000/') as websocket: 25 | await websocket.send(json.dumps({ 26 | "rate": RATE, 27 | "format": enums.RecognitionConfig.AudioEncoding.LINEAR16, 28 | "language": 'en-IN' 29 | })) 30 | while True: 31 | data = stream.read(CHUNK) 32 | await websocket.send(data) 33 | 34 | 35 | asyncio.get_event_loop().run_until_complete(microphone_client()) 36 | -------------------------------------------------------------------------------- /websocket_server.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import websockets 3 | import json 4 | import threading 5 | from six.moves import queue 6 | from google.cloud import speech 7 | from google.cloud.speech import types 8 | 9 | 10 | IP = '0.0.0.0' 11 | PORT = 8000 12 | 13 | class Transcoder(object): 14 | """ 15 | Converts audio chunks to text 16 | """ 17 | def __init__(self, encoding, rate, language): 18 | self.buff = queue.Queue() 19 | self.encoding = encoding 20 | self.language = language 21 | self.rate = rate 22 | self.closed = True 23 | self.transcript = None 24 | 25 | def start(self): 26 | """Start up streaming speech call""" 27 | threading.Thread(target=self.process).start() 28 | 29 | def response_loop(self, responses): 30 | """ 31 | Pick up the final result of Speech to text conversion 32 | """ 33 | for response in responses: 34 | if not response.results: 35 | continue 36 | result = response.results[0] 37 | if not result.alternatives: 38 | continue 39 | transcript = result.alternatives[0].transcript 40 | if result.is_final: 41 | self.transcript = transcript 42 | 43 | def process(self): 44 | """ 45 | Audio stream recognition and result parsing 46 | """ 47 | #You can add speech contexts for better recognition 48 | cap_speech_context = types.SpeechContext(phrases=["Add your phrases here"]) 49 | client = speech.SpeechClient() 50 | config = types.RecognitionConfig( 51 | encoding=self.encoding, 52 | sample_rate_hertz=self.rate, 53 | language_code=self.language, 54 | speech_contexts=[cap_speech_context,], 55 | model='command_and_search' 56 | ) 57 | streaming_config = types.StreamingRecognitionConfig( 58 | config=config, 59 | interim_results=False, 60 | single_utterance=False) 61 | audio_generator = self.stream_generator() 62 | requests = (types.StreamingRecognizeRequest(audio_content=content) 63 | for content in audio_generator) 64 | 65 | responses = client.streaming_recognize(streaming_config, requests) 66 | try: 67 | self.response_loop(responses) 68 | except: 69 | self.start() 70 | 71 | def stream_generator(self): 72 | while not self.closed: 73 | chunk = self.buff.get() 74 | if chunk is None: 75 | return 76 | data = [chunk] 77 | while True: 78 | try: 79 | chunk = self.buff.get(block=False) 80 | if chunk is None: 81 | return 82 | data.append(chunk) 83 | except queue.Empty: 84 | break 85 | yield b''.join(data) 86 | 87 | def write(self, data): 88 | """ 89 | Writes data to the buffer 90 | """ 91 | self.buff.put(data) 92 | 93 | 94 | async def audio_processor(websocket, path): 95 | """ 96 | Collects audio from the stream, writes it to buffer and return the output of Google speech to text 97 | """ 98 | config = await websocket.recv() 99 | if not isinstance(config, str): 100 | print("ERROR, no config") 101 | return 102 | config = json.loads(config) 103 | transcoder = Transcoder( 104 | encoding=config["format"], 105 | rate=config["rate"], 106 | language=config["language"] 107 | ) 108 | transcoder.start() 109 | while True: 110 | try: 111 | data = await websocket.recv() 112 | except websockets.ConnectionClosed: 113 | print("Connection closed") 114 | break 115 | transcoder.write(data) 116 | transcoder.closed = False 117 | if transcoder.transcript: 118 | print(transcoder.transcript) 119 | await websocket.send(transcoder.transcript) 120 | transcoder.transcript = None 121 | 122 | start_server = websockets.serve(audio_processor, IP, PORT) 123 | asyncio.get_event_loop().run_until_complete(start_server) 124 | asyncio.get_event_loop().run_forever() 125 | 126 | --------------------------------------------------------------------------------