├── README.md
├── requirements.txt
├── websocket_client.py
└── websocket_server.py


/README.md:
--------------------------------------------------------------------------------
 1 | # Google-speech-to-text-python-websocket-server-using-microphone-stream
 2 | A Python WebSocket server that converts an audio stream from a microphone into text using Google Cloud Speech-to-Text
 3 | 
 4 | <b>Setup</b>
 5 | 
 6 | 1) Clone the repo 
 7 | 
 8 |     $ git clone https://github.com/dawntcherian/Google-speech-to-text-python-websocket-server-using-microphone-stream.git
 9 | 2) Install pip and virtualenv if you do not already have them. 
10 | 3) Create a virtualenv with Python 3.6.4
11 | 4) Install the dependencies
12 |     
13 |     $ pip install -r requirements.txt
14 | 5) Provide authentication credentials to your application code by setting the environment variable GOOGLE_APPLICATION_CREDENTIALS. (Follow https://cloud.google.com/docs/authentication/getting-started)
15 | 6) Run the server first, then start the client in a second terminal
16 | 
17 |     $ python websocket_server.py
18 |     
19 |     $ python websocket_client.py
20 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | cachetools==3.0.0
 2 | certifi==2018.10.15
 3 | chardet==3.0.4
 4 | google-api-core==1.5.2
 5 | google-auth==1.6.1
 6 | google-cloud-speech==0.36.0
 7 | googleapis-common-protos==1.5.5
 8 | grpcio==1.16.0
 9 | idna==2.7
10 | protobuf==3.6.1
11 | pyasn1==0.4.4
12 | pyasn1-modules==0.2.2
13 | PyAudio==0.2.11
14 | pytz==2018.7
15 | requests==2.20.1
16 | rsa==4.0
17 | six==1.11.0
18 | urllib3==1.24.1
19 | websockets==6.0
20 | 


--------------------------------------------------------------------------------
/websocket_client.py:
--------------------------------------------------------------------------------
 1 | from __future__ import division
 2 | import asyncio
 3 | import websockets
 4 | import json
 5 | import pyaudio
 6 | from google.cloud.speech import enums
 7 | 
 8 | FORMAT = pyaudio.paInt16
 9 | CHANNELS = 1
10 | RATE = 16000
11 | CHUNK = int(RATE / 10)
12 | 
13 | audio = pyaudio.PyAudio()
14 | 
15 | stream = audio.open(format=FORMAT,
16 |                     channels=CHANNELS,
17 |                     rate=RATE,
18 |                     input=True,
19 |                     frames_per_buffer=CHUNK)
20 | 
21 | 
async def microphone_client():
    """
    Stream microphone audio to the speech WebSocket server.

    The first message is a JSON text frame carrying the sample rate, the
    audio encoding and the language code; every following message is a
    binary frame with CHUNK frames of raw audio read from the module-level
    ``stream``. Runs until the connection fails or the task is interrupted.
    """
    loop = asyncio.get_event_loop()
    # 0.0.0.0 is the wildcard address the server binds to; it is not a
    # portable destination (connecting to it fails on some platforms), so
    # dial the loopback address instead.
    async with websockets.connect(
            'ws://127.0.0.1:8000/') as websocket:
        await websocket.send(json.dumps({
            "rate": RATE,
            "format": enums.RecognitionConfig.AudioEncoding.LINEAR16,
            "language": 'en-IN'
        }))
        while True:
            # stream.read blocks until a full chunk has been captured; run it
            # in the default executor so the event loop stays free to service
            # the websocket while we wait for audio.
            data = await loop.run_in_executor(None, stream.read, CHUNK)
            await websocket.send(data)
33 | 
34 | 
# Run the client until it fails or is interrupted, then always release the
# audio device: stop and close the input stream and terminate PyAudio.
try:
    asyncio.get_event_loop().run_until_complete(microphone_client())
finally:
    stream.stop_stream()
    stream.close()
    audio.terminate()
36 | 


--------------------------------------------------------------------------------
/websocket_server.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import websockets
  3 | import json
  4 | import threading
  5 | from six.moves import queue
  6 | from google.cloud import speech
  7 | from google.cloud.speech import types
  8 | 
  9 | 
 10 | IP = '0.0.0.0'
 11 | PORT = 8000
 12 | 
 13 | class Transcoder(object):
 14 |     """
 15 |     Converts audio chunks to text
 16 |     """
 17 |     def __init__(self, encoding, rate, language):
 18 |         self.buff = queue.Queue()
 19 |         self.encoding = encoding
 20 |         self.language = language
 21 |         self.rate = rate
 22 |         self.closed = True
 23 |         self.transcript = None
 24 | 
 25 |     def start(self):
 26 |         """Start up streaming speech call"""
 27 |         threading.Thread(target=self.process).start()
 28 | 
 29 |     def response_loop(self, responses):
 30 |         """
 31 |         Pick up the final result of Speech to text conversion
 32 |         """
 33 |         for response in responses:
 34 |             if not response.results:
 35 |                 continue
 36 |             result = response.results[0]
 37 |             if not result.alternatives:
 38 |                 continue
 39 |             transcript = result.alternatives[0].transcript
 40 |             if result.is_final:
 41 |                 self.transcript = transcript
 42 | 
 43 |     def process(self):
 44 |         """
 45 |         Audio stream recognition and result parsing
 46 |         """
 47 |         #You can add speech contexts for better recognition
 48 |         cap_speech_context = types.SpeechContext(phrases=["Add your phrases here"])
 49 |         client = speech.SpeechClient()
 50 |         config = types.RecognitionConfig(
 51 |             encoding=self.encoding,
 52 |             sample_rate_hertz=self.rate,
 53 |             language_code=self.language,
 54 |             speech_contexts=[cap_speech_context,],
 55 |             model='command_and_search'
 56 |         )
 57 |         streaming_config = types.StreamingRecognitionConfig(
 58 |             config=config,
 59 |             interim_results=False,
 60 |             single_utterance=False)
 61 |         audio_generator = self.stream_generator()
 62 |         requests = (types.StreamingRecognizeRequest(audio_content=content)
 63 |                     for content in audio_generator)
 64 | 
 65 |         responses = client.streaming_recognize(streaming_config, requests)
 66 |         try:
 67 |             self.response_loop(responses)
 68 |         except:
 69 |             self.start()
 70 | 
 71 |     def stream_generator(self):
 72 |         while not self.closed:
 73 |             chunk = self.buff.get()
 74 |             if chunk is None:
 75 |                 return
 76 |             data = [chunk]
 77 |             while True:
 78 |                 try:
 79 |                     chunk = self.buff.get(block=False)
 80 |                     if chunk is None:
 81 |                         return
 82 |                     data.append(chunk)
 83 |                 except queue.Empty:
 84 |                     break
 85 |             yield b''.join(data)
 86 | 
 87 |     def write(self, data):
 88 |         """
 89 |         Writes data to the buffer
 90 |         """
 91 |         self.buff.put(data)
 92 | 
 93 | 
 94 | async def audio_processor(websocket, path):
 95 |     """
 96 |     Collects audio from the stream, writes it to buffer and return the output of Google speech to text
 97 |     """
 98 |     config = await websocket.recv()
 99 |     if not isinstance(config, str):
100 |         print("ERROR, no config")
101 |         return
102 |     config = json.loads(config)
103 |     transcoder = Transcoder(
104 |         encoding=config["format"],
105 |         rate=config["rate"],
106 |         language=config["language"]
107 |     )
108 |     transcoder.start()
109 |     while True:
110 |         try:
111 |             data = await websocket.recv()
112 |         except websockets.ConnectionClosed:
113 |             print("Connection closed")
114 |             break
115 |         transcoder.write(data)
116 |         transcoder.closed = False
117 |         if transcoder.transcript:
118 |             print(transcoder.transcript)
119 |             await websocket.send(transcoder.transcript)
120 |             transcoder.transcript = None
121 | 
# Bind the handler to IP:PORT, bring the listener up on the default event
# loop, then keep serving connections until the process is stopped.
start_server = websockets.serve(audio_processor, host=IP, port=PORT)
event_loop = asyncio.get_event_loop()
event_loop.run_until_complete(start_server)
event_loop.run_forever()
125 | 
126 | 


--------------------------------------------------------------------------------