├── LICENSE ├── build_diarized_transcript.py ├── deepgram_streaming.py └── twilio ├── node ├── LICENSE ├── README.md ├── twilio-api-scripts │ └── stream.js ├── twilio-proxy-mono.js └── twilio-proxy-stereo.js └── python ├── LICENSE ├── README.md ├── twilio-api-scripts └── stream.py ├── twilio-proxy-mono.py └── twilio-proxy-stereo.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 deepgram 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /build_diarized_transcript.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Usage: Pipe output into an executable version of this script. 
# E.g.: curl | build_diarized_transcript.py

import json
import sys


def _print_run(run, speaker):
    """Print one transcript line for a run of consecutive words by `speaker`."""
    print("[{:.3f} - {:.3f}] {}: {}".format(
        run[0]["start"], run[-1]["end"], speaker,
        ' '.join(w["word"] for w in run)))


def parse_response(res):
    """Print a per-channel, per-speaker transcript of a diarized response.

    Args:
        res: JSON text containing `metadata.duration`, `metadata.channels`
            and, for every channel index `i`,
            `results.channels[i].alternatives[0].words`; each word is read
            for its `word`, `start`, `end` and `speaker` keys.

    Returns:
        None. Everything is written to stdout.

    Raises:
        ValueError: if `res` is not valid JSON (raised by `json.loads`).
        KeyError / IndexError: if an expected key or channel is missing.
    """
    res = json.loads(res)
    metadata = res['metadata']
    channels = metadata['channels']
    print('Duration: {:.3f}'.format(metadata['duration']))
    # Print the channel count itself; the literal string 'channels' was
    # being formatted here before.
    print('Channels: {}'.format(channels))
    print()
    for channel in range(channels):
        if channel != 0:
            # Blank line between consecutive channel sections.
            print()
        print('Channel: {} of {}'.format(channel+1, channels))
        words = res['results']['channels'][channel]['alternatives'][0]['words']
        speaker = None
        run = []
        for word in words:
            # A change of speaker closes the current run of words.
            if speaker is not None and speaker != word["speaker"]:
                _print_run(run, speaker)
                run = []
            run.append(word)
            speaker = word["speaker"]
        if run:
            # Flush the last run of the channel.
            _print_run(run, speaker)


if __name__ == '__main__':
    sys.exit(parse_response(sys.stdin.read()) or 0)

# --------------------------------------------------------------------------------
# /deepgram_streaming.py:
# --------------------------------------------------------------------------------
# (The flattened listing places the header of the next file on this same
# source line; it is preserved here as comments.)
#
# """ A simple example which prints out parsed streaming responses.
#     Python version: 3.6+
#     Dependencies (use `pip install X` to install a dependency):
#       - websockets
#     Usage:
#       python deepgram_streaming.py -k 'YOUR_DEEPGRAM_API_KEY' /path/to/audio.wav
#     Limitations:
#       - Only parses signed, 16-bit little-endian encoded WAV files.
# """
#
# import argparse
# import asyncio
# import base64
# import json
# import sys
# import wave
# import websockets
# import subprocess
#
# # Mimic sending a real-time stream by sending this many seconds of audio at a time.
# The imports of deepgram_streaming.py sit on the previous source line of the
# flattened listing; they are repeated here so this listing is self-contained.
import argparse
import asyncio
import base64
import json
import sys
import wave
import websockets
import subprocess

# Mimic sending a real-time stream by sending this many seconds of audio at a time.
REALTIME_RESOLUTION = 0.100


async def run(data, key, channels, sample_width, sample_rate, filepath):
    """Stream raw audio to Deepgram at real-time pace and print final transcripts.

    Args:
        data: Raw audio frames (bytes) to send.
        key: Deepgram API key, sent as `Authorization: Token <key>`.
        channels: Number of audio channels in `data`.
        sample_width: Bytes per sample.
        sample_rate: Samples per second.
        filepath: Path of the source file. Kept for interface compatibility;
            it is not used here.

    Raises:
        Exception: any error raised while sending or receiving is propagated
            to the caller.
    """
    # How many bytes are contained in one second of audio.
    byte_rate = sample_width * sample_rate * channels
    # How many bytes are in `REALTIME_RESOLUTION` seconds of audio? At least
    # one, so the send loop always makes progress.
    chunk_size = max(1, int(byte_rate * REALTIME_RESOLUTION))
    print('This demonstration will print all finalized results, not interim results.')

    # Connect to the real-time streaming endpoint, attaching our credentials.
    async with websockets.connect(
        # Alter the protocol and base URL below.
        f'wss://api.deepgram.com/v1/listen?punctuate=true&channels={channels}&sample_rate={sample_rate}&encoding=linear16',
        extra_headers={
            'Authorization': 'Token {}'.format(key)
        }
    ) as ws:
        async def sender(ws):
            """ Sends the data, mimicking a real-time connection.
            """
            try:
                # Walk the buffer by offset instead of re-slicing the
                # remainder on every iteration (which copies it each time).
                for offset in range(0, len(data), chunk_size):
                    await ws.send(data[offset:offset + chunk_size])
                    # Mimic real-time by waiting `REALTIME_RESOLUTION` seconds
                    # before the next packet.
                    await asyncio.sleep(REALTIME_RESOLUTION)

                # An empty binary message tells Deepgram that no more audio
                # will be sent. Deepgram will close the connection once all
                # audio has finished processing.
                await ws.send(b'')
            except Exception as e:
                print(f'Error while sending: {e}')
                raise

        async def receiver(ws):
            """ Print out the messages received from the server.
            """
            async for msg in ws:
                res = json.loads(msg)
                try:
                    # To see interim results in this demo, remove the conditional `if res['is_final']:`.
                    if res['is_final']:
                        transcript = res['channel']['alternatives'][0]['transcript']
                        print(f'{transcript}')
                except KeyError:
                    # Not a transcription result (e.g. metadata); show it raw.
                    print(msg)

        # gather() re-raises the first failure from either task; with
        # asyncio.wait() such failures were parked on the task and never seen.
        await asyncio.gather(sender(ws), receiver(ws))
    print()


def parse_args():
    """ Parses the command-line arguments.

    Returns:
        argparse.Namespace with `key` (API key) and `input` (audio file path).
    """
    parser = argparse.ArgumentParser(description='Submits data to the real-time streaming endpoint.')
    parser.add_argument('-k', '--key', required=True, help='YOUR_DEEPGRAM_API_KEY (authorization)')
    parser.add_argument('input', help='Input file.')
    return parser.parse_args()


def main():
    """ Entrypoint for the example.

    Raises:
        ValueError: if the WAV file does not use 2-byte (16-bit) samples.
    """
    # Parse the command-line arguments.
    args = parse_args()

    # Open the audio file.
    with wave.open(args.input, 'rb') as fh:
        (channels, sample_width, sample_rate, num_samples, _, _) = fh.getparams()
        # Explicit check rather than `assert`, which is stripped under -O.
        if sample_width != 2:
            raise ValueError('WAV data must be 16-bit.')
        data = fh.readframes(num_samples)
    print(f'Channels = {channels}, Sample Rate = {sample_rate} Hz, Sample width = {sample_width} bytes, Size = {len(data)} bytes', file=sys.stderr)

    # Run the example.
    asyncio.get_event_loop().run_until_complete(run(data, args.key, channels, sample_width, sample_rate, args.input))
102 | asyncio.get_event_loop().run_until_complete(run(data, args.key, channels, sample_width, sample_rate, args.input)) 103 | 104 | if __name__ == '__main__': 105 | sys.exit(main() or 0) 106 | -------------------------------------------------------------------------------- /twilio/node/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 deepgram 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /twilio/node/README.md: -------------------------------------------------------------------------------- 1 | # twilio integration 2 | 3 | Two node scripts are provided which can do basic twilio-deepgram proxying. 
Run `node twilio-proxy-mono.js` or `node twilio-proxy-stereo.js` 4 | to run the proxy server for either just the inbound track, or both the inbound and outbound tracks, respectively. They have the following dependencies: 5 | 6 | ``` 7 | npm install cross-fetch ws 8 | npm install cross-fetch @deepgram/sdk 9 | ``` 10 | 11 | Refer to the twilio documentation on streaming tracks for more info on mono vs stereo twilio streaming: 12 | 13 | https://www.twilio.com/docs/voice/twiml/stream#attributes-track 14 | 15 | In either script, you will need to add your Deepgram API Key where it says `INSERT_YOUR_API_KEY_HERE`, 16 | as well as set up twilio to send websocket data to the server running the scripts. This is done via 17 | TwiML Bin files: 18 | 19 | https://www.twilio.com/docs/runtime/tutorials/twiml-bins 20 | 21 | My TwiML Bin files ended up looking like the following for mono (replace the URL with the address of the server running the script): 22 | 23 | ``` 24 | <?xml version="1.0" encoding="UTF-8"?> 25 | <Response> 26 | <Start> 27 | <Stream url="wss://your-server.example.com" /> 28 | </Start> 29 | <Dial>my-phone-number</Dial> 30 | </Response> 31 | ``` 32 | 33 | and the following for stereo (just adding the extra `track` parameter): 34 | 35 | ``` 36 | <?xml version="1.0" encoding="UTF-8"?> 37 | <Response> 38 | <Start> 39 | <Stream url="wss://your-server.example.com" track="both_tracks" /> 40 | </Start> 41 | <Dial>my-phone-number</Dial> 42 | </Response> 43 | ``` 44 | 45 | When calling your twilio number, the call will be forwarded to the number you set in your TwiML Bin. The 46 | conversation will then be forked to this `twilio-proxy-mono`/`twilio-proxy-stereo` app which sends the audio to Deepgram, receives 47 | transcriptions, and prints the transcriptions to the screen. You will likely want to do something like 48 | provide an `http` (or websockets) callback to send transcriptions to. 
49 | -------------------------------------------------------------------------------- /twilio/node/twilio-api-scripts/stream.js: -------------------------------------------------------------------------------- 1 | // Twilio helper library 2 | const twilio = require("twilio"); 3 | 4 | // Your account SID and auth token from twilio.com/console 5 | const accountSid = process.env.TWILIO_ACCOUNT_SID; 6 | const authToken = process.env.TWILIO_AUTH_TOKEN; 7 | 8 | // The Twilio client 9 | const client = twilio(accountSid, authToken); 10 | 11 | // Make the outgoing call 12 | client.calls 13 | .create({ 14 | twiml: 15 | '+11231231234', // replace number with person B, replace url 16 | to: "+11231231234", // person A 17 | from: "+11231231234", // your Twilio number 18 | }) 19 | .then((call) => console.log(call.sid)) 20 | .catch((err) => console.error(err)); 21 | -------------------------------------------------------------------------------- /twilio/node/twilio-proxy-mono.js: -------------------------------------------------------------------------------- 1 | const WebSocketServer = require("ws"); 2 | const { createClient, LiveTranscriptionEvents } = require("@deepgram/sdk"); 3 | 4 | const websocketServer = new WebSocketServer.Server({ port: 5000 }); 5 | const deepgramApiKey = "INSERT_YOUR_API_KEY_HERE"; 6 | 7 | websocketServer.on("connection", (ws) => { 8 | console.log("new client connected"); 9 | 10 | const deepgram = createClient(deepgramApiKey); 11 | const connection = deepgram.listen.live({ 12 | model: "nova-2", 13 | smart_format: true, 14 | encoding: "mulaw", 15 | sample_rate: 8000, 16 | channels: 1, 17 | }); 18 | 19 | connection.on(LiveTranscriptionEvents.Open, () => { 20 | connection.on(LiveTranscriptionEvents.Close, () => { 21 | console.log("Connection closed."); 22 | }); 23 | 24 | connection.on(LiveTranscriptionEvents.Transcript, (transcription) => { 25 | console.dir(transcription, { depth: null }); 26 | }); 27 | 28 | ws.on("message", (data) => { 29 | const 
twilioMessage = JSON.parse(data); 30 | if ( 31 | twilioMessage["event"] === "connected" || 32 | twilioMessage["event"] === "start" 33 | ) { 34 | console.log("received a twilio connected or start event"); 35 | } 36 | if (twilioMessage["event"] === "media") { 37 | const media = twilioMessage["media"]; 38 | const audio = Buffer.from(media["payload"], "base64"); 39 | connection.send(audio); 40 | } 41 | }); 42 | 43 | ws.on("close", () => { 44 | console.log("client has disconnected"); 45 | if (connection) { 46 | connection.finish(); 47 | } 48 | }); 49 | 50 | ws.onerror = function () { 51 | console.log("some error occurred"); 52 | connection.finish(); 53 | }; 54 | }); 55 | }); 56 | 57 | console.log("the websocket server is running on port 5000"); 58 | -------------------------------------------------------------------------------- /twilio/node/twilio-proxy-stereo.js: -------------------------------------------------------------------------------- 1 | const WebSocketServer = require("ws"); 2 | const { createClient, LiveTranscriptionEvents } = require("@deepgram/sdk"); 3 | 4 | const websocketServer = new WebSocketServer.Server({ port: 5000 }); 5 | const deepgramApiKey = "INSERT_YOUR_API_KEY_HERE"; 6 | 7 | websocketServer.on("connection", (ws) => { 8 | console.log("new client connected"); 9 | 10 | const deepgram = createClient(deepgramApiKey); 11 | const connection = deepgram.listen.live({ 12 | model: "nova-2", 13 | smart_format: true, 14 | encoding: "mulaw", 15 | sample_rate: 8000, 16 | channels: 1, 17 | }); 18 | 19 | const inboundSamples = []; 20 | const outboundSamples = []; 21 | 22 | connection.on(LiveTranscriptionEvents.Open, () => { 23 | connection.on(LiveTranscriptionEvents.Close, () => { 24 | console.log("Connection closed."); 25 | }); 26 | 27 | connection.on(LiveTranscriptionEvents.Transcript, (transcription) => { 28 | console.dir(transcription, { depth: null }); 29 | }); 30 | 31 | ws.on("message", (data) => { 32 | const twilioMessage = JSON.parse(data); 33 | if ( 
34 | twilioMessage["event"] === "connected" || 35 | twilioMessage["event"] === "start" 36 | ) { 37 | console.log("received a twilio connected or start event"); 38 | } 39 | if (twilioMessage["event"] === "media") { 40 | const media = twilioMessage["media"]; 41 | var audio = Buffer.from(media["payload"], "base64"); 42 | if (media["track"] === "inbound") { 43 | for (let i = 0; i < audio.length; i++) { 44 | inboundSamples.push(audio[i]); 45 | } 46 | } 47 | if (media["track"] === "outbound") { 48 | for (let i = 0; i < audio.length; i++) { 49 | outboundSamples.push(audio[i]); 50 | } 51 | } 52 | let mixable_length = Math.min( 53 | inboundSamples.length, 54 | outboundSamples.length 55 | ); 56 | if (mixable_length > 0) { 57 | var mixedSamples = Buffer.alloc(mixable_length * 2); 58 | for (let i = 0; i < mixable_length; i++) { 59 | mixedSamples[2 * i] = inboundSamples[i]; 60 | mixedSamples[2 * i + 1] = outboundSamples[i]; 61 | } 62 | 63 | inboundSamples = inboundSamples.slice(mixable_length); 64 | outboundSamples = outboundSamples.slice(mixable_length); 65 | 66 | if (connection) { 67 | connection.send(Buffer.from(mixedSamples)); 68 | } 69 | } 70 | } 71 | }); 72 | 73 | ws.on("close", () => { 74 | console.log("client has disconnected"); 75 | connection.finish(); 76 | }); 77 | 78 | ws.onerror = function () { 79 | console.log("some error occurred"); 80 | connection.finish(); 81 | }; 82 | }); 83 | }); 84 | 85 | console.log("the websocket server is running on port 5000"); 86 | -------------------------------------------------------------------------------- /twilio/python/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 deepgram 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, 
merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /twilio/python/README.md: -------------------------------------------------------------------------------- 1 | # twilio integration 2 | 3 | Two python scripts are provided which can do basic twilio-deepgram proxying. Run `python3 twilio-proxy-mono.py` or `python3 twilio-proxy-stereo.py` 4 | to run the proxy server for either just the inbound track, or both the inbound and outbound tracks, respectively. Refer to the twilio documentation on streaming tracks for more info: 5 | 6 | https://www.twilio.com/docs/voice/twiml/stream#attributes-track 7 | 8 | In either script, you will need to replace `YOUR_DEEPGRAM_API_KEY` with your Deepgram API key, as well as set up twilio to send websocket data to the server running the scripts. 
Refer to the following 9 | twilio documentation to do this (don't use the example python Flask application though, use our `twilio-proxy-mono` or `twilio-proxy-stereo` script instead): 10 | 11 | https://www.twilio.com/docs/voice/tutorials/consume-real-time-media-stream-using-websockets-python-and-flask 12 | https://www.twilio.com/docs/runtime/tutorials/twiml-bins 13 | 14 | My TwiML Bin files ended up looking like the following for mono (replace the URL with the address of the server running the script): 15 | 16 | ``` 17 | <?xml version="1.0" encoding="UTF-8"?> 18 | <Response> 19 | <Start> 20 | <Stream url="wss://your-server.example.com" /> 21 | </Start> 22 | <Dial>my-phone-number</Dial> 23 | </Response> 24 | ``` 25 | 26 | and the following for stereo (just adding the extra `track` parameter): 27 | 28 | ``` 29 | <?xml version="1.0" encoding="UTF-8"?> 30 | <Response> 31 | <Start> 32 | <Stream url="wss://your-server.example.com" track="both_tracks" /> 33 | </Start> 34 | <Dial>my-phone-number</Dial> 35 | </Response> 36 | ``` 37 | 38 | (Alternatively, you can try initiating a call between "Person A" and "Person B", and having the call data forwarded to the twilio-deepgram proxy using the script in `twilio-api-scripts/stream.py`.) 39 | 40 | When calling your twilio number, the call will be forwarded to the number you set in your TwiML Bin. The 41 | conversation will then be forked to this `twilio-proxy-mono`/`twilio-proxy-stereo` app which sends the audio to Deepgram, receives 42 | transcriptions, and prints the transcriptions to the screen. You will likely want to do something like 43 | provide an `http` (or websockets) callback to send transcriptions to. 
# --------------------------------------------------------------------------------
# /twilio/python/twilio-api-scripts/stream.py:
# --------------------------------------------------------------------------------
# twilio helper library
from twilio.rest import Client

# other imports
import time
import requests
import json
import os
import uuid

# your account sid and auth token from twilio.com/console
account_sid = os.environ['TWILIO_ACCOUNT_SID']
auth_token = os.environ['TWILIO_AUTH_TOKEN']
# the twilio client
client = Client(account_sid, auth_token)
# make the outgoing call
# NOTE(review): the XML inside the `twiml` string was stripped from this
# listing (only the phone number survived). It is reconstructed here from the
# "replace url" remark and the TwiML described in the README; confirm the
# stream URL (and add track="both_tracks" for the stereo proxy) before use.
call = client.calls.create(
    twiml = '<Response><Start><Stream url="wss://your-server.example.com" /></Start><Dial>+11231231234</Dial></Response>', # replace number with person B, replace url
    to = '+11231231234', # person A
    from_ = '+11231231234' # your twilio number
)

# --------------------------------------------------------------------------------
# /twilio/python/twilio-proxy-mono.py:
# --------------------------------------------------------------------------------
import asyncio
import websockets
import sys
import json
import base64
import ssl
import time

def deepgram_connect():
    """Return the (not yet entered) websocket connection to Deepgram.

    The connection requests 8 kHz mu-law audio with endpointing enabled.
    """
    # Replace with your Deepgram API key.
    extra_headers = {
        'Authorization': 'Token YOUR_DEEPGRAM_API_KEY'
    }
    deepgram_ws = websockets.connect("wss://api.deepgram.com/v1/listen?encoding=mulaw&sample_rate=8000&endpointing=true", extra_headers = extra_headers)

    return deepgram_ws

async def proxy(client_ws, path=None):
    """Forward one Twilio media stream to Deepgram and print the results.

    Args:
        client_ws: The websocket connection opened by Twilio.
        path: Request path passed by older `websockets` versions; unused.
            It is optional so the handler also works when it is called with
            the connection only.
    """
    # inbox = asyncio.Queue() # not needed unless sending ws messages back to the client
    outbox = asyncio.Queue()

    print('started proxy')

    # use these for timing
    audio_cursor = 0.
    conn_start = time.time()

    async with deepgram_connect() as deepgram_ws:
        async def deepgram_sender(deepgram_ws):
            """Forward queued audio to Deepgram until the empty end marker."""
            print('started deepgram sender')
            while True:
                chunk = await outbox.get()
                await deepgram_ws.send(chunk)
                # The empty message is the end-of-audio marker; without this
                # exit the task waited on the queue forever and the proxy
                # never finished.
                if len(chunk) == 0:
                    break
            print('finished deepgram sender')

        async def deepgram_receiver(deepgram_ws):
            """Print every message Deepgram sends back."""
            print('started deepgram receiver')
            nonlocal audio_cursor
            async for message in deepgram_ws:
                try:
                    dg_json = json.loads(message)
                except ValueError:
                    print('was not able to parse deepgram response as json')
                    continue

                # print the results from deepgram!
                print(dg_json)

                # do this logic for timing
                # NOTE: it only makes sense to measure timing for interim results, see this doc for more details: https://docs.deepgram.com/streaming/tutorials/latency.html
                # try:
                #     if dg_json["is_final"] == False:
                #         transcript = dg_json["channel"]["alternatives"][0]["transcript"]
                #         start = dg_json["start"]
                #         duration = dg_json["duration"]
                #         latency = audio_cursor - (start + duration)
                #         conn_duration = time.time() - conn_start
                #         print('latency: ' + str(latency) + '; transcript: ' + transcript)
                # except:
                #     print('did not receive a standard streaming result')
                #     continue
            print('finished deepgram receiver')

        async def client_receiver(client_ws):
            """Decode Twilio media messages and queue the audio for Deepgram."""
            print('started client receiver')
            nonlocal audio_cursor

            # we will use a buffer of 20 messages (20 * 160 bytes, 0.4 seconds) to improve throughput performance
            # NOTE: twilio seems to consistently send media messages of 160 bytes
            BUFFER_SIZE = 20 * 160
            buffer = bytearray(b'')
            try:
                async for message in client_ws:
                    try:
                        data = json.loads(message)
                        event = data["event"]
                        if event in ("connected", "start"):
                            print("Media WS: Received event connected or start")
                            continue
                        if event == "stop":
                            print("Media WS: Received event stop")
                            break
                        if event == "media":
                            chunk = base64.b64decode(data["media"]["payload"])
                            if chunk == b'':
                                # an empty payload means no more audio will follow
                                break
                            # 8000 one-byte samples per second
                            audio_cursor += len(chunk) / 8000.0
                            buffer.extend(chunk)
                    except (KeyError, TypeError, ValueError):
                        print('message from client not formatted correctly, bailing')
                        break

                    # check if our buffer is ready to send to our outbox (and, thus, then to deepgram)
                    if len(buffer) >= BUFFER_SIZE:
                        outbox.put_nowait(buffer)
                        buffer = bytearray(b'')
            finally:
                # send whatever audio is still buffered instead of dropping it
                if buffer:
                    outbox.put_nowait(buffer)
                # whether the stream stopped or the connection to the client
                # (twilio) died, forward an empty message so deepgram knows
                # that no more audio will be sent
                outbox.put_nowait(b'')
                print('finished client receiver')

        # gather() surfaces an exception from any of the three tasks
        await asyncio.gather(
            deepgram_sender(deepgram_ws),
            deepgram_receiver(deepgram_ws),
            client_receiver(client_ws)
        )

        # close() is a coroutine; it has to be awaited to actually run
        await client_ws.close()
        print('finished running the proxy')

def main():
    """Serve the proxy on localhost:5000 until the process is stopped."""
    # use this if using ssl
    # ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
    # ssl_context.load_cert_chain('/cert.pem', 'key.pem')
    # proxy_server = websockets.serve(proxy, '0.0.0.0', 443, ssl=ssl_context)

    # use this if not using ssl
    proxy_server = websockets.serve(proxy, 'localhost', 5000)

    asyncio.get_event_loop().run_until_complete(proxy_server)
    asyncio.get_event_loop().run_forever()

if __name__ == '__main__':
    sys.exit(main() or 0)

# --------------------------------------------------------------------------------
# /twilio/python/twilio-proxy-stereo.py:
# --------------------------------------------------------------------------------
import asyncio
import base64
import json
import sys
import time
import websockets
import ssl
from pydub import AudioSegment

def deepgram_connect():
    """Return the (not yet entered) websocket connection to Deepgram.

    The connection requests 8 kHz mu-law audio with two channels that are
    transcribed separately (`multichannel=true`).
    """
    # Replace with your Deepgram API key.
    extra_headers = {
        'Authorization': 'Token YOUR_DEEPGRAM_API_KEY'
    }
    deepgram_ws = websockets.connect("wss://api.deepgram.com/v1/listen?encoding=mulaw&sample_rate=8000&channels=2&multichannel=true", extra_headers = extra_headers)

    return deepgram_ws

async def proxy(client_ws, path=None):
    """Mix Twilio's inbound/outbound tracks into stereo and send it to Deepgram.

    Args:
        client_ws: The websocket connection opened by Twilio.
        path: Request path passed by older `websockets` versions; unused.
            It is optional so the handler also works when it is called with
            the connection only.
    """
    # inbox = asyncio.Queue() # not needed unless sending ws messages back to the client
    outbox = asyncio.Queue()

    print('started proxy')

    async with deepgram_connect() as deepgram_ws:
        async def deepgram_sender(deepgram_ws):
            """Forward queued audio to Deepgram until the empty end marker."""
            print('started deepgram sender')
            while True:
                chunk = await outbox.get()
                await deepgram_ws.send(chunk)
                # The empty message is the end-of-audio marker; without this
                # exit the task waited on the queue forever and the proxy
                # never finished.
                if len(chunk) == 0:
                    break
            print('finished deepgram sender')

        async def deepgram_receiver(deepgram_ws):
            """Print every message Deepgram sends back."""
            print('started deepgram receiver')
            async for message in deepgram_ws:
                try:
                    dg_json = json.loads(message)
                except ValueError:
                    print('was not able to parse deepgram response as json')
                    continue

                # print the results from deepgram! you may want to send this somewhere else, like a callback server, instead of just printing it out
                print(dg_json)
            print('finished deepgram receiver')

        async def client_receiver(client_ws):
            """Align both Twilio tracks, interleave them and queue the result."""
            print('started client receiver')

            # directly outputting the audio to a file can be useful
            # the audio can be converted to a regular wav file via something like:
            # $ ffmpeg -f mulaw -ar 8000 -ac 2 -i mixed mixed.wav
            # file_inbound = open('inbound', 'wb')
            # file_outbound = open('outbound', 'wb')
            # file_mixed = open('mixed', 'wb')
            # file_manually_mixed = open('manually_mixed', 'wb')

            # we will use a buffer of 20 messages (20 * 160 bytes, 0.4 seconds) to improve throughput performance
            # NOTE: twilio seems to consistently send media messages of 160 bytes
            BUFFER_SIZE = 20 * 160
            # the algorithm to deal with mixing the two channels is ever so slightly sophisticated
            # it fills in silence for a channel if that channel is either
            # A) not currently streaming (e.g. the outbound channel when the inbound channel starts ringing it)
            # B) missing packets (this happens, and sometimes the timestamps which come back for subsequent packets are not aligned)
            inbuffer = bytearray(b'')
            outbuffer = bytearray(b'')
            inbound_chunks_started = False
            outbound_chunks_started = False
            latest_inbound_timestamp = 0
            latest_outbound_timestamp = 0

            def mix_and_send(length):
                """Interleave the first `length` bytes of both buffers and queue them."""
                asinbound = AudioSegment(inbuffer[:length], sample_width=1, frame_rate=8000, channels=1)
                asoutbound = AudioSegment(outbuffer[:length], sample_width=1, frame_rate=8000, channels=1)
                mixed = AudioSegment.from_mono_audiosegments(asinbound, asoutbound)

                # if you don't have a nice library for mixing, you can always trivially manually mix the channels like so
                # manually_mixed = bytearray(b'')
                # for i in range(length):
                #     manually_mixed.append(inbuffer[i])
                #     manually_mixed.append(outbuffer[i])

                # file_inbound.write(asinbound.raw_data)
                # file_outbound.write(asoutbound.raw_data)
                # file_mixed.write(mixed.raw_data)
                # file_manually_mixed.write(manually_mixed)

                # sending to deepgram
                outbox.put_nowait(mixed.raw_data)
                # outbox.put_nowait(manually_mixed)

                # clearing buffers (in place; the segments above hold copies)
                del inbuffer[:length]
                del outbuffer[:length]

            try:
                async for message in client_ws:
                    try:
                        data = json.loads(message)
                        event = data["event"]
                        if event in ("connected", "start"):
                            print("Media WS: Received event connected or start")
                            continue
                        if event == "stop":
                            print("Media WS: Received event stop")
                            break
                        if event == "media":
                            media = data["media"]
                            chunk = base64.b64decode(media["payload"])
                            if chunk == b'':
                                # an empty payload means no more audio will follow
                                break
                            timestamp = int(media['timestamp'])
                            if media['track'] == 'inbound':
                                # fills in silence if there have been dropped packets
                                if inbound_chunks_started:
                                    if latest_inbound_timestamp + 20 < timestamp:
                                        bytes_to_fill = 8 * (timestamp - (latest_inbound_timestamp + 20))
                                        print(f"INBOUND WARNING! last timestamp was {latest_inbound_timestamp} but current packet is for timestamp {media['timestamp']}, filling in {bytes_to_fill} bytes of silence")
                                        inbuffer.extend(b"\xff" * bytes_to_fill) # NOTE: 0xff is silence for mulaw audio, and there are 8 bytes per ms of data for our format (8 bit, 8000 Hz)
                                else:
                                    print ('started receiving inbound chunks!')
                                    # make it known that inbound chunks have started arriving
                                    inbound_chunks_started = True
                                    # this basically sets the starting point for outbound timestamps
                                    latest_outbound_timestamp = timestamp - 20
                                latest_inbound_timestamp = timestamp
                                # extend the inbound audio buffer with data
                                inbuffer.extend(chunk)
                            if media['track'] == 'outbound':
                                # make it known that outbound chunks have started arriving
                                # (this used to assign a misspelled, separate variable)
                                outbound_chunks_started = True
                                # fills in silence if there have been dropped packets
                                if latest_outbound_timestamp + 20 < timestamp:
                                    bytes_to_fill = 8 * (timestamp - (latest_outbound_timestamp + 20))
                                    print(f"OUTBOUND WARNING! last timestamp was {latest_outbound_timestamp} but current packet is for timestamp {media['timestamp']}, filling in {bytes_to_fill} bytes of silence")
                                    outbuffer.extend(b"\xff" * bytes_to_fill) # NOTE: 0xff is silence for mulaw audio, and there are 8 bytes per ms of data for our format (8 bit, 8000 Hz)
                                latest_outbound_timestamp = timestamp
                                # extend the outbound audio buffer with data
                                outbuffer.extend(chunk)
                    except (KeyError, TypeError, ValueError):
                        print('message from client not formatted correctly, bailing')
                        break

                    # check if our buffers are ready to send to our outbox (and, thus, then to deepgram)
                    while len(inbuffer) >= BUFFER_SIZE and len(outbuffer) >= BUFFER_SIZE:
                        print ( str(len(inbuffer)) + ' ' + str(len(outbuffer)) )
                        mix_and_send(BUFFER_SIZE)
            finally:
                # send the audio both tracks still have in common instead of dropping it
                remaining = min(len(inbuffer), len(outbuffer))
                if remaining:
                    mix_and_send(remaining)
                # whether the stream stopped or the connection to the client
                # (twilio) died, forward an empty message so deepgram knows
                # that no more audio will be sent
                outbox.put_nowait(b'')
                print('finished client receiver')

                # file_inbound.close()
                # file_outbound.close()
                # file_mixed.close()
                # file_manually_mixed.close()

        # gather() surfaces an exception from any of the three tasks
        await asyncio.gather(
            deepgram_sender(deepgram_ws),
            deepgram_receiver(deepgram_ws),
            client_receiver(client_ws)
        )

        # close() is a coroutine; it has to be awaited to actually run
        await client_ws.close()
        print('finished running the proxy')

def main():
    """Serve the proxy on localhost:5000 until the process is stopped."""
    # use this if using ssl
    # ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
    # ssl_context.load_cert_chain('cert.pem', 'key.pem')
    # proxy_server = websockets.serve(proxy, '0.0.0.0', 443, ssl=ssl_context)

    # use this if not using ssl
    proxy_server = websockets.serve(proxy, 'localhost', 5000)

    asyncio.get_event_loop().run_until_complete(proxy_server)
    asyncio.get_event_loop().run_forever()

if __name__ == '__main__':
    sys.exit(main() or 0)
# --------------------------------------------------------------------------------