├── version ├── helpers ├── __init__.py ├── browser_helpers.py ├── assistant_helpers.py ├── device_helpers.py └── audio_helpers.py ├── config.ini ├── snowboy.tar.gz ├── Bachelor_thesis_kenan_ekici.pdf ├── .idea ├── misc.xml ├── vcs.xml ├── modules.xml ├── pepper-google-assistant.iml └── workspace.xml ├── requirements.txt ├── __init__.py ├── static └── styles │ └── layout.css ├── README.md ├── .gitignore ├── templates ├── index.html ├── stt.html └── tts.html ├── gestures.txt ├── webserver.py ├── README.rst ├── sdk ├── audiofileinput.py ├── textinput.py └── devicetool.py ├── LICENSE └── assistant.py /version: -------------------------------------------------------------------------------- 1 | 1.0.4 2 | -------------------------------------------------------------------------------- /helpers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /config.ini: -------------------------------------------------------------------------------- 1 | [IP] 2 | Host = 192.168.3.197 3 | Port = 3000 4 | Robot = 192.168.3.146 5 | 6 | -------------------------------------------------------------------------------- /snowboy.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kenanEkici/pepper-google-assistant/HEAD/snowboy.tar.gz -------------------------------------------------------------------------------- /Bachelor_thesis_kenan_ekici.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kenanEkici/pepper-google-assistant/HEAD/Bachelor_thesis_kenan_ekici.pdf -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 
-------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | google-assistant-grpc==0.1.0 2 | google-assistant-library==0.1.1 3 | google-assistant-sdk==0.4.4 4 | google-auth==1.0.1 5 | google-auth-oauthlib==0.2.0 6 | sounddevice==0.3.11 7 | click==6.7 8 | tenacity==4.12.0 9 | futures==3.2.0 10 | pathlib2==2.3.0 11 | pyaudio==0.2.11 12 | pyasn1==0.4.2 13 | gtts==1.2.2 14 | flask==0.12.2 15 | flask-socketio==2.9.6 16 | configparser==3.5.0 17 | 18 | -------------------------------------------------------------------------------- /.idea/pepper-google-assistant.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 15 | 16 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Samples for Google Assistant gRPC API.""" 16 | -------------------------------------------------------------------------------- /static/styles/layout.css: -------------------------------------------------------------------------------- 1 | .message { 2 | border-radius: 50px; 3 | padding: 15px 20px; 4 | position: relative; 5 | font-weight: bold; 6 | font-size: 18px; 7 | text-align: center; 8 | list-style-type: none; 9 | } 10 | 11 | .right { 12 | margin: 0 15px 10px 50%; 13 | background-color: #2095FE; 14 | color: #fff; 15 | } 16 | 17 | .left { 18 | margin: 0 50% 10px 15px; 19 | background-color: #d7dde8; 20 | } 21 | 22 | .footer { 23 | position: absolute; 24 | bottom: 0; 25 | width: 100%; 26 | height: 60px; /* Set the fixed height of the footer here */ 27 | line-height: 60px; /* Vertically center the text there */ 28 | background-color: #f5f5f5; 29 | } 30 | 31 | .vertical-align { 32 | display: flex; 33 | align-items: center; 34 | } 35 | 36 | h1 { 37 | text-align: center; 38 | } 39 | 40 | /* 41 | .message.to + .message.to, 42 | .message.from + .message.from { 43 | margin-top: -10px; 44 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pepper-google-assistant 2 | 3 | The basic objective of this project was to enable Google Assistant on the Pepper Robot. Because of a lack of root access on the robot, we had to find alternative ways to make this work such as using the notorious PYNAOqi framework which provided no actual guarantee of working at all times. 4 | 5 | We had also developed a simple front end to either follow the interaction between the robot and any other person and also to synthesize speech using several Text-to-Speech API's. 
6 | 7 | ### What we have achieved 8 | 9 | - Build a custom Google Assistant solution for the Pepper humanoid Robot. 10 | - Train a simple voice trigger command on which the Pepper robot will start listening when its name ("pepper") has been said. 11 | - Combine both of these solutions into one. 12 | - Build a front end for both the Google Assistant solution and the Text-to-Speech functionality. 13 | 14 | ### Notes 15 | 16 | Please refer to my bachelor thesis [Teach Robots to speak: Text 2 Speech Solutions for Robotics](https://www.researchgate.net/publication/359379920_Teach_Robots_to_speak_Text_2_Speech_Solutions_for_Robotics) for more details about this project. 17 | 18 | -------------------------------------------------------------------------------- /helpers/browser_helpers.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2018 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os.path 16 | import tempfile 17 | import webbrowser 18 | 19 | ASSISTANT_HTML_FILE = 'google-assistant-sdk-screen-out.html' 20 | 21 | 22 | class SystemBrowser(object): 23 | def __init__(self): 24 | self.tempdir = tempfile.mkdtemp() 25 | self.filename = os.path.join(self.tempdir, ASSISTANT_HTML_FILE) 26 | 27 | def display(self, html): 28 | with open(self.filename, 'wb') as f: 29 | f.write(html) 30 | webbrowser.open(self.filename, new=0) 31 | 32 | 33 | system_browser = SystemBrowser() 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # response 10 | snowboy/ 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Google Assistant 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 27 |
28 |
29 |

30 |

Google Assistant

31 |
32 |
33 |
34 |
35 | 36 | 37 | -------------------------------------------------------------------------------- /gestures.txt: -------------------------------------------------------------------------------- 1 | gestures = [['animations/Stand/Gestures/Hey_1', 'animations/Stand/Gestures/Hey_1'], 2 | ['animations/Stand/Gestures/IDontKnow_1'], 3 | ['animations/Stand/Gestures/Me_1', 'animations/Stand/Gestures/You_1'], 4 | ['animations/Stand/Gestures/Please_1'], 5 | ['animations/Stand/Gestures/Nothing_2'], 6 | ['animations/Stand/Gestures/Everything_1'], 7 | ['animations/Stand/Gestures/Yes_1'], 8 | ['animations/Stand/Gestures/CalmDown_1', 'animations/Stand/Gestures/Desperate_1', 'animations/Stand/Gestures/Desperate_2', 'animations/Stand/Gestures/Desperate_4', 'animations/Stand/Gestures/Desperate_5'], 9 | ['animations/Stand/Gestures/Think_1'], 10 | ['animations/Stand/Gestures/Happy_4'], 11 | ['animations/Stand/Gestures/Yes_1', 'animations/Stand/Gestures/Yes_2', 'animations/Stand/Gestures/Yes_3'], 12 | ['animations/Stand/Gestures/Explain_1', 'animations/Stand/Gestures/Explain_2', 'animations/Stand/Gestures/Explain_3', 'animations/Stand/Gestures/Explain_4', 'animations/Stand/Gestures/Explain_5', 'animations/Stand/Gestures/Explain_6', 'animations/Stand/Gestures/Explain_7', 'animations/Stand/Gestures/Explain_8', 'animations/Stand/Gestures/Explain_10', 'animations/Stand/Gestures/Explain_11'], 13 | ['animations/Stand/Waiting/Think_1', 'animations/Stand/Waiting/Think_2', 'animations/Stand/Waiting/Think_3']] 14 | 15 | 16 | def animation_pepper(self, gesture, session): 17 | animation_player_service = session.service("ALAnimationPlayer") 18 | print('test1') 19 | if 'hi' in gesture or 'hey' in gesture: 20 | print('test2') 21 | gest = gestures[0][randint(0, len(gestures[0]) - 1)] 22 | animation_player_service.run(gest, _async=True) 23 | 24 | 25 | 26 | def naoqi_session(self): 27 | session = qi.Session() 28 | try: 29 | session.connect("tcp://" + '192.168.3.146' + ":" + '9559') 30 | except 
RuntimeError: 31 | print ("Can't connect to Naoqi") 32 | sys.exit(1) 33 | return session -------------------------------------------------------------------------------- /helpers/assistant_helpers.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Helper functions for the Google Assistant API.""" 16 | 17 | import logging 18 | 19 | from google.assistant.embedded.v1alpha2 import embedded_assistant_pb2 20 | 21 | 22 | def log_assist_request_without_audio(assist_request): 23 | """Log AssistRequest fields without audio data.""" 24 | if logging.getLogger().isEnabledFor(logging.DEBUG): 25 | resp_copy = embedded_assistant_pb2.AssistRequest() 26 | resp_copy.CopyFrom(assist_request) 27 | if len(resp_copy.audio_in) > 0: 28 | size = len(resp_copy.audio_in) 29 | resp_copy.ClearField('audio_in') 30 | logging.debug('AssistRequest: audio_in (%d bytes)', 31 | size) 32 | return 33 | logging.debug('AssistRequest: %s', resp_copy) 34 | 35 | 36 | def log_assist_response_without_audio(assist_response): 37 | """Log AssistResponse fields without audio data.""" 38 | if logging.getLogger().isEnabledFor(logging.DEBUG): 39 | resp_copy = embedded_assistant_pb2.AssistResponse() 40 | resp_copy.CopyFrom(assist_response) 41 | has_audio_data = (resp_copy.HasField('audio_out') and 42 | len(resp_copy.audio_out.audio_data) > 0) 43 | if 
has_audio_data: 44 | size = len(resp_copy.audio_out.audio_data) 45 | resp_copy.audio_out.ClearField('audio_data') 46 | if resp_copy.audio_out.ListFields(): 47 | logging.debug('AssistResponse: %s audio_data (%d bytes)', 48 | resp_copy, 49 | size) 50 | else: 51 | logging.debug('AssistResponse: audio_data (%d bytes)', 52 | size) 53 | return 54 | logging.debug('AssistResponse: %s', resp_copy) 55 | -------------------------------------------------------------------------------- /templates/stt.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Frontend SST 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 49 |
50 |
51 |

52 |

Speech-To-Text and Interactive Questions

53 |
54 |
55 |
56 | 57 |
58 |
59 | 60 | -------------------------------------------------------------------------------- /helpers/device_helpers.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Helper functions for the Device Actions.""" 16 | 17 | import concurrent.futures 18 | import logging 19 | import sys 20 | 21 | 22 | key_inputs_ = 'inputs' 23 | key_intent_ = 'intent' 24 | key_payload_ = 'payload' 25 | key_commands_ = 'commands' 26 | key_id_ = 'id' 27 | 28 | 29 | class DeviceRequestHandler(object): 30 | """Asynchronous dispatcher for Device actions commands. 31 | 32 | Dispatch commands to the given device handlers. 33 | 34 | Args: 35 | device_id: device id to match command against 36 | 37 | Example: 38 | # Use as as decorator to register handler. 39 | device_handler = DeviceRequestHandler('my-device') 40 | @device_handler.command('INTENT_NAME') 41 | def handler(param): 42 | pass 43 | """ 44 | 45 | def __init__(self, device_id): 46 | self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=1) 47 | self.device_id = device_id 48 | self.handlers = {} 49 | 50 | def __call__(self, device_request): 51 | """Handle incoming device request. 52 | 53 | Returns: List of concurrent.futures for each command execution. 
54 | """ 55 | fs = [] 56 | if key_inputs_ in device_request: 57 | for input in device_request[key_inputs_]: 58 | if input[key_intent_] == 'action.devices.EXECUTE': 59 | for command in input[key_payload_][key_commands_]: 60 | fs.extend(self.submit_commands(**command)) 61 | return fs 62 | 63 | def command(self, intent): 64 | """Register a device action handlers.""" 65 | def decorator(fn): 66 | self.handlers[intent] = fn 67 | return decorator 68 | 69 | def submit_commands(self, devices, execution): 70 | """Submit device command executions. 71 | 72 | Returns: a list of concurrent.futures for scheduled executions. 73 | """ 74 | fs = [] 75 | for device in devices: 76 | if device[key_id_] != self.device_id: 77 | logging.warning('Ignoring command for unknown device: %s' 78 | % device[key_id_]) 79 | continue 80 | if not execution: 81 | logging.warning('Ignoring noop execution') 82 | continue 83 | for command in execution: 84 | f = self.executor.submit( 85 | self.dispatch_command, **command 86 | ) 87 | fs.append(f) 88 | return fs 89 | 90 | def dispatch_command(self, command, params=None): 91 | """Dispatch device commands to the appropriate handler.""" 92 | try: 93 | if command in self.handlers: 94 | self.handlers[command](**params) 95 | else: 96 | logging.warning('Unsupported command: %s: %s', 97 | command, params) 98 | except Exception as e: 99 | logging.warning('Error during command execution', 100 | exc_info=sys.exc_info()) 101 | raise e 102 | -------------------------------------------------------------------------------- /templates/tts.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Text-to-Speech dashboard 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 68 |
69 |
70 |

71 |

Text-to-Speech dashboard

72 |
73 |
74 |

Pepper Text-to-Speech

75 | 76 |
77 | 78 |
79 |
80 |
81 |

GTTS Text-to-Speech

82 | 83 |
84 | 85 |
86 |
87 |
88 |

Google Cloud Text-to-Speech

89 | 90 |
91 | 95 |
96 | 105 |
106 |
107 | 108 | 109 |
110 |
111 | 112 |
113 |
114 |
115 | 116 | 117 | -------------------------------------------------------------------------------- /webserver.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, send_file, render_template, jsonify 2 | import urllib3 3 | from gtts import gTTS 4 | import configparser 5 | import json 6 | import requests 7 | import base64 8 | from naoqi import ALProxy 9 | from flask_socketio import SocketIO 10 | from threading import Thread 11 | import pyaudio 12 | import wave 13 | 14 | 15 | my_server = Flask(__name__) 16 | socketio = SocketIO(my_server) 17 | stream = None 18 | wavstream = None 19 | pepper = None 20 | host = None 21 | port = None 22 | 23 | 24 | @my_server.route("/", methods=['GET']) 25 | def root(): 26 | return render_template('index.html') 27 | 28 | 29 | @my_server.route("/texttospeech", methods=['GET']) 30 | def tts(): 31 | return render_template('tts.html') 32 | 33 | 34 | @my_server.route("/speechtotext", methods=['GET']) 35 | def stt(): 36 | return render_template('stt.html') 37 | 38 | 39 | @my_server.route("/googlestt", methods=['POST']) 40 | def gsst(): 41 | headers = { 42 | 'Content-Type': 'application/json' 43 | } 44 | start_record() 45 | with open('/tmp/input.wav', 'rb') as f1: 46 | content = base64.b64encode(f1.read()) 47 | 48 | dic = { 49 | "config": { 50 | "encoding": "LINEAR16", 51 | "languageCode": "en-US", 52 | "enableAutomaticPunctuation": 'true', 53 | "sampleRateHertz": 16000, 54 | "model": "default" 55 | }, 56 | 57 | "audio": { 58 | "content": content 59 | } 60 | } 61 | 62 | resp = requests.post( 63 | "https://cxl-services.appspot.com/proxy?url=https%3A%2F%2Fspeech.googleapis.com%2Fv1p1beta1%2Fspeech%3Arecognize", 64 | headers=headers, data=json.dumps(dic)) 65 | json_data = resp.json() 66 | transcript = json_data['results'][0]['alternatives'][0]['transcript'] 67 | socketio.emit('inputmsg', transcript) 68 | return 'success' 69 | 70 | 71 | @my_server.route("/pepper", 
methods=['POST']) 72 | def pepper(): 73 | req = request.json.get('input') 74 | altts = ALProxy("ALTextToSpeech", pepper, 9559) 75 | altts.say(str(req)) 76 | return "success" 77 | 78 | 79 | @my_server.route("/gtts", methods=['POST']) 80 | def gtts(): 81 | req = request.json.get('input') 82 | googletts = gTTS(text=req, lang='en') 83 | googletts.save("/tmp/syn.mp3") 84 | return "success" 85 | 86 | 87 | @my_server.route("/gcloud", methods=['POST']) 88 | def gcloud(): 89 | headers = { 90 | 'Content-Type': 'application/json' 91 | } 92 | resp = requests.post("https://cxl-services.appspot.com/proxy?url=https%3A%2F%2Ftexttospeech.googleapis.com%2Fv1beta1%2Ftext%3Asynthesize", headers=headers, data=json.dumps(request.get_json())) 93 | r = resp.json().get('audioContent') 94 | with open("/tmp/syn.mp3", 'w') as file: 95 | file.write(base64.decodestring(r)) 96 | return "success" 97 | 98 | 99 | @my_server.route("/playpepper", methods=['GET']) 100 | def play_stream(): 101 | t = Thread(target=play_pepper) 102 | t.start() 103 | return "success" 104 | 105 | 106 | @my_server.route("/stream", methods=['GET']) 107 | def stream_mp3(): 108 | return send_file('/tmp/syn.mp3', cache_timeout=0) 109 | 110 | 111 | @my_server.route("/wavstream", methods=['GET']) 112 | def stream_wav(): 113 | return send_file('/tmp/syn.wav', cache_timeout=0) 114 | 115 | 116 | def emit_socket(msg_type, msg): 117 | socketio.emit(msg_type, msg) 118 | 119 | 120 | def play_pepper(): 121 | audio = ALProxy("ALAudioPlayer", pepper, 9559) 122 | audio.playWebStream(stream, 1, 0) 123 | 124 | 125 | def play_asistant_response(): 126 | audio = ALProxy("ALAudioPlayer", pepper, 9559) 127 | audio.playWebStream(wavstream, 1, 0) 128 | 129 | 130 | def start_record(): 131 | FORMAT = pyaudio.paInt16 132 | CHANNELS = 1 133 | RATE = 16000 134 | CHUNK = 1024 135 | RECORD_SECONDS = 10 136 | WAVE_OUTPUT_FILENAME = "/tmp/input.wav" 137 | audio = pyaudio.PyAudio() 138 | 139 | # start recording 140 | wav_stream = audio.open(format=FORMAT, 
channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK) 141 | print "recording..." 142 | frames = [] 143 | 144 | for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)): 145 | data = wav_stream.read(CHUNK) 146 | frames.append(data) 147 | print "finished recording" 148 | 149 | # stop recording 150 | wav_stream.stop_stream() 151 | wav_stream.close() 152 | audio.terminate() 153 | 154 | wave_file = wave.open(WAVE_OUTPUT_FILENAME, 'wb') 155 | wave_file.setnchannels(CHANNELS) 156 | wave_file.setsampwidth(audio.get_sample_size(FORMAT)) 157 | wave_file.setframerate(RATE) 158 | wave_file.writeframes(b''.join(frames)) 159 | wave_file.close() 160 | 161 | 162 | def start_server(): 163 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) 164 | global stream, wavstream, pepper, host, port 165 | config = configparser.ConfigParser() 166 | config.read('config.ini') 167 | 168 | host = config['IP']['Host'] 169 | port = config['IP']['Port'] 170 | 171 | stream = "http://" + str(host) + ":" + str(port) + "/stream" 172 | wavstream = "http://" + str(host) + ":" + str(port) + "/wavstream" 173 | pepper = str(config['IP']['Robot']) 174 | 175 | print("API is running") 176 | socketio.run(my_server, host=host, port=int(port)) 177 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Python Samples for the Google Assistant gRPC API 2 | ================================================ 3 | 4 | This repository contains a reference sample for the ``google-assistant-grpc`` Python package_. 5 | 6 | It implements the following features: 7 | 8 | - Triggering a conversation using a key press 9 | - Audio recording of user queries (single or multiple consecutive queries) 10 | - Playback of the Assistant response 11 | - Conversation state management 12 | - Volume control 13 | 14 | .. 
_package: https://pypi.python.org/pypi/google-assistant-grpc 15 | 16 | Prerequisites 17 | ------------- 18 | 19 | - `Python `_ (>= 3.4 recommended) 20 | - An `Actions Console Project `_ 21 | - A `Google account `_ 22 | 23 | Setup 24 | ----- 25 | 26 | - Install Python 3 27 | 28 | - Ubuntu/Debian GNU/Linux:: 29 | 30 | sudo apt-get update 31 | sudo apt-get install python3 python3-venv 32 | 33 | - `MacOSX, Windows, Other `_ 34 | 35 | - Create a new virtual environment (recommended):: 36 | 37 | python3 -m venv env 38 | env/bin/python -m pip install --upgrade pip setuptools wheel 39 | source env/bin/activate 40 | 41 | Authorization 42 | ------------- 43 | 44 | - Follow the steps to `configure the Actions Console project and the Google account `_. 45 | - Follow the steps to `register a new device model and download the client secrets file `_. 46 | - Generate device credentials using ``google-oauthlib-tool``: 47 | 48 | pip install --upgrade google-auth-oauthlib[tool] 49 | google-oauthlib-tool --client-secrets path/to/credentials.json --scope https://www.googleapis.com/auth/assistant-sdk-prototype --save --headless 50 | 51 | Run the samples 52 | --------------- 53 | 54 | - Install the sample dependencies:: 55 | 56 | sudo apt-get install portaudio19-dev libffi-dev libssl-dev 57 | pip install --upgrade -r requirements.txt 58 | 59 | - Verify audio setup:: 60 | 61 | # Record a 5 sec sample and play it back 62 | python -m audio_helpers 63 | 64 | - Run the push to talk sample. The sample records a voice query after a key press and plays back the Google Assistant's answer:: 65 | 66 | python -m pushtotalk --device-id 'my-device-identifier' --device-model-id 'my-model-identifier' 67 | 68 | - Try some Google Assistant voice query like "What time is it?" or "Who am I?". 69 | 70 | - Try a device action query like "Turn on". 
71 | 72 | - Run in verbose mode to see the gRPC communication with the Google Assistant API:: 73 | 74 | python -m pushtotalk --device-id 'my-device-identifier' --device-model-id 'my-model-identifier' -v 75 | 76 | - Send a pre-recorded request to the Assistant:: 77 | 78 | python -m pushtotalk --device-id 'my-device-identifier' --device-model-id 'my-model-identifier' -i in.wav 79 | 80 | - Save the Assistant response to a file:: 81 | 82 | python -m pushtotalk --device-id 'my-device-identifier' --device-model-id 'my-model-identifier' -o out.wav 83 | 84 | - Send text requests to the Assistant:: 85 | 86 | python -m textinput --device-id 'my-device-identifier' --device-model-id 'my-model-identifier' 87 | 88 | - Send a request to the Assistant from a local audio file and write the Assistant audio response to another file:: 89 | 90 | python -m audiofileinput --device-id 'my-device-identifier' --device-model-id 'my-model-identifier' -i in.wav -o out.wav 91 | 92 | Troubleshooting 93 | --------------- 94 | 95 | - Verify ALSA setup:: 96 | 97 | # Play a test sound 98 | speaker-test -t wav 99 | 100 | # Record and play back some audio using ALSA command-line tools 101 | arecord --format=S16_LE --duration=5 --rate=16000 --file-type=raw out.raw 102 | aplay --format=S16_LE --rate=16000 --file-type=raw out.raw 103 | 104 | - If Assistant audio is choppy, try adjusting the sound device's block size:: 105 | 106 | # If using a USB speaker or dedicated soundcard, set block size to "0" 107 | # to automatically adjust the buffer size 108 | python -m audio_helpers --audio-block-size=0 109 | 110 | # If using the line-out 3.5mm audio jack on the device, set block size 111 | # to a value larger than the `ConverseResponse` audio payload size 112 | python -m audio_helpers --audio-block-size=3200 113 | 114 | # Run the Assistant sample using the best block size value found above 115 | python -m pushtotalk --audio-block-size=value 116 | 117 | - If Assistant audio is truncated, try adjusting the sound 
device's flush size:: 118 | 119 | # Set flush size to a value larger than the audio block size. You can 120 | # run the sample using the --audio-flush-size flag as well. 121 | python -m audio_helpers --audio-block-size=3200 --audio-flush-size=6400 122 | 123 | See also the `troubleshooting section `_ of the official documentation. 124 | 125 | License 126 | ------- 127 | 128 | Copyright (C) 2017 Google Inc. 129 | 130 | Licensed to the Apache Software Foundation (ASF) under one or more contributor 131 | license agreements. See the NOTICE file distributed with this work for 132 | additional information regarding copyright ownership. The ASF licenses this 133 | file to you under the Apache License, Version 2.0 (the "License"); you may not 134 | use this file except in compliance with the License. You may obtain a copy of 135 | the License at 136 | 137 | http://www.apache.org/licenses/LICENSE-2.0 138 | 139 | Unless required by applicable law or agreed to in writing, software 140 | distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 141 | WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 142 | License for the specific language governing permissions and limitations under 143 | the License. 144 | -------------------------------------------------------------------------------- /sdk/audiofileinput.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2018 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Simple file-based sample for the Google Assistant Service.""" 16 | 17 | import json 18 | import logging 19 | import os 20 | import os.path 21 | import sys 22 | 23 | import click 24 | import google.auth.transport.grpc 25 | import google.auth.transport.requests 26 | import google.oauth2.credentials 27 | 28 | from google.assistant.embedded.v1alpha2 import ( 29 | embedded_assistant_pb2, 30 | embedded_assistant_pb2_grpc 31 | ) 32 | 33 | 34 | END_OF_UTTERANCE = embedded_assistant_pb2.AssistResponse.END_OF_UTTERANCE 35 | 36 | 37 | @click.command() 38 | @click.option('--api-endpoint', default='embeddedassistant.googleapis.com', 39 | metavar='', show_default=True, 40 | help='Address of Google Assistant API service.') 41 | @click.option('--credentials', 42 | metavar='', show_default=True, 43 | default=os.path.join(click.get_app_dir('google-oauthlib-tool'), 44 | 'credentials.json'), 45 | help='Path to read OAuth2 credentials.') 46 | @click.option('--device-model-id', required=True, 47 | metavar='', 48 | help='Unique device model identifier.') 49 | @click.option('--device-id', required=True, 50 | metavar='', 51 | help='Unique registered device instance identifier.') 52 | @click.option('--lang', show_default=True, 53 | metavar='', 54 | default='en-US', 55 | help='Language code of the Assistant.') 56 | @click.option('--verbose', '-v', is_flag=True, default=False, 57 | help='Enable verbose logging.') 58 | @click.option('--input-audio-file', '-i', required=True, 59 | metavar='', type=click.File('rb'), 60 | help='Path to input audio file (format: LINEAR16 16000 Hz).') 61 | @click.option('--output-audio-file', '-o', required=True, 62 | metavar='', type=click.File('wb'), 63 | help='Path to output audio file (format: LINEAR16 16000 Hz).') 64 | @click.option('--block-size', default=1024, 65 | metavar='', show_default=True, 66 | help='Size of each input stream 
read in bytes.') 67 | @click.option('--grpc-deadline', default=300, 68 | metavar='', show_default=True, 69 | help='gRPC deadline in seconds') 70 | def main(api_endpoint, credentials, 71 | device_model_id, device_id, lang, verbose, 72 | input_audio_file, output_audio_file, 73 | block_size, grpc_deadline, *args, **kwargs): 74 | """File based sample for the Google Assistant API. 75 | 76 | Examples: 77 | $ python -m audiofileinput -i -o 78 | """ 79 | # Setup logging. 80 | logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO) 81 | 82 | # Load OAuth 2.0 credentials. 83 | try: 84 | with open(credentials, 'r') as f: 85 | credentials = google.oauth2.credentials.Credentials(token=None, 86 | **json.load(f)) 87 | http_request = google.auth.transport.requests.Request() 88 | credentials.refresh(http_request) 89 | except Exception as e: 90 | logging.error('Error loading credentials: %s', e) 91 | logging.error('Run google-oauthlib-tool to initialize ' 92 | 'new OAuth 2.0 credentials.') 93 | sys.exit(-1) 94 | 95 | # Create an authorized gRPC channel. 96 | grpc_channel = google.auth.transport.grpc.secure_authorized_channel( 97 | credentials, http_request, api_endpoint) 98 | logging.info('Connecting to %s', api_endpoint) 99 | 100 | # Create gRPC stubs 101 | assistant = embedded_assistant_pb2_grpc.EmbeddedAssistantStub(grpc_channel) 102 | 103 | # Generate gRPC requests. 
104 | def gen_assist_requests(input_stream): 105 | dialog_state_in = embedded_assistant_pb2.DialogStateIn( 106 | language_code=lang, 107 | conversation_state=b'' 108 | ) 109 | config = embedded_assistant_pb2.AssistConfig( 110 | audio_in_config=embedded_assistant_pb2.AudioInConfig( 111 | encoding='LINEAR16', 112 | sample_rate_hertz=16000, 113 | ), 114 | audio_out_config=embedded_assistant_pb2.AudioOutConfig( 115 | encoding='LINEAR16', 116 | sample_rate_hertz=16000, 117 | volume_percentage=100, 118 | ), 119 | dialog_state_in=dialog_state_in, 120 | device_config=embedded_assistant_pb2.DeviceConfig( 121 | device_id=device_id, 122 | device_model_id=device_model_id, 123 | ) 124 | ) 125 | # Send first AssistRequest message with configuration. 126 | yield embedded_assistant_pb2.AssistRequest(config=config) 127 | while True: 128 | # Read user request from file. 129 | data = input_stream.read(block_size) 130 | if not data: 131 | break 132 | # Send following AssistRequest message with audio chunks. 133 | yield embedded_assistant_pb2.AssistRequest(audio_in=data) 134 | 135 | for resp in assistant.Assist(gen_assist_requests(input_audio_file), 136 | grpc_deadline): 137 | # Iterate on AssistResponse messages. 138 | if resp.event_type == END_OF_UTTERANCE: 139 | logging.info('End of audio request detected') 140 | if resp.speech_results: 141 | logging.info('Transcript of user request: "%s".', 142 | ' '.join(r.transcript 143 | for r in resp.speech_results)) 144 | if len(resp.audio_out.audio_data) > 0: 145 | # Write assistant response to supplied file.
146 | output_audio_file.write(resp.audio_out.audio_data) 147 | if resp.dialog_state_out.supplemental_display_text: 148 | logging.info('Assistant display text: "%s"', 149 | resp.dialog_state_out.supplemental_display_text) 150 | if resp.device_action.device_request_json: 151 | device_request = json.loads(resp.device_action.device_request_json) 152 | logging.info('Device request: %s', device_request) 153 | 154 | 155 | if __name__ == '__main__': 156 | main() 157 | -------------------------------------------------------------------------------- /sdk/textinput.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Sample that implements a text client for the Google Assistant Service.""" 16 | 17 | import os 18 | import logging 19 | import json 20 | 21 | import click 22 | import google.auth.transport.grpc 23 | import google.auth.transport.requests 24 | import google.oauth2.credentials 25 | 26 | from google.assistant.embedded.v1alpha2 import ( 27 | embedded_assistant_pb2, 28 | embedded_assistant_pb2_grpc 29 | ) 30 | 31 | try: 32 | from . 
import ( 33 | assistant_helpers, 34 | browser_helpers, 35 | ) 36 | except (SystemError, ImportError): 37 | import assistant_helpers 38 | import browser_helpers 39 | 40 | 41 | ASSISTANT_API_ENDPOINT = 'embeddedassistant.googleapis.com' 42 | DEFAULT_GRPC_DEADLINE = 60 * 3 + 5 43 | PLAYING = embedded_assistant_pb2.ScreenOutConfig.PLAYING 44 | 45 | 46 | class SampleTextAssistant(object): 47 | """Sample Assistant that supports text based conversations. 48 | 49 | Args: 50 | language_code: language for the conversation. 51 | device_model_id: identifier of the device model. 52 | device_id: identifier of the registered device instance. 53 | display: enable visual display of assistant response. 54 | channel: authorized gRPC channel for connection to the 55 | Google Assistant API. 56 | deadline_sec: gRPC deadline in seconds for Google Assistant API call. 57 | """ 58 | 59 | def __init__(self, language_code, device_model_id, device_id, 60 | display, channel, deadline_sec): 61 | self.language_code = language_code 62 | self.device_model_id = device_model_id 63 | self.device_id = device_id 64 | self.conversation_state = None 65 | # Force reset of first conversation. 66 | self.is_new_conversation = True 67 | self.display = display 68 | self.assistant = embedded_assistant_pb2_grpc.EmbeddedAssistantStub( 69 | channel 70 | ) 71 | self.deadline = deadline_sec 72 | 73 | def __enter__(self): 74 | return self 75 | 76 | def __exit__(self, etype, e, traceback): 77 | if e: 78 | return False 79 | 80 | def assist(self, text_query): 81 | """Send a text request to the Assistant and playback the response. 
82 | """ 83 | def iter_assist_requests(): 84 | config = embedded_assistant_pb2.AssistConfig( 85 | audio_out_config=embedded_assistant_pb2.AudioOutConfig( 86 | encoding='LINEAR16', 87 | sample_rate_hertz=16000, 88 | volume_percentage=0, 89 | ), 90 | dialog_state_in=embedded_assistant_pb2.DialogStateIn( 91 | language_code=self.language_code, 92 | conversation_state=self.conversation_state, 93 | is_new_conversation=self.is_new_conversation, 94 | ), 95 | device_config=embedded_assistant_pb2.DeviceConfig( 96 | device_id=self.device_id, 97 | device_model_id=self.device_model_id, 98 | ), 99 | text_query=text_query, 100 | ) 101 | # Continue current conversation with later requests. 102 | self.is_new_conversation = False 103 | if self.display: 104 | config.screen_out_config.screen_mode = PLAYING 105 | req = embedded_assistant_pb2.AssistRequest(config=config) 106 | assistant_helpers.log_assist_request_without_audio(req) 107 | yield req 108 | 109 | text_response = None 110 | html_response = None 111 | for resp in self.assistant.Assist(iter_assist_requests(), 112 | self.deadline): 113 | assistant_helpers.log_assist_response_without_audio(resp) 114 | if resp.screen_out.data: 115 | html_response = resp.screen_out.data 116 | if resp.dialog_state_out.conversation_state: 117 | conversation_state = resp.dialog_state_out.conversation_state 118 | self.conversation_state = conversation_state 119 | if resp.dialog_state_out.supplemental_display_text: 120 | text_response = resp.dialog_state_out.supplemental_display_text 121 | return text_response, html_response 122 | 123 | 124 | @click.command() 125 | @click.option('--api-endpoint', default=ASSISTANT_API_ENDPOINT, 126 | metavar='', show_default=True, 127 | help='Address of Google Assistant API service.') 128 | @click.option('--credentials', 129 | metavar='', show_default=True, 130 | default=os.path.join(click.get_app_dir('google-oauthlib-tool'), 131 | 'credentials.json'), 132 | help='Path to read OAuth2 credentials.') 133 | 
@click.option('--device-model-id', 134 | metavar='', 135 | required=True, 136 | help=(('Unique device model identifier, ' 137 | 'if not specifed, it is read from --device-config'))) 138 | @click.option('--device-id', 139 | metavar='', 140 | required=True, 141 | help=(('Unique registered device instance identifier, ' 142 | 'if not specified, it is read from --device-config, ' 143 | 'if no device_config found: a new device is registered ' 144 | 'using a unique id and a new device config is saved'))) 145 | @click.option('--lang', show_default=True, 146 | metavar='', 147 | default='en-US', 148 | help='Language code of the Assistant') 149 | @click.option('--display', is_flag=True, default=False, 150 | help='Enable visual display of Assistant responses in HTML.') 151 | @click.option('--verbose', '-v', is_flag=True, default=False, 152 | help='Verbose logging.') 153 | @click.option('--grpc-deadline', default=DEFAULT_GRPC_DEADLINE, 154 | metavar='', show_default=True, 155 | help='gRPC deadline in seconds') 156 | def main(api_endpoint, credentials, 157 | device_model_id, device_id, lang, display, verbose, 158 | grpc_deadline, *args, **kwargs): 159 | # Setup logging. 160 | logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO) 161 | 162 | # Load OAuth 2.0 credentials. 163 | try: 164 | with open(credentials, 'r') as f: 165 | credentials = google.oauth2.credentials.Credentials(token=None, 166 | **json.load(f)) 167 | http_request = google.auth.transport.requests.Request() 168 | credentials.refresh(http_request) 169 | except Exception as e: 170 | logging.error('Error loading credentials: %s', e) 171 | logging.error('Run google-oauthlib-tool to initialize ' 172 | 'new OAuth 2.0 credentials.') 173 | return 174 | 175 | # Create an authorized gRPC channel. 
176 | grpc_channel = google.auth.transport.grpc.secure_authorized_channel( 177 | credentials, http_request, api_endpoint) 178 | logging.info('Connecting to %s', api_endpoint) 179 | 180 | with SampleTextAssistant(lang, device_model_id, device_id, display, 181 | grpc_channel, grpc_deadline) as assistant: 182 | while True: 183 | query = click.prompt('') 184 | click.echo(' %s' % query) 185 | response_text, response_html = assistant.assist(text_query=query) 186 | if display and response_html: 187 | system_browser = browser_helpers.system_browser 188 | system_browser.display(response_html) 189 | if response_text: 190 | click.echo('<@assistant> %s' % response_text) 191 | 192 | 193 | if __name__ == '__main__': 194 | main() 195 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /helpers/audio_helpers.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """Helper functions for audio streams.""" 16 | 17 | import array 18 | import logging 19 | import math 20 | import time 21 | import threading 22 | import wave 23 | 24 | import click 25 | import sounddevice as sd 26 | 27 | 28 | DEFAULT_AUDIO_SAMPLE_RATE = 16000 29 | DEFAULT_AUDIO_SAMPLE_WIDTH = 2 30 | DEFAULT_AUDIO_ITER_SIZE = 3200 31 | DEFAULT_AUDIO_DEVICE_BLOCK_SIZE = 6400 32 | DEFAULT_AUDIO_DEVICE_FLUSH_SIZE = 25600 33 | 34 | 35 | def normalize_audio_buffer(buf, volume_percentage, sample_width=2): 36 | """Adjusts the loudness of the audio data in the given buffer. 37 | 38 | Volume normalization is done by scaling the amplitude of the audio 39 | in the buffer by a scale factor of 2^(volume_percentage/100)-1. 40 | For example, 50% volume scales the amplitude by a factor of 0.414, 41 | and 75% volume scales the amplitude by a factor of 0.681. 42 | For now we only sample_width 2. 43 | 44 | Args: 45 | buf: byte string containing audio data to normalize. 46 | volume_percentage: volume setting as an integer percentage (1-100). 47 | sample_width: size of a single sample in bytes. 48 | """ 49 | if sample_width != 2: 50 | raise Exception('unsupported sample width:', sample_width) 51 | scale = math.pow(2, 1.0*volume_percentage/100)-1 52 | # Construct array from bytes based on sample_width, multiply by scale 53 | # and convert it back to bytes 54 | arr = array.array('h', buf) 55 | for idx in range(0, len(arr)): 56 | arr[idx] = int(arr[idx]*scale) 57 | buf = arr.tostring() 58 | return buf 59 | 60 | 61 | def align_buf(buf, sample_width): 62 | """In case of buffer size not aligned to sample_width pad it with 0s""" 63 | remainder = len(buf) % sample_width 64 | if remainder != 0: 65 | buf += b'\0' * (sample_width - remainder) 66 | return buf 67 | 68 | 69 | class WaveSource(object): 70 | """Audio source that reads audio data from a WAV file. 71 | 72 | Reads are throttled to emulate the given sample rate and silence 73 | is returned when the end of the file is reached. 
74 | 75 | Args: 76 | fp: file-like stream object to read from. 77 | sample_rate: sample rate in hertz. 78 | sample_width: size of a single sample in bytes. 79 | """ 80 | def __init__(self, fp, sample_rate, sample_width): 81 | self._fp = fp 82 | try: 83 | self._wavep = wave.open(self._fp, 'r') 84 | except wave.Error as e: 85 | logging.warning('error opening WAV file: %s, ' 86 | 'falling back to RAW format', e) 87 | self._fp.seek(0) 88 | self._wavep = None 89 | self._sample_rate = sample_rate 90 | self._sample_width = sample_width 91 | self._sleep_until = 0 92 | 93 | def read(self, size): 94 | """Read bytes from the stream and block until sample rate is achieved. 95 | 96 | Args: 97 | size: number of bytes to read from the stream. 98 | """ 99 | now = time.time() 100 | missing_dt = self._sleep_until - now 101 | if missing_dt > 0: 102 | time.sleep(missing_dt) 103 | self._sleep_until = time.time() + self._sleep_time(size) 104 | data = (self._wavep.readframes(size) 105 | if self._wavep 106 | else self._fp.read(size)) 107 | # When reach end of audio stream, pad remainder with silence (zeros). 108 | if not data: 109 | return b'\x00' * size 110 | return data 111 | 112 | def close(self): 113 | """Close the underlying stream.""" 114 | if self._wavep: 115 | self._wavep.close() 116 | self._fp.close() 117 | 118 | def _sleep_time(self, size): 119 | sample_count = size / float(self._sample_width) 120 | sample_rate_dt = sample_count / float(self._sample_rate) 121 | return sample_rate_dt 122 | 123 | def start(self): 124 | pass 125 | 126 | def stop(self): 127 | pass 128 | 129 | @property 130 | def sample_rate(self): 131 | return self._sample_rate 132 | 133 | 134 | class WaveSink(object): 135 | """Audio sink that writes audio data to a WAV file. 136 | 137 | Args: 138 | fp: file-like stream object to write data to. 139 | sample_rate: sample rate in hertz. 140 | sample_width: size of a single sample in bytes. 
141 | """ 142 | def __init__(self, fp, sample_rate, sample_width): 143 | self._fp = fp 144 | self._wavep = wave.open(self._fp, 'wb') 145 | self._wavep.setsampwidth(sample_width) 146 | self._wavep.setnchannels(1) 147 | self._wavep.setframerate(sample_rate) 148 | 149 | def write(self, data): 150 | """Write bytes to the stream. 151 | 152 | Args: 153 | data: frame data to write. 154 | """ 155 | self._wavep.writeframes(data) 156 | 157 | def close(self): 158 | """Close the underlying stream.""" 159 | self._wavep.close() 160 | self._fp.close() 161 | 162 | def start(self): 163 | pass 164 | 165 | def stop(self): 166 | pass 167 | 168 | def flush(self): 169 | pass 170 | 171 | 172 | class SoundDeviceStream(object): 173 | """Audio stream based on an underlying sound device. 174 | 175 | It can be used as an audio source (read) and a audio sink (write). 176 | 177 | Args: 178 | sample_rate: sample rate in hertz. 179 | sample_width: size of a single sample in bytes. 180 | block_size: size in bytes of each read and write operation. 181 | flush_size: size in bytes of silence data written during flush operation. 182 | """ 183 | def __init__(self, sample_rate, sample_width, block_size, flush_size): 184 | if sample_width == 2: 185 | audio_format = 'int16' 186 | else: 187 | raise Exception('unsupported sample width:', sample_width) 188 | self._audio_stream = sd.RawStream( 189 | samplerate=sample_rate, dtype=audio_format, channels=1, 190 | blocksize=int(block_size/2), # blocksize is in number of frames. 
191 | ) 192 | self._block_size = block_size 193 | self._flush_size = flush_size 194 | self._sample_rate = sample_rate 195 | 196 | def read(self, size): 197 | """Read bytes from the stream.""" 198 | buf, overflow = self._audio_stream.read(size) 199 | if overflow: 200 | logging.warning('SoundDeviceStream read overflow (%d, %d)', 201 | size, len(buf)) 202 | return bytes(buf) 203 | 204 | def write(self, buf): 205 | """Write bytes to the stream.""" 206 | underflow = self._audio_stream.write(buf) 207 | if underflow: 208 | logging.warning('SoundDeviceStream write underflow (size: %d)', 209 | len(buf)) 210 | return len(buf) 211 | 212 | def flush(self): 213 | if self._audio_stream.active and self._flush_size > 0: 214 | self._audio_stream.write(b'\x00' * self._flush_size) 215 | 216 | def start(self): 217 | """Start the underlying stream.""" 218 | if not self._audio_stream.active: 219 | self._audio_stream.start() 220 | 221 | def stop(self): 222 | """Stop the underlying stream.""" 223 | if self._audio_stream.active: 224 | self._audio_stream.stop() 225 | 226 | def close(self): 227 | """Close the underlying stream and audio interface.""" 228 | if self._audio_stream: 229 | self.stop() 230 | self._audio_stream.close() 231 | self._audio_stream = None 232 | 233 | @property 234 | def sample_rate(self): 235 | return self._sample_rate 236 | 237 | 238 | class ConversationStream(object): 239 | """Audio stream that supports half-duplex conversation. 240 | 241 | A conversation is the alternance of: 242 | - a recording operation 243 | - a playback operation 244 | 245 | Excepted usage: 246 | 247 | For each conversation: 248 | - start_recording() 249 | - read() or iter() 250 | - stop_recording() 251 | - start_playback() 252 | - write() 253 | - stop_playback() 254 | 255 | When conversations are finished: 256 | - close() 257 | 258 | Args: 259 | source: file-like stream object to read input audio bytes from. 260 | sink: file-like stream object to write output audio bytes to. 
261 | iter_size: read size in bytes for each iteration. 262 | sample_width: size of a single sample in bytes. 263 | """ 264 | def __init__(self, source, sink, iter_size, sample_width): 265 | self._source = source 266 | self._sink = sink 267 | self._iter_size = iter_size 268 | self._sample_width = sample_width 269 | self._volume_percentage = 50 270 | self._stop_recording = threading.Event() 271 | self._source_lock = threading.RLock() 272 | self._recording = False 273 | self._playing = False 274 | 275 | def start_recording(self): 276 | """Start recording from the audio source.""" 277 | self._recording = True 278 | self._stop_recording.clear() 279 | self._source.start() 280 | 281 | def stop_recording(self): 282 | """Stop recording from the audio source.""" 283 | self._stop_recording.set() 284 | with self._source_lock: 285 | self._source.stop() 286 | self._recording = False 287 | 288 | def start_playback(self): 289 | """Start playback to the audio sink.""" 290 | self._playing = True 291 | self._sink.start() 292 | 293 | def stop_playback(self): 294 | """Stop playback from the audio sink.""" 295 | self._sink.flush() 296 | self._sink.stop() 297 | self._playing = False 298 | 299 | @property 300 | def recording(self): 301 | return self._recording 302 | 303 | @property 304 | def playing(self): 305 | return self._playing 306 | 307 | @property 308 | def volume_percentage(self): 309 | """The current volume setting as an integer percentage (1-100).""" 310 | return self._volume_percentage 311 | 312 | @volume_percentage.setter 313 | def volume_percentage(self, new_volume_percentage): 314 | self._volume_percentage = new_volume_percentage 315 | 316 | def read(self, size): 317 | """Read bytes from the source (if currently recording). 318 | """ 319 | with self._source_lock: 320 | return self._source.read(size) 321 | 322 | def write(self, buf): 323 | """Write bytes to the sink (if currently playing). 
324 | """ 325 | buf = align_buf(buf, self._sample_width) 326 | buf = normalize_audio_buffer(buf, self.volume_percentage) 327 | return self._sink.write(buf) 328 | 329 | def close(self): 330 | """Close source and sink.""" 331 | self._source.close() 332 | self._sink.close() 333 | 334 | def __iter__(self): 335 | """Returns a generator reading data from the stream.""" 336 | while True: 337 | if self._stop_recording.is_set(): 338 | raise StopIteration 339 | yield self.read(self._iter_size) 340 | 341 | @property 342 | def sample_rate(self): 343 | return self._source._sample_rate 344 | 345 | 346 | @click.command() 347 | @click.option('--record-time', default=5, 348 | metavar='', show_default=True, 349 | help='Record time in secs') 350 | @click.option('--audio-sample-rate', 351 | default=DEFAULT_AUDIO_SAMPLE_RATE, 352 | metavar='