├── requirements.txt
├── raw_to_wav.py
├── LICENSE.md
├── .gitignore
├── README.md
├── ros_receiver.py
├── module_receiver.py
├── google.py
└── module_speechrecognition.py

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
numpy

--------------------------------------------------------------------------------
/raw_to_wav.py:
--------------------------------------------------------------------------------
###########################################################
# Converts a raw audio file in the project directory to a wav file,
# e.g. rawToWav("out0") turns "out0.raw" into "out0.wav" and
# deletes the raw file.
#
# Author: Johannes Bramauer, Vienna University of Technology
# Created: May 30, 2018
# License: MIT
#
###########################################################
import wave
import os

def rawToWav(filename):

    rawfile = filename + ".raw"
    if not os.path.isfile(rawfile):
        return

    # 48 kHz, mono, 16 bit signed LPCM - the format recorded by
    # module_speechrecognition.py
    outfile = wave.open(filename + ".wav", "wb")
    outfile.setframerate(48000)
    outfile.setnchannels(1)
    outfile.setsampwidth(2)

    f = open(rawfile, "rb")
    sample = f.read(4096)
    print 'writing file: ' + filename + '.wav'

    while sample != "":
        outfile.writeframes(sample)
        sample = f.read(4096)

    f.close()
    outfile.close()

    os.remove(rawfile)

--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
Copyright 2018 Johannes Bramauer, Vienna University of Technology

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff:
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/dictionaries

# Sensitive or high-churn files:
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml

.idea/misc.xml
.idea/modules.xml
.idea/vcs.xml

# Gradle:
.idea/**/gradle.xml
.idea/**/libraries

# CMake
cmake-build-debug/
cmake-build-release/

# Mongo Explorer plugin:
.idea/**/mongoSettings.xml

## File-based project format:
*.iws

## Plugin-specific files:

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Mac stuff
.DS_Store

# python byte code
*.pyc

# virtual environment directory
venv/

# recorded debug audio files
*.raw
*.wav

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Pepper Speech Recognition

This is a Python module that brings Google Speech Recognition to the Pepper robot by Aldebaran.
It was specifically implemented so it can run ON the robot as a NaoQi module, but it can also run on your computer.

## Features
* Automatically detects if and when a person speaks
* Automatically detects when a person stops speaking
* Calibration function to learn the background noise level
* Manual trigger mode (record now for 10 seconds, then recognize)
* Raises an ALMemory event that other modules can subscribe to
* Unlike other modules, this one does NOT depend on PyAudio/PortAudio
* Only dependencies: NumPy and the NaoQi Python SDK

## Setup to run on robot
Everything you need should already be installed. Simply clone the repository / copy the files to your robot, ssh into it and run

```
python module_speechrecognition.py
```

## Setup to run on computer

Like every NaoQi Python project, you need the NaoQi Python SDK. You can get it from [doc.aldebaran.com](http://doc.aldebaran.com/2-5/dev/python/install_guide.html).

To test it, simply run

```
python
import naoqi
```

Next you need the numpy module, the only third-party Python library used

```
pip install numpy
```

Test it:

```
python
import numpy
```

Now run the speechrecognition module

```
python module_speechrecognition.py --pip (your robot IP)
```

## Configuration
To configure the module, use an ALProxy instance obtained from the broker (the speechrecognition module needs to be running).

Here's a simple example of how to do that:

```
SpeechRecognition = ALProxy("SpeechRecognition")
SpeechRecognition.start()
SpeechRecognition.setLanguage("de-de")
SpeechRecognition.calibrate()
SpeechRecognition.enableAutoDetection()
```
Calling start() makes the module subscribe to the robot's audio data (ALAudioDevice). calibrate() makes it calculate the mean RMS value of the signal over 4 seconds and use this (multiplied by a safety factor of 1.5) as the auto-detection threshold for the background noise level.

You can also use the module like this to trigger a one-time recognition:
```
SpeechRecognition = ALProxy("SpeechRecognition")
SpeechRecognition.start()
SpeechRecognition.setLanguage("de-de")

# starts immediately, records for at least HOLD_TIME seconds and then tries to recognize
SpeechRecognition.startRecording()
```

You can use [module_receiver.py](module_receiver.py) as a template for implementing your own module.

## Subscribing to the ALMemory event
The speechrecognition module raises an ALMemory event every time a string is successfully recognized. You can subscribe to it in your module using the ALMemory proxy.

```
memory = naoqi.ALProxy("ALMemory")
memory.subscribeToEvent("SpeechRecognition", self.getName(), "processRemote")
```

"SpeechRecognition" is the name of the event, "processRemote" the name of your callback function.

```
def processRemote(self, signalName, message):
    # Do something with the received speech recognition result
    print(message)
```

### ROS Module
A simple ROS (Robot Operating System) module [is provided](ros_receiver.py) that uses [rospy](http://wiki.ros.org/rospy) to publish a ROS message whenever speech is recognized, carrying the recognized string as data. A minimal receiving node is sketched below.
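
For example, a sketch of the receiving side could look like this (not part of the repository; the node name is arbitrary, but the `phrase` topic is the one ros_receiver.py publishes on):

```
import rospy
from std_msgs.msg import String

def on_phrase(msg):
    # msg.data carries the recognized string
    print(msg.data)

rospy.init_node('phrase_listener', anonymous=True)
rospy.Subscriber('phrase', String, on_phrase)
rospy.spin()
```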

## Dependencies
The speechrecognition module was built to run ON Pepper (where you can't easily install 3rd-party libraries, and where efficiency matters), so its only dependencies are
* Python 2.7
* numpy
* naoqi Python SDK

All of them are pre-installed on Pepper.

## Built With

* Python 2.7
* [PyCharm](https://www.jetbrains.com/pycharm/) - Python IDE

## Changelog

**Version 1.0** - Initial release, May 30 2018

## Authors

* **Johannes Bramauer** @ Vienna University of Technology - *Initial work*

## License

This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details.

## Acknowledgments

* Anthony Zhang (Uberi) and his [SpeechRecognition](https://github.com/Uberi/speech_recognition) library

--------------------------------------------------------------------------------
/ros_receiver.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

###########################################################
# ROS module and NaoQi module that wraps the Google speech recognition
# and publishes a ROS message whenever speech is recognized
#
# Syntax:
#    python ros_receiver.py --pip <ip> --pport <port>
#
#    --pip : specify the IP of your robot (if omitted, the NAO_IP defined below is used)
#
# Author: Johannes Bramauer
# Created: May 30, 2018
# License: MIT
###########################################################

# NAO_PORT = 65445 # Virtual Machine
NAO_PORT = 9559 # Robot

# NAO_IP = "127.0.0.1" # Virtual Machine
NAO_IP = "nao.local" # Pepper


from optparse import OptionParser
import naoqi
import time
import sys
from naoqi import ALProxy
import rospy
from std_msgs.msg import String

class ReceiverTestModule(naoqi.ALModule):
    """
    Use this object to get callbacks from the ALMemory of the naoqi world.
    Your callback needs to be a method with two parameters (variable name, value).
    """

    def __init__( self, strModuleName, strNaoIp ):
        try:
            naoqi.ALModule.__init__(self, strModuleName )
            self.BIND_PYTHON( self.getName(), "callback" )
            self.strNaoIp = strNaoIp

            # publishes every recognized phrase as a ROS message
            self.pub = rospy.Publisher('phrase', String, queue_size=10)

        except BaseException, err:
            print( "ERR: ReceiverTestModule: loading error: %s" % str(err) )

    # __init__ - end
    def __del__( self ):
        print( "INF: ReceiverTestModule.__del__: cleaning everything" )
        self.stop()

    def start( self ):
        memory = naoqi.ALProxy("ALMemory", self.strNaoIp, NAO_PORT)
        memory.subscribeToEvent("SpeechRecognition", self.getName(), "processRemote")
        print( "INF: ReceiverTestModule: started!" )


    def stop( self ):
        print( "INF: ReceiverTestModule: stopping..." )
        memory = naoqi.ALProxy("ALMemory", self.strNaoIp, NAO_PORT)
        memory.unsubscribe(self.getName())

        print( "INF: ReceiverTestModule: stopped!" )

    def version( self ):
        return "0.1"

    def processRemote(self, signalName, message):
        self.pub.publish(message)
        print(message)


def main():
    """ Main entry point

    """
    parser = OptionParser()
    parser.add_option("--pip",
                      help="Parent broker IP. The IP address of your robot",
                      dest="pip")
    parser.add_option("--pport",
                      help="Parent broker port. The port NAOqi is listening to",
                      dest="pport",
                      type="int")
    parser.set_defaults(
        pip=NAO_IP,
        pport=NAO_PORT)

    (opts, args_) = parser.parse_args()
    pip = opts.pip
    pport = opts.pport

    # initialize the ROS node first, so the publisher created by the
    # receiver module can register before any phrase is published
    rospy.init_node('pepper_listen', anonymous=True)

    # We need this broker to be able to construct
    # NAOqi modules and subscribe to other modules
    # The broker must stay alive until the program exits
    myBroker = naoqi.ALBroker("myBroker",
                              "0.0.0.0",  # listen to anyone
                              0,          # find a free port and use it
                              pip,        # parent broker IP
                              pport)      # parent broker port

    try:
        p = ALProxy("ReceiverTestModule")
        p.exit()  # kill previous instance
    except:
        pass
    # Reinstantiate module

    # Warning: ReceiverTestModule must be a global variable
    # The name given to the constructor must be the name of the
    # variable
    global ReceiverTestModule
    ReceiverTestModule = ReceiverTestModule("ReceiverTestModule", pip)
    ReceiverTestModule.start()

    SpeechRecognition = ALProxy("SpeechRecognition")
    SpeechRecognition.start()
    #SpeechRecognition.setLanguage("de-de")
    SpeechRecognition.calibrate()
    SpeechRecognition.enableAutoDetection()

    try:
        while not rospy.is_shutdown():
            time.sleep(1)

    except KeyboardInterrupt:
        print "Interrupted by user, shutting down"

    except rospy.ROSInterruptException:
        print "Interrupted by ROS, shutting down"

    myBroker.shutdown()
    sys.exit(0)



if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/module_receiver.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

###########################################################
# This module receives results from the speechrecognition module and prints them to the console
#
# Syntax:
#    python module_receiver.py --pip <ip> --pport <port>
#
#    --pip : specify the IP of your robot (if omitted, the NAO_IP defined below is used)
#
# Author: Johannes Bramauer, Vienna University of Technology
# Created: May 30, 2018
# License: MIT
###########################################################

# NAO_PORT = 65445 # Virtual Machine
NAO_PORT = 9559 # Robot


# NAO_IP = "127.0.0.1" # Virtual Machine
NAO_IP = "nao.local" # Pepper default


from optparse import OptionParser
import naoqi
import time
import sys
from naoqi import ALProxy


class BaseSpeechReceiverModule(naoqi.ALModule):
    """
    Use this object to get callbacks from the ALMemory of the naoqi world.
    Your callback needs to be a method with two parameters (variable name, value).
    """

    def __init__( self, strModuleName, strNaoIp ):
        try:
            naoqi.ALModule.__init__(self, strModuleName )
            self.BIND_PYTHON( self.getName(), "callback" )
            self.strNaoIp = strNaoIp

        except BaseException, err:
            print( "ERR: ReceiverModule: loading error: %s" % str(err) )

    # __init__ - end
    def __del__( self ):
        print( "INF: ReceiverModule.__del__: cleaning everything" )
        self.stop()

    def start( self ):
        memory = naoqi.ALProxy("ALMemory", self.strNaoIp, NAO_PORT)
        memory.subscribeToEvent("SpeechRecognition", self.getName(), "processRemote")
        print( "INF: ReceiverModule: started!" )


    def stop( self ):
        print( "INF: ReceiverModule: stopping..." )
        memory = naoqi.ALProxy("ALMemory", self.strNaoIp, NAO_PORT)
        memory.unsubscribe(self.getName())

        print( "INF: ReceiverModule: stopped!" )

    def version( self ):
        return "1.1"

    def processRemote(self, signalName, message):
        # Do something with the received speech recognition result
        print(message)


def main():
    """ Main entry point

    """
    parser = OptionParser()
    parser.add_option("--pip",
                      help="Parent broker IP. The IP address of your robot",
                      dest="pip")
    parser.add_option("--pport",
                      help="Parent broker port. The port NAOqi is listening to",
                      dest="pport",
                      type="int")
    parser.set_defaults(
        pip=NAO_IP,
        pport=NAO_PORT)

    (opts, args_) = parser.parse_args()
    pip = opts.pip
    pport = opts.pport

    # We need this broker to be able to construct
    # NAOqi modules and subscribe to other modules
    # The broker must stay alive until the program exits
    myBroker = naoqi.ALBroker("myBroker",
                              "0.0.0.0",  # listen to anyone
                              0,          # find a free port and use it
                              pip,        # parent broker IP
                              pport)      # parent broker port

    try:
        p = ALProxy("BaseSpeechReceiverModule")
        p.exit()  # kill previous instance
    except:
        pass
    # Reinstantiate module

    # Warning: BaseSpeechReceiverModule must be a global variable
    # The name given to the constructor must be the name of the
    # variable
    global BaseSpeechReceiverModule
    BaseSpeechReceiverModule = BaseSpeechReceiverModule("BaseSpeechReceiverModule", pip)
    BaseSpeechReceiverModule.start()

    # toggle between one-shot recording and auto detection
    if(False):
        # one-shot recording for at least 5 seconds
        SpeechRecognition = ALProxy("SpeechRecognition")
        SpeechRecognition.start()
        SpeechRecognition.setHoldTime(5)
        SpeechRecognition.setIdleReleaseTime(1.7)
        SpeechRecognition.setMaxRecordingDuration(10)
        SpeechRecognition.startRecording()

    else:
        # auto detection
        SpeechRecognition = ALProxy("SpeechRecognition")
        SpeechRecognition.start()
        SpeechRecognition.setHoldTime(2.5)
        SpeechRecognition.setIdleReleaseTime(1.0)
        SpeechRecognition.setMaxRecordingDuration(10)
        SpeechRecognition.setLookaheadDuration(0.5)
        #SpeechRecognition.setLanguage("de-de")
        #SpeechRecognition.calibrate()
        SpeechRecognition.setAutoDetectionThreshold(5)
        SpeechRecognition.enableAutoDetection()
        #SpeechRecognition.startRecording()

    try:
        while True:
            time.sleep(1)

    except KeyboardInterrupt:
        print
        print "Interrupted by user, shutting down"
        myBroker.shutdown()
        sys.exit(0)



if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/google.py:
--------------------------------------------------------------------------------
# The recognize_google method is taken from the Uberi SpeechRecognition module:
# https://github.com/Uberi/speech_recognition
# Author: Anthony Zhang (Uberi)
# License: BSD (3 clause)
#
# A slight modification was made to support a contentType parameter and to make 16 bit LPCM the default
# Modification by Johannes Bramauer, Vienna University of Technology, May 30, 2018
#
# License:
# Copyright (c) 2014-2017, Anthony Zhang
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

from urllib import urlencode
from urllib2 import Request, urlopen, URLError, HTTPError
import json
import socket

class RequestError(Exception): pass

class UnknownValueError(Exception): pass


class Recognizer:

    # supported content types: "audio/l16" (16 bit signed LPCM), "audio/flac"
    # samplerate must be at least 8kHz

    def recognize_google(self, audio_data, samplerate, key=None, language="en-US", show_all=False, contentType="audio/l16"):
        """
        Performs speech recognition on ``audio_data``, using the Google Speech Recognition API.
        The Google Speech Recognition API key is specified by ``key``. If not specified, it uses a generic key that works out of the box. This should generally be used for personal or testing purposes only, as it **may be revoked by Google at any time**.
        To obtain your own API key, simply follow the steps on the `API Keys `__ page at the Chromium Developers site. In the Google Developers Console, Google Speech Recognition is listed as "Speech API".
        The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language tags can be found in this `StackOverflow answer `__.
        Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary.
        Raises an ``UnknownValueError`` exception if the speech is unintelligible. Raises a ``RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
        """
        assert key is None or isinstance(key, str), "``key`` must be ``None`` or a string"
        assert isinstance(language, str), "``language`` must be a string"

        if key is None: key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"

        url = "http://www.google.com/speech-api/v2/recognize?{}".format(urlencode({
            "client": "chromium",
            "lang": language,
            "key": key,
        }))

        # set default request timeout to 30 seconds
        socket.setdefaulttimeout(30)

        request = Request(url, data=audio_data,
                          headers={"Content-Type": "{}; rate={}".format(contentType, samplerate)})

        # obtain audio transcription results
        try:
            response = urlopen(request)
        except HTTPError as e:
            raise RequestError("recognition request failed: {}".format(e.reason))
        except URLError as e:
            raise RequestError("recognition connection failed: {}".format(e.reason))
        response_text = response.read().decode("utf-8")

        # ignore any blank blocks
        actual_result = []
        for line in response_text.split("\n"):
            if not line: continue
            result = json.loads(line)["result"]
            if len(result) != 0:
                actual_result = result[0]
                break

        # return results
        if show_all: return actual_result
        if not isinstance(actual_result, dict) or len(actual_result.get("alternative", [])) == 0:
            raise UnknownValueError()

        if "confidence" in actual_result["alternative"]:
            # return the alternative with the highest confidence score
            best_hypothesis = max(actual_result["alternative"], key=lambda alternative: alternative["confidence"])
        else:
            # when there is no confidence available, we arbitrarily choose the first hypothesis.
            best_hypothesis = actual_result["alternative"][0]
        if "transcript" not in best_hypothesis:
            raise UnknownValueError()
        return best_hypothesis["transcript"].encode("utf-8")
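

# Minimal self-test sketch (added for illustration, not part of the original Uberi
# code): recognize a 16 bit LPCM mono file recorded at 48 kHz. The filename is a
# placeholder for any raw recording, e.g. a debug file written by
# module_speechrecognition.py before its conversion to wav.
if __name__ == "__main__":
    with open("out.raw", "rb") as f:
        audio = f.read()
    print Recognizer().recognize_google(audio, 48000, language="en-us")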

--------------------------------------------------------------------------------
/module_speechrecognition.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

###########################################################
# Retrieve the robot audio buffer and do Google speech recognition
#
# Syntax:
#    python module_speechrecognition.py --pip <ip> --pport <port>
#
#    --pip : specify the IP of your robot (if omitted, the NAO_IP defined below is used)
#
# Author: Johannes Bramauer, Vienna University of Technology
# Created: May 30, 2018
# License: MIT
#
###########################################################
import socket

from raw_to_wav import rawToWav

NAO_IP = "127.0.0.1" # default, for running on Pepper
NAO_PORT = 9559

from optparse import OptionParser
import naoqi
import numpy as np
import time
import sys
import threading
from naoqi import ALProxy
from google import Recognizer, UnknownValueError, RequestError
from numpy import sqrt, mean, square
import traceback


RECORDING_DURATION = 10     # seconds, maximum recording time, also default value for startRecording(), Google Speech API only accepts up to about 10-15 seconds
LOOKAHEAD_DURATION = 1.0    # seconds, for auto-detect mode: amount of seconds before the threshold trigger that will be included in the request
IDLE_RELEASE_TIME = 2.0     # seconds, for auto-detect mode: idle time (RMS below threshold) after which we stop recording and recognize
HOLD_TIME = 3.0             # seconds, minimum recording time after we started recording (auto detection)
SAMPLE_RATE = 48000         # Hz, be careful changing this, both Google and NaoQi have requirements!

CALIBRATION_DURATION = 4            # seconds, timespan during which calibration is performed (summing up RMS values and calculating the mean)
CALIBRATION_THRESHOLD_FACTOR = 1.5  # factor the calculated mean RMS gets multiplied by to determine the auto detection threshold (after calibration)

DEFAULT_LANGUAGE = "en-us"  # RFC5646 language tag, e.g. "en-us", "de-de", "fr-fr", ...

WRITE_WAV_FILE = False  # write the recorded audio to "out.wav" before sending it to Google. intended for debugging purposes
PRINT_RMS = False       # prints the calculated RMS value to the console, useful for setting the threshold

PREBUFFER_WHEN_STOP = False  # fills the pre-buffer with the last samples when stopping a recording. WARNING: has performance issues!


class SpeechRecognitionModule(naoqi.ALModule):
    """
    Use this object to get callbacks from the ALMemory of the naoqi world.
    Your callback needs to be a method with two parameters (variable name, value).
    """

    def __init__( self, strModuleName, strNaoIp ):
        try:
            naoqi.ALModule.__init__(self, strModuleName )

            # bind the module to the broker so its methods can be
            # called remotely (as done in the NaoQi Python examples)
            self.BIND_PYTHON( self.getName(), "callback" )
            self.strNaoIp = strNaoIp

            # declare event to ALMemory so other modules can subscribe
            self.memory = naoqi.ALProxy("ALMemory")
            self.memory.declareEvent("SpeechRecognition")

            # flag to indicate if subscribed to audio events
            self.isStarted = False

            # flag to indicate if we are currently recording audio
            self.isRecording = False
            self.startRecordingTimestamp = 0
            self.recordingDuration = RECORDING_DURATION

            # flag to indicate if auto speech detection is enabled
            self.isAutoDetectionEnabled = False
            self.autoDetectionThreshold = 10 # TODO: find a default value that works fine so we don't need to calibrate every time

            # flag to indicate if we are calibrating
            self.isCalibrating = False
            self.startCalibrationTimestamp = 0

            # RMS calculation variables
            self.framesCount = 0
            self.rmsSum = 0 # used to sum up rms results and calculate the average
            self.lastTimeRMSPeak = 0

            # audio buffer
            self.buffer = []
            self.preBuffer = []
            self.preBufferLength = 0 # length in samples (len(self.preBuffer) just counts entries)

            # init parameters
            self.language = DEFAULT_LANGUAGE
            self.idleReleaseTime = IDLE_RELEASE_TIME
            self.holdTime = HOLD_TIME
            self.lookaheadBufferSize = LOOKAHEAD_DURATION * SAMPLE_RATE

            # counter for wav file output
            self.fileCounter = 0

        except BaseException, err:
            print( "ERR: SpeechRecognitionModule: loading error: %s" % str(err) )

    # __init__ - end
    def __del__( self ):
        print( "INF: SpeechRecognitionModule.__del__: cleaning everything" )
        self.stop()

    def start( self ):

        if(self.isStarted):
            print("INF: SpeechRecognitionModule.start: already running")
            return

        print("INF: SpeechRecognitionModule: starting!")

        self.isStarted = True

        audio = naoqi.ALProxy( "ALAudioDevice")
        nNbrChannelFlag = 0 # ALL_Channels: 0, AL::LEFTCHANNEL: 1, AL::RIGHTCHANNEL: 2, AL::FRONTCHANNEL: 3 or AL::REARCHANNEL: 4.
        nDeinterleave = 0
        audio.setClientPreferences( self.getName(), SAMPLE_RATE, nNbrChannelFlag, nDeinterleave ) # setting the same values as the defaults triggers a bug !?!
        audio.subscribe( self.getName() )

    def pause(self):
        print("INF: SpeechRecognitionModule.pause: stopping")
        if (self.isStarted == False):
            print("INF: SpeechRecognitionModule.pause: not running")
            return

        self.isStarted = False

        audio = naoqi.ALProxy("ALAudioDevice", self.strNaoIp, NAO_PORT)
        audio.unsubscribe(self.getName())

        print("INF: SpeechRecognitionModule: stopped!")

    def stop( self ):
        self.pause()

    def processRemote( self, nbOfChannels, nbrOfSamplesByChannel, aTimeStamp, buffer ):
        #print("INF: SpeechRecognitionModule: Processing '%s' channels" % nbOfChannels)

        # calculate a decimal seconds timestamp
        # aTimeStamp is a (seconds, microseconds) pair
        timestamp = aTimeStamp[0] + aTimeStamp[1] * 1e-6

        # the whole function body is wrapped in try/except so we can see the
        # stack trace - the audio callback thread otherwise fails silently
        try:

            aSoundDataInterlaced = np.fromstring( str(buffer), dtype=np.int16 )
            aSoundData = np.reshape( aSoundDataInterlaced, (nbOfChannels, nbrOfSamplesByChannel), 'F' )

            # compute RMS, handle auto detection and calibration
            if( self.isCalibrating or self.isAutoDetectionEnabled or self.isRecording):

                # compute the rms level on the front mic
                rmsMicFront = self.calcRMSLevel(self.convertStr2SignedInt(aSoundData[0]))

                if (rmsMicFront >= self.autoDetectionThreshold):
                    # save the timestamp when we last had an RMS > threshold
                    self.lastTimeRMSPeak = timestamp

                    # start recording if we are not doing so already
                    if (self.isAutoDetectionEnabled and not self.isRecording and not self.isCalibrating):
                        self.startRecording()

                # perform calibration
                if( self.isCalibrating):

                    if(self.startCalibrationTimestamp <= 0):
                        # we are starting to calibrate, save the timestamp
                        # to track how long we have been doing this
                        self.startCalibrationTimestamp = timestamp

                    elif(timestamp - self.startCalibrationTimestamp >= CALIBRATION_DURATION):
                        # time's up, we're done!
                        self.stopCalibration()

                    # sum up the rms values of the frames and
                    # keep track of how many frames we summed up
                    # to calculate the mean afterwards
                    self.rmsSum += rmsMicFront
                    self.framesCount = self.framesCount + 1


                if(PRINT_RMS):
                    # for debug purposes
                    # also use it to find a good threshold value for auto detection
                    print 'Mic RMS: ' + str(rmsMicFront)

            if( False ):
                # compute average (debug helper, disabled)
                aAvgValue = np.mean( aSoundData, axis = 1 )
                print( "avg: %s" % aAvgValue )
            if( False ):
                # compute fft (debug helper, disabled)
                nBlockSize = nbrOfSamplesByChannel
                signal = aSoundData[0] * np.hanning( nBlockSize )
                aFft = ( np.fft.rfft(signal) / nBlockSize )
                print aFft
            if( False ):
                # compute peak (debug helper, disabled)
                aPeakValue = np.max( aSoundData )
                if( aPeakValue > 16000 ):
                    print( "Peak: %s" % aPeakValue )

            if(not self.isCalibrating):
                if(self.isRecording):
                    # write to buffer
                    self.buffer.append(aSoundData)

                    if (self.startRecordingTimestamp <= 0):
                        # initialize the timestamp when we start recording
                        self.startRecordingTimestamp = timestamp
                    elif ((timestamp - self.startRecordingTimestamp) > self.recordingDuration):
                        # check how long we have been recording
                        print('stop after max recording duration')
                        self.stopRecordingAndRecognize()

                    # stop recording after the idle time (having recorded for at least the hold time)
                    # lastTimeRMSPeak is 0 if no peak occurred
                    if (timestamp - self.lastTimeRMSPeak >= self.idleReleaseTime) and (
                            timestamp - self.startRecordingTimestamp >= self.holdTime):
                        print ('stopping after idle/hold time')
                        self.stopRecordingAndRecognize()
                else:
                    # constantly record into the prebuffer for the lookahead
                    self.preBuffer.append(aSoundData)
                    self.preBufferLength += len(aSoundData[0])

                    # remove the first (oldest) item if the buffer gets bigger than required
                    # this removes one block of samples, as we store a list of lists...
                    overshoot = (self.preBufferLength - self.lookaheadBufferSize)

                    if((overshoot > 0) and (len(self.preBuffer) > 0)):
                        self.preBufferLength -= len(self.preBuffer.pop(0)[0])

        except:
            # print the stack trace - the audio callback thread
            # otherwise just fails silently
            traceback.print_exc()

    # processRemote - end

    def calcRMSLevel(self, data):
        rms = (sqrt(mean(square(data))))
        # TODO: maybe a log scale would be better for the threshold?
        #rms = 20 * np.log10(np.sqrt(np.sum(np.power(data, 2) / len(data))))
        return rms

    def version( self ):
        return "1.1"


    # use this method to manually start recording (works with auto detection enabled or disabled)
    # the recording will stop after the signal is below the threshold for IDLE_RELEASE_TIME seconds,
    # but will record for at least HOLD_TIME seconds
    def startRecording(self):
        if(self.isRecording):
            print("INF: SpeechRecognitionModule.startRecording: already recording")
            return

        print("INF: Starting to record audio")

        # start recording, beginning with the lookahead prebuffer
        # so the start of the utterance is not cut off
        self.startRecordingTimestamp = 0
        self.lastTimeRMSPeak = 0
        self.buffer = self.preBuffer

        #self.preBuffer = []

        self.isRecording = True

        return

    def stopRecordingAndRecognize(self):
        if(self.isRecording == False):
            print("INF: SpeechRecognitionModule.stopRecordingAndRecognize: not recording")
            return

        print("INF: stopping recording and recognizing")

        # TODO: choose which mic channel to use
        # can we use the sound direction module for this?

        # buffer is a list of nparrays that we now concat into one array,
        # then we slice out the first mic channel
        samples = np.concatenate(self.buffer, axis=1)[0]

        # initialize the prebuffer with the last samples to fix cut-off words
        # loop through the buffer and count samples until the prebuffer is full
        # TODO: performance issues!
        if (PREBUFFER_WHEN_STOP):
            sampleCounter = 0
            itemCounter = 0

            for i in reversed(self.preBuffer):
                sampleCounter += len(i[0])

                if(sampleCounter > self.lookaheadBufferSize):
                    break

                itemCounter += 1

            start = len(self.buffer) - itemCounter
            self.preBuffer = self.buffer[start:]
        else:
            # don't copy to the prebuffer
            self.preBuffer = []

        # start a new worker thread to do the http call and some processing
        # copy the samples to be thread safe!
        # TODO: make a job queue so we don't start a new thread for each recognition
        threading.Thread(target=self.recognize, args=(samples.copy(), )).start()

        # reset flag
        self.isRecording = False

        return

    def calibrate(self):
        self.isCalibrating = True
        self.framesCount = 0
        self.rmsSum = 0
        self.startCalibrationTimestamp = 0

        print("INF: starting calibration")

        if(self.isStarted == False):
            self.start()

        return

    def stopCalibration(self):
        if(self.isCalibrating == False):
            print("INF: SpeechRecognitionModule.stopCalibration: not calibrating")
            return

        self.isCalibrating = False

        # calculate the average rms over self.framesCount frames and apply
        # the safety factor to get the new auto detection threshold
        self.autoDetectionThreshold = CALIBRATION_THRESHOLD_FACTOR * (self.rmsSum / self.framesCount)
        print 'calibration done, RMS threshold is: ' + str(self.autoDetectionThreshold)
        return

    def enableAutoDetection(self):
        self.isAutoDetectionEnabled = True
        print("INF: autoDetection enabled")
        return

    def disableAutoDetection(self):
        self.isAutoDetectionEnabled = False
        return

    def setLanguage(self, language = DEFAULT_LANGUAGE):
        self.language = language
        return

    # used for RMS calculation
    def convertStr2SignedInt(self, data):
        """
        This function takes a buffer of 16 bit little endian sound
        samples as input and returns a vector containing the sound
        sample values converted to the range -1 to 1.
        """

        # from the naoqi sample, but rewritten to use numpy methods instead of for loops

        lsb = data[0::2]
        msb = data[1::2]

        # don't remove the .0, otherwise overflow!
        rms_data = np.add(lsb, np.multiply(msb, 256.0))

        # gives an array that contains -65536 at every position where rms_data is >= 32768
        sign_correction = np.select([rms_data>=32768], [-65536])

        # add the two to get the correct signed values
        rms_data = np.add(rms_data, sign_correction)
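
        # Worked example for illustration: the little endian sample 0x1234
        # arrives as lsb=0x34 (52), msb=0x12 (18), and 52 + 18*256 = 4660 =
        # 0x1234. A negative sample 0xFFFE arrives as lsb=254, msb=255:
        # 254 + 255*256 = 65534, and the sign correction yields
        # 65534 - 65536 = -2.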

        # normalize the values to -1.0 ... +1.0
        rms_data = np.divide(rms_data, 32768.0)

        return rms_data

    def recognize(self, data):
        # print 'sending %d bytes' % len(data)

        if (WRITE_WAV_FILE):
            # write the recording to a file for debugging
            filename = "out" + str(self.fileCounter)
            self.fileCounter += 1
            outfile = open(filename + ".raw", "wb")
            data.tofile(outfile)
            outfile.close()
            rawToWav(filename)

        buffer = np.getbuffer(data)

        r = Recognizer()
        try:
            result = r.recognize_google(audio_data=buffer, samplerate=SAMPLE_RATE, language=self.language)
            self.memory.raiseEvent("SpeechRecognition", result)
            print 'RESULT: ' + result
        except UnknownValueError:
            print 'ERR: Recognition error'
        except RequestError, e:
            print 'ERR: ' + str(e)
        except socket.timeout:
            print 'ERR: Socket timeout'
        except:
            print 'ERR: Unknown, probably timeout ' + str(sys.exc_info()[0])

    def setAutoDetectionThreshold(self, threshold):
        self.autoDetectionThreshold = threshold

    def setIdleReleaseTime(self, releaseTime):
        self.idleReleaseTime = releaseTime

    def setHoldTime(self, holdTime):
        self.holdTime = holdTime

    def setMaxRecordingDuration(self, duration):
        self.recordingDuration = duration

    def setLookaheadDuration(self, duration):
        self.lookaheadBufferSize = duration * SAMPLE_RATE
        self.preBuffer = []
        self.preBufferLength = 0

# SpeechRecognition - end


def getAudioDuration(audio_data):
    # duration in seconds of a mono sample buffer recorded at SAMPLE_RATE
    return len(audio_data) / float(SAMPLE_RATE)

def main():
    """ Main entry point

    """
    parser = OptionParser()
    parser.add_option("--pip",
                      help="Parent broker IP. The IP address of your robot",
                      dest="pip")
    parser.add_option("--pport",
                      help="Parent broker port. The port NAOqi is listening to",
                      dest="pport",
                      type="int")
    parser.set_defaults(
        pip=NAO_IP,
        pport=NAO_PORT)

    (opts, args_) = parser.parse_args()
    pip = opts.pip
    pport = opts.pport

    # We need this broker to be able to construct
    # NAOqi modules and subscribe to other modules
    # The broker must stay alive until the program exits
    myBroker = naoqi.ALBroker("myBroker",
                              "0.0.0.0",  # listen to anyone
                              0,          # find a free port and use it
                              pip,        # parent broker IP
                              pport)      # parent broker port

    try:
        p = ALProxy("SpeechRecognition")
        p.exit()  # kill previous instance, useful for developing ;)
    except:
        pass

    # Reinstantiate module

    # Warning: SpeechRecognition must be a global variable
    # The name given to the constructor must be the name of the
    # variable
    global SpeechRecognition
    SpeechRecognition = SpeechRecognitionModule("SpeechRecognition", pip)

    # uncomment for debug purposes
    # usually a subscribing client will call start() from ALProxy
    #SpeechRecognition.start()
    #SpeechRecognition.startRecording()
    #SpeechRecognition.calibrate()
    #SpeechRecognition.enableAutoDetection()
    #SpeechRecognition.startRecording()

    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print
        print "Interrupted by user, shutting down"
        myBroker.shutdown()
        sys.exit(0)



if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------