├── requirements.txt
├── raw_to_wav.py
├── LICENSE.md
├── .gitignore
├── README.md
├── ros_receiver.py
├── module_receiver.py
├── google.py
└── module_speechrecognition.py

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
numpy

--------------------------------------------------------------------------------
/raw_to_wav.py:
--------------------------------------------------------------------------------
###########################################################
# Converts a raw audio file in the project directory to a wav file,
# e.g. rawToWav("out0") turns "out0.raw" into "out0.wav" and
# deletes the raw file.
#
# Author: Johannes Bramauer, Vienna University of Technology
# Created: May 30, 2018
# License: MIT
#
###########################################################
import wave
import os

def rawToWav(filename):

    rawfile = filename + ".raw"
    if not os.path.isfile(rawfile):
        return

    # 48 kHz, mono, 16 bit signed LPCM - the format recorded by
    # module_speechrecognition.py
    outfile = wave.open(filename + ".wav", "wb")
    outfile.setframerate(48000)
    outfile.setnchannels(1)
    outfile.setsampwidth(2)

    f = open(rawfile, "rb")
    sample = f.read(4096)
    print 'writing file: ' + filename + '.wav'

    while sample != "":
        outfile.writeframes(sample)
        sample = f.read(4096)

    f.close()
    outfile.close()

    os.remove(rawfile)

--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
Copyright 2018 Johannes Bramauer, Vienna University of Technology

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff:
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/dictionaries

# Sensitive or high-churn files:
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml

.idea/misc.xml
.idea/modules.xml
.idea/vcs.xml

# Gradle:
.idea/**/gradle.xml
.idea/**/libraries

# CMake
cmake-build-debug/
cmake-build-release/

# Mongo Explorer plugin:
.idea/**/mongoSettings.xml

## File-based project format:
*.iws

## Plugin-specific files:

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Mac stuff
.DS_Store

# python byte code
*.pyc

# virtual environment directory
venv/

# recorded debug audio files
*.raw
*.wav

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Pepper Speech Recognition

This is a Python module that brings Google Speech Recognition to the Pepper robot by Aldebaran.
It was specifically implemented so it can run ON the robot as a NaoQi module, but it can also run on your computer.

## Features
* Automatically detects if and when a person speaks
* Automatically detects when a person stops speaking
* Calibration function to learn the background noise level
* Manual trigger mode (record now for 10 seconds, then recognize)
* Raises an ALMemory event that other modules can subscribe to
* Unlike other modules, this one does NOT depend on PyAudio/PortAudio
* Only dependencies: NumPy and the NaoQi Python SDK

## Setup to run on robot
Everything you need should already be installed. Simply clone the repository / copy the files to your robot, ssh into it and run

```
python module_speechrecognition.py
```

## Setup to run on computer

Like every NaoQi Python project, you need the NaoQi Python SDK. You can get it from [doc.aldebaran.com](http://doc.aldebaran.com/2-5/dev/python/install_guide.html).

To test it, simply run

```
python
import naoqi
```

Next you need the numpy module, the only third-party Python library used

```
pip install numpy
```

Test it:

```
python
import numpy
```

Now run the speechrecognition module

```
python module_speechrecognition.py --pip (your robot IP)
```

## Configuration
To configure the module, use an ALProxy instance obtained from the broker (the speechrecognition module needs to be running).

Here's a simple example of how to do that:

```
SpeechRecognition = ALProxy("SpeechRecognition")
SpeechRecognition.start()
SpeechRecognition.setLanguage("de-de")
SpeechRecognition.calibrate()
SpeechRecognition.enableAutoDetection()
```
Calling start() makes the module subscribe to the robot's audio data (ALAudioDevice). calibrate() makes it calculate the mean RMS value of the signal over 4 seconds and use this (multiplied by a safety factor of 1.5) as the auto-detection threshold for the background noise level.

You can also use the module like this to trigger a one-time recognition:
```
SpeechRecognition = ALProxy("SpeechRecognition")
SpeechRecognition.start()
SpeechRecognition.setLanguage("de-de")

# starts immediately, records for at least HOLD_TIME seconds and then tries to recognize
SpeechRecognition.startRecording()
```

You can use [module_receiver.py](module_receiver.py) as a template for implementing your own module.

## Subscribing to the ALMemory event
The speechrecognition module raises an ALMemory event every time a string is successfully recognized. You can subscribe to it in your module using the ALMemory proxy.

```
memory = naoqi.ALProxy("ALMemory")
memory.subscribeToEvent("SpeechRecognition", self.getName(), "processRemote")
```

"SpeechRecognition" is the name of the event, "processRemote" the name of your callback function.

```
def processRemote(self, signalName, message):
    # Do something with the received speech recognition result
    print(message)
```

### ROS Module
A simple ROS (Robot Operating System) module [is provided](ros_receiver.py) that uses [rospy](http://wiki.ros.org/rospy) to publish a ROS message whenever speech is recognized, carrying the recognized string as data. A minimal receiving node is sketched below.
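
For example, a sketch of the receiving side could look like this (not part of the repository; the node name is arbitrary, but the `phrase` topic is the one ros_receiver.py publishes on):

```
import rospy
from std_msgs.msg import String

def on_phrase(msg):
    # msg.data carries the recognized string
    print(msg.data)

rospy.init_node('phrase_listener', anonymous=True)
rospy.Subscriber('phrase', String, on_phrase)
rospy.spin()
```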

## Dependencies
The speechrecognition module was built to run ON Pepper (where you can't easily install 3rd-party libraries, and where efficiency matters), so its only dependencies are
* Python 2.7
* numpy
* naoqi Python SDK

All of them are pre-installed on Pepper.

## Built With

* Python 2.7
* [PyCharm](https://www.jetbrains.com/pycharm/) - Python IDE

## Changelog

**Version 1.0** - Initial release, May 30 2018

## Authors

* **Johannes Bramauer** @ Vienna University of Technology - *Initial work*

## License

This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details.

## Acknowledgments

* Anthony Zhang (Uberi) and his [SpeechRecognition](https://github.com/Uberi/speech_recognition) library

--------------------------------------------------------------------------------
/ros_receiver.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

###########################################################
# ROS module and NaoQi module that wraps the Google speech recognition
# and publishes a ROS message whenever speech is recognized
#
# Syntax:
#    python ros_receiver.py --pip <ip> --pport <port>
#
#    --pip : specify the IP of your robot (if omitted, the NAO_IP defined below is used)
#
# Author: Johannes Bramauer
# Created: May 30, 2018
# License: MIT
###########################################################

# NAO_PORT = 65445 # Virtual Machine
NAO_PORT = 9559 # Robot

# NAO_IP = "127.0.0.1" # Virtual Machine
NAO_IP = "nao.local" # Pepper


from optparse import OptionParser
import naoqi
import time
import sys
from naoqi import ALProxy
import rospy
from std_msgs.msg import String

class ReceiverTestModule(naoqi.ALModule):
    """
    Use this object to get callbacks from the ALMemory of the naoqi world.
    Your callback needs to be a method with two parameters (variable name, value).
    """

    def __init__( self, strModuleName, strNaoIp ):
        try:
            naoqi.ALModule.__init__(self, strModuleName )
            self.BIND_PYTHON( self.getName(), "callback" )
            self.strNaoIp = strNaoIp

            # publishes every recognized phrase as a ROS message
            self.pub = rospy.Publisher('phrase', String, queue_size=10)

        except BaseException, err:
            print( "ERR: ReceiverTestModule: loading error: %s" % str(err) )

    # __init__ - end
    def __del__( self ):
        print( "INF: ReceiverTestModule.__del__: cleaning everything" )
        self.stop()

    def start( self ):
        memory = naoqi.ALProxy("ALMemory", self.strNaoIp, NAO_PORT)
        memory.subscribeToEvent("SpeechRecognition", self.getName(), "processRemote")
        print( "INF: ReceiverTestModule: started!" )


    def stop( self ):
        print( "INF: ReceiverTestModule: stopping..." )
        memory = naoqi.ALProxy("ALMemory", self.strNaoIp, NAO_PORT)
        memory.unsubscribe(self.getName())

        print( "INF: ReceiverTestModule: stopped!" )

    def version( self ):
        return "0.1"

    def processRemote(self, signalName, message):
        self.pub.publish(message)
        print(message)


def main():
    """ Main entry point

    """
    parser = OptionParser()
    parser.add_option("--pip",
                      help="Parent broker IP. The IP address of your robot",
                      dest="pip")
    parser.add_option("--pport",
                      help="Parent broker port. The port NAOqi is listening to",
                      dest="pport",
                      type="int")
    parser.set_defaults(
        pip=NAO_IP,
        pport=NAO_PORT)

    (opts, args_) = parser.parse_args()
    pip = opts.pip
    pport = opts.pport

    # initialize the ROS node first, so the publisher created by the
    # receiver module can register before any phrase is published
    rospy.init_node('pepper_listen', anonymous=True)

    # We need this broker to be able to construct
    # NAOqi modules and subscribe to other modules
    # The broker must stay alive until the program exits
    myBroker = naoqi.ALBroker("myBroker",
                              "0.0.0.0",  # listen to anyone
                              0,          # find a free port and use it
                              pip,        # parent broker IP
                              pport)      # parent broker port

    try:
        p = ALProxy("ReceiverTestModule")
        p.exit()  # kill previous instance
    except:
        pass
    # Reinstantiate module

    # Warning: ReceiverTestModule must be a global variable
    # The name given to the constructor must be the name of the
    # variable
    global ReceiverTestModule
    ReceiverTestModule = ReceiverTestModule("ReceiverTestModule", pip)
    ReceiverTestModule.start()

    SpeechRecognition = ALProxy("SpeechRecognition")
    SpeechRecognition.start()
    #SpeechRecognition.setLanguage("de-de")
    SpeechRecognition.calibrate()
    SpeechRecognition.enableAutoDetection()

    try:
        while not rospy.is_shutdown():
            time.sleep(1)

    except KeyboardInterrupt:
        print "Interrupted by user, shutting down"

    except rospy.ROSInterruptException:
        print "Interrupted by ROS, shutting down"

    myBroker.shutdown()
    sys.exit(0)



if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/module_receiver.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

###########################################################
# This module receives results from the speechrecognition module and prints them to the console
#
# Syntax:
#    python module_receiver.py --pip <ip> --pport <port>
#
#    --pip : specify the IP of your robot (if omitted, the NAO_IP defined below is used)
#
# Author: Johannes Bramauer, Vienna University of Technology
# Created: May 30, 2018
# License: MIT
###########################################################

# NAO_PORT = 65445 # Virtual Machine
NAO_PORT = 9559 # Robot


# NAO_IP = "127.0.0.1" # Virtual Machine
NAO_IP = "nao.local" # Pepper default


from optparse import OptionParser
import naoqi
import time
import sys
from naoqi import ALProxy


class BaseSpeechReceiverModule(naoqi.ALModule):
    """
    Use this object to get callbacks from the ALMemory of the naoqi world.
    Your callback needs to be a method with two parameters (variable name, value).
    """

    def __init__( self, strModuleName, strNaoIp ):
        try:
            naoqi.ALModule.__init__(self, strModuleName )
            self.BIND_PYTHON( self.getName(), "callback" )
            self.strNaoIp = strNaoIp

        except BaseException, err:
            print( "ERR: ReceiverModule: loading error: %s" % str(err) )

    # __init__ - end
    def __del__( self ):
        print( "INF: ReceiverModule.__del__: cleaning everything" )
        self.stop()

    def start( self ):
        memory = naoqi.ALProxy("ALMemory", self.strNaoIp, NAO_PORT)
        memory.subscribeToEvent("SpeechRecognition", self.getName(), "processRemote")
        print( "INF: ReceiverModule: started!" )


    def stop( self ):
        print( "INF: ReceiverModule: stopping..." )
        memory = naoqi.ALProxy("ALMemory", self.strNaoIp, NAO_PORT)
        memory.unsubscribe(self.getName())

        print( "INF: ReceiverModule: stopped!" )

    def version( self ):
        return "1.1"

    def processRemote(self, signalName, message):
        # Do something with the received speech recognition result
        print(message)


def main():
    """ Main entry point

    """
    parser = OptionParser()
    parser.add_option("--pip",
                      help="Parent broker IP. The IP address of your robot",
                      dest="pip")
    parser.add_option("--pport",
                      help="Parent broker port. The port NAOqi is listening to",
                      dest="pport",
                      type="int")
    parser.set_defaults(
        pip=NAO_IP,
        pport=NAO_PORT)

    (opts, args_) = parser.parse_args()
    pip = opts.pip
    pport = opts.pport

    # We need this broker to be able to construct
    # NAOqi modules and subscribe to other modules
    # The broker must stay alive until the program exits
    myBroker = naoqi.ALBroker("myBroker",
                              "0.0.0.0",  # listen to anyone
                              0,          # find a free port and use it
                              pip,        # parent broker IP
                              pport)      # parent broker port

    try:
        p = ALProxy("BaseSpeechReceiverModule")
        p.exit()  # kill previous instance
    except:
        pass
    # Reinstantiate module

    # Warning: BaseSpeechReceiverModule must be a global variable
    # The name given to the constructor must be the name of the
    # variable
    global BaseSpeechReceiverModule
    BaseSpeechReceiverModule = BaseSpeechReceiverModule("BaseSpeechReceiverModule", pip)
    BaseSpeechReceiverModule.start()

    # toggle between one-shot recording and auto detection
    if(False):
        # one-shot recording for at least 5 seconds
        SpeechRecognition = ALProxy("SpeechRecognition")
        SpeechRecognition.start()
        SpeechRecognition.setHoldTime(5)
        SpeechRecognition.setIdleReleaseTime(1.7)
        SpeechRecognition.setMaxRecordingDuration(10)
        SpeechRecognition.startRecording()

    else:
        # auto detection
        SpeechRecognition = ALProxy("SpeechRecognition")
        SpeechRecognition.start()
        SpeechRecognition.setHoldTime(2.5)
        SpeechRecognition.setIdleReleaseTime(1.0)
        SpeechRecognition.setMaxRecordingDuration(10)
        SpeechRecognition.setLookaheadDuration(0.5)
        #SpeechRecognition.setLanguage("de-de")
        #SpeechRecognition.calibrate()
        SpeechRecognition.setAutoDetectionThreshold(5)
        SpeechRecognition.enableAutoDetection()
        #SpeechRecognition.startRecording()

    try:
        while True:
            time.sleep(1)

    except KeyboardInterrupt:
        print
        print "Interrupted by user, shutting down"
        myBroker.shutdown()
        sys.exit(0)



if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/google.py:
--------------------------------------------------------------------------------
# The recognize_google method is taken from the Uberi SpeechRecognition module:
# https://github.com/Uberi/speech_recognition
# Author: Anthony Zhang (Uberi)
# License: BSD (3 clause)
#
# A slight modification was made to support a contentType parameter and to make 16 bit LPCM the default
# Modification by Johannes Bramauer, Vienna University of Technology, May 30, 2018
#
# License:
# Copyright (c) 2014-2017, Anthony Zhang
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

from urllib import urlencode
from urllib2 import Request, urlopen, URLError, HTTPError
import json
import socket

class RequestError(Exception): pass

class UnknownValueError(Exception): pass


class Recognizer:

    # supported content types: "audio/l16" (16 bit signed LPCM), "audio/flac"
    # samplerate must be at least 8kHz

    def recognize_google(self, audio_data, samplerate, key=None, language="en-US", show_all=False, contentType="audio/l16"):
        """
        Performs speech recognition on ``audio_data``, using the Google Speech Recognition API.
        The Google Speech Recognition API key is specified by ``key``. If not specified, it uses a generic key that works out of the box. This should generally be used for personal or testing purposes only, as it **may be revoked by Google at any time**.
        To obtain your own API key, simply follow the steps on the `API Keys `__ page at the Chromium Developers site. In the Google Developers Console, Google Speech Recognition is listed as "Speech API".
        The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language tags can be found in this `StackOverflow answer `__.
        Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary.
        Raises an ``UnknownValueError`` exception if the speech is unintelligible. Raises a ``RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
        """
        assert key is None or isinstance(key, str), "``key`` must be ``None`` or a string"
        assert isinstance(language, str), "``language`` must be a string"

        if key is None: key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"

        url = "http://www.google.com/speech-api/v2/recognize?{}".format(urlencode({
            "client": "chromium",
            "lang": language,
            "key": key,
        }))

        # set default request timeout to 30 seconds
        socket.setdefaulttimeout(30)

        request = Request(url, data=audio_data,
                          headers={"Content-Type": "{}; rate={}".format(contentType, samplerate)})

        # obtain audio transcription results
        try:
            response = urlopen(request)
        except HTTPError as e:
            raise RequestError("recognition request failed: {}".format(e.reason))
        except URLError as e:
            raise RequestError("recognition connection failed: {}".format(e.reason))
        response_text = response.read().decode("utf-8")

        # ignore any blank blocks
        actual_result = []
        for line in response_text.split("\n"):
            if not line: continue
            result = json.loads(line)["result"]
            if len(result) != 0:
                actual_result = result[0]
                break

        # return results
        if show_all: return actual_result
        if not isinstance(actual_result, dict) or len(actual_result.get("alternative", [])) == 0:
            raise UnknownValueError()

        if "confidence" in actual_result["alternative"]:
            # return the alternative with the highest confidence score
            best_hypothesis = max(actual_result["alternative"], key=lambda alternative: alternative["confidence"])
        else:
            # when there is no confidence available, we arbitrarily choose the first hypothesis.
            best_hypothesis = actual_result["alternative"][0]
        if "transcript" not in best_hypothesis:
            raise UnknownValueError()
        return best_hypothesis["transcript"].encode("utf-8")
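

# Minimal self-test sketch (added for illustration, not part of the original Uberi
# code): recognize a 16 bit LPCM mono file recorded at 48 kHz. The filename is a
# placeholder for any raw recording, e.g. a debug file written by
# module_speechrecognition.py before its conversion to wav.
if __name__ == "__main__":
    with open("out.raw", "rb") as f:
        audio = f.read()
    print Recognizer().recognize_google(audio, 48000, language="en-us")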

--------------------------------------------------------------------------------
/module_speechrecognition.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

###########################################################
# Retrieve the robot audio buffer and do Google speech recognition
#
# Syntax:
#    python module_speechrecognition.py --pip <ip> --pport <port>
#
#    --pip : specify the IP of your robot (if omitted, the NAO_IP defined below is used)
#
# Author: Johannes Bramauer, Vienna University of Technology
# Created: May 30, 2018
# License: MIT
#
###########################################################
import socket

from raw_to_wav import rawToWav

NAO_IP = "127.0.0.1" # default, for running on Pepper
NAO_PORT = 9559

from optparse import OptionParser
import naoqi
import numpy as np
import time
import sys
import threading
from naoqi import ALProxy
from google import Recognizer, UnknownValueError, RequestError
from numpy import sqrt, mean, square
import traceback


RECORDING_DURATION = 10     # seconds, maximum recording time, also default value for startRecording(), Google Speech API only accepts up to about 10-15 seconds
LOOKAHEAD_DURATION = 1.0    # seconds, for auto-detect mode: amount of seconds before the threshold trigger that will be included in the request
IDLE_RELEASE_TIME = 2.0     # seconds, for auto-detect mode: idle time (RMS below threshold) after which we stop recording and recognize
HOLD_TIME = 3.0             # seconds, minimum recording time after we started recording (auto detection)
SAMPLE_RATE = 48000         # Hz, be careful changing this, both Google and NaoQi have requirements!

CALIBRATION_DURATION = 4            # seconds, timespan during which calibration is performed (summing up RMS values and calculating the mean)
CALIBRATION_THRESHOLD_FACTOR = 1.5  # factor the calculated mean RMS gets multiplied by to determine the auto detection threshold (after calibration)

DEFAULT_LANGUAGE = "en-us"  # RFC5646 language tag, e.g. "en-us", "de-de", "fr-fr", ...

WRITE_WAV_FILE = False  # write the recorded audio to "out.wav" before sending it to Google. intended for debugging purposes
PRINT_RMS = False       # prints the calculated RMS value to the console, useful for setting the threshold

PREBUFFER_WHEN_STOP = False  # fills the pre-buffer with the last samples when stopping a recording. WARNING: has performance issues!


class SpeechRecognitionModule(naoqi.ALModule):
    """
    Use this object to get callbacks from the ALMemory of the naoqi world.
    Your callback needs to be a method with two parameters (variable name, value).
    """

    def __init__( self, strModuleName, strNaoIp ):
        try:
            naoqi.ALModule.__init__(self, strModuleName )

            # bind the module to the broker so its methods can be
            # called remotely (as done in the NaoQi Python examples)
            self.BIND_PYTHON( self.getName(), "callback" )
            self.strNaoIp = strNaoIp

            # declare event to ALMemory so other modules can subscribe
            self.memory = naoqi.ALProxy("ALMemory")
            self.memory.declareEvent("SpeechRecognition")

            # flag to indicate if subscribed to audio events
            self.isStarted = False

            # flag to indicate if we are currently recording audio
            self.isRecording = False
            self.startRecordingTimestamp = 0
            self.recordingDuration = RECORDING_DURATION

            # flag to indicate if auto speech detection is enabled
            self.isAutoDetectionEnabled = False
            self.autoDetectionThreshold = 10 # TODO: find a default value that works fine so we don't need to calibrate every time

            # flag to indicate if we are calibrating
            self.isCalibrating = False
            self.startCalibrationTimestamp = 0

            # RMS calculation variables
            self.framesCount = 0
            self.rmsSum = 0 # used to sum up rms results and calculate the average
            self.lastTimeRMSPeak = 0

            # audio buffer
            self.buffer = []
            self.preBuffer = []
            self.preBufferLength = 0 # length in samples (len(self.preBuffer) just counts entries)

            # init parameters
            self.language = DEFAULT_LANGUAGE
            self.idleReleaseTime = IDLE_RELEASE_TIME
            self.holdTime = HOLD_TIME
            self.lookaheadBufferSize = LOOKAHEAD_DURATION * SAMPLE_RATE

            # counter for wav file output
            self.fileCounter = 0

        except BaseException, err:
            print( "ERR: SpeechRecognitionModule: loading error: %s" % str(err) )

    # __init__ - end
    def __del__( self ):
        print( "INF: SpeechRecognitionModule.__del__: cleaning everything" )
        self.stop()

    def start( self ):

        if(self.isStarted):
            print("INF: SpeechRecognitionModule.start: already running")
            return

        print("INF: SpeechRecognitionModule: starting!")

        self.isStarted = True

        audio = naoqi.ALProxy( "ALAudioDevice")
        nNbrChannelFlag = 0 # ALL_Channels: 0, AL::LEFTCHANNEL: 1, AL::RIGHTCHANNEL: 2, AL::FRONTCHANNEL: 3 or AL::REARCHANNEL: 4.
        nDeinterleave = 0
        audio.setClientPreferences( self.getName(), SAMPLE_RATE, nNbrChannelFlag, nDeinterleave ) # setting the same values as the defaults triggers a bug !?!
        audio.subscribe( self.getName() )

    def pause(self):
        print("INF: SpeechRecognitionModule.pause: stopping")
        if (self.isStarted == False):
            print("INF: SpeechRecognitionModule.pause: not running")
            return

        self.isStarted = False

        audio = naoqi.ALProxy("ALAudioDevice", self.strNaoIp, NAO_PORT)
        audio.unsubscribe(self.getName())

        print("INF: SpeechRecognitionModule: stopped!")

    def stop( self ):
        self.pause()

    def processRemote( self, nbOfChannels, nbrOfSamplesByChannel, aTimeStamp, buffer ):
        #print("INF: SpeechRecognitionModule: Processing '%s' channels" % nbOfChannels)

        # calculate a decimal seconds timestamp
        # aTimeStamp is a (seconds, microseconds) pair
        timestamp = aTimeStamp[0] + aTimeStamp[1] * 1e-6

        # the whole function body is wrapped in try/except so we can see the
        # stack trace - the audio callback thread otherwise fails silently
        try:

            aSoundDataInterlaced = np.fromstring( str(buffer), dtype=np.int16 )
            aSoundData = np.reshape( aSoundDataInterlaced, (nbOfChannels, nbrOfSamplesByChannel), 'F' )

            # compute RMS, handle auto detection and calibration
            if( self.isCalibrating or self.isAutoDetectionEnabled or self.isRecording):

                # compute the rms level on the front mic
                rmsMicFront = self.calcRMSLevel(self.convertStr2SignedInt(aSoundData[0]))

                if (rmsMicFront >= self.autoDetectionThreshold):
                    # save the timestamp when we last had an RMS > threshold
                    self.lastTimeRMSPeak = timestamp

                    # start recording if we are not doing so already
                    if (self.isAutoDetectionEnabled and not self.isRecording and not self.isCalibrating):
                        self.startRecording()

                # perform calibration
                if( self.isCalibrating):

                    if(self.startCalibrationTimestamp <= 0):
                        # we are starting to calibrate, save the timestamp
                        # to track how long we have been doing this
                        self.startCalibrationTimestamp = timestamp

                    elif(timestamp - self.startCalibrationTimestamp >= CALIBRATION_DURATION):
                        # time's up, we're done!
                        self.stopCalibration()

                    # sum up the rms values of the frames and
                    # keep track of how many frames we summed up
                    # to calculate the mean afterwards
                    self.rmsSum += rmsMicFront
                    self.framesCount = self.framesCount + 1


                if(PRINT_RMS):
                    # for debug purposes
                    # also use it to find a good threshold value for auto detection
                    print 'Mic RMS: ' + str(rmsMicFront)

            if( False ):
                # compute average (debug helper, disabled)
                aAvgValue = np.mean( aSoundData, axis = 1 )
                print( "avg: %s" % aAvgValue )
            if( False ):
                # compute fft (debug helper, disabled)
                nBlockSize = nbrOfSamplesByChannel
                signal = aSoundData[0] * np.hanning( nBlockSize )
                aFft = ( np.fft.rfft(signal) / nBlockSize )
                print aFft
            if( False ):
                # compute peak (debug helper, disabled)
                aPeakValue = np.max( aSoundData )
                if( aPeakValue > 16000 ):
                    print( "Peak: %s" % aPeakValue )

            if(not self.isCalibrating):
                if(self.isRecording):
                    # write to buffer
                    self.buffer.append(aSoundData)

                    if (self.startRecordingTimestamp <= 0):
                        # initialize the timestamp when we start recording
                        self.startRecordingTimestamp = timestamp
                    elif ((timestamp - self.startRecordingTimestamp) > self.recordingDuration):
                        # check how long we have been recording
                        print('stop after max recording duration')
                        self.stopRecordingAndRecognize()

                    # stop recording after the idle time (having recorded for at least the hold time)
                    # lastTimeRMSPeak is 0 if no peak occurred
                    if (timestamp - self.lastTimeRMSPeak >= self.idleReleaseTime) and (
                            timestamp - self.startRecordingTimestamp >= self.holdTime):
                        print ('stopping after idle/hold time')
                        self.stopRecordingAndRecognize()
                else:
                    # constantly record into the prebuffer for the lookahead
                    self.preBuffer.append(aSoundData)
                    self.preBufferLength += len(aSoundData[0])

                    # remove the first (oldest) item if the buffer gets bigger than required
                    # this removes one block of samples, as we store a list of lists...
                    overshoot = (self.preBufferLength - self.lookaheadBufferSize)

                    if((overshoot > 0) and (len(self.preBuffer) > 0)):
                        self.preBufferLength -= len(self.preBuffer.pop(0)[0])

        except:
            # print the stack trace - the audio callback thread
            # otherwise just fails silently
            traceback.print_exc()

    # processRemote - end

    def calcRMSLevel(self, data):
        rms = (sqrt(mean(square(data))))
        # TODO: maybe a log scale would be better for the threshold?
        #rms = 20 * np.log10(np.sqrt(np.sum(np.power(data, 2) / len(data))))
        return rms

    def version( self ):
        return "1.1"


    # use this method to manually start recording (works with auto detection enabled or disabled)
    # the recording will stop after the signal is below the threshold for IDLE_RELEASE_TIME seconds,
    # but will record for at least HOLD_TIME seconds
    def startRecording(self):
        if(self.isRecording):
            print("INF: SpeechRecognitionModule.startRecording: already recording")
            return

        print("INF: Starting to record audio")

        # start recording, beginning with the lookahead prebuffer
        # so the start of the utterance is not cut off
        self.startRecordingTimestamp = 0
        self.lastTimeRMSPeak = 0
        self.buffer = self.preBuffer

        #self.preBuffer = []

        self.isRecording = True

        return

    def stopRecordingAndRecognize(self):
        if(self.isRecording == False):
            print("INF: SpeechRecognitionModule.stopRecordingAndRecognize: not recording")
            return

        print("INF: stopping recording and recognizing")

        # TODO: choose which mic channel to use
        # can we use the sound direction module for this?

        # buffer is a list of nparrays that we now concat into one array,
        # then we slice out the first mic channel
        samples = np.concatenate(self.buffer, axis=1)[0]

        # initialize the prebuffer with the last samples to fix cut-off words
        # loop through the buffer and count samples until the prebuffer is full
        # TODO: performance issues!
        if (PREBUFFER_WHEN_STOP):
            sampleCounter = 0
            itemCounter = 0

            for i in reversed(self.preBuffer):
                sampleCounter += len(i[0])

                if(sampleCounter > self.lookaheadBufferSize):
                    break

                itemCounter += 1

            start = len(self.buffer) - itemCounter
            self.preBuffer = self.buffer[start:]
        else:
            # don't copy to the prebuffer
            self.preBuffer = []

        # start a new worker thread to do the http call and some processing
        # copy the samples to be thread safe!
        # TODO: make a job queue so we don't start a new thread for each recognition
        threading.Thread(target=self.recognize, args=(samples.copy(), )).start()

        # reset flag
        self.isRecording = False

        return

    def calibrate(self):
        self.isCalibrating = True
        self.framesCount = 0
        self.rmsSum = 0
        self.startCalibrationTimestamp = 0

        print("INF: starting calibration")

        if(self.isStarted == False):
            self.start()

        return

    def stopCalibration(self):
        if(self.isCalibrating == False):
            print("INF: SpeechRecognitionModule.stopCalibration: not calibrating")
            return

        self.isCalibrating = False

        # calculate the average rms over self.framesCount frames and apply
        # the safety factor to get the new auto detection threshold
        self.autoDetectionThreshold = CALIBRATION_THRESHOLD_FACTOR * (self.rmsSum / self.framesCount)
        print 'calibration done, RMS threshold is: ' + str(self.autoDetectionThreshold)
        return

    def enableAutoDetection(self):
        self.isAutoDetectionEnabled = True
        print("INF: autoDetection enabled")
        return

    def disableAutoDetection(self):
        self.isAutoDetectionEnabled = False
        return

    def setLanguage(self, language = DEFAULT_LANGUAGE):
        self.language = language
        return

    # used for RMS calculation
    def convertStr2SignedInt(self, data):
        """
        This function takes a buffer of 16 bit little endian sound
        samples as input and returns a vector containing the sound
        sample values converted to the range -1 to 1.
        """

        # from the naoqi sample, but rewritten to use numpy methods instead of for loops

        lsb = data[0::2]
        msb = data[1::2]

        # don't remove the .0, otherwise overflow!
        rms_data = np.add(lsb, np.multiply(msb, 256.0))

        # gives an array that contains -65536 at every position where rms_data is >= 32768
        sign_correction = np.select([rms_data>=32768], [-65536])

        # add the two to get the correct signed values
        rms_data = np.add(rms_data, sign_correction)
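
        # Worked example for illustration: the little endian sample 0x1234
        # arrives as lsb=0x34 (52), msb=0x12 (18), and 52 + 18*256 = 4660 =
        # 0x1234. A negative sample 0xFFFE arrives as lsb=254, msb=255:
        # 254 + 255*256 = 65534, and the sign correction yields
        # 65534 - 65536 = -2.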

        # normalize the values to -1.0 ... +1.0
        rms_data = np.divide(rms_data, 32768.0)

        return rms_data

    def recognize(self, data):
        # print 'sending %d bytes' % len(data)

        if (WRITE_WAV_FILE):
            # write the recording to a file for debugging
            filename = "out" + str(self.fileCounter)
            self.fileCounter += 1
            outfile = open(filename + ".raw", "wb")
            data.tofile(outfile)
            outfile.close()
            rawToWav(filename)

        buffer = np.getbuffer(data)

        r = Recognizer()
        try:
            result = r.recognize_google(audio_data=buffer, samplerate=SAMPLE_RATE, language=self.language)
            self.memory.raiseEvent("SpeechRecognition", result)
            print 'RESULT: ' + result
        except UnknownValueError:
            print 'ERR: Recognition error'
        except RequestError, e:
            print 'ERR: ' + str(e)
        except socket.timeout:
            print 'ERR: Socket timeout'
        except:
            print 'ERR: Unknown, probably timeout ' + str(sys.exc_info()[0])

    def setAutoDetectionThreshold(self, threshold):
        self.autoDetectionThreshold = threshold

    def setIdleReleaseTime(self, releaseTime):
        self.idleReleaseTime = releaseTime

    def setHoldTime(self, holdTime):
        self.holdTime = holdTime

    def setMaxRecordingDuration(self, duration):
        self.recordingDuration = duration

    def setLookaheadDuration(self, duration):
        self.lookaheadBufferSize = duration * SAMPLE_RATE
        self.preBuffer = []
        self.preBufferLength = 0

# SpeechRecognition - end


def getAudioDuration(audio_data):
    # duration in seconds of a mono sample buffer recorded at SAMPLE_RATE
    return len(audio_data) / float(SAMPLE_RATE)

def main():
    """ Main entry point

    """
    parser = OptionParser()
    parser.add_option("--pip",
                      help="Parent broker IP. The IP address of your robot",
                      dest="pip")
    parser.add_option("--pport",
                      help="Parent broker port. The port NAOqi is listening to",
                      dest="pport",
                      type="int")
    parser.set_defaults(
        pip=NAO_IP,
        pport=NAO_PORT)

    (opts, args_) = parser.parse_args()
    pip = opts.pip
    pport = opts.pport

    # We need this broker to be able to construct
    # NAOqi modules and subscribe to other modules
    # The broker must stay alive until the program exits
    myBroker = naoqi.ALBroker("myBroker",
                              "0.0.0.0",  # listen to anyone
                              0,          # find a free port and use it
                              pip,        # parent broker IP
                              pport)      # parent broker port

    try:
        p = ALProxy("SpeechRecognition")
        p.exit()  # kill previous instance, useful for developing ;)
    except:
        pass

    # Reinstantiate module

    # Warning: SpeechRecognition must be a global variable
    # The name given to the constructor must be the name of the
    # variable
    global SpeechRecognition
    SpeechRecognition = SpeechRecognitionModule("SpeechRecognition", pip)

    # uncomment for debug purposes
    # usually a subscribing client will call start() from ALProxy
    #SpeechRecognition.start()
    #SpeechRecognition.startRecording()
    #SpeechRecognition.calibrate()
    #SpeechRecognition.enableAutoDetection()
    #SpeechRecognition.startRecording()

    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print
        print "Interrupted by user, shutting down"
        myBroker.shutdown()
        sys.exit(0)



if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------