cmake_minimum_required(VERSION 2.8.3)
project(voice_ros)

## Find catkin macros and libraries.
## Only run-time message packages are needed: this package declares no
## custom .msg files of its own.
find_package(catkin REQUIRED COMPONENTS
  rospy
  std_msgs
  move_base_msgs
  actionlib
  geometry_msgs
)

## NOTE(review): the original file called add_message_files() with an empty
## FILES list and then generate_messages(), even though the package ships no
## .msg files. catkin rejects an empty add_message_files() call at configure
## time, so the message-generation block (and the message_generation
## component) has been removed.

###################################
## catkin specific configuration ##
###################################
catkin_package(
#  INCLUDE_DIRS include
#  CATKIN_DEPENDS rospy std_msgs
#  DEPENDS system_lib
)

###########
## Build ##
###########

## Specify additional locations of header files.
include_directories(
  ${catkin_INCLUDE_DIRS}
)

## Mark executable scripts (Python etc.) for installation.
install(PROGRAMS
  scripts/mic_client.py
  scripts/nav_v1.py
  DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
)

## Mark launch and config directories for installation.
foreach(dir launch config)
  install(DIRECTORY ${dir}/
    DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}/${dir})
endforeach(dir)
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Voix 3 | This package uses the Google Text-To-Speech API to allow for seamless human robot interaction by sending navigation commands issued by the user to robot. Main operating system used was ROS while Turtlebot and Gazebo were used for simulation. Actual experiments were conducted using RosAria and the Pioneer3AT robot. 4 | 5 | # Academic Citations 6 | Please cite this IEEE publication when using this package for any academic projects: [M. Elmzaghi, M. Fahad and Y. Guo, "Implementing Robust Voice-Control for Human Robot Interaction for Autonomous Robot Guides," 2019 IEEE MIT Undergraduate Research Technology Conference (URTC), Cambridge, MA, USA, 2019, pp. 1-6, doi: 10.1109/URTC49097.2019.9660499.](https://ieeexplore.ieee.org/document/9660499) 7 | 8 | # Why use Google API? 9 | The goal is to create natural human robot interaction. Current voice processing software requires users to design speech recognition algorithms, collect datasets, as well as understand Natural Language Processing methods. The accuracy of these methods still does not exceed that of Google Cloud API given its massive datasets from the various voice services as well as the use of deep learning neural networks. By incorporating an accurate and robust voice processing software easily integrated with popular robotics software, users can include robust voice control into their robotics projects. 
10 | # Installation 11 | 12 | To use this software package, ensure you are running ROS for your respective system as well as have installed Google Cloud SDK (which is detailed below.) 13 | However, we need to install PortAudio so we can use PyAudio to get mic data. 14 | ```bash 15 | sudo apt-get install portaudio19-dev 16 | ``` 17 | Install all the requirements using pip by cloning the Github repo and installing all the packages in requirements.txt. 18 | 19 | ```bash 20 | cd ~/catkin_ws/src 21 | git clone https://github.com/moeelm/voice_ros.git 22 | cd voice_ros 23 | pip install -r requirements.txt 24 | ``` 25 | 26 | ## Google Cloud Setup 27 | Follow the instructions [here](https://cloud.google.com/speech/docs/quickstart) for configuring your Google Cloud project and installing the SDK for authentication. You will need a google/gmail account. 28 | 29 | Usage of the Google Cloud SDK requires authentication. This means you require an API key and an activated service account to utilize the APIs. 30 | 1. Setup a [service account](https://cloud.google.com/docs/authentication/getting-started) 31 | 2. Download the service account key as a JSON. 32 | 3. Check you have GOOGLE_APPLICATION_CREDENTIALS in your environment. This should be the path to the keys. 33 | ```bash 34 | export GOOGLE_APPLICATION_CREDENTIALS='/path/to/key' 35 | ``` 36 | 4. Run the authentication command: 37 | ```bash 38 | gcloud auth activate-service-account --key-file GOOGLE_APPLICATION_CREDENTIALS 39 | ``` 40 | 41 | 42 | # Usage 43 | Follow the steps below to setup the package properly. 44 | 45 | ## Configure topics 46 | Go into the config directory and change the following parameters in the `params.yaml` file: 47 | 48 | * `results_topic`: (Optional) The topic where your results will be published. 49 | * `project_id`: The name of your project for the Google Speech node. This is the name of your Google Cloud project when going through the Google Cloud setup. 
50 | 51 | ## Launching nodes 52 | To start the voice processing node, run the following command: 53 | ```bash 54 | roslaunch voice_ros voice.launch 55 | ``` 56 | ## Autonomous Navigation Using Turtlebot 57 | 58 | 59 | To run autonomous navigation using the Turtlebot robot as well as RVIZ simulator, 60 | 1. Launch Turtlebot as well as Gazebo environment 61 | ```bash 62 | roslaunch turtlebot_gazebo turtlebot_world.launch 63 | ``` 64 | 2. Run the map 65 | ```bash 66 | rosrun map_server map_server /PATH/TO/myMap.yaml 67 | ``` 68 | 3. Run the navigation demo. AMCL (Adaptive Monte Carlo Localization) is used for localizing the robot 69 | ```bash 70 | roslaunch turtlebot_gazebo amcl_demo.launch 71 | ``` 72 | 4. Launch ROS Vizualization (RVIZ) 73 | ```bash 74 | roslaunch turtlebot_rviz_launchers view_navigation.launch 75 | ``` 76 | The `2D Nav Goal` sends a navigation goal, `2D Pose Estimate` sets the pose of the robot. 77 | 78 | # ROS Nodes 79 | 80 | ## mic_client 81 | ROS node receives text from the Google Cloud Speech API and publishes it onto `text_topic` (see config/params.yaml). This is parsed and sent to the navigation node, `nav_v1.py`. `single_utterance` is set to true to allow for single user input. This prevents the voice processing from being continous, allowing the intent to be parsed and detected. 82 | 83 | ### Published Topics 84 | `text_topic` ([std_msgs/String](http://docs.ros.org/api/std_msgs/html/msg/String.html)) 85 | Acquired text from the Google Cloud Speech API. 86 | `text2_topic`([std_msgs/String](http://docs.ros.org/api/std_msgs/html/msg/String.html)) 87 | Intent detected in voice command. 88 | 89 | ## Navigation 90 | ROS node that takes text from the _mic\_client_ node, publishes the parsed intent and sends the spatial location associated with the intent to the `\move_base_simple/goal` topic. For example, `'microwave'`has the position coordinates `(2.00, -5.00, 0.000)` mapped to it. 
#!/usr/bin/env bash
# Install the Google Cloud SDK and the Python dependencies for voice_ros.
# Prompts for a service-account key path when GOOGLE_APPLICATION_CREDENTIALS
# is not already set, then runs the installation steps unconditionally.
#
# Fixed: the original used `[[-v ...]] then` (missing spaces around the
# test and the `; then`), never closed the `if` with `fi`, and placed every
# installation step inside the `else` branch so nothing was installed when
# credentials were already configured.
if [[ -v GOOGLE_APPLICATION_CREDENTIALS ]]; then
    echo "Found credentials in: $GOOGLE_APPLICATION_CREDENTIALS"
else
    read -p "No credentials path found, please enter the path for your Google service account credentials: " gpath
    export GOOGLE_APPLICATION_CREDENTIALS=$gpath
fi

# Start by installing Google Cloud dependencies
export CLOUD_SDK_REPO="cloud-sdk-$(lsb_release -c -s)"
echo "deb http://packages.cloud.google.com/apt $CLOUD_SDK_REPO main" | sudo tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
sudo apt-get update && sudo apt-get install google-cloud-sdk

# Now install python dependencies
sudo apt-get install python-pip portaudio19-dev
pip install --user -r requirements.txt
echo "Remember to run 'gcloud init' to configure Google Cloud"
#!/usr/bin/env python
"""ROS node that streams microphone audio to the Google Cloud Speech API
and publishes recognized text plus any detected navigation intent keyword.
"""

from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
import pyaudio
import Queue
import rospy
from std_msgs.msg import String


class GspeechClient(object):
    """Streams mic audio to Google Cloud Speech and publishes transcripts.

    Publishes the full final transcript on the topic named by the
    `/text_topic` param and each detected command keyword on the topic
    named by `/text2_topic` (consumed by nav_v1.py).
    """

    # Command keywords understood by the navigation node (see nav_v1.py).
    COMMANDS = ('microwave', 'bathroom')

    def __init__(self):
        # Release audio resources cleanly on node shutdown.
        rospy.on_shutdown(self.shutdown)

        # Audio stream input setup: 16 kHz mono 16-bit PCM, matching the
        # RecognitionConfig in gspeech_client().
        audio_format = pyaudio.paInt16
        channels = 1
        sample_rate = 16000
        self.CHUNK = 4096
        self.audio = pyaudio.PyAudio()
        self.stream = self.audio.open(format=audio_format, channels=channels,
                                      rate=sample_rate, input=True,
                                      frames_per_buffer=self.CHUNK,
                                      stream_callback=self._get_data)
        self._buff = Queue.Queue()  # Buffer to hold raw audio chunks
        self.closed = False

        # ROS text publishers.
        text_topic = rospy.get_param('/text_topic', '/text')
        text2_topic = rospy.get_param('/text2_topic', '/intent')
        self.text_pub = rospy.Publisher(text_topic, String, queue_size=10)
        self.text2_pub = rospy.Publisher(text2_topic, String, queue_size=10)

    def _get_data(self, in_data, frame_count, time_info, status):
        """PyAudio stream callback: push each raw audio chunk into the buffer."""
        self._buff.put(in_data)
        return None, pyaudio.paContinue

    def _generator(self):
        """Yield concatenated audio chunks until the stream is closed.

        Blocks for the first chunk, then drains the queue with non-blocking
        gets so each payload streamed to the Speech API is as large as
        possible. A `None` sentinel (pushed by shutdown()) ends iteration.
        """
        while not self.closed:
            chunk = self._buff.get()
            if chunk is None:
                return
            data = [chunk]

            while True:
                try:
                    chunk = self._buff.get(block=False)
                    if chunk is None:
                        return
                    data.append(chunk)
                except Queue.Empty:
                    break

            yield b''.join(data)

    def _handle_intent(self, transcript):
        """Publish every known command keyword found in `transcript`.

        Fixed: the original version placed the `break` and the failure
        message inside the outer keyword loop, so it gave up after
        comparing only the FIRST keyword — a sentence containing
        'bathroom' could print "Sorry" before that word was ever checked.
        """
        words = transcript.split(' ')
        found = False
        for command in self.COMMANDS:
            for word in words:
                if command == word:
                    print('Sure, going to %s' % (word))
                    found = True
                    rospy.loginfo(word)
                    self.text2_pub.publish(word)
        if not found:
            print("Sorry, cant do that")

    def _listen_print_loop(self, responses):
        """Consume streaming responses; publish and parse final transcripts.

        `responses` is a blocking generator. Only the first result of each
        response matters for streaming (once it is `is_final`, the API moves
        on to the next utterance), and only its top alternative is used.
        """
        for response in responses:
            # Skip responses that carry no results.
            if not response.results:
                continue

            result = response.results[0]
            if not result.alternatives:
                continue

            transcript = result.alternatives[0].transcript

            # Only act on the final (non-interim) transcription.
            if result.is_final:
                rospy.loginfo("Google Speech result: {}".format(result))
                # Received data is Unicode, convert it to a byte string.
                transcript = transcript.encode('utf-8')
                # Strip the initial space, if any.
                if transcript.startswith(' '):
                    transcript = transcript[1:]
                # Allow the user to stop the node by saying "exit".
                if transcript.lower() == 'exit':
                    self.shutdown()
                # Publish the full sentence, then look for command keywords.
                self.text_pub.publish(transcript)
                self._handle_intent(transcript)

    def gspeech_client(self):
        """Create and configure the streaming Speech client, then listen."""
        language_code = 'en-US'
        client = speech.SpeechClient()
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code=language_code)
        # single_utterance stops recognition after one utterance so a whole
        # command can be parsed before listening resumes.
        streaming_config = types.StreamingRecognitionConfig(
            config=config,
            interim_results=True,
            single_utterance=True)
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in self._generator())
        responses = client.streaming_recognize(streaming_config, requests)
        self._listen_print_loop(responses)

    def shutdown(self):
        """Shut down as cleanly as possible."""
        rospy.loginfo("Shutting down")
        self.closed = True
        self._buff.put(None)  # Sentinel: unblocks _generator so it can exit.
        self.stream.close()
        self.audio.terminate()
        exit()

    def start_client(self):
        """Entry function to start the client."""
        try:
            rospy.loginfo("Starting Google speech mic client")
            self.gspeech_client()
        except KeyboardInterrupt:
            self.shutdown()


if __name__ == '__main__':
    rospy.init_node('mic_client')
    g = GspeechClient()
    g.start_client()
#!/usr/bin/env python
"""ROS node that maps a voiced intent keyword (from mic_client) to a map
pose and publishes it as a goal on move_base_simple/goal."""
import rospy
from std_msgs.msg import String
from geometry_msgs.msg import PoseStamped
from actionlib_msgs.msg import GoalStatus
from move_base_msgs.msg import MoveBaseAction


class Motion(object):
    """Subscribes to the intent topic and forwards the matching goal pose."""

    # Spatial locations keyed by intent keyword; change according to your map.
    MY_INTENT = {
        'microwave': {'position': {'x': 2.00, 'y': -5.000, 'z': 0.000}},
        'bathroom': {'position': {'x': 0.49, 'y': -7.15, 'z': -0.002}},
    }

    def __init__(self):
        rospy.on_shutdown(self.shutdown)

        # Fixed: myGoal was previously only created inside callback(), so
        # go_to() raised AttributeError whenever no intent had arrived yet.
        self.myGoal = None

        self.pub_pos = rospy.Publisher('move_base_simple/goal', PoseStamped,
                                       queue_size=10)

        rospy.sleep(3)
        rospy.loginfo("Wait for the action server to come up")
        text2_topic = rospy.get_param('/text2_topic', '/intent')
        rospy.Subscriber(text2_topic, String, self.callback, queue_size=10)

    def callback(self, ros_data):
        """Store the goal position associated with the received intent word."""
        self.word_received = ros_data.data
        intent = self.MY_INTENT.get(self.word_received)
        if intent is not None:
            self.myGoal = intent['position']

    def start(self):
        rospy.loginfo("Starting Navigation")

    def go_to(self):
        """Publish the stored goal pose; return True if a goal was sent.

        Fixed: the original returned None unconditionally, so the caller's
        `if success:` check always reported failure; it also dereferenced
        self.myGoal even when no intent had been received, and slept after
        rospy.signal_shutdown() (dead code).
        """
        rospy.sleep(5)  # Give the subscriber a chance to receive an intent.
        if self.myGoal is None:
            rospy.loginfo("No navigation intent received")
            return False
        goal = PoseStamped()
        goal.header.frame_id = "odom"
        goal.header.stamp = rospy.Time.now()
        goal.pose.position.x = self.myGoal['x']
        goal.pose.position.y = self.myGoal['y']
        goal.pose.position.z = self.myGoal['z']
        goal.pose.orientation.w = 1.0
        self.pub_pos.publish(goal)
        rospy.loginfo("Sending goal")
        rospy.signal_shutdown("Restarting nav node")
        return True

    def shutdown(self):
        rospy.loginfo("Stop")
        self.closed = True
        exit()


if __name__ == '__main__':
    try:
        rospy.init_node("nav_v1")
        navigator = Motion()
        navigator.start()
        success = navigator.go_to()
        if success:
            rospy.loginfo("Reached desired destination")
        else:
            rospy.loginfo("Sorry, I was not able to reach the destination. Try again")
    except KeyboardInterrupt:
        navigator.shutdown()