├── .gitignore ├── APF.py ├── LICENSE ├── README.md ├── config ├── config_DQN.json └── config_IQN.json ├── demonstration.png ├── env_visualizer.py ├── marinenav_env ├── __init__.py ├── envs │ ├── __init__.py │ ├── marinenav_env.py │ └── utils │ │ └── robot.py └── setup.py ├── policy ├── DQN_model.py ├── IQN_model.py ├── agent.py ├── replay_buffer.py └── trainer.py ├── pretrained_models ├── DQN │ └── seed_9 │ │ ├── dqn_constructor_params.json │ │ └── dqn_network_params.pth └── IQN │ └── seed_9 │ ├── constructor_params.json │ └── network_params.pth ├── run_experiments.py ├── scripts ├── plot_eval_returns.py ├── visualize_eval_episode.py └── visualize_exp_episode.py ├── system_requirements ├── thirdparty └── RVO.py └── train_rl_agents.py /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode* 2 | training* 3 | test* 4 | experiment* 5 | __pycache__ 6 | marine_env/marine_env.egg-info/ 7 | *.png 8 | *.mp4 -------------------------------------------------------------------------------- /APF.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | class APF_agent: 5 | 6 | def __init__(self, a, w): 7 | self.k_att = 50.0 # attractive force constant 8 | self.k_rep = 500.0 # repulsive force constant 9 | self.k_v = 1.0 # velocity force constant 10 | self.m = 500 # robot weight (kg) 11 | self.d0 = 15.0 # obstacle distance threshold (m) 12 | self.n = 2 # power constant of repulsive force 13 | self.min_vel = 1.0 # if velocity is lower than the threshold, mandate acceleration 14 | 15 | self.a = a # available linear acceleration (action 1) 16 | self.w = w # available angular velocity (action 2) 17 | 18 | self.GAMMA = 0.99 19 | 20 | def position_force(self,position,radius,goal): 21 | d_obs = np.linalg.norm(position) - radius - 0.8 22 | d_goal = np.linalg.norm(goal) 23 | 24 | # repulsive force component to move away from the obstacle 25 | mag_1 = self.k_rep * ((1/d_obs)-(1/self.d0)) * (d_goal ** self.n)/(d_obs ** 2) 26 | dir_1 = -1.0 * position / np.linalg.norm(position) 27 | F_rep_1 = mag_1 * dir_1 28 | 29 | # repulsive force component to move towards the goal 30 | mag_2 = (self.n / 2) * self.k_rep * (((1/d_obs)-(1/self.d0))**2) * (d_goal ** (self.n-1)) 31 | dir_2 = -1.0 * goal / d_goal 32 | F_rep_2 = mag_2 * dir_2 33 | 34 | return F_rep_1+F_rep_2 35 | 36 | def velocity_force(self,v_ao,position,radius): 37 | d_obs = np.linalg.norm(position) - radius - 0.8 38 | 39 | mag = -self.k_v * v_ao / d_obs 40 | dir = position / np.linalg.norm(position) 41 | 42 | F_rep = mag * dir 43 | 44 | return F_rep 45 | 46 | def act(self, observation): 47 | assert len(observation) == 39, "The state size does not equal 39" 48 | 49 | obs_array = np.array(observation) 50 | 51 | ego = obs_array[:4] 52 | static = obs_array[4:19] 53 | dynamic = obs_array[19:] 54 | 55 | # compute attractive force 56 | F_att = self.k_att * ego[:2] 57 | 58 | # compute total repulsive force from sonar reflections 59 | F_rep = np.zeros(2) 60 | goal = ego[:2] 61 | velocity = ego[2:] 62 | 63 | # force from static obstacles 64 | for i in range(0,len(static),3): 65 | if np.abs(static[i]) < 1e-3 and np.abs(static[i+1]) < 1e-3: 66 | # padding 67 | continue 68 | 69 | F_rep += self.position_force(static[i:i+2],static[i+2],goal) 70 | 71 | # force from dynamic obstacles 72 | for i in range(0,len(dynamic),4): 73 | if np.abs(dynamic[i]) < 1e-3 and np.abs(dynamic[i+1]) < 1e-3: 74 | # padding 75 | continue 76 | 77 | e_ao = dynamic[i:i+2]/np.linalg.norm(dynamic[i:i+2]) 78 | v_ao = np.dot(velocity-dynamic[i+2:i+4],e_ao) 79 | if v_ao >= 0.0: 80 | # agent is moving towards dynamic obstacles 81 | F_rep += self.position_force(dynamic[i:i+2],0.8,goal) 82 | F_rep += self.velocity_force(v_ao,dynamic[i:i+2],0.8) 83 | 84 | # select angular velocity action 85 | F_total = F_att + F_rep 86 | V_angle = 0.0 87 | if np.linalg.norm(velocity) > 1e-03: 88 | V_angle = np.arctan2(velocity[1],velocity[0]) 89 | F_angle = np.arctan2(F_total[1],F_total[0]) 90 | 91 | diff_angle = F_angle - V_angle 92 | while diff_angle < -np.pi: 93 | diff_angle += 2 * np.pi 94 | while diff_angle >= np.pi: 95 | diff_angle -= 2 * np.pi 96 | 97 | w_idx = np.argmin(np.abs(self.w-diff_angle)) 98 | 99 | # select linear acceleration action 100 | a_total = F_total / self.m 101 | V_dir = np.array([1.0,0.0]) 102 | if np.linalg.norm(velocity) > 1e-03: 103 | V_dir = velocity / np.linalg.norm(velocity) 104 | a_proj = np.dot(a_total,V_dir) 105 | 106 | a = copy.deepcopy(self.a) 107 | if np.linalg.norm(velocity) < self.min_vel: 108 | # if the velocity is small, mandate acceleration 109 | a[a<=0.0] = -np.inf 110 | a_diff = a-a_proj 111 | a_idx = np.argmin(np.abs(a_diff)) 112 | 113 | return a_idx * len(self.w) + w_idx 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Robust Field Autonomy Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multi Robot Distributional RL Navigation 2 | 3 | This repository contains the code implementation of our ICRA 2024 paper [here](https://arxiv.org/abs/2402.11799). We proposed a Distributional Reinforcement Learning (Distributional RL) based decentralized multi-robot collision avoidance method for Autonomous Surface Vehicles, which can work safely in congested waters with unknown current disturbances. Each vehicle equipped with the trained agent makes independent action decisions based on (1) ego observation of surrounding objects including static obstacles and other vehicles, and (2) level of risk sensitivity. The performance of our approach is shown in the video [here](https://robustfieldautonomylab.github.io/Lin_ICRA24_Video.mp4). 4 | 5 |

6 | 7 |

8 | 9 | If you find this repository useful, please cite our paper 10 | ``` 11 | @INPROCEEDINGS{10611668, 12 | author={Lin, Xi and Huang, Yewei and Chen, Fanfei and Englot, Brendan}, 13 | booktitle={2024 IEEE International Conference on Robotics and Automation (ICRA)}, 14 | title={Decentralized Multi-Robot Navigation for Autonomous Surface Vehicles with Distributional Reinforcement Learning}, 15 | year={2024}, 16 | volume={}, 17 | number={}, 18 | pages={8327-8333}, 19 | doi={10.1109/ICRA57147.2024.10611668}} 20 | ``` 21 | 22 | ## Train RL Agents 23 | 24 | Our proposed Distributional RL based policy model (IQN), and a traditional RL based model (DQN) used for comparison, can be trained by running the following command. You can also skip this training section and run experiments with the provided pretrained models. 25 | 26 | ``` 27 | python train_rl_agents.py -C CONFIG_FILE [-P NUM_PROCS] [-D DEVICE] 28 | ``` 29 | 30 | Replace CONFIG_FILE with the path to the training config file. Example training config files are provided in config directory. 31 | 32 | -P and -D flags are optional. -P can be used to specify the number of processes to be run in parallel when training multiple models. -D can be used to specify the CPU or GPU intended to be used in training. 33 | 34 | ## Evaluate Learning Performance 35 | 36 | We provide scripts for plotting learning performance and visualizing evaluation episodes of RL agents trained by you. 37 | 38 | To plot learning curves, set data_dir, seeds, and eval_agents according to the corresponding trained RL models, and run the command 39 | 40 | ``` 41 | python plot_eval_returns.py 42 | ``` 43 | 44 | To visualize an evaluation episode, set eval_id and eval_episode according to the corresponding evaluation config file, and run the command 45 | 46 | ``` 47 | python visualize_eval_episode.py 48 | ``` 49 | 50 | ## Run Experiments 51 | 52 | Run the following command to perform evaluation experiments. If you want to evaluate RL agents trained on your own, set save_dir in run_experiments.py to the stored directories. 53 | 54 | ``` 55 | python run_experiments.py 56 | ``` 57 | 58 | To visualize an experiment episode, set schedule_id, agent_id, ep_id according to the experiment result file, and run the command 59 | 60 | ``` 61 | python visualize_exp_episode.py 62 | ``` 63 | -------------------------------------------------------------------------------- /config/config_DQN.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed":[9], 3 | "total_timesteps":6000000, 4 | "eval_freq":60000, 5 | "save_dir":"/home/rfal/Multi_Robot_Distributional_RL_Navigation/training_data", 6 | "training_schedule": 7 | { 8 | "timesteps":[0,1000000,2000000,3000000,4000000,5000000], 9 | "num_cooperative":[3,5,7,7,7,7], 10 | "num_non_cooperative":[0,0,0,0,0,0], 11 | "num_cores":[4,6,8,8,8,8], 12 | "num_obstacles":[0,0,0,6,8,10], 13 | "min_start_goal_dis":[30.0,35.0,40.0,40.0,40.0,40.0] 14 | }, 15 | "eval_schedule": 16 | { 17 | "num_episodes": [10,10,10,10,10,10], 18 | "num_cooperative": [3,5,7,3,5,7], 19 | "num_non_cooperative": [0,0,0,0,0,0], 20 | "num_cores": [4,6,8,4,6,8], 21 | "num_obstacles": [0,0,0,6,8,10], 22 | "min_start_goal_dis": [40.0,40.0,40.0,40.0,40.0,40.0] 23 | }, 24 | "use_iqn":false 25 | } -------------------------------------------------------------------------------- /config/config_IQN.json: -------------------------------------------------------------------------------- 1 | { 2 | "seed":[9], 3 | "total_timesteps":6000000, 4 | "eval_freq":60000, 5 | "save_dir":"/home/rfal/Multi_Robot_Distributional_RL_Navigation/training_data", 6 | "training_schedule": 7 | { 8 | "timesteps":[0,1000000,2000000,3000000,4000000,5000000], 9 | "num_cooperative":[3,5,7,7,7,7], 10 | "num_non_cooperative":[0,0,0,0,0,0], 11 | "num_cores":[4,6,8,8,8,8], 12 | "num_obstacles":[0,0,0,6,8,10], 13 | "min_start_goal_dis":[30.0,35.0,40.0,40.0,40.0,40.0] 14 | }, 15 | "eval_schedule": 16 | { 17 | "num_episodes": [10,10,10,10,10,10], 18 | "num_cooperative": [3,5,7,3,5,7], 19 | "num_non_cooperative": [0,0,0,0,0,0], 20 | "num_cores": [4,6,8,4,6,8], 21 | "num_obstacles": [0,0,0,6,8,10], 22 | "min_start_goal_dis": [40.0,40.0,40.0,40.0,40.0,40.0] 23 | }, 24 | "use_iqn":true 25 | } -------------------------------------------------------------------------------- /demonstration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobustFieldAutonomyLab/Multi_Robot_Distributional_RL_Navigation/889bfed3aded1fc2edc1967090517f22e14de7a0/demonstration.png -------------------------------------------------------------------------------- /env_visualizer.py: -------------------------------------------------------------------------------- 1 | import marinenav_env.envs.marinenav_env as marinenav_env 2 | import numpy as np 3 | import matplotlib as mpl 4 | import matplotlib.pyplot as plt 5 | import matplotlib.cm as cm 6 | import matplotlib.animation as animation 7 | import copy 8 | import scipy.spatial 9 | import gym 10 | import json 11 | import os 12 | 13 | class EnvVisualizer: 14 | 15 | def __init__(self, 16 | seed:int=0, 17 | draw_envs:bool=False, # Mode 2: plot the envrionment 18 | draw_traj:bool=False, # Mode 3: plot final trajectories given action sequences 19 | video_plots:bool=False, # Mode 4: Generate plots for a video 20 | plot_dist:bool=False, # If return distributions are needed (for IQN agent) in the video 21 | plot_qvalues:bool=False, # If Q values are needed in the video 22 | dpi:int=96, # Monitor DPI 23 | ): 24 | self.env = marinenav_env.MarineNavEnv2(seed) 25 | self.env.reset() 26 | self.fig = None # figure for visualization 27 | self.axis_graph = None # sub figure for the map 28 | self.robots_plot = [] 29 | self.robots_last_pos = [] 30 | self.robots_traj_plot = [] 31 | self.LiDAR_beams_plot = [] 32 | self.axis_title = None # sub figure for title 33 | self.axis_action = None # sub figure for action command and steer data 34 | self.axis_goal = None # sub figure for relative goal measurment 35 | self.axis_perception = None # sub figure for perception output 36 | self.axis_dvl = None # sub figure for DVL measurement 37 | self.axis_dist = [] # sub figure(s) for return distribution of actions 38 | self.axis_qvalues = None # subfigure for Q values of actions 39 | 40 | self.episode_actions = [] # action sequence load from episode data 41 | self.episode_actions_quantiles = None 42 | self.episode_actions_taus = None 43 | 44 | self.plot_dist = plot_dist # draw return distribution of actions 45 | self.plot_qvalues = plot_qvalues # draw Q values of actions 46 | self.draw_envs = draw_envs # draw only the envs 47 | self.draw_traj = draw_traj # draw only final trajectories 48 | self.video_plots = video_plots # draw video plots 49 | self.plots_save_dir = None # video plots save directory 50 | self.dpi = dpi # monitor DPI 51 | self.agent_name = None # agent name 52 | self.agent = None # agent (IQN or DQN for plot data) 53 | 54 | self.configs = None # evaluation configs 55 | self.episodes = None # evaluation episodes to visualize 56 | 57 | def init_visualize(self, 58 | env_configs=None # used in Mode 2 59 | ): 60 | 61 | # initialize subplot for the map, robot state and sensor measurments 62 | if self.draw_envs: 63 | # Mode 2: plot the envrionment 64 | if env_configs is None: 65 | self.fig, self.axis_graph = plt.subplots(1,1,figsize=(8,8)) 66 | else: 67 | num = len(env_configs) 68 | if num % 3 == 0: 69 | self.fig, self.axis_graphs = plt.subplots(int(num/3),3,figsize=(8*3,8*int(num/3))) 70 | else: 71 | self.fig, self.axis_graphs = plt.subplots(1,num,figsize=(8*num,8)) 72 | elif self.draw_traj: 73 | if self.plot_dist: 74 | self.fig = plt.figure(figsize=(24,16)) 75 | spec = self.fig.add_gridspec(5,6) 76 | 77 | self.axis_graph = self.fig.add_subplot(spec[:,:4]) 78 | self.axis_perception = self.fig.add_subplot(spec[:2,4:]) 79 | self.axis_dist.append(self.fig.add_subplot(spec[2:,4])) 80 | self.axis_dist.append(self.fig.add_subplot(spec[2:,5])) 81 | else: 82 | # Mode 3: plot final trajectories given action sequences 83 | self.fig, self.axis_graph = plt.subplots(figsize=(16,16)) 84 | elif self.video_plots: 85 | # Mode 4: Generate 1080p video plots 86 | w = 1920 87 | h = 1080 88 | self.fig = plt.figure(figsize=(w/self.dpi,h/self.dpi),dpi=self.dpi) 89 | if self.agent_name == "adaptive_IQN": 90 | spec = self.fig.add_gridspec(5,6) 91 | 92 | # self.axis_title = self.fig.add_subplot(spec[0,:]) 93 | # self.axis_title.text(-0.9,0.,"adaptive IQN performance",fontweight="bold",fontsize=45) 94 | 95 | self.axis_graph = self.fig.add_subplot(spec[:,:4]) 96 | self.axis_graph.set_title("adaptive IQN performance",fontweight="bold",fontsize=30) 97 | 98 | self.axis_perception = self.fig.add_subplot(spec[0:2,4:]) 99 | self.axis_dist.append(self.fig.add_subplot(spec[2:,4])) 100 | self.axis_dist.append(self.fig.add_subplot(spec[2:,5])) 101 | elif self.agent_name == "IQN": 102 | spec = self.fig.add_gridspec(5,12) 103 | 104 | # self.axis_title = self.fig.add_subplot(spec[0,:]) 105 | # self.axis_title.text(-0.9,0.,"IQN performance",fontweight="bold",fontsize=45) 106 | 107 | self.axis_graph = self.fig.add_subplot(spec[:,:8]) 108 | self.axis_graph.set_title("IQN performance",fontweight="bold",fontsize=30) 109 | 110 | self.axis_perception = self.fig.add_subplot(spec[0:2,8:]) 111 | self.axis_dist.append(self.fig.add_subplot(spec[2:,9:11])) 112 | elif self.agent_name == "DQN": 113 | spec = self.fig.add_gridspec(5,12) 114 | 115 | self.axis_graph = self.fig.add_subplot(spec[:,:8]) 116 | self.axis_graph.set_title("DQN performance",fontweight="bold",fontsize=30) 117 | 118 | self.axis_perception = self.fig.add_subplot(spec[0:2,8:]) 119 | self.axis_qvalues = self.fig.add_subplot(spec[2:,9:11]) 120 | else: 121 | name = "" 122 | if self.agent_name == "APF": 123 | name = "Artificial Potential Field" 124 | elif self.agent_name == "RVO": 125 | name = "Reciprocal Velocity Obstacle" 126 | spec = self.fig.add_gridspec(5,3) 127 | 128 | self.axis_graph = self.fig.add_subplot(spec[:,:2]) 129 | self.axis_graph.set_title(f"{name} performance",fontweight="bold",fontsize=30) 130 | 131 | self.axis_perception = self.fig.add_subplot(spec[:2,2]) 132 | self.axis_action = self.fig.add_subplot(spec[2:,2]) 133 | 134 | # self.axis_title.set_xlim([-1.0,1.0]) 135 | # self.axis_title.set_ylim([-1.0,1.0]) 136 | # self.axis_title.set_xticks([]) 137 | # self.axis_title.set_yticks([]) 138 | # self.axis_title.spines["left"].set_visible(False) 139 | # self.axis_title.spines["top"].set_visible(False) 140 | # self.axis_title.spines["right"].set_visible(False) 141 | # self.axis_title.spines["bottom"].set_visible(False) 142 | else: 143 | # Mode 1 (default): Display an episode 144 | self.fig = plt.figure(figsize=(32,16)) 145 | spec = self.fig.add_gridspec(5,4) 146 | self.axis_graph = self.fig.add_subplot(spec[:,:2]) 147 | # self.axis_goal = self.fig.add_subplot(spec[0,2]) 148 | self.axis_perception = self.fig.add_subplot(spec[1:3,2]) 149 | # self.axis_dvl = self.fig.add_subplot(spec[3:,2]) 150 | # self.axis_observation = self.fig.add_subplot(spec[:,3]) 151 | 152 | ### temp for ploting head figure ### 153 | # self.fig, self.axis_graph = plt.subplots(1,1,figsize=(16,16)) 154 | # # self.fig, self.axis_perception = plt.subplots(1,1,figsize=(8,8)) 155 | 156 | if self.draw_envs and env_configs is not None: 157 | for i,env_config in enumerate(env_configs): 158 | self.load_env_config(env_config) 159 | if len(env_configs) % 3 == 0: 160 | self.plot_graph(self.axis_graphs[int(i/3),i%3]) 161 | else: 162 | self.plot_graph(self.axis_graphs[i]) 163 | else: 164 | self.plot_graph(self.axis_graph) 165 | 166 | def plot_graph(self,axis): 167 | # plot current velocity in the map 168 | # if self.draw_envs: 169 | # x_pos = list(np.linspace(0.0,self.env.width,100)) 170 | # y_pos = list(np.linspace(0.0,self.env.height,100)) 171 | # else: 172 | # x_pos = list(np.linspace(-2.5,self.env.width+2.5,110)) 173 | # y_pos = list(np.linspace(-2.5,self.env.height+2.5,110)) 174 | 175 | x_pos = list(np.linspace(0.0,self.env.width,100)) 176 | y_pos = list(np.linspace(0.0,self.env.height,100)) 177 | 178 | pos_x = [] 179 | pos_y = [] 180 | arrow_x = [] 181 | arrow_y = [] 182 | speeds = np.zeros((len(x_pos),len(y_pos))) 183 | for m,x in enumerate(x_pos): 184 | for n,y in enumerate(y_pos): 185 | v = self.env.get_velocity(x,y) 186 | speed = np.clip(np.linalg.norm(v),0.1,10) 187 | pos_x.append(x) 188 | pos_y.append(y) 189 | arrow_x.append(v[0]) 190 | arrow_y.append(v[1]) 191 | speeds[n,m] = np.log(speed) 192 | 193 | 194 | cmap = cm.Blues(np.linspace(0,1,20)) 195 | cmap = mpl.colors.ListedColormap(cmap[10:,:-1]) 196 | 197 | axis.contourf(x_pos,y_pos,speeds,cmap=cmap) 198 | axis.quiver(pos_x, pos_y, arrow_x, arrow_y, width=0.001,scale_units='xy',scale=2.0) 199 | 200 | # if not self.draw_envs: 201 | # # plot the evaluation boundary 202 | # boundary = np.array([[0.0,0.0], 203 | # [self.env.width,0.0], 204 | # [self.env.width,self.env.height], 205 | # [0.0,self.env.height], 206 | # [0.0,0.0]]) 207 | # axis.plot(boundary[:,0],boundary[:,1],color = 'r',linestyle="-.",linewidth=3) 208 | 209 | # plot obstacles in the map 210 | l = True 211 | for obs in self.env.obstacles: 212 | if l: 213 | axis.add_patch(mpl.patches.Circle((obs.x,obs.y),radius=obs.r,color='m')) 214 | l = False 215 | else: 216 | axis.add_patch(mpl.patches.Circle((obs.x,obs.y),radius=obs.r,color='m')) 217 | 218 | axis.set_aspect('equal') 219 | # if self.draw_envs: 220 | # axis.set_xlim([0.0,self.env.width]) 221 | # axis.set_ylim([0.0,self.env.height]) 222 | # else: 223 | # axis.set_xlim([-2.5,self.env.width+2.5]) 224 | # axis.set_ylim([-2.5,self.env.height+2.5]) 225 | axis.set_xlim([0.0,self.env.width]) 226 | axis.set_ylim([0.0,self.env.height]) 227 | axis.set_xticks([]) 228 | axis.set_yticks([]) 229 | 230 | # plot start and goal state of each robot 231 | for idx,robot in enumerate(self.env.robots): 232 | if not self.draw_envs: 233 | axis.scatter(robot.start[0],robot.start[1],marker="o",color="yellow",s=200,zorder=6) 234 | axis.text(robot.start[0]-1,robot.start[1]+1,str(idx),color="yellow",fontsize=25,zorder=8) 235 | axis.scatter(robot.goal[0],robot.goal[1],marker="*",color="yellow",s=650,zorder=6) 236 | axis.text(robot.goal[0]-1,robot.goal[1]+1,str(idx),color="yellow",fontsize=25,zorder=8) 237 | self.robots_last_pos.append([]) 238 | self.robots_traj_plot.append([]) 239 | 240 | self.plot_robots(axis) 241 | 242 | def plot_robots(self,axis,traj_color=None): 243 | if not self.draw_envs: 244 | for robot_plot in self.robots_plot: 245 | robot_plot.remove() 246 | self.robots_plot.clear() 247 | 248 | robot_scale = 1.5 249 | for i,robot in enumerate(self.env.robots): 250 | if robot.deactivated: 251 | continue 252 | 253 | d = np.matrix([[0.5*robot_scale*robot.length],[0.5*robot_scale*robot.width]]) 254 | rot = np.matrix([[np.cos(robot.theta),-np.sin(robot.theta)], \ 255 | [np.sin(robot.theta),np.cos(robot.theta)]]) 256 | d_r = rot * d 257 | xy = (robot.x-d_r[0,0],robot.y-d_r[1,0]) 258 | 259 | angle_d = robot.theta / np.pi * 180 260 | 261 | if self.draw_traj: 262 | robot.check_reach_goal() 263 | c = "lime" if robot.reach_goal else 'r' 264 | else: 265 | c = 'lime' 266 | # draw robot velocity (add initial length to avoid being hidden by the robot plot) 267 | robot_r = 0.5*np.linalg.norm(np.array([robot.length,robot.width])) 268 | init_len = robot_scale * robot_r + 0.1 269 | velocity_len = np.linalg.norm(robot.velocity) 270 | scaled_len = (velocity_len + init_len) / velocity_len 271 | self.robots_plot.append(axis.quiver(robot.x,robot.y,scaled_len*robot.velocity[0],scaled_len*robot.velocity[1], \ 272 | color="r",width=0.005,headlength=5,headwidth=3,scale_units='xy',scale=1)) 273 | 274 | # draw robot 275 | self.robots_plot.append(axis.add_patch(mpl.patches.Rectangle(xy,robot_scale*robot.length, \ 276 | robot_scale*robot.width, color=c, \ 277 | angle=angle_d,zorder=7))) 278 | # if not self.draw_envs: 279 | # # draw robot perception range 280 | # self.robots_plot.append(axis.add_patch(mpl.patches.Circle((robot.x,robot.y), \ 281 | # robot.perception.range, color=c, 282 | # alpha=0.2))) 283 | # robot id 284 | self.robots_plot.append(axis.text(robot.x-1,robot.y+1,str(i),color="yellow",fontsize=25,zorder=8)) 285 | 286 | if not self.draw_envs: 287 | if self.robots_last_pos[i] != []: 288 | h = axis.plot((self.robots_last_pos[i][0],robot.x), 289 | (self.robots_last_pos[i][1],robot.y), 290 | color='tab:orange' if traj_color is None else traj_color[i], 291 | linewidth=3.0) 292 | self.robots_traj_plot[i].append(h) 293 | 294 | self.robots_last_pos[i] = [robot.x, robot.y] 295 | 296 | def plot_action_and_steer_state(self,action): 297 | self.axis_action.clear() 298 | 299 | a,w = self.env.robots[0].actions[action] 300 | 301 | if self.video_plots: 302 | self.axis_action.text(1,3,"action",fontsize=25) 303 | self.axis_action.text(1,2,f"a: {a:.2f}",fontsize=20) 304 | self.axis_action.text(1,1,f"w: {w:.2f}",fontsize=20) 305 | self.axis_action.set_xlim([0,2.5]) 306 | self.axis_action.set_ylim([0,4]) 307 | else: 308 | x_pos = 0.15 309 | self.axis_action.text(x_pos,6,"Steer actions",fontweight="bold",fontsize=15) 310 | self.axis_action.text(x_pos,5,f"Acceleration (m/s^2): {a:.2f}",fontsize=15) 311 | self.axis_action.text(x_pos,4,f"Angular velocity (rad/s): {w:.2f}",fontsize=15) 312 | 313 | # robot steer state 314 | self.axis_action.text(x_pos,2,"Steer states",fontweight="bold",fontsize=15) 315 | self.axis_action.text(x_pos,1,f"Forward speed (m/s): {self.env.robot.speed:.2f}",fontsize=15) 316 | self.axis_action.text(x_pos,0,f"Orientation (rad): {self.env.robot.theta:.2f}",fontsize=15) 317 | self.axis_action.set_ylim([-1,7]) 318 | 319 | self.axis_action.set_xticks([]) 320 | self.axis_action.set_yticks([]) 321 | self.axis_action.spines["left"].set_visible(False) 322 | self.axis_action.spines["top"].set_visible(False) 323 | self.axis_action.spines["right"].set_visible(False) 324 | self.axis_action.spines["bottom"].set_visible(False) 325 | 326 | def plot_measurements(self,robot_idx,R_matrix=None): 327 | self.axis_perception.clear() 328 | # self.axis_observation.clear() 329 | # self.axis_dvl.clear() 330 | # self.axis_goal.clear() 331 | 332 | rob = self.env.robots[robot_idx] 333 | 334 | # if rob.reach_goal: 335 | # print(f"robot {robot_idx} reached goal, no measurements are available!") 336 | # return 337 | 338 | legend_size = 12 339 | font_size = 15 340 | 341 | rob.perception_output(self.env.obstacles,self.env.robots) 342 | 343 | # plot detected objects in the robot frame (rotate x-axis by 90 degree (upward) in the plot) 344 | range_c = 'g' if rob.cooperative else 'r' 345 | self.axis_perception.add_patch(mpl.patches.Circle((0,0), \ 346 | rob.perception.range, color=range_c, \ 347 | alpha = 0.2)) 348 | 349 | # plot self velocity (add initial length to avoid being hidden by the robot plot) 350 | robot_scale = 1.5 351 | robot_r = 0.5*np.linalg.norm(np.array([rob.length,rob.width])) 352 | init_len = robot_scale * robot_r 353 | velocity_len = np.linalg.norm(rob.velocity) 354 | scaled_len = (velocity_len + init_len) / velocity_len 355 | 356 | abs_velocity_r = rob.perception.observation["self"][2:] 357 | self.axis_perception.quiver(0.0,0.0,-scaled_len*abs_velocity_r[1],scaled_len*abs_velocity_r[0], \ 358 | color='r',width=0.008,headlength=5,headwidth=3,scale_units='xy',scale=1) 359 | 360 | robot_c = 'g' 361 | self.axis_perception.add_patch(mpl.patches.Rectangle((-0.5*robot_scale*rob.width,-0.5*robot_scale*rob.length), \ 362 | robot_scale*rob.width,robot_scale*rob.length, color=robot_c)) 363 | 364 | x_pos = 0 365 | y_pos = 0 366 | relation_pos = [[0.0,0.0]] 367 | 368 | for i,obs in enumerate(rob.perception.observation["static"]): 369 | # rotate by 90 degree 370 | self.axis_perception.add_patch(mpl.patches.Circle((-obs[1],obs[0]), \ 371 | obs[2], color="m")) 372 | relation_pos.append([-obs[1],obs[0]]) 373 | # include into observation info 374 | # self.axis_observation.text(x_pos,y_pos,f"position: ({obs[0]:.2f},{obs[1]:.2f}), radius: {obs[2]:.2f}") 375 | # y_pos += 1 376 | 377 | # self.axis_observation.text(x_pos,y_pos,"Static obstacles",fontweight="bold",fontsize=15) 378 | # y_pos += 2 379 | 380 | if rob.cooperative: 381 | # for i,obj_history in enumerate(rob.perception.observation["dynamic"].values()): 382 | for i,obj in enumerate(rob.perception.observation["dynamic"]): 383 | # plot the current position 384 | # pos = obj_history[-1][:2] 385 | 386 | # plot velocity (rotate by 90 degree) 387 | velocity_len = np.linalg.norm(rob.velocity) 388 | scaled_len = (velocity_len + init_len) / velocity_len 389 | self.axis_perception.quiver(-obj[1],obj[0],-scaled_len*obj[3],scaled_len*obj[2],color="r", \ 390 | width=0.008,headlength=5,headwidth=3,scale_units='xy',scale=1) 391 | 392 | # plot position (rotate by 90 degree) 393 | self.axis_perception.add_patch(mpl.patches.Circle((-obj[1],obj[0]), \ 394 | rob.detect_r, color="g")) 395 | relation_pos.append([-obj[1],obj[0]]) 396 | 397 | # include history into observation info 398 | # self.axis_observation.text(x_pos,y_pos,f"position: ({obj[0]:.2f},{obj[1]:.2f}), velocity: ({obj[2]:.2f},{obj[3]:.2f})") 399 | # y_pos += 1 400 | 401 | # self.axis_observation.text(x_pos,y_pos,"Other Robots",fontweight="bold",fontsize=15) 402 | # y_pos += 2 403 | 404 | 405 | if R_matrix is not None: 406 | # plot relation matrix 407 | length = len(R_matrix) 408 | assert len(relation_pos) == length, "The number of objects do not match size of the relation matrix" 409 | for i in range(length): 410 | for j in range(length): 411 | self.axis_perception.plot([relation_pos[i][0],relation_pos[j][0]], \ 412 | [relation_pos[i][1],relation_pos[j][1]], 413 | linewidth=2*R_matrix[i][j],color='k',zorder=0) 414 | 415 | type = "cooperative" if rob.cooperative else "non-cooperative" 416 | # self.axis_observation.text(x_pos,y_pos,f"Showing the observation of robot {robot_idx} ({type})",fontweight="bold",fontsize=20) 417 | 418 | self.axis_perception.set_xlim([-rob.perception.range-1,rob.perception.range+1]) 419 | self.axis_perception.set_ylim([-rob.perception.range-1,rob.perception.range+1]) 420 | self.axis_perception.set_aspect('equal') 421 | self.axis_perception.set_title(f'Robot {robot_idx}',fontsize=25) 422 | 423 | self.axis_perception.set_xticks([]) 424 | self.axis_perception.set_yticks([]) 425 | self.axis_perception.spines["left"].set_visible(False) 426 | self.axis_perception.spines["top"].set_visible(False) 427 | self.axis_perception.spines["right"].set_visible(False) 428 | self.axis_perception.spines["bottom"].set_visible(False) 429 | 430 | # self.axis_observation.set_ylim([-1,y_pos+1]) 431 | # self.axis_observation.set_xticks([]) 432 | # self.axis_observation.set_yticks([]) 433 | # self.axis_observation.spines["left"].set_visible(False) 434 | # self.axis_observation.spines["top"].set_visible(False) 435 | # self.axis_observation.spines["right"].set_visible(False) 436 | # self.axis_observation.spines["bottom"].set_visible(False) 437 | 438 | 439 | # # plot robot velocity in the robot frame (rotate x-axis by 90 degree (upward) in the plot) 440 | # h1 = self.axis_dvl.arrow(0.0,0.0,0.0,1.0, \ 441 | # color='k', \ 442 | # width = 0.02, \ 443 | # head_width = 0.08, \ 444 | # head_length = 0.12, \ 445 | # length_includes_head=True, \ 446 | # label='steering direction') 447 | # # rotate by 90 degree 448 | # h2 = self.axis_dvl.arrow(0.0,0.0,-abs_velocity_r[1],abs_velocity_r[0], \ 449 | # color='r',width=0.02, head_width = 0.08, \ 450 | # head_length = 0.12, length_includes_head=True, \ 451 | # label='velocity wrt seafloor') 452 | # x_range = np.max([2,np.abs(abs_velocity_r[1])]) 453 | # y_range = np.max([2,np.abs(abs_velocity_r[0])]) 454 | # mpl.rcParams["font.size"]=12 455 | # self.axis_dvl.set_xlim([-x_range,x_range]) 456 | # self.axis_dvl.set_ylim([-1,y_range]) 457 | # self.axis_dvl.set_aspect('equal') 458 | # self.axis_dvl.legend(handles=[h1,h2],loc='lower center',fontsize=legend_size) 459 | # self.axis_dvl.set_title('Velocity Measurement',fontsize=font_size) 460 | 461 | # self.axis_dvl.set_xticks([]) 462 | # self.axis_dvl.set_yticks([]) 463 | # self.axis_dvl.spines["left"].set_visible(False) 464 | # self.axis_dvl.spines["top"].set_visible(False) 465 | # self.axis_dvl.spines["right"].set_visible(False) 466 | # self.axis_dvl.spines["bottom"].set_visible(False) 467 | 468 | 469 | # # give goal position info in the robot frame 470 | # goal_r = rob.perception.observation["self"][:2] 471 | # x1 = 0.07 472 | # x2 = x1 + 0.13 473 | # self.axis_goal.text(x1,0.5,"Goal Position (Relative)",fontsize=font_size) 474 | # self.axis_goal.text(x2,0.25,f"({goal_r[0]:.2f}, {goal_r[1]:.2f})",fontsize=font_size) 475 | 476 | # self.axis_goal.set_xticks([]) 477 | # self.axis_goal.set_yticks([]) 478 | # self.axis_goal.spines["left"].set_visible(False) 479 | # self.axis_goal.spines["top"].set_visible(False) 480 | # self.axis_goal.spines["right"].set_visible(False) 481 | # self.axis_goal.spines["bottom"].set_visible(False) 482 | 483 | def plot_return_dist(self,action): 484 | for axis in self.axis_dist: 485 | axis.clear() 486 | 487 | dist_interval = 1 488 | mean_bar = 0.35 489 | idx = 0 490 | 491 | xlim = [np.inf,-np.inf] 492 | for idx, cvar in enumerate(action["cvars"]): 493 | ylabelright=[] 494 | 495 | quantiles = np.array(action["quantiles"][idx]) 496 | 497 | q_means = np.mean(quantiles,axis=0) 498 | max_a = np.argmax(q_means) 499 | for i, a in enumerate(self.env.robots[0].actions): 500 | q_mean = q_means[i] 501 | # q_mean = np.mean(quantiles[:,i]) 502 | 503 | ylabelright.append( 504 | "\n".join([f"a: {a[0]:.2f}",f"w: {a[1]:.2f}"]) 505 | ) 506 | 507 | # ylabelright.append(f"mean: {q_mean:.2f}") 508 | 509 | self.axis_dist[idx].axhline(i*dist_interval, color="black", linewidth=2.0, zorder=0) 510 | self.axis_dist[idx].scatter(quantiles[:,i], i*np.ones(len(quantiles[:,i]))*dist_interval,color="g", marker="x",s=80,linewidth=3) 511 | self.axis_dist[idx].hlines(y=i*dist_interval, xmin=np.min(quantiles[:,i]), xmax=np.max(quantiles[:,i]),zorder=0) 512 | if i == max_a: 513 | self.axis_dist[idx].vlines(q_mean, ymin=i*dist_interval-mean_bar, ymax=i*dist_interval+mean_bar,color="red",linewidth=5) 514 | else: 515 | self.axis_dist[idx].vlines(q_mean, ymin=i*dist_interval-mean_bar, ymax=i*dist_interval+mean_bar,color="blue",linewidth=3) 516 | 517 | self.axis_dist[idx].tick_params(axis="x", labelsize=15) 518 | self.axis_dist[idx].set_ylim([-1.0,i+1]) 519 | self.axis_dist[idx].set_yticks([]) 520 | if idx == len(action["cvars"])-1: 521 | self.axis_dist[idx].set_yticks(range(0,i+1)) 522 | self.axis_dist[idx].yaxis.tick_right() 523 | self.axis_dist[idx].set_yticklabels(labels=ylabelright,fontsize=15) 524 | 525 | if len(action["cvars"]) > 1: 526 | if idx == 0: 527 | self.axis_dist[idx].set_title("adpative: "+r'$\phi$'+f" = {cvar:.2f}",fontsize=20) 528 | else: 529 | self.axis_dist[idx].set_title(r'$\phi$'+f" = {cvar:.2f}",fontsize=20) 530 | else: 531 | self.axis_dist[idx].set_title(r'$\phi$'+f" = {cvar:.2f}",fontsize=20) 532 | 533 | xlim[0] = min(xlim[0],np.min(quantiles)-5) 534 | xlim[1] = max(xlim[1],np.max(quantiles)+5) 535 | 536 | for idx, cvar in enumerate(action["cvars"]): 537 | # self.axis_dist[idx].xaxis.set_ticks(np.arange(xlim[0],xlim[1]+1,(xlim[1]-xlim[0])/5)) 538 | self.axis_dist[idx].set_xlim(xlim) 539 | 540 | def plot_action_values(self,action): 541 | self.axis_qvalues.clear() 542 | 543 | dist_interval = 1 544 | mean_bar = 0.35 545 | ylabelright=[] 546 | 547 | q_values = np.array(action["qvalues"]) 548 | max_a = np.argmax(q_values) 549 | for i, a in enumerate(self.env.robots[0].actions): 550 | ylabelright.append( 551 | "\n".join([f"a: {a[0]:.2f}",f"w: {a[1]:.2f}"]) 552 | ) 553 | self.axis_qvalues.axhline(i*dist_interval, color="black", linewidth=2.0, zorder=0) 554 | if i == max_a: 555 | self.axis_qvalues.vlines(q_values[i], ymin=i*dist_interval-mean_bar, ymax=i*dist_interval+mean_bar,color="red",linewidth=8) 556 | else: 557 | self.axis_qvalues.vlines(q_values[i], ymin=i*dist_interval-mean_bar, ymax=i*dist_interval+mean_bar,color="blue",linewidth=5) 558 | 559 | self.axis_qvalues.set_title("action values",fontsize=20) 560 | self.axis_qvalues.tick_params(axis="x", labelsize=15) 561 | self.axis_qvalues.set_ylim([-1.0,i+1]) 562 | self.axis_qvalues.set_yticks(range(0,i+1)) 563 | self.axis_qvalues.yaxis.tick_right() 564 | self.axis_qvalues.set_yticklabels(labels=ylabelright,fontsize=15) 565 | self.axis_qvalues.set_xlim([np.min(q_values)-5,np.max(q_values)+5]) 566 | 567 | def one_step(self,actions,robot_idx=0): 568 | assert len(actions) == len(self.env.robots), "Number of actions not equal number of robots!" 569 | for i,action in enumerate(actions): 570 | rob = self.env.robots[i] 571 | current_velocity = self.env.get_velocity(rob.x, rob.y) 572 | rob.update_state(action,current_velocity) 573 | 574 | self.plot_robots() 575 | self.plot_measurements(robot_idx) 576 | # if not self.plot_dist and not self.plot_qvalues: 577 | # self.plot_action_and_steer_state(action["action"]) 578 | 579 | if self.step % self.env.robots[0].N == 0: 580 | if self.plot_dist: 581 | self.plot_return_dist(action) 582 | elif self.plot_qvalues: 583 | self.plot_action_values(action) 584 | 585 | self.step += 1 586 | 587 | def init_animation(self): 588 | # plot initial robot position 589 | self.plot_robots() 590 | 591 | # plot initial measurments 592 | # self.plot_measurements() 593 | 594 | def visualize_control(self,action_sequence,start_idx=0): 595 | # update robot state and make animation when executing action sequence 596 | actions = [] 597 | 598 | # counter for updating distributions plot 599 | self.step = start_idx 600 | 601 | for i,a in enumerate(action_sequence): 602 | for _ in range(self.env.robots[0].N): 603 | # action = {} 604 | # action["action"] = a 605 | # if self.video_plots: 606 | # if self.plot_dist: 607 | # action["cvars"] = self.episode_actions_cvars[i] 608 | # action["quantiles"] = self.episode_actions_quantiles[i] 609 | # action["taus"] = self.episode_actions_taus[i] 610 | # elif self.plot_qvalues: 611 | # action["qvalues"] = self.episode_actions_values[i] 612 | 613 | actions.append(a) 614 | 615 | if self.video_plots: 616 | for i,action in enumerate(actions): 617 | self.one_step(action) 618 | self.fig.savefig(f"{self.plots_save_dir}/step_{self.step}.png",pad_inches=0.2,dpi=self.dpi) 619 | else: 620 | # self.animation = animation.FuncAnimation(self.fig, self.one_step,frames=actions, \ 621 | # init_func=self.init_animation, 622 | # interval=10,repeat=False) 623 | for i,action in enumerate(actions): 624 | self.one_step(action) 625 | plt.show() 626 | 627 | def load_env_config(self,episode_dict): 628 | episode = copy.deepcopy(episode_dict) 629 | 630 | self.env.reset_with_eval_config(episode) 631 | 632 | if self.plot_dist: 633 | # load action cvars, quantiles and taus 634 | self.episode_actions_cvars = episode["robot"]["actions_cvars"] 635 | self.episode_actions_quantiles = episode["robot"]["actions_quantiles"] 636 | self.episode_actions_taus = episode["robot"]["actions_taus"] 637 | elif self.plot_qvalues: 638 | # load action values 639 | self.episode_actions_values = episode["robot"]["actions_values"] 640 | 641 | def load_env_config_from_eval_files(self,config_f,eval_f,eval_id,env_id): 642 | with open(config_f,"r") as f: 643 | episodes = json.load(f) 644 | episode = episodes[f"env_{env_id}"] 645 | eval_file = np.load(eval_f,allow_pickle=True) 646 | episode["robot"]["action_history"] = copy.deepcopy(eval_file["actions"][eval_id][env_id]) 647 | self.load_env_config(episode) 648 | 649 | def load_env_config_from_json_file(self,filename): 650 | with open(filename,"r") as f: 651 | episode = json.load(f) 652 | self.load_env_config(episode) 653 | 654 | # def play_episode(self,start_idx=0): 655 | # for plot in self.robot_traj_plot: 656 | # plot[0].remove() 657 | # self.robot_traj_plot.clear() 658 | 659 | # current_v = self.env.get_velocity(self.env.start[0],self.env.start[1]) 660 | # self.env.robot.reset_state(self.env.start[0],self.env.start[1], current_velocity=current_v) 661 | 662 | # self.init_visualize() 663 | 664 | # self.visualize_control(self.episode_actions,start_idx) 665 | 666 | def load_eval_config_and_episode(self,config_file,eval_file): 667 | with open(config_file,"r") as f: 668 | self.configs = json.load(f) 669 | 670 | self.episodes = np.load(eval_file,allow_pickle=True) 671 | 672 | def play_eval_episode(self,eval_id,episode_id,colors,robot_ids=None): 673 | self.env.reset_with_eval_config(self.configs[episode_id]) 674 | self.init_visualize() 675 | 676 | trajectories = self.episodes["trajectories"][eval_id][episode_id] 677 | 678 | self.play_episode(trajectories,colors,robot_ids) 679 | 680 | def play_episode(self, 681 | trajectories, 682 | colors, 683 | robot_ids=None, 684 | max_steps=None, 685 | start_step=0): 686 | 687 | # sort robots according trajectory lengths 688 | all_robots = [] 689 | for i,traj in enumerate(trajectories): 690 | plot_observation = False if robot_ids is None else i in robot_ids 691 | all_robots.append({"id":i,"traj_len":len(traj),"plot_observation":plot_observation}) 692 | all_robots = sorted(all_robots, key=lambda x: x["traj_len"]) 693 | all_robots[-1]["plot_observation"] = True 694 | 695 | if max_steps is None: 696 | max_steps = all_robots[-1]["traj_len"]-1 697 | 698 | robots = [] 699 | for robot in all_robots: 700 | if robot["plot_observation"] is True: 701 | robots.append(robot) 702 | 703 | idx = 0 704 | current_robot_step = 0 705 | for i in range(max_steps): 706 | if i >= robots[idx]["traj_len"]: 707 | current_robot_step = 0 708 | idx += 1 709 | self.plot_robots(self.axis_graph,colors) 710 | self.plot_measurements(robots[idx]["id"]) 711 | # action = [actions[j][i] for j in range(len(self.env.robots))] 712 | # self.env.step(action) 713 | 714 | for j,rob in enumerate(self.env.robots): 715 | if rob.deactivated: 716 | continue 717 | rob.x = trajectories[j][i][0] 718 | rob.y = trajectories[j][i][1] 719 | rob.theta = trajectories[j][i][2] 720 | rob.speed = trajectories[j][i][3] 721 | rob.velocity = np.array(trajectories[j][i][4:]) 722 | 723 | if self.video_plots: 724 | if current_robot_step % self.env.robots[0].N == 0: 725 | action = self.action_data(robots[idx]["id"]) 726 | if self.agent_name == "adaptive_IQN" or self.agent_name == "IQN": 727 | self.plot_return_dist(action) 728 | elif self.agent_name == "DQN": 729 | self.plot_action_values(action) 730 | elif self.agent_name == "APF" or self.agent_name == "RVO": 731 | self.plot_action_and_steer_state(action) 732 | self.fig.savefig(f"{self.plots_save_dir}/step_{start_step+i}.png",dpi=self.dpi) 733 | else: 734 | plt.pause(0.01) 735 | 736 | for j,rob in enumerate(self.env.robots): 737 | if i == len(trajectories[j])-1: 738 | rob.deactivated = True 739 | 740 | current_robot_step += 1 741 | 742 | def draw_dist_plot(self, 743 | trajectories, 744 | robot_id, 745 | step_id, 746 | colors 747 | ): 748 | 749 | self.init_visualize() 750 | 751 | for i in range(step_id+1): 752 | self.plot_robots(self.axis_graph,traj_color=colors) 753 | for j,rob in enumerate(self.env.robots): 754 | if rob.deactivated: 755 | continue 756 | rob.x = trajectories[j][i+1][0] 757 | rob.y = trajectories[j][i+1][1] 758 | rob.theta = trajectories[j][i+1][2] 759 | rob.speed = trajectories[j][i+1][3] 760 | rob.velocity = np.array(trajectories[j][i+1][4:]) 761 | if i+1 == len(trajectories[j])-1: 762 | rob.deactivated = True 763 | 764 | # plot observation 765 | self.plot_measurements(robot_id) 766 | 767 | action = self.action_data(robot_id) 768 | self.plot_return_dist(action) 769 | 770 | self.fig.savefig("IQN_dist_plot.png",bbox_inches="tight") 771 | 772 | def action_data(self, robot_id): 773 | rob = self.env.robots[robot_id] 774 | observation,_,_ = rob.perception_output(self.env.obstacles,self.env.robots) 775 | 776 | if self.agent_name == "adaptive_IQN": 777 | # compute total distribution and adaptive CVaR distribution 778 | a_cvar,quantiles_cvar,_,cvar = self.agent.act_adaptive(observation) 779 | a_greedy,quantiles_greedy,_ = self.agent.act(observation) 780 | 781 | action = dict(action=[a_cvar,a_greedy], 782 | cvars=[cvar,1.0], 783 | quantiles=[quantiles_cvar[0],quantiles_greedy[0]]) 784 | elif self.agent_name == "IQN": 785 | a_greedy,quantiles_greedy,_ = self.agent.act(observation) 786 | 787 | action = dict(action=[a_greedy], 788 | cvars=[1.0], 789 | quantiles=[quantiles_greedy[0]]) 790 | elif self.agent_name == "DQN": 791 | a,qvalues = self.agent.act_dqn(observation) 792 | 793 | action = dict(action=a,qvalues=qvalues[0]) 794 | elif self.agent_name == "APF" or self.agent_name == "RVO": 795 | action = self.agent.act(observation) 796 | return action 797 | 798 | 799 | def draw_trajectory(self, 800 | trajectories, 801 | colors, 802 | name=None, 803 | ): 804 | # Used in Mode 3 805 | self.init_visualize() 806 | 807 | # select a robot that is active utill the end of the episode 808 | robot_id = 0 809 | max_length = 0 810 | for i,traj in enumerate(trajectories): 811 | print("rob: ",i," len: ",len(traj)) 812 | if len(traj) > max_length: 813 | robot_id = i 814 | max_length = len(traj) 815 | 816 | print("\n") 817 | 818 | for i in range(len(trajectories[robot_id])-1): 819 | self.plot_robots(self.axis_graph,traj_color=colors) 820 | for j,rob in enumerate(self.env.robots): 821 | if rob.deactivated: 822 | continue 823 | rob.x = trajectories[j][i+1][0] 824 | rob.y = trajectories[j][i+1][1] 825 | rob.theta = trajectories[j][i+1][2] 826 | rob.speed = trajectories[j][i+1][3] 827 | rob.velocity = np.array(trajectories[j][i+1][4:]) 828 | if i+1 == len(trajectories[j])-1: 829 | rob.deactivated = True 830 | 831 | # for robot_plot in self.robots_plot: 832 | # robot_plot.remove() 833 | # self.robots_plot.clear() 834 | 835 | fig_name = "trajectory_test.png" if name is None else f"trajectory_{name}.png" 836 | self.fig.savefig(fig_name,bbox_inches="tight") 837 | 838 | def draw_video_plots(self,episode,save_dir,start_idx,agent): 839 | # Used in Mode 4 840 | self.agent = agent 841 | self.load_env_config(episode) 842 | self.plots_save_dir = save_dir 843 | self.play_episode(start_idx) 844 | return self.step 845 | 846 | -------------------------------------------------------------------------------- /marinenav_env/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='marinenav_env-v0', 5 | entry_point='marinenav_env.envs:MarineNavEnv2', 6 | ) -------------------------------------------------------------------------------- /marinenav_env/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from marinenav_env.envs.marinenav_env import MarineNavEnv2 -------------------------------------------------------------------------------- /marinenav_env/envs/marinenav_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.spatial 3 | import marinenav_env.envs.utils.robot as robot 4 | import gym 5 | import json 6 | import copy 7 | 8 | class Core: 9 | 10 | def __init__(self, x:float, y:float, clockwise:bool, Gamma:float): 11 | 12 | self.x = x # x coordinate of the vortex core 13 | self.y = y # y coordinate of the vortex core 14 | self.clockwise = clockwise # if the vortex direction is clockwise 15 | self.Gamma = Gamma # circulation strength of the vortex core 16 | 17 | class Obstacle: 18 | 19 | def __init__(self, x:float, y:float, r:float): 20 | 21 | self.x = x # x coordinate of the obstacle center 22 | self.y = y # y coordinate of the obstacle center 23 | self.r = r # radius of the obstacle 24 | 25 | class MarineNavEnv2(gym.Env): 26 | 27 | def __init__(self, seed:int=0, schedule:dict=None): 28 | 29 | self.sd = seed 30 | self.rd = np.random.RandomState(seed) # PRNG 31 | 32 | # parameter initialization 33 | self.width = 50 # x coordinate dimension of the map 34 | self.height = 50 # y coordinate dimension of the map 35 | self.r = 0.5 # radius of vortex core 36 | self.v_rel_max = 1.0 # max allowable speed when two currents flowing towards each other 37 | self.p = 0.8 # max allowable relative speed at another vortex core 38 | self.v_range = [5,10] # speed range of the vortex (at the edge of core) 39 | self.obs_r_range = [1,1] # radius range of the obstacle 40 | # self.min_obs_core_dis = 3.0 # minimum distance between a vortex core and an obstacle 41 | self.clear_r = 5.0 # radius of area centered at the start(goal) of each robot, 42 | # where no vortex cores, static obstacles, or the start(goal) of other robots exist 43 | self.timestep_penalty = -1.0 44 | # self.distance_penalty = -2.0 45 | self.collision_penalty = -50.0 46 | self.goal_reward = 100.0 47 | self.num_cores = 8 # number of vortices 48 | self.num_obs = 8 # number of static obstacles 49 | self.min_start_goal_dis = 30.0 # minimum distance between start and goal 50 | self.num_cooperative = 3 # number of cooperative robots 51 | self.num_non_cooperative = 3 # number of non-cooperative robots 52 | 53 | self.robots = [] # list of robots 54 | for _ in range(self.num_cooperative): 55 | self.robots.append(robot.Robot(cooperative=True)) 56 | for _ in range(self.num_non_cooperative): 57 | self.robots.append(robot.Robot(cooperative=False)) 58 | 59 | self.cores = [] # list of vortex cores 60 | self.obstacles = [] # list of static obstacles 61 | 62 | self.schedule = schedule # schedule for curriculum learning 63 | self.episode_timesteps = 0 # current episode timesteps 64 | self.total_timesteps = 0 # learning timesteps 65 | 66 | # self.set_boundary = False # set boundary of environment 67 | 68 | self.observation_in_robot_frame = True # return observation in robot frame 69 | 70 | def get_action_space_dimension(self): 71 | return self.robot.compute_actions_dimension() 72 | 73 | def reset(self): 74 | # reset the environment 75 | 76 | if self.schedule is not None: 77 | steps = self.schedule["timesteps"] 78 | diffs = np.array(steps) - self.total_timesteps 79 | 80 | # find the interval the current timestep falls into 81 | idx = len(diffs[diffs<=0])-1 82 | 83 | self.num_cooperative = self.schedule["num_cooperative"][idx] 84 | self.num_non_cooperative = self.schedule["num_non_cooperative"][idx] 85 | self.num_cores = self.schedule["num_cores"][idx] 86 | self.num_obs = self.schedule["num_obstacles"][idx] 87 | self.min_start_goal_dis = self.schedule["min_start_goal_dis"][idx] 88 | 89 | print("\n======== training schedule ========") 90 | print("num of cooperative agents: ",self.num_cooperative) 91 | print("num of non-cooperative agents: ",self.num_non_cooperative) 92 | print("num of cores: ",self.num_cores) 93 | print("num of obstacles: ",self.num_obs) 94 | print("min start goal dis: ",self.min_start_goal_dis) 95 | print("======== training schedule ========\n") 96 | 97 | self.episode_timesteps = 0 98 | 99 | self.cores.clear() 100 | self.obstacles.clear() 101 | self.robots.clear() 102 | 103 | num_cores = self.num_cores 104 | num_obs = self.num_obs 105 | robot_types = [True]*self.num_cooperative + [False]*self.num_non_cooperative 106 | assert len(robot_types) > 0, "Number of robots is 0!" 107 | 108 | ##### generate robots with randomly generated start and goal 109 | num_robots = 0 110 | iteration = 500 111 | while True: 112 | start = self.rd.uniform(low = 2.0*np.ones(2), high = np.array([self.width-2.0,self.height-2.0])) 113 | goal = self.rd.uniform(low = 2.0*np.ones(2), high = np.array([self.width-2.0,self.height-2.0])) 114 | iteration -= 1 115 | if self.check_start_and_goal(start,goal): 116 | rob = robot.Robot(robot_types[num_robots]) 117 | rob.start = start 118 | rob.goal = goal 119 | self.reset_robot(rob) 120 | self.robots.append(rob) 121 | num_robots += 1 122 | if iteration == 0 or num_robots == len(robot_types): 123 | break 124 | 125 | ##### generate vortex with random position, spinning direction and strength 126 | if num_cores > 0: 127 | iteration = 500 128 | while True: 129 | center = self.rd.uniform(low = np.zeros(2), high = np.array([self.width,self.height])) 130 | direction = self.rd.binomial(1,0.5) 131 | v_edge = self.rd.uniform(low = self.v_range[0], high = self.v_range[1]) 132 | Gamma = 2 * np.pi * self.r * v_edge 133 | core = Core(center[0],center[1],direction,Gamma) 134 | iteration -= 1 135 | if self.check_core(core): 136 | self.cores.append(core) 137 | num_cores -= 1 138 | if iteration == 0 or num_cores == 0: 139 | break 140 | 141 | centers = None 142 | for core in self.cores: 143 | if centers is None: 144 | centers = np.array([[core.x,core.y]]) 145 | else: 146 | c = np.array([[core.x,core.y]]) 147 | centers = np.vstack((centers,c)) 148 | 149 | # KDTree storing vortex core center positions 150 | if centers is not None: 151 | self.core_centers = scipy.spatial.KDTree(centers) 152 | 153 | ##### generate static obstacles with random position and size 154 | if num_obs > 0: 155 | iteration = 500 156 | while True: 157 | center = self.rd.uniform(low = 5.0*np.ones(2), high = np.array([self.width-5.0,self.height-5.0])) 158 | r = self.rd.uniform(low = self.obs_r_range[0], high = self.obs_r_range[1]) 159 | obs = Obstacle(center[0],center[1],r) 160 | iteration -= 1 161 | if self.check_obstacle(obs): 162 | self.obstacles.append(obs) 163 | num_obs -= 1 164 | if iteration == 0 or num_obs == 0: 165 | break 166 | 167 | return self.get_observations() 168 | 169 | def reset_robot(self,rob): 170 | # reset robot state 171 | rob.reach_goal = False 172 | rob.collision = False 173 | rob.deactivated = False 174 | rob.init_theta = self.rd.uniform(low = 0.0, high = 2*np.pi) 175 | rob.init_speed = self.rd.uniform(low = 0.0, high = rob.max_speed) 176 | current_v = self.get_velocity(rob.start[0],rob.start[1]) 177 | rob.reset_state(current_velocity=current_v) 178 | 179 | def check_all_deactivated(self): 180 | res = True 181 | for rob in self.robots: 182 | if not rob.deactivated: 183 | res = False 184 | break 185 | return res 186 | 187 | def check_all_reach_goal(self): 188 | res = True 189 | for rob in self.robots: 190 | if not rob.reach_goal: 191 | res = False 192 | break 193 | return res 194 | 195 | def check_any_collision(self): 196 | res = False 197 | for rob in self.robots: 198 | if rob.collision: 199 | res = True 200 | break 201 | return res 202 | 203 | def step(self, actions): 204 | 205 | rewards = [0]*len(self.robots) 206 | 207 | assert len(actions) == len(self.robots), "Number of actions not equal number of robots!" 208 | assert self.check_all_reach_goal() is not True, "All robots reach goals, not actions are available!" 209 | # Execute actions for all robots 210 | for i,action in enumerate(actions): 211 | rob = self.robots[i] 212 | 213 | if rob.deactivated: 214 | # This robot is in the deactivate state 215 | continue 216 | 217 | # save action to history 218 | rob.action_history.append(action) 219 | 220 | dis_before = rob.dist_to_goal() 221 | 222 | # update robot state after executing the action 223 | for _ in range(rob.N): 224 | current_velocity = self.get_velocity(rob.x, rob.y) 225 | rob.update_state(action,current_velocity) 226 | 227 | # save robot state 228 | rob.trajectory.append([rob.x,rob.y,rob.theta,rob.speed,rob.velocity[0],rob.velocity[1]]) 229 | 230 | dis_after = rob.dist_to_goal() 231 | 232 | # constant penalty applied at every time step 233 | rewards[i] += self.timestep_penalty 234 | 235 | # reward agent for getting closer to the goal 236 | rewards[i] += dis_before - dis_after 237 | 238 | 239 | # Get observation for all robots 240 | observations, collisions, reach_goals = self.get_observations() 241 | 242 | dones = [False]*len(self.robots) 243 | infos = [{"state":"normal"}]*len(self.robots) 244 | 245 | # (1) end the current episode if it's too long 246 | # (2) if any collision happens, end the current episode 247 | # (3) if all robots reach goals (robot list is empty), end the current episode 248 | 249 | # end_episode = False 250 | for idx,rob in enumerate(self.robots): 251 | if rob.deactivated: 252 | # This robot is in the deactivate state 253 | dones[idx] = True 254 | if rob.collision: 255 | infos[idx] = {"state":"deactivated after collision"} 256 | elif rob.reach_goal: 257 | infos[idx] = {"state":"deactivated after reaching goal"} 258 | else: 259 | raise RuntimeError("Robot being deactived can only be caused by collsion or reaching goal!") 260 | continue 261 | 262 | # min_dis = min_distances[idx] 263 | # if min_dis <= rob.obs_dis: 264 | # # penalize agent for being close to other objects 265 | # rewards[i] += self.distance_penalty + min_dis * (-self.distance_penalty/rob.obs_dis) 266 | 267 | if self.episode_timesteps >= 1000: 268 | # end_episode = True 269 | dones[idx] = True 270 | infos[idx] = {"state":"too long episode"} 271 | elif collisions[idx]: 272 | rewards[idx] += self.collision_penalty 273 | # end_episode = True 274 | dones[idx] = True 275 | infos[idx] = {"state":"collision"} 276 | elif reach_goals[idx]: 277 | rewards[idx] += self.goal_reward 278 | dones[idx] = True 279 | infos[idx] = {"state":"reach goal"} 280 | else: 281 | dones[idx] = False 282 | infos[idx] = {"state":"normal"} 283 | 284 | # if self.check_all_reach_goal(): 285 | # end_episode = True 286 | 287 | # if self.set_boundary and self.out_of_boundary(): 288 | # # No used in training 289 | # done = True 290 | # info = {"state":"out of boundary"} 291 | 292 | self.episode_timesteps += 1 293 | self.total_timesteps += 1 294 | 295 | return observations, rewards, dones, infos 296 | 297 | def out_of_boundary(self): 298 | # only used when boundary is set 299 | x_out = self.robot.x < 0.0 or self.robot.x > self.width 300 | y_out = self.robot.y < 0.0 or self.robot.y > self.height 301 | return x_out or y_out 302 | 303 | def get_observations(self): 304 | observations = [] 305 | collisions = [] 306 | reach_goals = [] 307 | # min_distances = [] 308 | for robot in self.robots: 309 | observation, collision, reach_goal = robot.perception_output(self.obstacles,self.robots,self.observation_in_robot_frame) 310 | observations.append(observation) 311 | collisions.append(collision) 312 | reach_goals.append(reach_goal) 313 | # min_distances.append(min_distance) 314 | return observations, collisions, reach_goals 315 | 316 | def check_start_and_goal(self,start,goal): 317 | 318 | # The start and goal point is far enough 319 | if np.linalg.norm(goal-start) < self.min_start_goal_dis: 320 | return False 321 | 322 | for robot in self.robots: 323 | 324 | dis_s = robot.start - start 325 | # Start point not too close to that of existing robots 326 | if np.linalg.norm(dis_s) <= self.clear_r: 327 | return False 328 | 329 | dis_g = robot.goal - goal 330 | # Goal point not too close to that of existing robots 331 | if np.linalg.norm(dis_g) <= self.clear_r: 332 | return False 333 | 334 | return True 335 | 336 | def check_core(self,core_j): 337 | 338 | # Within the range of the map 339 | if core_j.x - self.r < 0.0 or core_j.x + self.r > self.width: 340 | return False 341 | if core_j.y - self.r < 0.0 or core_j.y + self.r > self.width: 342 | return False 343 | 344 | for robot in self.robots: 345 | # Not too close to start and goal point of each robot 346 | core_pos = np.array([core_j.x,core_j.y]) 347 | dis_s = core_pos - robot.start 348 | if np.linalg.norm(dis_s) < self.r + self.clear_r: 349 | return False 350 | dis_g = core_pos - robot.goal 351 | if np.linalg.norm(dis_g) < self.r + self.clear_r: 352 | return False 353 | 354 | for core_i in self.cores: 355 | dx = core_i.x - core_j.x 356 | dy = core_i.y - core_j.y 357 | dis = np.sqrt(dx*dx+dy*dy) 358 | 359 | if core_i.clockwise == core_j.clockwise: 360 | # i and j rotate in the same direction, their currents run towards each other at boundary 361 | # The currents speed at boundary need to be lower than threshold 362 | boundary_i = core_i.Gamma / (2*np.pi*self.v_rel_max) 363 | boundary_j = core_j.Gamma / (2*np.pi*self.v_rel_max) 364 | if dis < boundary_i + boundary_j: 365 | return False 366 | else: 367 | # i and j rotate in the opposite direction, their currents join at boundary 368 | # The relative current speed of the stronger vortex at boundary need to be lower than threshold 369 | Gamma_l = max(core_i.Gamma, core_j.Gamma) 370 | Gamma_s = min(core_i.Gamma, core_j.Gamma) 371 | v_1 = Gamma_l / (2*np.pi*(dis-2*self.r)) 372 | v_2 = Gamma_s / (2*np.pi*self.r) 373 | if v_1 > self.p * v_2: 374 | return False 375 | 376 | return True 377 | 378 | def check_obstacle(self,obs): 379 | 380 | # Within the range of the map 381 | if obs.x - obs.r < 0.0 or obs.x + obs.r > self.width: 382 | return False 383 | if obs.y - obs.r < 0.0 or obs.y + obs.r > self.height: 384 | return False 385 | 386 | for robot in self.robots: 387 | # Not too close to start and goal point 388 | obs_pos = np.array([obs.x,obs.y]) 389 | dis_s = obs_pos - robot.start 390 | if np.linalg.norm(dis_s) < obs.r + self.clear_r: 391 | return False 392 | dis_g = obs_pos - robot.goal 393 | if np.linalg.norm(dis_g) < obs.r + self.clear_r: 394 | return False 395 | 396 | # Not too close to vortex cores 397 | for core in self.cores: 398 | dx = core.x - obs.x 399 | dy = core.y - obs.y 400 | dis = np.sqrt(dx*dx + dy*dy) 401 | 402 | if dis <= self.r + obs.r: 403 | return False 404 | 405 | # Not collide with other obstacles 406 | for obstacle in self.obstacles: 407 | dx = obstacle.x - obs.x 408 | dy = obstacle.y - obs.y 409 | dis = np.sqrt(dx*dx + dy*dy) 410 | 411 | if dis <= obstacle.r + obs.r: 412 | return False 413 | 414 | return True 415 | 416 | def get_velocity(self,x:float, y:float): 417 | if len(self.cores) == 0: 418 | return np.zeros(2) 419 | 420 | # sort the vortices according to their distance to the query point 421 | d, idx = self.core_centers.query(np.array([x,y]),k=len(self.cores)) 422 | if isinstance(idx,np.int64): 423 | idx = [idx] 424 | 425 | v_radial_set = [] 426 | v_velocity = np.zeros((2,1)) 427 | for i in list(idx): 428 | core = self.cores[i] 429 | v_radial = np.matrix([[core.x-x],[core.y-y]]) 430 | 431 | for v in v_radial_set: 432 | project = np.transpose(v) * v_radial 433 | if project[0,0] > 0: 434 | # if the core is in the outter area of a checked core (wrt the query position), 435 | # assume that it has no influence the velocity of the query position 436 | continue 437 | 438 | v_radial_set.append(v_radial) 439 | dis = np.linalg.norm(v_radial) 440 | v_radial /= dis 441 | if core.clockwise: 442 | rotation = np.matrix([[0., -1.],[1., 0]]) 443 | else: 444 | rotation = np.matrix([[0., 1.],[-1., 0]]) 445 | v_tangent = rotation * v_radial 446 | speed = self.compute_speed(core.Gamma,dis) 447 | v_velocity += v_tangent * speed 448 | 449 | return np.array([v_velocity[0,0], v_velocity[1,0]]) 450 | 451 | def get_velocity_test(self,x:float, y:float): 452 | v = np.ones(2) 453 | return v / np.linalg.norm(v) 454 | 455 | def compute_speed(self, Gamma:float, d:float): 456 | if d <= self.r: 457 | return Gamma / (2*np.pi*self.r*self.r) * d 458 | else: 459 | return Gamma / (2*np.pi*d) 460 | 461 | def reset_with_eval_config(self,eval_config): 462 | self.episode_timesteps = 0 463 | 464 | # load env config 465 | self.sd = eval_config["env"]["seed"] 466 | self.width = eval_config["env"]["width"] 467 | self.height = eval_config["env"]["height"] 468 | self.r = eval_config["env"]["r"] 469 | self.v_rel_max = eval_config["env"]["v_rel_max"] 470 | self.p = eval_config["env"]["p"] 471 | self.v_range = copy.deepcopy(eval_config["env"]["v_range"]) 472 | self.obs_r_range = copy.deepcopy(eval_config["env"]["obs_r_range"]) 473 | # self.min_obs_core_dis = eval_config["env"]["min_obs_core_dis"] 474 | self.clear_r = eval_config["env"]["clear_r"] 475 | self.timestep_penalty = eval_config["env"]["timestep_penalty"] 476 | # self.distance_penalty = eval_config["env"]["distance_penalty"] 477 | self.collision_penalty = eval_config["env"]["collision_penalty"] 478 | self.goal_reward = eval_config["env"]["goal_reward"] 479 | 480 | # load vortex cores 481 | self.cores.clear() 482 | centers = None 483 | for i in range(len(eval_config["env"]["cores"]["positions"])): 484 | center = eval_config["env"]["cores"]["positions"][i] 485 | clockwise = eval_config["env"]["cores"]["clockwise"][i] 486 | Gamma = eval_config["env"]["cores"]["Gamma"][i] 487 | core = Core(center[0],center[1],clockwise,Gamma) 488 | self.cores.append(core) 489 | if centers is None: 490 | centers = np.array([[core.x,core.y]]) 491 | else: 492 | c = np.array([[core.x,core.y]]) 493 | centers = np.vstack((centers,c)) 494 | 495 | if centers is not None: 496 | self.core_centers = scipy.spatial.KDTree(centers) 497 | 498 | # load obstacles 499 | self.obstacles.clear() 500 | for i in range(len(eval_config["env"]["obstacles"]["positions"])): 501 | center = eval_config["env"]["obstacles"]["positions"][i] 502 | r = eval_config["env"]["obstacles"]["r"][i] 503 | obs = Obstacle(center[0],center[1],r) 504 | self.obstacles.append(obs) 505 | 506 | # load robot config 507 | self.robots.clear() 508 | for i in range(len(eval_config["robots"]["cooperative"])): 509 | rob = robot.Robot(eval_config["robots"]["cooperative"][i]) 510 | rob.dt = eval_config["robots"]["dt"][i] 511 | rob.N = eval_config["robots"]["N"][i] 512 | rob.length = eval_config["robots"]["length"][i] 513 | rob.width = eval_config["robots"]["width"][i] 514 | rob.r = eval_config["robots"]["r"][i] 515 | rob.detect_r = eval_config["robots"]["detect_r"][i] 516 | rob.goal_dis = eval_config["robots"]["goal_dis"][i] 517 | rob.obs_dis = eval_config["robots"]["obs_dis"][i] 518 | rob.max_speed = eval_config["robots"]["max_speed"][i] 519 | rob.a = np.array(eval_config["robots"]["a"][i]) 520 | rob.w = np.array(eval_config["robots"]["w"][i]) 521 | rob.start = np.array(eval_config["robots"]["start"][i]) 522 | rob.goal = np.array(eval_config["robots"]["goal"][i]) 523 | rob.compute_k() 524 | rob.compute_actions() 525 | rob.init_theta = eval_config["robots"]["init_theta"][i] 526 | rob.init_speed = eval_config["robots"]["init_speed"][i] 527 | 528 | rob.perception.range = eval_config["robots"]["perception"]["range"][i] 529 | rob.perception.angle = eval_config["robots"]["perception"]["angle"][i] 530 | # rob.perception.num_beams = eval_config["robots"]["perception"]["num_beams"][i] 531 | # rob.perception.len_obs_history = eval_config["robots"]["perception"]["len_obs_history"][i] 532 | 533 | current_v = self.get_velocity(rob.start[0],rob.start[1]) 534 | rob.reset_state(current_velocity=current_v) 535 | 536 | self.robots.append(rob) 537 | 538 | return self.get_observations() 539 | 540 | def episode_data(self): 541 | episode = {} 542 | 543 | # save environment config 544 | episode["env"] = {} 545 | episode["env"]["seed"] = self.sd 546 | episode["env"]["width"] = self.width 547 | episode["env"]["height"] = self.height 548 | episode["env"]["r"] = self.r 549 | episode["env"]["v_rel_max"] = self.v_rel_max 550 | episode["env"]["p"] = self.p 551 | episode["env"]["v_range"] = copy.deepcopy(self.v_range) 552 | episode["env"]["obs_r_range"] = copy.deepcopy(self.obs_r_range) 553 | # episode["env"]["min_obs_core_dis"] = self.min_obs_core_dis 554 | episode["env"]["clear_r"] = self.clear_r 555 | episode["env"]["timestep_penalty"] = self.timestep_penalty 556 | # episode["env"]["distance_penalty"] = self.distance_penalty 557 | episode["env"]["collision_penalty"] = self.collision_penalty 558 | episode["env"]["goal_reward"] = self.goal_reward 559 | 560 | # save vortex cores information 561 | episode["env"]["cores"] = {} 562 | episode["env"]["cores"]["positions"] = [] 563 | episode["env"]["cores"]["clockwise"] = [] 564 | episode["env"]["cores"]["Gamma"] = [] 565 | for core in self.cores: 566 | episode["env"]["cores"]["positions"].append([core.x,core.y]) 567 | episode["env"]["cores"]["clockwise"].append(core.clockwise) 568 | episode["env"]["cores"]["Gamma"].append(core.Gamma) 569 | 570 | # save obstacles information 571 | episode["env"]["obstacles"] = {} 572 | episode["env"]["obstacles"]["positions"] = [] 573 | episode["env"]["obstacles"]["r"] = [] 574 | for obs in self.obstacles: 575 | episode["env"]["obstacles"]["positions"].append([obs.x,obs.y]) 576 | episode["env"]["obstacles"]["r"].append(obs.r) 577 | 578 | # save robots information 579 | episode["robots"] = {} 580 | episode["robots"]["cooperative"] = [] 581 | episode["robots"]["dt"] = [] 582 | episode["robots"]["N"] = [] 583 | episode["robots"]["length"] = [] 584 | episode["robots"]["width"] = [] 585 | episode["robots"]["r"] = [] 586 | episode["robots"]["detect_r"] = [] 587 | episode["robots"]["goal_dis"] = [] 588 | episode["robots"]["obs_dis"] = [] 589 | episode["robots"]["max_speed"] = [] 590 | episode["robots"]["a"] = [] 591 | episode["robots"]["w"] = [] 592 | episode["robots"]["start"] = [] 593 | episode["robots"]["goal"] = [] 594 | episode["robots"]["init_theta"] = [] 595 | episode["robots"]["init_speed"] = [] 596 | 597 | episode["robots"]["perception"] = {} 598 | episode["robots"]["perception"]["range"] = [] 599 | episode["robots"]["perception"]["angle"] = [] 600 | # episode["robots"]["perception"]["num_beams"] = [] 601 | # episode["robots"]["perception"]["len_obs_history"] = [] 602 | 603 | episode["robots"]["action_history"] = [] 604 | episode["robots"]["trajectory"] = [] 605 | 606 | for rob in self.robots: 607 | episode["robots"]["cooperative"].append(rob.cooperative) 608 | episode["robots"]["dt"].append(rob.dt) 609 | episode["robots"]["N"].append(rob.N) 610 | episode["robots"]["length"].append(rob.length) 611 | episode["robots"]["width"].append(rob.width) 612 | episode["robots"]["r"].append(rob.r) 613 | episode["robots"]["detect_r"].append(rob.detect_r) 614 | episode["robots"]["goal_dis"].append(rob.goal_dis) 615 | episode["robots"]["obs_dis"].append(rob.obs_dis) 616 | episode["robots"]["max_speed"].append(rob.max_speed) 617 | episode["robots"]["a"].append(list(rob.a)) 618 | episode["robots"]["w"].append(list(rob.w)) 619 | episode["robots"]["start"].append(list(rob.start)) 620 | episode["robots"]["goal"].append(list(rob.goal)) 621 | episode["robots"]["init_theta"].append(rob.init_theta) 622 | episode["robots"]["init_speed"].append(rob.init_speed) 623 | 624 | episode["robots"]["perception"]["range"].append(rob.perception.range) 625 | episode["robots"]["perception"]["angle"].append(rob.perception.angle) 626 | # episode["robots"]["perception"]["num_beams"].append(rob.perception.num_beams) 627 | # episode["robots"]["perception"]["len_obs_history"].append(rob.perception.len_obs_history) 628 | 629 | episode["robots"]["action_history"].append(copy.deepcopy(rob.action_history)) 630 | episode["robots"]["trajectory"].append(copy.deepcopy(rob.trajectory)) 631 | 632 | return episode 633 | 634 | def save_episode(self,filename): 635 | episode = self.episode_data() 636 | with open(filename,"w") as file: 637 | json.dump(episode,file) 638 | -------------------------------------------------------------------------------- /marinenav_env/envs/utils/robot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | import heapq 4 | 5 | class Perception: 6 | 7 | def __init__(self,cooperative:bool=False): 8 | # 2D LiDAR model with detection area as a sector 9 | self.range = 15.0 # range of beams (meter) 10 | self.angle = 2 * np.pi # detection angle range 11 | # self.len_obs_history = 5 # the window size of observation history 12 | self.max_obs_num = 5 # the maximum number of obstacles to be considered 13 | self.max_obj_num = 5 # the maximum number of dyanmic objects to be considered 14 | self.observation_format(cooperative) 15 | self.observed_obs = [] # indices of observed static obstacles 16 | self.observed_objs = [] # indiced of observed dynamic objects 17 | 18 | def observation_format(self,cooperative:bool=False): 19 | # format: {"self": [goal,velocity], 20 | # "static":[[obs_1.x,obs_1.y,obs_1.r],...,[obs_n.x,obs_n.y,obs_n.r]], 21 | # "dynamic":[[robot_1.x,robot_1.y,robot_1.vx,robot_1.vy],...,[robot_n.x,robot_n.y,robot_n.vx,robot_n.vy]] 22 | if cooperative: 23 | self.observation = dict(self=[],static=[],dynamic=[]) 24 | else: 25 | self.observation = dict(self=[],static=[]) 26 | 27 | 28 | class Robot: 29 | 30 | def __init__(self,cooperative:bool=False): 31 | 32 | # parameter initialization 33 | self.cooperative = cooperative # if the robot is cooperative or not 34 | self.dt = 0.05 # discretized time step (second) 35 | self.N = 10 # number of time step per action 36 | self.perception = Perception(cooperative) 37 | self.length = 1.0 38 | self.width = 0.5 39 | self.r = 0.8 # collision range 40 | self.detect_r = 0.5*np.sqrt(self.length**2+self.width**2) # detection range 41 | self.goal_dis = 2.0 # max distance to goal considered as reached 42 | self.obs_dis = 5.0 # min distance to other objects that is considered safe 43 | self.max_speed = 2.0 44 | self.a = np.array([-0.4,0.0,0.4]) # linear accelerations (m/s^2) 45 | self.w = np.array([-np.pi/6,0.0,np.pi/6]) # angular velocities (rad/s) 46 | self.compute_k() # cofficient of water resistance 47 | self.compute_actions() # list of actions 48 | 49 | self.x = None # x coordinate 50 | self.y = None # y coordinate 51 | self.theta = None # steering heading angle 52 | self.speed = None # steering foward speed 53 | self.velocity = None # velocity wrt sea floor 54 | 55 | self.start = None # start position 56 | self.goal = None # goal position 57 | self.collision = False 58 | self.reach_goal = False 59 | self.deactivated = False # deactivate the robot if it collides with any objects or reaches the goal 60 | 61 | self.init_theta = 0.0 # theta at initial position 62 | self.init_speed = 0.0 # speed at initial position 63 | 64 | self.action_history = [] # history of action commands in one episode 65 | self.trajectory = [] # trajectory in one episode 66 | 67 | def compute_k(self): 68 | self.k = np.max(self.a)/self.max_speed 69 | 70 | def compute_actions(self): 71 | self.actions = [(acc,ang_v) for acc in self.a for ang_v in self.w] 72 | 73 | def compute_actions_dimension(self): 74 | return len(self.actions) 75 | 76 | def compute_dist_reward_scale(self): 77 | # scale the distance reward 78 | return 1 / (self.max_speed * self.N * self.dt) 79 | 80 | def compute_penalty_matrix(self): 81 | # scale the penalty value to [-1,0] 82 | scale_a = 1 / (np.max(self.a)*np.max(self.a)) 83 | scale_w = 1 / (np.max(self.w)*np.max(self.w)) 84 | p = -0.5 * np.matrix([[scale_a,0.0],[0.0,scale_w]]) 85 | return p 86 | 87 | def compute_action_energy_cost(self,action): 88 | # scale the a and w to [0,1] 89 | a,w = self.actions[action] 90 | a /= np.max(self.a) 91 | w /= np.max(self.w) 92 | return np.abs(a) + np.abs(w) 93 | 94 | def dist_to_goal(self): 95 | return np.linalg.norm(self.goal - np.array([self.x,self.y])) 96 | 97 | def check_reach_goal(self): 98 | if self.dist_to_goal() <= self.goal_dis: 99 | self.reach_goal = True 100 | 101 | def reset_state(self,current_velocity=np.zeros(2)): 102 | # only called when resetting the environment 103 | self.action_history.clear() 104 | self.trajectory.clear() 105 | self.x = self.start[0] 106 | self.y = self.start[1] 107 | self.theta = self.init_theta 108 | self.speed = self.init_speed 109 | self.update_velocity(current_velocity) 110 | self.trajectory.append([self.x,self.y,self.theta,self.speed,self.velocity[0],self.velocity[1]]) 111 | 112 | def get_robot_transform(self): 113 | # compute transformation from world frame to robot frame 114 | R_wr = np.matrix([[np.cos(self.theta),-np.sin(self.theta)],[np.sin(self.theta),np.cos(self.theta)]]) 115 | t_wr = np.matrix([[self.x],[self.y]]) 116 | return R_wr, t_wr 117 | 118 | def get_steer_velocity(self): 119 | return self.speed * np.array([np.cos(self.theta), np.sin(self.theta)]) 120 | 121 | def update_velocity(self,current_velocity=np.zeros(2)): 122 | steer_velocity = self.get_steer_velocity() 123 | self.velocity = steer_velocity + current_velocity 124 | 125 | def update_state(self,action,current_velocity): 126 | # update robot position in one time step 127 | self.update_velocity(current_velocity) 128 | dis = self.velocity * self.dt 129 | self.x += dis[0] 130 | self.y += dis[1] 131 | 132 | # update robot speed in one time step 133 | a,w = self.actions[action] 134 | 135 | # assume that water resistance force is proportion to the speed 136 | self.speed += (a-self.k*self.speed) * self.dt 137 | self.speed = np.clip(self.speed,0.0,self.max_speed) 138 | 139 | # update robot heading angle in one time step 140 | self.theta += w * self.dt 141 | 142 | # warp theta to [0,2*pi) 143 | while self.theta < 0.0: 144 | self.theta += 2 * np.pi 145 | while self.theta >= 2 * np.pi: 146 | self.theta -= 2 * np.pi 147 | 148 | def check_collision(self,obj_x,obj_y,obj_r): 149 | d = self.compute_distance(obj_x,obj_y,obj_r) 150 | if d <= 0.0: 151 | self.collision = True 152 | 153 | def compute_distance(self,x,y,r,in_robot_frame=False): 154 | if in_robot_frame: 155 | d = np.sqrt(x**2+y**2) - r - self.r 156 | else: 157 | d = np.sqrt((self.x-x)**2+(self.y-y)**2) - r - self.r 158 | return d 159 | 160 | def check_detection(self,obj_x,obj_y,obj_r): 161 | proj_pos = self.project_to_robot_frame(np.array([obj_x,obj_y]),False) 162 | 163 | if np.linalg.norm(proj_pos) > self.perception.range + obj_r: 164 | return False 165 | 166 | angle = np.arctan2(proj_pos[1],proj_pos[0]) 167 | if angle < -0.5*self.perception.angle or angle > 0.5*self.perception.angle: 168 | return False 169 | 170 | return True 171 | 172 | def project_to_robot_frame(self,x,is_vector=True): 173 | assert isinstance(x,np.ndarray), "the input needs to be an numpy array" 174 | assert np.shape(x) == (2,) 175 | 176 | x_r = np.reshape(x,(2,1)) 177 | 178 | R_wr, t_wr = self.get_robot_transform() 179 | 180 | R_rw = np.transpose(R_wr) 181 | t_rw = -R_rw * t_wr 182 | 183 | if is_vector: 184 | x_r = R_rw * x_r 185 | else: 186 | x_r = R_rw * x_r + t_rw 187 | 188 | x_r.resize((2,)) 189 | return np.array(x_r) 190 | 191 | 192 | def perception_output(self,obstacles,robots,in_robot_frame=True): 193 | if self.deactivated: 194 | return None, self.collision, self.reach_goal 195 | 196 | self.perception.observation["static"].clear() 197 | if self.cooperative: 198 | self.perception.observation["dynamic"].clear() 199 | 200 | ##### self observation ##### 201 | if in_robot_frame: 202 | # vehicle velocity wrt seafloor in self frame 203 | abs_velocity_r = self.project_to_robot_frame(self.velocity) 204 | 205 | # goal position in self frame 206 | goal_r = self.project_to_robot_frame(self.goal,False) 207 | 208 | self.perception.observation["self"] = list(np.concatenate((goal_r,abs_velocity_r))) 209 | else: 210 | self.perception.observation["self"] = [self.x,self.y,self.theta,self.speed,self.velocity[0],self.velocity[1],self.goal[0],self.goal[1]] 211 | 212 | 213 | self.perception.observed_obs.clear() 214 | if self.cooperative: 215 | self.perception.observed_objs.clear() 216 | 217 | self.check_reach_goal() 218 | 219 | # min_distance = np.inf 220 | 221 | ##### static obstacles observation ##### 222 | for i,obs in enumerate(obstacles): 223 | if not self.check_detection(obs.x,obs.y,obs.r): 224 | continue 225 | 226 | self.perception.observed_obs.append(i) 227 | 228 | if not self.collision: 229 | self.check_collision(obs.x,obs.y,obs.r) 230 | 231 | if in_robot_frame: 232 | pos_r = self.project_to_robot_frame(np.array([obs.x,obs.y]),False) 233 | self.perception.observation["static"].append([pos_r[0],pos_r[1],obs.r]) 234 | else: 235 | self.perception.observation["static"].append([obs.x,obs.y,obs.r]) 236 | # min_distance = min(min_distance,np.linalg.norm(pos_r)-obs.r) 237 | 238 | if self.cooperative: 239 | ##### dynamic objects observation ##### 240 | for j,robot in enumerate(robots): 241 | if robot is self: 242 | continue 243 | if robot.deactivated: 244 | # This robot is in the deactivate state, and abscent from the current map 245 | continue 246 | if not self.check_detection(robot.x,robot.y,robot.detect_r): 247 | continue 248 | 249 | self.perception.observed_objs.append(j) 250 | 251 | if not self.collision: 252 | self.check_collision(robot.x,robot.y,robot.r) 253 | 254 | if in_robot_frame: 255 | pos_r = self.project_to_robot_frame(np.array([robot.x,robot.y]),False) 256 | v_r = self.project_to_robot_frame(robot.velocity) 257 | new_obs = list(np.concatenate((pos_r,v_r))) 258 | # if j in self.perception.observation["dynamic"].copy().keys(): 259 | # self.perception.observation["dynamic"][j].append(new_obs) 260 | # while len(self.perception.observation["dynamic"][j]) > self.perception.len_obs_history: 261 | # del self.perception.observation["dynamic"][j][0] 262 | # else: 263 | # self.perception.observation["dynamic"][j] = [new_obs] 264 | self.perception.observation["dynamic"].append(new_obs) 265 | # min_distance = min(min_distance,np.linalg.norm(pos_r)-robot.r) 266 | else: 267 | self.perception.observation["dynamic"].append([robot.x,robot.y,robot.velocity[0],robot.velocity[1]]) 268 | 269 | # if self.cooperative: 270 | # for idx in self.perception.observation["dynamic"].copy().keys(): 271 | # if idx not in self.perception.observed_objs: 272 | # # remove the observation history if the object is not observed in the current step 273 | # del self.perception.observation["dynamic"][idx] 274 | 275 | self_state = copy.deepcopy(self.perception.observation["self"]) 276 | static_observations = copy.deepcopy(heapq.nsmallest(self.perception.max_obs_num, 277 | self.perception.observation["static"], 278 | key=lambda obs:self.compute_distance(obs[0],obs[1],obs[2],True))) 279 | 280 | static_states = [] 281 | for static in static_observations: 282 | static_states += static 283 | static_states += [0.0,0.0,0.0]*(self.perception.max_obs_num-len(static_observations)) 284 | 285 | if self.cooperative: 286 | # # remove object indices 287 | # dynamic_states = copy.deepcopy(list(self.perception.observation["dynamic"].values())) 288 | # if fixed_dynamic_size: 289 | # dynamic_states_v = [] 290 | # for obs_history in dynamic_states: 291 | # # convert the observation history into a vector [s_{t-k},s_{t-k+1},...,s_t] 292 | # pad_len = max(self.perception.len_obs_history-len(obs_history),0) 293 | # dynamic_states_vectors = [0.,0.,0.,0.] * pad_len 294 | # for obs in obs_history: 295 | # dynamic_states_vectors += obs 296 | # dynamic_states_v.append(dynamic_states_vectors) 297 | # return (self_state,static_states,dynamic_states_v), collision, self.reach_goal 298 | # else: 299 | # idx_array = [] 300 | # for idx,obs_history in enumerate(dynamic_states): 301 | # # pad the dynamic observation and save the indices of exact lastest element 302 | # idx_array.append([idx,len(obs_history)-1]) 303 | # while len(obs_history) < self.perception.len_obs_history: 304 | # obs_history.append([0.,0.,0.,0.]) 305 | 306 | # return (self_state,static_states,dynamic_states,idx_array), collision, self.reach_goal 307 | dynamic_observations = copy.deepcopy(heapq.nsmallest(self.perception.max_obj_num, 308 | self.perception.observation["dynamic"], 309 | key=lambda obj:self.compute_distance(obj[0],obj[1],self.r,True))) 310 | else: 311 | dynamic_observations = [[0.0,0.0,0.0,0.0]]*self.perception.max_obj_num 312 | 313 | dynamic_states = [] 314 | for dynamic in dynamic_observations: 315 | dynamic_states += dynamic 316 | dynamic_states += [0.0,0.0,0.0,0.0]*(self.perception.max_obj_num-len(dynamic_observations)) 317 | 318 | return self_state+static_states+dynamic_states, self.collision, self.reach_goal 319 | 320 | 321 | # def perception_prediction(self,obs_states,robots_states=[]): 322 | # # perception of predicted future state (robot frame) 323 | 324 | # ##### self observation ##### 325 | # # vehicle velocity wrt seafloor in self frame 326 | # abs_velocity_r = self.project_to_robot_frame(self.velocity) 327 | 328 | # # goal position in self frame 329 | # goal_r = self.project_to_robot_frame(self.goal,False) 330 | 331 | # self_state = list(np.concatenate((goal_r,abs_velocity_r))) 332 | 333 | # collision = False 334 | # self.check_reach_goal() 335 | 336 | # ##### static obstatcles observation ##### 337 | # static_states = [] 338 | # for obs in obs_states: 339 | # if not self.check_detection(obs[0],obs[1],obs[2]): 340 | # continue 341 | 342 | # if not collision: 343 | # collision = self.check_collision(obs[0],obs[1],obs[2]) 344 | 345 | # pos_r = self.project_to_robot_frame(np.array([obs[0],obs[1]]),False) 346 | # static_states.append([pos_r[0],pos_r[1],obs[2]]) 347 | 348 | # if self.cooperative: 349 | # # dynamic objects observation in self frame 350 | # dynamic_states = [] 351 | # for obj in robots_states: 352 | # if not self.check_detection(obj[0],obj[1],self.detect_r): 353 | # continue 354 | 355 | # if not collision: 356 | # collision = self.check_collision(obj[0],obj[1],self.r) 357 | 358 | # pos_r = self.project_to_robot_frame(np.array([obj[0],obj[1]]),False) 359 | # v_r = self.project_to_robot_frame(np.array([obj[2],obj[3]])) 360 | # dynamic_states.append(list(np.concatenate((pos_r,v_r)))) 361 | 362 | # return (self_state,static_states,dynamic_states), collision, self.reach_goal 363 | # else: 364 | # return (self_state,static_states), collision, self.reach_goal 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | -------------------------------------------------------------------------------- /marinenav_env/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='marinenav_env', 4 | version='0.0.1', 5 | install_requires=['gym'] 6 | ) -------------------------------------------------------------------------------- /policy/DQN_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import os 5 | import json 6 | from torch.nn.functional import softmax,relu 7 | 8 | def encoder(input_dimension,output_dimension): 9 | l1 = nn.Linear(input_dimension,output_dimension) 10 | l2 = nn.ReLU() 11 | model = nn.Sequential(l1, l2) 12 | return model 13 | 14 | class DQN_Policy(nn.Module): 15 | def __init__(self, 16 | self_dimension, 17 | static_dimension, 18 | dynamic_dimension, 19 | self_feature_dimension, 20 | static_feature_dimension, 21 | dynamic_feature_dimension, 22 | hidden_dimension, 23 | action_size, 24 | device='cpu', 25 | seed=0 26 | ): 27 | super().__init__() 28 | 29 | self.self_dimension = self_dimension 30 | self.static_dimension = static_dimension 31 | self.dynamic_dimension = dynamic_dimension 32 | self.self_feature_dimension = self_feature_dimension 33 | self.static_feature_dimension = static_feature_dimension 34 | self.dynamic_feature_dimension = dynamic_feature_dimension 35 | self.concat_feature_dimension = self_feature_dimension + static_feature_dimension + dynamic_feature_dimension 36 | self.hidden_dimension = hidden_dimension 37 | self.action_size = action_size 38 | self.device = device 39 | self.seed_id = seed 40 | self.seed = torch.manual_seed(seed) 41 | 42 | self.self_encoder = encoder(self_dimension,self_feature_dimension) 43 | self.static_encoder = encoder(static_dimension,static_feature_dimension) 44 | self.dynamic_encoder = encoder(dynamic_dimension,dynamic_feature_dimension) 45 | 46 | # hidden layers 47 | self.hidden_layer = nn.Linear(self.concat_feature_dimension, hidden_dimension) 48 | self.hidden_layer_2 = nn.Linear(hidden_dimension, hidden_dimension) 49 | self.output_layer = nn.Linear(hidden_dimension, action_size) 50 | 51 | def forward(self, x): 52 | 53 | self_states = x[:,:self.self_dimension] 54 | static_states = x[:,self.self_dimension:self.self_dimension+self.static_dimension] 55 | dynamic_states = x[:,self.self_dimension+self.static_dimension:] 56 | 57 | # encode observations as features 58 | self_features = self.self_encoder(self_states) 59 | static_features = self.static_encoder(static_states) 60 | dynamic_features = self.dynamic_encoder(dynamic_states) 61 | features = torch.cat((self_features,static_features,dynamic_features),1) 62 | 63 | features = relu(self.hidden_layer(features)) 64 | features = relu(self.hidden_layer_2(features)) 65 | output = self.output_layer(features) 66 | 67 | return output 68 | 69 | def get_constructor_parameters(self): 70 | return dict(self_dimension = self.self_dimension, 71 | static_dimension = self.static_dimension, 72 | dynamic_dimension = self.dynamic_dimension, 73 | self_feature_dimension = self.self_feature_dimension, 74 | static_feature_dimension = self.static_feature_dimension, 75 | dynamic_feature_dimension = self.dynamic_feature_dimension, 76 | hidden_dimension = self.hidden_dimension, 77 | action_size = self.action_size, 78 | seed = self.seed_id) 79 | 80 | def save(self,directory): 81 | # save network parameters 82 | torch.save(self.state_dict(),os.path.join(directory,f"dqn_network_params.pth")) 83 | 84 | # save constructor parameters 85 | with open(os.path.join(directory,f"dqn_constructor_params.json"),mode="w") as constructor_f: 86 | json.dump(self.get_constructor_parameters(),constructor_f) 87 | 88 | @classmethod 89 | def load(cls,directory,device="cpu"): 90 | # load network parameters 91 | model_params = torch.load(os.path.join(directory,"dqn_network_params.pth"), 92 | map_location=device) 93 | 94 | # load constructor parameters 95 | with open(os.path.join(directory,"dqn_constructor_params.json"), mode="r") as constructor_f: 96 | constructor_params = json.load(constructor_f) 97 | constructor_params["device"] = device 98 | 99 | model = cls(**constructor_params) 100 | model.load_state_dict(model_params) 101 | model.to(device) 102 | 103 | return model -------------------------------------------------------------------------------- /policy/IQN_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import os 5 | import json 6 | from torch.nn.functional import softmax,relu 7 | 8 | def encoder(input_dimension,output_dimension): 9 | l1 = nn.Linear(input_dimension,output_dimension) 10 | l2 = nn.ReLU() 11 | model = nn.Sequential(l1, l2) 12 | return model 13 | 14 | class IQN_Policy(nn.Module): 15 | def __init__(self, 16 | self_dimension, 17 | static_dimension, 18 | dynamic_dimension, 19 | self_feature_dimension, 20 | static_feature_dimension, 21 | dynamic_feature_dimension, 22 | hidden_dimension, 23 | action_size, 24 | device='cpu', 25 | seed=0 26 | ): 27 | super().__init__() 28 | 29 | self.self_dimension = self_dimension 30 | self.static_dimension = static_dimension 31 | self.dynamic_dimension = dynamic_dimension 32 | self.self_feature_dimension = self_feature_dimension 33 | self.static_feature_dimension = static_feature_dimension 34 | self.dynamic_feature_dimension = dynamic_feature_dimension 35 | self.concat_feature_dimension = self_feature_dimension + static_feature_dimension + dynamic_feature_dimension 36 | self.hidden_dimension = hidden_dimension 37 | self.action_size = action_size 38 | self.device = device 39 | self.seed_id = seed 40 | self.seed = torch.manual_seed(seed) 41 | 42 | self.self_encoder = encoder(self_dimension,self_feature_dimension) 43 | self.static_encoder = encoder(static_dimension,static_feature_dimension) 44 | self.dynamic_encoder = encoder(dynamic_dimension,dynamic_feature_dimension) 45 | 46 | self.K = 32 # number of quantiles in output 47 | self.n = 64 # number of cosine features 48 | 49 | # quantile encoder 50 | self.pis = torch.FloatTensor([np.pi * i for i in range(self.n)]).view(1,1,self.n).to(device) 51 | self.cos_embedding = nn.Linear(self.n,self.concat_feature_dimension) 52 | 53 | # hidden layers 54 | self.hidden_layer = nn.Linear(self.concat_feature_dimension, hidden_dimension) 55 | self.hidden_layer_2 = nn.Linear(hidden_dimension, hidden_dimension) 56 | self.output_layer = nn.Linear(hidden_dimension, action_size) 57 | 58 | # temp for reproducibility 59 | # self.taus = None 60 | 61 | def calc_cos(self, batch_size, num_tau=8, cvar=1.0): 62 | """ 63 | Calculating the cosinus values depending on the number of tau samples 64 | """ 65 | # temp for reproducibility 66 | # if self.taus is None: 67 | # t = np.arange(0.05,1,0.03).astype(np.float32) 68 | # self.taus = torch.from_numpy(t).to(self.device).unsqueeze(-1) # (batch_size, n_tau, 1) for broadcast 69 | # taus = self.taus * cvar 70 | 71 | taus = torch.rand(batch_size,num_tau).to(self.device).unsqueeze(-1) 72 | 73 | # distorted quantile sampling 74 | taus = taus * cvar 75 | 76 | cos = torch.cos(taus * self.pis) 77 | assert cos.shape == (batch_size, num_tau, self.n), "cos shape is incorrect" 78 | return cos, taus 79 | 80 | def forward(self, x, num_tau=8, cvar=1.0): 81 | batch_size = x.shape[0] 82 | 83 | self_states = x[:,:self.self_dimension] 84 | static_states = x[:,self.self_dimension:self.self_dimension+self.static_dimension] 85 | dynamic_states = x[:,self.self_dimension+self.static_dimension:] 86 | 87 | # encode observations as features 88 | self_features = self.self_encoder(self_states) 89 | static_features = self.static_encoder(static_states) 90 | dynamic_features = self.dynamic_encoder(dynamic_states) 91 | features = torch.cat((self_features,static_features,dynamic_features),1) 92 | 93 | # encode quantiles as features 94 | cos, taus = self.calc_cos(batch_size, num_tau, cvar) 95 | cos = cos.view(batch_size*num_tau, self.n) 96 | cos_features = relu(self.cos_embedding(cos)).view(batch_size,num_tau,self.concat_feature_dimension) 97 | 98 | # pairwise product of the input feature and cosine features 99 | features = (features.unsqueeze(1) * cos_features).view(batch_size*num_tau,self.concat_feature_dimension) 100 | 101 | features = relu(self.hidden_layer(features)) 102 | features = relu(self.hidden_layer_2(features)) 103 | quantiles = self.output_layer(features) 104 | 105 | return quantiles.view(batch_size,num_tau,self.action_size), taus 106 | 107 | def get_constructor_parameters(self): 108 | return dict(self_dimension = self.self_dimension, 109 | static_dimension = self.static_dimension, 110 | dynamic_dimension = self.dynamic_dimension, 111 | self_feature_dimension = self.self_feature_dimension, 112 | static_feature_dimension = self.static_feature_dimension, 113 | dynamic_feature_dimension = self.dynamic_feature_dimension, 114 | hidden_dimension = self.hidden_dimension, 115 | action_size = self.action_size, 116 | seed = self.seed_id) 117 | 118 | def save(self,directory): 119 | # save network parameters 120 | torch.save(self.state_dict(),os.path.join(directory,f"network_params.pth")) 121 | 122 | # save constructor parameters 123 | with open(os.path.join(directory,f"constructor_params.json"),mode="w") as constructor_f: 124 | json.dump(self.get_constructor_parameters(),constructor_f) 125 | 126 | @classmethod 127 | def load(cls,directory,device="cpu"): 128 | # load network parameters 129 | model_params = torch.load(os.path.join(directory,"network_params.pth"), 130 | map_location=device) 131 | 132 | # load constructor parameters 133 | with open(os.path.join(directory,"constructor_params.json"), mode="r") as constructor_f: 134 | constructor_params = json.load(constructor_f) 135 | constructor_params["device"] = device 136 | 137 | model = cls(**constructor_params) 138 | model.load_state_dict(model_params) 139 | model.to(device) 140 | 141 | return model -------------------------------------------------------------------------------- /policy/agent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.optim as optim 3 | from policy.IQN_model import IQN_Policy 4 | from policy.DQN_model import DQN_Policy 5 | from policy.replay_buffer import ReplayBuffer 6 | from marinenav_env.envs.utils.robot import Robot 7 | import numpy as np 8 | import random 9 | import time 10 | from torch.nn import functional as F 11 | 12 | class Agent(): 13 | def __init__(self, 14 | self_dimension=4, 15 | static_dimension=15, 16 | self_feature_dimension=32, 17 | static_feature_dimension=120, 18 | hidden_dimension=156, 19 | action_size=9, 20 | dynamic_dimension=20, 21 | dynamic_feature_dimension=160, 22 | cooperative=True, 23 | BATCH_SIZE=32, 24 | BUFFER_SIZE=1_000_000, 25 | LR=1e-4, 26 | TAU=1.0, 27 | GAMMA=0.99, 28 | device="cpu", 29 | seed=0, 30 | training=True, 31 | use_iqn=True 32 | ): 33 | 34 | self.cooperative = cooperative 35 | self.device = device 36 | self.LR = LR 37 | self.TAU = TAU 38 | self.GAMMA = GAMMA 39 | self.BUFFER_SIZE = BUFFER_SIZE 40 | self.BATCH_SIZE = BATCH_SIZE 41 | self.training = training 42 | self.action_size = action_size 43 | self.use_iqn = use_iqn 44 | 45 | if training: 46 | if use_iqn: 47 | self.policy_local = IQN_Policy(self_dimension, 48 | static_dimension, 49 | dynamic_dimension, 50 | self_feature_dimension, 51 | static_feature_dimension, 52 | dynamic_feature_dimension, 53 | hidden_dimension, 54 | action_size, 55 | device, 56 | seed).to(device) 57 | self.policy_target = IQN_Policy(self_dimension, 58 | static_dimension, 59 | dynamic_dimension, 60 | self_feature_dimension, 61 | static_feature_dimension, 62 | dynamic_feature_dimension, 63 | hidden_dimension, 64 | action_size, 65 | device, 66 | seed).to(device) 67 | else: 68 | self.policy_local = DQN_Policy(self_dimension, 69 | static_dimension, 70 | dynamic_dimension, 71 | self_feature_dimension, 72 | static_feature_dimension, 73 | dynamic_feature_dimension, 74 | hidden_dimension, 75 | action_size, 76 | device, 77 | seed).to(device) 78 | self.policy_target = DQN_Policy(self_dimension, 79 | static_dimension, 80 | dynamic_dimension, 81 | self_feature_dimension, 82 | static_feature_dimension, 83 | dynamic_feature_dimension, 84 | hidden_dimension, 85 | action_size, 86 | device, 87 | seed).to(device) 88 | 89 | self.optimizer = optim.Adam(self.policy_local.parameters(), lr=self.LR) 90 | 91 | self.memory = ReplayBuffer(BUFFER_SIZE,BATCH_SIZE) 92 | 93 | def act_dqn(self, state, eps=0.0, use_eval=True): 94 | state = torch.tensor([state]).float().to(self.device) 95 | if use_eval: 96 | self.policy_local.eval() 97 | else: 98 | self.policy_local.train() 99 | with torch.no_grad(): 100 | action_values = self.policy_local(state) 101 | self.policy_local.train() 102 | 103 | # epsilon-greedy action selection 104 | if random.random() > eps: 105 | action = np.argmax(action_values.cpu().data.numpy()) 106 | else: 107 | action = random.choice(np.arange(self.action_size)) 108 | 109 | return action, action_values.cpu().data.numpy() 110 | 111 | def act(self, state, eps=0.0, cvar=1.0, use_eval=True): 112 | """Returns action index and quantiles 113 | Params 114 | ====== 115 | frame: to adjust epsilon 116 | state (array_like): current state 117 | """ 118 | # state = self.memory.state_batch([state]) 119 | # state = self.state_to_tensor(state) 120 | state = torch.tensor([state]).float().to(self.device) 121 | if use_eval: 122 | self.policy_local.eval() 123 | else: 124 | self.policy_local.train() 125 | with torch.no_grad(): 126 | quantiles, taus = self.policy_local(state, self.policy_local.K, cvar) 127 | action_values = quantiles.mean(dim=1) 128 | self.policy_local.train() 129 | 130 | # epsilon-greedy action selection 131 | if random.random() > eps: 132 | action = np.argmax(action_values.cpu().data.numpy()) 133 | else: 134 | action = random.choice(np.arange(self.action_size)) 135 | 136 | return action, quantiles.cpu().data.numpy(), taus.cpu().data.numpy() 137 | 138 | def act_adaptive(self, state, eps=0.0): 139 | """adptively tune the CVaR value, compute action index and quantiles 140 | Params 141 | ====== 142 | frame: to adjust epsilon 143 | state (array_like): current state 144 | """ 145 | cvar = self.adjust_cvar(state) 146 | action, quantiles, taus = self.act(state, eps, cvar) 147 | return action, quantiles, taus, cvar 148 | 149 | def adjust_cvar(self,state): 150 | # scale CVaR value according to the closest distance to obstacles 151 | 152 | assert len(state) == 39, "The state size does not equal 39" 153 | 154 | static = state[4:19] 155 | dynamic = state[19:] 156 | 157 | closest_d = np.inf 158 | 159 | for i in range(0,len(static),3): 160 | if np.abs(static[i]) < 1e-3 and np.abs(static[i+1]) < 1e-3: 161 | # padding 162 | continue 163 | dist = np.linalg.norm(static[i:i+2]) - static[i+2] - 0.8 164 | closest_d = min(closest_d,dist) 165 | 166 | for i in range(0,len(dynamic),4): 167 | if np.abs(dynamic[i]) < 1e-3 and np.abs(dynamic[i+1]) < 1e-3: 168 | # padding 169 | continue 170 | dist = np.linalg.norm(dynamic[i:i+2]) - 1.6 171 | closest_d = min(closest_d,dist) 172 | 173 | cvar = 1.0 174 | if closest_d < 10.0: 175 | cvar = closest_d / 10.0 176 | 177 | return cvar 178 | 179 | def train(self): 180 | if self.use_iqn: 181 | return self.train_IQN() 182 | else: 183 | return self.train_DQN() 184 | 185 | def train_IQN(self): 186 | """Update value parameters using given batch of experience tuples 187 | Params 188 | ====== 189 | experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 190 | gamma (float): discount factor 191 | """ 192 | 193 | states, actions, rewards, next_states, dones = self.memory.sample() 194 | states = torch.tensor(states).float().to(self.device) 195 | actions = torch.tensor(actions).unsqueeze(-1).to(self.device) 196 | rewards = torch.tensor(rewards).unsqueeze(-1).float().to(self.device) 197 | next_states = torch.tensor(next_states).float().to(self.device) 198 | dones = torch.tensor(dones).unsqueeze(-1).float().to(self.device) 199 | 200 | self.optimizer.zero_grad() 201 | # Get max predicted Q values (for next states) from target model 202 | Q_targets_next,_ = self.policy_target(next_states) 203 | Q_targets_next = Q_targets_next.detach().max(2)[0].unsqueeze(1) # (batch_size, 1, N) 204 | 205 | # Compute Q targets for current states 206 | Q_targets = rewards.unsqueeze(-1) + (self.GAMMA * Q_targets_next * (1. - dones.unsqueeze(-1))) 207 | # Get expected Q values from local model 208 | Q_expected,taus = self.policy_local(states) 209 | Q_expected = Q_expected.gather(2, actions.unsqueeze(-1).expand(self.BATCH_SIZE, 8, 1)) 210 | 211 | # Quantile Huber loss 212 | td_error = Q_targets - Q_expected 213 | assert td_error.shape == (self.BATCH_SIZE, 8, 8), "wrong td error shape" 214 | huber_l = calculate_huber_loss(td_error, 1.0) 215 | quantil_l = abs(taus -(td_error.detach() < 0).float()) * huber_l / 1.0 216 | 217 | loss = quantil_l.sum(dim=1).mean(dim=1) # keepdim=True if per weights get multiple 218 | loss = loss.mean() 219 | 220 | # minimize the loss 221 | loss.backward() 222 | torch.nn.utils.clip_grad_norm_(self.policy_local.parameters(), 0.5) 223 | self.optimizer.step() 224 | 225 | return loss.detach().cpu().numpy() 226 | 227 | def train_DQN(self): 228 | states, actions, rewards, next_states, dones = self.memory.sample() 229 | states = torch.tensor(states).float().to(self.device) 230 | actions = torch.tensor(actions).unsqueeze(-1).to(self.device) 231 | rewards = torch.tensor(rewards).unsqueeze(-1).float().to(self.device) 232 | next_states = torch.tensor(next_states).float().to(self.device) 233 | dones = torch.tensor(dones).unsqueeze(-1).float().to(self.device) 234 | 235 | self.optimizer.zero_grad() 236 | 237 | # compute target values 238 | Q_targets_next = self.policy_target(next_states) 239 | Q_targets_next,_ = Q_targets_next.max(dim=1,keepdim=True) 240 | Q_targets = rewards + (1-dones) * self.GAMMA * Q_targets_next 241 | 242 | # compute expected values 243 | Q_expected = self.policy_local(states) 244 | Q_expected = Q_expected.gather(1,actions) 245 | 246 | # compute huber loss 247 | loss = F.smooth_l1_loss(Q_expected,Q_targets) 248 | 249 | # minimize the loss 250 | loss.backward() 251 | torch.nn.utils.clip_grad_norm_(self.policy_local.parameters(), 0.5) 252 | self.optimizer.step() 253 | 254 | return loss.detach().cpu().numpy() 255 | 256 | 257 | 258 | def soft_update(self): 259 | """Soft update model parameters. 260 | θ_target = τ * θ_local + (1 - τ) * θ_target 261 | Params 262 | ====== 263 | local_model (PyTorch model): weights will be copied from 264 | target_model (PyTorch model): weights will be copied to 265 | tau (float): interpolation parameter 266 | """ 267 | for target_param, local_param in zip(self.policy_target.parameters(), self.policy_local.parameters()): 268 | target_param.data.copy_(self.TAU * local_param.data + (1.0 - self.TAU) * target_param.data) 269 | 270 | def save_latest_model(self,directory): 271 | self.policy_local.save(directory) 272 | 273 | def load_model(self,path,agent_type="cooperative",device="cpu"): 274 | if self.use_iqn: 275 | self.policy_local = IQN_Policy.load(path,device) 276 | else: 277 | self.policy_local = DQN_Policy.load(path,device) 278 | 279 | 280 | def calculate_huber_loss(td_errors, k=1.0): 281 | """ 282 | Calculate huber loss element-wisely depending on kappa k. 283 | """ 284 | loss = torch.where(td_errors.abs() <= k, 0.5 * td_errors.pow(2), k * (td_errors.abs() - 0.5 * k)) 285 | assert loss.shape == (td_errors.shape[0], 8, 8), "huber loss has wrong shape" 286 | return loss -------------------------------------------------------------------------------- /policy/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import deque 3 | import torch 4 | import copy 5 | 6 | class ReplayBuffer: 7 | """Fixed-size buffer to store experience tuples.""" 8 | 9 | def __init__(self, buffer_size, batch_size): 10 | """ 11 | Params 12 | ====== 13 | buffer_size (int): maximum size of buffer 14 | batch_size (int): size of each training batch 15 | """ 16 | self.memory = deque(maxlen=buffer_size) 17 | self.batch_size = batch_size 18 | 19 | def add(self,item): 20 | """Add a new experience to memory.""" 21 | self.memory.append(item) 22 | 23 | def sample(self): 24 | """Randomly sample a batch of experiences from memory.""" 25 | samples = random.sample(self.memory, k=self.batch_size) 26 | states = [] 27 | actions = [] 28 | rewards = [] 29 | next_states = [] 30 | dones = [] 31 | for sample in samples: 32 | state, action, reward, next_state, done = sample 33 | states.append(state) 34 | actions.append(action) 35 | rewards.append(reward) 36 | next_states.append(next_state) 37 | dones.append(done) 38 | 39 | return states, actions, rewards, next_states, dones 40 | 41 | def size(self): 42 | """Return the current size of internal memory.""" 43 | return len(self.memory) 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /policy/trainer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import os 4 | import copy 5 | import time 6 | 7 | class Trainer(): 8 | def __init__(self, 9 | train_env, 10 | eval_env, 11 | eval_schedule, 12 | non_cooperative_agent=None, 13 | cooperative_agent=None, 14 | UPDATE_EVERY=4, 15 | learning_starts=2000, 16 | target_update_interval=10000, 17 | exploration_fraction=0.25, 18 | initial_eps=0.6, 19 | final_eps=0.05 20 | ): 21 | 22 | self.train_env = train_env 23 | self.eval_env = eval_env 24 | self.cooperative_agent = cooperative_agent 25 | self.noncooperative_agent = non_cooperative_agent 26 | self.eval_config = [] 27 | self.create_eval_configs(eval_schedule) 28 | 29 | self.UPDATE_EVERY = UPDATE_EVERY 30 | self.learning_starts = learning_starts 31 | self.target_update_interval = target_update_interval 32 | self.exploration_fraction = exploration_fraction 33 | self.initial_eps = initial_eps 34 | self.final_eps = final_eps 35 | 36 | # Current time step 37 | self.current_timestep = 0 38 | 39 | # Learning time step (start counting after learning_starts time step) 40 | self.learning_timestep = 0 41 | 42 | # Evaluation data 43 | self.eval_timesteps = [] 44 | self.eval_actions = [] 45 | self.eval_trajectories = [] 46 | self.eval_rewards = [] 47 | self.eval_successes = [] 48 | self.eval_times = [] 49 | self.eval_energies = [] 50 | self.eval_obs = [] 51 | self.eval_objs = [] 52 | 53 | def create_eval_configs(self,eval_schedule): 54 | self.eval_config.clear() 55 | 56 | count = 0 57 | for i,num_episode in enumerate(eval_schedule["num_episodes"]): 58 | for _ in range(num_episode): 59 | self.eval_env.num_cooperative = eval_schedule["num_cooperative"][i] 60 | self.eval_env.num_non_cooperative = eval_schedule["num_non_cooperative"][i] 61 | self.eval_env.num_cores = eval_schedule["num_cores"][i] 62 | self.eval_env.num_obs = eval_schedule["num_obstacles"][i] 63 | self.eval_env.min_start_goal_dis = eval_schedule["min_start_goal_dis"][i] 64 | 65 | self.eval_env.reset() 66 | 67 | # save eval config 68 | self.eval_config.append(self.eval_env.episode_data()) 69 | count += 1 70 | 71 | def save_eval_config(self,directory): 72 | file = os.path.join(directory,"eval_configs.json") 73 | with open(file, "w+") as f: 74 | json.dump(self.eval_config, f) 75 | 76 | def learn(self, 77 | total_timesteps, 78 | eval_freq, 79 | eval_log_path, 80 | verbose=True): 81 | 82 | states,_,_ = self.train_env.reset() 83 | 84 | # # Sample CVaR value from (0.0,1.0) 85 | # cvar = 1 - np.random.uniform(0.0, 1.0) 86 | 87 | # current episode 88 | ep_rewards = np.zeros(len(self.train_env.robots)) 89 | ep_deactivated_t = [-1]*len(self.train_env.robots) 90 | ep_length = 0 91 | ep_num = 0 92 | 93 | while self.current_timestep <= total_timesteps: 94 | 95 | # start_all = time.time() 96 | eps = self.linear_eps(total_timesteps) 97 | 98 | # gather actions for robots from agents 99 | # start_1 = time.time() 100 | actions = [] 101 | for i,rob in enumerate(self.train_env.robots): 102 | if rob.deactivated: 103 | actions.append(None) 104 | continue 105 | 106 | if rob.cooperative: 107 | if self.cooperative_agent.use_iqn: 108 | action,_,_ = self.cooperative_agent.act(states[i],eps) 109 | else: 110 | action,_ = self.cooperative_agent.act_dqn(states[i],eps) 111 | else: 112 | if self.noncooperative_agent.use_iqn: 113 | action,_,_ = self.noncooperative_agent.act(states[i],eps) 114 | else: 115 | action,_ = self.noncooperative_agent.act_dqn(states[i],eps) 116 | actions.append(action) 117 | # end_1 = time.time() 118 | # elapsed_time_1 = end_1 - start_1 119 | # if self.current_timestep % 100 == 0: 120 | # print("Elapsed time 1: {:.6f} seconds".format(elapsed_time_1)) 121 | 122 | # start_2 = time.time() 123 | # execute actions in the training environment 124 | next_states, rewards, dones, infos = self.train_env.step(actions) 125 | # end_2 = time.time() 126 | # elapsed_time_2 = end_2 - start_2 127 | # if self.current_timestep % 100 == 0: 128 | # print("Elapsed time 2: {:.6f} seconds".format(elapsed_time_2)) 129 | 130 | # save experience in replay memory 131 | for i,rob in enumerate(self.train_env.robots): 132 | if rob.deactivated: 133 | continue 134 | 135 | if rob.cooperative: 136 | ep_rewards[i] += self.cooperative_agent.GAMMA ** ep_length * rewards[i] 137 | if self.cooperative_agent.training: 138 | self.cooperative_agent.memory.add((states[i], actions[i], rewards[i], next_states[i], dones[i])) 139 | else: 140 | ep_rewards[i] += self.noncooperative_agent.GAMMA ** ep_length * rewards[i] 141 | if self.noncooperative_agent.training: 142 | self.noncooperative_agent.memory.add((states[i], actions[i], rewards[i], next_states[i], dones[i])) 143 | 144 | if rob.collision or rob.reach_goal: 145 | rob.deactivated = True 146 | ep_deactivated_t[i] = ep_length 147 | 148 | end_episode = (ep_length >= 1000) or self.train_env.check_all_deactivated() 149 | 150 | # Learn, update and evaluate models after learning_starts time step 151 | if self.current_timestep >= self.learning_starts: 152 | # start_3 = time.time() 153 | 154 | for agent in [self.cooperative_agent,self.noncooperative_agent]: 155 | if agent is None: 156 | continue 157 | 158 | if not agent.training: 159 | continue 160 | 161 | # Learn every UPDATE_EVERY time steps. 162 | if self.current_timestep % self.UPDATE_EVERY == 0: 163 | # If enough samples are available in memory, get random subset and learn 164 | if agent.memory.size() > agent.BATCH_SIZE: 165 | agent.train() 166 | 167 | # Update the target model every target_update_interval time steps 168 | if self.current_timestep % self.target_update_interval == 0: 169 | agent.soft_update() 170 | 171 | # end_3 = time.time() 172 | # elapsed_time_3 = end_3 - start_3 173 | # if self.current_timestep % 100 == 0: 174 | # print("Elapsed time 3: {:.6f} seconds".format(elapsed_time_3)) 175 | 176 | # Evaluate learning agents every eval_freq time steps 177 | if self.current_timestep == self.learning_starts or self.current_timestep % eval_freq == 0: 178 | self.evaluation() 179 | self.save_evaluation(eval_log_path) 180 | 181 | for agent in [self.cooperative_agent,self.noncooperative_agent]: 182 | if agent is None: 183 | continue 184 | if not agent.training: 185 | continue 186 | # save the latest models 187 | agent.save_latest_model(eval_log_path) 188 | 189 | # self.learning_timestep += 1 190 | 191 | if end_episode: 192 | ep_num += 1 193 | 194 | if verbose: 195 | # print abstract info of learning process 196 | print("======== Episode Info ========") 197 | print("current ep_length: ",ep_length) 198 | print("current ep_num: ",ep_num) 199 | print("current exploration rate: ",eps) 200 | print("current timesteps: ",self.current_timestep) 201 | print("total timesteps: ",total_timesteps) 202 | print("======== Episode Info ========\n") 203 | print("======== Robots Info ========") 204 | for i,rob in enumerate(self.train_env.robots): 205 | info = infos[i]["state"] 206 | if info == "deactivated after collision" or info == "deactivated after reaching goal": 207 | print(f"Robot {i} ep reward: {ep_rewards[i]:.2f}, {info} at step {ep_deactivated_t[i]}") 208 | else: 209 | print(f"Robot {i} ep reward: {ep_rewards[i]:.2f}, {info}") 210 | print("======== Robots Info ========\n") 211 | 212 | states,_,_ = self.train_env.reset() 213 | # cvar = 1 - np.random.uniform(0.0, 1.0) 214 | 215 | ep_rewards = np.zeros(len(self.train_env.robots)) 216 | ep_deactivated_t = [-1]*len(self.train_env.robots) 217 | ep_length = 0 218 | else: 219 | states = next_states 220 | ep_length += 1 221 | 222 | # end_all = time.time() 223 | # elapsed_time_all = end_all - start_all 224 | # if self.current_timestep % 100 == 0: 225 | # print("one step elapsed time: {:.6f} seconds".format(elapsed_time_all)) 226 | 227 | self.current_timestep += 1 228 | 229 | def linear_eps(self,total_timesteps): 230 | 231 | progress = self.current_timestep / total_timesteps 232 | if progress < self.exploration_fraction: 233 | r = progress / self.exploration_fraction 234 | return self.initial_eps + r * (self.final_eps - self.initial_eps) 235 | else: 236 | return self.final_eps 237 | 238 | def evaluation(self): 239 | """Evaluate performance of the agent 240 | Params 241 | ====== 242 | eval_env (gym compatible env): evaluation environment 243 | eval_config: eval envs config file 244 | """ 245 | actions_data = [] 246 | trajectories_data = [] 247 | rewards_data = [] 248 | successes_data = [] 249 | times_data = [] 250 | energies_data = [] 251 | obs_data = [] 252 | objs_data = [] 253 | 254 | for idx, config in enumerate(self.eval_config): 255 | print(f"Evaluating episode {idx}") 256 | state,_,_ = self.eval_env.reset_with_eval_config(config) 257 | obs = [[copy.deepcopy(rob.perception.observed_obs)] for rob in self.eval_env.robots] 258 | objs = [[copy.deepcopy(rob.perception.observed_objs)] for rob in self.eval_env.robots] 259 | 260 | rob_num = len(self.eval_env.robots) 261 | 262 | rewards = [0.0]*rob_num 263 | times = [0.0]*rob_num 264 | energies = [0.0]*rob_num 265 | end_episode = False 266 | length = 0 267 | 268 | while not end_episode: 269 | # gather actions for robots from agents 270 | action = [] 271 | for i,rob in enumerate(self.eval_env.robots): 272 | if rob.deactivated: 273 | action.append(None) 274 | continue 275 | 276 | if rob.cooperative: 277 | if self.cooperative_agent.use_iqn: 278 | a,_,_ = self.cooperative_agent.act(state[i]) 279 | else: 280 | a,_ = self.cooperative_agent.act_dqn(state[i]) 281 | else: 282 | if self.noncooperative_agent.use_iqn: 283 | a,_,_ = self.noncooperative_agent.act(state[i]) 284 | else: 285 | a,_ = self.noncooperative_agent.act_dqn(state[i]) 286 | 287 | action.append(a) 288 | 289 | # execute actions in the training environment 290 | state, reward, done, info = self.eval_env.step(action) 291 | 292 | for i,rob in enumerate(self.eval_env.robots): 293 | if rob.deactivated: 294 | continue 295 | 296 | if rob.cooperative: 297 | rewards[i] += self.cooperative_agent.GAMMA ** length * reward[i] 298 | else: 299 | rewards[i] += self.noncooperative_agent.GAMMA ** length * reward[i] 300 | times[i] += rob.dt * rob.N 301 | energies[i] += rob.compute_action_energy_cost(action[i]) 302 | obs[i].append(copy.deepcopy(rob.perception.observed_obs)) 303 | objs[i].append(copy.deepcopy(rob.perception.observed_objs)) 304 | 305 | if rob.collision or rob.reach_goal: 306 | rob.deactivated = True 307 | 308 | end_episode = (length >= 1000) or self.eval_env.check_any_collision() or self.eval_env.check_all_deactivated() 309 | length += 1 310 | 311 | actions = [] 312 | trajectories = [] 313 | for rob in self.eval_env.robots: 314 | actions.append(copy.deepcopy(rob.action_history)) 315 | trajectories.append(copy.deepcopy(rob.trajectory)) 316 | 317 | success = True if self.eval_env.check_all_reach_goal() else False 318 | 319 | actions_data.append(actions) 320 | trajectories_data.append(trajectories) 321 | rewards_data.append(np.mean(rewards)) 322 | successes_data.append(success) 323 | times_data.append(np.mean(times)) 324 | energies_data.append(np.mean(energies)) 325 | obs_data.append(obs) 326 | objs_data.append(objs) 327 | 328 | avg_r = np.mean(rewards_data) 329 | success_rate = np.sum(successes_data)/len(successes_data) 330 | idx = np.where(np.array(successes_data) == 1)[0] 331 | avg_t = None if np.shape(idx)[0] == 0 else np.mean(np.array(times_data)[idx]) 332 | avg_e = None if np.shape(idx)[0] == 0 else np.mean(np.array(energies_data)[idx]) 333 | 334 | print(f"++++++++ Evaluation Info ++++++++") 335 | print(f"Avg cumulative reward: {avg_r:.2f}") 336 | print(f"Success rate: {success_rate:.2f}") 337 | if avg_t is not None: 338 | print(f"Avg time: {avg_t:.2f}") 339 | print(f"Avg energy: {avg_e:.2f}") 340 | print(f"++++++++ Evaluation Info ++++++++\n") 341 | 342 | self.eval_timesteps.append(self.current_timestep) 343 | self.eval_actions.append(actions_data) 344 | self.eval_trajectories.append(trajectories_data) 345 | self.eval_rewards.append(rewards_data) 346 | self.eval_successes.append(successes_data) 347 | self.eval_times.append(times_data) 348 | self.eval_energies.append(energies_data) 349 | self.eval_obs.append(obs_data) 350 | self.eval_objs.append(objs_data) 351 | 352 | def save_evaluation(self,eval_log_path): 353 | filename = "evaluations.npz" 354 | 355 | np.savez( 356 | os.path.join(eval_log_path,filename), 357 | timesteps=np.array(self.eval_timesteps,dtype=object), 358 | actions=np.array(self.eval_actions,dtype=object), 359 | trajectories=np.array(self.eval_trajectories,dtype=object), 360 | rewards=np.array(self.eval_rewards,dtype=object), 361 | successes=np.array(self.eval_successes,dtype=object), 362 | times=np.array(self.eval_times,dtype=object), 363 | energies=np.array(self.eval_energies,dtype=object), 364 | obs=np.array(self.eval_obs,dtype=object), 365 | objs=np.array(self.eval_objs,dtype=object) 366 | ) -------------------------------------------------------------------------------- /pretrained_models/DQN/seed_9/dqn_constructor_params.json: -------------------------------------------------------------------------------- 1 | {"self_dimension": 4, "static_dimension": 15, "dynamic_dimension": 20, "self_feature_dimension": 32, "static_feature_dimension": 120, "dynamic_feature_dimension": 160, "hidden_dimension": 156, "action_size": 9, "seed": 109} -------------------------------------------------------------------------------- /pretrained_models/DQN/seed_9/dqn_network_params.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobustFieldAutonomyLab/Multi_Robot_Distributional_RL_Navigation/889bfed3aded1fc2edc1967090517f22e14de7a0/pretrained_models/DQN/seed_9/dqn_network_params.pth -------------------------------------------------------------------------------- /pretrained_models/IQN/seed_9/constructor_params.json: -------------------------------------------------------------------------------- 1 | {"self_dimension": 4, "static_dimension": 15, "dynamic_dimension": 20, "self_feature_dimension": 32, "static_feature_dimension": 120, "dynamic_feature_dimension": 160, "hidden_dimension": 156, "action_size": 9, "seed": 109} -------------------------------------------------------------------------------- /pretrained_models/IQN/seed_9/network_params.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobustFieldAutonomyLab/Multi_Robot_Distributional_RL_Navigation/889bfed3aded1fc2edc1967090517f22e14de7a0/pretrained_models/IQN/seed_9/network_params.pth -------------------------------------------------------------------------------- /run_experiments.py: -------------------------------------------------------------------------------- 1 | import marinenav_env.envs.marinenav_env as marinenav_env 2 | from policy.agent import Agent 3 | import numpy as np 4 | import copy 5 | import scipy.spatial 6 | import json 7 | from datetime import datetime 8 | import time as t_module 9 | import os 10 | import matplotlib.pyplot as plt 11 | import APF 12 | import sys 13 | sys.path.insert(0,"./thirdparty") 14 | import RVO 15 | 16 | def evaluation(state, agent, eval_env, use_rl=True, use_iqn=True, act_adaptive=True, save_episode=False): 17 | """Evaluate performance of the agent 18 | """ 19 | 20 | rob_num = len(eval_env.robots) 21 | 22 | 23 | rewards = [0.0]*rob_num 24 | times = [0.0]*rob_num 25 | energies = [0.0]*rob_num 26 | computation_times = [] 27 | 28 | end_episode = False 29 | length = 0 30 | 31 | while not end_episode: 32 | # gather actions for robots from agents 33 | action = [] 34 | for i,rob in enumerate(eval_env.robots): 35 | if rob.deactivated: 36 | action.append(None) 37 | continue 38 | 39 | assert rob.cooperative, "Every robot must be cooperative!" 40 | 41 | start = t_module.time() 42 | if use_rl: 43 | if use_iqn: 44 | if act_adaptive: 45 | a,_,_,_ = agent.act_adaptive(state[i]) 46 | else: 47 | a,_,_ = agent.act(state[i]) 48 | else: 49 | a,_ = agent.act_dqn(state[i]) 50 | else: 51 | a = agent.act(state[i]) 52 | end = t_module.time() 53 | computation_times.append(end-start) 54 | 55 | action.append(a) 56 | 57 | # execute actions in the training environment 58 | state, reward, done, info = eval_env.step(action) 59 | 60 | for i,rob in enumerate(eval_env.robots): 61 | if rob.deactivated: 62 | continue 63 | 64 | assert rob.cooperative, "Every robot must be cooperative!" 65 | 66 | rewards[i] += agent.GAMMA ** length * reward[i] 67 | 68 | times[i] += rob.dt * rob.N 69 | energies[i] += rob.compute_action_energy_cost(action[i]) 70 | 71 | if rob.collision or rob.reach_goal: 72 | rob.deactivated = True 73 | 74 | end_episode = (length >= 360) or eval_env.check_all_deactivated() 75 | length += 1 76 | 77 | success = True if eval_env.check_all_reach_goal() else False 78 | # success = 0 79 | # for rob in eval_env.robots: 80 | # if rob.reach_goal: 81 | # success += 1 82 | 83 | # save time and energy data of robots that reach goal 84 | success_times = [] 85 | success_energies = [] 86 | for i,rob in enumerate(eval_env.robots): 87 | if rob.reach_goal: 88 | success_times.append(times[i]) 89 | success_energies.append(energies[i]) 90 | 91 | if save_episode: 92 | trajectories = [] 93 | for rob in eval_env.robots: 94 | trajectories.append(copy.deepcopy(rob.trajectory)) 95 | return success, rewards, computation_times, success_times, success_energies, trajectories 96 | else: 97 | return success, rewards, computation_times, success_times, success_energies 98 | 99 | def exp_setup(envs,eval_schedule,i): 100 | observations = [] 101 | 102 | for test_env in envs: 103 | test_env.num_cooperative = eval_schedule["num_cooperative"][i] 104 | test_env.num_non_cooperative = eval_schedule["num_non_cooperative"][i] 105 | test_env.num_cores = eval_schedule["num_cores"][i] 106 | test_env.num_obs = eval_schedule["num_obstacles"][i] 107 | test_env.min_start_goal_dis = eval_schedule["min_start_goal_dis"][i] 108 | 109 | # save eval config 110 | state,_,_ = test_env.reset() 111 | observations.append(state) 112 | 113 | return observations 114 | 115 | def dashboard(eval_schedule,i): 116 | print("\n======== eval schedule ========") 117 | print("num of cooperative agents: ",eval_schedule["num_cooperative"][i]) 118 | print("num of non-cooperative agents: ",eval_schedule["num_non_cooperative"][i]) 119 | print("num of cores: ",eval_schedule["num_cores"][i]) 120 | print("num of obstacles: ",eval_schedule["num_obstacles"][i]) 121 | print("min start goal dis: ",eval_schedule["min_start_goal_dis"][i]) 122 | print("======== eval schedule ========\n") 123 | 124 | def run_experiment(eval_schedules): 125 | agents = [adaptive_IQN_agent,IQN_agent,DQN_agent,APF_agent,RVO_agent] 126 | names = ["adaptive_IQN","IQN","DQN","APF","RVO"] 127 | envs = [test_env_0,test_env_1,test_env_2,test_env_3,test_env_4] 128 | evaluations = [evaluation,evaluation,evaluation,evaluation,evaluation] 129 | 130 | colors = ["b","g","r","tab:orange","m"] 131 | 132 | save_trajectory = True 133 | 134 | dt = datetime.now() 135 | timestamp = dt.strftime("%Y-%m-%d-%H-%M-%S") 136 | 137 | robot_nums = [] 138 | # all_test_rob_exp = [] 139 | all_successes_exp = [] 140 | all_rewards_exp = [] 141 | all_success_times_exp = [] 142 | all_success_energies_exp =[] 143 | if save_trajectory: 144 | all_trajectories_exp = [] 145 | all_eval_configs_exp = [] 146 | 147 | for idx,count in enumerate(eval_schedules["num_episodes"]): 148 | dashboard(eval_schedules,idx) 149 | 150 | robot_nums.append(eval_schedules["num_cooperative"][idx]) 151 | # all_test_rob = [0]*len(agents) 152 | all_successes = [[] for _ in agents] 153 | all_rewards = [[] for _ in agents] 154 | all_computation_times = [[] for _ in agents] 155 | all_success_times = [[] for _ in agents] 156 | all_success_energies = [[] for _ in agents] 157 | if save_trajectory: 158 | all_trajectories = [[] for _ in agents] 159 | all_eval_configs = [[] for _ in agents] 160 | 161 | for i in range(count): 162 | print("Evaluating all agents on episode ",i) 163 | observations = exp_setup(envs,eval_schedules,idx) 164 | for j in range(len(agents)): 165 | agent = agents[j] 166 | env = envs[j] 167 | eval_func = evaluations[j] 168 | name = names[j] 169 | 170 | if save_trajectory: 171 | all_eval_configs[j].append(env.episode_data()) 172 | 173 | # obs = env.reset() 174 | obs = observations[j] 175 | if save_trajectory: 176 | if name == "adaptive_IQN": 177 | success, rewards, computation_times, success_times, success_energies, trajectories = eval_func(obs,agent,env,save_episode=True) 178 | elif name == "IQN": 179 | success, rewards, computation_times, success_times, success_energies, trajectories = eval_func(obs,agent,env,act_adaptive=False,save_episode=True) 180 | elif name == "DQN": 181 | success, rewards, computation_times, success_times, success_energies, trajectories = eval_func(obs,agent,env,use_iqn=False,save_episode=True) 182 | elif name == "APF": 183 | success, rewards, computation_times, success_times, success_energies, trajectories = eval_func(obs,agent,env,use_rl=False,save_episode=True) 184 | elif name == "RVO": 185 | success, rewards, computation_times, success_times, success_energies, trajectories = eval_func(obs,agent,env,use_rl=False,save_episode=True) 186 | else: 187 | raise RuntimeError("Agent not implemented!") 188 | else: 189 | if name == "adaptive_IQN": 190 | success, rewards, computation_times, success_times, success_energies = eval_func(obs,agent,env) 191 | elif name == "IQN": 192 | success, rewards, computation_times, success_times, success_energies = eval_func(obs,agent,env,act_adaptive=False) 193 | elif name == "DQN": 194 | success, rewards, computation_times, success_times, success_energies = eval_func(obs,agent,env,use_iqn=False) 195 | elif name == "APF": 196 | success, rewards, computation_times, success_times, success_energies = eval_func(obs,agent,env,use_rl=False) 197 | elif name == "RVO": 198 | success, rewards, computation_times, success_times, success_energies = eval_func(obs,agent,env,use_rl=False) 199 | else: 200 | raise RuntimeError("Agent not implemented!") 201 | 202 | all_successes[j].append(success) 203 | # all_test_rob[j] += eval_schedules["num_cooperative"][idx] 204 | # all_successes[j] += success 205 | all_rewards[j] += rewards 206 | all_computation_times[j] += computation_times 207 | all_success_times[j] += success_times 208 | all_success_energies[j] += success_energies 209 | if save_trajectory: 210 | all_trajectories[j].append(copy.deepcopy(trajectories)) 211 | 212 | for k,name in enumerate(names): 213 | s_rate = np.sum(all_successes[k])/len(all_successes[k]) 214 | # s_rate = all_successes[k]/all_test_rob[k] 215 | avg_r = np.mean(all_rewards[k]) 216 | avg_compute_t = np.mean(all_computation_times[k]) 217 | avg_t = np.mean(all_success_times[k]) 218 | avg_e = np.mean(all_success_energies[k]) 219 | print(f"{name} | success rate: {s_rate:.2f} | avg_reward: {avg_r:.2f} | avg_compute_t: {avg_compute_t} | \ 220 | avg_t: {avg_t:.2f} | avg_e: {avg_e:.2f}") 221 | 222 | print("\n") 223 | 224 | # all_test_rob_exp.append(all_test_rob) 225 | all_successes_exp.append(all_successes) 226 | all_rewards_exp.append(all_rewards) 227 | all_success_times_exp.append(all_success_times) 228 | all_success_energies_exp.append(all_success_energies) 229 | if save_trajectory: 230 | all_trajectories_exp.append(copy.deepcopy(all_trajectories)) 231 | all_eval_configs_exp.append(copy.deepcopy(all_eval_configs)) 232 | 233 | # save data 234 | if save_trajectory: 235 | exp_data = dict(eval_schedules=eval_schedules, 236 | names=names, 237 | all_successes_exp=all_successes_exp, 238 | all_rewards_exp=all_rewards_exp, 239 | all_success_times_exp=all_success_times_exp, 240 | all_success_energies_exp=all_success_energies_exp, 241 | all_trajectories_exp=all_trajectories_exp, 242 | all_eval_configs_exp=all_eval_configs_exp 243 | ) 244 | else: 245 | exp_data = dict(eval_schedules=eval_schedules, 246 | names=names, 247 | all_successes_exp=all_successes_exp, 248 | all_rewards_exp=all_rewards_exp, 249 | all_success_times_exp=all_success_times_exp, 250 | all_success_energies_exp=all_success_energies_exp, 251 | ) 252 | 253 | 254 | exp_dir = f"experiment_data/exp_data_{timestamp}" 255 | os.makedirs(exp_dir) 256 | 257 | filename = os.path.join(exp_dir,"exp_results.json") 258 | with open(filename,"w") as file: 259 | json.dump(exp_data,file) 260 | 261 | fig1, ax1 = plt.subplots() 262 | fig2, ax2 = plt.subplots() 263 | fig3, ax3 = plt.subplots() 264 | bar_width = 0.25 265 | interval_scale = 1.5 266 | set_label = [True]*len(names) 267 | for i,robot_num in enumerate(robot_nums): 268 | 269 | all_successes = all_successes_exp[i] 270 | all_success_times = all_success_times_exp[i] 271 | all_success_energies = all_success_energies_exp[i] 272 | for j,pos in enumerate([-2*bar_width,-bar_width,0.0,bar_width,2*bar_width]): 273 | # bar plot for success rate 274 | s_rate = np.sum(all_successes[j])/len(all_successes[j]) 275 | if set_label[j]: 276 | ax1.bar(interval_scale*i+pos,s_rate,0.8*bar_width,color=colors[j],label=names[j]) 277 | set_label[j] = False 278 | else: 279 | ax1.bar(interval_scale*i+pos,s_rate,0.8*bar_width,color=colors[j]) 280 | 281 | # box plot for time 282 | box = ax2.boxplot(all_success_times[j],positions=[interval_scale*i+pos],flierprops={'marker': '.','markersize': 1},patch_artist=True) 283 | for patch in box["boxes"]: 284 | patch.set_facecolor(colors[j]) 285 | for line in box["medians"]: 286 | line.set_color("black") 287 | 288 | # box plot for energy 289 | box = ax3.boxplot(all_success_energies[j],positions=[interval_scale*i+pos],flierprops={'marker': '.','markersize': 1},patch_artist=True) 290 | for patch in box["boxes"]: 291 | patch.set_facecolor(colors[j]) 292 | for line in box["medians"]: 293 | line.set_color("black") 294 | 295 | ax1.set_xticks(interval_scale*np.arange(len(robot_nums))) 296 | ax1.set_xticklabels(robot_nums) 297 | ax1.set_title("Success Rate") 298 | ax1.legend() 299 | 300 | ax2.set_xticks(interval_scale*np.arange(len(robot_nums))) 301 | ax2.set_xticklabels([str(robot_num) for robot_num in eval_schedules["num_cooperative"]]) 302 | ax2.set_title("Time") 303 | 304 | ax3.set_xticks(interval_scale*np.arange(len(robot_nums))) 305 | ax3.set_xticklabels([str(robot_num) for robot_num in eval_schedules["num_cooperative"]]) 306 | ax3.set_title("Energy") 307 | 308 | fig1.savefig(os.path.join(exp_dir,"success_rate.png")) 309 | fig2.savefig(os.path.join(exp_dir,"time.png")) 310 | fig3.savefig(os.path.join(exp_dir,"energy.png")) 311 | 312 | 313 | if __name__ == "__main__": 314 | seed = 3 # PRNG seed for all testing envs 315 | 316 | ##### adaptive IQN ##### 317 | test_env_0 = marinenav_env.MarineNavEnv2(seed) 318 | 319 | save_dir = "pretrained_models/IQN/seed_9" 320 | device = "cpu" 321 | 322 | adaptive_IQN_agent = Agent(cooperative=True,device=device) 323 | adaptive_IQN_agent.load_model(save_dir,"cooperative",device) 324 | ##### adaptive IQN ##### 325 | 326 | ##### IQN ##### 327 | test_env_1 = marinenav_env.MarineNavEnv2(seed) 328 | 329 | save_dir = "pretrained_models/IQN/seed_9" 330 | device = "cpu" 331 | 332 | IQN_agent = Agent(cooperative=True,device=device) 333 | IQN_agent.load_model(save_dir,"cooperative",device) 334 | ##### IQN ##### 335 | 336 | ##### DQN ##### 337 | test_env_2 = marinenav_env.MarineNavEnv2(seed) 338 | 339 | save_dir = "pretrained_models/DQN/seed_9" 340 | device = "cpu" 341 | 342 | DQN_agent = Agent(cooperative=True,device=device,use_iqn=False) 343 | DQN_agent.load_model(save_dir,"cooperative",device) 344 | ##### DQN ##### 345 | 346 | ##### APF ##### 347 | test_env_3 = marinenav_env.MarineNavEnv2(seed) 348 | 349 | APF_agent = APF.APF_agent(test_env_3.robots[0].a,test_env_3.robots[0].w) 350 | ##### APF ##### 351 | 352 | ##### RVO ##### 353 | test_env_4 = marinenav_env.MarineNavEnv2(seed) 354 | 355 | RVO_agent = RVO.RVO_agent(test_env_4.robots[0].a,test_env_4.robots[0].w,test_env_4.robots[0].max_speed) 356 | ##### RVO ##### 357 | 358 | eval_schedules = dict(num_episodes=[100,100,100,100,100], 359 | num_cooperative=[3,4,5,6,7], 360 | num_non_cooperative=[0,0,0,0,0], 361 | num_cores=[4,5,6,7,8], 362 | num_obstacles=[4,5,6,7,8], 363 | min_start_goal_dis=[40.0,40.0,40.0,40.0,40.0] 364 | ) 365 | 366 | run_experiment(eval_schedules) 367 | 368 | -------------------------------------------------------------------------------- /scripts/plot_eval_returns.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import matplotlib as mpl 4 | import matplotlib.ticker as ticker 5 | import os 6 | 7 | if __name__ == "__main__": 8 | first = True 9 | seeds = [9] 10 | eval_agents = ["IQN","DQN"] 11 | colors = ["b","tab:orange"] 12 | data_dirs = ["your/IQN/training/evaluation/file/directory", 13 | "your/DQN/training/evaluation/file/directory"] 14 | 15 | fig, (ax_rewards,ax_success_rate) = plt.subplots(1,2,figsize=(16,6)) 16 | 17 | IQN_final = [] 18 | DQN_final = [] 19 | 20 | for idx, eval_agent in enumerate(eval_agents): 21 | all_rewards = [] 22 | all_success_rates = [] 23 | print(f"===== Plotting {eval_agent} results =====") 24 | for seed in seeds: 25 | seed_dir = "seed_"+str(seed) 26 | eval_data = np.load(os.path.join(data_dirs[idx],seed_dir,"evaluations.npz"),allow_pickle=True) 27 | 28 | timesteps = np.array(eval_data['timesteps'],dtype=np.float64) 29 | rewards = np.mean(eval_data['rewards'],axis=1) 30 | success_rates = [] 31 | for i in range(len(eval_data['timesteps'])): 32 | successes = eval_data['successes'][i] 33 | success_rates.append(np.sum(successes)/len(successes)) 34 | 35 | if eval_agent == "IQN": 36 | IQN_final.append(success_rates[-1]) 37 | else: 38 | DQN_final.append(success_rates[-1]) 39 | 40 | all_rewards.append(rewards.tolist()) 41 | all_success_rates.append(success_rates) 42 | 43 | all_rewards_mean = np.mean(all_rewards,axis=0) 44 | all_rewards_std = np.std(all_rewards,axis=0)/np.sqrt(np.shape(all_rewards)[0]) 45 | all_success_rates_mean = np.mean(all_success_rates,axis=0) 46 | all_success_rates_std = np.std(all_success_rates,axis=0) 47 | 48 | mpl.rcParams["font.size"]=25 49 | [x.set_linewidth(1.5) for x in ax_rewards.spines.values()] 50 | ax_rewards.tick_params(axis="x", labelsize=22) 51 | ax_rewards.tick_params(axis="y", labelsize=22) 52 | ax_rewards.plot(timesteps,all_rewards_mean,linewidth=3,label=eval_agent,c=colors[idx],zorder=5-idx) 53 | ax_rewards.fill_between(timesteps,all_rewards_mean+all_rewards_std,all_rewards_mean-all_rewards_std,alpha=0.2,color=colors[idx],zorder=5-idx) 54 | ax_rewards.xaxis.set_ticks(np.arange(0,eval_data['timesteps'][-1]+1,1000000)) 55 | scale_x = 1e-5 56 | ticks_x = ticker.FuncFormatter(lambda x, pos:'{0:g}'.format(x*scale_x)) 57 | ax_rewards.xaxis.set_major_formatter(ticks_x) 58 | ax_rewards.set_xlabel("Timestep(x10^5)",fontsize=25) 59 | ax_rewards.set_title("Cumulative Reward",fontsize=25,fontweight='bold') 60 | 61 | [x.set_linewidth(1.5) for x in ax_success_rate.spines.values()] 62 | ax_success_rate.tick_params(axis="x", labelsize=22) 63 | ax_success_rate.tick_params(axis="y", labelsize=22) 64 | ax_success_rate.plot(timesteps,all_success_rates_mean,linewidth=3,label=eval_agent,c=colors[idx],zorder=5-idx) 65 | ax_success_rate.fill_between(timesteps,all_success_rates_mean+all_success_rates_std,all_success_rates_mean-all_success_rates_std,alpha=0.2,color=colors[idx],zorder=5-idx) 66 | ax_success_rate.xaxis.set_ticks(np.arange(0,eval_data['timesteps'][-1]+1,1000000)) 67 | scale_x = 1e-5 68 | ticks_x = ticker.FuncFormatter(lambda x, pos:'{0:g}'.format(x*scale_x)) 69 | ax_success_rate.xaxis.set_major_formatter(ticks_x) 70 | ax_success_rate.set_xlabel("Timestep(x10^5)",fontsize=25) 71 | ax_success_rate.yaxis.set_ticks(np.arange(0,1.1,0.2)) 72 | ax_success_rate.set_title("Success Rate",fontsize=25,fontweight='bold') 73 | ax_success_rate.legend(loc="lower right",bbox_to_anchor=(1, 0.35)) 74 | 75 | print(f"IQN final success rates of all models: {IQN_final}") 76 | print(f"DQN final success rates of all models: {DQN_final}") 77 | 78 | fig.tight_layout() 79 | fig.savefig("learning_curves.png") 80 | -------------------------------------------------------------------------------- /scripts/visualize_eval_episode.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0,"../") 3 | import env_visualizer 4 | import os 5 | 6 | if __name__ == "__main__": 7 | dir = "your/training/evaluation/file/directory" 8 | 9 | eval_configs = os.path.join(dir,"eval_configs.json") 10 | evaluations = os.path.join(dir,"evaluations.npz") 11 | 12 | ev = env_visualizer.EnvVisualizer(seed=231) 13 | 14 | colors = ["r","lime","cyan","orange","tab:olive","white","chocolate"] 15 | 16 | eval_id = -1 17 | eval_episode = 0 18 | 19 | ev.load_eval_config_and_episode(eval_configs,evaluations) 20 | ev.play_eval_episode(eval_id,eval_episode,colors) -------------------------------------------------------------------------------- /scripts/visualize_exp_episode.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0,"../") 3 | import env_visualizer 4 | import json 5 | import numpy as np 6 | from policy.agent import Agent 7 | 8 | if __name__ == "__main__": 9 | filename = "your/experiment/result/file" 10 | 11 | with open(filename,"r") as f: 12 | exp_data = json.load(f) 13 | 14 | 15 | schedule_id = -1 16 | agent_id = 0 17 | ep_id = 0 18 | 19 | colors = ["r","lime","cyan","orange","tab:olive","white","chocolate"] 20 | 21 | 22 | ev = env_visualizer.EnvVisualizer() 23 | 24 | ev.env.reset_with_eval_config(exp_data["all_eval_configs_exp"][schedule_id][agent_id][ep_id]) 25 | ev.init_visualize() 26 | 27 | ev.play_episode(trajectories=exp_data["all_trajectories_exp"][schedule_id][agent_id][ep_id], 28 | colors=colors) 29 | -------------------------------------------------------------------------------- /system_requirements: -------------------------------------------------------------------------------- 1 | ubuntu 20.04 2 | gym 0.19.0 -------------------------------------------------------------------------------- /thirdparty/RVO.py: -------------------------------------------------------------------------------- 1 | # from math import ceil, floor, sqrt 2 | import copy 3 | import numpy as np 4 | 5 | # from math import cos, sin, tan, atan2, asin 6 | 7 | # from math import pi as PI 8 | 9 | class RVO_agent: 10 | 11 | def __init__(self, a, w, max_vel): 12 | self.min_vel = 1.0 # if velocity is lower than the threshold, mandate acceleration 13 | self.max_vel = max_vel 14 | 15 | self.a = a # available linear acceleration (action 1) 16 | self.w = w # available angular velocity (action 2) 17 | 18 | self.rob_r = 2*0.8 19 | self.GAMMA = 0.99 20 | 21 | def distance(self,pose1,pose2): 22 | """ compute Euclidean distance for 2D """ 23 | return np.sqrt((pose1[0]-pose2[0])**2+(pose1[1]-pose2[1])**2)+0.001 24 | 25 | def act(self, observation): 26 | assert len(observation) == 39, "The state size does not equal 39" 27 | 28 | obs_array = np.array(observation) 29 | 30 | ego = obs_array[:4] 31 | static = obs_array[4:19] 32 | dynamic = obs_array[19:] 33 | 34 | # desired velocity (towards goal) 35 | goal = ego[:2] 36 | v_desire = self.max_vel * goal / np.linalg.norm(goal) 37 | 38 | vA = [ego[2],ego[3]] 39 | pA = [0.0,0.0] 40 | 41 | RVO_BA_all = [] 42 | 43 | # velocity obstacle from dynamic obstacles 44 | for i in range(0,len(dynamic),4): 45 | if np.abs(dynamic[i]) < 1e-3 and np.abs(dynamic[i+1]) < 1e-3: 46 | # padding 47 | continue 48 | 49 | vB = [dynamic[i+2],dynamic[i+3]] 50 | pB = [dynamic[i],dynamic[i+1]] 51 | 52 | # use RVO 53 | transl_vB_vA = [pA[0]+0.5*(vB[0]+vA[0]), pA[1]+0.5*(vB[1]+vA[1])] 54 | 55 | dist_BA = self.distance(pA, pB) 56 | theta_BA = np.arctan2(pB[1]-pA[1], pB[0]-pA[0]) 57 | 58 | if 2*self.rob_r > dist_BA: 59 | dist_BA = 2*self.rob_r 60 | 61 | theta_BAort = np.arcsin(2*self.rob_r/dist_BA) 62 | theta_ort_left = theta_BA+theta_BAort 63 | bound_left = [np.cos(theta_ort_left), np.sin(theta_ort_left)] 64 | theta_ort_right = theta_BA-theta_BAort 65 | bound_right = [np.cos(theta_ort_right), np.sin(theta_ort_right)] 66 | 67 | RVO_BA = [transl_vB_vA, bound_left, bound_right, dist_BA, 2*self.rob_r] 68 | RVO_BA_all.append(RVO_BA) 69 | 70 | # velocity obstacle from static obstacles 71 | for i in range(0,len(static),3): 72 | if np.abs(static[i]) < 1e-3 and np.abs(static[i+1]) < 1e-3: 73 | # padding 74 | continue 75 | 76 | vB = [0.0, 0.0] 77 | pB = [static[i],static[i+1]] 78 | 79 | transl_vB_vA = [pA[0]+vB[0], pA[1]+vB[1]] 80 | dist_BA = self.distance(pA, pB) 81 | theta_BA = np.arctan2(pB[1]-pA[1], pB[0]-pA[0]) 82 | 83 | # over-approximation of square to circular 84 | OVER_APPROX_C2S = 1.5 85 | rad = 2*static[i+2]*OVER_APPROX_C2S 86 | if (rad+self.rob_r) > dist_BA: 87 | dist_BA = rad+self.rob_r 88 | 89 | theta_BAort = np.arcsin((rad+self.rob_r)/dist_BA) 90 | theta_ort_left = theta_BA+theta_BAort 91 | bound_left = [np.cos(theta_ort_left), np.sin(theta_ort_left)] 92 | theta_ort_right = theta_BA-theta_BAort 93 | bound_right = [np.cos(theta_ort_right), np.sin(theta_ort_right)] 94 | RVO_BA = [transl_vB_vA, bound_left, bound_right, dist_BA, rad+self.rob_r] 95 | RVO_BA_all.append(RVO_BA) 96 | 97 | vA_post = self.intersect(pA, v_desire, RVO_BA_all) 98 | 99 | # select angular velocity action 100 | vA_angle = 0.0 101 | vA_post_angle = 0.0 102 | if np.linalg.norm(np.array(vA)) > 1e-03: 103 | vA_angle = np.arctan2(vA[1],vA[0]) 104 | if np.linalg.norm(np.array(vA_post)) > 1e-03: 105 | vA_post_angle = np.arctan2(vA_post[1],vA_post[0]) 106 | 107 | diff_angle = vA_post_angle - vA_angle 108 | while diff_angle < -np.pi: 109 | diff_angle += 2 * np.pi 110 | while diff_angle >= np.pi: 111 | diff_angle -= 2 * np.pi 112 | 113 | w_idx = np.argmin(np.abs(self.w-diff_angle)) 114 | 115 | # select linear acceleration action 116 | vA_dir = np.array([1.0,0.0]) 117 | if np.linalg.norm(np.array(vA)) > 1e-03: 118 | vA_dir = np.array(vA) / np.linalg.norm(np.array(vA)) 119 | vA_post_proj = np.dot(vA_dir,np.array(vA_post)) 120 | 121 | a_proj = (vA_post_proj - np.linalg.norm(np.array(vA))) / 0.5 122 | a = copy.deepcopy(self.a) 123 | if np.linalg.norm(np.array(vA)) < self.min_vel: 124 | # if the velocity is small, mandate acceleration 125 | a[a<=0.0] = -np.inf 126 | a_diff = a-a_proj 127 | a_idx = np.argmin(np.abs(a_diff)) 128 | 129 | return a_idx * len(self.w) + w_idx 130 | 131 | 132 | def intersect(self,pA, vA, RVO_BA_all): 133 | # print '----------------------------------------' 134 | # print 'Start intersection test' 135 | norm_v = self.distance(vA, [0, 0]) 136 | suitable_V = [] 137 | unsuitable_V = [] 138 | for theta in np.arange(0, 2*np.pi, 0.1): 139 | for rad in np.arange(0.02, norm_v+0.02, norm_v/5.0): 140 | new_v = [rad*np.cos(theta), rad*np.sin(theta)] 141 | suit = True 142 | for RVO_BA in RVO_BA_all: 143 | p_0 = RVO_BA[0] 144 | left = RVO_BA[1] 145 | right = RVO_BA[2] 146 | dif = [new_v[0]+pA[0]-p_0[0], new_v[1]+pA[1]-p_0[1]] 147 | theta_dif = np.arctan2(dif[1], dif[0]) 148 | theta_right = np.arctan2(right[1], right[0]) 149 | theta_left = np.arctan2(left[1], left[0]) 150 | if self.in_between(theta_right, theta_dif, theta_left): 151 | suit = False 152 | break 153 | if suit: 154 | suitable_V.append(new_v) 155 | else: 156 | unsuitable_V.append(new_v) 157 | new_v = vA[:] 158 | suit = True 159 | for RVO_BA in RVO_BA_all: 160 | p_0 = RVO_BA[0] 161 | left = RVO_BA[1] 162 | right = RVO_BA[2] 163 | dif = [new_v[0]+pA[0]-p_0[0], new_v[1]+pA[1]-p_0[1]] 164 | theta_dif = np.arctan2(dif[1], dif[0]) 165 | theta_right = np.arctan2(right[1], right[0]) 166 | theta_left = np.arctan2(left[1], left[0]) 167 | if self.in_between(theta_right, theta_dif, theta_left): 168 | suit = False 169 | break 170 | if suit: 171 | suitable_V.append(new_v) 172 | else: 173 | unsuitable_V.append(new_v) 174 | #---------------------- 175 | if suitable_V: 176 | # print 'Suitable found' 177 | vA_post = min(suitable_V, key = lambda v: self.distance(v, vA)) 178 | new_v = vA_post[:] 179 | for RVO_BA in RVO_BA_all: 180 | p_0 = RVO_BA[0] 181 | left = RVO_BA[1] 182 | right = RVO_BA[2] 183 | dif = [new_v[0]+pA[0]-p_0[0], new_v[1]+pA[1]-p_0[1]] 184 | theta_dif = np.arctan2(dif[1], dif[0]) 185 | theta_right = np.arctan2(right[1], right[0]) 186 | theta_left = np.arctan2(left[1], left[0]) 187 | else: 188 | # print 'Suitable not found' 189 | tc_V = dict() 190 | for unsuit_v in unsuitable_V: 191 | tc_V[tuple(unsuit_v)] = 0 192 | tc = [] 193 | for RVO_BA in RVO_BA_all: 194 | p_0 = RVO_BA[0] 195 | left = RVO_BA[1] 196 | right = RVO_BA[2] 197 | dist = RVO_BA[3] 198 | rad = RVO_BA[4] 199 | dif = [unsuit_v[0]+pA[0]-p_0[0], unsuit_v[1]+pA[1]-p_0[1]] 200 | theta_dif = np.arctan2(dif[1], dif[0]) 201 | theta_right = np.arctan2(right[1], right[0]) 202 | theta_left = np.arctan2(left[1], left[0]) 203 | if self.in_between(theta_right, theta_dif, theta_left): 204 | small_theta = np.abs(theta_dif-0.5*(theta_left+theta_right)) 205 | if np.abs(dist*np.sin(small_theta)) >= rad: 206 | rad = np.abs(dist*np.sin(small_theta)) 207 | big_theta = np.arcsin(np.abs(dist*np.sin(small_theta))/rad) 208 | dist_tg = np.abs(dist*np.cos(small_theta))-np.abs(rad*np.cos(big_theta)) 209 | if dist_tg < 0: 210 | dist_tg = 0 211 | tc_v = dist_tg/self.distance(dif, [0,0]) 212 | tc.append(tc_v) 213 | tc_V[tuple(unsuit_v)] = min(tc)+0.001 214 | WT = 0.2 215 | vA_post = min(unsuitable_V, key = lambda v: ((WT/tc_V[tuple(v)])+self.distance(v, vA))) 216 | return vA_post 217 | 218 | def in_between(self, theta_right, theta_dif, theta_left): 219 | if abs(theta_right - theta_left) <= np.pi: 220 | if theta_right <= theta_dif <= theta_left: 221 | return True 222 | else: 223 | return False 224 | else: 225 | if (theta_left <0) and (theta_right >0): 226 | theta_left += 2*np.pi 227 | if theta_dif < 0: 228 | theta_dif += 2*np.pi 229 | if theta_right <= theta_dif <= theta_left: 230 | return True 231 | else: 232 | return False 233 | if (theta_left >0) and (theta_right <0): 234 | theta_right += 2*np.pi 235 | if theta_dif < 0: 236 | theta_dif += 2*np.pi 237 | if theta_left <= theta_dif <= theta_right: 238 | return True 239 | else: 240 | return False 241 | 242 | -------------------------------------------------------------------------------- /train_rl_agents.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import itertools 4 | from multiprocessing import Pool 5 | import json 6 | from datetime import datetime 7 | import numpy as np 8 | from marinenav_env.envs.marinenav_env import MarineNavEnv2 9 | from policy.agent import Agent 10 | from policy.trainer import Trainer 11 | 12 | parser = argparse.ArgumentParser(description="Train IQN model") 13 | 14 | parser.add_argument( 15 | "-C", 16 | "--config-file", 17 | dest="config_file", 18 | type=open, 19 | required=True, 20 | help="configuration file for training parameters", 21 | ) 22 | parser.add_argument( 23 | "-P", 24 | "--num-procs", 25 | dest="num_procs", 26 | type=int, 27 | default=1, 28 | help="number of subprocess workers to use for trial parallelization", 29 | ) 30 | parser.add_argument( 31 | "-D", 32 | "--device", 33 | dest="device", 34 | type=str, 35 | default="cpu", 36 | help="device to run all subprocesses, could only specify 1 device in each run" 37 | ) 38 | 39 | def product(*args, repeat=1): 40 | # This function is a modified version of 41 | # https://docs.python.org/3/library/itertools.html#itertools.product 42 | pools = [tuple(pool) for pool in args] * repeat 43 | result = [[]] 44 | for pool in pools: 45 | result = [x+[y] for x in result for y in pool] 46 | for prod in result: 47 | yield tuple(prod) 48 | 49 | def trial_params(params): 50 | if isinstance(params,(str,int,float)): 51 | return [params] 52 | elif isinstance(params,list): 53 | return params 54 | elif isinstance(params, dict): 55 | keys, vals = zip(*params.items()) 56 | mix_vals = [] 57 | for val in vals: 58 | val = trial_params(val) 59 | mix_vals.append(val) 60 | return [dict(zip(keys, mix_val)) for mix_val in itertools.product(*mix_vals)] 61 | else: 62 | raise TypeError("Parameter type is incorrect.") 63 | 64 | def params_dashboard(params): 65 | print("\n====== Training Setup ======\n") 66 | print("seed: ",params["seed"]) 67 | print("total_timesteps: ",params["total_timesteps"]) 68 | print("eval_freq: ",params["eval_freq"]) 69 | print("use_iqn: ",params["use_iqn"]) 70 | print("\n") 71 | 72 | def run_trial(device,params): 73 | exp_dir = os.path.join(params["save_dir"], 74 | "training_"+params["training_time"], 75 | "seed_"+str(params["seed"])) 76 | os.makedirs(exp_dir) 77 | 78 | param_file = os.path.join(exp_dir,"trial_config.json") 79 | with open(param_file, 'w+') as outfile: 80 | json.dump(params, outfile) 81 | 82 | train_env = MarineNavEnv2(seed=params["seed"],schedule=params["training_schedule"]) 83 | 84 | eval_env = MarineNavEnv2(seed=253) 85 | 86 | cooperative_agent = Agent(cooperative=True,device=device,use_iqn=params["use_iqn"],seed=params["seed"]+100) 87 | 88 | if "load_model" in params: 89 | cooperative_agent.load_model(params["load_model"],"cooperative",device) 90 | 91 | trainer = Trainer(train_env=train_env, 92 | eval_env=eval_env, 93 | eval_schedule=params["eval_schedule"], 94 | cooperative_agent=cooperative_agent, 95 | non_cooperative_agent=None, 96 | ) 97 | 98 | trainer.save_eval_config(exp_dir) 99 | 100 | trainer.learn(total_timesteps=params["total_timesteps"], 101 | eval_freq=params["eval_freq"], 102 | eval_log_path=exp_dir) 103 | 104 | if __name__ == "__main__": 105 | args = parser.parse_args() 106 | params = json.load(args.config_file) 107 | params_dashboard(params) 108 | training_schedule = params.pop("training_schedule") 109 | eval_schedule = params.pop("eval_schedule") 110 | 111 | trial_param_list = trial_params(params) 112 | 113 | dt = datetime.now() 114 | timestamp = dt.strftime("%Y-%m-%d-%H-%M-%S") 115 | 116 | if args.num_procs == 1: 117 | for param in trial_param_list: 118 | param["training_time"]=timestamp 119 | param["training_schedule"]=training_schedule 120 | param["eval_schedule"]=eval_schedule 121 | run_trial(args.device,param) 122 | else: 123 | with Pool(processes=args.num_procs) as pool: 124 | for param in trial_param_list: 125 | param["training_time"]=timestamp 126 | param["training_schedule"]=training_schedule 127 | param["eval_schedule"]=eval_schedule 128 | pool.apply_async(run_trial,(args.device,param)) 129 | 130 | pool.close() 131 | pool.join() 132 | 133 | --------------------------------------------------------------------------------