├── README.md
├── agents
│   ├── __pycache__
│   │   ├── __init__.cpython-37.pyc
│   │   ├── models.cpython-37.pyc
│   │   ├── policies.cpython-37.pyc
│   │   └── utils.cpython-37.pyc
│   ├── models.py
│   ├── policies.py
│   └── utils.py
├── config
│   ├── config_greedy.ini
│   ├── config_ia2c.ini
│   ├── config_ia2c_cu.ini
│   ├── config_ia2c_fp.ini
│   ├── config_ma2c_dial.ini
│   ├── config_ma2c_ic3.ini
│   └── config_ma2c_nc.ini
├── envs
│   ├── __pycache__
│   │   ├── __init__.cpython-37.pyc
│   │   ├── env.cpython-37.pyc
│   │   └── large_grid_env.cpython-37.pyc
│   ├── data
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-37.pyc
│   │   │   └── build_file.cpython-37.pyc
│   │   ├── build_file.py
│   │   ├── intersection.pdf
│   │   ├── network.pdf
│   │   └── view.xml
│   ├── env.py
│   └── large_grid_env.py
├── main.py
└── utils.py

/README.md:
--------------------------------------------------------------------------------
1 | # Networked Multi-agent Deep RL
2 | This repo implements state-of-the-art methods for deep RL in a networked multi-agent system, where the observability and communication of each agent are limited to its neighborhood. For a fair comparison, all methods are applied to A2C agents.
3 | Under construction ...
4 | 
5 | Available IA2C algorithms:
6 | * PolicyInferring: [Lowe, Ryan, et al. "Multi-agent actor-critic for mixed cooperative-competitive environments." Advances in Neural Information Processing Systems, 2017.](https://papers.nips.cc/paper/7217-multi-agent-actor-critic-for-mixed-cooperative-competitive-environments.pdf)
7 | * FingerPrint: [Foerster, Jakob, et al. "Stabilising experience replay for deep multi-agent reinforcement learning." arXiv preprint arXiv:1702.08887, 2017.](https://arxiv.org/pdf/1702.08887.pdf)
8 | * ConsensusUpdate: [Zhang, Kaiqing, et al. "Fully decentralized multi-agent reinforcement learning with networked agents." arXiv preprint arXiv:1802.08757, 2018.](https://arxiv.org/pdf/1802.08757.pdf)
9 | 
10 | 
11 | Available MA2C algorithms:
12 | * DIAL: [Foerster, Jakob, et al. "Learning to communicate with deep multi-agent reinforcement learning." Advances in Neural Information Processing Systems, 2016.](http://papers.nips.cc/paper/6042-learning-to-communicate-with-deep-multi-agent-reinforcement-learning.pdf)
13 | * CommNet: [Sukhbaatar, Sainbayar, et al. "Learning multiagent communication with backpropagation." Advances in Neural Information Processing Systems, 2016.](https://arxiv.org/pdf/1605.07736.pdf)
14 | * NeurComm: [Gilmer, Justin, et al. "Neural message passing for quantum chemistry."
arXiv preprint arXiv:1704.01212, 2017.](https://arxiv.org/pdf/1704.01212.pdf) 15 | 16 | ## Requirements 17 | * Python3 18 | * [Tensorflow](http://www.tensorflow.org/install) 19 | * [SUMO](http://sumo.dlr.de/wiki/Installing) 20 | 21 | 22 | -------------------------------------------------------------------------------- /agents/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MUmarJaved/MultiAgent-Distributed-Reinforcement-Learning/d5a0d7226011f7050f643b66e51e141277dd0e58/agents/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /agents/__pycache__/models.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MUmarJaved/MultiAgent-Distributed-Reinforcement-Learning/d5a0d7226011f7050f643b66e51e141277dd0e58/agents/__pycache__/models.cpython-37.pyc -------------------------------------------------------------------------------- /agents/__pycache__/policies.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MUmarJaved/MultiAgent-Distributed-Reinforcement-Learning/d5a0d7226011f7050f643b66e51e141277dd0e58/agents/__pycache__/policies.cpython-37.pyc -------------------------------------------------------------------------------- /agents/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MUmarJaved/MultiAgent-Distributed-Reinforcement-Learning/d5a0d7226011f7050f643b66e51e141277dd0e58/agents/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /agents/models.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | from agents.utils import OnPolicyBuffer, MultiAgentOnPolicyBuffer, Scheduler 4 | from agents.policies import (LstmPolicy, FPPolicy, ConsensusPolicy, NCMultiAgentPolicy, 5 | IC3MultiAgentPolicy, DIALMultiAgentPolicy) 6 | import logging 7 | import numpy as np 8 | import tensorflow as tf 9 | 10 | 11 | class IA2C: 12 | """ 13 | The basic IA2C implementation with decentralized actor and centralized critic, 14 | limited to neighborhood area only. 
15 | """ 16 | def __init__(self, n_s_ls, n_a, neighbor_mask, distance_mask, coop_gamma, 17 | total_step, model_config, seed=0): 18 | self.name = 'ia2c' 19 | self._init_algo(n_s_ls, n_a, neighbor_mask, distance_mask, coop_gamma, 20 | total_step, seed, model_config) 21 | 22 | def add_transition(self, ob, naction, action, reward, value, done): 23 | if self.reward_norm > 0: 24 | reward = reward / self.reward_norm 25 | if self.reward_clip > 0: 26 | reward = np.clip(reward, -self.reward_clip, self.reward_clip) 27 | for i in range(self.n_agent): 28 | self.trans_buffer[i].add_transition(ob[i], naction[i], action[i], reward, value[i], done) 29 | 30 | def backward(self, Rends, dt, summary_writer=None, global_step=None): 31 | cur_lr = self.lr_scheduler.get(self.n_step) 32 | for i in range(self.n_agent): 33 | obs, nas, acts, dones, Rs, Advs = self.trans_buffer[i].sample_transition(Rends[i], dt) 34 | if i == 0: 35 | self.policy[i].backward(self.sess, obs, nas, acts, dones, Rs, Advs, cur_lr, 36 | summary_writer=summary_writer, global_step=global_step) 37 | else: 38 | self.policy[i].backward(self.sess, obs, nas, acts, dones, Rs, Advs, cur_lr) 39 | 40 | def forward(self, obs, done, nactions=None, out_type='p'): 41 | out = [] 42 | if nactions is None: 43 | nactions = [None] * self.n_agent 44 | for i in range(self.n_agent): 45 | cur_out = self.policy[i].forward(self.sess, obs[i], done, nactions[i], out_type) 46 | out.append(cur_out) 47 | return np.array(out) 48 | 49 | def load(self, model_dir, checkpoint=None): 50 | save_file = None 51 | save_step = 0 52 | if os.path.exists(model_dir): 53 | if checkpoint is None: 54 | for file in os.listdir(model_dir): 55 | if file.startswith('checkpoint'): 56 | prefix = file.split('.')[0] 57 | tokens = prefix.split('-') 58 | if len(tokens) != 2: 59 | continue 60 | cur_step = int(tokens[1]) 61 | if cur_step > save_step: 62 | save_file = prefix 63 | save_step = cur_step 64 | else: 65 | save_file = 'checkpoint-' + str(int(checkpoint)) 66 | if save_file is not None: 67 | self.saver.restore(self.sess, model_dir + save_file) 68 | logging.info('Checkpoint loaded: %s' % save_file) 69 | return True 70 | logging.error('Can not find old checkpoint for %s' % model_dir) 71 | return False 72 | 73 | def save(self, model_dir, global_step): 74 | self.saver.save(self.sess, model_dir + 'checkpoint', global_step=global_step) 75 | 76 | def _init_algo(self, n_s_ls, n_a, neighbor_mask, distance_mask, coop_gamma, 77 | total_step, seed, model_config): 78 | # init params 79 | if self.name.startswith('ia2c'): 80 | self.n_s_ls = n_s_ls 81 | else: 82 | self.n_s = n_s_ls 83 | self.n_a = n_a 84 | self.neighbor_mask = neighbor_mask 85 | self.n_agent = len(self.neighbor_mask) 86 | self.reward_clip = model_config.getfloat('reward_clip') 87 | self.reward_norm = model_config.getfloat('reward_norm') 88 | self.n_step = model_config.getint('batch_size') 89 | self.n_fc = model_config.getint('num_fc') 90 | self.n_lstm = model_config.getint('num_lstm') 91 | # init tf 92 | tf.reset_default_graph() 93 | tf.set_random_seed(seed) 94 | config = tf.ConfigProto(allow_soft_placement=True) 95 | self.sess = tf.Session(config=config) 96 | self.policy = self._init_policy() 97 | self.saver = tf.train.Saver(max_to_keep=5) 98 | # init exp buffer and lr scheduler for training 99 | if total_step: 100 | self.total_step = total_step 101 | self._init_train(model_config, distance_mask, coop_gamma) 102 | self.sess.run(tf.global_variables_initializer()) 103 | 104 | def _init_policy(self): 105 | policy = [] 106 | for i in 
range(self.n_agent): 107 | n_n = np.sum(self.neighbor_mask[i]) 108 | policy.append(LstmPolicy(self.n_s_ls[i], self.n_a, n_n, self.n_step, 109 | n_fc=self.n_fc, n_lstm=self.n_lstm, name='%d' % i)) 110 | return policy 111 | 112 | def _init_scheduler(self, model_config): 113 | # init lr scheduler 114 | lr_init = model_config.getfloat('lr_init') 115 | lr_decay = model_config.get('lr_decay') 116 | if lr_decay == 'constant': 117 | self.lr_scheduler = Scheduler(lr_init, decay=lr_decay) 118 | else: 119 | lr_min = model_config.getfloat('lr_min') 120 | self.lr_scheduler = Scheduler(lr_init, lr_min, self.total_step, decay=lr_decay) 121 | 122 | def _init_train(self, model_config, distance_mask, coop_gamma): 123 | # init lr scheduler 124 | self._init_scheduler(model_config) 125 | v_coef = model_config.getfloat('value_coef') 126 | e_coef = model_config.getfloat('entropy_coef') 127 | max_grad_norm = model_config.getfloat('max_grad_norm') 128 | alpha = model_config.getfloat('rmsp_alpha') 129 | epsilon = model_config.getfloat('rmsp_epsilon') 130 | gamma = model_config.getfloat('gamma') 131 | self.trans_buffer = [] 132 | for i in range(self.n_agent): 133 | # init loss 134 | self.policy[i].prepare_loss(v_coef, e_coef, max_grad_norm, alpha, epsilon) 135 | # init replay buffer 136 | self.trans_buffer.append(OnPolicyBuffer(gamma, coop_gamma, distance_mask[i])) 137 | 138 | 139 | class IA2C_FP(IA2C): 140 | """ 141 | In fingerprint IA2C, neighborhood policies (fingerprints) are also included. 142 | """ 143 | def __init__(self, n_s_ls, n_a, neighbor_mask, distance_mask, coop_gamma, 144 | total_step, model_config, seed=0): 145 | self.name = 'ia2c_fp' 146 | self._init_algo(n_s_ls, n_a, neighbor_mask, distance_mask, coop_gamma, 147 | total_step, seed, model_config) 148 | 149 | def _init_policy(self): 150 | policy = [] 151 | for i in range(self.n_agent): 152 | n_n = np.sum(self.neighbor_mask[i]) 153 | # neighborhood policies are included in local state 154 | n_s1 = self.n_s_ls[i] + self.n_a*n_n 155 | policy.append(FPPolicy(n_s1, self.n_a, n_n, self.n_step, n_fc=self.n_fc, 156 | n_lstm=self.n_lstm, name='%d' % i)) 157 | return policy 158 | 159 | 160 | class MA2C_NC(IA2C): 161 | def __init__(self, n_s, n_a, neighbor_mask, distance_mask, coop_gamma, 162 | total_step, model_config, seed=0): 163 | self.name = 'ma2c_nc' 164 | self._init_algo(n_s, n_a, neighbor_mask, distance_mask, coop_gamma, 165 | total_step, seed, model_config) 166 | 167 | def add_transition(self, ob, p, action, reward, value, done): 168 | if self.reward_norm > 0: 169 | reward = reward / self.reward_norm 170 | if self.reward_clip > 0: 171 | reward = np.clip(reward, -self.reward_clip, self.reward_clip) 172 | self.trans_buffer.add_transition(ob, p, action, reward, value, done) 173 | 174 | def backward(self, Rends, dt, summary_writer=None, global_step=None): 175 | cur_lr = self.lr_scheduler.get(self.n_step) 176 | obs, ps, acts, dones, Rs, Advs = self.trans_buffer.sample_transition(Rends, dt) 177 | self.policy.backward(self.sess, obs, ps, acts, dones, Rs, Advs, cur_lr, 178 | summary_writer=summary_writer, global_step=global_step) 179 | 180 | def forward(self, obs, done, ps, actions=None, out_type='p'): 181 | return self.policy.forward(self.sess, obs, done, ps, actions, out_type) 182 | 183 | def _init_policy(self): 184 | return NCMultiAgentPolicy(self.n_s, self.n_a, self.n_agent, self.n_step, 185 | self.neighbor_mask, n_fc=self.n_fc, n_h=self.n_lstm) 186 | 187 | def _init_train(self, model_config, distance_mask, coop_gamma): 188 | # init lr scheduler 189 | 
self._init_scheduler(model_config) 190 | v_coef = model_config.getfloat('value_coef') 191 | e_coef = model_config.getfloat('entropy_coef') 192 | max_grad_norm = model_config.getfloat('max_grad_norm') 193 | alpha = model_config.getfloat('rmsp_alpha') 194 | epsilon = model_config.getfloat('rmsp_epsilon') 195 | gamma = model_config.getfloat('gamma') 196 | # init loss 197 | self.policy.prepare_loss(v_coef, e_coef, max_grad_norm, alpha, epsilon) 198 | # init replay buffer 199 | self.trans_buffer = MultiAgentOnPolicyBuffer(gamma, coop_gamma, distance_mask) 200 | 201 | 202 | class IA2C_CU(MA2C_NC): 203 | def __init__(self, n_s, n_a, neighbor_mask, distance_mask, coop_gamma, 204 | total_step, model_config, seed=0): 205 | self.name = 'ma2c_cu' 206 | self._init_algo(n_s, n_a, neighbor_mask, distance_mask, coop_gamma, 207 | total_step, seed, model_config) 208 | 209 | def _init_policy(self): 210 | return ConsensusPolicy(self.n_s, self.n_a, self.n_agent, self.n_step, 211 | self.neighbor_mask, n_fc=self.n_fc, n_h=self.n_lstm) 212 | 213 | 214 | class MA2C_IC3(MA2C_NC): 215 | def __init__(self, n_s, n_a, neighbor_mask, distance_mask, coop_gamma, 216 | total_step, model_config, seed=0): 217 | self.name = 'ma2c_ic3' 218 | self._init_algo(n_s, n_a, neighbor_mask, distance_mask, coop_gamma, 219 | total_step, seed, model_config) 220 | 221 | def _init_policy(self): 222 | return IC3MultiAgentPolicy(self.n_s, self.n_a, self.n_agent, self.n_step, 223 | self.neighbor_mask, n_fc=self.n_fc, n_h=self.n_lstm) 224 | 225 | 226 | class MA2C_DIAL(MA2C_NC): 227 | def __init__(self, n_s, n_a, neighbor_mask, distance_mask, coop_gamma, 228 | total_step, model_config, seed=0): 229 | self.name = 'ma2c_dial' 230 | self._init_algo(n_s, n_a, neighbor_mask, distance_mask, coop_gamma, 231 | total_step, seed, model_config) 232 | 233 | def _init_policy(self): 234 | return DIALMultiAgentPolicy(self.n_s, self.n_a, self.n_agent, self.n_step, 235 | self.neighbor_mask, n_fc=self.n_fc, n_h=self.n_lstm) 236 | -------------------------------------------------------------------------------- /agents/policies.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from agents.utils import * 4 | 5 | 6 | class Policy: 7 | def __init__(self, n_a, n_s, n_step, policy_name, agent_name): 8 | self.name = policy_name 9 | if agent_name is not None: 10 | # for multi-agent system 11 | self.name += '_' + str(agent_name) 12 | self.n_a = n_a 13 | self.n_s = n_s 14 | self.n_step = n_step 15 | 16 | def forward(self, ob, *_args, **_kwargs): 17 | raise NotImplementedError() 18 | 19 | def prepare_loss(self, v_coef, e_coef, max_grad_norm, alpha, epsilon): 20 | self.A = tf.placeholder(tf.int32, [self.n_step]) 21 | self.ADV = tf.placeholder(tf.float32, [self.n_step]) 22 | self.R = tf.placeholder(tf.float32, [self.n_step]) 23 | A_sparse = tf.one_hot(self.A, self.n_a) 24 | log_pi = tf.log(tf.clip_by_value(self.pi, 1e-10, 1.0)) 25 | entropy = -tf.reduce_sum(self.pi * log_pi, axis=1) 26 | entropy_loss = -tf.reduce_mean(entropy) * e_coef 27 | policy_loss = -tf.reduce_mean(tf.reduce_sum(log_pi * A_sparse, axis=1) * self.ADV) 28 | value_loss = tf.reduce_mean(tf.square(self.R - self.v)) * 0.5 * v_coef 29 | self.loss = policy_loss + value_loss + entropy_loss 30 | 31 | wts = tf.trainable_variables(scope=self.name) 32 | grads = tf.gradients(self.loss, wts) 33 | if max_grad_norm > 0: 34 | grads, self.grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) 35 | self.lr = tf.placeholder(tf.float32, 
[]) 36 | self.optimizer = tf.train.RMSPropOptimizer(learning_rate=self.lr, decay=alpha, 37 | epsilon=epsilon) 38 | self._train = self.optimizer.apply_gradients(list(zip(grads, wts))) 39 | # monitor training 40 | summaries = [] 41 | summaries.append(tf.summary.scalar('loss/%s_entropy_loss' % self.name, entropy_loss)) 42 | summaries.append(tf.summary.scalar('loss/%s_policy_loss' % self.name, policy_loss)) 43 | summaries.append(tf.summary.scalar('loss/%s_value_loss' % self.name, value_loss)) 44 | summaries.append(tf.summary.scalar('loss/%s_total_loss' % self.name, self.loss)) 45 | summaries.append(tf.summary.scalar('train/%s_lr' % self.name, self.lr)) 46 | summaries.append(tf.summary.scalar('train/%s_gradnorm' % self.name, self.grad_norm)) 47 | self.summary = tf.summary.merge(summaries) 48 | 49 | def _build_actor_head(self, h, agent_name=None): 50 | name = 'pi' 51 | if agent_name is not None: 52 | name += '_' + str(agent_name) 53 | pi = fc(h, name, self.n_a, act=tf.nn.softmax) 54 | return pi 55 | 56 | def _build_critic_head(self, h, na, n_n=None, agent_name=None): 57 | name = 'v' 58 | if agent_name is not None: 59 | name += '_' + str(agent_name) 60 | if n_n is None: 61 | n_n = na.shape[-1] 62 | na_sparse = tf.one_hot(na, self.n_a, axis=-1) 63 | na_sparse = tf.reshape(na_sparse, [-1, self.n_a*n_n]) 64 | h = tf.concat([h, na_sparse], 1) 65 | v = fc(h, name, 1, act=lambda x: x) 66 | return v 67 | 68 | 69 | class LstmPolicy(Policy): 70 | def __init__(self, n_s, n_a, n_n, n_step, n_fc=64, n_lstm=64, name=None): 71 | super().__init__(n_a, n_s, n_step, 'lstm', name) 72 | self.n_lstm = n_lstm 73 | self.n_fc = n_fc 74 | self.n_n = n_n 75 | self.ob_fw = tf.placeholder(tf.float32, [1, n_s]) # forward 1-step 76 | self.naction_fw = tf.placeholder(tf.int32, [1, n_n]) 77 | self.done_fw = tf.placeholder(tf.float32, [1]) 78 | self.ob_bw = tf.placeholder(tf.float32, [n_step, n_s]) # backward n-step 79 | self.naction_bw = tf.placeholder(tf.int32, [n_step, n_n]) 80 | self.done_bw = tf.placeholder(tf.float32, [n_step]) 81 | self.states = tf.placeholder(tf.float32, [n_lstm * 2]) 82 | with tf.variable_scope(self.name): 83 | self.pi_fw, self.v_fw, self.new_states = self._build_net('forward') 84 | with tf.variable_scope(self.name, reuse=True): 85 | self.pi, self.v, _ = self._build_net('backward') 86 | self._reset() 87 | 88 | def backward(self, sess, obs, nactions, acts, dones, Rs, Advs, cur_lr, 89 | summary_writer=None, global_step=None): 90 | summary, _ = sess.run([self.summary, self._train], 91 | {self.ob_bw: obs, 92 | self.naction_bw: nactions, 93 | self.done_bw: dones, 94 | self.states: self.states_bw, 95 | self.A: acts, 96 | self.ADV: Advs, 97 | self.R: Rs, 98 | self.lr: cur_lr}) 99 | self.states_bw = np.copy(self.states_fw) 100 | if summary_writer is not None: 101 | summary_writer.add_summary(summary, global_step=global_step) 102 | 103 | def forward(self, sess, ob, done, naction=None, out_type='p'): 104 | # update state only when p is called 105 | ins = {self.ob_fw: np.array([ob]), 106 | self.done_fw: np.array([done]), 107 | self.states: self.states_fw} 108 | if out_type.startswith('p'): 109 | outs = [self.pi_fw, self.new_states] 110 | else: 111 | outs = [self.v_fw] 112 | ins[self.naction_fw] = np.array([naction]) 113 | out_values = sess.run(outs, ins) 114 | out_value = out_values[0] 115 | if out_type.startswith('p'): 116 | self.states_fw = out_values[-1] 117 | return out_value 118 | 119 | def _build_net(self, in_type): 120 | if in_type == 'forward': 121 | ob = self.ob_fw 122 | done = self.done_fw 123 | naction 
= self.naction_fw 124 | else: 125 | ob = self.ob_bw 126 | done = self.done_bw 127 | naction = self.naction_bw 128 | h = fc(ob, 'fc', self.n_fc) 129 | h, new_states = lstm(h, done, self.states, 'lstm') 130 | pi = self._build_actor_head(h) 131 | v = self._build_critic_head(h, naction) 132 | return tf.squeeze(pi), tf.squeeze(v), new_states 133 | 134 | def _reset(self): 135 | # forget the cumulative states every cum_step 136 | self.states_fw = np.zeros(self.n_lstm * 2, dtype=np.float32) 137 | self.states_bw = np.zeros(self.n_lstm * 2, dtype=np.float32) 138 | 139 | 140 | class FPPolicy(LstmPolicy): 141 | def __init__(self, n_s, n_a, n_n, n_step, n_fc=64, n_lstm=64, name=None): 142 | super().__init__(n_s, n_a, n_n, n_step, n_fc, n_lstm, name) 143 | 144 | def _build_net(self, in_type): 145 | if in_type == 'forward': 146 | ob = self.ob_fw 147 | done = self.done_fw 148 | naction = self.naction_fw 149 | else: 150 | ob = self.ob_bw 151 | done = self.done_bw 152 | naction = self.naction_bw 153 | n_x = int(self.n_s - self.n_n * self.n_a) 154 | hx = fc(ob[:,:n_x], 'fcs', self.n_fc) 155 | hp = fc(ob[:,n_x:], 'fcp', self.n_fc) 156 | h = tf.concat([hx, hp], axis=1) 157 | h, new_states = lstm(h, done, self.states, 'lstm') 158 | pi = self._build_actor_head(h) 159 | v = self._build_critic_head(h, naction) 160 | return tf.squeeze(pi), tf.squeeze(v), new_states 161 | 162 | 163 | class NCMultiAgentPolicy(Policy): 164 | """ Inplemented as a centralized agent. To simplify the implementation, all input 165 | and output dimensions are identical among all agents, and invalid values are casted as 166 | zeros during runtime.""" 167 | def __init__(self, n_s, n_a, n_agent, n_step, neighbor_mask, n_fc=64, n_h=64): 168 | super().__init__(n_a, n_s, n_step, 'nc', None) 169 | self._init_policy(n_agent, neighbor_mask, n_h) 170 | 171 | def backward(self, sess, obs, policies, acts, dones, Rs, Advs, cur_lr, 172 | summary_writer=None, global_step=None): 173 | summary, _ = sess.run([self.summary, self._train], 174 | {self.ob_bw: obs, 175 | self.policy_bw: policies, 176 | self.action_bw: acts, 177 | self.done_bw: dones, 178 | self.states: self.states_bw, 179 | self.ADV: Advs, 180 | self.R: Rs, 181 | self.lr: cur_lr}) 182 | self.states_bw = np.copy(self.states_fw) 183 | if summary_writer is not None: 184 | summary_writer.add_summary(summary, global_step=global_step) 185 | 186 | def forward(self, sess, ob, done, policy, action=None, out_type='p'): 187 | # update state only when p is called 188 | ins = {self.ob_fw: np.expand_dims(ob, axis=1), 189 | self.done_fw: np.expand_dims(done, axis=1), 190 | self.policy_fw: np.expand_dims(policy, axis=1), 191 | self.states: self.states_fw} 192 | if out_type.startswith('p'): 193 | outs = [self.pi_fw, self.new_states] 194 | else: 195 | outs = [self.v_fw] 196 | ins[self.action_fw] = np.expand_dims(action, axis=1) 197 | out_values = sess.run(outs, ins) 198 | out_value = out_values[0] 199 | if out_type.startswith('p'): 200 | self.states_fw = out_values[-1] 201 | return out_value 202 | 203 | def prepare_loss(self, v_coef, e_coef, max_grad_norm, alpha, epsilon): 204 | self.ADV = tf.placeholder(tf.float32, [self.n_agent, self.n_step]) 205 | self.R = tf.placeholder(tf.float32, [self.n_agent, self.n_step]) 206 | A_sparse = tf.one_hot(self.action_bw, self.n_a) 207 | # all losses are averaged over steps but summed over agents 208 | log_pi = tf.log(tf.clip_by_value(self.pi, 1e-10, 1.0)) 209 | entropy = -tf.reduce_sum(self.pi * log_pi, axis=-1) 210 | entropy_loss = -tf.reduce_sum(tf.reduce_mean(entropy, 
axis=-1)) * e_coef 211 | policy_loss = -tf.reduce_sum(tf.reduce_mean(tf.reduce_sum(log_pi * A_sparse, axis=-1) * self.ADV, axis=-1)) 212 | value_loss = tf.reduce_sum(tf.reduce_mean(tf.square(self.R - self.v), axis=-1)) * 0.5 * v_coef 213 | self.loss = policy_loss + value_loss + entropy_loss 214 | 215 | wts = tf.trainable_variables(scope=self.name) 216 | grads = tf.gradients(self.loss, wts) 217 | if max_grad_norm > 0: 218 | grads, self.grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) 219 | self.lr = tf.placeholder(tf.float32, []) 220 | self.optimizer = tf.train.RMSPropOptimizer(learning_rate=self.lr, decay=alpha, 221 | epsilon=epsilon) 222 | self._train = self.optimizer.apply_gradients(list(zip(grads, wts))) 223 | # monitor training 224 | summaries = [] 225 | summaries.append(tf.summary.scalar('loss/%s_entropy_loss' % self.name, entropy_loss)) 226 | summaries.append(tf.summary.scalar('loss/%s_policy_loss' % self.name, policy_loss)) 227 | summaries.append(tf.summary.scalar('loss/%s_value_loss' % self.name, value_loss)) 228 | summaries.append(tf.summary.scalar('loss/%s_total_loss' % self.name, self.loss)) 229 | summaries.append(tf.summary.scalar('train/%s_lr' % self.name, self.lr)) 230 | summaries.append(tf.summary.scalar('train/%s_gradnorm' % self.name, self.grad_norm)) 231 | self.summary = tf.summary.merge(summaries) 232 | 233 | def _build_net(self, in_type): 234 | if in_type == 'forward': 235 | ob = self.ob_fw 236 | policy = self.policy_fw 237 | action = self.action_fw 238 | done = self.done_fw 239 | else: 240 | ob = self.ob_bw 241 | policy = self.policy_bw 242 | action = self.action_bw 243 | done = self.done_bw 244 | h, new_states = lstm_comm_new(ob, policy, done, self.neighbor_mask, self.states, 'lstm_comm') 245 | pi_ls = [] 246 | v_ls = [] 247 | for i in range(self.n_agent): 248 | h_i = h[i] # Txn_h 249 | naction_i = tf.transpose(tf.boolean_mask(action, self.neighbor_mask[i])) # Txn_n 250 | pi = self._build_actor_head(h_i, agent_name='%d' % i) 251 | v = self._build_critic_head(h_i, naction_i, n_n=int(np.sum(self.neighbor_mask[i])), 252 | agent_name='%d' % i) 253 | pi_ls.append(tf.expand_dims(pi, axis=0)) 254 | v_ls.append(tf.expand_dims(v, axis=0)) 255 | return tf.squeeze(tf.concat(pi_ls, axis=0)), tf.squeeze(tf.concat(v_ls, axis=0)), new_states 256 | 257 | def _init_policy(self, n_agent, neighbor_mask, n_h): 258 | self.n_agent = n_agent 259 | self.neighbor_mask = neighbor_mask #n_agent x n_agent 260 | self.n_h = n_h 261 | self.ob_fw = tf.placeholder(tf.float32, [n_agent, 1, self.n_s]) # forward 1-step 262 | self.policy_fw = tf.placeholder(tf.float32, [n_agent, 1, self.n_a]) 263 | self.action_fw = tf.placeholder(tf.int32, [n_agent, 1]) 264 | self.done_fw = tf.placeholder(tf.float32, [1]) 265 | self.ob_bw = tf.placeholder(tf.float32, [n_agent, self.n_step, self.n_s]) # backward n-step 266 | self.policy_bw = tf.placeholder(tf.float32, [n_agent, self.n_step, self.n_a]) 267 | self.action_bw = tf.placeholder(tf.int32, [n_agent, self.n_step]) 268 | self.done_bw = tf.placeholder(tf.float32, [self.n_step]) 269 | self.states = tf.placeholder(tf.float32, [n_agent, n_h * 2]) 270 | 271 | with tf.variable_scope(self.name): 272 | self.pi_fw, self.v_fw, self.new_states = self._build_net('forward') 273 | with tf.variable_scope(self.name, reuse=True): 274 | self.pi, self.v, _ = self._build_net('backward') 275 | self._reset() 276 | 277 | def _reset(self): 278 | self.states_fw = np.zeros((self.n_agent, self.n_h * 2), dtype=np.float32) 279 | self.states_bw = np.zeros((self.n_agent, self.n_h * 2), 
dtype=np.float32) 280 | 281 | 282 | class ConsensusPolicy(NCMultiAgentPolicy): 283 | def __init__(self, n_s, n_a, n_agent, n_step, neighbor_mask, n_fc=64, n_h=64): 284 | Policy.__init__(self, n_a, n_s, n_step, 'cu', None) 285 | self.n_agent = n_agent 286 | self.n_h = n_h 287 | self.neighbor_mask = neighbor_mask 288 | self._init_policy(n_agent, neighbor_mask, n_h) 289 | 290 | def backward(self, sess, obs, policies, acts, dones, Rs, Advs, cur_lr, 291 | summary_writer=None, global_step=None): 292 | super().backward(sess, obs, policies, acts, dones, Rs, Advs, cur_lr, 293 | summary_writer, global_step) 294 | sess.run(self._consensus_update) 295 | 296 | def prepare_loss(self, v_coef, e_coef, max_grad_norm, alpha, epsilon): 297 | super().prepare_loss(v_coef, e_coef, max_grad_norm, alpha, epsilon) 298 | consensus_update = [] 299 | for i in range(self.n_agent): 300 | wt_from, wt_to = self._get_critic_wts(i) 301 | for w1, w2 in zip(wt_from, wt_to): 302 | consensus_update.append(w2.assign(w1)) 303 | self._consensus_update = tf.group(*consensus_update) 304 | 305 | def _build_net(self, in_type): 306 | if in_type == 'forward': 307 | ob = self.ob_fw 308 | done = self.done_fw 309 | action = self.action_fw 310 | else: 311 | ob = self.ob_bw 312 | done = self.done_bw 313 | action = self.action_bw 314 | pi_ls = [] 315 | v_ls = [] 316 | new_states_ls = [] 317 | for i in range(self.n_agent): 318 | h = fc(ob[i], 'fc_%d' % i, self.n_h) 319 | h, new_states = lstm(h, done, self.states[i], 'lstm_%d' % i) 320 | pi = self._build_actor_head(h, agent_name='%d' % i) 321 | naction = tf.transpose(tf.boolean_mask(action, self.neighbor_mask[i])) 322 | v = self._build_critic_head(h, naction, n_n=int(np.sum(self.neighbor_mask[i])), agent_name='%d' % i) 323 | pi_ls.append(tf.expand_dims(pi, axis=0)) 324 | v_ls.append(tf.expand_dims(v, axis=0)) 325 | new_states_ls.append(tf.expand_dims(new_states, axis=0)) 326 | pi_ls = tf.squeeze(tf.concat(pi_ls, axis=0)) 327 | v_ls = tf.squeeze(tf.concat(v_ls, axis=0)) 328 | new_states_ls = tf.squeeze(tf.concat(new_states_ls, axis=0)) 329 | return pi_ls, v_ls, new_states_ls 330 | 331 | def _get_critic_wts(self, agent_i): 332 | neighbor_mask = self.neighbor_mask[agent_i] 333 | agents = [agent_i] + list(np.where(neighbor_mask == 1)[0]) 334 | wt_i = [] 335 | wt_n = [] 336 | for i in agents: 337 | critic_scope = [self.name + ('/lstm_%d' % i)] 338 | wt = [] 339 | for scope in critic_scope: 340 | wt += tf.trainable_variables(scope=scope) 341 | if i == agent_i: 342 | wt_i = wt 343 | wt_n.append(wt) 344 | mean_wt_n = [] 345 | n_n = len(wt_n) 346 | n_w = len(wt_n[0]) 347 | for i in range(n_w): 348 | cur_wts = [] 349 | for j in range(n_n): 350 | cur_wts.append(tf.expand_dims(wt_n[j][i], axis=-1)) 351 | cur_wts = tf.concat(cur_wts, axis=-1) 352 | cur_wts = tf.reduce_mean(cur_wts, axis=-1) 353 | mean_wt_n.append(cur_wts) 354 | return mean_wt_n, wt_i 355 | 356 | 357 | class IC3MultiAgentPolicy(NCMultiAgentPolicy): 358 | """Reference code: https://github.com/IC3Net/IC3Net/blob/master/comm.py. 
359 | Note in IC3, the message is generated from hidden state only, so current state 360 | and neigbor policies are not included in the inputs.""" 361 | def __init__(self, n_s, n_a, n_agent, n_step, neighbor_mask, n_fc=64, n_h=64): 362 | Policy.__init__(self, n_a, n_s, n_step, 'ic3', None) 363 | self._init_policy(n_agent, neighbor_mask, n_h) 364 | 365 | def _build_net(self, in_type): 366 | if in_type == 'forward': 367 | ob = self.ob_fw 368 | action = self.action_fw 369 | done = self.done_fw 370 | else: 371 | ob = self.ob_bw 372 | action = self.action_bw 373 | done = self.done_bw 374 | h, new_states = lstm_ic3(ob, done, self.neighbor_mask, self.states, 'lstm_ic3') 375 | pi_ls = [] 376 | v_ls = [] 377 | for i in range(self.n_agent): 378 | h_i = h[i] # Txn_h 379 | naction_i = tf.transpose(tf.boolean_mask(action, self.neighbor_mask[i])) # Txn_n 380 | pi = self._build_actor_head(h_i, agent_name='%d' % i) 381 | v = self._build_critic_head(h_i, naction_i, n_n=int(np.sum(self.neighbor_mask[i])), 382 | agent_name='%d' % i) 383 | pi_ls.append(tf.expand_dims(pi, axis=0)) 384 | v_ls.append(tf.expand_dims(v, axis=0)) 385 | return tf.squeeze(tf.concat(pi_ls, axis=0)), tf.squeeze(tf.concat(v_ls, axis=0)), new_states 386 | 387 | 388 | class DIALMultiAgentPolicy(NCMultiAgentPolicy): 389 | def __init__(self, n_s, n_a, n_agent, n_step, neighbor_mask, n_fc=64, n_h=64): 390 | Policy.__init__(self, n_a, n_s, n_step, 'dial', None) 391 | self._init_policy(n_agent, neighbor_mask, n_h) 392 | 393 | def _build_net(self, in_type): 394 | if in_type == 'forward': 395 | ob = self.ob_fw 396 | policy = self.policy_fw 397 | action = self.action_fw 398 | done = self.done_fw 399 | else: 400 | ob = self.ob_bw 401 | policy = self.policy_bw 402 | action = self.action_bw 403 | done = self.done_bw 404 | h, new_states = lstm_dial(ob, policy, done, self.neighbor_mask, self.states, 'lstm_comm') 405 | pi_ls = [] 406 | v_ls = [] 407 | for i in range(self.n_agent): 408 | h_i = h[i] # Txn_h 409 | naction_i = tf.transpose(tf.boolean_mask(action, self.neighbor_mask[i])) # Txn_n 410 | pi = self._build_actor_head(h_i, agent_name='%d' % i) 411 | v = self._build_critic_head(h_i, naction_i, n_n=int(np.sum(self.neighbor_mask[i])), 412 | agent_name='%d' % i) 413 | pi_ls.append(tf.expand_dims(pi, axis=0)) 414 | v_ls.append(tf.expand_dims(v, axis=0)) 415 | return tf.squeeze(tf.concat(pi_ls, axis=0)), tf.squeeze(tf.concat(v_ls, axis=0)), new_states 416 | 417 | -------------------------------------------------------------------------------- /agents/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | """ 5 | initializers 6 | """ 7 | DEFAULT_SCALE = np.sqrt(2) 8 | DEFAULT_MODE = 'fan_in' 9 | 10 | def ortho_init(scale=DEFAULT_SCALE, mode=None): 11 | def _ortho_init(shape, dtype, partition_info=None): 12 | # lasagne ortho init for tf 13 | shape = tuple(shape) 14 | if len(shape) == 2: # fc: in, out 15 | flat_shape = shape 16 | elif (len(shape) == 3) or (len(shape) == 4): # 1d/2dcnn: (in_h), in_w, in_c, out 17 | flat_shape = (np.prod(shape[:-1]), shape[-1]) 18 | a = np.random.standard_normal(flat_shape) 19 | u, _, v = np.linalg.svd(a, full_matrices=False) 20 | q = u if u.shape == flat_shape else v # pick the one with the correct shape 21 | q = q.reshape(shape) 22 | return (scale * q).astype(np.float32) 23 | return _ortho_init 24 | 25 | 26 | def norm_init(scale=DEFAULT_SCALE, mode=DEFAULT_MODE): 27 | def _norm_init(shape, dtype, partition_info=None): 28 | shape 
= tuple(shape) 29 | if len(shape) == 2: 30 | n_in = shape[0] 31 | elif (len(shape) == 3) or (len(shape) == 4): 32 | n_in = np.prod(shape[:-1]) 33 | a = np.random.standard_normal(shape) 34 | if mode == 'fan_in': 35 | n = n_in 36 | elif mode == 'fan_out': 37 | n = shape[-1] 38 | elif mode == 'fan_avg': 39 | n = 0.5 * (n_in + shape[-1]) 40 | return (scale * a / np.sqrt(n)).astype(np.float32) 41 | 42 | DEFAULT_METHOD = ortho_init 43 | """ 44 | layers 45 | """ 46 | def conv(x, scope, n_out, f_size, stride=1, pad='VALID', f_size_w=None, act=tf.nn.relu, 47 | conv_dim=1, init_scale=DEFAULT_SCALE, init_mode=None, init_method=DEFAULT_METHOD): 48 | with tf.variable_scope(scope): 49 | b = tf.get_variable("b", [n_out], initializer=tf.constant_initializer(0.0)) 50 | if conv_dim == 1: 51 | n_c = x.shape[2].value 52 | w = tf.get_variable("w", [f_size, n_c, n_out], 53 | initializer=init_method(init_scale, init_mode)) 54 | z = tf.nn.conv1d(x, w, stride=stride, padding=pad) + b 55 | elif conv_dim == 2: 56 | n_c = x.shape[3].value 57 | if f_size_w is None: 58 | f_size_w = f_size 59 | w = tf.get_variable("w", [f_size, f_size_w, n_c, n_out], 60 | initializer=init_method(init_scale, init_mode)) 61 | z = tf.nn.conv2d(x, w, strides=[1, stride, stride, 1], padding=pad) + b 62 | return act(z) 63 | 64 | 65 | def fc(x, scope, n_out, act=tf.nn.relu, init_scale=DEFAULT_SCALE, 66 | init_mode=DEFAULT_MODE, init_method=DEFAULT_METHOD): 67 | with tf.variable_scope(scope): 68 | n_in = x.shape[1].value 69 | w = tf.get_variable("w", [n_in, n_out], 70 | initializer=init_method(init_scale, init_mode)) 71 | b = tf.get_variable("b", [n_out], initializer=tf.constant_initializer(0.0)) 72 | z = tf.matmul(x, w) + b 73 | return act(z) 74 | 75 | 76 | def batch_to_seq(x): 77 | n_step = x.shape[0].value 78 | if len(x.shape) == 1: 79 | x = tf.expand_dims(x, -1) 80 | return tf.split(axis=0, num_or_size_splits=n_step, value=x) 81 | 82 | 83 | def seq_to_batch(x): 84 | return tf.concat(x, axis=0) 85 | 86 | 87 | def lstm(xs, dones, s, scope, init_scale=DEFAULT_SCALE, init_mode=DEFAULT_MODE, 88 | init_method=DEFAULT_METHOD): 89 | xs = batch_to_seq(xs) 90 | # need dones to reset states 91 | dones = batch_to_seq(dones) 92 | n_in = xs[0].shape[1].value 93 | n_out = s.shape[0] // 2 94 | with tf.variable_scope(scope): 95 | wx = tf.get_variable("wx", [n_in, n_out*4], 96 | initializer=init_method(init_scale, init_mode)) 97 | wh = tf.get_variable("wh", [n_out, n_out*4], 98 | initializer=init_method(init_scale, init_mode)) 99 | b = tf.get_variable("b", [n_out*4], initializer=tf.constant_initializer(0.0)) 100 | s = tf.expand_dims(s, 0) 101 | c, h = tf.split(axis=1, num_or_size_splits=2, value=s) 102 | for ind, (x, done) in enumerate(zip(xs, dones)): 103 | c = c * (1-done) 104 | h = h * (1-done) 105 | z = tf.matmul(x, wx) + tf.matmul(h, wh) + b 106 | i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z) 107 | i = tf.nn.sigmoid(i) 108 | f = tf.nn.sigmoid(f) 109 | o = tf.nn.sigmoid(o) 110 | u = tf.tanh(u) 111 | c = f*c + i*u 112 | h = o*tf.tanh(c) 113 | xs[ind] = h 114 | s = tf.concat(axis=1, values=[c, h]) 115 | return seq_to_batch(xs), tf.squeeze(s) 116 | 117 | 118 | def lstm_comm(xs, ps, dones, masks, s, scope, init_scale=DEFAULT_SCALE, init_mode=DEFAULT_MODE, 119 | init_method=DEFAULT_METHOD): 120 | n_agent = s.shape[0] 121 | n_h = s.shape[1] // 2 122 | n_s = xs.shape[-1] 123 | n_a = ps.shape[-1] 124 | xs = tf.transpose(xs, perm=[1,0,2]) # TxNxn_s 125 | xs = batch_to_seq(xs) 126 | ps = tf.transpose(ps, perm=[1,0,2]) # TxNxn_a 127 | ps = 
batch_to_seq(ps) 128 | # need dones to reset states 129 | dones = batch_to_seq(dones) # Tx1 130 | # create wts 131 | n_in_msg = n_h + n_s + n_a 132 | w_msg = [] 133 | b_msg = [] 134 | wx_hid = [] 135 | wh_hid = [] 136 | b_hid = [] 137 | for i in range(n_agent): 138 | n_m = np.sum(masks[i]) 139 | n_in_hid = n_s + n_h*n_m 140 | with tf.variable_scope(scope + ('_%d' % i)): 141 | w_msg.append(tf.get_variable("w_msg", [n_in_msg, n_h], 142 | initializer=init_method(init_scale, init_mode))) 143 | b_msg.append(tf.get_variable("b_msg", [n_h], 144 | initializer=tf.constant_initializer(0.0))) 145 | wx_hid.append(tf.get_variable("wx_hid", [n_in_hid, n_h*4], 146 | initializer=init_method(init_scale, init_mode))) 147 | wh_hid.append(tf.get_variable("wh_hid", [n_h, n_h*4], 148 | initializer=init_method(init_scale, init_mode))) 149 | b_hid.append(tf.get_variable("b_hid", [n_h*4], 150 | initializer=tf.constant_initializer(0.0))) 151 | c, h = tf.split(axis=1, num_or_size_splits=2, value=s) 152 | # loop over steps 153 | for t, (x, p, done) in enumerate(zip(xs, ps, dones)): 154 | # abuse 1 agent as 1 step 155 | x = batch_to_seq(tf.squeeze(x, axis=0)) 156 | p = batch_to_seq(tf.squeeze(p, axis=0)) 157 | out_h = [] 158 | out_c = [] 159 | out_m = [] 160 | # communication phase 161 | for i, (xi, pi) in enumerate(zip(x, p)): 162 | hi = tf.expand_dims(h[i], axis=0) 163 | si = tf.concat([hi, xi, pi], axis=1) 164 | mi = tf.nn.relu(tf.matmul(si, w_msg[i]) + b_msg[i]) 165 | out_m.append(mi) 166 | out_m = tf.concat(out_m, axis=0) # Nxn_h 167 | # hidden phase 168 | for i, xi in enumerate(x): 169 | ci = tf.expand_dims(c[i], axis=0) 170 | hi = tf.expand_dims(h[i], axis=0) 171 | # reset states for a new episode 172 | ci = ci * (1-done) 173 | hi = hi * (1-done) 174 | # receive neighbor messages 175 | mi = tf.expand_dims(tf.reshape(tf.boolean_mask(out_m, masks[i]), [-1]), axis=0) 176 | # TODO: add additional encoding layers here 177 | si = tf.concat([xi, mi], axis=1) 178 | zi = tf.matmul(si, wx_hid[i]) + tf.matmul(hi, wh_hid[i]) + b_hid[i] 179 | ii, fi, oi, ui = tf.split(axis=1, num_or_size_splits=4, value=zi) 180 | ii = tf.nn.sigmoid(ii) 181 | fi = tf.nn.sigmoid(fi) 182 | oi = tf.nn.sigmoid(oi) 183 | ui = tf.tanh(ui) 184 | ci = fi*ci + ii*ui 185 | hi = oi*tf.tanh(ci) 186 | out_h.append(hi) 187 | out_c.append(ci) 188 | c = tf.concat(out_c, axis=0) 189 | h = tf.concat(out_h, axis=0) 190 | xs[t] = tf.expand_dims(h, axis=0) 191 | s = tf.concat(axis=1, values=[c, h]) 192 | xs = seq_to_batch(xs) # TxNxn_h 193 | xs = tf.transpose(xs, perm=[1,0,2]) # NxTxn_h 194 | return xs, s 195 | 196 | 197 | def lstm_comm_new(xs, ps, dones, masks, s, scope, init_scale=DEFAULT_SCALE, init_mode=DEFAULT_MODE, 198 | init_method=DEFAULT_METHOD): 199 | n_agent = s.shape[0] 200 | n_h = s.shape[1] // 2 201 | n_s = xs.shape[-1] 202 | n_a = ps.shape[-1] 203 | xs = tf.transpose(xs, perm=[1,0,2]) # TxNxn_s 204 | xs = batch_to_seq(xs) 205 | ps = tf.transpose(ps, perm=[1,0,2]) # TxNxn_a 206 | ps = batch_to_seq(ps) 207 | # need dones to reset states 208 | dones = batch_to_seq(dones) # Tx1 209 | # create wts 210 | w_msg = [] 211 | b_msg = [] 212 | w_ob = [] 213 | b_ob = [] 214 | # w_fp = [] 215 | # b_fp = [] 216 | wx_hid = [] 217 | wh_hid = [] 218 | b_hid = [] 219 | n_in_hid = 3*n_h 220 | for i in range(n_agent): 221 | n_m = np.sum(masks[i]) 222 | # n_in_hid = (n_m+1)*n_h 223 | with tf.variable_scope(scope + ('_%d' % i)): 224 | w_msg.append(tf.get_variable("w_msg", [n_h*n_m, n_h], 225 | initializer=init_method(init_scale, init_mode))) 226 | 
b_msg.append(tf.get_variable("b_msg", [n_h], 227 | initializer=tf.constant_initializer(0.0))) 228 | w_ob.append(tf.get_variable("w_ob", [n_s*(n_m+1), n_h], 229 | initializer=init_method(init_scale, init_mode))) 230 | b_ob.append(tf.get_variable("b_ob", [n_h], 231 | initializer=tf.constant_initializer(0.0))) 232 | # w_fp.append(tf.get_variable("w_fp", [n_a*n_m, n_h], 233 | # initializer=init_method(init_scale, init_mode))) 234 | # b_fp.append(tf.get_variable("b_fp", [n_h], 235 | # initializer=tf.constant_initializer(0.0))) 236 | wx_hid.append(tf.get_variable("wx_hid", [n_in_hid, n_h*4], 237 | initializer=init_method(init_scale, init_mode))) 238 | wh_hid.append(tf.get_variable("wh_hid", [n_h, n_h*4], 239 | initializer=init_method(init_scale, init_mode))) 240 | b_hid.append(tf.get_variable("b_hid", [n_h*4], 241 | initializer=tf.constant_initializer(0.0))) 242 | c, h = tf.split(axis=1, num_or_size_splits=2, value=s) 243 | # loop over steps 244 | for t, (x, p, done) in enumerate(zip(xs, ps, dones)): 245 | # abuse 1 agent as 1 step 246 | x = tf.squeeze(x, axis=0) 247 | p = tf.squeeze(p, axis=0) 248 | # x = batch_to_seq(tf.squeeze(x, axis=0)) 249 | # p = batch_to_seq(tf.squeeze(p, axis=0)) 250 | out_h = [] 251 | out_c = [] 252 | out_m = [] 253 | # communication phase 254 | for i in range(n_agent): 255 | hi = tf.expand_dims(h[i], axis=0) 256 | # hxi = fc(xi, 'mfc_s_%d' % i, n_h, act=tf.nn.tanh) 257 | # hpi = fc(pi, 'mfc_p_%d' % i, n_h, act=tf.nn.tanh) 258 | # si = tf.concat([hi, hxi, hpi], axis=1) 259 | mi = fc(hi, 'mfc_%d' % i, n_h) 260 | out_m.append(mi) 261 | # out_m = [tf.expand_dims(h[i], axis=0) for i in range(n_agent)] 262 | out_m = tf.concat(out_m, axis=0) # Nxn_h 263 | # hidden phase 264 | for i in range(n_agent): 265 | ci = tf.expand_dims(c[i], axis=0) 266 | hi = tf.expand_dims(h[i], axis=0) 267 | # reset states for a new episode 268 | ci = ci * (1-done) 269 | hi = hi * (1-done) 270 | # receive neighbor messages 271 | mi = tf.expand_dims(tf.reshape(tf.boolean_mask(out_m, masks[i]), [-1]), axis=0) 272 | # pi = tf.expand_dims(tf.reshape(tf.boolean_mask(p, masks[i]), [-1]), axis=0) 273 | xi = tf.expand_dims(tf.reshape(tf.boolean_mask(x, masks[i]), [-1]), axis=0) 274 | xi = tf.concat([tf.expand_dims(x[i], axis=0), xi], axis=1) 275 | hxi = tf.nn.relu(tf.matmul(xi, w_ob[i]) + b_ob[i]) 276 | # hpi = tf.nn.relu(tf.matmul(pi, w_fp[i]) + b_fp[i]) 277 | hmi = tf.matmul(mi, w_msg[i]) + b_msg[i] 278 | # si = tf.concat([hxi, hpi, hmi], axis=1) 279 | si = tf.concat([hxi, hmi], axis=1) 280 | zi = tf.matmul(si, wx_hid[i]) + tf.matmul(hi, wh_hid[i]) + b_hid[i] 281 | ii, fi, oi, ui = tf.split(axis=1, num_or_size_splits=4, value=zi) 282 | ii = tf.nn.sigmoid(ii) 283 | fi = tf.nn.sigmoid(fi) 284 | oi = tf.nn.sigmoid(oi) 285 | ui = tf.tanh(ui) 286 | ci = fi*ci + ii*ui 287 | hi = oi*tf.tanh(ci) 288 | out_h.append(hi) 289 | out_c.append(ci) 290 | c = tf.concat(out_c, axis=0) 291 | h = tf.concat(out_h, axis=0) 292 | xs[t] = tf.expand_dims(h, axis=0) 293 | s = tf.concat(axis=1, values=[c, h]) 294 | xs = seq_to_batch(xs) # TxNxn_h 295 | xs = tf.transpose(xs, perm=[1,0,2]) # NxTxn_h 296 | return xs, s 297 | 298 | def lstm_ic3(xs, dones, masks, s, scope, init_scale=DEFAULT_SCALE, init_mode=DEFAULT_MODE, 299 | init_method=DEFAULT_METHOD): 300 | n_agent = s.shape[0] 301 | n_h = s.shape[1] // 2 302 | n_s = xs.shape[-1] 303 | xs = tf.transpose(xs, perm=[1,0,2]) # TxNxn_s 304 | xs = batch_to_seq(xs) 305 | # need dones to reset states 306 | dones = batch_to_seq(dones) # Tx1 307 | # create wts 308 | w_msg = [] 309 | b_msg = 
[] 310 | w_ob = [] 311 | b_ob = [] 312 | wx_hid = [] 313 | wh_hid = [] 314 | b_hid = [] 315 | for i in range(n_agent): 316 | with tf.variable_scope(scope + ('_%d' % i)): 317 | w_msg.append(tf.get_variable("w_msg", [n_h, n_h], 318 | initializer=init_method(init_scale, init_mode))) 319 | b_msg.append(tf.get_variable("b_msg", [n_h], 320 | initializer=tf.constant_initializer(0.0))) 321 | w_ob.append(tf.get_variable("w_ob", [n_s, n_h], 322 | initializer=init_method(init_scale, init_mode))) 323 | b_ob.append(tf.get_variable("b_ob", [n_h], 324 | initializer=tf.constant_initializer(0.0))) 325 | wx_hid.append(tf.get_variable("wx_hid", [n_h, n_h*4], 326 | initializer=init_method(init_scale, init_mode))) 327 | wh_hid.append(tf.get_variable("wh_hid", [n_h, n_h*4], 328 | initializer=init_method(init_scale, init_mode))) 329 | b_hid.append(tf.get_variable("b_hid", [n_h*4], 330 | initializer=tf.constant_initializer(0.0))) 331 | c, h = tf.split(axis=1, num_or_size_splits=2, value=s) 332 | # loop over steps 333 | for t, (x, done) in enumerate(zip(xs, dones)): 334 | # abuse 1 agent as 1 step 335 | x = batch_to_seq(tf.squeeze(x, axis=0)) 336 | out_h = [] 337 | out_c = [] 338 | out_m = [tf.expand_dims(h[i], axis=0) for i in range(n_agent)] 339 | out_m = tf.concat(out_m, axis=0) # Nxn_h 340 | # hidden phase 341 | for i, xi in enumerate(x): 342 | ci = tf.expand_dims(c[i], axis=0) 343 | hi = tf.expand_dims(h[i], axis=0) 344 | # reset states for a new episode 345 | ci = ci * (1-done) 346 | hi = hi * (1-done) 347 | # receive neighbor messages 348 | mi = tf.reduce_mean(tf.boolean_mask(out_m, masks[i]), axis=0, keepdims=True) 349 | # the state encoder in IC3 code is not consistent with that described in the paper. 350 | # Here we follow the impelmentation in the paper. 351 | si = tf.nn.tanh(tf.matmul(xi, w_ob[i]) + b_ob[i]) + tf.matmul(mi, w_msg[i]) + b_msg[i] 352 | zi = tf.matmul(si, wx_hid[i]) + tf.matmul(hi, wh_hid[i]) + b_hid[i] 353 | ii, fi, oi, ui = tf.split(axis=1, num_or_size_splits=4, value=zi) 354 | ii = tf.nn.sigmoid(ii) 355 | fi = tf.nn.sigmoid(fi) 356 | oi = tf.nn.sigmoid(oi) 357 | ui = tf.tanh(ui) 358 | ci = fi*ci + ii*ui 359 | hi = oi*tf.tanh(ci) 360 | out_h.append(hi) 361 | out_c.append(ci) 362 | c = tf.concat(out_c, axis=0) 363 | h = tf.concat(out_h, axis=0) 364 | xs[t] = tf.expand_dims(h, axis=0) 365 | s = tf.concat(axis=1, values=[c, h]) 366 | xs = seq_to_batch(xs) # TxNxn_h 367 | xs = tf.transpose(xs, perm=[1,0,2]) # NxTxn_h 368 | return xs, s 369 | 370 | 371 | def lstm_dial(xs, ps, dones, masks, s, scope, init_scale=DEFAULT_SCALE, init_mode=DEFAULT_MODE, 372 | init_method=DEFAULT_METHOD): 373 | n_agent = s.shape[0] 374 | n_h = s.shape[1] // 2 375 | n_s = xs.shape[-1] 376 | n_a = ps.shape[-1] 377 | xs = tf.transpose(xs, perm=[1,0,2]) # TxNxn_s 378 | xs = batch_to_seq(xs) 379 | ps = tf.transpose(ps, perm=[1,0,2]) # TxNxn_a 380 | ps = batch_to_seq(ps) 381 | # need dones to reset states 382 | dones = batch_to_seq(dones) # Tx1 383 | # create wts 384 | w_msg = [] 385 | b_msg = [] 386 | w_ob = [] 387 | b_ob = [] 388 | wx_hid = [] 389 | wh_hid = [] 390 | b_hid = [] 391 | for i in range(n_agent): 392 | n_m = np.sum(masks[i]) 393 | # n_in_hid = (n_m+1)*n_h 394 | with tf.variable_scope(scope + ('_%d' % i)): 395 | w_msg.append(tf.get_variable("w_msg", [n_h*n_m, n_h], 396 | initializer=init_method(init_scale, init_mode))) 397 | b_msg.append(tf.get_variable("b_msg", [n_h], 398 | initializer=tf.constant_initializer(0.0))) 399 | w_ob.append(tf.get_variable("w_ob", [n_s*(n_m+1), n_h], 400 | 
initializer=init_method(init_scale, init_mode))) 401 | b_ob.append(tf.get_variable("b_ob", [n_h], 402 | initializer=tf.constant_initializer(0.0))) 403 | wx_hid.append(tf.get_variable("wx_hid", [n_h, n_h*4], 404 | initializer=init_method(init_scale, init_mode))) 405 | wh_hid.append(tf.get_variable("wh_hid", [n_h, n_h*4], 406 | initializer=init_method(init_scale, init_mode))) 407 | b_hid.append(tf.get_variable("b_hid", [n_h*4], 408 | initializer=tf.constant_initializer(0.0))) 409 | c, h = tf.split(axis=1, num_or_size_splits=2, value=s) 410 | # loop over steps 411 | for t, (x, p, done) in enumerate(zip(xs, ps, dones)): 412 | # abuse 1 agent as 1 step 413 | x = tf.squeeze(x, axis=0) 414 | p = tf.squeeze(p, axis=0) 415 | out_h = [] 416 | out_c = [] 417 | out_m = [] 418 | # communication phase 419 | for i in range(n_agent): 420 | hi = tf.expand_dims(h[i], axis=0) 421 | mi = fc(hi, 'mfc_%d' % i, n_h) 422 | out_m.append(mi) 423 | out_m = tf.concat(out_m, axis=0) # Nxn_h 424 | # hidden phase 425 | for i in range(n_agent): 426 | ci = tf.expand_dims(c[i], axis=0) 427 | hi = tf.expand_dims(h[i], axis=0) 428 | # reset states for a new episode 429 | ci = ci * (1-done) 430 | hi = hi * (1-done) 431 | # receive neighbor messages 432 | mi = tf.expand_dims(tf.reshape(tf.boolean_mask(out_m, masks[i]), [-1]), axis=0) 433 | ai = tf.one_hot(tf.expand_dims(tf.argmax(p[i]), axis=0), n_h) 434 | xi = tf.expand_dims(tf.reshape(tf.boolean_mask(x, masks[i]), [-1]), axis=0) 435 | xi = tf.concat([tf.expand_dims(x[i], axis=0), xi], axis=1) 436 | hxi = tf.nn.relu(tf.matmul(xi, w_ob[i]) + b_ob[i]) 437 | hmi = tf.nn.relu(tf.matmul(mi, w_msg[i]) + b_msg[i]) 438 | si = hxi + hmi + ai 439 | zi = tf.matmul(si, wx_hid[i]) + tf.matmul(hi, wh_hid[i]) + b_hid[i] 440 | ii, fi, oi, ui = tf.split(axis=1, num_or_size_splits=4, value=zi) 441 | ii = tf.nn.sigmoid(ii) 442 | fi = tf.nn.sigmoid(fi) 443 | oi = tf.nn.sigmoid(oi) 444 | ui = tf.tanh(ui) 445 | ci = fi*ci + ii*ui 446 | hi = oi*tf.tanh(ci) 447 | out_h.append(hi) 448 | out_c.append(ci) 449 | c = tf.concat(out_c, axis=0) 450 | h = tf.concat(out_h, axis=0) 451 | xs[t] = tf.expand_dims(h, axis=0) 452 | s = tf.concat(axis=1, values=[c, h]) 453 | xs = seq_to_batch(xs) # TxNxn_h 454 | xs = tf.transpose(xs, perm=[1,0,2]) # NxTxn_h 455 | return xs, s 456 | 457 | 458 | """ 459 | buffers 460 | """ 461 | class TransBuffer: 462 | def reset(self): 463 | self.buffer = [] 464 | 465 | @property 466 | def size(self): 467 | return len(self.buffer) 468 | 469 | def add_transition(self, ob, a, r, *_args, **_kwargs): 470 | raise NotImplementedError() 471 | 472 | def sample_transition(self, *_args, **_kwargs): 473 | raise NotImplementedError() 474 | 475 | 476 | class OnPolicyBuffer(TransBuffer): 477 | def __init__(self, gamma, alpha, distance_mask): 478 | self.gamma = gamma 479 | self.alpha = alpha 480 | if alpha > 0: 481 | self.distance_mask = distance_mask 482 | self.max_distance = np.max(distance_mask, axis=-1) 483 | self.reset() 484 | 485 | def reset(self, done=False): 486 | # the done before each step is required 487 | self.obs = [] 488 | self.acts = [] 489 | self.rs = [] 490 | self.vs = [] 491 | self.adds = [] 492 | self.dones = [done] 493 | 494 | def add_transition(self, ob, na, a, r, v, done): 495 | self.obs.append(ob) 496 | self.adds.append(na) 497 | self.acts.append(a) 498 | self.rs.append(r) 499 | self.vs.append(v) 500 | self.dones.append(done) 501 | 502 | def sample_transition(self, R, dt=0): 503 | if self.alpha < 0: 504 | self._add_R_Adv(R) 505 | else: 506 | self._add_s_R_Adv(R) 507 | obs = 
np.array(self.obs, dtype=np.float32) 508 | nas = np.array(self.adds, dtype=np.int32) 509 | acts = np.array(self.acts, dtype=np.int32) 510 | Rs = np.array(self.Rs, dtype=np.float32) 511 | Advs = np.array(self.Advs, dtype=np.float32) 512 | # use pre-step dones here 513 | dones = np.array(self.dones[:-1], dtype=np.bool) 514 | self.reset(self.dones[-1]) 515 | return obs, nas, acts, dones, Rs, Advs 516 | 517 | def _add_R_Adv(self, R): 518 | Rs = [] 519 | Advs = [] 520 | # use post-step dones here 521 | for r, v, done in zip(self.rs[::-1], self.vs[::-1], self.dones[:0:-1]): 522 | R = r + self.gamma * R * (1.-done) 523 | Adv = R - v 524 | Rs.append(R) 525 | Advs.append(Adv) 526 | Rs.reverse() 527 | Advs.reverse() 528 | self.Rs = Rs 529 | self.Advs = Advs 530 | 531 | def _add_st_R_Adv(self, R, dt): 532 | Rs = [] 533 | Advs = [] 534 | # use post-step dones here 535 | tdiff = dt 536 | for r, v, done in zip(self.rs[::-1], self.vs[::-1], self.dones[:0:-1]): 537 | R = self.gamma * R * (1.-done) 538 | if done: 539 | tdiff = 0 540 | # additional spatial rewards 541 | tmax = min(tdiff, self.max_distance) 542 | for t in range(tmax + 1): 543 | rt = np.sum(r[self.distance_mask == t]) 544 | R += (self.gamma * self.alpha) ** t * rt 545 | Adv = R - v 546 | tdiff += 1 547 | Rs.append(R) 548 | Advs.append(Adv) 549 | Rs.reverse() 550 | Advs.reverse() 551 | self.Rs = Rs 552 | self.Advs = Advs 553 | 554 | def _add_s_R_Adv(self, R): 555 | Rs = [] 556 | Advs = [] 557 | # use post-step dones here 558 | for r, v, done in zip(self.rs[::-1], self.vs[::-1], self.dones[:0:-1]): 559 | R = self.gamma * R * (1.-done) 560 | # additional spatial rewards 561 | for t in range(self.max_distance + 1): 562 | rt = np.sum(r[self.distance_mask == t]) 563 | R += (self.alpha ** t) * rt 564 | Adv = R - v 565 | Rs.append(R) 566 | Advs.append(Adv) 567 | Rs.reverse() 568 | Advs.reverse() 569 | self.Rs = Rs 570 | self.Advs = Advs 571 | 572 | 573 | class MultiAgentOnPolicyBuffer(OnPolicyBuffer): 574 | def __init__(self, gamma, alpha, distance_mask): 575 | super().__init__(gamma, alpha, distance_mask) 576 | 577 | def sample_transition(self, R, dt=0): 578 | if self.alpha < 0: 579 | self._add_R_Adv(R) 580 | else: 581 | self._add_s_R_Adv(R) 582 | obs = np.transpose(np.array(self.obs, dtype=np.float32), (1, 0, 2)) 583 | policies = np.transpose(np.array(self.adds, dtype=np.float32), (1, 0, 2)) 584 | acts = np.transpose(np.array(self.acts, dtype=np.int32)) 585 | Rs = np.array(self.Rs, dtype=np.float32) 586 | Advs = np.array(self.Advs, dtype=np.float32) 587 | dones = np.array(self.dones[:-1], dtype=np.bool) 588 | self.reset(self.dones[-1]) 589 | return obs, policies, acts, dones, Rs, Advs 590 | 591 | def _add_R_Adv(self, R): 592 | Rs = [] 593 | Advs = [] 594 | vs = np.array(self.vs) 595 | for i in range(vs.shape[1]): 596 | cur_Rs = [] 597 | cur_Advs = [] 598 | cur_R = R[i] 599 | for r, v, done in zip(self.rs[::-1], vs[::-1,i], self.dones[:0:-1]): 600 | cur_R = r + self.gamma * cur_R * (1.-done) 601 | cur_Adv = cur_R - v 602 | cur_Rs.append(cur_R) 603 | cur_Advs.append(cur_Adv) 604 | cur_Rs.reverse() 605 | cur_Advs.reverse() 606 | Rs.append(cur_Rs) 607 | Advs.append(cur_Advs) 608 | self.Rs = np.array(Rs) 609 | self.Advs = np.array(Advs) 610 | 611 | def _add_st_R_Adv(self, R, dt): 612 | Rs = [] 613 | Advs = [] 614 | vs = np.array(self.vs) 615 | for i in range(vs.shape[1]): 616 | cur_Rs = [] 617 | cur_Advs = [] 618 | cur_R = R[i] 619 | tdiff = dt 620 | distance_mask = self.distance_mask[i] 621 | max_distance = self.max_distance[i] 622 | for r, v, done in 
zip(self.rs[::-1], vs[::-1,i], self.dones[:0:-1]): 623 | cur_R = self.gamma * cur_R * (1.-done) 624 | if done: 625 | tdiff = 0 626 | # additional spatial rewards 627 | tmax = min(tdiff, max_distance) 628 | for t in range(tmax + 1): 629 | rt = np.sum(r[distance_mask==t]) 630 | cur_R += (self.gamma * self.alpha) ** t * rt 631 | cur_Adv = cur_R - v 632 | tdiff += 1 633 | cur_Rs.append(cur_R) 634 | cur_Advs.append(cur_Adv) 635 | cur_Rs.reverse() 636 | cur_Advs.reverse() 637 | Rs.append(cur_Rs) 638 | Advs.append(cur_Advs) 639 | self.Rs = np.array(Rs) 640 | self.Advs = np.array(Advs) 641 | 642 | def _add_s_R_Adv(self, R): 643 | Rs = [] 644 | Advs = [] 645 | vs = np.array(self.vs) 646 | for i in range(vs.shape[1]): 647 | cur_Rs = [] 648 | cur_Advs = [] 649 | cur_R = R[i] 650 | distance_mask = self.distance_mask[i] 651 | max_distance = self.max_distance[i] 652 | for r, v, done in zip(self.rs[::-1], vs[::-1,i], self.dones[:0:-1]): 653 | cur_R = self.gamma * cur_R * (1.-done) 654 | # additional spatial rewards 655 | for t in range(max_distance + 1): 656 | rt = np.sum(r[distance_mask==t]) 657 | cur_R += (self.alpha ** t) * rt 658 | cur_Adv = cur_R - v 659 | cur_Rs.append(cur_R) 660 | cur_Advs.append(cur_Adv) 661 | cur_Rs.reverse() 662 | cur_Advs.reverse() 663 | Rs.append(cur_Rs) 664 | Advs.append(cur_Advs) 665 | self.Rs = np.array(Rs) 666 | self.Advs = np.array(Advs) 667 | 668 | """ 669 | util functions 670 | """ 671 | class Scheduler: 672 | def __init__(self, val_init, val_min=0, total_step=0, decay='linear'): 673 | self.val = val_init 674 | self.N = float(total_step) 675 | self.val_min = val_min 676 | self.decay = decay 677 | self.n = 0 678 | 679 | def get(self, n_step): 680 | self.n += n_step 681 | if self.decay == 'linear': 682 | return max(self.val_min, self.val * (1 - self.n / self.N)) 683 | else: 684 | return self.val 685 | 686 | -------------------------------------------------------------------------------- /config/config_greedy.ini: -------------------------------------------------------------------------------- 1 | [ENV_CONFIG] 2 | clip_wave = -1.0 3 | clip_wait = -1.0 4 | control_interval_sec = 5 5 | ; agent is greedy, ia2c, ia2c_fp, ma2c_som, ma2c_ic3, ma2c_nc. 6 | agent = greedy 7 | ; coop discount is used to discount the neighbors' impact 8 | coop_gamma = 0.75 9 | data_path = ./envs/data/ 10 | episode_length_sec = 3600 11 | ; the normailization is based on typical values in sim 12 | norm_wave = 1.0 13 | norm_wait = 1.0 14 | coef_wait = 0.2 15 | peak_flow1 = 1100 16 | peak_flow2 = 925 17 | init_density = 0 18 | ; objective is chosen from queue, wait, hybrid 19 | objective = queue 20 | scenario = large_grid 21 | seed = 12 22 | test_seeds = 10000,20000,30000,40000,50000,60000,70000,80000,90000,100000 23 | yellow_interval_sec = 2 24 | -------------------------------------------------------------------------------- /config/config_ia2c.ini: -------------------------------------------------------------------------------- 1 | [MODEL_CONFIG] 2 | rmsp_alpha = 0.99 3 | rmsp_epsilon = 1e-5 4 | max_grad_norm = 40 5 | gamma = 0.99 6 | lr_init = 5e-4 7 | lr_decay = constant 8 | entropy_coef = 0.01 9 | value_coef = 0.5 10 | num_lstm = 64 11 | num_fc = 64 12 | batch_size = 120 13 | reward_norm = 100.0 14 | reward_clip = -1 15 | 16 | [TRAIN_CONFIG] 17 | total_step = 1e6 18 | test_interval = 2e6 19 | log_interval = 1e4 20 | 21 | [ENV_CONFIG] 22 | clip_wave = 2.0 23 | clip_wait = -1 24 | control_interval_sec = 5 25 | ; agent is greedy, ia2c, ia2c_fp, ma2c_som, ma2c_ic3, ma2c_nc. 
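
Note on the [MODEL_CONFIG] learning-rate entries above (lr_init, lr_decay): they are consumed by the Scheduler class in agents/utils.py via IA2C._init_scheduler in agents/models.py. The sketch below is illustrative only and not part of the repo; the lr_min value and step counts are made-up examples.

```python
from agents.utils import Scheduler

# lr_decay = constant -> Scheduler(lr_init, decay='constant'); get() always returns lr_init.
lr_const = Scheduler(5e-4, decay='constant')
print(lr_const.get(120))   # 0.0005 at every call

# lr_decay = linear -> Scheduler(lr_init, lr_min, total_step, decay='linear');
# get(n) accumulates n steps and returns max(lr_min, lr_init * (1 - n / total_step)).
# _init_scheduler would then also read an lr_min key, which this file does not set.
lr_lin = Scheduler(5e-4, val_min=1e-5, total_step=1e6, decay='linear')
print(lr_lin.get(120))     # ~0.00049994 after the first 120 steps
```
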
26 | agent = ia2c 27 | ; coop discount is used to discount the neighbors' impact 28 | coop_gamma = 0.9 29 | data_path = ./envs/data/ 30 | episode_length_sec = 3600 31 | ; the normailization is based on typical values in sim 32 | norm_wave = 5.0 33 | norm_wait = -1 34 | coef_wait = 0 35 | peak_flow1 = 1100 36 | peak_flow2 = 925 37 | init_density = 0 38 | ; objective is chosen from queue, wait, hybrid 39 | objective = queue 40 | scenario = large_grid 41 | seed = 12 42 | test_seeds = 10000,20000,30000,40000,50000,60000,70000,80000,90000,100000 43 | yellow_interval_sec = 2 44 | -------------------------------------------------------------------------------- /config/config_ia2c_cu.ini: -------------------------------------------------------------------------------- 1 | [MODEL_CONFIG] 2 | rmsp_alpha = 0.99 3 | rmsp_epsilon = 1e-5 4 | max_grad_norm = 40 5 | gamma = 0.99 6 | lr_init = 5e-4 7 | lr_decay = constant 8 | entropy_coef = 0.01 9 | value_coef = 0.5 10 | num_lstm = 64 11 | num_fc = 64 12 | batch_size = 120 13 | reward_norm = 100.0 14 | reward_clip = -1 15 | 16 | [TRAIN_CONFIG] 17 | total_step = 1e6 18 | test_interval = 2e6 19 | log_interval = 1e4 20 | 21 | [ENV_CONFIG] 22 | clip_wave = 2.0 23 | clip_wait = -1 24 | control_interval_sec = 5 25 | ; agent is greedy, ia2c, ia2c_fp, ma2c_som, ma2c_ic3, ma2c_nc. 26 | agent = ma2c_cu 27 | ; coop discount is used to discount the neighbors' impact 28 | coop_gamma = 0.9 29 | data_path = ./envs/data/ 30 | episode_length_sec = 3600 31 | ; the normailization is based on typical values in sim 32 | norm_wave = 5.0 33 | norm_wait = -1 34 | coef_wait = 0 35 | peak_flow1 = 1100 36 | peak_flow2 = 925 37 | init_density = 0 38 | ; objective is chosen from queue, wait, hybrid 39 | objective = queue 40 | scenario = large_grid 41 | seed = 12 42 | test_seeds = 10000,20000,30000,40000,50000,60000,70000,80000,90000,100000 43 | yellow_interval_sec = 2 44 | -------------------------------------------------------------------------------- /config/config_ia2c_fp.ini: -------------------------------------------------------------------------------- 1 | [MODEL_CONFIG] 2 | rmsp_alpha = 0.99 3 | rmsp_epsilon = 1e-5 4 | max_grad_norm = 40 5 | gamma = 0.99 6 | lr_init = 5e-4 7 | lr_decay = constant 8 | entropy_coef = 0.01 9 | value_coef = 0.5 10 | num_lstm = 64 11 | num_fc = 64 12 | batch_size = 120 13 | reward_norm = 100.0 14 | reward_clip = -1 15 | 16 | [TRAIN_CONFIG] 17 | total_step = 1e6 18 | test_interval = 2e6 19 | log_interval = 1e4 20 | 21 | [ENV_CONFIG] 22 | clip_wave = 2.0 23 | clip_wait = -1 24 | control_interval_sec = 5 25 | ; agent is greedy, ia2c, ia2c_fp, ma2c_som, ma2c_ic3, ma2c_nc. 
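For the ia2c_fp agent selected below, each intersection's observation is extended with the fingerprints (latest policies) of its neighbors. A condensed sketch of the logic in envs/env.py _get_state(); the helper name is hypothetical, and node objects are assumed to expose wave_state, neighbor and fingerprint as defined in that file:

    import numpy as np

    def build_ia2c_fp_observation(node, nodes):
        cur_state = [node.wave_state]              # local traffic (wave) state
        for nnode_name in node.neighbor:           # neighbors' wave states
            cur_state.append(nodes[nnode_name].wave_state)
        for nnode_name in node.neighbor:           # neighbors' fingerprints
            cur_state.append(nodes[nnode_name].fingerprint)
        return np.concatenate(cur_state)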
26 | agent = ia2c_fp 27 | ; coop discount is used to discount the neighbors' impact 28 | coop_gamma = 0.9 29 | data_path = ./envs/data/ 30 | episode_length_sec = 3600 31 | ; the normailization is based on typical values in sim 32 | norm_wave = 5.0 33 | norm_wait = -1 34 | coef_wait = 0 35 | peak_flow1 = 1100 36 | peak_flow2 = 925 37 | init_density = 0 38 | ; objective is chosen from queue, wait, hybrid 39 | objective = queue 40 | scenario = large_grid 41 | seed = 12 42 | test_seeds = 10000,20000,30000,40000,50000,60000,70000,80000,90000,100000 43 | yellow_interval_sec = 2 44 | -------------------------------------------------------------------------------- /config/config_ma2c_dial.ini: -------------------------------------------------------------------------------- 1 | [MODEL_CONFIG] 2 | rmsp_alpha = 0.99 3 | rmsp_epsilon = 1e-5 4 | max_grad_norm = 40 5 | gamma = 0.99 6 | lr_init = 5e-4 7 | lr_decay = constant 8 | entropy_coef = 0.01 9 | value_coef = 0.5 10 | num_lstm = 64 11 | num_fc = 64 12 | batch_size = 120 13 | reward_norm = 2000.0 14 | reward_clip = -1 15 | 16 | [TRAIN_CONFIG] 17 | total_step = 1e6 18 | test_interval = 2e6 19 | log_interval = 1e4 20 | 21 | [ENV_CONFIG] 22 | clip_wave = 2.0 23 | clip_wait = -1 24 | control_interval_sec = 5 25 | ; agent is greedy, ia2c, ia2c_fp, ma2c_som, ma2c_ic3, ma2c_nc. 26 | agent = ma2c_dial 27 | ; coop discount is used to discount the neighbors' impact 28 | coop_gamma = -1 29 | data_path = ./envs/data/ 30 | episode_length_sec = 3600 31 | ; the normailization is based on typical values in sim 32 | norm_wave = 5.0 33 | norm_wait = -1 34 | coef_wait = 0 35 | peak_flow1 = 1100 36 | peak_flow2 = 925 37 | init_density = 0 38 | ; objective is chosen from queue, wait, hybrid 39 | objective = queue 40 | scenario = large_grid 41 | seed = 12 42 | test_seeds = 10000 43 | yellow_interval_sec = 2 44 | -------------------------------------------------------------------------------- /config/config_ma2c_ic3.ini: -------------------------------------------------------------------------------- 1 | [MODEL_CONFIG] 2 | rmsp_alpha = 0.99 3 | rmsp_epsilon = 1e-5 4 | max_grad_norm = 40 5 | gamma = 0.99 6 | lr_init = 5e-4 7 | lr_decay = constant 8 | entropy_coef = 0.01 9 | value_coef = 0.5 10 | num_lstm = 64 11 | num_fc = 64 12 | batch_size = 120 13 | reward_norm = 2000.0 14 | reward_clip = -1 15 | 16 | [TRAIN_CONFIG] 17 | total_step = 1e6 18 | test_interval = 2e6 19 | log_interval = 1e4 20 | 21 | [ENV_CONFIG] 22 | clip_wave = 2.0 23 | clip_wait = -1 24 | control_interval_sec = 5 25 | ; agent is greedy, ia2c, ia2c_fp, ma2c_som, ma2c_ic3, ma2c_nc. 
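Note the sign convention for coop_gamma in the MA2C configs: a negative value (-1 here) turns cooperation discounting off, so envs/env.py step() hands back the global (summed) reward and the on-policy buffers take the plain n-step return branch (_add_R_Adv) rather than the spatially discounted one (_add_s_R_Adv). When the discount is positive, rewards from agents farther away on the traffic network are down-weighted. A standalone sketch of that aggregation; the hypothetical function below mirrors the inner loop of OnPolicyBuffer._add_s_R_Adv and omits the temporal gamma bootstrapping handled by the buffer:

    import numpy as np

    def spatially_discounted_reward(r, distance_mask, alpha, max_distance):
        # r: per-agent reward vector at one step
        # distance_mask: graph distance from the ego agent to every agent
        # alpha: spatial discount factor in (0, 1)
        R = 0.0
        for t in range(max_distance + 1):
            R += (alpha ** t) * np.sum(r[distance_mask == t])
        return R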
26 | agent = ma2c_ic3 27 | ; coop discount is used to discount the neighbors' impact 28 | coop_gamma = -1 29 | data_path = ./envs/data/ 30 | episode_length_sec = 3600 31 | ; the normailization is based on typical values in sim 32 | norm_wave = 5.0 33 | norm_wait = -1 34 | coef_wait = 0 35 | peak_flow1 = 1100 36 | peak_flow2 = 925 37 | init_density = 0 38 | ; objective is chosen from queue, wait, hybrid 39 | objective = queue 40 | scenario = large_grid 41 | seed = 12 42 | test_seeds = 10000 43 | yellow_interval_sec = 2 44 | -------------------------------------------------------------------------------- /config/config_ma2c_nc.ini: -------------------------------------------------------------------------------- 1 | [MODEL_CONFIG] 2 | rmsp_alpha = 0.99 3 | rmsp_epsilon = 1e-5 4 | max_grad_norm = 40 5 | gamma = 0.99 6 | lr_init = 5e-4 7 | lr_decay = constant 8 | entropy_coef = 0.01 9 | value_coef = 0.5 10 | num_lstm = 64 11 | num_fc = 64 12 | batch_size = 120 13 | reward_norm = 2000.0 14 | reward_clip = -1 15 | 16 | [TRAIN_CONFIG] 17 | total_step = 1e6 18 | test_interval = 2e6 19 | log_interval = 1e4 20 | 21 | [ENV_CONFIG] 22 | clip_wave = 2.0 23 | clip_wait = -1 24 | control_interval_sec = 5 25 | ; agent is greedy, ia2c, ia2c_fp, ma2c_som, ma2c_ic3, ma2c_nc. 26 | agent = ma2c_nc 27 | ; coop discount is used to discount the neighbors' impact 28 | coop_gamma = -1 29 | data_path = ./envs/data/ 30 | episode_length_sec = 3600 31 | ; the normailization is based on typical values in sim 32 | norm_wave = 5.0 33 | norm_wait = -1 34 | coef_wait = 0 35 | peak_flow1 = 1100 36 | peak_flow2 = 925 37 | init_density = 0 38 | ; objective is chosen from queue, wait, hybrid 39 | objective = queue 40 | scenario = large_grid 41 | seed = 12 42 | test_seeds = 10000 43 | yellow_interval_sec = 2 44 | -------------------------------------------------------------------------------- /envs/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MUmarJaved/MultiAgent-Distributed-Reinforcement-Learning/d5a0d7226011f7050f643b66e51e141277dd0e58/envs/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /envs/__pycache__/env.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MUmarJaved/MultiAgent-Distributed-Reinforcement-Learning/d5a0d7226011f7050f643b66e51e141277dd0e58/envs/__pycache__/env.cpython-37.pyc -------------------------------------------------------------------------------- /envs/__pycache__/large_grid_env.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MUmarJaved/MultiAgent-Distributed-Reinforcement-Learning/d5a0d7226011f7050f643b66e51e141277dd0e58/envs/__pycache__/large_grid_env.cpython-37.pyc -------------------------------------------------------------------------------- /envs/data/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MUmarJaved/MultiAgent-Distributed-Reinforcement-Learning/d5a0d7226011f7050f643b66e51e141277dd0e58/envs/data/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /envs/data/__pycache__/build_file.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MUmarJaved/MultiAgent-Distributed-Reinforcement-Learning/d5a0d7226011f7050f643b66e51e141277dd0e58/envs/data/__pycache__/build_file.cpython-37.pyc -------------------------------------------------------------------------------- /envs/data/build_file.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | build *.xml files for a large 5 x 5 network 4 | w/ the traffic dynamics modified from the following paper: 5 | 6 | Chu, Tianshu, Shuhui Qu, and Jie Wang. "Large-scale traffic grid signal control with 7 | regional reinforcement learning." American Control Conference (ACC), 2016. IEEE, 2016. 8 | 9 | @author: Tianshu Chu 10 | """ 11 | import numpy as np 12 | import os 13 | 14 | MAX_CAR_NUM = 30 15 | SPEED_LIMIT_ST = 20 16 | SPEED_LIMIT_AV = 11 17 | L0 = 200 18 | L0_end = 75 19 | N = 5 20 | 21 | 22 | def write_file(path, content): 23 | with open(path, 'w') as f: 24 | f.write(content) 25 | 26 | 27 | def output_nodes(node): 28 | str_nodes = '\n' 29 | # traffic light nodes 30 | ind = 1 31 | for dy in np.arange(0, L0 * 5, L0): 32 | for dx in np.arange(0, L0 * 5, L0): 33 | str_nodes += node % ('nt' + str(ind), dx, dy, 'traffic_light') 34 | ind += 1 35 | # other nodes 36 | ind = 1 37 | for dx in np.arange(0, L0 * 5, L0): 38 | str_nodes += node % ('np' + str(ind), dx, -L0_end, 'priority') 39 | ind += 1 40 | for dy in np.arange(0, L0 * 5, L0): 41 | str_nodes += node % ('np' + str(ind), L0 * 4 + L0_end, dy, 'priority') 42 | ind += 1 43 | for dx in np.arange(L0 * 4, -1, -L0): 44 | str_nodes += node % ('np' + str(ind), dx, L0 * 4 + L0_end, 'priority') 45 | ind += 1 46 | for dy in np.arange(L0 * 4, -1, -L0): 47 | str_nodes += node % ('np' + str(ind), -L0_end, dy, 'priority') 48 | ind += 1 49 | str_nodes += '\n' 50 | return str_nodes 51 | 52 | 53 | def output_road_types(): 54 | str_types = '\n' 55 | str_types += ' \n' % SPEED_LIMIT_ST 56 | str_types += ' \n' % SPEED_LIMIT_AV 57 | str_types += '\n' 58 | return str_types 59 | 60 | 61 | def get_edge_str(edge, from_node, to_node, edge_type): 62 | edge_id = '%s_%s' % (from_node, to_node) 63 | return edge % (edge_id, from_node, to_node, edge_type) 64 | 65 | 66 | def output_edges(edge): 67 | str_edges = '\n' 68 | # external roads 69 | in_edges = [5, 10, 15, 20, 25, 21, 16, 11, 6, 1] 70 | out_edges = [6, 7, 8, 9, 10, 16, 17, 18, 19, 20] 71 | for in_i, out_i in zip(in_edges, out_edges): 72 | in_node = 'nt' + str(in_i) 73 | out_node = 'np' + str(out_i) 74 | str_edges += get_edge_str(edge, in_node, out_node, 'a') 75 | str_edges += get_edge_str(edge, out_node, in_node, 'a') 76 | 77 | in_edges = [1, 2, 3, 4, 5, 25, 24, 23, 22, 21] 78 | out_edges = [1, 2, 3, 4, 5, 11, 12, 13, 14, 15] 79 | for in_i, out_i in zip(in_edges, out_edges): 80 | in_node = 'nt' + str(in_i) 81 | out_node = 'np' + str(out_i) 82 | str_edges += get_edge_str(edge, in_node, out_node, 'b') 83 | str_edges += get_edge_str(edge, out_node, in_node, 'b') 84 | # internal roads 85 | for i in range(1, 25, 5): 86 | for j in range(4): 87 | from_node = 'nt' + str(i + j) 88 | to_node = 'nt' + str(i + j + 1) 89 | str_edges += get_edge_str(edge, from_node, to_node, 'a') 90 | str_edges += get_edge_str(edge, to_node, from_node, 'a') 91 | for i in range(1, 6): 92 | for j in range(0, 20, 5): 93 | from_node = 'nt' + str(i + j) 94 | to_node = 'nt' + str(i + j + 5) 95 | str_edges += get_edge_str(edge, from_node, to_node, 'b') 96 | str_edges += get_edge_str(edge, to_node, from_node, 'b') 97 | str_edges += '\n' 98 | return 
str_edges 99 | 100 | 101 | def get_con_str(con, from_node, cur_node, to_node, from_lane, to_lane): 102 | from_edge = '%s_%s' % (from_node, cur_node) 103 | to_edge = '%s_%s' % (cur_node, to_node) 104 | return con % (from_edge, to_edge, from_lane, to_lane) 105 | 106 | 107 | def get_con_str_set(con, cur_node, n_node, s_node, w_node, e_node): 108 | str_cons = '' 109 | # go-through 110 | str_cons += get_con_str(con, s_node, cur_node, n_node, 0, 0) 111 | str_cons += get_con_str(con, n_node, cur_node, s_node, 0, 0) 112 | str_cons += get_con_str(con, w_node, cur_node, e_node, 0, 0) 113 | str_cons += get_con_str(con, e_node, cur_node, w_node, 0, 0) 114 | # left-turn 115 | str_cons += get_con_str(con, s_node, cur_node, w_node, 0, 1) 116 | str_cons += get_con_str(con, n_node, cur_node, e_node, 0, 1) 117 | str_cons += get_con_str(con, w_node, cur_node, n_node, 1, 0) 118 | str_cons += get_con_str(con, e_node, cur_node, s_node, 1, 0) 119 | # right-turn 120 | str_cons += get_con_str(con, s_node, cur_node, e_node, 0, 0) 121 | str_cons += get_con_str(con, n_node, cur_node, w_node, 0, 0) 122 | str_cons += get_con_str(con, w_node, cur_node, s_node, 0, 0) 123 | str_cons += get_con_str(con, e_node, cur_node, n_node, 0, 0) 124 | return str_cons 125 | 126 | 127 | def output_connections(con): 128 | str_cons = '\n' 129 | # edge nodes 130 | in_edges = [5, 10, 15, 20, 25, 21, 16, 11, 6, 1] 131 | out_edges = [6, 7, 8, 9, 10, 16, 17, 18, 19, 20] 132 | for i, j in zip(in_edges, out_edges): 133 | if i == 5: 134 | s_node = 'np5' 135 | elif i == 1: 136 | s_node = 'np1' 137 | else: 138 | s_node = 'nt' + str(i - 5) 139 | if i == 25: 140 | n_node = 'np11' 141 | elif i == 21: 142 | n_node = 'np15' 143 | else: 144 | n_node = 'nt' + str(i + 5) 145 | if i % 5 == 1: 146 | w_node = 'np' + str(j) 147 | else: 148 | w_node = 'nt' + str(i - 1) 149 | if i % 5 == 0: 150 | e_node = 'np' + str(j) 151 | else: 152 | e_node = 'nt' + str(i + 1) 153 | cur_node = 'nt' + str(i) 154 | str_cons += get_con_str_set(con, cur_node, n_node, s_node, w_node, e_node) 155 | 156 | in_edges = [2, 3, 4, 24, 23, 22] 157 | out_edges = [2, 3, 4, 12, 13, 14] 158 | for i, j in zip(in_edges, out_edges): 159 | w_node = 'nt' + str(i - 1) 160 | e_node = 'nt' + str(i + 1) 161 | if i <= 5: 162 | s_node = 'np' + str(j) 163 | else: 164 | s_node = 'nt' + str(i - 5) 165 | if i >= 20: 166 | n_node = 'np' + str(j) 167 | else: 168 | n_node = 'nt' + str(i + 5) 169 | cur_node = 'nt' + str(i) 170 | str_cons += get_con_str_set(con, cur_node, n_node, s_node, w_node, e_node) 171 | 172 | # internal nodes 173 | for i in [7, 8, 9, 12, 13, 14, 17, 18, 19]: 174 | n_node = 'nt' + str(i + 5) 175 | s_node = 'nt' + str(i - 5) 176 | w_node = 'nt' + str(i - 1) 177 | e_node = 'nt' + str(i + 1) 178 | cur_node = 'nt' + str(i) 179 | str_cons += get_con_str_set(con, cur_node, n_node, s_node, w_node, e_node) 180 | 181 | str_cons += '\n' 182 | return str_cons 183 | 184 | 185 | def output_netconfig(): 186 | str_config = '\n \n' 187 | str_config += ' \n' 188 | str_config += ' \n' 189 | str_config += ' \n' 190 | str_config += ' \n' 191 | str_config += ' \n' 192 | str_config += ' \n \n' 193 | str_config += ' \n' 194 | str_config += ' \n\n' 195 | return str_config 196 | 197 | 198 | def get_external_od(out_edges, dest=True): 199 | edge_maps = [0, 1, 2, 3, 4, 5, 5, 10, 15, 20, 25, 200 | 25, 24, 23, 22, 21, 21, 16, 11, 6, 1] 201 | cur_dest = [] 202 | for out_edge in out_edges: 203 | in_edge = edge_maps[out_edge] 204 | in_node = 'nt' + str(in_edge) 205 | out_node = 'np' + str(out_edge) 206 | if dest: 207 | 
edge = '%s_%s' % (in_node, out_node) 208 | else: 209 | edge = '%s_%s' % (out_node, in_node) 210 | cur_dest.append(edge) 211 | return cur_dest 212 | 213 | 214 | def sample_od_pair(orig_edges, dest_edges): 215 | from_edges = [] 216 | to_edges = [] 217 | for i in range(len(orig_edges)): 218 | from_edges.append(np.random.choice(orig_edges[i])) 219 | to_edges.append(np.random.choice(dest_edges)) 220 | return from_edges, to_edges 221 | 222 | 223 | def init_routes(density): 224 | init_flow = ' \n' 225 | output = '' 226 | in_nodes = [5, 10, 15, 20, 25, 21, 16, 11, 6, 1, 227 | 1, 2, 3, 4, 5, 25, 24, 23, 22, 21] 228 | out_nodes = [6, 7, 8, 9, 10, 16, 17, 18, 19, 20, 229 | 1, 2, 3, 4, 5, 11, 12, 13, 14, 15] 230 | # external edges 231 | sink_edges = [] 232 | for i, j in zip(in_nodes, out_nodes): 233 | node1 = 'nt' + str(i) 234 | node2 = 'np' + str(j) 235 | sink_edges.append('%s_%s' % (node1, node2)) 236 | 237 | def get_od(node1, node2, k, lane=0): 238 | source_edge = '%s_%s' % (node1, node2) 239 | sink_edge = np.random.choice(sink_edges) 240 | return init_flow % (str(k), source_edge, sink_edge, lane, car_num) 241 | 242 | # streets 243 | k = 1 244 | car_num = int(MAX_CAR_NUM * density) 245 | for i in range(1, 25, 5): 246 | for j in range(4): 247 | node1 = 'nt' + str(i + j) 248 | node2 = 'nt' + str(i + j + 1) 249 | output += get_od(node1, node2, k) 250 | k += 1 251 | output += get_od(node2, node1, k) 252 | k += 1 253 | output += get_od(node1, node2, k, lane=1) 254 | k += 1 255 | output += get_od(node2, node1, k, lane=1) 256 | k += 1 257 | # avenues 258 | for i in range(1, 6): 259 | for j in range(0, 20, 5): 260 | node1 = 'nt' + str(i + j) 261 | node2 = 'nt' + str(i + j + 5) 262 | output += get_od(node1, node2, k) 263 | k += 1 264 | output += get_od(node2, node1, k) 265 | k += 1 266 | return output 267 | 268 | def output_flows(peak_flow1, peak_flow2, density, seed=None): 269 | ''' 270 | flow1: x11, x12, x13, x14, x15 -> x1, x2, x3, x4, x5 271 | flow2: x16, x17, x18, x19, x20 -> x6, x7, x8, x9, x10 272 | flow3: x1, x2, x3, x4, x5 -> x15, x14, x13, x12, x11 273 | flow4: x6, x7, x8, x9, x10 -> x20, x19, x18, x17, x16 274 | ''' 275 | if seed is not None: 276 | np.random.seed(seed) 277 | ext_flow = ' \n' 278 | str_flows = '\n' 279 | str_flows += ' \n' 280 | # initial traffic dist 281 | if density > 0: 282 | str_flows += init_routes(density) 283 | 284 | # create external origins and destinations for flows 285 | srcs = [] 286 | srcs.append(get_external_od([12, 13, 14], dest=False)) 287 | srcs.append(get_external_od([16, 18, 20], dest=False)) 288 | srcs.append(get_external_od([2, 3, 4], dest=False)) 289 | srcs.append(get_external_od([6, 8, 10], dest=False)) 290 | 291 | sinks = [] 292 | sinks.append(get_external_od([2, 3, 4])) 293 | sinks.append(get_external_od([6, 8, 10])) 294 | sinks.append(get_external_od([14, 13, 12])) 295 | sinks.append(get_external_od([20, 18, 16])) 296 | 297 | # create volumes per 5 min for flows 298 | ratios1 = np.array([0.4, 0.7, 0.9, 1.0, 0.75, 0.5, 0.25]) # start from 0 299 | ratios2 = np.array([0.3, 0.8, 0.9, 1.0, 0.8, 0.6, 0.2]) # start from 15min 300 | flows1 = peak_flow1 * 0.6 * ratios1 301 | flows2 = peak_flow1 * ratios1 302 | flows3 = peak_flow2 * 0.6 * ratios2 303 | flows4 = peak_flow2 * ratios2 304 | flows = [flows1, flows2, flows3, flows4] 305 | times = np.arange(0, 3001, 300) 306 | id1 = len(flows1) 307 | id2 = len(times) - 1 - id1 308 | for i in range(len(times) - 1): 309 | name = str(i) 310 | t_begin, t_end = times[i], times[i + 1] 311 | # external flow 312 | k = 0 313 | if 
i < id1: 314 | for j in [0, 1]: 315 | for e1, e2 in zip(srcs[j], sinks[j]): 316 | cur_name = name + '_' + str(k) 317 | str_flows += ext_flow % (cur_name, e1, e2, t_begin, t_end, flows[j][i]) 318 | k += 1 319 | if i >= id2: 320 | for j in [2, 3]: 321 | for e1, e2 in zip(srcs[j], sinks[j]): 322 | cur_name = name + '_' + str(k) 323 | str_flows += ext_flow % (cur_name, e1, e2, t_begin, t_end, flows[j][i - id2]) 324 | k += 1 325 | str_flows += '\n' 326 | return str_flows 327 | 328 | 329 | def gen_rou_file(path, peak_flow1, peak_flow2, density, seed=None, thread=None): 330 | if thread is None: 331 | flow_file = 'exp.rou.xml' 332 | else: 333 | flow_file = 'exp_%d.rou.xml' % int(thread) 334 | write_file(path + flow_file, output_flows(peak_flow1, peak_flow2, density, seed=seed)) 335 | sumocfg_file = path + ('exp_%d.sumocfg' % thread) 336 | write_file(sumocfg_file, output_config(thread=thread)) 337 | return sumocfg_file 338 | 339 | 340 | def output_config(thread=None): 341 | if thread is None: 342 | out_file = 'exp.rou.xml' 343 | else: 344 | out_file = 'exp_%d.rou.xml' % int(thread) 345 | str_config = '\n \n' 346 | str_config += ' \n' 347 | str_config += ' \n' % out_file 348 | str_config += ' \n' 349 | str_config += ' \n \n\n' 352 | return str_config 353 | 354 | 355 | def get_ild_str(from_node, to_node, ild_str, lane_i=0): 356 | edge = '%s_%s' % (from_node, to_node) 357 | return ild_str % (edge, lane_i, edge, lane_i) 358 | 359 | 360 | def output_ild(ild): 361 | str_adds = '\n' 362 | in_edges = [5, 10, 15, 20, 25, 21, 16, 11, 6, 1, 363 | 1, 2, 3, 4, 5, 25, 24, 23, 22, 21] 364 | out_edges = [6, 7, 8, 9, 10, 16, 17, 18, 19, 20, 365 | 1, 2, 3, 4, 5, 11, 12, 13, 14, 15] 366 | # external edges 367 | for k, (i, j) in enumerate(zip(in_edges, out_edges)): 368 | node1 = 'nt' + str(i) 369 | node2 = 'np' + str(j) 370 | str_adds += get_ild_str(node2, node1, ild) 371 | if k < 10: 372 | # streets 373 | str_adds += get_ild_str(node2, node1, ild, lane_i=1) 374 | # streets 375 | for i in range(1, 25, 5): 376 | for j in range(4): 377 | node1 = 'nt' + str(i + j) 378 | node2 = 'nt' + str(i + j + 1) 379 | str_adds += get_ild_str(node1, node2, ild) 380 | str_adds += get_ild_str(node2, node1, ild) 381 | str_adds += get_ild_str(node1, node2, ild, lane_i=1) 382 | str_adds += get_ild_str(node2, node1, ild, lane_i=1) 383 | # avenues 384 | for i in range(1, 6): 385 | for j in range(0, 20, 5): 386 | node1 = 'nt' + str(i + j) 387 | node2 = 'nt' + str(i + j + 5) 388 | str_adds += get_ild_str(node1, node2, ild) 389 | str_adds += get_ild_str(node2, node1, ild) 390 | str_adds += '\n' 391 | return str_adds 392 | 393 | 394 | def output_tls(tls, phase): 395 | str_adds = '\n' 396 | # all crosses have 3 phases 397 | three_phases = ['GGgrrrGGgrrr', 'yyyrrryyyrrr', 398 | 'rrrGrGrrrGrG', 'rrrGryrrrGry', 399 | 'rrrGGrrrrGGr', 'rrryyrrrryyr'] 400 | phase_duration = [30, 3] 401 | for i in range(1, 26): 402 | node = 'nt' + str(i) 403 | str_adds += tls % node 404 | for k, p in enumerate(three_phases): 405 | str_adds += phase % (phase_duration[k % 2], p) 406 | str_adds += ' \n' 407 | str_adds += '\n' 408 | return str_adds 409 | 410 | 411 | def main(): 412 | # nod.xml file 413 | node = ' \n' 414 | write_file('./exp.nod.xml', output_nodes(node)) 415 | 416 | # typ.xml file 417 | write_file('./exp.typ.xml', output_road_types()) 418 | 419 | # edg.xml file 420 | edge = ' \n' 421 | write_file('./exp.edg.xml', output_edges(edge)) 422 | 423 | # con.xml file 424 | con = ' \n' 425 | write_file('./exp.con.xml', output_connections(con)) 426 | 427 | # tls.xml 
file 428 | tls = ' \n' 429 | phase = ' \n' 430 | write_file('./exp.tll.xml', output_tls(tls, phase)) 431 | 432 | # net config file 433 | write_file('./exp.netccfg', output_netconfig()) 434 | 435 | # generate net.xml file 436 | os.system('netconvert -c exp.netccfg') 437 | 438 | # raw.rou.xml file 439 | write_file('./exp.rou.xml', output_flows(1000, 2000, 0.2)) 440 | 441 | # generate rou.xml file 442 | # os.system('jtrrouter -n exp.net.xml -r exp.raw.rou.xml -o exp.rou.xml') 443 | 444 | # add.xml file 445 | ild = ' \n' 446 | # ild_in = ' \n' 447 | write_file('./exp.add.xml', output_ild(ild)) 448 | 449 | # config file 450 | write_file('./exp.sumocfg', output_config()) 451 | 452 | if __name__ == '__main__': 453 | main() 454 | -------------------------------------------------------------------------------- /envs/data/intersection.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MUmarJaved/MultiAgent-Distributed-Reinforcement-Learning/d5a0d7226011f7050f643b66e51e141277dd0e58/envs/data/intersection.pdf -------------------------------------------------------------------------------- /envs/data/network.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MUmarJaved/MultiAgent-Distributed-Reinforcement-Learning/d5a0d7226011f7050f643b66e51e141277dd0e58/envs/data/network.pdf -------------------------------------------------------------------------------- /envs/data/view.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 
| 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | 526 | 527 | 528 | 529 | 530 | 531 | 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 566 | 567 | 568 | 569 | 570 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | 590 | 591 | 592 | 593 | 594 | 595 | 596 | 597 | 598 | 599 | 600 | 601 | 602 | 603 | 604 | 605 | 606 | 607 | 608 | 609 | 610 | 611 | 612 | 613 | 614 | 616 | 617 | 618 | 619 | 620 | 621 | 622 | 623 | 624 | 625 | 626 | 627 | 628 | 629 | 630 | 631 | 632 | 633 | 634 | 635 | 636 | 637 | 638 | 639 | 640 | 641 | 642 | 643 | 644 | 645 | 646 | 647 | 648 | 649 | 650 | 651 | 652 | 653 | 654 | 655 | 656 | 657 | 658 | 664 | 665 | 666 | 667 | 668 | 669 | 670 | 671 | 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | 685 | 686 | 687 | 688 | 689 | 690 | 691 | 692 | 693 | 694 | 695 | 696 | 697 | 698 | 699 | 700 | 701 | 702 | -------------------------------------------------------------------------------- /envs/env.py: -------------------------------------------------------------------------------- 1 | """ 2 | Traffic network simulator w/ defined sumo files 3 | @author: Tianshu Chu 4 | """ 5 | import logging 6 | import numpy as np 7 | import pandas as pd 8 | import subprocess 9 | #from sumolib import checkBinary 10 | import time 11 | #import traci 12 | import xml.etree.cElementTree as ET 13 | 14 | DEFAULT_PORT = 8000 15 | SEC_IN_MS = 1000 16 | 17 | 18 | class PhaseSet: 19 | def __init__(self, phases): 20 | self.num_phase = len(phases) 21 | self.num_lane = len(phases[0]) 22 | self.phases = phases 23 | self._init_phase_set() 24 | 25 | @staticmethod 26 | def _get_phase_lanes(phase, signal='r'): 27 | phase_lanes = [] 28 | for i, l in enumerate(phase): 29 | if l == signal: 30 | phase_lanes.append(i) 31 | return phase_lanes 32 | 33 | def _init_phase_set(self): 34 | self.red_lanes = [] 35 | for phase in self.phases: 36 | self.red_lanes.append(self._get_phase_lanes(phase)) 37 | 38 | 39 | class PhaseMap: 40 | def __init__(self): 41 | self.phases = {} 42 | 43 | def get_phase(self, phase_id, action): 44 | # phase_type is either green or yellow 45 | return self.phases[phase_id].phases[int(action)] 46 | 47 | def get_phase_num(self, phase_id): 48 | return self.phases[phase_id].num_phase 49 | 50 | def get_lane_num(self, phase_id): 51 | # the lane number is link number 52 | 
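# Note: each phase is a SUMO red-yellow-green state string with one character per
# controlled link ('G'/'g' green, 'y' yellow, 'r' red), e.g. 'GGgrrrGGgrrr' in
# large_grid_env.py. PhaseSet caches the red link indices of every phase in
# red_lanes, and _get_node_phase below uses the same character convention to build
# the yellow transition phase between two green phases.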
return self.phases[phase_id].num_lane 53 | 54 | def get_red_lanes(self, phase_id, action): 55 | # the lane number is link number 56 | return self.phases[phase_id].red_lanes[int(action)] 57 | 58 | 59 | class Node: 60 | def __init__(self, name, neighbor=[], control=False): 61 | self.control = control # disabled 62 | self.lanes_in = [] 63 | self.ilds_in = [] # for state 64 | self.fingerprint = [] # local policy 65 | self.name = name 66 | self.neighbor = neighbor 67 | self.num_state = 0 # wave and wait should have the same dim 68 | self.wave_state = [] # local state 69 | self.wait_state = [] # local state 70 | self.phase_id = -1 71 | self.n_a = 0 72 | self.prev_action = -1 73 | 74 | 75 | class TrafficSimulator: 76 | def __init__(self, config, output_path, is_record, record_stats, port=0): 77 | self.name = config.get('scenario') 78 | self.seed = config.getint('seed') 79 | self.control_interval_sec = config.getint('control_interval_sec') 80 | self.yellow_interval_sec = config.getint('yellow_interval_sec') 81 | self.episode_length_sec = config.getint('episode_length_sec') 82 | self.T = np.ceil(self.episode_length_sec / self.control_interval_sec) 83 | self.port = DEFAULT_PORT + port 84 | self.sim_thread = port 85 | self.obj = config.get('objective') 86 | self.data_path = config.get('data_path') 87 | self.agent = config.get('agent') 88 | self.coop_gamma = config.getfloat('coop_gamma') 89 | self.cur_episode = 0 90 | self.norms = {'wave': config.getfloat('norm_wave'), 91 | 'wait': config.getfloat('norm_wait')} 92 | self.clips = {'wave': config.getfloat('clip_wave'), 93 | 'wait': config.getfloat('clip_wait')} 94 | self.coef_wait = config.getfloat('coef_wait') 95 | self.train_mode = True 96 | test_seeds = config.get('test_seeds').split(',') 97 | test_seeds = [int(s) for s in test_seeds] 98 | self._init_map() 99 | self.init_data(is_record, record_stats, output_path) 100 | self.init_test_seeds(test_seeds) 101 | self._init_sim(self.seed) 102 | self._init_nodes() 103 | self.terminate() 104 | 105 | def collect_tripinfo(self): 106 | # read trip xml, has to be called externally to get complete file 107 | trip_file = self.output_path + ('%s_%s_trip.xml' % (self.name, self.agent)) 108 | tree = ET.ElementTree(file=trip_file) 109 | for child in tree.getroot(): 110 | cur_trip = child.attrib 111 | cur_dict = {} 112 | cur_dict['episode'] = self.cur_episode 113 | cur_dict['id'] = cur_trip['id'] 114 | cur_dict['depart_sec'] = cur_trip['depart'] 115 | cur_dict['arrival_sec'] = cur_trip['arrival'] 116 | cur_dict['duration_sec'] = cur_trip['duration'] 117 | cur_dict['wait_step'] = cur_trip['waitingCount'] 118 | cur_dict['wait_sec'] = cur_trip['waitingTime'] 119 | self.trip_data.append(cur_dict) 120 | # delete the current xml 121 | cmd = 'rm ' + trip_file 122 | subprocess.check_call(cmd, shell=True) 123 | 124 | def get_fingerprint(self): 125 | policies = [] 126 | for node_name in self.node_names: 127 | policies.append(self.nodes[node_name].fingerprint) 128 | return np.array(policies) 129 | 130 | def get_neighbor_action(self, action): 131 | naction = [] 132 | for i in range(self.n_agent): 133 | naction.append(action[self.neighbor_mask[i] == 1]) 134 | return naction 135 | 136 | def init_data(self, is_record, record_stats, output_path): 137 | self.is_record = is_record 138 | self.record_stats = record_stats 139 | self.output_path = output_path 140 | if self.is_record: 141 | self.traffic_data = [] 142 | self.control_data = [] 143 | self.trip_data = [] 144 | if self.record_stats: 145 | self.state_stat = {} 146 | for state_name 
in self.state_names: 147 | self.state_stat[state_name] = [] 148 | 149 | def init_test_seeds(self, test_seeds): 150 | self.test_num = len(test_seeds) 151 | self.test_seeds = test_seeds 152 | 153 | def output_data(self): 154 | if not self.is_record: 155 | logging.error('Env: no record to output!') 156 | control_data = pd.DataFrame(self.control_data) 157 | control_data.to_csv(self.output_path + ('%s_%s_control.csv' % (self.name, self.agent))) 158 | traffic_data = pd.DataFrame(self.traffic_data) 159 | traffic_data.to_csv(self.output_path + ('%s_%s_traffic.csv' % (self.name, self.agent))) 160 | trip_data = pd.DataFrame(self.trip_data) 161 | trip_data.to_csv(self.output_path + ('%s_%s_trip.csv' % (self.name, self.agent))) 162 | 163 | def reset(self, gui=False, test_ind=0): 164 | # have to terminate previous sim before calling reset 165 | self._reset_state() 166 | if self.train_mode: 167 | seed = self.seed 168 | else: 169 | seed = self.test_seeds[test_ind] 170 | self._init_sim(seed, gui=gui) 171 | self.cur_sec = 0 172 | self.cur_episode += 1 173 | # initialize fingerprint 174 | self.update_fingerprint(self._init_policy()) 175 | # next environment random condition should be different 176 | self.seed += 1 177 | return self._get_state() 178 | 179 | def step(self, action): 180 | self._set_phase(action, 'yellow', self.yellow_interval_sec) 181 | self._simulate(self.yellow_interval_sec) 182 | rest_interval_sec = self.control_interval_sec - self.yellow_interval_sec 183 | self._set_phase(action, 'green', rest_interval_sec) 184 | self._simulate(rest_interval_sec) 185 | state = self._get_state() 186 | reward = self._measure_reward_step() 187 | done = False 188 | if self.cur_sec >= self.episode_length_sec: 189 | done = True 190 | global_reward = np.sum(reward) 191 | if self.is_record: 192 | action_r = ','.join(['%d' % a for a in action]) 193 | cur_control = {'episode': self.cur_episode, 194 | 'time_sec': self.cur_sec, 195 | 'step': self.cur_sec / self.control_interval_sec, 196 | 'action': action_r, 197 | 'reward': global_reward} 198 | self.control_data.append(cur_control) 199 | 200 | # use original rewards in test 201 | if not self.train_mode: 202 | return state, reward, done, global_reward 203 | if (self.agent == 'greedy') or (self.coop_gamma < 0): 204 | reward = global_reward 205 | return state, reward, done, global_reward 206 | 207 | def terminate(self): 208 | self.sim.close() 209 | 210 | def update_fingerprint(self, policy): 211 | for node_name, pi in zip(self.node_names, policy): 212 | self.nodes[node_name].fingerprint = pi 213 | 214 | def _get_node_phase(self, action, node_name, phase_type): 215 | node = self.nodes[node_name] 216 | cur_phase = self.phase_map.get_phase(node.phase_id, action) 217 | if phase_type == 'green': 218 | return cur_phase 219 | prev_action = node.prev_action 220 | node.prev_action = action 221 | if (prev_action < 0) or (action == prev_action): 222 | return cur_phase 223 | prev_phase = self.phase_map.get_phase(node.phase_id, prev_action) 224 | switch_reds = [] 225 | switch_greens = [] 226 | for i, (p0, p1) in enumerate(zip(prev_phase, cur_phase)): 227 | if (p0 in 'Gg') and (p1 == 'r'): 228 | switch_reds.append(i) 229 | elif (p0 in 'r') and (p1 in 'Gg'): 230 | switch_greens.append(i) 231 | if not len(switch_reds): 232 | return cur_phase 233 | yellow_phase = list(cur_phase) 234 | for i in switch_reds: 235 | yellow_phase[i] = 'y' 236 | for i in switch_greens: 237 | yellow_phase[i] = 'r' 238 | return ''.join(yellow_phase) 239 | 240 | def _get_node_phase_id(self, node_name): 241 | # 
needs to be overwriteen 242 | raise NotImplementedError() 243 | 244 | def _get_state(self): 245 | # hard code the state ordering as wave, wait, fp 246 | state = [] 247 | # measure the most recent state 248 | self._measure_state_step() 249 | 250 | # get the appropriate state vectors 251 | for node_name in self.node_names: 252 | node = self.nodes[node_name] 253 | # wave is required in state 254 | if self.agent == 'greedy': 255 | state.append(node.wave_state) 256 | else: 257 | cur_state = [node.wave_state] 258 | 259 | # include wave states of neighbors 260 | if self.agent.startswith('ia2c'): 261 | for nnode_name in node.neighbor: 262 | cur_state.append(self.nodes[nnode_name].wave_state) 263 | 264 | # include fingerprints of neighbors 265 | if self.agent == 'ia2c_fp': 266 | for nnode_name in node.neighbor: 267 | cur_state.append(self.nodes[nnode_name].fingerprint) 268 | 269 | # include wait state 270 | if 'wait' in self.state_names: 271 | cur_state.append(node.wait_state) 272 | state.append(np.concatenate(cur_state)) 273 | return state 274 | 275 | def _init_action_space(self): 276 | # for local and neighbor coop level 277 | self.n_agent = self.n_node 278 | # to simplify the sim, we assume all agents have the same action dim 279 | phase_id = self._get_node_phase_id('all') 280 | phase_num = self.phase_map.get_phase_num(phase_id) 281 | self.n_a = phase_num 282 | for node_name in self.node_names: 283 | node = self.nodes[node_name] 284 | node.phase_id = phase_id 285 | node.n_a = phase_num 286 | 287 | def _init_map(self): 288 | # needs to be overwriteen 289 | self.neighbor_map = None 290 | self.phase_map = None 291 | self.state_names = None 292 | raise NotImplementedError() 293 | 294 | def _init_nodes(self): 295 | nodes = {} 296 | tl_nodes = self.sim.trafficlight.getIDList() 297 | for node_name in self.node_names: 298 | if node_name not in tl_nodes: 299 | logging.error('node %s can not be found!' 
% node_name) 300 | exit(1) 301 | neighbor = self.neighbor_map[node_name] 302 | nodes[node_name] = Node(node_name, 303 | neighbor=neighbor, 304 | control=True) 305 | # controlled lanes: l:j,i_k 306 | lanes_in = self.sim.trafficlight.getControlledLanes(node_name) 307 | nodes[node_name].lanes_in = lanes_in 308 | ilds_in = [] 309 | for lane_name in lanes_in: 310 | ild_name = lane_name 311 | if ild_name not in ilds_in: 312 | ilds_in.append(ild_name) 313 | nodes[node_name].ilds_in = ilds_in 314 | self.nodes = nodes 315 | s = 'Env: init %d node information:\n' % len(self.node_names) 316 | for node in self.nodes.values(): 317 | s += node.name + ':\n' 318 | s += '\tneigbor: %r\n' % node.neighbor 319 | s += '\tilds_in: %r\n' % node.ilds_in 320 | logging.info(s) 321 | self._init_action_space() 322 | self._init_state_space() 323 | 324 | def _init_policy(self): 325 | return [np.ones(self.n_a) / self.n_a for _ in range(self.n_agent)] 326 | 327 | def _init_sim(self, seed, gui=False): 328 | sumocfg_file = self._init_sim_config(seed) 329 | if gui: 330 | app = 'sumo-gui' 331 | else: 332 | app = 'sumo' 333 | command = [checkBinary(app), '-c', sumocfg_file] 334 | command += ['--seed', str(seed)] 335 | command += ['--remote-port', str(self.port)] 336 | command += ['--no-step-log', 'True'] 337 | command += ['--time-to-teleport', '600'] # long teleport for safety 338 | command += ['--no-warnings', 'True'] 339 | command += ['--duration-log.disable', 'True'] 340 | # collect trip info if necessary 341 | if self.is_record: 342 | command += ['--tripinfo-output', 343 | self.output_path + ('%s_%s_trip.xml' % (self.name, self.agent))] 344 | subprocess.Popen(command) 345 | # wait 1s to establish the traci server 346 | time.sleep(1) 347 | self.sim = traci.connect(port=self.port) 348 | 349 | def _init_sim_config(self): 350 | # needs to be overwriteen 351 | raise NotImplementedError() 352 | 353 | def _init_state_space(self): 354 | self._reset_state() 355 | n_s_ls = [] 356 | for node_name in self.node_names: 357 | node = self.nodes[node_name] 358 | # fingerprint is previous policy 359 | node.num_fingerprint = self.n_a 360 | node.num_state = len(node.ilds_in) 361 | num_wave = node.num_state 362 | num_wait = 0 if 'wait' not in self.state_names else node.num_state 363 | if self.agent.startswith('ma2c'): 364 | num_n = 1 365 | else: 366 | num_n = 1 + len(node.neighbor) 367 | n_s_ls.append(num_wait + num_wave * num_n) 368 | if self.agent.startswith('ma2c'): 369 | assert len(set(n_s_ls)) == 1 370 | self.n_s = n_s_ls[0] 371 | else: 372 | self.n_s_ls = n_s_ls 373 | 374 | def _measure_reward_step(self): 375 | rewards = [] 376 | for node_name in self.node_names: 377 | queues = [] 378 | waits = [] 379 | for ild in self.nodes[node_name].ilds_in: 380 | if self.obj in ['queue', 'hybrid']: 381 | cur_queue = self.sim.lanearea.getLastStepHaltingNumber(ild) 382 | queues.append(cur_queue) 383 | if self.obj in ['wait', 'hybrid']: 384 | max_pos = 0 385 | car_wait = 0 386 | cur_cars = self.sim.lanearea.getLastStepVehicleIDs(ild) 387 | for vid in cur_cars: 388 | car_pos = self.sim.vehicle.getLanePosition(vid) 389 | if car_pos > max_pos: 390 | max_pos = car_pos 391 | car_wait = self.sim.vehicle.getWaitingTime(vid) 392 | waits.append(car_wait) 393 | queue = np.sum(np.array(queues)) if len(queues) else 0 394 | wait = np.sum(np.array(waits)) if len(waits) else 0 395 | if self.obj == 'queue': 396 | reward = - queue 397 | elif self.obj == 'wait': 398 | reward = - wait 399 | else: 400 | reward = - queue - self.coef_wait * wait 401 | rewards.append(reward) 
402 | return np.array(rewards) 403 | 404 | def _measure_state_step(self): 405 | for node_name in self.node_names: 406 | node = self.nodes[node_name] 407 | for state_name in self.state_names: 408 | if state_name == 'wave': 409 | cur_state = [] 410 | for ild in node.ilds_in: 411 | cur_wave = self.sim.lanearea.getLastStepVehicleNumber(ild) 412 | cur_state.append(cur_wave) 413 | cur_state = np.array(cur_state) 414 | elif state_name == 'wait': 415 | cur_state = [] 416 | for ild in node.ilds_in: 417 | max_pos = 0 418 | car_wait = 0 419 | cur_cars = self.sim.lanearea.getLastStepVehicleIDs(ild) 420 | for vid in cur_cars: 421 | car_pos = self.sim.vehicle.getLanePosition(vid) 422 | if car_pos > max_pos: 423 | max_pos = car_pos 424 | car_wait = self.sim.vehicle.getWaitingTime(vid) 425 | cur_state.append(car_wait) 426 | cur_state = np.array(cur_state) 427 | if self.record_stats: 428 | self.state_stat[state_name] += list(cur_state) 429 | # normalization 430 | norm_cur_state = self._norm_clip_state(cur_state, 431 | self.norms[state_name], 432 | self.clips[state_name]) 433 | if state_name == 'wave': 434 | node.wave_state = norm_cur_state 435 | else: 436 | node.wait_state = norm_cur_state 437 | 438 | def _measure_traffic_step(self): 439 | cars = self.sim.vehicle.getIDList() 440 | num_tot_car = len(cars) 441 | num_in_car = self.sim.simulation.getDepartedNumber() 442 | num_out_car = self.sim.simulation.getArrivedNumber() 443 | if num_tot_car > 0: 444 | avg_waiting_time = np.mean([self.sim.vehicle.getWaitingTime(car) for car in cars]) 445 | avg_speed = np.mean([self.sim.vehicle.getSpeed(car) for car in cars]) 446 | else: 447 | avg_speed = 0 448 | avg_waiting_time = 0 449 | # all trip-related measurements are not supported by traci, 450 | # need to read from outputfile afterwards 451 | queues = [] 452 | for node_name in self.node_names: 453 | for ild in self.nodes[node_name].ilds_in: 454 | lane_name = ild 455 | queues.append(self.sim.lane.getLastStepHaltingNumber(lane_name)) 456 | avg_queue = np.mean(np.array(queues)) 457 | std_queue = np.std(np.array(queues)) 458 | cur_traffic = {'episode': self.cur_episode, 459 | 'time_sec': self.cur_sec, 460 | 'number_total_car': num_tot_car, 461 | 'number_departed_car': num_in_car, 462 | 'number_arrived_car': num_out_car, 463 | 'avg_wait_sec': avg_waiting_time, 464 | 'avg_speed_mps': avg_speed, 465 | 'std_queue': std_queue, 466 | 'avg_queue': avg_queue} 467 | self.traffic_data.append(cur_traffic) 468 | 469 | @staticmethod 470 | def _norm_clip_state(x, norm, clip=-1): 471 | x = x / norm 472 | return x if clip < 0 else np.clip(x, 0, clip) 473 | 474 | def _reset_state(self): 475 | for node_name in self.node_names: 476 | node = self.nodes[node_name] 477 | # prev action for yellow phase before each switch 478 | node.prev_action = 0 479 | 480 | def _set_phase(self, action, phase_type, phase_duration): 481 | for node_name, a in zip(self.node_names, list(action)): 482 | phase = self._get_node_phase(a, node_name, phase_type) 483 | self.sim.trafficlight.setRedYellowGreenState(node_name, phase) 484 | self.sim.trafficlight.setPhaseDuration(node_name, phase_duration) 485 | 486 | def _simulate(self, num_step): 487 | # reward = np.zeros(len(self.control_node_names)) 488 | for _ in range(num_step): 489 | self.sim.simulationStep() 490 | self.cur_sec += 1 491 | if self.is_record: 492 | self._measure_traffic_step() 493 | -------------------------------------------------------------------------------- /envs/large_grid_env.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Particular class of large traffic grid 3 | @author: Tianshu Chu 4 | """ 5 | 6 | import configparser 7 | import logging 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import os 11 | import seaborn as sns 12 | import time 13 | from envs.env import PhaseMap, PhaseSet, TrafficSimulator 14 | from envs.data.build_file import gen_rou_file 15 | 16 | sns.set_color_codes() 17 | 18 | 19 | STATE_NAMES = ['wave'] 20 | PHASE_NUM = 5 21 | 22 | 23 | class LargeGridPhase(PhaseMap): 24 | def __init__(self): 25 | phases = ['GGgrrrGGgrrr', 'rrrGrGrrrGrG', 'rrrGGrrrrGGr', 26 | 'rrrGGGrrrrrr', 'rrrrrrrrrGGG'] 27 | self.phases = {PHASE_NUM: PhaseSet(phases)} 28 | 29 | 30 | class LargeGridController: 31 | def __init__(self, node_names): 32 | self.name = 'greedy' 33 | self.node_names = node_names 34 | 35 | def forward(self, obs): 36 | actions = [] 37 | for ob, node_name in zip(obs, self.node_names): 38 | actions.append(self.greedy(ob, node_name)) 39 | return actions 40 | 41 | def greedy(self, ob, node_name): 42 | # hard code the mapping from state to number of cars 43 | flows = [ob[0] + ob[3], ob[2] + ob[5], ob[1] + ob[4], 44 | ob[1] + ob[2], ob[4] + ob[5]] 45 | return np.argmax(np.array(flows)) 46 | 47 | 48 | class LargeGridEnv(TrafficSimulator): 49 | def __init__(self, config, port=0, output_path='', is_record=False, record_stat=False): 50 | self.peak_flow1 = config.getint('peak_flow1') 51 | self.peak_flow2 = config.getint('peak_flow2') 52 | self.init_density = config.getfloat('init_density') 53 | super().__init__(config, output_path, is_record, record_stat, port=port) 54 | 55 | def _get_node_phase_id(self, node_name): 56 | return PHASE_NUM 57 | 58 | def _init_neighbor_map(self): 59 | neighbor_map = {} 60 | # corner nodes 61 | neighbor_map['nt1'] = ['nt6', 'nt2'] 62 | neighbor_map['nt5'] = ['nt10', 'nt4'] 63 | neighbor_map['nt21'] = ['nt22', 'nt16'] 64 | neighbor_map['nt25'] = ['nt20', 'nt24'] 65 | # edge nodes 66 | neighbor_map['nt2'] = ['nt7', 'nt3', 'nt1'] 67 | neighbor_map['nt3'] = ['nt8', 'nt4', 'nt2'] 68 | neighbor_map['nt4'] = ['nt9', 'nt5', 'nt3'] 69 | neighbor_map['nt22'] = ['nt23', 'nt17', 'nt21'] 70 | neighbor_map['nt23'] = ['nt24', 'nt18', 'nt22'] 71 | neighbor_map['nt24'] = ['nt25', 'nt19', 'nt23'] 72 | neighbor_map['nt10'] = ['nt15', 'nt5', 'nt9'] 73 | neighbor_map['nt15'] = ['nt20', 'nt10', 'nt14'] 74 | neighbor_map['nt20'] = ['nt25', 'nt15', 'nt19'] 75 | neighbor_map['nt6'] = ['nt11', 'nt7', 'nt1'] 76 | neighbor_map['nt11'] = ['nt16', 'nt12', 'nt6'] 77 | neighbor_map['nt16'] = ['nt21', 'nt17', 'nt11'] 78 | # internal nodes 79 | for i in [7, 8, 9, 12, 13, 14, 17, 18, 19]: 80 | n_node = 'nt' + str(i + 5) 81 | s_node = 'nt' + str(i - 5) 82 | w_node = 'nt' + str(i - 1) 83 | e_node = 'nt' + str(i + 1) 84 | cur_node = 'nt' + str(i) 85 | neighbor_map[cur_node] = [n_node, e_node, s_node, w_node] 86 | self.neighbor_map = neighbor_map 87 | self.neighbor_mask = np.zeros((self.n_node, self.n_node)) 88 | for i in range(self.n_node): 89 | for nnode in neighbor_map['nt%d' % (i+1)]: 90 | ni = self.node_names.index(nnode) 91 | self.neighbor_mask[i, ni] = 1 92 | logging.info('neighbor mask:\n %r' % self.neighbor_mask) 93 | 94 | def _init_distance_map(self): 95 | block0 = np.array([[0,1,2,3,4],[1,0,1,2,3],[2,1,0,1,2],[3,2,1,0,1],[4,3,2,1,0]]) 96 | block1 = block0 + 1 97 | block2 = block0 + 2 98 | block3 = block0 + 3 99 | block4 = block0 + 4 100 | row0 = np.hstack([block0, block1, block2, block3, block4]) 
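# The 5x5 blocks assemble a 25x25 matrix of Manhattan distances between the
# intersections nt1..nt25 of the 5x5 grid: block0 holds within-row distances and
# block0 + k the distances between nodes k grid-rows apart, so the largest entry
# is 8 (matching self.max_distance = 8 in _init_map). The on-policy buffers use
# this distance_mask together with max_distance for the spatially discounted returns.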
101 | row1 = np.hstack([block1, block0, block1, block2, block3]) 102 | row2 = np.hstack([block2, block1, block0, block1, block2]) 103 | row3 = np.hstack([block3, block2, block1, block0, block1]) 104 | row4 = np.hstack([block4, block3, block2, block1, block0]) 105 | self.distance_mask = np.vstack([row0, row1, row2, row3, row4]) 106 | 107 | def _init_map(self): 108 | self.node_names = ['nt%d' % i for i in range(1, 26)] 109 | self.n_node = 25 110 | self._init_neighbor_map() 111 | # for spatial discount 112 | self._init_distance_map() 113 | self.max_distance = 8 114 | self.phase_map = LargeGridPhase() 115 | self.state_names = STATE_NAMES 116 | 117 | def _init_sim_config(self, seed): 118 | return gen_rou_file(self.data_path, 119 | self.peak_flow1, 120 | self.peak_flow2, 121 | self.init_density, 122 | seed=seed, 123 | thread=self.sim_thread) 124 | 125 | def plot_stat(self, rewards): 126 | self.state_stat['reward'] = rewards 127 | for name, data in self.state_stat.items(): 128 | fig = plt.figure(figsize=(8, 6)) 129 | plot_cdf(data) 130 | plt.ylabel(name) 131 | fig.savefig(self.output_path + self.name + '_' + name + '.png') 132 | 133 | 134 | def plot_cdf(X, c='b', label=None): 135 | sorted_data = np.sort(X) 136 | yvals = np.arange(len(sorted_data))/float(len(sorted_data)-1) 137 | plt.plot(sorted_data, yvals, color=c, label=label) 138 | 139 | if __name__ == '__main__': 140 | logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', 141 | level=logging.INFO) 142 | config = configparser.ConfigParser() 143 | config.read('./config/config_greedy.ini') 144 | base_dir = './greedy/' 145 | if not os.path.exists(base_dir): 146 | os.mkdir(base_dir) 147 | env = LargeGridEnv(config['ENV_CONFIG'], 2, base_dir, is_record=True, record_stat=True) 148 | env.train_mode = False 149 | time.sleep(1) 150 | controller = LargeGridController(env.node_names) 151 | rewards = [] 152 | for i in range(env.test_num): 153 | ob = env.reset(test_ind=i) 154 | while True: 155 | next_ob, _, done, reward = env.step(controller.forward(ob)) 156 | rewards.append(reward) 157 | if done: 158 | break 159 | ob = next_ob 160 | env.terminate() 161 | time.sleep(2) 162 | env.collect_tripinfo() 163 | env.plot_stat(np.array(rewards)) 164 | logging.info('avg reward: %.2f' % np.mean(rewards)) 165 | env.output_data() 166 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main function for training and evaluating MARL algorithms in traffic envs 3 | @author: Tianshu Chu 4 | """ 5 | 6 | import argparse 7 | import configparser 8 | import logging 9 | import tensorflow as tf 10 | import threading 11 | from envs.large_grid_env import LargeGridEnv, LargeGridController 12 | from agents.models import IA2C, IA2C_FP, IA2C_CU, MA2C_NC, MA2C_IC3, MA2C_DIAL 13 | from utils import (Counter, Trainer, Tester, Evaluator, 14 | check_dir, copy_file, find_file, 15 | init_dir, init_log, init_test_flag, 16 | plot_evaluation, plot_train) 17 | 18 | 19 | def parse_args(): 20 | default_base_dir = '/Users/tchu/Documents/rl_test/deeprl_dist/ma2c_ic3_test' 21 | default_config_dir = './config/config_ma2c_ic3.ini' 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--base-dir', type=str, required=False, 24 | default=default_base_dir, help="experiment base dir") 25 | subparsers = parser.add_subparsers(dest='option', help="train or evaluate") 26 | sp = subparsers.add_parser('train', help='train a single agent under base dir') 27 
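# Typical invocations (the default --base-dir above is a hard-coded local path,
# so override it on other machines; <exp_dir> is a placeholder):
#   python3 main.py --base-dir <exp_dir> train --config-dir ./config/config_ma2c_nc.ini
#   python3 main.py --base-dir <exp_dir> evaluate --evaluate-seeds 2000,2010,2020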
| sp.add_argument('--test-mode', type=str, required=False, 28 | default='after_train_test', 29 | help="test mode during training", 30 | choices=['no_test', 'in_train_test', 'after_train_test', 'all_test']) 31 | sp.add_argument('--config-dir', type=str, required=False, 32 | default=default_config_dir, help="experiment config path") 33 | sp = subparsers.add_parser('evaluate', help="evaluate and compare agents under base dir") 34 | sp.add_argument('--evaluate-seeds', type=str, required=False, 35 | default=','.join([str(i) for i in range(2000, 2500, 10)]), 36 | help="random seeds for evaluation, split by ,") 37 | args = parser.parse_args() 38 | if not args.option: 39 | parser.print_help() 40 | exit(1) 41 | return args 42 | 43 | 44 | def init_env(config, port=0, naive_policy=False): 45 | if not naive_policy: 46 | return LargeGridEnv(config, port=port) 47 | else: 48 | env = LargeGridEnv(config, port=port) 49 | policy = LargeGridController(env.node_names) 50 | return env, policy 51 | 52 | 53 | def init_agent(env, config, total_step, seed): 54 | if env.agent == 'ia2c': 55 | return IA2C(env.n_s_ls, env.n_a, env.neighbor_mask, env.distance_mask, env.coop_gamma, 56 | total_step, config, seed=seed) 57 | elif env.agent == 'ia2c_fp': 58 | return IA2C_FP(env.n_s_ls, env.n_a, env.neighbor_mask, env.distance_mask, env.coop_gamma, 59 | total_step, config, seed=seed) 60 | elif env.agent == 'ma2c_nc': 61 | return MA2C_NC(env.n_s, env.n_a, env.neighbor_mask, env.distance_mask, env.coop_gamma, 62 | total_step, config, seed=seed) 63 | elif env.agent == 'ma2c_ic3': 64 | return MA2C_IC3(env.n_s, env.n_a, env.neighbor_mask, env.distance_mask, env.coop_gamma, 65 | total_step, config, seed=seed) 66 | elif env.agent == 'ma2c_cu': 67 | return IA2C_CU(env.n_s, env.n_a, env.neighbor_mask, env.distance_mask, env.coop_gamma, 68 | total_step, config, seed=seed) 69 | elif env.agent == 'ma2c_dial': 70 | return MA2C_DIAL(env.n_s, env.n_a, env.neighbor_mask, env.distance_mask, env.coop_gamma, 71 | total_step, config, seed=seed) 72 | else: 73 | return None 74 | 75 | 76 | def train(args): 77 | base_dir = args.base_dir 78 | dirs = init_dir(base_dir) 79 | init_log(dirs['log']) 80 | config_dir = args.config_dir 81 | copy_file(config_dir, dirs['data']) 82 | config = configparser.ConfigParser() 83 | config.read(config_dir) 84 | in_test, post_test = init_test_flag(args.test_mode) 85 | 86 | # init env 87 | env = init_env(config['ENV_CONFIG']) 88 | logging.info('Training: a dim %d, agent dim: %d' % (env.n_a, env.n_agent)) 89 | 90 | # init step counter 91 | total_step = int(config.getfloat('TRAIN_CONFIG', 'total_step')) 92 | test_step = int(config.getfloat('TRAIN_CONFIG', 'test_interval')) 93 | log_step = int(config.getfloat('TRAIN_CONFIG', 'log_interval')) 94 | global_counter = Counter(total_step, test_step, log_step) 95 | 96 | # init centralized or multi agent 97 | seed = config.getint('ENV_CONFIG', 'seed') 98 | model = init_agent(env, config['MODEL_CONFIG'], total_step, seed) 99 | 100 | # disable multi-threading for safe SUMO implementation 101 | summary_writer = tf.summary.FileWriter(dirs['log']) 102 | trainer = Trainer(env, model, global_counter, summary_writer, in_test, output_path=dirs['data']) 103 | trainer.run() 104 | 105 | # save model 106 | final_step = global_counter.cur_step 107 | logging.info('Training: save final model at step %d ...' 
% final_step) 108 | model.save(dirs['model'], final_step) 109 | 110 | # post-training test 111 | if post_test: 112 | test_dirs = init_dir(base_dir, pathes=['eva_data']) 113 | evaluator = Evaluator(env, model, test_dirs['eva_data']) 114 | evaluator.run() 115 | 116 | 117 | def evaluate_fn(agent_dir, output_dir, seeds, port): 118 | agent = agent_dir.split('/')[-1] 119 | if not check_dir(agent_dir): 120 | logging.error('Evaluation: %s does not exist!' % agent) 121 | return 122 | # load config file for env 123 | config_dir = find_file(agent_dir + '/data/') 124 | if not config_dir: 125 | return 126 | config = configparser.ConfigParser() 127 | config.read(config_dir) 128 | 129 | # init env 130 | env, greedy_policy = init_env(config['ENV_CONFIG'], port=port, naive_policy=True) 131 | env.init_test_seeds(seeds) 132 | 133 | # load model for agent 134 | if agent != 'greedy': 135 | # init centralized or multi agent 136 | model = init_agent(env, config['MODEL_CONFIG'], 0, 0) 137 | if model is None: 138 | return 139 | if not model.load(agent_dir + '/model/'): 140 | return 141 | else: 142 | model = greedy_policy 143 | # collect evaluation data 144 | evaluator = Evaluator(env, model, output_dir) 145 | evaluator.run() 146 | 147 | 148 | def evaluate(args): 149 | base_dir = args.base_dir 150 | dirs = init_dir(base_dir, pathes=['eva_data', 'eva_log']) 151 | init_log(dirs['eva_log']) 152 | # enforce the same evaluation seeds across agents 153 | seeds = args.evaluate_seeds 154 | logging.info('Evaluation: random seeds: %s' % seeds) 155 | if not seeds: 156 | seeds = [] 157 | else: 158 | seeds = [int(s) for s in seeds.split(',')] 159 | evaluate_fn(base_dir, dirs['eva_data'], seeds, 1) 160 | 161 | 162 | if __name__ == '__main__': 163 | args = parse_args() 164 | if args.option == 'train': 165 | train(args) 166 | else: 167 | evaluate(args) 168 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import logging 3 | import numpy as np 4 | import tensorflow as tf 5 | import time 6 | import os 7 | import pandas as pd 8 | import subprocess 9 | 10 | 11 | def check_dir(cur_dir): 12 | if not os.path.exists(cur_dir): 13 | return False 14 | return True 15 | 16 | 17 | def copy_file(src_dir, tar_dir): 18 | cmd = 'cp %s %s' % (src_dir, tar_dir) 19 | subprocess.check_call(cmd, shell=True) 20 | 21 | 22 | def find_file(cur_dir, suffix='.ini'): 23 | for file in os.listdir(cur_dir): 24 | if file.endswith(suffix): 25 | return cur_dir + '/' + file 26 | logging.error('Cannot find %s file' % suffix) 27 | return None 28 | 29 | 30 | def init_dir(base_dir, pathes=['log', 'data', 'model']): 31 | if not os.path.exists(base_dir): 32 | os.mkdir(base_dir) 33 | dirs = {} 34 | for path in pathes: 35 | cur_dir = base_dir + '/%s/' % path 36 | if not os.path.exists(cur_dir): 37 | os.mkdir(cur_dir) 38 | dirs[path] = cur_dir 39 | return dirs 40 | 41 | 42 | def init_log(log_dir): 43 | logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', 44 | level=logging.INFO, 45 | handlers=[ 46 | logging.FileHandler('%s/%d.log' % (log_dir, time.time())), 47 | logging.StreamHandler() 48 | ]) 49 | 50 | 51 | def init_test_flag(test_mode): 52 | if test_mode == 'no_test': 53 | return False, False 54 | if test_mode == 'in_train_test': 55 | return True, False 56 | if test_mode == 'after_train_test': 57 | return False, True 58 | if test_mode == 'all_test': 59 | return True, True 60 | return False, False 61 | 62 | 63 
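# init_test_flag maps the --test-mode option from main.py to two booleans
# (run tests during training, run tests after training):
#   'no_test'          -> (False, False)
#   'in_train_test'    -> (True,  False)
#   'after_train_test' -> (False, True)
#   'all_test'         -> (True,  True)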
63 | def plot_train(data_dirs, labels):
64 |     pass
65 | 
66 | def plot_evaluation(data_dirs, labels):
67 |     pass
68 | 
69 | 
70 | class Counter:
71 |     def __init__(self, total_step, test_step, log_step):
72 |         self.counter = itertools.count(1)
73 |         self.cur_step = 0
74 |         self.cur_test_step = 0
75 |         self.total_step = total_step
76 |         self.test_step = test_step
77 |         self.log_step = log_step
78 |         self.stop = False
79 | 
80 |     def next(self):
81 |         self.cur_step = next(self.counter)
82 |         return self.cur_step
83 | 
84 |     def should_test(self):
85 |         test = False
86 |         if (self.cur_step - self.cur_test_step) >= self.test_step:
87 |             test = True
88 |             self.cur_test_step = self.cur_step
89 |         return test
90 | 
91 |     def should_log(self):
92 |         return (self.cur_step % self.log_step == 0)
93 | 
94 |     def should_stop(self):
95 |         if self.cur_step >= self.total_step:
96 |             return True
97 |         return self.stop
98 | 
99 | 
100 | class Trainer():
101 |     def __init__(self, env, model, global_counter, summary_writer, run_test, output_path=None):
102 |         self.cur_step = 0
103 |         self.global_counter = global_counter
104 |         self.env = env
105 |         self.agent = self.env.agent
106 |         self.model = model
107 |         self.sess = self.model.sess
108 |         self.n_step = self.model.n_step
109 |         self.summary_writer = summary_writer
110 |         self.run_test = run_test
111 |         assert self.env.T % self.n_step == 0
112 |         self.data = []
113 |         self.output_path = output_path
114 |         if run_test:
115 |             self.test_num = self.env.test_num
116 |             logging.info('Testing: total test num: %d' % self.test_num)
117 |         self._init_summary()
118 | 
119 |     def _init_summary(self):
120 |         self.train_reward = tf.placeholder(tf.float32, [])
121 |         self.train_summary = tf.summary.scalar('train_reward', self.train_reward)
122 |         self.test_reward = tf.placeholder(tf.float32, [])
123 |         self.test_summary = tf.summary.scalar('test_reward', self.test_reward)
124 | 
125 |     def _add_summary(self, reward, global_step, is_train=True):
126 |         if is_train:
127 |             summ = self.sess.run(self.train_summary, {self.train_reward: reward})
128 |         else:
129 |             summ = self.sess.run(self.test_summary, {self.test_reward: reward})
130 |         self.summary_writer.add_summary(summ, global_step=global_step)
131 | 
132 |     def _get_policy(self, ob, done, mode='train'):
133 |         if self.agent.startswith('ma2c'):
134 |             self.ps = self.env.get_fingerprint()
135 |             policy = self.model.forward(np.array(ob), done, self.ps)
136 |         else:
137 |             policy = self.model.forward(ob, done)
138 |         action = []
139 |         for pi in policy:
140 |             if mode == 'train':
141 |                 action.append(np.random.choice(np.arange(len(pi)), p=pi))
142 |             else:
143 |                 action.append(np.argmax(pi))
144 |         return policy, np.array(action)
145 | 
146 |     def _get_value(self, ob, done, action):
147 |         if self.agent.startswith('ma2c'):
148 |             value = self.model.forward(np.array(ob), done, self.ps, np.array(action), 'v')
149 |         else:
150 |             self.naction = self.env.get_neighbor_action(action)
151 |             value = self.model.forward(ob, done, self.naction, 'v')
152 |         return value
153 | 
154 |     def explore(self, prev_ob, prev_done):
155 |         ob = prev_ob
156 |         done = prev_done
157 |         rewards = []
158 |         for _ in range(self.n_step):
159 |             # pre-decision
160 |             policy, action = self._get_policy(ob, done)
161 |             # post-decision
162 |             value = self._get_value(ob, done, action)
163 |             # transition
164 |             self.env.update_fingerprint(policy)
165 |             next_ob, reward, done, global_reward = self.env.step(action)
166 |             rewards.append(global_reward)
167 |             global_step = self.global_counter.next()
168 |             self.cur_step += 1
169 |             # collect experience
170 |             if self.agent.startswith('ma2c'):
171 |                 self.model.add_transition(ob, self.ps, action, reward, value, done)
172 |             else:
173 |                 self.model.add_transition(ob, self.naction, action, reward, value, done)
174 |             # logging
175 |             if self.global_counter.should_log():
176 |                 logging.info('''Training: global step %d, episode step %d,
177 |                                 ob: %s, a: %s, pi: %s, r: %.2f, train r: %.2f, done: %r''' %
178 |                              (global_step, self.cur_step,
179 |                               str(ob), str(action), str(policy), global_reward, np.mean(reward), done))
180 |             if done:
181 |                 break
182 |             ob = next_ob
183 |         if done:
184 |             R = np.zeros(self.model.n_agent)
185 |         else:
186 |             _, action = self._get_policy(ob, done)
187 |             R = self._get_value(ob, done, action)
188 |         return ob, done, R, rewards
189 | 
190 |     def perform(self, test_ind):
191 |         ob = self.env.reset(test_ind=test_ind)
192 |         rewards = []
193 |         while True:
194 |             if self.agent == 'greedy':
195 |                 action = self.model.forward(ob)
196 |             else:
197 |                 # in on-policy learning, test policy has to be stochastic
198 |                 # policy, action = self._get_policy(ob, False, mode='test')
199 |                 policy, action = self._get_policy(ob, False)
200 |                 self.env.update_fingerprint(policy)
201 |             next_ob, reward, done, global_reward = self.env.step(action)
202 |             rewards.append(global_reward)
203 |             if done:
204 |                 break
205 |             ob = next_ob
206 |         mean_reward = np.mean(np.array(rewards))
207 |         std_reward = np.std(np.array(rewards))
208 |         return mean_reward, std_reward
209 | 
210 |     def run_thread(self, coord):
211 |         '''Multi-threading is disabled'''
212 |         ob = self.env.reset()
213 |         done = False
214 |         cum_reward = 0
215 |         while not coord.should_stop():
216 |             ob, done, R, _ = self.explore(ob, done)  # explore() returns (ob, done, R, rewards)
217 |             global_step = self.global_counter.cur_step
218 |             if self.agent.endswith('a2c'):
219 |                 self.model.backward(R, self.summary_writer, global_step)
220 |             else:
221 |                 self.model.backward(self.summary_writer, global_step)
222 |             self.summary_writer.flush()
223 |             if (self.global_counter.should_stop()) and (not coord.should_stop()):
224 |                 self.env.terminate()
225 |                 coord.request_stop()
226 |                 logging.info('Training: stop condition reached!')
227 |                 return
228 | 
229 |     def run(self):
230 |         while not self.global_counter.should_stop():
231 |             # test
232 |             if self.run_test and self.global_counter.should_test():
233 |                 rewards = []
234 |                 global_step = self.global_counter.cur_step
235 |                 self.env.train_mode = False
236 |                 for test_ind in range(self.test_num):
237 |                     mean_reward, std_reward = self.perform(test_ind)
238 |                     self.env.terminate()
239 |                     rewards.append(mean_reward)
240 |                     log = {'agent': self.agent,
241 |                            'step': global_step,
242 |                            'test_id': test_ind,
243 |                            'avg_reward': mean_reward,
244 |                            'std_reward': std_reward}
245 |                     self.data.append(log)
246 |                 avg_reward = np.mean(np.array(rewards))
247 |                 self._add_summary(avg_reward, global_step, is_train=False)
248 |                 logging.info('Testing: global step %d, avg R: %.2f' %
249 |                              (global_step, avg_reward))
250 |             # train
251 |             self.env.train_mode = True
252 |             ob = self.env.reset()
253 |             done = False
254 |             self.cur_step = 0
255 |             rewards = []
256 |             while True:
257 |                 ob, done, R, cur_rewards = self.explore(ob, done)
258 |                 dt = self.env.T - self.cur_step
259 |                 rewards += cur_rewards
260 |                 global_step = self.global_counter.cur_step
261 |                 self.model.backward(R, dt, self.summary_writer, global_step)
262 |                 # termination
263 |                 if done:
264 |                     self.env.terminate()
265 |                     break
266 |             rewards = np.array(rewards)
267 |             mean_reward = np.mean(rewards)
268 |             std_reward = np.std(rewards)
269 |             log = {'agent': self.agent,
270 |                    'step': global_step,
271 |                    'test_id': -1,
272 |                    'avg_reward': mean_reward,
273 |                    'std_reward': std_reward}
274 |             self.data.append(log)
275 |             self._add_summary(mean_reward, global_step)
276 |             self.summary_writer.flush()
277 |         df = pd.DataFrame(self.data)
278 |         df.to_csv(self.output_path + 'train_reward.csv')
279 | 
280 | 
281 | class Tester(Trainer):
282 |     def __init__(self, env, model, global_counter, summary_writer, output_path):
283 |         super().__init__(env, model, global_counter, summary_writer, run_test=False)  # Trainer.__init__ requires a run_test flag
284 |         self.env.train_mode = False
285 |         self.test_num = self.env.test_num
286 |         self.output_path = output_path
287 |         self.data = []
288 |         logging.info('Testing: total test num: %d' % self.test_num)
289 | 
290 |     def _init_summary(self):
291 |         self.reward = tf.placeholder(tf.float32, [])
292 |         self.summary = tf.summary.scalar('test_reward', self.reward)
293 | 
294 |     def run_offline(self):
295 |         # enable traffic measurements for offline test
296 |         is_record = True
297 |         record_stats = False
298 |         self.env.cur_episode = 0
299 |         self.env.init_data(is_record, record_stats, self.output_path)
300 |         rewards = []
301 |         for test_ind in range(self.test_num):
302 |             rewards.append(self.perform(test_ind)[0])  # perform() returns (mean_reward, std_reward)
303 |             self.env.terminate()
304 |             time.sleep(2)
305 |             self.env.collect_tripinfo()
306 |         avg_reward = np.mean(np.array(rewards))
307 |         logging.info('Offline testing: avg R: %.2f' % avg_reward)
308 |         self.env.output_data()
309 | 
310 |     def run_online(self, coord):
311 |         self.env.cur_episode = 0
312 |         while not coord.should_stop():
313 |             time.sleep(30)
314 |             if self.global_counter.should_test():
315 |                 rewards = []
316 |                 global_step = self.global_counter.cur_step
317 |                 for test_ind in range(self.test_num):
318 |                     cur_reward, _ = self.perform(test_ind)  # keep the mean reward only
319 |                     self.env.terminate()
320 |                     rewards.append(cur_reward)
321 |                     log = {'agent': self.agent,
322 |                            'step': global_step,
323 |                            'test_id': test_ind,
324 |                            'reward': cur_reward}
325 |                     self.data.append(log)
326 |                 avg_reward = np.mean(np.array(rewards))
327 |                 self._add_summary(avg_reward, global_step)
328 |                 logging.info('Testing: global step %d, avg R: %.2f' %
329 |                              (global_step, avg_reward))
330 |                 # self.global_counter.update_test(avg_reward)
331 |         df = pd.DataFrame(self.data)
332 |         df.to_csv(self.output_path + 'train_reward.csv')
333 | 
334 | 
335 | class Evaluator(Tester):
336 |     def __init__(self, env, model, output_path):
337 |         self.env = env
338 |         self.model = model
339 |         self.agent = self.env.agent
340 |         self.env.train_mode = False
341 |         self.test_num = self.env.test_num
342 |         self.output_path = output_path
343 | 
344 |     def run(self):
345 |         is_record = True
346 |         record_stats = False
347 |         self.env.cur_episode = 0
348 |         self.env.init_data(is_record, record_stats, self.output_path)
349 |         time.sleep(1)
350 |         for test_ind in range(self.test_num):
351 |             reward, _ = self.perform(test_ind)
352 |             self.env.terminate()
53 |             logging.info('test %i, avg reward %.2f' % (test_ind, reward))
354 |             time.sleep(2)
355 |             self.env.collect_tripinfo()
356 |         self.env.output_data()
357 | 
--------------------------------------------------------------------------------
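
Note: `plot_train` and `plot_evaluation` in `utils.py` above are empty stubs. The sketch below is one possible way to fill them in; it is not part of the original repository. It assumes the `train_reward.csv` layout written by `Trainer`/`Tester` (columns `agent`, `step`, `test_id`, `avg_reward`, `std_reward`, with `test_id = -1` marking training episodes), and it pulls in `matplotlib`, which is an extra dependency not listed in the README.

```python
# Hypothetical sketch only -- not part of the original repo.
# Assumes the CSV schema written by Trainer/Tester in utils.py:
#   agent, step, test_id, avg_reward, std_reward
# where test_id == -1 marks training episodes and test_id >= 0 marks test episodes.
import os

import matplotlib.pyplot as plt
import pandas as pd


def plot_train(data_dirs, labels):
    """Plot smoothed training-reward curves, one line per agent."""
    plt.figure()
    for data_dir, label in zip(data_dirs, labels):
        df = pd.read_csv(os.path.join(data_dir, 'train_reward.csv'))
        df = df[df.test_id == -1]
        # rolling mean over episodes to smooth the noisy per-episode reward
        smoothed = df.avg_reward.rolling(window=10, min_periods=1).mean()
        plt.plot(df.step, smoothed, label=label)
    plt.xlabel('training step')
    plt.ylabel('average episode reward')
    plt.legend()
    plt.savefig('train_reward.png')


def plot_evaluation(data_dirs, labels):
    """Plot mean +/- std of in-training test rewards as a bar chart."""
    means, stds = [], []
    for data_dir in data_dirs:
        df = pd.read_csv(os.path.join(data_dir, 'train_reward.csv'))
        df = df[df.test_id >= 0]
        means.append(df.avg_reward.mean())
        stds.append(df.avg_reward.std())
    plt.figure()
    plt.bar(range(len(labels)), means, yerr=stds, tick_label=labels)
    plt.ylabel('average test reward')
    plt.savefig('evaluation_reward.png')
```

These could be called from a small post-processing script, e.g. `plot_train(['ia2c/data', 'ma2c_nc/data'], ['IA2C', 'MA2C-NC'])`, assuming those output directories exist from earlier runs.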