├── README.md
├── agents
│   ├── __pycache__
│   │   ├── __init__.cpython-37.pyc
│   │   ├── models.cpython-37.pyc
│   │   ├── policies.cpython-37.pyc
│   │   └── utils.cpython-37.pyc
│   ├── models.py
│   ├── policies.py
│   └── utils.py
├── config
│   ├── config_greedy.ini
│   ├── config_ia2c.ini
│   ├── config_ia2c_cu.ini
│   ├── config_ia2c_fp.ini
│   ├── config_ma2c_dial.ini
│   ├── config_ma2c_ic3.ini
│   └── config_ma2c_nc.ini
├── envs
│   ├── __pycache__
│   │   ├── __init__.cpython-37.pyc
│   │   ├── env.cpython-37.pyc
│   │   └── large_grid_env.cpython-37.pyc
│   ├── data
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-37.pyc
│   │   │   └── build_file.cpython-37.pyc
│   │   ├── build_file.py
│   │   ├── intersection.pdf
│   │   ├── network.pdf
│   │   └── view.xml
│   ├── env.py
│   └── large_grid_env.py
├── main.py
└── utils.py
/README.md:
--------------------------------------------------------------------------------
1 | # Networked Multi-agent Deep RL
 2 | This repo implements state-of-the-art methods for deep RL in a networked multi-agent system, in which each agent's observability and communication are limited to its local neighborhood. For a fair comparison, all methods are applied to A2C agents.
3 | Under construction ...
4 |
5 | Available IA2C algorithms:
6 | * PolicyInferring: [Lowe, Ryan, et al. "Multi-agent actor-critic for mixed cooperative-competitive environments." Advances in Neural Information Processing Systems, 2017.](https://papers.nips.cc/paper/7217-multi-agent-actor-critic-for-mixed-cooperative-competitive-environments.pdf)
7 | * FingerPrint: [Foerster, Jakob, et al. "Stabilising experience replay for deep multi-agent reinforcement learning." arXiv preprint arXiv:1702.08887, 2017.](https://arxiv.org/pdf/1702.08887.pdf)
8 | * ConsensusUpdate: [Zhang, Kaiqing, et al. "Fully decentralized multi-agent reinforcement learning with networked agents." arXiv preprint arXiv:1802.08757, 2018.](https://arxiv.org/pdf/1802.08757.pdf)
9 |
10 |
11 | Available MA2C algorithms:
12 | * DIAL: [Foerster, Jakob, et al. "Learning to communicate with deep multi-agent reinforcement learning." Advances in Neural Information Processing Systems. 2016.](http://papers.nips.cc/paper/6042-learning-to-communicate-with-deep-multi-agent-reinforcement-learning.pdf)
13 | * CommNet: [Sukhbaatar, Sainbayar, et al. "Learning multiagent communication with backpropagation." Advances in Neural Information Processing Systems, 2016.](https://arxiv.org/pdf/1605.07736.pdf)
14 | * NeurComm: [Gilmer, Justin, et al. "Neural message passing for quantum chemistry." arXiv preprint arXiv:1704.01212, 2017.](https://arxiv.org/pdf/1704.01212.pdf)
15 |
16 | ## Requirements
17 | * Python3
18 | * [Tensorflow](http://www.tensorflow.org/install)
19 | * [SUMO](http://sumo.dlr.de/wiki/Installing)
20 |
21 |
22 |
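23 | ## Usage (sketch)
24 | Training is wired up in `main.py` together with the SUMO environment in `envs/`; the exact command-line options are not documented here yet. Below is a rough, illustrative sketch of how an agent from `agents/models.py` can be constructed from one of the config files. The state sizes and masks are placeholders, not the real `large_grid` settings.
25 | 
26 | ```python
27 | import configparser
28 | import numpy as np
29 | from agents.models import IA2C
30 | 
31 | config = configparser.ConfigParser()
32 | config.read('./config/config_ia2c.ini')
33 | total_step = int(config.getfloat('TRAIN_CONFIG', 'total_step'))
34 | 
35 | # placeholder 2-agent topology: each agent observes 12 state features,
36 | # picks one of 5 actions, and neighbors only the other agent
37 | n_s_ls = [12, 12]
38 | n_a = 5
39 | neighbor_mask = np.array([[0, 1], [1, 0]])
40 | distance_mask = np.array([[0, 1], [1, 0]])
41 | 
42 | agent = IA2C(n_s_ls, n_a, neighbor_mask, distance_mask,
43 |              config.getfloat('ENV_CONFIG', 'coop_gamma'), total_step,
44 |              config['MODEL_CONFIG'], seed=config.getint('ENV_CONFIG', 'seed'))
45 | ```
46 | The agent classes share one loop interface: `forward` to get action probabilities (or values with `out_type='v'`), `add_transition` after each environment step, and `backward` every `batch_size` steps with the bootstrapped returns.
47 | 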
--------------------------------------------------------------------------------
/agents/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MUmarJaved/MultiAgent-Distributed-Reinforcement-Learning/d5a0d7226011f7050f643b66e51e141277dd0e58/agents/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/agents/__pycache__/models.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MUmarJaved/MultiAgent-Distributed-Reinforcement-Learning/d5a0d7226011f7050f643b66e51e141277dd0e58/agents/__pycache__/models.cpython-37.pyc
--------------------------------------------------------------------------------
/agents/__pycache__/policies.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MUmarJaved/MultiAgent-Distributed-Reinforcement-Learning/d5a0d7226011f7050f643b66e51e141277dd0e58/agents/__pycache__/policies.cpython-37.pyc
--------------------------------------------------------------------------------
/agents/__pycache__/utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MUmarJaved/MultiAgent-Distributed-Reinforcement-Learning/d5a0d7226011f7050f643b66e51e141277dd0e58/agents/__pycache__/utils.cpython-37.pyc
--------------------------------------------------------------------------------
/agents/models.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | from agents.utils import OnPolicyBuffer, MultiAgentOnPolicyBuffer, Scheduler
4 | from agents.policies import (LstmPolicy, FPPolicy, ConsensusPolicy, NCMultiAgentPolicy,
5 | IC3MultiAgentPolicy, DIALMultiAgentPolicy)
6 | import logging
7 | import numpy as np
8 | import tensorflow as tf
9 |
10 |
11 | class IA2C:
12 | """
 13 |     The basic IA2C implementation: each agent has a decentralized actor, and its
 14 |     critic is centralized over the local neighborhood only.
15 | """
16 | def __init__(self, n_s_ls, n_a, neighbor_mask, distance_mask, coop_gamma,
17 | total_step, model_config, seed=0):
18 | self.name = 'ia2c'
19 | self._init_algo(n_s_ls, n_a, neighbor_mask, distance_mask, coop_gamma,
20 | total_step, seed, model_config)
21 |
22 | def add_transition(self, ob, naction, action, reward, value, done):
23 | if self.reward_norm > 0:
24 | reward = reward / self.reward_norm
25 | if self.reward_clip > 0:
26 | reward = np.clip(reward, -self.reward_clip, self.reward_clip)
27 | for i in range(self.n_agent):
28 | self.trans_buffer[i].add_transition(ob[i], naction[i], action[i], reward, value[i], done)
29 |
30 | def backward(self, Rends, dt, summary_writer=None, global_step=None):
31 | cur_lr = self.lr_scheduler.get(self.n_step)
32 | for i in range(self.n_agent):
33 | obs, nas, acts, dones, Rs, Advs = self.trans_buffer[i].sample_transition(Rends[i], dt)
34 | if i == 0:
35 | self.policy[i].backward(self.sess, obs, nas, acts, dones, Rs, Advs, cur_lr,
36 | summary_writer=summary_writer, global_step=global_step)
37 | else:
38 | self.policy[i].backward(self.sess, obs, nas, acts, dones, Rs, Advs, cur_lr)
39 |
40 | def forward(self, obs, done, nactions=None, out_type='p'):
41 | out = []
42 | if nactions is None:
43 | nactions = [None] * self.n_agent
44 | for i in range(self.n_agent):
45 | cur_out = self.policy[i].forward(self.sess, obs[i], done, nactions[i], out_type)
46 | out.append(cur_out)
47 | return np.array(out)
48 |
49 | def load(self, model_dir, checkpoint=None):
50 | save_file = None
51 | save_step = 0
52 | if os.path.exists(model_dir):
53 | if checkpoint is None:
54 | for file in os.listdir(model_dir):
55 | if file.startswith('checkpoint'):
56 | prefix = file.split('.')[0]
57 | tokens = prefix.split('-')
58 | if len(tokens) != 2:
59 | continue
60 | cur_step = int(tokens[1])
61 | if cur_step > save_step:
62 | save_file = prefix
63 | save_step = cur_step
64 | else:
65 | save_file = 'checkpoint-' + str(int(checkpoint))
66 | if save_file is not None:
67 | self.saver.restore(self.sess, model_dir + save_file)
68 | logging.info('Checkpoint loaded: %s' % save_file)
69 | return True
70 | logging.error('Can not find old checkpoint for %s' % model_dir)
71 | return False
72 |
73 | def save(self, model_dir, global_step):
74 | self.saver.save(self.sess, model_dir + 'checkpoint', global_step=global_step)
75 |
76 | def _init_algo(self, n_s_ls, n_a, neighbor_mask, distance_mask, coop_gamma,
77 | total_step, seed, model_config):
78 | # init params
79 | if self.name.startswith('ia2c'):
80 | self.n_s_ls = n_s_ls
81 | else:
82 | self.n_s = n_s_ls
83 | self.n_a = n_a
84 | self.neighbor_mask = neighbor_mask
85 | self.n_agent = len(self.neighbor_mask)
86 | self.reward_clip = model_config.getfloat('reward_clip')
87 | self.reward_norm = model_config.getfloat('reward_norm')
88 | self.n_step = model_config.getint('batch_size')
89 | self.n_fc = model_config.getint('num_fc')
90 | self.n_lstm = model_config.getint('num_lstm')
91 | # init tf
92 | tf.reset_default_graph()
93 | tf.set_random_seed(seed)
94 | config = tf.ConfigProto(allow_soft_placement=True)
95 | self.sess = tf.Session(config=config)
96 | self.policy = self._init_policy()
97 | self.saver = tf.train.Saver(max_to_keep=5)
98 | # init exp buffer and lr scheduler for training
99 | if total_step:
100 | self.total_step = total_step
101 | self._init_train(model_config, distance_mask, coop_gamma)
102 | self.sess.run(tf.global_variables_initializer())
103 |
104 | def _init_policy(self):
105 | policy = []
106 | for i in range(self.n_agent):
107 | n_n = np.sum(self.neighbor_mask[i])
108 | policy.append(LstmPolicy(self.n_s_ls[i], self.n_a, n_n, self.n_step,
109 | n_fc=self.n_fc, n_lstm=self.n_lstm, name='%d' % i))
110 | return policy
111 |
112 | def _init_scheduler(self, model_config):
113 | # init lr scheduler
114 | lr_init = model_config.getfloat('lr_init')
115 | lr_decay = model_config.get('lr_decay')
116 | if lr_decay == 'constant':
117 | self.lr_scheduler = Scheduler(lr_init, decay=lr_decay)
118 | else:
119 | lr_min = model_config.getfloat('lr_min')
120 | self.lr_scheduler = Scheduler(lr_init, lr_min, self.total_step, decay=lr_decay)
121 |
122 | def _init_train(self, model_config, distance_mask, coop_gamma):
123 | # init lr scheduler
124 | self._init_scheduler(model_config)
125 | v_coef = model_config.getfloat('value_coef')
126 | e_coef = model_config.getfloat('entropy_coef')
127 | max_grad_norm = model_config.getfloat('max_grad_norm')
128 | alpha = model_config.getfloat('rmsp_alpha')
129 | epsilon = model_config.getfloat('rmsp_epsilon')
130 | gamma = model_config.getfloat('gamma')
131 | self.trans_buffer = []
132 | for i in range(self.n_agent):
133 | # init loss
134 | self.policy[i].prepare_loss(v_coef, e_coef, max_grad_norm, alpha, epsilon)
135 | # init replay buffer
136 | self.trans_buffer.append(OnPolicyBuffer(gamma, coop_gamma, distance_mask[i]))
137 |
138 |
139 | class IA2C_FP(IA2C):
140 | """
141 | In fingerprint IA2C, neighborhood policies (fingerprints) are also included.
142 | """
143 | def __init__(self, n_s_ls, n_a, neighbor_mask, distance_mask, coop_gamma,
144 | total_step, model_config, seed=0):
145 | self.name = 'ia2c_fp'
146 | self._init_algo(n_s_ls, n_a, neighbor_mask, distance_mask, coop_gamma,
147 | total_step, seed, model_config)
148 |
149 | def _init_policy(self):
150 | policy = []
151 | for i in range(self.n_agent):
152 | n_n = np.sum(self.neighbor_mask[i])
153 | # neighborhood policies are included in local state
154 | n_s1 = self.n_s_ls[i] + self.n_a*n_n
155 | policy.append(FPPolicy(n_s1, self.n_a, n_n, self.n_step, n_fc=self.n_fc,
156 | n_lstm=self.n_lstm, name='%d' % i))
157 | return policy
158 |
159 |
160 | class MA2C_NC(IA2C):
161 | def __init__(self, n_s, n_a, neighbor_mask, distance_mask, coop_gamma,
162 | total_step, model_config, seed=0):
163 | self.name = 'ma2c_nc'
164 | self._init_algo(n_s, n_a, neighbor_mask, distance_mask, coop_gamma,
165 | total_step, seed, model_config)
166 |
167 | def add_transition(self, ob, p, action, reward, value, done):
168 | if self.reward_norm > 0:
169 | reward = reward / self.reward_norm
170 | if self.reward_clip > 0:
171 | reward = np.clip(reward, -self.reward_clip, self.reward_clip)
172 | self.trans_buffer.add_transition(ob, p, action, reward, value, done)
173 |
174 | def backward(self, Rends, dt, summary_writer=None, global_step=None):
175 | cur_lr = self.lr_scheduler.get(self.n_step)
176 | obs, ps, acts, dones, Rs, Advs = self.trans_buffer.sample_transition(Rends, dt)
177 | self.policy.backward(self.sess, obs, ps, acts, dones, Rs, Advs, cur_lr,
178 | summary_writer=summary_writer, global_step=global_step)
179 |
180 | def forward(self, obs, done, ps, actions=None, out_type='p'):
181 | return self.policy.forward(self.sess, obs, done, ps, actions, out_type)
182 |
183 | def _init_policy(self):
184 | return NCMultiAgentPolicy(self.n_s, self.n_a, self.n_agent, self.n_step,
185 | self.neighbor_mask, n_fc=self.n_fc, n_h=self.n_lstm)
186 |
187 | def _init_train(self, model_config, distance_mask, coop_gamma):
188 | # init lr scheduler
189 | self._init_scheduler(model_config)
190 | v_coef = model_config.getfloat('value_coef')
191 | e_coef = model_config.getfloat('entropy_coef')
192 | max_grad_norm = model_config.getfloat('max_grad_norm')
193 | alpha = model_config.getfloat('rmsp_alpha')
194 | epsilon = model_config.getfloat('rmsp_epsilon')
195 | gamma = model_config.getfloat('gamma')
196 | # init loss
197 | self.policy.prepare_loss(v_coef, e_coef, max_grad_norm, alpha, epsilon)
198 | # init replay buffer
199 | self.trans_buffer = MultiAgentOnPolicyBuffer(gamma, coop_gamma, distance_mask)
200 |
201 |
202 | class IA2C_CU(MA2C_NC):
203 | def __init__(self, n_s, n_a, neighbor_mask, distance_mask, coop_gamma,
204 | total_step, model_config, seed=0):
205 | self.name = 'ma2c_cu'
206 | self._init_algo(n_s, n_a, neighbor_mask, distance_mask, coop_gamma,
207 | total_step, seed, model_config)
208 |
209 | def _init_policy(self):
210 | return ConsensusPolicy(self.n_s, self.n_a, self.n_agent, self.n_step,
211 | self.neighbor_mask, n_fc=self.n_fc, n_h=self.n_lstm)
212 |
213 |
214 | class MA2C_IC3(MA2C_NC):
215 | def __init__(self, n_s, n_a, neighbor_mask, distance_mask, coop_gamma,
216 | total_step, model_config, seed=0):
217 | self.name = 'ma2c_ic3'
218 | self._init_algo(n_s, n_a, neighbor_mask, distance_mask, coop_gamma,
219 | total_step, seed, model_config)
220 |
221 | def _init_policy(self):
222 | return IC3MultiAgentPolicy(self.n_s, self.n_a, self.n_agent, self.n_step,
223 | self.neighbor_mask, n_fc=self.n_fc, n_h=self.n_lstm)
224 |
225 |
226 | class MA2C_DIAL(MA2C_NC):
227 | def __init__(self, n_s, n_a, neighbor_mask, distance_mask, coop_gamma,
228 | total_step, model_config, seed=0):
229 | self.name = 'ma2c_dial'
230 | self._init_algo(n_s, n_a, neighbor_mask, distance_mask, coop_gamma,
231 | total_step, seed, model_config)
232 |
233 | def _init_policy(self):
234 | return DIALMultiAgentPolicy(self.n_s, self.n_a, self.n_agent, self.n_step,
235 | self.neighbor_mask, n_fc=self.n_fc, n_h=self.n_lstm)
236 |
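237 | # ---------------------------------------------------------------------------
238 | # Illustrative note (not part of the original code): all agent classes above
239 | # share the same on-policy training protocol, roughly (the exact forward()
240 | # arguments differ between the IA2C and MA2C variants):
241 | #
242 | #   for each of n_step (= batch_size) environment steps:
243 | #       pi = agent.forward(ob, done)                # out_type='p' -> action probs
244 | #       v = agent.forward(ob, done, ..., out_type='v')
245 | #       step the environment with actions sampled from pi
246 | #       agent.add_transition(ob, ..., action, reward, v, done)
247 | #   R = agent.forward(ob, done, ..., out_type='v')  # bootstrap values
248 | #   agent.backward(R, dt)                           # one gradient update
249 | # ---------------------------------------------------------------------------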
--------------------------------------------------------------------------------
/agents/policies.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from agents.utils import *
4 |
5 |
6 | class Policy:
7 | def __init__(self, n_a, n_s, n_step, policy_name, agent_name):
8 | self.name = policy_name
9 | if agent_name is not None:
10 | # for multi-agent system
11 | self.name += '_' + str(agent_name)
12 | self.n_a = n_a
13 | self.n_s = n_s
14 | self.n_step = n_step
15 |
16 | def forward(self, ob, *_args, **_kwargs):
17 | raise NotImplementedError()
18 |
19 | def prepare_loss(self, v_coef, e_coef, max_grad_norm, alpha, epsilon):
20 | self.A = tf.placeholder(tf.int32, [self.n_step])
21 | self.ADV = tf.placeholder(tf.float32, [self.n_step])
22 | self.R = tf.placeholder(tf.float32, [self.n_step])
23 | A_sparse = tf.one_hot(self.A, self.n_a)
24 | log_pi = tf.log(tf.clip_by_value(self.pi, 1e-10, 1.0))
25 | entropy = -tf.reduce_sum(self.pi * log_pi, axis=1)
26 | entropy_loss = -tf.reduce_mean(entropy) * e_coef
27 | policy_loss = -tf.reduce_mean(tf.reduce_sum(log_pi * A_sparse, axis=1) * self.ADV)
28 | value_loss = tf.reduce_mean(tf.square(self.R - self.v)) * 0.5 * v_coef
29 | self.loss = policy_loss + value_loss + entropy_loss
30 |
31 | wts = tf.trainable_variables(scope=self.name)
32 | grads = tf.gradients(self.loss, wts)
33 | if max_grad_norm > 0:
34 | grads, self.grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
35 | self.lr = tf.placeholder(tf.float32, [])
36 | self.optimizer = tf.train.RMSPropOptimizer(learning_rate=self.lr, decay=alpha,
37 | epsilon=epsilon)
38 | self._train = self.optimizer.apply_gradients(list(zip(grads, wts)))
39 | # monitor training
40 | summaries = []
41 | summaries.append(tf.summary.scalar('loss/%s_entropy_loss' % self.name, entropy_loss))
42 | summaries.append(tf.summary.scalar('loss/%s_policy_loss' % self.name, policy_loss))
43 | summaries.append(tf.summary.scalar('loss/%s_value_loss' % self.name, value_loss))
44 | summaries.append(tf.summary.scalar('loss/%s_total_loss' % self.name, self.loss))
45 | summaries.append(tf.summary.scalar('train/%s_lr' % self.name, self.lr))
46 | summaries.append(tf.summary.scalar('train/%s_gradnorm' % self.name, self.grad_norm))
47 | self.summary = tf.summary.merge(summaries)
48 |
49 | def _build_actor_head(self, h, agent_name=None):
50 | name = 'pi'
51 | if agent_name is not None:
52 | name += '_' + str(agent_name)
53 | pi = fc(h, name, self.n_a, act=tf.nn.softmax)
54 | return pi
55 |
56 | def _build_critic_head(self, h, na, n_n=None, agent_name=None):
57 | name = 'v'
58 | if agent_name is not None:
59 | name += '_' + str(agent_name)
60 | if n_n is None:
61 | n_n = na.shape[-1]
62 | na_sparse = tf.one_hot(na, self.n_a, axis=-1)
63 | na_sparse = tf.reshape(na_sparse, [-1, self.n_a*n_n])
64 | h = tf.concat([h, na_sparse], 1)
65 | v = fc(h, name, 1, act=lambda x: x)
66 | return v
67 |
68 |
69 | class LstmPolicy(Policy):
70 | def __init__(self, n_s, n_a, n_n, n_step, n_fc=64, n_lstm=64, name=None):
71 | super().__init__(n_a, n_s, n_step, 'lstm', name)
72 | self.n_lstm = n_lstm
73 | self.n_fc = n_fc
74 | self.n_n = n_n
75 | self.ob_fw = tf.placeholder(tf.float32, [1, n_s]) # forward 1-step
76 | self.naction_fw = tf.placeholder(tf.int32, [1, n_n])
77 | self.done_fw = tf.placeholder(tf.float32, [1])
78 | self.ob_bw = tf.placeholder(tf.float32, [n_step, n_s]) # backward n-step
79 | self.naction_bw = tf.placeholder(tf.int32, [n_step, n_n])
80 | self.done_bw = tf.placeholder(tf.float32, [n_step])
81 | self.states = tf.placeholder(tf.float32, [n_lstm * 2])
82 | with tf.variable_scope(self.name):
83 | self.pi_fw, self.v_fw, self.new_states = self._build_net('forward')
84 | with tf.variable_scope(self.name, reuse=True):
85 | self.pi, self.v, _ = self._build_net('backward')
86 | self._reset()
87 |
88 | def backward(self, sess, obs, nactions, acts, dones, Rs, Advs, cur_lr,
89 | summary_writer=None, global_step=None):
90 | summary, _ = sess.run([self.summary, self._train],
91 | {self.ob_bw: obs,
92 | self.naction_bw: nactions,
93 | self.done_bw: dones,
94 | self.states: self.states_bw,
95 | self.A: acts,
96 | self.ADV: Advs,
97 | self.R: Rs,
98 | self.lr: cur_lr})
99 | self.states_bw = np.copy(self.states_fw)
100 | if summary_writer is not None:
101 | summary_writer.add_summary(summary, global_step=global_step)
102 |
103 | def forward(self, sess, ob, done, naction=None, out_type='p'):
104 | # update state only when p is called
105 | ins = {self.ob_fw: np.array([ob]),
106 | self.done_fw: np.array([done]),
107 | self.states: self.states_fw}
108 | if out_type.startswith('p'):
109 | outs = [self.pi_fw, self.new_states]
110 | else:
111 | outs = [self.v_fw]
112 | ins[self.naction_fw] = np.array([naction])
113 | out_values = sess.run(outs, ins)
114 | out_value = out_values[0]
115 | if out_type.startswith('p'):
116 | self.states_fw = out_values[-1]
117 | return out_value
118 |
119 | def _build_net(self, in_type):
120 | if in_type == 'forward':
121 | ob = self.ob_fw
122 | done = self.done_fw
123 | naction = self.naction_fw
124 | else:
125 | ob = self.ob_bw
126 | done = self.done_bw
127 | naction = self.naction_bw
128 | h = fc(ob, 'fc', self.n_fc)
129 | h, new_states = lstm(h, done, self.states, 'lstm')
130 | pi = self._build_actor_head(h)
131 | v = self._build_critic_head(h, naction)
132 | return tf.squeeze(pi), tf.squeeze(v), new_states
133 |
134 | def _reset(self):
135 |         # forget (reset) the cumulative LSTM states
136 | self.states_fw = np.zeros(self.n_lstm * 2, dtype=np.float32)
137 | self.states_bw = np.zeros(self.n_lstm * 2, dtype=np.float32)
138 |
139 |
140 | class FPPolicy(LstmPolicy):
141 | def __init__(self, n_s, n_a, n_n, n_step, n_fc=64, n_lstm=64, name=None):
142 | super().__init__(n_s, n_a, n_n, n_step, n_fc, n_lstm, name)
143 |
144 | def _build_net(self, in_type):
145 | if in_type == 'forward':
146 | ob = self.ob_fw
147 | done = self.done_fw
148 | naction = self.naction_fw
149 | else:
150 | ob = self.ob_bw
151 | done = self.done_bw
152 | naction = self.naction_bw
153 | n_x = int(self.n_s - self.n_n * self.n_a)
154 | hx = fc(ob[:,:n_x], 'fcs', self.n_fc)
155 | hp = fc(ob[:,n_x:], 'fcp', self.n_fc)
156 | h = tf.concat([hx, hp], axis=1)
157 | h, new_states = lstm(h, done, self.states, 'lstm')
158 | pi = self._build_actor_head(h)
159 | v = self._build_critic_head(h, naction)
160 | return tf.squeeze(pi), tf.squeeze(v), new_states
161 |
162 |
163 | class NCMultiAgentPolicy(Policy):
164 |     """ Implemented as a centralized agent. To simplify the implementation, all input
165 |     and output dimensions are identical among all agents, and invalid values are cast to
166 | zeros during runtime."""
167 | def __init__(self, n_s, n_a, n_agent, n_step, neighbor_mask, n_fc=64, n_h=64):
168 | super().__init__(n_a, n_s, n_step, 'nc', None)
169 | self._init_policy(n_agent, neighbor_mask, n_h)
170 |
171 | def backward(self, sess, obs, policies, acts, dones, Rs, Advs, cur_lr,
172 | summary_writer=None, global_step=None):
173 | summary, _ = sess.run([self.summary, self._train],
174 | {self.ob_bw: obs,
175 | self.policy_bw: policies,
176 | self.action_bw: acts,
177 | self.done_bw: dones,
178 | self.states: self.states_bw,
179 | self.ADV: Advs,
180 | self.R: Rs,
181 | self.lr: cur_lr})
182 | self.states_bw = np.copy(self.states_fw)
183 | if summary_writer is not None:
184 | summary_writer.add_summary(summary, global_step=global_step)
185 |
186 | def forward(self, sess, ob, done, policy, action=None, out_type='p'):
187 | # update state only when p is called
188 | ins = {self.ob_fw: np.expand_dims(ob, axis=1),
189 | self.done_fw: np.expand_dims(done, axis=1),
190 | self.policy_fw: np.expand_dims(policy, axis=1),
191 | self.states: self.states_fw}
192 | if out_type.startswith('p'):
193 | outs = [self.pi_fw, self.new_states]
194 | else:
195 | outs = [self.v_fw]
196 | ins[self.action_fw] = np.expand_dims(action, axis=1)
197 | out_values = sess.run(outs, ins)
198 | out_value = out_values[0]
199 | if out_type.startswith('p'):
200 | self.states_fw = out_values[-1]
201 | return out_value
202 |
203 | def prepare_loss(self, v_coef, e_coef, max_grad_norm, alpha, epsilon):
204 | self.ADV = tf.placeholder(tf.float32, [self.n_agent, self.n_step])
205 | self.R = tf.placeholder(tf.float32, [self.n_agent, self.n_step])
206 | A_sparse = tf.one_hot(self.action_bw, self.n_a)
207 | # all losses are averaged over steps but summed over agents
208 | log_pi = tf.log(tf.clip_by_value(self.pi, 1e-10, 1.0))
209 | entropy = -tf.reduce_sum(self.pi * log_pi, axis=-1)
210 | entropy_loss = -tf.reduce_sum(tf.reduce_mean(entropy, axis=-1)) * e_coef
211 | policy_loss = -tf.reduce_sum(tf.reduce_mean(tf.reduce_sum(log_pi * A_sparse, axis=-1) * self.ADV, axis=-1))
212 | value_loss = tf.reduce_sum(tf.reduce_mean(tf.square(self.R - self.v), axis=-1)) * 0.5 * v_coef
213 | self.loss = policy_loss + value_loss + entropy_loss
214 |
215 | wts = tf.trainable_variables(scope=self.name)
216 | grads = tf.gradients(self.loss, wts)
217 | if max_grad_norm > 0:
218 | grads, self.grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
219 | self.lr = tf.placeholder(tf.float32, [])
220 | self.optimizer = tf.train.RMSPropOptimizer(learning_rate=self.lr, decay=alpha,
221 | epsilon=epsilon)
222 | self._train = self.optimizer.apply_gradients(list(zip(grads, wts)))
223 | # monitor training
224 | summaries = []
225 | summaries.append(tf.summary.scalar('loss/%s_entropy_loss' % self.name, entropy_loss))
226 | summaries.append(tf.summary.scalar('loss/%s_policy_loss' % self.name, policy_loss))
227 | summaries.append(tf.summary.scalar('loss/%s_value_loss' % self.name, value_loss))
228 | summaries.append(tf.summary.scalar('loss/%s_total_loss' % self.name, self.loss))
229 | summaries.append(tf.summary.scalar('train/%s_lr' % self.name, self.lr))
230 | summaries.append(tf.summary.scalar('train/%s_gradnorm' % self.name, self.grad_norm))
231 | self.summary = tf.summary.merge(summaries)
232 |
233 | def _build_net(self, in_type):
234 | if in_type == 'forward':
235 | ob = self.ob_fw
236 | policy = self.policy_fw
237 | action = self.action_fw
238 | done = self.done_fw
239 | else:
240 | ob = self.ob_bw
241 | policy = self.policy_bw
242 | action = self.action_bw
243 | done = self.done_bw
244 | h, new_states = lstm_comm_new(ob, policy, done, self.neighbor_mask, self.states, 'lstm_comm')
245 | pi_ls = []
246 | v_ls = []
247 | for i in range(self.n_agent):
248 | h_i = h[i] # Txn_h
249 | naction_i = tf.transpose(tf.boolean_mask(action, self.neighbor_mask[i])) # Txn_n
250 | pi = self._build_actor_head(h_i, agent_name='%d' % i)
251 | v = self._build_critic_head(h_i, naction_i, n_n=int(np.sum(self.neighbor_mask[i])),
252 | agent_name='%d' % i)
253 | pi_ls.append(tf.expand_dims(pi, axis=0))
254 | v_ls.append(tf.expand_dims(v, axis=0))
255 | return tf.squeeze(tf.concat(pi_ls, axis=0)), tf.squeeze(tf.concat(v_ls, axis=0)), new_states
256 |
257 | def _init_policy(self, n_agent, neighbor_mask, n_h):
258 | self.n_agent = n_agent
259 | self.neighbor_mask = neighbor_mask #n_agent x n_agent
260 | self.n_h = n_h
261 | self.ob_fw = tf.placeholder(tf.float32, [n_agent, 1, self.n_s]) # forward 1-step
262 | self.policy_fw = tf.placeholder(tf.float32, [n_agent, 1, self.n_a])
263 | self.action_fw = tf.placeholder(tf.int32, [n_agent, 1])
264 | self.done_fw = tf.placeholder(tf.float32, [1])
265 | self.ob_bw = tf.placeholder(tf.float32, [n_agent, self.n_step, self.n_s]) # backward n-step
266 | self.policy_bw = tf.placeholder(tf.float32, [n_agent, self.n_step, self.n_a])
267 | self.action_bw = tf.placeholder(tf.int32, [n_agent, self.n_step])
268 | self.done_bw = tf.placeholder(tf.float32, [self.n_step])
269 | self.states = tf.placeholder(tf.float32, [n_agent, n_h * 2])
270 |
271 | with tf.variable_scope(self.name):
272 | self.pi_fw, self.v_fw, self.new_states = self._build_net('forward')
273 | with tf.variable_scope(self.name, reuse=True):
274 | self.pi, self.v, _ = self._build_net('backward')
275 | self._reset()
276 |
277 | def _reset(self):
278 | self.states_fw = np.zeros((self.n_agent, self.n_h * 2), dtype=np.float32)
279 | self.states_bw = np.zeros((self.n_agent, self.n_h * 2), dtype=np.float32)
280 |
281 |
282 | class ConsensusPolicy(NCMultiAgentPolicy):
283 | def __init__(self, n_s, n_a, n_agent, n_step, neighbor_mask, n_fc=64, n_h=64):
284 | Policy.__init__(self, n_a, n_s, n_step, 'cu', None)
285 | self.n_agent = n_agent
286 | self.n_h = n_h
287 | self.neighbor_mask = neighbor_mask
288 | self._init_policy(n_agent, neighbor_mask, n_h)
289 |
290 | def backward(self, sess, obs, policies, acts, dones, Rs, Advs, cur_lr,
291 | summary_writer=None, global_step=None):
292 | super().backward(sess, obs, policies, acts, dones, Rs, Advs, cur_lr,
293 | summary_writer, global_step)
294 | sess.run(self._consensus_update)
295 |
296 | def prepare_loss(self, v_coef, e_coef, max_grad_norm, alpha, epsilon):
297 | super().prepare_loss(v_coef, e_coef, max_grad_norm, alpha, epsilon)
298 | consensus_update = []
299 | for i in range(self.n_agent):
300 | wt_from, wt_to = self._get_critic_wts(i)
301 | for w1, w2 in zip(wt_from, wt_to):
302 | consensus_update.append(w2.assign(w1))
303 | self._consensus_update = tf.group(*consensus_update)
304 |
305 | def _build_net(self, in_type):
306 | if in_type == 'forward':
307 | ob = self.ob_fw
308 | done = self.done_fw
309 | action = self.action_fw
310 | else:
311 | ob = self.ob_bw
312 | done = self.done_bw
313 | action = self.action_bw
314 | pi_ls = []
315 | v_ls = []
316 | new_states_ls = []
317 | for i in range(self.n_agent):
318 | h = fc(ob[i], 'fc_%d' % i, self.n_h)
319 | h, new_states = lstm(h, done, self.states[i], 'lstm_%d' % i)
320 | pi = self._build_actor_head(h, agent_name='%d' % i)
321 | naction = tf.transpose(tf.boolean_mask(action, self.neighbor_mask[i]))
322 | v = self._build_critic_head(h, naction, n_n=int(np.sum(self.neighbor_mask[i])), agent_name='%d' % i)
323 | pi_ls.append(tf.expand_dims(pi, axis=0))
324 | v_ls.append(tf.expand_dims(v, axis=0))
325 | new_states_ls.append(tf.expand_dims(new_states, axis=0))
326 | pi_ls = tf.squeeze(tf.concat(pi_ls, axis=0))
327 | v_ls = tf.squeeze(tf.concat(v_ls, axis=0))
328 | new_states_ls = tf.squeeze(tf.concat(new_states_ls, axis=0))
329 | return pi_ls, v_ls, new_states_ls
330 |
331 | def _get_critic_wts(self, agent_i):
332 | neighbor_mask = self.neighbor_mask[agent_i]
333 | agents = [agent_i] + list(np.where(neighbor_mask == 1)[0])
334 | wt_i = []
335 | wt_n = []
336 | for i in agents:
337 | critic_scope = [self.name + ('/lstm_%d' % i)]
338 | wt = []
339 | for scope in critic_scope:
340 | wt += tf.trainable_variables(scope=scope)
341 | if i == agent_i:
342 | wt_i = wt
343 | wt_n.append(wt)
344 | mean_wt_n = []
345 | n_n = len(wt_n)
346 | n_w = len(wt_n[0])
347 | for i in range(n_w):
348 | cur_wts = []
349 | for j in range(n_n):
350 | cur_wts.append(tf.expand_dims(wt_n[j][i], axis=-1))
351 | cur_wts = tf.concat(cur_wts, axis=-1)
352 | cur_wts = tf.reduce_mean(cur_wts, axis=-1)
353 | mean_wt_n.append(cur_wts)
354 | return mean_wt_n, wt_i
355 |
356 |
357 | class IC3MultiAgentPolicy(NCMultiAgentPolicy):
358 | """Reference code: https://github.com/IC3Net/IC3Net/blob/master/comm.py.
359 | Note in IC3, the message is generated from hidden state only, so current state
360 |     and neighbor policies are not included in the inputs."""
361 | def __init__(self, n_s, n_a, n_agent, n_step, neighbor_mask, n_fc=64, n_h=64):
362 | Policy.__init__(self, n_a, n_s, n_step, 'ic3', None)
363 | self._init_policy(n_agent, neighbor_mask, n_h)
364 |
365 | def _build_net(self, in_type):
366 | if in_type == 'forward':
367 | ob = self.ob_fw
368 | action = self.action_fw
369 | done = self.done_fw
370 | else:
371 | ob = self.ob_bw
372 | action = self.action_bw
373 | done = self.done_bw
374 | h, new_states = lstm_ic3(ob, done, self.neighbor_mask, self.states, 'lstm_ic3')
375 | pi_ls = []
376 | v_ls = []
377 | for i in range(self.n_agent):
378 | h_i = h[i] # Txn_h
379 | naction_i = tf.transpose(tf.boolean_mask(action, self.neighbor_mask[i])) # Txn_n
380 | pi = self._build_actor_head(h_i, agent_name='%d' % i)
381 | v = self._build_critic_head(h_i, naction_i, n_n=int(np.sum(self.neighbor_mask[i])),
382 | agent_name='%d' % i)
383 | pi_ls.append(tf.expand_dims(pi, axis=0))
384 | v_ls.append(tf.expand_dims(v, axis=0))
385 | return tf.squeeze(tf.concat(pi_ls, axis=0)), tf.squeeze(tf.concat(v_ls, axis=0)), new_states
386 |
387 |
388 | class DIALMultiAgentPolicy(NCMultiAgentPolicy):
389 | def __init__(self, n_s, n_a, n_agent, n_step, neighbor_mask, n_fc=64, n_h=64):
390 | Policy.__init__(self, n_a, n_s, n_step, 'dial', None)
391 | self._init_policy(n_agent, neighbor_mask, n_h)
392 |
393 | def _build_net(self, in_type):
394 | if in_type == 'forward':
395 | ob = self.ob_fw
396 | policy = self.policy_fw
397 | action = self.action_fw
398 | done = self.done_fw
399 | else:
400 | ob = self.ob_bw
401 | policy = self.policy_bw
402 | action = self.action_bw
403 | done = self.done_bw
404 | h, new_states = lstm_dial(ob, policy, done, self.neighbor_mask, self.states, 'lstm_comm')
405 | pi_ls = []
406 | v_ls = []
407 | for i in range(self.n_agent):
408 | h_i = h[i] # Txn_h
409 | naction_i = tf.transpose(tf.boolean_mask(action, self.neighbor_mask[i])) # Txn_n
410 | pi = self._build_actor_head(h_i, agent_name='%d' % i)
411 | v = self._build_critic_head(h_i, naction_i, n_n=int(np.sum(self.neighbor_mask[i])),
412 | agent_name='%d' % i)
413 | pi_ls.append(tf.expand_dims(pi, axis=0))
414 | v_ls.append(tf.expand_dims(v, axis=0))
415 | return tf.squeeze(tf.concat(pi_ls, axis=0)), tf.squeeze(tf.concat(v_ls, axis=0)), new_states
416 |
417 |
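418 | # ---------------------------------------------------------------------------
419 | # Illustrative note (not part of the original code): each policy above builds
420 | # its network twice inside one variable scope -- once with 1-step "forward"
421 | # placeholders (pi_fw, v_fw) used when acting in the environment, and once with
422 | # n_step "backward" placeholders (pi, v) used for the training pass -- with
423 | # reuse=True so both graphs share weights, while states_fw/states_bw carry the
424 | # LSTM states across successive calls.
425 | # ---------------------------------------------------------------------------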
--------------------------------------------------------------------------------
/agents/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 |
4 | """
5 | initializers
6 | """
7 | DEFAULT_SCALE = np.sqrt(2)
8 | DEFAULT_MODE = 'fan_in'
9 |
10 | def ortho_init(scale=DEFAULT_SCALE, mode=None):
11 | def _ortho_init(shape, dtype, partition_info=None):
12 | # lasagne ortho init for tf
13 | shape = tuple(shape)
14 | if len(shape) == 2: # fc: in, out
15 | flat_shape = shape
16 | elif (len(shape) == 3) or (len(shape) == 4): # 1d/2dcnn: (in_h), in_w, in_c, out
17 | flat_shape = (np.prod(shape[:-1]), shape[-1])
18 | a = np.random.standard_normal(flat_shape)
19 | u, _, v = np.linalg.svd(a, full_matrices=False)
20 | q = u if u.shape == flat_shape else v # pick the one with the correct shape
21 | q = q.reshape(shape)
22 | return (scale * q).astype(np.float32)
23 | return _ortho_init
24 |
25 |
26 | def norm_init(scale=DEFAULT_SCALE, mode=DEFAULT_MODE):
27 | def _norm_init(shape, dtype, partition_info=None):
28 | shape = tuple(shape)
29 | if len(shape) == 2:
30 | n_in = shape[0]
31 | elif (len(shape) == 3) or (len(shape) == 4):
32 | n_in = np.prod(shape[:-1])
33 | a = np.random.standard_normal(shape)
34 | if mode == 'fan_in':
35 | n = n_in
36 | elif mode == 'fan_out':
37 | n = shape[-1]
38 | elif mode == 'fan_avg':
39 | n = 0.5 * (n_in + shape[-1])
40 | return (scale * a / np.sqrt(n)).astype(np.float32)
41 |
42 | DEFAULT_METHOD = ortho_init
43 | """
44 | layers
45 | """
46 | def conv(x, scope, n_out, f_size, stride=1, pad='VALID', f_size_w=None, act=tf.nn.relu,
47 | conv_dim=1, init_scale=DEFAULT_SCALE, init_mode=None, init_method=DEFAULT_METHOD):
48 | with tf.variable_scope(scope):
49 | b = tf.get_variable("b", [n_out], initializer=tf.constant_initializer(0.0))
50 | if conv_dim == 1:
51 | n_c = x.shape[2].value
52 | w = tf.get_variable("w", [f_size, n_c, n_out],
53 | initializer=init_method(init_scale, init_mode))
54 | z = tf.nn.conv1d(x, w, stride=stride, padding=pad) + b
55 | elif conv_dim == 2:
56 | n_c = x.shape[3].value
57 | if f_size_w is None:
58 | f_size_w = f_size
59 | w = tf.get_variable("w", [f_size, f_size_w, n_c, n_out],
60 | initializer=init_method(init_scale, init_mode))
61 | z = tf.nn.conv2d(x, w, strides=[1, stride, stride, 1], padding=pad) + b
62 | return act(z)
63 |
64 |
65 | def fc(x, scope, n_out, act=tf.nn.relu, init_scale=DEFAULT_SCALE,
66 | init_mode=DEFAULT_MODE, init_method=DEFAULT_METHOD):
67 | with tf.variable_scope(scope):
68 | n_in = x.shape[1].value
69 | w = tf.get_variable("w", [n_in, n_out],
70 | initializer=init_method(init_scale, init_mode))
71 | b = tf.get_variable("b", [n_out], initializer=tf.constant_initializer(0.0))
72 | z = tf.matmul(x, w) + b
73 | return act(z)
74 |
75 |
76 | def batch_to_seq(x):
77 | n_step = x.shape[0].value
78 | if len(x.shape) == 1:
79 | x = tf.expand_dims(x, -1)
80 | return tf.split(axis=0, num_or_size_splits=n_step, value=x)
81 |
82 |
83 | def seq_to_batch(x):
84 | return tf.concat(x, axis=0)
85 |
86 |
87 | def lstm(xs, dones, s, scope, init_scale=DEFAULT_SCALE, init_mode=DEFAULT_MODE,
88 | init_method=DEFAULT_METHOD):
89 | xs = batch_to_seq(xs)
90 | # need dones to reset states
91 | dones = batch_to_seq(dones)
92 | n_in = xs[0].shape[1].value
93 | n_out = s.shape[0] // 2
94 | with tf.variable_scope(scope):
95 | wx = tf.get_variable("wx", [n_in, n_out*4],
96 | initializer=init_method(init_scale, init_mode))
97 | wh = tf.get_variable("wh", [n_out, n_out*4],
98 | initializer=init_method(init_scale, init_mode))
99 | b = tf.get_variable("b", [n_out*4], initializer=tf.constant_initializer(0.0))
100 | s = tf.expand_dims(s, 0)
101 | c, h = tf.split(axis=1, num_or_size_splits=2, value=s)
102 | for ind, (x, done) in enumerate(zip(xs, dones)):
103 | c = c * (1-done)
104 | h = h * (1-done)
105 | z = tf.matmul(x, wx) + tf.matmul(h, wh) + b
106 | i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z)
107 | i = tf.nn.sigmoid(i)
108 | f = tf.nn.sigmoid(f)
109 | o = tf.nn.sigmoid(o)
110 | u = tf.tanh(u)
111 | c = f*c + i*u
112 | h = o*tf.tanh(c)
113 | xs[ind] = h
114 | s = tf.concat(axis=1, values=[c, h])
115 | return seq_to_batch(xs), tf.squeeze(s)
116 |
117 |
118 | def lstm_comm(xs, ps, dones, masks, s, scope, init_scale=DEFAULT_SCALE, init_mode=DEFAULT_MODE,
119 | init_method=DEFAULT_METHOD):
120 | n_agent = s.shape[0]
121 | n_h = s.shape[1] // 2
122 | n_s = xs.shape[-1]
123 | n_a = ps.shape[-1]
124 | xs = tf.transpose(xs, perm=[1,0,2]) # TxNxn_s
125 | xs = batch_to_seq(xs)
126 | ps = tf.transpose(ps, perm=[1,0,2]) # TxNxn_a
127 | ps = batch_to_seq(ps)
128 | # need dones to reset states
129 | dones = batch_to_seq(dones) # Tx1
130 | # create wts
131 | n_in_msg = n_h + n_s + n_a
132 | w_msg = []
133 | b_msg = []
134 | wx_hid = []
135 | wh_hid = []
136 | b_hid = []
137 | for i in range(n_agent):
138 | n_m = np.sum(masks[i])
139 | n_in_hid = n_s + n_h*n_m
140 | with tf.variable_scope(scope + ('_%d' % i)):
141 | w_msg.append(tf.get_variable("w_msg", [n_in_msg, n_h],
142 | initializer=init_method(init_scale, init_mode)))
143 | b_msg.append(tf.get_variable("b_msg", [n_h],
144 | initializer=tf.constant_initializer(0.0)))
145 | wx_hid.append(tf.get_variable("wx_hid", [n_in_hid, n_h*4],
146 | initializer=init_method(init_scale, init_mode)))
147 | wh_hid.append(tf.get_variable("wh_hid", [n_h, n_h*4],
148 | initializer=init_method(init_scale, init_mode)))
149 | b_hid.append(tf.get_variable("b_hid", [n_h*4],
150 | initializer=tf.constant_initializer(0.0)))
151 | c, h = tf.split(axis=1, num_or_size_splits=2, value=s)
152 | # loop over steps
153 | for t, (x, p, done) in enumerate(zip(xs, ps, dones)):
154 | # abuse 1 agent as 1 step
155 | x = batch_to_seq(tf.squeeze(x, axis=0))
156 | p = batch_to_seq(tf.squeeze(p, axis=0))
157 | out_h = []
158 | out_c = []
159 | out_m = []
160 | # communication phase
161 | for i, (xi, pi) in enumerate(zip(x, p)):
162 | hi = tf.expand_dims(h[i], axis=0)
163 | si = tf.concat([hi, xi, pi], axis=1)
164 | mi = tf.nn.relu(tf.matmul(si, w_msg[i]) + b_msg[i])
165 | out_m.append(mi)
166 | out_m = tf.concat(out_m, axis=0) # Nxn_h
167 | # hidden phase
168 | for i, xi in enumerate(x):
169 | ci = tf.expand_dims(c[i], axis=0)
170 | hi = tf.expand_dims(h[i], axis=0)
171 | # reset states for a new episode
172 | ci = ci * (1-done)
173 | hi = hi * (1-done)
174 | # receive neighbor messages
175 | mi = tf.expand_dims(tf.reshape(tf.boolean_mask(out_m, masks[i]), [-1]), axis=0)
176 | # TODO: add additional encoding layers here
177 | si = tf.concat([xi, mi], axis=1)
178 | zi = tf.matmul(si, wx_hid[i]) + tf.matmul(hi, wh_hid[i]) + b_hid[i]
179 | ii, fi, oi, ui = tf.split(axis=1, num_or_size_splits=4, value=zi)
180 | ii = tf.nn.sigmoid(ii)
181 | fi = tf.nn.sigmoid(fi)
182 | oi = tf.nn.sigmoid(oi)
183 | ui = tf.tanh(ui)
184 | ci = fi*ci + ii*ui
185 | hi = oi*tf.tanh(ci)
186 | out_h.append(hi)
187 | out_c.append(ci)
188 | c = tf.concat(out_c, axis=0)
189 | h = tf.concat(out_h, axis=0)
190 | xs[t] = tf.expand_dims(h, axis=0)
191 | s = tf.concat(axis=1, values=[c, h])
192 | xs = seq_to_batch(xs) # TxNxn_h
193 | xs = tf.transpose(xs, perm=[1,0,2]) # NxTxn_h
194 | return xs, s
195 |
196 |
197 | def lstm_comm_new(xs, ps, dones, masks, s, scope, init_scale=DEFAULT_SCALE, init_mode=DEFAULT_MODE,
198 | init_method=DEFAULT_METHOD):
199 | n_agent = s.shape[0]
200 | n_h = s.shape[1] // 2
201 | n_s = xs.shape[-1]
202 | n_a = ps.shape[-1]
203 | xs = tf.transpose(xs, perm=[1,0,2]) # TxNxn_s
204 | xs = batch_to_seq(xs)
205 | ps = tf.transpose(ps, perm=[1,0,2]) # TxNxn_a
206 | ps = batch_to_seq(ps)
207 | # need dones to reset states
208 | dones = batch_to_seq(dones) # Tx1
209 | # create wts
210 | w_msg = []
211 | b_msg = []
212 | w_ob = []
213 | b_ob = []
214 | # w_fp = []
215 | # b_fp = []
216 | wx_hid = []
217 | wh_hid = []
218 | b_hid = []
219 | n_in_hid = 3*n_h
220 | for i in range(n_agent):
221 | n_m = np.sum(masks[i])
222 | # n_in_hid = (n_m+1)*n_h
223 | with tf.variable_scope(scope + ('_%d' % i)):
224 | w_msg.append(tf.get_variable("w_msg", [n_h*n_m, n_h],
225 | initializer=init_method(init_scale, init_mode)))
226 | b_msg.append(tf.get_variable("b_msg", [n_h],
227 | initializer=tf.constant_initializer(0.0)))
228 | w_ob.append(tf.get_variable("w_ob", [n_s*(n_m+1), n_h],
229 | initializer=init_method(init_scale, init_mode)))
230 | b_ob.append(tf.get_variable("b_ob", [n_h],
231 | initializer=tf.constant_initializer(0.0)))
232 | # w_fp.append(tf.get_variable("w_fp", [n_a*n_m, n_h],
233 | # initializer=init_method(init_scale, init_mode)))
234 | # b_fp.append(tf.get_variable("b_fp", [n_h],
235 | # initializer=tf.constant_initializer(0.0)))
236 | wx_hid.append(tf.get_variable("wx_hid", [n_in_hid, n_h*4],
237 | initializer=init_method(init_scale, init_mode)))
238 | wh_hid.append(tf.get_variable("wh_hid", [n_h, n_h*4],
239 | initializer=init_method(init_scale, init_mode)))
240 | b_hid.append(tf.get_variable("b_hid", [n_h*4],
241 | initializer=tf.constant_initializer(0.0)))
242 | c, h = tf.split(axis=1, num_or_size_splits=2, value=s)
243 | # loop over steps
244 | for t, (x, p, done) in enumerate(zip(xs, ps, dones)):
245 | # abuse 1 agent as 1 step
246 | x = tf.squeeze(x, axis=0)
247 | p = tf.squeeze(p, axis=0)
248 | # x = batch_to_seq(tf.squeeze(x, axis=0))
249 | # p = batch_to_seq(tf.squeeze(p, axis=0))
250 | out_h = []
251 | out_c = []
252 | out_m = []
253 | # communication phase
254 | for i in range(n_agent):
255 | hi = tf.expand_dims(h[i], axis=0)
256 | # hxi = fc(xi, 'mfc_s_%d' % i, n_h, act=tf.nn.tanh)
257 | # hpi = fc(pi, 'mfc_p_%d' % i, n_h, act=tf.nn.tanh)
258 | # si = tf.concat([hi, hxi, hpi], axis=1)
259 | mi = fc(hi, 'mfc_%d' % i, n_h)
260 | out_m.append(mi)
261 | # out_m = [tf.expand_dims(h[i], axis=0) for i in range(n_agent)]
262 | out_m = tf.concat(out_m, axis=0) # Nxn_h
263 | # hidden phase
264 | for i in range(n_agent):
265 | ci = tf.expand_dims(c[i], axis=0)
266 | hi = tf.expand_dims(h[i], axis=0)
267 | # reset states for a new episode
268 | ci = ci * (1-done)
269 | hi = hi * (1-done)
270 | # receive neighbor messages
271 | mi = tf.expand_dims(tf.reshape(tf.boolean_mask(out_m, masks[i]), [-1]), axis=0)
272 | # pi = tf.expand_dims(tf.reshape(tf.boolean_mask(p, masks[i]), [-1]), axis=0)
273 | xi = tf.expand_dims(tf.reshape(tf.boolean_mask(x, masks[i]), [-1]), axis=0)
274 | xi = tf.concat([tf.expand_dims(x[i], axis=0), xi], axis=1)
275 | hxi = tf.nn.relu(tf.matmul(xi, w_ob[i]) + b_ob[i])
276 | # hpi = tf.nn.relu(tf.matmul(pi, w_fp[i]) + b_fp[i])
277 | hmi = tf.matmul(mi, w_msg[i]) + b_msg[i]
278 | # si = tf.concat([hxi, hpi, hmi], axis=1)
279 | si = tf.concat([hxi, hmi], axis=1)
280 | zi = tf.matmul(si, wx_hid[i]) + tf.matmul(hi, wh_hid[i]) + b_hid[i]
281 | ii, fi, oi, ui = tf.split(axis=1, num_or_size_splits=4, value=zi)
282 | ii = tf.nn.sigmoid(ii)
283 | fi = tf.nn.sigmoid(fi)
284 | oi = tf.nn.sigmoid(oi)
285 | ui = tf.tanh(ui)
286 | ci = fi*ci + ii*ui
287 | hi = oi*tf.tanh(ci)
288 | out_h.append(hi)
289 | out_c.append(ci)
290 | c = tf.concat(out_c, axis=0)
291 | h = tf.concat(out_h, axis=0)
292 | xs[t] = tf.expand_dims(h, axis=0)
293 | s = tf.concat(axis=1, values=[c, h])
294 | xs = seq_to_batch(xs) # TxNxn_h
295 | xs = tf.transpose(xs, perm=[1,0,2]) # NxTxn_h
296 | return xs, s
297 |
298 | def lstm_ic3(xs, dones, masks, s, scope, init_scale=DEFAULT_SCALE, init_mode=DEFAULT_MODE,
299 | init_method=DEFAULT_METHOD):
300 | n_agent = s.shape[0]
301 | n_h = s.shape[1] // 2
302 | n_s = xs.shape[-1]
303 | xs = tf.transpose(xs, perm=[1,0,2]) # TxNxn_s
304 | xs = batch_to_seq(xs)
305 | # need dones to reset states
306 | dones = batch_to_seq(dones) # Tx1
307 | # create wts
308 | w_msg = []
309 | b_msg = []
310 | w_ob = []
311 | b_ob = []
312 | wx_hid = []
313 | wh_hid = []
314 | b_hid = []
315 | for i in range(n_agent):
316 | with tf.variable_scope(scope + ('_%d' % i)):
317 | w_msg.append(tf.get_variable("w_msg", [n_h, n_h],
318 | initializer=init_method(init_scale, init_mode)))
319 | b_msg.append(tf.get_variable("b_msg", [n_h],
320 | initializer=tf.constant_initializer(0.0)))
321 | w_ob.append(tf.get_variable("w_ob", [n_s, n_h],
322 | initializer=init_method(init_scale, init_mode)))
323 | b_ob.append(tf.get_variable("b_ob", [n_h],
324 | initializer=tf.constant_initializer(0.0)))
325 | wx_hid.append(tf.get_variable("wx_hid", [n_h, n_h*4],
326 | initializer=init_method(init_scale, init_mode)))
327 | wh_hid.append(tf.get_variable("wh_hid", [n_h, n_h*4],
328 | initializer=init_method(init_scale, init_mode)))
329 | b_hid.append(tf.get_variable("b_hid", [n_h*4],
330 | initializer=tf.constant_initializer(0.0)))
331 | c, h = tf.split(axis=1, num_or_size_splits=2, value=s)
332 | # loop over steps
333 | for t, (x, done) in enumerate(zip(xs, dones)):
334 | # abuse 1 agent as 1 step
335 | x = batch_to_seq(tf.squeeze(x, axis=0))
336 | out_h = []
337 | out_c = []
338 | out_m = [tf.expand_dims(h[i], axis=0) for i in range(n_agent)]
339 | out_m = tf.concat(out_m, axis=0) # Nxn_h
340 | # hidden phase
341 | for i, xi in enumerate(x):
342 | ci = tf.expand_dims(c[i], axis=0)
343 | hi = tf.expand_dims(h[i], axis=0)
344 | # reset states for a new episode
345 | ci = ci * (1-done)
346 | hi = hi * (1-done)
347 | # receive neighbor messages
348 | mi = tf.reduce_mean(tf.boolean_mask(out_m, masks[i]), axis=0, keepdims=True)
349 |             # the state encoder in the IC3 code is not consistent with the one described in the paper;
350 |             # here we follow the implementation in the paper.
351 | si = tf.nn.tanh(tf.matmul(xi, w_ob[i]) + b_ob[i]) + tf.matmul(mi, w_msg[i]) + b_msg[i]
352 | zi = tf.matmul(si, wx_hid[i]) + tf.matmul(hi, wh_hid[i]) + b_hid[i]
353 | ii, fi, oi, ui = tf.split(axis=1, num_or_size_splits=4, value=zi)
354 | ii = tf.nn.sigmoid(ii)
355 | fi = tf.nn.sigmoid(fi)
356 | oi = tf.nn.sigmoid(oi)
357 | ui = tf.tanh(ui)
358 | ci = fi*ci + ii*ui
359 | hi = oi*tf.tanh(ci)
360 | out_h.append(hi)
361 | out_c.append(ci)
362 | c = tf.concat(out_c, axis=0)
363 | h = tf.concat(out_h, axis=0)
364 | xs[t] = tf.expand_dims(h, axis=0)
365 | s = tf.concat(axis=1, values=[c, h])
366 | xs = seq_to_batch(xs) # TxNxn_h
367 | xs = tf.transpose(xs, perm=[1,0,2]) # NxTxn_h
368 | return xs, s
369 |
370 |
371 | def lstm_dial(xs, ps, dones, masks, s, scope, init_scale=DEFAULT_SCALE, init_mode=DEFAULT_MODE,
372 | init_method=DEFAULT_METHOD):
373 | n_agent = s.shape[0]
374 | n_h = s.shape[1] // 2
375 | n_s = xs.shape[-1]
376 | n_a = ps.shape[-1]
377 | xs = tf.transpose(xs, perm=[1,0,2]) # TxNxn_s
378 | xs = batch_to_seq(xs)
379 | ps = tf.transpose(ps, perm=[1,0,2]) # TxNxn_a
380 | ps = batch_to_seq(ps)
381 | # need dones to reset states
382 | dones = batch_to_seq(dones) # Tx1
383 | # create wts
384 | w_msg = []
385 | b_msg = []
386 | w_ob = []
387 | b_ob = []
388 | wx_hid = []
389 | wh_hid = []
390 | b_hid = []
391 | for i in range(n_agent):
392 | n_m = np.sum(masks[i])
393 | # n_in_hid = (n_m+1)*n_h
394 | with tf.variable_scope(scope + ('_%d' % i)):
395 | w_msg.append(tf.get_variable("w_msg", [n_h*n_m, n_h],
396 | initializer=init_method(init_scale, init_mode)))
397 | b_msg.append(tf.get_variable("b_msg", [n_h],
398 | initializer=tf.constant_initializer(0.0)))
399 | w_ob.append(tf.get_variable("w_ob", [n_s*(n_m+1), n_h],
400 | initializer=init_method(init_scale, init_mode)))
401 | b_ob.append(tf.get_variable("b_ob", [n_h],
402 | initializer=tf.constant_initializer(0.0)))
403 | wx_hid.append(tf.get_variable("wx_hid", [n_h, n_h*4],
404 | initializer=init_method(init_scale, init_mode)))
405 | wh_hid.append(tf.get_variable("wh_hid", [n_h, n_h*4],
406 | initializer=init_method(init_scale, init_mode)))
407 | b_hid.append(tf.get_variable("b_hid", [n_h*4],
408 | initializer=tf.constant_initializer(0.0)))
409 | c, h = tf.split(axis=1, num_or_size_splits=2, value=s)
410 | # loop over steps
411 | for t, (x, p, done) in enumerate(zip(xs, ps, dones)):
412 | # abuse 1 agent as 1 step
413 | x = tf.squeeze(x, axis=0)
414 | p = tf.squeeze(p, axis=0)
415 | out_h = []
416 | out_c = []
417 | out_m = []
418 | # communication phase
419 | for i in range(n_agent):
420 | hi = tf.expand_dims(h[i], axis=0)
421 | mi = fc(hi, 'mfc_%d' % i, n_h)
422 | out_m.append(mi)
423 | out_m = tf.concat(out_m, axis=0) # Nxn_h
424 | # hidden phase
425 | for i in range(n_agent):
426 | ci = tf.expand_dims(c[i], axis=0)
427 | hi = tf.expand_dims(h[i], axis=0)
428 | # reset states for a new episode
429 | ci = ci * (1-done)
430 | hi = hi * (1-done)
431 | # receive neighbor messages
432 | mi = tf.expand_dims(tf.reshape(tf.boolean_mask(out_m, masks[i]), [-1]), axis=0)
433 | ai = tf.one_hot(tf.expand_dims(tf.argmax(p[i]), axis=0), n_h)
434 | xi = tf.expand_dims(tf.reshape(tf.boolean_mask(x, masks[i]), [-1]), axis=0)
435 | xi = tf.concat([tf.expand_dims(x[i], axis=0), xi], axis=1)
436 | hxi = tf.nn.relu(tf.matmul(xi, w_ob[i]) + b_ob[i])
437 | hmi = tf.nn.relu(tf.matmul(mi, w_msg[i]) + b_msg[i])
438 | si = hxi + hmi + ai
439 | zi = tf.matmul(si, wx_hid[i]) + tf.matmul(hi, wh_hid[i]) + b_hid[i]
440 | ii, fi, oi, ui = tf.split(axis=1, num_or_size_splits=4, value=zi)
441 | ii = tf.nn.sigmoid(ii)
442 | fi = tf.nn.sigmoid(fi)
443 | oi = tf.nn.sigmoid(oi)
444 | ui = tf.tanh(ui)
445 | ci = fi*ci + ii*ui
446 | hi = oi*tf.tanh(ci)
447 | out_h.append(hi)
448 | out_c.append(ci)
449 | c = tf.concat(out_c, axis=0)
450 | h = tf.concat(out_h, axis=0)
451 | xs[t] = tf.expand_dims(h, axis=0)
452 | s = tf.concat(axis=1, values=[c, h])
453 | xs = seq_to_batch(xs) # TxNxn_h
454 | xs = tf.transpose(xs, perm=[1,0,2]) # NxTxn_h
455 | return xs, s
456 |
457 |
458 | """
459 | buffers
460 | """
461 | class TransBuffer:
462 | def reset(self):
463 | self.buffer = []
464 |
465 | @property
466 | def size(self):
467 | return len(self.buffer)
468 |
469 | def add_transition(self, ob, a, r, *_args, **_kwargs):
470 | raise NotImplementedError()
471 |
472 | def sample_transition(self, *_args, **_kwargs):
473 | raise NotImplementedError()
474 |
475 |
476 | class OnPolicyBuffer(TransBuffer):
477 | def __init__(self, gamma, alpha, distance_mask):
478 | self.gamma = gamma
479 | self.alpha = alpha
480 | if alpha > 0:
481 | self.distance_mask = distance_mask
482 | self.max_distance = np.max(distance_mask, axis=-1)
483 | self.reset()
484 |
485 | def reset(self, done=False):
486 | # the done before each step is required
487 | self.obs = []
488 | self.acts = []
489 | self.rs = []
490 | self.vs = []
491 | self.adds = []
492 | self.dones = [done]
493 |
494 | def add_transition(self, ob, na, a, r, v, done):
495 | self.obs.append(ob)
496 | self.adds.append(na)
497 | self.acts.append(a)
498 | self.rs.append(r)
499 | self.vs.append(v)
500 | self.dones.append(done)
501 |
502 | def sample_transition(self, R, dt=0):
503 | if self.alpha < 0:
504 | self._add_R_Adv(R)
505 | else:
506 | self._add_s_R_Adv(R)
507 | obs = np.array(self.obs, dtype=np.float32)
508 | nas = np.array(self.adds, dtype=np.int32)
509 | acts = np.array(self.acts, dtype=np.int32)
510 | Rs = np.array(self.Rs, dtype=np.float32)
511 | Advs = np.array(self.Advs, dtype=np.float32)
512 | # use pre-step dones here
513 | dones = np.array(self.dones[:-1], dtype=np.bool)
514 | self.reset(self.dones[-1])
515 | return obs, nas, acts, dones, Rs, Advs
516 |
517 | def _add_R_Adv(self, R):
518 | Rs = []
519 | Advs = []
520 | # use post-step dones here
521 | for r, v, done in zip(self.rs[::-1], self.vs[::-1], self.dones[:0:-1]):
522 | R = r + self.gamma * R * (1.-done)
523 | Adv = R - v
524 | Rs.append(R)
525 | Advs.append(Adv)
526 | Rs.reverse()
527 | Advs.reverse()
528 | self.Rs = Rs
529 | self.Advs = Advs
530 |
531 | def _add_st_R_Adv(self, R, dt):
532 | Rs = []
533 | Advs = []
534 | # use post-step dones here
535 | tdiff = dt
536 | for r, v, done in zip(self.rs[::-1], self.vs[::-1], self.dones[:0:-1]):
537 | R = self.gamma * R * (1.-done)
538 | if done:
539 | tdiff = 0
540 | # additional spatial rewards
541 | tmax = min(tdiff, self.max_distance)
542 | for t in range(tmax + 1):
543 | rt = np.sum(r[self.distance_mask == t])
544 | R += (self.gamma * self.alpha) ** t * rt
545 | Adv = R - v
546 | tdiff += 1
547 | Rs.append(R)
548 | Advs.append(Adv)
549 | Rs.reverse()
550 | Advs.reverse()
551 | self.Rs = Rs
552 | self.Advs = Advs
553 |
554 | def _add_s_R_Adv(self, R):
555 | Rs = []
556 | Advs = []
557 | # use post-step dones here
558 | for r, v, done in zip(self.rs[::-1], self.vs[::-1], self.dones[:0:-1]):
559 | R = self.gamma * R * (1.-done)
560 | # additional spatial rewards
561 | for t in range(self.max_distance + 1):
562 | rt = np.sum(r[self.distance_mask == t])
563 | R += (self.alpha ** t) * rt
564 | Adv = R - v
565 | Rs.append(R)
566 | Advs.append(Adv)
567 | Rs.reverse()
568 | Advs.reverse()
569 | self.Rs = Rs
570 | self.Advs = Advs
571 |
572 |
573 | class MultiAgentOnPolicyBuffer(OnPolicyBuffer):
574 | def __init__(self, gamma, alpha, distance_mask):
575 | super().__init__(gamma, alpha, distance_mask)
576 |
577 | def sample_transition(self, R, dt=0):
578 | if self.alpha < 0:
579 | self._add_R_Adv(R)
580 | else:
581 | self._add_s_R_Adv(R)
582 | obs = np.transpose(np.array(self.obs, dtype=np.float32), (1, 0, 2))
583 | policies = np.transpose(np.array(self.adds, dtype=np.float32), (1, 0, 2))
584 | acts = np.transpose(np.array(self.acts, dtype=np.int32))
585 | Rs = np.array(self.Rs, dtype=np.float32)
586 | Advs = np.array(self.Advs, dtype=np.float32)
587 | dones = np.array(self.dones[:-1], dtype=np.bool)
588 | self.reset(self.dones[-1])
589 | return obs, policies, acts, dones, Rs, Advs
590 |
591 | def _add_R_Adv(self, R):
592 | Rs = []
593 | Advs = []
594 | vs = np.array(self.vs)
595 | for i in range(vs.shape[1]):
596 | cur_Rs = []
597 | cur_Advs = []
598 | cur_R = R[i]
599 | for r, v, done in zip(self.rs[::-1], vs[::-1,i], self.dones[:0:-1]):
600 | cur_R = r + self.gamma * cur_R * (1.-done)
601 | cur_Adv = cur_R - v
602 | cur_Rs.append(cur_R)
603 | cur_Advs.append(cur_Adv)
604 | cur_Rs.reverse()
605 | cur_Advs.reverse()
606 | Rs.append(cur_Rs)
607 | Advs.append(cur_Advs)
608 | self.Rs = np.array(Rs)
609 | self.Advs = np.array(Advs)
610 |
611 | def _add_st_R_Adv(self, R, dt):
612 | Rs = []
613 | Advs = []
614 | vs = np.array(self.vs)
615 | for i in range(vs.shape[1]):
616 | cur_Rs = []
617 | cur_Advs = []
618 | cur_R = R[i]
619 | tdiff = dt
620 | distance_mask = self.distance_mask[i]
621 | max_distance = self.max_distance[i]
622 | for r, v, done in zip(self.rs[::-1], vs[::-1,i], self.dones[:0:-1]):
623 | cur_R = self.gamma * cur_R * (1.-done)
624 | if done:
625 | tdiff = 0
626 | # additional spatial rewards
627 | tmax = min(tdiff, max_distance)
628 | for t in range(tmax + 1):
629 | rt = np.sum(r[distance_mask==t])
630 | cur_R += (self.gamma * self.alpha) ** t * rt
631 | cur_Adv = cur_R - v
632 | tdiff += 1
633 | cur_Rs.append(cur_R)
634 | cur_Advs.append(cur_Adv)
635 | cur_Rs.reverse()
636 | cur_Advs.reverse()
637 | Rs.append(cur_Rs)
638 | Advs.append(cur_Advs)
639 | self.Rs = np.array(Rs)
640 | self.Advs = np.array(Advs)
641 |
642 | def _add_s_R_Adv(self, R):
643 | Rs = []
644 | Advs = []
645 | vs = np.array(self.vs)
646 | for i in range(vs.shape[1]):
647 | cur_Rs = []
648 | cur_Advs = []
649 | cur_R = R[i]
650 | distance_mask = self.distance_mask[i]
651 | max_distance = self.max_distance[i]
652 | for r, v, done in zip(self.rs[::-1], vs[::-1,i], self.dones[:0:-1]):
653 | cur_R = self.gamma * cur_R * (1.-done)
654 | # additional spatial rewards
655 | for t in range(max_distance + 1):
656 | rt = np.sum(r[distance_mask==t])
657 | cur_R += (self.alpha ** t) * rt
658 | cur_Adv = cur_R - v
659 | cur_Rs.append(cur_R)
660 | cur_Advs.append(cur_Adv)
661 | cur_Rs.reverse()
662 | cur_Advs.reverse()
663 | Rs.append(cur_Rs)
664 | Advs.append(cur_Advs)
665 | self.Rs = np.array(Rs)
666 | self.Advs = np.array(Advs)
667 |
668 | """
669 | util functions
670 | """
671 | class Scheduler:
672 | def __init__(self, val_init, val_min=0, total_step=0, decay='linear'):
673 | self.val = val_init
674 | self.N = float(total_step)
675 | self.val_min = val_min
676 | self.decay = decay
677 | self.n = 0
678 |
679 | def get(self, n_step):
680 | self.n += n_step
681 | if self.decay == 'linear':
682 | return max(self.val_min, self.val * (1 - self.n / self.N))
683 | else:
684 | return self.val
685 |
686 |
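687 | # ---------------------------------------------------------------------------
688 | # Illustrative note (not part of the original code): when coop_gamma (alpha) > 0,
689 | # OnPolicyBuffer._add_s_R_Adv adds a spatially discounted reward at every step:
690 | # rewards of agents at graph distance t are weighted by alpha**t. For example,
691 | # with distance_mask = [0, 1, 2], alpha = 0.9 and step rewards r = [1.0, 2.0, 4.0],
692 | # the spatial term is 1.0*0.9**0 + 2.0*0.9**1 + 4.0*0.9**2 = 6.04, added on top
693 | # of the temporally discounted return gamma * R.
694 | # ---------------------------------------------------------------------------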
--------------------------------------------------------------------------------
/config/config_greedy.ini:
--------------------------------------------------------------------------------
1 | [ENV_CONFIG]
2 | clip_wave = -1.0
3 | clip_wait = -1.0
4 | control_interval_sec = 5
5 | ; agent is greedy, ia2c, ia2c_fp, ma2c_som, ma2c_ic3, ma2c_nc.
6 | agent = greedy
7 | ; coop discount is used to discount the neighbors' impact
8 | coop_gamma = 0.75
9 | data_path = ./envs/data/
10 | episode_length_sec = 3600
11 | ; the normalization is based on typical values in sim
12 | norm_wave = 1.0
13 | norm_wait = 1.0
14 | coef_wait = 0.2
15 | peak_flow1 = 1100
16 | peak_flow2 = 925
17 | init_density = 0
18 | ; objective is chosen from queue, wait, hybrid
19 | objective = queue
20 | scenario = large_grid
21 | seed = 12
22 | test_seeds = 10000,20000,30000,40000,50000,60000,70000,80000,90000,100000
23 | yellow_interval_sec = 2
24 |
--------------------------------------------------------------------------------
/config/config_ia2c.ini:
--------------------------------------------------------------------------------
1 | [MODEL_CONFIG]
2 | rmsp_alpha = 0.99
3 | rmsp_epsilon = 1e-5
4 | max_grad_norm = 40
5 | gamma = 0.99
6 | lr_init = 5e-4
7 | lr_decay = constant
8 | entropy_coef = 0.01
9 | value_coef = 0.5
10 | num_lstm = 64
11 | num_fc = 64
12 | batch_size = 120
13 | reward_norm = 100.0
14 | reward_clip = -1
15 |
16 | [TRAIN_CONFIG]
17 | total_step = 1e6
18 | test_interval = 2e6
19 | log_interval = 1e4
20 |
21 | [ENV_CONFIG]
22 | clip_wave = 2.0
23 | clip_wait = -1
24 | control_interval_sec = 5
25 | ; agent is one of greedy, ia2c, ia2c_fp, ma2c_cu, ma2c_dial, ma2c_ic3, ma2c_nc.
26 | agent = ia2c
27 | ; coop discount is used to discount the neighbors' impact
28 | coop_gamma = 0.9
29 | data_path = ./envs/data/
30 | episode_length_sec = 3600
31 | ; the normalization is based on typical values in sim
32 | norm_wave = 5.0
33 | norm_wait = -1
34 | coef_wait = 0
35 | peak_flow1 = 1100
36 | peak_flow2 = 925
37 | init_density = 0
38 | ; objective is chosen from queue, wait, hybrid
39 | objective = queue
40 | scenario = large_grid
41 | seed = 12
42 | test_seeds = 10000,20000,30000,40000,50000,60000,70000,80000,90000,100000
43 | yellow_interval_sec = 2
44 |
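Values such as `total_step = 1e6` are stored as strings in scientific notation, so `main.py` reads them with `getfloat()` and casts to `int`; a minimal sketch:

```python
import configparser

config = configparser.ConfigParser()
config.read('./config/config_ia2c.ini')

total_step = int(config.getfloat('TRAIN_CONFIG', 'total_step'))   # 1000000
log_step = int(config.getfloat('TRAIN_CONFIG', 'log_interval'))   # 10000
gamma = config.getfloat('MODEL_CONFIG', 'gamma')                  # 0.99
```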
--------------------------------------------------------------------------------
/config/config_ia2c_cu.ini:
--------------------------------------------------------------------------------
1 | [MODEL_CONFIG]
2 | rmsp_alpha = 0.99
3 | rmsp_epsilon = 1e-5
4 | max_grad_norm = 40
5 | gamma = 0.99
6 | lr_init = 5e-4
7 | lr_decay = constant
8 | entropy_coef = 0.01
9 | value_coef = 0.5
10 | num_lstm = 64
11 | num_fc = 64
12 | batch_size = 120
13 | reward_norm = 100.0
14 | reward_clip = -1
15 |
16 | [TRAIN_CONFIG]
17 | total_step = 1e6
18 | test_interval = 2e6
19 | log_interval = 1e4
20 |
21 | [ENV_CONFIG]
22 | clip_wave = 2.0
23 | clip_wait = -1
24 | control_interval_sec = 5
25 | ; agent is one of greedy, ia2c, ia2c_fp, ma2c_cu, ma2c_dial, ma2c_ic3, ma2c_nc.
26 | agent = ma2c_cu
27 | ; coop discount is used to discount the neighbors' impact
28 | coop_gamma = 0.9
29 | data_path = ./envs/data/
30 | episode_length_sec = 3600
31 | ; the normalization is based on typical values in sim
32 | norm_wave = 5.0
33 | norm_wait = -1
34 | coef_wait = 0
35 | peak_flow1 = 1100
36 | peak_flow2 = 925
37 | init_density = 0
38 | ; objective is chosen from queue, wait, hybrid
39 | objective = queue
40 | scenario = large_grid
41 | seed = 12
42 | test_seeds = 10000,20000,30000,40000,50000,60000,70000,80000,90000,100000
43 | yellow_interval_sec = 2
44 |
--------------------------------------------------------------------------------
/config/config_ia2c_fp.ini:
--------------------------------------------------------------------------------
1 | [MODEL_CONFIG]
2 | rmsp_alpha = 0.99
3 | rmsp_epsilon = 1e-5
4 | max_grad_norm = 40
5 | gamma = 0.99
6 | lr_init = 5e-4
7 | lr_decay = constant
8 | entropy_coef = 0.01
9 | value_coef = 0.5
10 | num_lstm = 64
11 | num_fc = 64
12 | batch_size = 120
13 | reward_norm = 100.0
14 | reward_clip = -1
15 |
16 | [TRAIN_CONFIG]
17 | total_step = 1e6
18 | test_interval = 2e6
19 | log_interval = 1e4
20 |
21 | [ENV_CONFIG]
22 | clip_wave = 2.0
23 | clip_wait = -1
24 | control_interval_sec = 5
25 | ; agent is one of greedy, ia2c, ia2c_fp, ma2c_cu, ma2c_dial, ma2c_ic3, ma2c_nc.
26 | agent = ia2c_fp
27 | ; coop discount is used to discount the neighbors' impact
28 | coop_gamma = 0.9
29 | data_path = ./envs/data/
30 | episode_length_sec = 3600
31 | ; the normalization is based on typical values in sim
32 | norm_wave = 5.0
33 | norm_wait = -1
34 | coef_wait = 0
35 | peak_flow1 = 1100
36 | peak_flow2 = 925
37 | init_density = 0
38 | ; objective is chosen from queue, wait, hybrid
39 | objective = queue
40 | scenario = large_grid
41 | seed = 12
42 | test_seeds = 10000,20000,30000,40000,50000,60000,70000,80000,90000,100000
43 | yellow_interval_sec = 2
44 |
--------------------------------------------------------------------------------
/config/config_ma2c_dial.ini:
--------------------------------------------------------------------------------
1 | [MODEL_CONFIG]
2 | rmsp_alpha = 0.99
3 | rmsp_epsilon = 1e-5
4 | max_grad_norm = 40
5 | gamma = 0.99
6 | lr_init = 5e-4
7 | lr_decay = constant
8 | entropy_coef = 0.01
9 | value_coef = 0.5
10 | num_lstm = 64
11 | num_fc = 64
12 | batch_size = 120
13 | reward_norm = 2000.0
14 | reward_clip = -1
15 |
16 | [TRAIN_CONFIG]
17 | total_step = 1e6
18 | test_interval = 2e6
19 | log_interval = 1e4
20 |
21 | [ENV_CONFIG]
22 | clip_wave = 2.0
23 | clip_wait = -1
24 | control_interval_sec = 5
25 | ; agent is one of greedy, ia2c, ia2c_fp, ma2c_cu, ma2c_dial, ma2c_ic3, ma2c_nc.
26 | agent = ma2c_dial
27 | ; coop discount is used to discount the neighbors' impact
28 | coop_gamma = -1
29 | data_path = ./envs/data/
30 | episode_length_sec = 3600
31 | ; the normalization is based on typical values in sim
32 | norm_wave = 5.0
33 | norm_wait = -1
34 | coef_wait = 0
35 | peak_flow1 = 1100
36 | peak_flow2 = 925
37 | init_density = 0
38 | ; objective is chosen from queue, wait, hybrid
39 | objective = queue
40 | scenario = large_grid
41 | seed = 12
42 | test_seeds = 10000
43 | yellow_interval_sec = 2
44 |
--------------------------------------------------------------------------------
/config/config_ma2c_ic3.ini:
--------------------------------------------------------------------------------
1 | [MODEL_CONFIG]
2 | rmsp_alpha = 0.99
3 | rmsp_epsilon = 1e-5
4 | max_grad_norm = 40
5 | gamma = 0.99
6 | lr_init = 5e-4
7 | lr_decay = constant
8 | entropy_coef = 0.01
9 | value_coef = 0.5
10 | num_lstm = 64
11 | num_fc = 64
12 | batch_size = 120
13 | reward_norm = 2000.0
14 | reward_clip = -1
15 |
16 | [TRAIN_CONFIG]
17 | total_step = 1e6
18 | test_interval = 2e6
19 | log_interval = 1e4
20 |
21 | [ENV_CONFIG]
22 | clip_wave = 2.0
23 | clip_wait = -1
24 | control_interval_sec = 5
25 | ; agent is one of greedy, ia2c, ia2c_fp, ma2c_cu, ma2c_dial, ma2c_ic3, ma2c_nc.
26 | agent = ma2c_ic3
27 | ; coop discount is used to discount the neighbors' impact
28 | coop_gamma = -1
29 | data_path = ./envs/data/
30 | episode_length_sec = 3600
31 | ; the normalization is based on typical values in sim
32 | norm_wave = 5.0
33 | norm_wait = -1
34 | coef_wait = 0
35 | peak_flow1 = 1100
36 | peak_flow2 = 925
37 | init_density = 0
38 | ; objective is chosen from queue, wait, hybrid
39 | objective = queue
40 | scenario = large_grid
41 | seed = 12
42 | test_seeds = 10000
43 | yellow_interval_sec = 2
44 |
--------------------------------------------------------------------------------
/config/config_ma2c_nc.ini:
--------------------------------------------------------------------------------
1 | [MODEL_CONFIG]
2 | rmsp_alpha = 0.99
3 | rmsp_epsilon = 1e-5
4 | max_grad_norm = 40
5 | gamma = 0.99
6 | lr_init = 5e-4
7 | lr_decay = constant
8 | entropy_coef = 0.01
9 | value_coef = 0.5
10 | num_lstm = 64
11 | num_fc = 64
12 | batch_size = 120
13 | reward_norm = 2000.0
14 | reward_clip = -1
15 |
16 | [TRAIN_CONFIG]
17 | total_step = 1e6
18 | test_interval = 2e6
19 | log_interval = 1e4
20 |
21 | [ENV_CONFIG]
22 | clip_wave = 2.0
23 | clip_wait = -1
24 | control_interval_sec = 5
25 | ; agent is one of greedy, ia2c, ia2c_fp, ma2c_cu, ma2c_dial, ma2c_ic3, ma2c_nc.
26 | agent = ma2c_nc
27 | ; coop discount is used to discount the neighbors' impact
28 | coop_gamma = -1
29 | data_path = ./envs/data/
30 | episode_length_sec = 3600
31 | ; the normalization is based on typical values in sim
32 | norm_wave = 5.0
33 | norm_wait = -1
34 | coef_wait = 0
35 | peak_flow1 = 1100
36 | peak_flow2 = 925
37 | init_density = 0
38 | ; objective is chosen from queue, wait, hybrid
39 | objective = queue
40 | scenario = large_grid
41 | seed = 12
42 | test_seeds = 10000
43 | yellow_interval_sec = 2
44 |
--------------------------------------------------------------------------------
/envs/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MUmarJaved/MultiAgent-Distributed-Reinforcement-Learning/d5a0d7226011f7050f643b66e51e141277dd0e58/envs/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/envs/__pycache__/env.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MUmarJaved/MultiAgent-Distributed-Reinforcement-Learning/d5a0d7226011f7050f643b66e51e141277dd0e58/envs/__pycache__/env.cpython-37.pyc
--------------------------------------------------------------------------------
/envs/__pycache__/large_grid_env.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MUmarJaved/MultiAgent-Distributed-Reinforcement-Learning/d5a0d7226011f7050f643b66e51e141277dd0e58/envs/__pycache__/large_grid_env.cpython-37.pyc
--------------------------------------------------------------------------------
/envs/data/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MUmarJaved/MultiAgent-Distributed-Reinforcement-Learning/d5a0d7226011f7050f643b66e51e141277dd0e58/envs/data/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/envs/data/__pycache__/build_file.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MUmarJaved/MultiAgent-Distributed-Reinforcement-Learning/d5a0d7226011f7050f643b66e51e141277dd0e58/envs/data/__pycache__/build_file.cpython-37.pyc
--------------------------------------------------------------------------------
/envs/data/build_file.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | build *.xml files for a large 5 x 5 network
4 | w/ the traffic dynamics modified from the following paper:
5 |
6 | Chu, Tianshu, Shuhui Qu, and Jie Wang. "Large-scale traffic grid signal control with
7 | regional reinforcement learning." American Control Conference (ACC), 2016. IEEE, 2016.
8 |
9 | @author: Tianshu Chu
10 | """
11 | import numpy as np
12 | import os
13 |
14 | MAX_CAR_NUM = 30
15 | SPEED_LIMIT_ST = 20
16 | SPEED_LIMIT_AV = 11
17 | L0 = 200
18 | L0_end = 75
19 | N = 5
20 |
21 |
22 | def write_file(path, content):
23 | with open(path, 'w') as f:
24 | f.write(content)
25 |
26 |
27 | def output_nodes(node):
28 | str_nodes = '\n'
29 | # traffic light nodes
30 | ind = 1
31 | for dy in np.arange(0, L0 * 5, L0):
32 | for dx in np.arange(0, L0 * 5, L0):
33 | str_nodes += node % ('nt' + str(ind), dx, dy, 'traffic_light')
34 | ind += 1
35 | # other nodes
36 | ind = 1
37 | for dx in np.arange(0, L0 * 5, L0):
38 | str_nodes += node % ('np' + str(ind), dx, -L0_end, 'priority')
39 | ind += 1
40 | for dy in np.arange(0, L0 * 5, L0):
41 | str_nodes += node % ('np' + str(ind), L0 * 4 + L0_end, dy, 'priority')
42 | ind += 1
43 | for dx in np.arange(L0 * 4, -1, -L0):
44 | str_nodes += node % ('np' + str(ind), dx, L0 * 4 + L0_end, 'priority')
45 | ind += 1
46 | for dy in np.arange(L0 * 4, -1, -L0):
47 | str_nodes += node % ('np' + str(ind), -L0_end, dy, 'priority')
48 | ind += 1
49 | str_nodes += '\n'
50 | return str_nodes
51 |
52 |
53 | def output_road_types():
54 | str_types = '\n'
55 | str_types += ' \n' % SPEED_LIMIT_ST
56 | str_types += ' \n' % SPEED_LIMIT_AV
57 | str_types += '\n'
58 | return str_types
59 |
60 |
61 | def get_edge_str(edge, from_node, to_node, edge_type):
62 | edge_id = '%s_%s' % (from_node, to_node)
63 | return edge % (edge_id, from_node, to_node, edge_type)
64 |
65 |
66 | def output_edges(edge):
67 | str_edges = '\n'
68 | # external roads
69 | in_edges = [5, 10, 15, 20, 25, 21, 16, 11, 6, 1]
70 | out_edges = [6, 7, 8, 9, 10, 16, 17, 18, 19, 20]
71 | for in_i, out_i in zip(in_edges, out_edges):
72 | in_node = 'nt' + str(in_i)
73 | out_node = 'np' + str(out_i)
74 | str_edges += get_edge_str(edge, in_node, out_node, 'a')
75 | str_edges += get_edge_str(edge, out_node, in_node, 'a')
76 |
77 | in_edges = [1, 2, 3, 4, 5, 25, 24, 23, 22, 21]
78 | out_edges = [1, 2, 3, 4, 5, 11, 12, 13, 14, 15]
79 | for in_i, out_i in zip(in_edges, out_edges):
80 | in_node = 'nt' + str(in_i)
81 | out_node = 'np' + str(out_i)
82 | str_edges += get_edge_str(edge, in_node, out_node, 'b')
83 | str_edges += get_edge_str(edge, out_node, in_node, 'b')
84 | # internal roads
85 | for i in range(1, 25, 5):
86 | for j in range(4):
87 | from_node = 'nt' + str(i + j)
88 | to_node = 'nt' + str(i + j + 1)
89 | str_edges += get_edge_str(edge, from_node, to_node, 'a')
90 | str_edges += get_edge_str(edge, to_node, from_node, 'a')
91 | for i in range(1, 6):
92 | for j in range(0, 20, 5):
93 | from_node = 'nt' + str(i + j)
94 | to_node = 'nt' + str(i + j + 5)
95 | str_edges += get_edge_str(edge, from_node, to_node, 'b')
96 | str_edges += get_edge_str(edge, to_node, from_node, 'b')
97 | str_edges += '\n'
98 | return str_edges
99 |
100 |
101 | def get_con_str(con, from_node, cur_node, to_node, from_lane, to_lane):
102 | from_edge = '%s_%s' % (from_node, cur_node)
103 | to_edge = '%s_%s' % (cur_node, to_node)
104 | return con % (from_edge, to_edge, from_lane, to_lane)
105 |
106 |
107 | def get_con_str_set(con, cur_node, n_node, s_node, w_node, e_node):
108 | str_cons = ''
109 | # go-through
110 | str_cons += get_con_str(con, s_node, cur_node, n_node, 0, 0)
111 | str_cons += get_con_str(con, n_node, cur_node, s_node, 0, 0)
112 | str_cons += get_con_str(con, w_node, cur_node, e_node, 0, 0)
113 | str_cons += get_con_str(con, e_node, cur_node, w_node, 0, 0)
114 | # left-turn
115 | str_cons += get_con_str(con, s_node, cur_node, w_node, 0, 1)
116 | str_cons += get_con_str(con, n_node, cur_node, e_node, 0, 1)
117 | str_cons += get_con_str(con, w_node, cur_node, n_node, 1, 0)
118 | str_cons += get_con_str(con, e_node, cur_node, s_node, 1, 0)
119 | # right-turn
120 | str_cons += get_con_str(con, s_node, cur_node, e_node, 0, 0)
121 | str_cons += get_con_str(con, n_node, cur_node, w_node, 0, 0)
122 | str_cons += get_con_str(con, w_node, cur_node, s_node, 0, 0)
123 | str_cons += get_con_str(con, e_node, cur_node, n_node, 0, 0)
124 | return str_cons
125 |
126 |
127 | def output_connections(con):
128 | str_cons = '\n'
129 | # edge nodes
130 | in_edges = [5, 10, 15, 20, 25, 21, 16, 11, 6, 1]
131 | out_edges = [6, 7, 8, 9, 10, 16, 17, 18, 19, 20]
132 | for i, j in zip(in_edges, out_edges):
133 | if i == 5:
134 | s_node = 'np5'
135 | elif i == 1:
136 | s_node = 'np1'
137 | else:
138 | s_node = 'nt' + str(i - 5)
139 | if i == 25:
140 | n_node = 'np11'
141 | elif i == 21:
142 | n_node = 'np15'
143 | else:
144 | n_node = 'nt' + str(i + 5)
145 | if i % 5 == 1:
146 | w_node = 'np' + str(j)
147 | else:
148 | w_node = 'nt' + str(i - 1)
149 | if i % 5 == 0:
150 | e_node = 'np' + str(j)
151 | else:
152 | e_node = 'nt' + str(i + 1)
153 | cur_node = 'nt' + str(i)
154 | str_cons += get_con_str_set(con, cur_node, n_node, s_node, w_node, e_node)
155 |
156 | in_edges = [2, 3, 4, 24, 23, 22]
157 | out_edges = [2, 3, 4, 12, 13, 14]
158 | for i, j in zip(in_edges, out_edges):
159 | w_node = 'nt' + str(i - 1)
160 | e_node = 'nt' + str(i + 1)
161 | if i <= 5:
162 | s_node = 'np' + str(j)
163 | else:
164 | s_node = 'nt' + str(i - 5)
165 | if i >= 20:
166 | n_node = 'np' + str(j)
167 | else:
168 | n_node = 'nt' + str(i + 5)
169 | cur_node = 'nt' + str(i)
170 | str_cons += get_con_str_set(con, cur_node, n_node, s_node, w_node, e_node)
171 |
172 | # internal nodes
173 | for i in [7, 8, 9, 12, 13, 14, 17, 18, 19]:
174 | n_node = 'nt' + str(i + 5)
175 | s_node = 'nt' + str(i - 5)
176 | w_node = 'nt' + str(i - 1)
177 | e_node = 'nt' + str(i + 1)
178 | cur_node = 'nt' + str(i)
179 | str_cons += get_con_str_set(con, cur_node, n_node, s_node, w_node, e_node)
180 |
181 | str_cons += '\n'
182 | return str_cons
183 |
184 |
185 | def output_netconfig():
186 | str_config = '\n \n'
187 | str_config += ' \n'
188 | str_config += ' \n'
189 | str_config += ' \n'
190 | str_config += ' \n'
191 | str_config += ' \n'
192 | str_config += ' \n \n\n'
195 | return str_config
196 |
197 |
198 | def get_external_od(out_edges, dest=True):
199 | edge_maps = [0, 1, 2, 3, 4, 5, 5, 10, 15, 20, 25,
200 | 25, 24, 23, 22, 21, 21, 16, 11, 6, 1]
201 | cur_dest = []
202 | for out_edge in out_edges:
203 | in_edge = edge_maps[out_edge]
204 | in_node = 'nt' + str(in_edge)
205 | out_node = 'np' + str(out_edge)
206 | if dest:
207 | edge = '%s_%s' % (in_node, out_node)
208 | else:
209 | edge = '%s_%s' % (out_node, in_node)
210 | cur_dest.append(edge)
211 | return cur_dest
212 |
213 |
214 | def sample_od_pair(orig_edges, dest_edges):
215 | from_edges = []
216 | to_edges = []
217 | for i in range(len(orig_edges)):
218 | from_edges.append(np.random.choice(orig_edges[i]))
219 | to_edges.append(np.random.choice(dest_edges))
220 | return from_edges, to_edges
221 |
222 |
223 | def init_routes(density):
224 | init_flow = ' \n'
225 | output = ''
226 | in_nodes = [5, 10, 15, 20, 25, 21, 16, 11, 6, 1,
227 | 1, 2, 3, 4, 5, 25, 24, 23, 22, 21]
228 | out_nodes = [6, 7, 8, 9, 10, 16, 17, 18, 19, 20,
229 | 1, 2, 3, 4, 5, 11, 12, 13, 14, 15]
230 | # external edges
231 | sink_edges = []
232 | for i, j in zip(in_nodes, out_nodes):
233 | node1 = 'nt' + str(i)
234 | node2 = 'np' + str(j)
235 | sink_edges.append('%s_%s' % (node1, node2))
236 |
237 | def get_od(node1, node2, k, lane=0):
238 | source_edge = '%s_%s' % (node1, node2)
239 | sink_edge = np.random.choice(sink_edges)
240 | return init_flow % (str(k), source_edge, sink_edge, lane, car_num)
241 |
242 | # streets
243 | k = 1
244 | car_num = int(MAX_CAR_NUM * density)
245 | for i in range(1, 25, 5):
246 | for j in range(4):
247 | node1 = 'nt' + str(i + j)
248 | node2 = 'nt' + str(i + j + 1)
249 | output += get_od(node1, node2, k)
250 | k += 1
251 | output += get_od(node2, node1, k)
252 | k += 1
253 | output += get_od(node1, node2, k, lane=1)
254 | k += 1
255 | output += get_od(node2, node1, k, lane=1)
256 | k += 1
257 | # avenues
258 | for i in range(1, 6):
259 | for j in range(0, 20, 5):
260 | node1 = 'nt' + str(i + j)
261 | node2 = 'nt' + str(i + j + 5)
262 | output += get_od(node1, node2, k)
263 | k += 1
264 | output += get_od(node2, node1, k)
265 | k += 1
266 | return output
267 |
268 | def output_flows(peak_flow1, peak_flow2, density, seed=None):
269 | '''
270 | flow1: x11, x12, x13, x14, x15 -> x1, x2, x3, x4, x5
271 | flow2: x16, x17, x18, x19, x20 -> x6, x7, x8, x9, x10
272 | flow3: x1, x2, x3, x4, x5 -> x15, x14, x13, x12, x11
273 | flow4: x6, x7, x8, x9, x10 -> x20, x19, x18, x17, x16
274 | '''
275 | if seed is not None:
276 | np.random.seed(seed)
277 | ext_flow = ' \n'
278 | str_flows = '\n'
279 | str_flows += ' \n'
280 | # initial traffic dist
281 | if density > 0:
282 | str_flows += init_routes(density)
283 |
284 | # create external origins and destinations for flows
285 | srcs = []
286 | srcs.append(get_external_od([12, 13, 14], dest=False))
287 | srcs.append(get_external_od([16, 18, 20], dest=False))
288 | srcs.append(get_external_od([2, 3, 4], dest=False))
289 | srcs.append(get_external_od([6, 8, 10], dest=False))
290 |
291 | sinks = []
292 | sinks.append(get_external_od([2, 3, 4]))
293 | sinks.append(get_external_od([6, 8, 10]))
294 | sinks.append(get_external_od([14, 13, 12]))
295 | sinks.append(get_external_od([20, 18, 16]))
296 |
297 | # create volumes per 5 min for flows
298 | ratios1 = np.array([0.4, 0.7, 0.9, 1.0, 0.75, 0.5, 0.25]) # start from 0
299 | ratios2 = np.array([0.3, 0.8, 0.9, 1.0, 0.8, 0.6, 0.2]) # start from 15min
300 | flows1 = peak_flow1 * 0.6 * ratios1
301 | flows2 = peak_flow1 * ratios1
302 | flows3 = peak_flow2 * 0.6 * ratios2
303 | flows4 = peak_flow2 * ratios2
304 | flows = [flows1, flows2, flows3, flows4]
305 | times = np.arange(0, 3001, 300)
306 | id1 = len(flows1)
307 | id2 = len(times) - 1 - id1
308 | for i in range(len(times) - 1):
309 | name = str(i)
310 | t_begin, t_end = times[i], times[i + 1]
311 | # external flow
312 | k = 0
313 | if i < id1:
314 | for j in [0, 1]:
315 | for e1, e2 in zip(srcs[j], sinks[j]):
316 | cur_name = name + '_' + str(k)
317 | str_flows += ext_flow % (cur_name, e1, e2, t_begin, t_end, flows[j][i])
318 | k += 1
319 | if i >= id2:
320 | for j in [2, 3]:
321 | for e1, e2 in zip(srcs[j], sinks[j]):
322 | cur_name = name + '_' + str(k)
323 | str_flows += ext_flow % (cur_name, e1, e2, t_begin, t_end, flows[j][i - id2])
324 | k += 1
325 | str_flows += '\n'
326 | return str_flows
327 |
328 |
329 | def gen_rou_file(path, peak_flow1, peak_flow2, density, seed=None, thread=None):
330 | if thread is None:
331 | flow_file = 'exp.rou.xml'
332 | else:
333 | flow_file = 'exp_%d.rou.xml' % int(thread)
334 | write_file(path + flow_file, output_flows(peak_flow1, peak_flow2, density, seed=seed))
335 | sumocfg_file = path + ('exp.sumocfg' if thread is None else 'exp_%d.sumocfg' % int(thread))
336 | write_file(sumocfg_file, output_config(thread=thread))
337 | return sumocfg_file
338 |
339 |
340 | def output_config(thread=None):
341 | if thread is None:
342 | out_file = 'exp.rou.xml'
343 | else:
344 | out_file = 'exp_%d.rou.xml' % int(thread)
345 | str_config = '\n \n'
346 | str_config += ' \n'
347 | str_config += ' \n' % out_file
348 | str_config += ' \n'
349 | str_config += ' \n \n\n'
352 | return str_config
353 |
354 |
355 | def get_ild_str(from_node, to_node, ild_str, lane_i=0):
356 | edge = '%s_%s' % (from_node, to_node)
357 | return ild_str % (edge, lane_i, edge, lane_i)
358 |
359 |
360 | def output_ild(ild):
361 | str_adds = '\n'
362 | in_edges = [5, 10, 15, 20, 25, 21, 16, 11, 6, 1,
363 | 1, 2, 3, 4, 5, 25, 24, 23, 22, 21]
364 | out_edges = [6, 7, 8, 9, 10, 16, 17, 18, 19, 20,
365 | 1, 2, 3, 4, 5, 11, 12, 13, 14, 15]
366 | # external edges
367 | for k, (i, j) in enumerate(zip(in_edges, out_edges)):
368 | node1 = 'nt' + str(i)
369 | node2 = 'np' + str(j)
370 | str_adds += get_ild_str(node2, node1, ild)
371 | if k < 10:
372 | # streets
373 | str_adds += get_ild_str(node2, node1, ild, lane_i=1)
374 | # streets
375 | for i in range(1, 25, 5):
376 | for j in range(4):
377 | node1 = 'nt' + str(i + j)
378 | node2 = 'nt' + str(i + j + 1)
379 | str_adds += get_ild_str(node1, node2, ild)
380 | str_adds += get_ild_str(node2, node1, ild)
381 | str_adds += get_ild_str(node1, node2, ild, lane_i=1)
382 | str_adds += get_ild_str(node2, node1, ild, lane_i=1)
383 | # avenues
384 | for i in range(1, 6):
385 | for j in range(0, 20, 5):
386 | node1 = 'nt' + str(i + j)
387 | node2 = 'nt' + str(i + j + 5)
388 | str_adds += get_ild_str(node1, node2, ild)
389 | str_adds += get_ild_str(node2, node1, ild)
390 | str_adds += '\n'
391 | return str_adds
392 |
393 |
394 | def output_tls(tls, phase):
395 | str_adds = '\n'
396 | # every intersection cycles through 3 green phases, each followed by a yellow phase
397 | three_phases = ['GGgrrrGGgrrr', 'yyyrrryyyrrr',
398 | 'rrrGrGrrrGrG', 'rrrGryrrrGry',
399 | 'rrrGGrrrrGGr', 'rrryyrrrryyr']
400 | phase_duration = [30, 3]
401 | for i in range(1, 26):
402 | node = 'nt' + str(i)
403 | str_adds += tls % node
404 | for k, p in enumerate(three_phases):
405 | str_adds += phase % (phase_duration[k % 2], p)
406 | str_adds += ' \n'
407 | str_adds += '\n'
408 | return str_adds
409 |
410 |
411 | def main():
412 | # nod.xml file
413 | node = ' \n'
414 | write_file('./exp.nod.xml', output_nodes(node))
415 |
416 | # typ.xml file
417 | write_file('./exp.typ.xml', output_road_types())
418 |
419 | # edg.xml file
420 | edge = ' \n'
421 | write_file('./exp.edg.xml', output_edges(edge))
422 |
423 | # con.xml file
424 | con = ' \n'
425 | write_file('./exp.con.xml', output_connections(con))
426 |
427 | # tls.xml file
428 | tls = ' \n'
429 | phase = ' \n'
430 | write_file('./exp.tll.xml', output_tls(tls, phase))
431 |
432 | # net config file
433 | write_file('./exp.netccfg', output_netconfig())
434 |
435 | # generate net.xml file
436 | os.system('netconvert -c exp.netccfg')
437 |
438 | # raw.rou.xml file
439 | write_file('./exp.rou.xml', output_flows(1000, 2000, 0.2))
440 |
441 | # generate rou.xml file
442 | # os.system('jtrrouter -n exp.net.xml -r exp.raw.rou.xml -o exp.rou.xml')
443 |
444 | # add.xml file
445 | ild = ' \n'
446 | # ild_in = ' \n'
447 | write_file('./exp.add.xml', output_ild(ild))
448 |
449 | # config file
450 | write_file('./exp.sumocfg', output_config())
451 |
452 | if __name__ == '__main__':
453 | main()
454 |
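A minimal sketch of how `gen_rou_file()` above is invoked per episode (mirroring `LargeGridEnv._init_sim_config`; the flow and density values are the ones from the configs):

```python
from envs.data.build_file import gen_rou_file

# Writes exp_0.rou.xml and exp_0.sumocfg under ./envs/data/ and returns the
# sumocfg path that is then passed to the SUMO command line.
sumocfg = gen_rou_file('./envs/data/',
                       peak_flow1=1100,
                       peak_flow2=925,
                       density=0.0,     # init_density = 0 disables init_routes()
                       seed=12,
                       thread=0)        # per-simulation suffix for the file names
print(sumocfg)                          # ./envs/data/exp_0.sumocfg
```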
--------------------------------------------------------------------------------
/envs/data/intersection.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MUmarJaved/MultiAgent-Distributed-Reinforcement-Learning/d5a0d7226011f7050f643b66e51e141277dd0e58/envs/data/intersection.pdf
--------------------------------------------------------------------------------
/envs/data/network.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MUmarJaved/MultiAgent-Distributed-Reinforcement-Learning/d5a0d7226011f7050f643b66e51e141277dd0e58/envs/data/network.pdf
--------------------------------------------------------------------------------
/envs/data/view.xml:
--------------------------------------------------------------------------------
(content of view.xml was not recoverable from this extraction)
--------------------------------------------------------------------------------
/envs/env.py:
--------------------------------------------------------------------------------
1 | """
2 | Traffic network simulator w/ defined sumo files
3 | @author: Tianshu Chu
4 | """
5 | import logging
6 | import numpy as np
7 | import pandas as pd
8 | import subprocess
9 | from sumolib import checkBinary  # used by _init_sim below
10 | import time
11 | import traci
12 | import xml.etree.cElementTree as ET
13 |
14 | DEFAULT_PORT = 8000
15 | SEC_IN_MS = 1000
16 |
17 |
18 | class PhaseSet:
19 | def __init__(self, phases):
20 | self.num_phase = len(phases)
21 | self.num_lane = len(phases[0])
22 | self.phases = phases
23 | self._init_phase_set()
24 |
25 | @staticmethod
26 | def _get_phase_lanes(phase, signal='r'):
27 | phase_lanes = []
28 | for i, l in enumerate(phase):
29 | if l == signal:
30 | phase_lanes.append(i)
31 | return phase_lanes
32 |
33 | def _init_phase_set(self):
34 | self.red_lanes = []
35 | for phase in self.phases:
36 | self.red_lanes.append(self._get_phase_lanes(phase))
37 |
38 |
39 | class PhaseMap:
40 | def __init__(self):
41 | self.phases = {}
42 |
43 | def get_phase(self, phase_id, action):
44 | # phase_type is either green or yellow
45 | return self.phases[phase_id].phases[int(action)]
46 |
47 | def get_phase_num(self, phase_id):
48 | return self.phases[phase_id].num_phase
49 |
50 | def get_lane_num(self, phase_id):
51 | # the lane number is link number
52 | return self.phases[phase_id].num_lane
53 |
54 | def get_red_lanes(self, phase_id, action):
55 | # the lane number is link number
56 | return self.phases[phase_id].red_lanes[int(action)]
57 |
58 |
59 | class Node:
60 | def __init__(self, name, neighbor=[], control=False):
61 | self.control = control # disabled
62 | self.lanes_in = []
63 | self.ilds_in = [] # for state
64 | self.fingerprint = [] # local policy
65 | self.name = name
66 | self.neighbor = neighbor
67 | self.num_state = 0 # wave and wait should have the same dim
68 | self.wave_state = [] # local state
69 | self.wait_state = [] # local state
70 | self.phase_id = -1
71 | self.n_a = 0
72 | self.prev_action = -1
73 |
74 |
75 | class TrafficSimulator:
76 | def __init__(self, config, output_path, is_record, record_stats, port=0):
77 | self.name = config.get('scenario')
78 | self.seed = config.getint('seed')
79 | self.control_interval_sec = config.getint('control_interval_sec')
80 | self.yellow_interval_sec = config.getint('yellow_interval_sec')
81 | self.episode_length_sec = config.getint('episode_length_sec')
82 | self.T = np.ceil(self.episode_length_sec / self.control_interval_sec)
83 | self.port = DEFAULT_PORT + port
84 | self.sim_thread = port
85 | self.obj = config.get('objective')
86 | self.data_path = config.get('data_path')
87 | self.agent = config.get('agent')
88 | self.coop_gamma = config.getfloat('coop_gamma')
89 | self.cur_episode = 0
90 | self.norms = {'wave': config.getfloat('norm_wave'),
91 | 'wait': config.getfloat('norm_wait')}
92 | self.clips = {'wave': config.getfloat('clip_wave'),
93 | 'wait': config.getfloat('clip_wait')}
94 | self.coef_wait = config.getfloat('coef_wait')
95 | self.train_mode = True
96 | test_seeds = config.get('test_seeds').split(',')
97 | test_seeds = [int(s) for s in test_seeds]
98 | self._init_map()
99 | self.init_data(is_record, record_stats, output_path)
100 | self.init_test_seeds(test_seeds)
101 | self._init_sim(self.seed)
102 | self._init_nodes()
103 | self.terminate()
104 |
105 | def collect_tripinfo(self):
106 | # read trip xml, has to be called externally to get complete file
107 | trip_file = self.output_path + ('%s_%s_trip.xml' % (self.name, self.agent))
108 | tree = ET.ElementTree(file=trip_file)
109 | for child in tree.getroot():
110 | cur_trip = child.attrib
111 | cur_dict = {}
112 | cur_dict['episode'] = self.cur_episode
113 | cur_dict['id'] = cur_trip['id']
114 | cur_dict['depart_sec'] = cur_trip['depart']
115 | cur_dict['arrival_sec'] = cur_trip['arrival']
116 | cur_dict['duration_sec'] = cur_trip['duration']
117 | cur_dict['wait_step'] = cur_trip['waitingCount']
118 | cur_dict['wait_sec'] = cur_trip['waitingTime']
119 | self.trip_data.append(cur_dict)
120 | # delete the current xml
121 | cmd = 'rm ' + trip_file
122 | subprocess.check_call(cmd, shell=True)
123 |
124 | def get_fingerprint(self):
125 | policies = []
126 | for node_name in self.node_names:
127 | policies.append(self.nodes[node_name].fingerprint)
128 | return np.array(policies)
129 |
130 | def get_neighbor_action(self, action):
131 | naction = []
132 | for i in range(self.n_agent):
133 | naction.append(action[self.neighbor_mask[i] == 1])
134 | return naction
135 |
136 | def init_data(self, is_record, record_stats, output_path):
137 | self.is_record = is_record
138 | self.record_stats = record_stats
139 | self.output_path = output_path
140 | if self.is_record:
141 | self.traffic_data = []
142 | self.control_data = []
143 | self.trip_data = []
144 | if self.record_stats:
145 | self.state_stat = {}
146 | for state_name in self.state_names:
147 | self.state_stat[state_name] = []
148 |
149 | def init_test_seeds(self, test_seeds):
150 | self.test_num = len(test_seeds)
151 | self.test_seeds = test_seeds
152 |
153 | def output_data(self):
154 | if not self.is_record:
155 | logging.error('Env: no record to output!')
156 | control_data = pd.DataFrame(self.control_data)
157 | control_data.to_csv(self.output_path + ('%s_%s_control.csv' % (self.name, self.agent)))
158 | traffic_data = pd.DataFrame(self.traffic_data)
159 | traffic_data.to_csv(self.output_path + ('%s_%s_traffic.csv' % (self.name, self.agent)))
160 | trip_data = pd.DataFrame(self.trip_data)
161 | trip_data.to_csv(self.output_path + ('%s_%s_trip.csv' % (self.name, self.agent)))
162 |
163 | def reset(self, gui=False, test_ind=0):
164 | # have to terminate previous sim before calling reset
165 | self._reset_state()
166 | if self.train_mode:
167 | seed = self.seed
168 | else:
169 | seed = self.test_seeds[test_ind]
170 | self._init_sim(seed, gui=gui)
171 | self.cur_sec = 0
172 | self.cur_episode += 1
173 | # initialize fingerprint
174 | self.update_fingerprint(self._init_policy())
175 | # next environment random condition should be different
176 | self.seed += 1
177 | return self._get_state()
178 |
179 | def step(self, action):
180 | self._set_phase(action, 'yellow', self.yellow_interval_sec)
181 | self._simulate(self.yellow_interval_sec)
182 | rest_interval_sec = self.control_interval_sec - self.yellow_interval_sec
183 | self._set_phase(action, 'green', rest_interval_sec)
184 | self._simulate(rest_interval_sec)
185 | state = self._get_state()
186 | reward = self._measure_reward_step()
187 | done = False
188 | if self.cur_sec >= self.episode_length_sec:
189 | done = True
190 | global_reward = np.sum(reward)
191 | if self.is_record:
192 | action_r = ','.join(['%d' % a for a in action])
193 | cur_control = {'episode': self.cur_episode,
194 | 'time_sec': self.cur_sec,
195 | 'step': self.cur_sec / self.control_interval_sec,
196 | 'action': action_r,
197 | 'reward': global_reward}
198 | self.control_data.append(cur_control)
199 |
200 | # use original rewards in test
201 | if not self.train_mode:
202 | return state, reward, done, global_reward
203 | if (self.agent == 'greedy') or (self.coop_gamma < 0):
204 | reward = global_reward
205 | return state, reward, done, global_reward
206 |
207 | def terminate(self):
208 | self.sim.close()
209 |
210 | def update_fingerprint(self, policy):
211 | for node_name, pi in zip(self.node_names, policy):
212 | self.nodes[node_name].fingerprint = pi
213 |
214 | def _get_node_phase(self, action, node_name, phase_type):
215 | node = self.nodes[node_name]
216 | cur_phase = self.phase_map.get_phase(node.phase_id, action)
217 | if phase_type == 'green':
218 | return cur_phase
219 | prev_action = node.prev_action
220 | node.prev_action = action
221 | if (prev_action < 0) or (action == prev_action):
222 | return cur_phase
223 | prev_phase = self.phase_map.get_phase(node.phase_id, prev_action)
224 | switch_reds = []
225 | switch_greens = []
226 | for i, (p0, p1) in enumerate(zip(prev_phase, cur_phase)):
227 | if (p0 in 'Gg') and (p1 == 'r'):
228 | switch_reds.append(i)
229 | elif (p0 in 'r') and (p1 in 'Gg'):
230 | switch_greens.append(i)
231 | if not len(switch_reds):
232 | return cur_phase
233 | yellow_phase = list(cur_phase)
234 | for i in switch_reds:
235 | yellow_phase[i] = 'y'
236 | for i in switch_greens:
237 | yellow_phase[i] = 'r'
238 | return ''.join(yellow_phase)
239 |
240 | def _get_node_phase_id(self, node_name):
241 | # needs to be overwritten
242 | raise NotImplementedError()
243 |
244 | def _get_state(self):
245 | # hard code the state ordering as wave, wait, fp
246 | state = []
247 | # measure the most recent state
248 | self._measure_state_step()
249 |
250 | # get the appropriate state vectors
251 | for node_name in self.node_names:
252 | node = self.nodes[node_name]
253 | # wave is required in state
254 | if self.agent == 'greedy':
255 | state.append(node.wave_state)
256 | else:
257 | cur_state = [node.wave_state]
258 |
259 | # include wave states of neighbors
260 | if self.agent.startswith('ia2c'):
261 | for nnode_name in node.neighbor:
262 | cur_state.append(self.nodes[nnode_name].wave_state)
263 |
264 | # include fingerprints of neighbors
265 | if self.agent == 'ia2c_fp':
266 | for nnode_name in node.neighbor:
267 | cur_state.append(self.nodes[nnode_name].fingerprint)
268 |
269 | # include wait state
270 | if 'wait' in self.state_names:
271 | cur_state.append(node.wait_state)
272 | state.append(np.concatenate(cur_state))
273 | return state
274 |
275 | def _init_action_space(self):
276 | # for local and neighbor coop level
277 | self.n_agent = self.n_node
278 | # to simplify the sim, we assume all agents have the same action dim
279 | phase_id = self._get_node_phase_id('all')
280 | phase_num = self.phase_map.get_phase_num(phase_id)
281 | self.n_a = phase_num
282 | for node_name in self.node_names:
283 | node = self.nodes[node_name]
284 | node.phase_id = phase_id
285 | node.n_a = phase_num
286 |
287 | def _init_map(self):
288 | # needs to be overwritten
289 | self.neighbor_map = None
290 | self.phase_map = None
291 | self.state_names = None
292 | raise NotImplementedError()
293 |
294 | def _init_nodes(self):
295 | nodes = {}
296 | tl_nodes = self.sim.trafficlight.getIDList()
297 | for node_name in self.node_names:
298 | if node_name not in tl_nodes:
299 | logging.error('node %s can not be found!' % node_name)
300 | exit(1)
301 | neighbor = self.neighbor_map[node_name]
302 | nodes[node_name] = Node(node_name,
303 | neighbor=neighbor,
304 | control=True)
305 | # controlled lanes: l:j,i_k
306 | lanes_in = self.sim.trafficlight.getControlledLanes(node_name)
307 | nodes[node_name].lanes_in = lanes_in
308 | ilds_in = []
309 | for lane_name in lanes_in:
310 | ild_name = lane_name
311 | if ild_name not in ilds_in:
312 | ilds_in.append(ild_name)
313 | nodes[node_name].ilds_in = ilds_in
314 | self.nodes = nodes
315 | s = 'Env: init %d node information:\n' % len(self.node_names)
316 | for node in self.nodes.values():
317 | s += node.name + ':\n'
318 | s += '\tneighbor: %r\n' % node.neighbor
319 | s += '\tilds_in: %r\n' % node.ilds_in
320 | logging.info(s)
321 | self._init_action_space()
322 | self._init_state_space()
323 |
324 | def _init_policy(self):
325 | return [np.ones(self.n_a) / self.n_a for _ in range(self.n_agent)]
326 |
327 | def _init_sim(self, seed, gui=False):
328 | sumocfg_file = self._init_sim_config(seed)
329 | if gui:
330 | app = 'sumo-gui'
331 | else:
332 | app = 'sumo'
333 | command = [checkBinary(app), '-c', sumocfg_file]
334 | command += ['--seed', str(seed)]
335 | command += ['--remote-port', str(self.port)]
336 | command += ['--no-step-log', 'True']
337 | command += ['--time-to-teleport', '600'] # long teleport for safety
338 | command += ['--no-warnings', 'True']
339 | command += ['--duration-log.disable', 'True']
340 | # collect trip info if necessary
341 | if self.is_record:
342 | command += ['--tripinfo-output',
343 | self.output_path + ('%s_%s_trip.xml' % (self.name, self.agent))]
344 | subprocess.Popen(command)
345 | # wait 1s to establish the traci server
346 | time.sleep(1)
347 | self.sim = traci.connect(port=self.port)
348 |
349 | def _init_sim_config(self, seed):
350 | # needs to be overwritten
351 | raise NotImplementedError()
352 |
353 | def _init_state_space(self):
354 | self._reset_state()
355 | n_s_ls = []
356 | for node_name in self.node_names:
357 | node = self.nodes[node_name]
358 | # fingerprint is previous policy
359 | node.num_fingerprint = self.n_a
360 | node.num_state = len(node.ilds_in)
361 | num_wave = node.num_state
362 | num_wait = 0 if 'wait' not in self.state_names else node.num_state
363 | if self.agent.startswith('ma2c'):
364 | num_n = 1
365 | else:
366 | num_n = 1 + len(node.neighbor)
367 | n_s_ls.append(num_wait + num_wave * num_n)
368 | if self.agent.startswith('ma2c'):
369 | assert len(set(n_s_ls)) == 1
370 | self.n_s = n_s_ls[0]
371 | else:
372 | self.n_s_ls = n_s_ls
373 |
374 | def _measure_reward_step(self):
375 | rewards = []
376 | for node_name in self.node_names:
377 | queues = []
378 | waits = []
379 | for ild in self.nodes[node_name].ilds_in:
380 | if self.obj in ['queue', 'hybrid']:
381 | cur_queue = self.sim.lanearea.getLastStepHaltingNumber(ild)
382 | queues.append(cur_queue)
383 | if self.obj in ['wait', 'hybrid']:
384 | max_pos = 0
385 | car_wait = 0
386 | cur_cars = self.sim.lanearea.getLastStepVehicleIDs(ild)
387 | for vid in cur_cars:
388 | car_pos = self.sim.vehicle.getLanePosition(vid)
389 | if car_pos > max_pos:
390 | max_pos = car_pos
391 | car_wait = self.sim.vehicle.getWaitingTime(vid)
392 | waits.append(car_wait)
393 | queue = np.sum(np.array(queues)) if len(queues) else 0
394 | wait = np.sum(np.array(waits)) if len(waits) else 0
395 | if self.obj == 'queue':
396 | reward = - queue
397 | elif self.obj == 'wait':
398 | reward = - wait
399 | else:
400 | reward = - queue - self.coef_wait * wait
401 | rewards.append(reward)
402 | return np.array(rewards)
403 |
404 | def _measure_state_step(self):
405 | for node_name in self.node_names:
406 | node = self.nodes[node_name]
407 | for state_name in self.state_names:
408 | if state_name == 'wave':
409 | cur_state = []
410 | for ild in node.ilds_in:
411 | cur_wave = self.sim.lanearea.getLastStepVehicleNumber(ild)
412 | cur_state.append(cur_wave)
413 | cur_state = np.array(cur_state)
414 | elif state_name == 'wait':
415 | cur_state = []
416 | for ild in node.ilds_in:
417 | max_pos = 0
418 | car_wait = 0
419 | cur_cars = self.sim.lanearea.getLastStepVehicleIDs(ild)
420 | for vid in cur_cars:
421 | car_pos = self.sim.vehicle.getLanePosition(vid)
422 | if car_pos > max_pos:
423 | max_pos = car_pos
424 | car_wait = self.sim.vehicle.getWaitingTime(vid)
425 | cur_state.append(car_wait)
426 | cur_state = np.array(cur_state)
427 | if self.record_stats:
428 | self.state_stat[state_name] += list(cur_state)
429 | # normalization
430 | norm_cur_state = self._norm_clip_state(cur_state,
431 | self.norms[state_name],
432 | self.clips[state_name])
433 | if state_name == 'wave':
434 | node.wave_state = norm_cur_state
435 | else:
436 | node.wait_state = norm_cur_state
437 |
438 | def _measure_traffic_step(self):
439 | cars = self.sim.vehicle.getIDList()
440 | num_tot_car = len(cars)
441 | num_in_car = self.sim.simulation.getDepartedNumber()
442 | num_out_car = self.sim.simulation.getArrivedNumber()
443 | if num_tot_car > 0:
444 | avg_waiting_time = np.mean([self.sim.vehicle.getWaitingTime(car) for car in cars])
445 | avg_speed = np.mean([self.sim.vehicle.getSpeed(car) for car in cars])
446 | else:
447 | avg_speed = 0
448 | avg_waiting_time = 0
449 | # trip-related measurements are not available through traci;
450 | # they are read from the trip output file after the episode
451 | queues = []
452 | for node_name in self.node_names:
453 | for ild in self.nodes[node_name].ilds_in:
454 | lane_name = ild
455 | queues.append(self.sim.lane.getLastStepHaltingNumber(lane_name))
456 | avg_queue = np.mean(np.array(queues))
457 | std_queue = np.std(np.array(queues))
458 | cur_traffic = {'episode': self.cur_episode,
459 | 'time_sec': self.cur_sec,
460 | 'number_total_car': num_tot_car,
461 | 'number_departed_car': num_in_car,
462 | 'number_arrived_car': num_out_car,
463 | 'avg_wait_sec': avg_waiting_time,
464 | 'avg_speed_mps': avg_speed,
465 | 'std_queue': std_queue,
466 | 'avg_queue': avg_queue}
467 | self.traffic_data.append(cur_traffic)
468 |
469 | @staticmethod
470 | def _norm_clip_state(x, norm, clip=-1):
471 | x = x / norm
472 | return x if clip < 0 else np.clip(x, 0, clip)
473 |
474 | def _reset_state(self):
475 | for node_name in self.node_names:
476 | node = self.nodes[node_name]
477 | # prev action for yellow phase before each switch
478 | node.prev_action = 0
479 |
480 | def _set_phase(self, action, phase_type, phase_duration):
481 | for node_name, a in zip(self.node_names, list(action)):
482 | phase = self._get_node_phase(a, node_name, phase_type)
483 | self.sim.trafficlight.setRedYellowGreenState(node_name, phase)
484 | self.sim.trafficlight.setPhaseDuration(node_name, phase_duration)
485 |
486 | def _simulate(self, num_step):
487 | # reward = np.zeros(len(self.control_node_names))
488 | for _ in range(num_step):
489 | self.sim.simulationStep()
490 | self.cur_sec += 1
491 | if self.is_record:
492 | self._measure_traffic_step()
493 |
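A minimal numeric sketch of `_norm_clip_state()` above: state features are divided by the configured norm and, when a non-negative clip is given, clipped to `[0, clip]` (e.g. `norm_wave = 5.0`, `clip_wave = 2.0` in the IA2C/MA2C configs):

```python
import numpy as np

wave = np.array([3.0, 12.0, 25.0])
print(TrafficSimulator._norm_clip_state(wave, norm=5.0, clip=2.0))  # [0.6 2.  2. ]
print(TrafficSimulator._norm_clip_state(wave, norm=5.0, clip=-1))   # [0.6 2.4 5. ]
```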
--------------------------------------------------------------------------------
/envs/large_grid_env.py:
--------------------------------------------------------------------------------
1 | """
2 | Particular class of large traffic grid
3 | @author: Tianshu Chu
4 | """
5 |
6 | import configparser
7 | import logging
8 | import numpy as np
9 | import matplotlib.pyplot as plt
10 | import os
11 | import seaborn as sns
12 | import time
13 | from envs.env import PhaseMap, PhaseSet, TrafficSimulator
14 | from envs.data.build_file import gen_rou_file
15 |
16 | sns.set_color_codes()
17 |
18 |
19 | STATE_NAMES = ['wave']
20 | PHASE_NUM = 5
21 |
22 |
23 | class LargeGridPhase(PhaseMap):
24 | def __init__(self):
25 | phases = ['GGgrrrGGgrrr', 'rrrGrGrrrGrG', 'rrrGGrrrrGGr',
26 | 'rrrGGGrrrrrr', 'rrrrrrrrrGGG']
27 | self.phases = {PHASE_NUM: PhaseSet(phases)}
28 |
29 |
30 | class LargeGridController:
31 | def __init__(self, node_names):
32 | self.name = 'greedy'
33 | self.node_names = node_names
34 |
35 | def forward(self, obs):
36 | actions = []
37 | for ob, node_name in zip(obs, self.node_names):
38 | actions.append(self.greedy(ob, node_name))
39 | return actions
40 |
41 | def greedy(self, ob, node_name):
42 | # hard code the mapping from state to number of cars
43 | flows = [ob[0] + ob[3], ob[2] + ob[5], ob[1] + ob[4],
44 | ob[1] + ob[2], ob[4] + ob[5]]
45 | return np.argmax(np.array(flows))
46 |
47 |
48 | class LargeGridEnv(TrafficSimulator):
49 | def __init__(self, config, port=0, output_path='', is_record=False, record_stat=False):
50 | self.peak_flow1 = config.getint('peak_flow1')
51 | self.peak_flow2 = config.getint('peak_flow2')
52 | self.init_density = config.getfloat('init_density')
53 | super().__init__(config, output_path, is_record, record_stat, port=port)
54 |
55 | def _get_node_phase_id(self, node_name):
56 | return PHASE_NUM
57 |
58 | def _init_neighbor_map(self):
59 | neighbor_map = {}
60 | # corner nodes
61 | neighbor_map['nt1'] = ['nt6', 'nt2']
62 | neighbor_map['nt5'] = ['nt10', 'nt4']
63 | neighbor_map['nt21'] = ['nt22', 'nt16']
64 | neighbor_map['nt25'] = ['nt20', 'nt24']
65 | # edge nodes
66 | neighbor_map['nt2'] = ['nt7', 'nt3', 'nt1']
67 | neighbor_map['nt3'] = ['nt8', 'nt4', 'nt2']
68 | neighbor_map['nt4'] = ['nt9', 'nt5', 'nt3']
69 | neighbor_map['nt22'] = ['nt23', 'nt17', 'nt21']
70 | neighbor_map['nt23'] = ['nt24', 'nt18', 'nt22']
71 | neighbor_map['nt24'] = ['nt25', 'nt19', 'nt23']
72 | neighbor_map['nt10'] = ['nt15', 'nt5', 'nt9']
73 | neighbor_map['nt15'] = ['nt20', 'nt10', 'nt14']
74 | neighbor_map['nt20'] = ['nt25', 'nt15', 'nt19']
75 | neighbor_map['nt6'] = ['nt11', 'nt7', 'nt1']
76 | neighbor_map['nt11'] = ['nt16', 'nt12', 'nt6']
77 | neighbor_map['nt16'] = ['nt21', 'nt17', 'nt11']
78 | # internal nodes
79 | for i in [7, 8, 9, 12, 13, 14, 17, 18, 19]:
80 | n_node = 'nt' + str(i + 5)
81 | s_node = 'nt' + str(i - 5)
82 | w_node = 'nt' + str(i - 1)
83 | e_node = 'nt' + str(i + 1)
84 | cur_node = 'nt' + str(i)
85 | neighbor_map[cur_node] = [n_node, e_node, s_node, w_node]
86 | self.neighbor_map = neighbor_map
87 | self.neighbor_mask = np.zeros((self.n_node, self.n_node))
88 | for i in range(self.n_node):
89 | for nnode in neighbor_map['nt%d' % (i+1)]:
90 | ni = self.node_names.index(nnode)
91 | self.neighbor_mask[i, ni] = 1
92 | logging.info('neighbor mask:\n %r' % self.neighbor_mask)
93 |
94 | def _init_distance_map(self):
95 | block0 = np.array([[0,1,2,3,4],[1,0,1,2,3],[2,1,0,1,2],[3,2,1,0,1],[4,3,2,1,0]])
96 | block1 = block0 + 1
97 | block2 = block0 + 2
98 | block3 = block0 + 3
99 | block4 = block0 + 4
100 | row0 = np.hstack([block0, block1, block2, block3, block4])
101 | row1 = np.hstack([block1, block0, block1, block2, block3])
102 | row2 = np.hstack([block2, block1, block0, block1, block2])
103 | row3 = np.hstack([block3, block2, block1, block0, block1])
104 | row4 = np.hstack([block4, block3, block2, block1, block0])
105 | self.distance_mask = np.vstack([row0, row1, row2, row3, row4])
106 |
107 | def _init_map(self):
108 | self.node_names = ['nt%d' % i for i in range(1, 26)]
109 | self.n_node = 25
110 | self._init_neighbor_map()
111 | # for spatial discount
112 | self._init_distance_map()
113 | self.max_distance = 8
114 | self.phase_map = LargeGridPhase()
115 | self.state_names = STATE_NAMES
116 |
117 | def _init_sim_config(self, seed):
118 | return gen_rou_file(self.data_path,
119 | self.peak_flow1,
120 | self.peak_flow2,
121 | self.init_density,
122 | seed=seed,
123 | thread=self.sim_thread)
124 |
125 | def plot_stat(self, rewards):
126 | self.state_stat['reward'] = rewards
127 | for name, data in self.state_stat.items():
128 | fig = plt.figure(figsize=(8, 6))
129 | plot_cdf(data)
130 | plt.ylabel(name)
131 | fig.savefig(self.output_path + self.name + '_' + name + '.png')
132 |
133 |
134 | def plot_cdf(X, c='b', label=None):
135 | sorted_data = np.sort(X)
136 | yvals = np.arange(len(sorted_data))/float(len(sorted_data)-1)
137 | plt.plot(sorted_data, yvals, color=c, label=label)
138 |
139 | if __name__ == '__main__':
140 | logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s',
141 | level=logging.INFO)
142 | config = configparser.ConfigParser()
143 | config.read('./config/config_greedy.ini')
144 | base_dir = './greedy/'
145 | if not os.path.exists(base_dir):
146 | os.mkdir(base_dir)
147 | env = LargeGridEnv(config['ENV_CONFIG'], 2, base_dir, is_record=True, record_stat=True)
148 | env.train_mode = False
149 | time.sleep(1)
150 | controller = LargeGridController(env.node_names)
151 | rewards = []
152 | for i in range(env.test_num):
153 | ob = env.reset(test_ind=i)
154 | while True:
155 | next_ob, _, done, reward = env.step(controller.forward(ob))
156 | rewards.append(reward)
157 | if done:
158 | break
159 | ob = next_ob
160 | env.terminate()
161 | time.sleep(2)
162 | env.collect_tripinfo()
163 | env.plot_stat(np.array(rewards))
164 | logging.info('avg reward: %.2f' % np.mean(rewards))
165 | env.output_data()
166 |
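A minimal check (not part of the repo) that `_init_distance_map()` above produces Manhattan distances on the 5 x 5 grid, where node `nt{k}` sits at column `(k-1) % 5` and row `(k-1) // 5`; the largest such distance is 8, matching `self.max_distance`:

```python
from envs.large_grid_env import LargeGridEnv

env_stub = LargeGridEnv.__new__(LargeGridEnv)   # bypass SUMO setup, only build the map
env_stub._init_distance_map()
i, j = 0, 13                                    # nt1 and nt14
manhattan = abs(i % 5 - j % 5) + abs(i // 5 - j // 5)
assert env_stub.distance_mask[i, j] == manhattan == 5
assert env_stub.distance_mask.max() == 8
```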
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | """
2 | Main function for training and evaluating MARL algorithms in traffic envs
3 | @author: Tianshu Chu
4 | """
5 |
6 | import argparse
7 | import configparser
8 | import logging
9 | import tensorflow as tf
10 | import threading
11 | from envs.large_grid_env import LargeGridEnv, LargeGridController
12 | from agents.models import IA2C, IA2C_FP, IA2C_CU, MA2C_NC, MA2C_IC3, MA2C_DIAL
13 | from utils import (Counter, Trainer, Tester, Evaluator,
14 | check_dir, copy_file, find_file,
15 | init_dir, init_log, init_test_flag,
16 | plot_evaluation, plot_train)
17 |
18 |
19 | def parse_args():
20 | default_base_dir = '/Users/tchu/Documents/rl_test/deeprl_dist/ma2c_ic3_test'
21 | default_config_dir = './config/config_ma2c_ic3.ini'
22 | parser = argparse.ArgumentParser()
23 | parser.add_argument('--base-dir', type=str, required=False,
24 | default=default_base_dir, help="experiment base dir")
25 | subparsers = parser.add_subparsers(dest='option', help="train or evaluate")
26 | sp = subparsers.add_parser('train', help='train a single agent under base dir')
27 | sp.add_argument('--test-mode', type=str, required=False,
28 | default='after_train_test',
29 | help="test mode during training",
30 | choices=['no_test', 'in_train_test', 'after_train_test', 'all_test'])
31 | sp.add_argument('--config-dir', type=str, required=False,
32 | default=default_config_dir, help="experiment config path")
33 | sp = subparsers.add_parser('evaluate', help="evaluate and compare agents under base dir")
34 | sp.add_argument('--evaluate-seeds', type=str, required=False,
35 | default=','.join([str(i) for i in range(2000, 2500, 10)]),
36 | help="comma-separated random seeds for evaluation")
37 | args = parser.parse_args()
38 | if not args.option:
39 | parser.print_help()
40 | exit(1)
41 | return args
42 |
43 |
44 | def init_env(config, port=0, naive_policy=False):
45 | if not naive_policy:
46 | return LargeGridEnv(config, port=port)
47 | else:
48 | env = LargeGridEnv(config, port=port)
49 | policy = LargeGridController(env.node_names)
50 | return env, policy
51 |
52 |
53 | def init_agent(env, config, total_step, seed):
54 | if env.agent == 'ia2c':
55 | return IA2C(env.n_s_ls, env.n_a, env.neighbor_mask, env.distance_mask, env.coop_gamma,
56 | total_step, config, seed=seed)
57 | elif env.agent == 'ia2c_fp':
58 | return IA2C_FP(env.n_s_ls, env.n_a, env.neighbor_mask, env.distance_mask, env.coop_gamma,
59 | total_step, config, seed=seed)
60 | elif env.agent == 'ma2c_nc':
61 | return MA2C_NC(env.n_s, env.n_a, env.neighbor_mask, env.distance_mask, env.coop_gamma,
62 | total_step, config, seed=seed)
63 | elif env.agent == 'ma2c_ic3':
64 | return MA2C_IC3(env.n_s, env.n_a, env.neighbor_mask, env.distance_mask, env.coop_gamma,
65 | total_step, config, seed=seed)
66 | elif env.agent == 'ma2c_cu':
67 | return IA2C_CU(env.n_s, env.n_a, env.neighbor_mask, env.distance_mask, env.coop_gamma,
68 | total_step, config, seed=seed)
69 | elif env.agent == 'ma2c_dial':
70 | return MA2C_DIAL(env.n_s, env.n_a, env.neighbor_mask, env.distance_mask, env.coop_gamma,
71 | total_step, config, seed=seed)
72 | else:
73 | return None
74 |
75 |
76 | def train(args):
77 | base_dir = args.base_dir
78 | dirs = init_dir(base_dir)
79 | init_log(dirs['log'])
80 | config_dir = args.config_dir
81 | copy_file(config_dir, dirs['data'])
82 | config = configparser.ConfigParser()
83 | config.read(config_dir)
84 | in_test, post_test = init_test_flag(args.test_mode)
85 |
86 | # init env
87 | env = init_env(config['ENV_CONFIG'])
88 | logging.info('Training: a dim %d, agent dim: %d' % (env.n_a, env.n_agent))
89 |
90 | # init step counter
91 | total_step = int(config.getfloat('TRAIN_CONFIG', 'total_step'))
92 | test_step = int(config.getfloat('TRAIN_CONFIG', 'test_interval'))
93 | log_step = int(config.getfloat('TRAIN_CONFIG', 'log_interval'))
94 | global_counter = Counter(total_step, test_step, log_step)
95 |
96 | # init centralized or multi agent
97 | seed = config.getint('ENV_CONFIG', 'seed')
98 | model = init_agent(env, config['MODEL_CONFIG'], total_step, seed)
99 |
100 | # disable multi-threading for safe SUMO implementation
101 | summary_writer = tf.summary.FileWriter(dirs['log'])
102 | trainer = Trainer(env, model, global_counter, summary_writer, in_test, output_path=dirs['data'])
103 | trainer.run()
104 |
105 | # save model
106 | final_step = global_counter.cur_step
107 | logging.info('Training: save final model at step %d ...' % final_step)
108 | model.save(dirs['model'], final_step)
109 |
110 | # post-training test
111 | if post_test:
112 | test_dirs = init_dir(base_dir, pathes=['eva_data'])
113 | evaluator = Evaluator(env, model, test_dirs['eva_data'])
114 | evaluator.run()
115 |
116 |
117 | def evaluate_fn(agent_dir, output_dir, seeds, port):
118 | agent = agent_dir.split('/')[-1]
119 | if not check_dir(agent_dir):
120 | logging.error('Evaluation: %s does not exist!' % agent)
121 | return
122 | # load config file for env
123 | config_dir = find_file(agent_dir + '/data/')
124 | if not config_dir:
125 | return
126 | config = configparser.ConfigParser()
127 | config.read(config_dir)
128 |
129 | # init env
130 | env, greedy_policy = init_env(config['ENV_CONFIG'], port=port, naive_policy=True)
131 | env.init_test_seeds(seeds)
132 |
133 | # load model for agent
134 | if agent != 'greedy':
135 | # init centralized or multi agent
136 | model = init_agent(env, config['MODEL_CONFIG'], 0, 0)
137 | if model is None:
138 | return
139 | if not model.load(agent_dir + '/model/'):
140 | return
141 | else:
142 | model = greedy_policy
143 | # collect evaluation data
144 | evaluator = Evaluator(env, model, output_dir)
145 | evaluator.run()
146 |
147 |
148 | def evaluate(args):
149 | base_dir = args.base_dir
150 | dirs = init_dir(base_dir, pathes=['eva_data', 'eva_log'])
151 | init_log(dirs['eva_log'])
152 | # enforce the same evaluation seeds across agents
153 | seeds = args.evaluate_seeds
154 | logging.info('Evaluation: random seeds: %s' % seeds)
155 | if not seeds:
156 | seeds = []
157 | else:
158 | seeds = [int(s) for s in seeds.split(',')]
159 | evaluate_fn(base_dir, dirs['eva_data'], seeds, 1)
160 |
161 |
162 | if __name__ == '__main__':
163 | args = parse_args()
164 | if args.option == 'train':
165 | train(args)
166 | else:
167 | evaluate(args)
168 |
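169 | # Usage sketch (illustrative; the base dir below is a hypothetical placeholder and the
170 | # config path is one of the files shipped under ./config):
171 | #   python3 main.py --base-dir /tmp/ma2c_nc_test train \
172 | #       --config-dir ./config/config_ma2c_nc.ini --test-mode after_train_test
173 | #   python3 main.py --base-dir /tmp/ma2c_nc_test evaluate --evaluate-seeds 2000,2010,2020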
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | import logging
3 | import numpy as np
4 | import tensorflow as tf
5 | import time
6 | import os
7 | import pandas as pd
8 | import subprocess
9 |
10 |
11 | def check_dir(cur_dir):
12 | if not os.path.exists(cur_dir):
13 | return False
14 | return True
15 |
16 |
17 | def copy_file(src_dir, tar_dir):
18 | cmd = 'cp %s %s' % (src_dir, tar_dir)
19 | subprocess.check_call(cmd, shell=True)
20 |
21 |
22 | def find_file(cur_dir, suffix='.ini'):
23 | for file in os.listdir(cur_dir):
24 | if file.endswith(suffix):
25 | return cur_dir + '/' + file
26 | logging.error('Cannot find %s file' % suffix)
27 | return None
28 |
29 |
30 | def init_dir(base_dir, pathes=['log', 'data', 'model']):
31 | if not os.path.exists(base_dir):
32 | os.mkdir(base_dir)
33 | dirs = {}
34 | for path in pathes:
35 | cur_dir = base_dir + '/%s/' % path
36 | if not os.path.exists(cur_dir):
37 | os.mkdir(cur_dir)
38 | dirs[path] = cur_dir
39 | return dirs
40 |
41 |
42 | def init_log(log_dir):
43 | logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s',
44 | level=logging.INFO,
45 | handlers=[
46 | logging.FileHandler('%s/%d.log' % (log_dir, time.time())),
47 | logging.StreamHandler()
48 | ])
49 |
50 |
51 | def init_test_flag(test_mode):
52 | if test_mode == 'no_test':
53 | return False, False
54 | if test_mode == 'in_train_test':
55 | return True, False
56 | if test_mode == 'after_train_test':
57 | return False, True
58 | if test_mode == 'all_test':
59 | return True, True
60 | return False, False
61 |
62 |
63 | def plot_train(data_dirs, labels):
64 | pass
65 |
66 | def plot_evaluation(data_dirs, labels):
67 | pass
68 |
69 |
70 | class Counter:
71 | def __init__(self, total_step, test_step, log_step):
72 | self.counter = itertools.count(1)
73 | self.cur_step = 0
74 | self.cur_test_step = 0
75 | self.total_step = total_step
76 | self.test_step = test_step
77 | self.log_step = log_step
78 | self.stop = False
79 |
80 | def next(self):
81 | self.cur_step = next(self.counter)
82 | return self.cur_step
83 |
84 | def should_test(self):
85 | test = False
86 | if (self.cur_step - self.cur_test_step) >= self.test_step:
87 | test = True
88 | self.cur_test_step = self.cur_step
89 | return test
90 |
91 | def should_log(self):
92 | return (self.cur_step % self.log_step == 0)
93 |
94 | def should_stop(self):
95 | if self.cur_step >= self.total_step:
96 | return True
97 | return self.stop
98 |
99 |
100 | class Trainer():
101 | def __init__(self, env, model, global_counter, summary_writer, run_test, output_path=None):
102 | self.cur_step = 0
103 | self.global_counter = global_counter
104 | self.env = env
105 | self.agent = self.env.agent
106 | self.model = model
107 | self.sess = self.model.sess
108 | self.n_step = self.model.n_step
109 | self.summary_writer = summary_writer
110 | self.run_test = run_test
111 | assert self.env.T % self.n_step == 0
112 | self.data = []
113 | self.output_path = output_path
114 | if run_test:
115 | self.test_num = self.env.test_num
116 | logging.info('Testing: total test num: %d' % self.test_num)
117 | self._init_summary()
118 |
119 | def _init_summary(self):
120 | self.train_reward = tf.placeholder(tf.float32, [])
121 | self.train_summary = tf.summary.scalar('train_reward', self.train_reward)
122 | self.test_reward = tf.placeholder(tf.float32, [])
123 | self.test_summary = tf.summary.scalar('test_reward', self.test_reward)
124 |
125 | def _add_summary(self, reward, global_step, is_train=True):
126 | if is_train:
127 | summ = self.sess.run(self.train_summary, {self.train_reward: reward})
128 | else:
129 | summ = self.sess.run(self.test_summary, {self.test_reward: reward})
130 | self.summary_writer.add_summary(summ, global_step=global_step)
131 |
132 | def _get_policy(self, ob, done, mode='train'):
133 | if self.agent.startswith('ma2c'):
134 | self.ps = self.env.get_fingerprint()
135 | policy = self.model.forward(np.array(ob), done, self.ps)
136 | else:
137 | policy = self.model.forward(ob, done)
138 | action = []
139 | for pi in policy:
140 | if mode == 'train':
141 | action.append(np.random.choice(np.arange(len(pi)), p=pi))
142 | else:
143 | action.append(np.argmax(pi))
144 | return policy, np.array(action)
145 |
146 | def _get_value(self, ob, done, action):
147 | if self.agent.startswith('ma2c'):
148 | value = self.model.forward(np.array(ob), done, self.ps, np.array(action), 'v')
149 | else:
150 | self.naction = self.env.get_neighbor_action(action)
151 | value = self.model.forward(ob, done, self.naction, 'v')
152 | return value
153 |
154 | def explore(self, prev_ob, prev_done):
155 | ob = prev_ob
156 | done = prev_done
157 | rewards = []
158 | for _ in range(self.n_step):
159 | # pre-decision
160 | policy, action = self._get_policy(ob, done)
161 | # post-decision
162 | value = self._get_value(ob, done, action)
163 | # transition
164 | self.env.update_fingerprint(policy)
165 | next_ob, reward, done, global_reward = self.env.step(action)
166 | rewards.append(global_reward)
167 | global_step = self.global_counter.next()
168 | self.cur_step += 1
169 | # collect experience
170 | if self.agent.startswith('ma2c'):
171 | self.model.add_transition(ob, self.ps, action, reward, value, done)
172 | else:
173 | self.model.add_transition(ob, self.naction, action, reward, value, done)
174 | # logging
175 | if self.global_counter.should_log():
176 |                 logging.info('Training: global step %d, episode step %d, '
177 |                              'ob: %s, a: %s, pi: %s, r: %.2f, train r: %.2f, done: %r' %
178 |                              (global_step, self.cur_step,
179 |                               str(ob), str(action), str(policy), global_reward, np.mean(reward), done))
180 | if done:
181 | break
182 | ob = next_ob
183 | if done:
184 | R = np.zeros(self.model.n_agent)
185 | else:
186 | _, action = self._get_policy(ob, done)
187 | R = self._get_value(ob, done, action)
188 | return ob, done, R, rewards
189 |
190 | def perform(self, test_ind):
191 | ob = self.env.reset(test_ind=test_ind)
192 | rewards = []
193 | while True:
194 | if self.agent == 'greedy':
195 | action = self.model.forward(ob)
196 | else:
197 | # in on-policy learning, test policy has to be stochastic
198 | # policy, action = self._get_policy(ob, False, mode='test')
199 | policy, action = self._get_policy(ob, False)
200 | self.env.update_fingerprint(policy)
201 | next_ob, reward, done, global_reward = self.env.step(action)
202 | rewards.append(global_reward)
203 | if done:
204 | break
205 | ob = next_ob
206 | mean_reward = np.mean(np.array(rewards))
207 | std_reward = np.std(np.array(rewards))
208 | return mean_reward, std_reward
209 |
210 |     def run_thread(self, coord):
211 |         '''Kept for reference only: multi-threading is disabled for SUMO safety'''
212 |         ob = self.env.reset()
213 |         done = False
214 |         # explore() returns (ob, done, R, rewards); match its current signature
215 |         while not coord.should_stop():
216 |             ob, done, R, _ = self.explore(ob, done)
217 | global_step = self.global_counter.cur_step
218 | if self.agent.endswith('a2c'):
219 | self.model.backward(R, self.summary_writer, global_step)
220 | else:
221 | self.model.backward(self.summary_writer, global_step)
222 | self.summary_writer.flush()
223 | if (self.global_counter.should_stop()) and (not coord.should_stop()):
224 | self.env.terminate()
225 | coord.request_stop()
226 | logging.info('Training: stop condition reached!')
227 | return
228 |
229 | def run(self):
230 | while not self.global_counter.should_stop():
231 | # test
232 | if self.run_test and self.global_counter.should_test():
233 | rewards = []
234 | global_step = self.global_counter.cur_step
235 | self.env.train_mode = False
236 | for test_ind in range(self.test_num):
237 | mean_reward, std_reward = self.perform(test_ind)
238 | self.env.terminate()
239 | rewards.append(mean_reward)
240 | log = {'agent': self.agent,
241 | 'step': global_step,
242 | 'test_id': test_ind,
243 | 'avg_reward': mean_reward,
244 | 'std_reward': std_reward}
245 | self.data.append(log)
246 | avg_reward = np.mean(np.array(rewards))
247 | self._add_summary(avg_reward, global_step, is_train=False)
248 | logging.info('Testing: global step %d, avg R: %.2f' %
249 | (global_step, avg_reward))
250 | # train
251 | self.env.train_mode = True
252 | ob = self.env.reset()
253 | done = False
254 | self.cur_step = 0
255 | rewards = []
256 | while True:
257 | ob, done, R, cur_rewards = self.explore(ob, done)
258 | dt = self.env.T - self.cur_step
259 | rewards += cur_rewards
260 | global_step = self.global_counter.cur_step
261 | self.model.backward(R, dt, self.summary_writer, global_step)
262 | # termination
263 | if done:
264 | self.env.terminate()
265 | break
266 | rewards = np.array(rewards)
267 | mean_reward = np.mean(rewards)
268 | std_reward = np.std(rewards)
269 | log = {'agent': self.agent,
270 | 'step': global_step,
271 | 'test_id': -1,
272 | 'avg_reward': mean_reward,
273 | 'std_reward': std_reward}
274 | self.data.append(log)
275 | self._add_summary(mean_reward, global_step)
276 | self.summary_writer.flush()
277 | df = pd.DataFrame(self.data)
278 | df.to_csv(self.output_path + 'train_reward.csv')
279 |
280 |
281 | class Tester(Trainer):
282 | def __init__(self, env, model, global_counter, summary_writer, output_path):
283 |         super().__init__(env, model, global_counter, summary_writer, False)  # Tester manages its own test runs
284 | self.env.train_mode = False
285 | self.test_num = self.env.test_num
286 | self.output_path = output_path
287 | self.data = []
288 | logging.info('Testing: total test num: %d' % self.test_num)
289 |
290 |     def _init_summary(self):
291 |         self.test_reward = tf.placeholder(tf.float32, [])
292 |         self.test_summary = tf.summary.scalar('test_reward', self.test_reward)
293 |
294 | def run_offline(self):
295 |         # enable traffic measurements for offline test
296 | is_record = True
297 | record_stats = False
298 | self.env.cur_episode = 0
299 | self.env.init_data(is_record, record_stats, self.output_path)
300 | rewards = []
301 | for test_ind in range(self.test_num):
302 |             rewards.append(self.perform(test_ind)[0])  # perform() returns (mean, std)
303 | self.env.terminate()
304 | time.sleep(2)
305 | self.env.collect_tripinfo()
306 | avg_reward = np.mean(np.array(rewards))
307 | logging.info('Offline testing: avg R: %.2f' % avg_reward)
308 | self.env.output_data()
309 |
310 | def run_online(self, coord):
311 | self.env.cur_episode = 0
312 | while not coord.should_stop():
313 | time.sleep(30)
314 | if self.global_counter.should_test():
315 | rewards = []
316 | global_step = self.global_counter.cur_step
317 | for test_ind in range(self.test_num):
318 |                     cur_reward, _ = self.perform(test_ind)
319 | self.env.terminate()
320 | rewards.append(cur_reward)
321 | log = {'agent': self.agent,
322 | 'step': global_step,
323 | 'test_id': test_ind,
324 | 'reward': cur_reward}
325 | self.data.append(log)
326 | avg_reward = np.mean(np.array(rewards))
327 |                 self._add_summary(avg_reward, global_step, is_train=False)
328 | logging.info('Testing: global step %d, avg R: %.2f' %
329 | (global_step, avg_reward))
330 | # self.global_counter.update_test(avg_reward)
331 | df = pd.DataFrame(self.data)
332 | df.to_csv(self.output_path + 'train_reward.csv')
333 |
334 |
335 | class Evaluator(Tester):
336 | def __init__(self, env, model, output_path):
337 | self.env = env
338 | self.model = model
339 | self.agent = self.env.agent
340 | self.env.train_mode = False
341 | self.test_num = self.env.test_num
342 | self.output_path = output_path
343 |
344 | def run(self):
345 | is_record = True
346 | record_stats = False
347 | self.env.cur_episode = 0
348 | self.env.init_data(is_record, record_stats, self.output_path)
349 | time.sleep(1)
350 | for test_ind in range(self.test_num):
351 | reward, _ = self.perform(test_ind)
352 | self.env.terminate()
353 | logging.info('test %i, avg reward %.2f' % (test_ind, reward))
354 | time.sleep(2)
355 | self.env.collect_tripinfo()
356 | self.env.output_data()
357 |
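358 | # Minimal usage sketch of the Counter cadence above (illustrative only; the step
359 | # budgets are made-up numbers, not values taken from any config file):
360 | #
361 | #   counter = Counter(total_step=1000, test_step=100, log_step=10)
362 | #   while not counter.should_stop():
363 | #       step = counter.next()
364 | #       if counter.should_log():
365 | #           print('log at step', step)
366 | #       if counter.should_test():
367 | #           print('test at step', step)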
--------------------------------------------------------------------------------