├── .gitignore ├── README.md ├── bicnet.py ├── comm_net.py ├── docs └── 2_agents_commnet.png ├── guessing_sum_env.py ├── hypersearch.py ├── replay_buffer.py ├── summaries └── .gitkeep ├── train_bicnet.py └── train_comm_net.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | 107 | .idea 108 | summaries/* 109 | !summaries/.gitkeep 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CommNet-BiCnet 2 | [CommNet](https://arxiv.org/abs/1605.07736) and [BiCnet](https://arxiv.org/abs/1703.10069) implementation in tensorflow 3 | 4 | ## Training 5 | Train CommNet using DDPG algorithm 6 | ``` 7 | python train_comm_net.py 8 | ``` 9 | 10 | ## Hypersearch 11 | To find the optimal hyperparameters such as `actor_lr` or `critic_lr`, a simple grid search has been implemented. It launches multiple instances of the trainer in parallel based on the number of CPU cores. 12 | ``` 13 | python hypersearch.py 14 | ``` 15 | 16 | ## Guessing sum environment 17 | It is a simple game described in the [BiCnet](https://arxiv.org/abs/1703.10069) paper for testing if the communication works. The environment implements the crucial methods of the core gym interface from OpenAI 18 | 19 | Each agent receives a scalar sampled between `[−10, 10]` under a truncated Gaussian. Each agent needs to output the sum of all inputs received among the agents. An agent gets a normalized reward between `[0, 1]` based on the absolute difference between the sum and its output. 
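A minimal interaction with the environment, mirroring the interface in `guessing_sum_env.py` (the perfect guess at the end is only for illustration):

```
import numpy as np
from guessing_sum_env import GuessingSumEnv

env = GuessingSumEnv(num_agents=2)
env.seed(0)

obs = env.reset()                                # shape (num_agents, 1)
target = np.sum(obs)                             # the hidden sum every agent should output
actions = np.full((env.num_agents, 1), target)   # pretend every agent guessed perfectly

_, rewards, done, _ = env.step(actions)          # step() returns no new observations
print(rewards)                                   # normalized rewards in [0, 1]; 1.0 for an exact guess
print(done)                                      # True - the game is a single-step episode
```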
20 | 21 | ## Results 22 | ### Training CommNet in the Guessing sum env with 2 agents 23 | ![2_agents_commnet_training_reward](docs/2_agents_commnet.png) 24 | -------------------------------------------------------------------------------- /bicnet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from guessing_sum_env import * 4 | 5 | # TODO use the parameters of train_ddpg 6 | HIDDEN_VECTOR_LEN = 1 7 | NUM_AGENTS = 2 8 | VECTOR_OBS_LEN = 1 9 | OUTPUT_LEN = 1 10 | 11 | 12 | class BiCNet: 13 | @staticmethod 14 | def base_build_network(observation): 15 | encoded = BiCNet.shared_dense_layer("encoder", observation, HIDDEN_VECTOR_LEN) 16 | 17 | hidden_agents = tf.unstack(encoded, NUM_AGENTS, 1) 18 | 19 | lstm_fw_cell = tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_VECTOR_LEN, forget_bias=1.0, name="lstm_fw_cell") 20 | lstm_bw_cell = tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_VECTOR_LEN, forget_bias=1.0, name="lstm_bw_cell") 21 | outputs, _, _ = tf.nn.static_bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, hidden_agents, dtype=tf.float32) 22 | with tf.variable_scope("bidirectional_rnn", reuse=tf.AUTO_REUSE): 23 | tf.summary.histogram("lstm_fw_cell/kernel", tf.get_variable("fw/lstm_fw_cell/kernel")) 24 | tf.summary.histogram("lstm_bw_cell/kernel", tf.get_variable("bw/lstm_bw_cell/kernel")) 25 | 26 | outputs = tf.stack(outputs, 1) 27 | return outputs 28 | 29 | @staticmethod 30 | def actor_build_network(name, observation): 31 | with tf.variable_scope(name): 32 | outputs = BiCNet.base_build_network(observation) 33 | return BiCNet.shared_dense_layer("output_layer", outputs, OUTPUT_LEN) 34 | 35 | 36 | @staticmethod 37 | def shared_dense_layer(name, observation, output_len): 38 | H = [] 39 | with tf.variable_scope(name, reuse=tf.AUTO_REUSE): 40 | for j in range(NUM_AGENTS): 41 | agent_obs = observation[:, j] 42 | agent_encoded = tf.layers.dense(agent_obs, output_len, name="dense") 43 | tf.summary.histogram(name + "/dense/kernel", tf.get_variable("dense/kernel")) 44 | H.append(agent_encoded) 45 | H = tf.stack(H, 1) 46 | return H 47 | 48 | @staticmethod 49 | def critic_build_network(name, observation, action): 50 | with tf.variable_scope(name, reuse=tf.AUTO_REUSE): 51 | outputs = BiCNet.base_build_network(tf.concat([observation, action], 2)) 52 | outputs = BiCNet.shared_dense_layer("output_layer", outputs, 1) 53 | return outputs 54 | 55 | if __name__ == '__main__': 56 | tf.set_random_seed(42) 57 | 58 | tf.reset_default_graph() 59 | 60 | config = tf.ConfigProto() 61 | config.gpu_options.allow_growth = True 62 | with tf.Session(config=config) as sess: 63 | BATCH_SIZE = 10 64 | 65 | observation = tf.placeholder(tf.float32, shape=(None, NUM_AGENTS, VECTOR_OBS_LEN), name="observation") 66 | actions = tf.placeholder(tf.float32, shape=(None, NUM_AGENTS, OUTPUT_LEN), name="actions") 67 | 68 | actor_out = BiCNet.actor_build_network("actor_network", observation) 69 | critic_out = BiCNet.critic_build_network("critic_network", observation, actions) 70 | 71 | sess.run(tf.global_variables_initializer()) 72 | 73 | feed_dict = {observation: np.random.random_sample((BATCH_SIZE, NUM_AGENTS, VECTOR_OBS_LEN))} 74 | print(sess.run(actor_out, feed_dict=feed_dict).shape, "==", (BATCH_SIZE, NUM_AGENTS, OUTPUT_LEN), "== (BATCH_SIZE, NUM_AGENTS, OUTPUT_LEN)") 75 | 76 | feed_dict = {observation: np.random.random_sample((BATCH_SIZE, NUM_AGENTS, VECTOR_OBS_LEN)), 77 | actions: np.random.random_sample((BATCH_SIZE, NUM_AGENTS, OUTPUT_LEN))} 78 | 
print(sess.run(critic_out, feed_dict=feed_dict).shape, "==", (BATCH_SIZE, NUM_AGENTS, 1), "== (BATCH_SIZE, NUM_AGENTS, 1)") 79 | 80 | feed_dict = {observation: np.random.random_sample((1, NUM_AGENTS, VECTOR_OBS_LEN))} 81 | print(sess.run(actor_out, feed_dict=feed_dict).shape, "==", (1, NUM_AGENTS, OUTPUT_LEN), "== (BATCH_SIZE, NUM_AGENTS, OUTPUT_LEN)") 82 | 83 | feed_dict = {observation: np.random.random_sample((1, NUM_AGENTS, VECTOR_OBS_LEN)), 84 | actions: np.random.random_sample((1, NUM_AGENTS, OUTPUT_LEN))} 85 | print(sess.run(critic_out, feed_dict=feed_dict).shape, "==", (1, NUM_AGENTS, 1), "== (BATCH_SIZE, NUM_AGENTS, 1)") 86 | -------------------------------------------------------------------------------- /comm_net.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from guessing_sum_env import * 4 | 5 | # TODO use the parameters of train_ddpg 6 | HIDDEN_VECTOR_LEN = 1 7 | NUM_AGENTS = 2 8 | VECTOR_OBS_LEN = 1 9 | OUTPUT_LEN = 1 10 | 11 | 12 | class CommNet: 13 | @staticmethod 14 | def base_build_network(observation): 15 | # H0 = CommNet.encoder(observation) 16 | H0 = observation 17 | C0 = tf.zeros(tf.shape(H0), name="C0") 18 | H1, C1 = CommNet.comm_step("comm_step1", H0, C0) 19 | H2, _ = CommNet.comm_step("comm_step2", H1, C1, H0) 20 | # H3, _ = CommNet.comm_step("comm_step3", H2, C2, H0) 21 | return H2 22 | 23 | @staticmethod 24 | def actor_build_network(name, observation): 25 | with tf.variable_scope(name): 26 | H = CommNet.base_build_network(observation) 27 | return CommNet.actor_output_layer(H) 28 | 29 | @staticmethod 30 | def critic_build_network(name, observation, action): 31 | with tf.variable_scope(name): 32 | H = CommNet.base_build_network(observation) 33 | return CommNet.critic_output_layer(H, action) 34 | 35 | @staticmethod 36 | def encoder(s): 37 | H = [] 38 | with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE): 39 | for j in range(NUM_AGENTS): 40 | encoded = tf.layers.dense(tf.reshape(s[j], (1, VECTOR_OBS_LEN)), HIDDEN_VECTOR_LEN, name="dense") 41 | H.append(tf.squeeze(encoded)) 42 | H = tf.stack(H) 43 | H = tf.reshape(H, (NUM_AGENTS, HIDDEN_VECTOR_LEN)) 44 | 45 | return H 46 | 47 | @staticmethod 48 | def module(h, c): 49 | with tf.variable_scope("module", reuse=tf.AUTO_REUSE): 50 | w_H = tf.get_variable(name='w_H', shape=HIDDEN_VECTOR_LEN, 51 | initializer=tf.contrib.layers.xavier_initializer()) 52 | w_C = tf.get_variable(name='w_C', shape=HIDDEN_VECTOR_LEN, 53 | initializer=tf.contrib.layers.xavier_initializer()) 54 | 55 | tf.summary.histogram('w_H', w_H) 56 | tf.summary.histogram('w_C', w_C) 57 | 58 | return tf.tanh(tf.multiply(w_H, h) + tf.multiply(w_C, c)) 59 | 60 | @staticmethod 61 | def comm_step(name, H, C, H0_skip_con=None): 62 | batch_size = tf.shape(H)[0] 63 | with tf.variable_scope(name): 64 | next_H = tf.zeros(shape=(batch_size, 0, HIDDEN_VECTOR_LEN)) 65 | for j in range(NUM_AGENTS): 66 | h = H[:, j] 67 | c = C[:, j] 68 | 69 | next_h = CommNet.module(h, c) # shape (BATCH_SIZE, HIDDEN_VECTOR_LEN) 70 | next_H = tf.concat([next_H, tf.reshape(next_h, (batch_size, 1, HIDDEN_VECTOR_LEN))], 1) 71 | 72 | next_H = tf.identity(next_H, "H") 73 | 74 | if H0_skip_con is not None: 75 | next_H = tf.add(next_H, H0_skip_con) 76 | 77 | if NUM_AGENTS > 1: 78 | next_C = tf.zeros(shape=(batch_size, 0, HIDDEN_VECTOR_LEN)) 79 | for j1 in range(NUM_AGENTS): 80 | next_c = [] 81 | for j2 in range(NUM_AGENTS): 82 | if j1 != j2: 83 | next_c.append(next_H[:, j2]) 84 | next_c = 
tf.reduce_mean(tf.stack(next_c), 0) 85 | next_C = tf.concat([next_C, tf.reshape(next_c, (batch_size, 1, HIDDEN_VECTOR_LEN))], 1) 86 | else: 87 | next_C = C 88 | 89 | return next_H, tf.identity(next_C, "C") 90 | 91 | @staticmethod 92 | def actor_output_layer(H): 93 | with tf.variable_scope("actor_output"): 94 | w_out = tf.get_variable(name='w_out', shape=(HIDDEN_VECTOR_LEN, OUTPUT_LEN), 95 | initializer=tf.contrib.layers.xavier_initializer()) 96 | b_out = tf.get_variable(name='b_out', shape=OUTPUT_LEN, initializer=tf.zeros_initializer()) 97 | 98 | tf.summary.histogram('w_out', w_out) 99 | tf.summary.histogram('b_out', b_out) 100 | 101 | batch_size = tf.shape(H)[0] 102 | 103 | actions = [] 104 | for j in range(NUM_AGENTS): 105 | h = tf.slice(H, [0, j, 0], [batch_size, 1, HIDDEN_VECTOR_LEN]) 106 | w_out_batch = tf.tile(tf.expand_dims(w_out, axis=0), [batch_size, 1, 1]) 107 | action = tf.squeeze(tf.matmul(h, w_out_batch) + b_out, [1]) 108 | 109 | actions.append(action) 110 | actions = tf.stack(actions, name="actions", axis=1) 111 | 112 | return actions 113 | 114 | @staticmethod 115 | def critic_output_layer(H, action): 116 | with tf.variable_scope("critic_output", reuse=tf.AUTO_REUSE): 117 | baseline = tf.layers.dense(inputs=tf.concat([H, action], 2), 118 | units=1, 119 | activation=tf.tanh, 120 | kernel_initializer=tf.contrib.layers.xavier_initializer()) 121 | baseline = tf.squeeze(baseline, [2]) 122 | baseline = tf.layers.dense(inputs=baseline, 123 | units=1, 124 | kernel_initializer=tf.contrib.layers.xavier_initializer()) 125 | tf.summary.histogram("w_baseline", tf.get_variable("dense/kernel")) 126 | 127 | return baseline 128 | 129 | 130 | if __name__ == '__main__': 131 | tf.set_random_seed(42) 132 | 133 | tf.reset_default_graph() 134 | 135 | config = tf.ConfigProto() 136 | config.gpu_options.allow_growth = True 137 | with tf.Session(config=config) as sess: 138 | BATCH_SIZE = 10 139 | 140 | observation = tf.placeholder(tf.float32, shape=(None, NUM_AGENTS, VECTOR_OBS_LEN)) 141 | actions = tf.placeholder(tf.float32, shape=(None, NUM_AGENTS, OUTPUT_LEN)) 142 | 143 | actor_out = CommNet.actor_build_network("actor_network", observation) 144 | critic_out = CommNet.critic_build_network("critic_network", observation, actions) 145 | 146 | sess.run(tf.global_variables_initializer()) 147 | 148 | feed_dict = {observation: np.random.random_sample((BATCH_SIZE, NUM_AGENTS, VECTOR_OBS_LEN))} 149 | print(sess.run(actor_out, feed_dict=feed_dict).shape, "==", (BATCH_SIZE, NUM_AGENTS, OUTPUT_LEN)) 150 | 151 | feed_dict = {observation: np.random.random_sample((BATCH_SIZE, NUM_AGENTS, VECTOR_OBS_LEN)), 152 | actions: np.random.random_sample((BATCH_SIZE, NUM_AGENTS, OUTPUT_LEN))} 153 | print(sess.run(critic_out, feed_dict=feed_dict).shape, "==", (BATCH_SIZE, 1)) 154 | 155 | feed_dict = {observation: np.random.random_sample((1, NUM_AGENTS, VECTOR_OBS_LEN))} 156 | print(sess.run(actor_out, feed_dict=feed_dict).shape, "==", (1, NUM_AGENTS, OUTPUT_LEN)) 157 | 158 | feed_dict = {observation: np.random.random_sample((1, NUM_AGENTS, VECTOR_OBS_LEN)), 159 | actions: np.random.random_sample((1, NUM_AGENTS, OUTPUT_LEN))} 160 | print(sess.run(critic_out, feed_dict=feed_dict).shape, "==", (1, 1)) 161 | -------------------------------------------------------------------------------- /docs/2_agents_commnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Coac/CommNet-BiCnet/b7a1e3184c9881c9957d0cfe3b160797a6bd7cd6/docs/2_agents_commnet.png 
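A note on the communication rule in `comm_net.py` above: inside `comm_step`, each agent's next communication vector is the mean of the other agents' updated hidden states. The same averaging written as a small NumPy sketch, with illustrative shapes:

```
import numpy as np

BATCH_SIZE, NUM_AGENTS, HIDDEN_VECTOR_LEN = 4, 3, 2   # illustrative sizes only

next_H = np.random.randn(BATCH_SIZE, NUM_AGENTS, HIDDEN_VECTOR_LEN)

# For every agent j, average the hidden vectors of all agents except j -
# this is what the nested j1/j2 loop in comm_step builds with tf.reduce_mean.
next_C = np.stack(
    [np.mean(np.delete(next_H, j, axis=1), axis=1) for j in range(NUM_AGENTS)],
    axis=1,
)

assert next_C.shape == (BATCH_SIZE, NUM_AGENTS, HIDDEN_VECTOR_LEN)
```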
-------------------------------------------------------------------------------- /guessing_sum_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class GuessingSumEnv: 5 | def __init__(self, num_agents=5): 6 | self.num_agents = num_agents 7 | self.sum = 0 8 | self.scale = 10.0 9 | self.sum_scale = self.num_agents * self.scale 10 | 11 | def step(self, actions): 12 | if actions.shape != (self.num_agents, 1): 13 | raise Exception('got input shape ', actions.shape, ' instead of ', (self.num_agents, 1)) 14 | 15 | observations = None 16 | rewards = -np.abs(actions - self.sum) # [-Inf ; 0] 17 | 18 | normalized_rewards = (np.maximum(rewards, -self.sum_scale) + self.sum_scale) / self.sum_scale # [0 ; 1] 19 | 20 | done = True 21 | info = None 22 | 23 | return observations, normalized_rewards, done, info 24 | 25 | def reset(self): 26 | observations = np.clip(np.random.normal(size=(self.num_agents, 1)), -self.scale, self.scale) 27 | self.sum = np.sum(observations) 28 | return observations 29 | 30 | def render(self, mode='human'): 31 | return 32 | 33 | def close(self): 34 | return 35 | 36 | def seed(self, seed=None): 37 | np.random.seed(seed) 38 | return 39 | 40 | 41 | if __name__ == '__main__': 42 | env = GuessingSumEnv() 43 | env.seed(0) 44 | 45 | print('obs:', env.reset()) 46 | actions = np.random.normal(size=(env.num_agents, 1)) 47 | print('actions:', actions) 48 | print('rewards:', env.step(actions)) 49 | -------------------------------------------------------------------------------- /hypersearch.py: -------------------------------------------------------------------------------- 1 | from concurrent import futures 2 | from multiprocessing import cpu_count 3 | import train_comm_net 4 | import itertools 5 | import shlex 6 | 7 | def start_process(args): 8 | process = pool.submit(train_comm_net.main, args) 9 | process.arg = args 10 | process.add_done_callback(done_callback) 11 | return False 12 | 13 | 14 | def done_callback(process): 15 | if process.cancelled(): 16 | print('Process {0} was cancelled'.format(process.arg)) 17 | elif process.done(): 18 | error = process.exception() 19 | if error: 20 | print('Process {0} - {1} '.format(process.arg, error)) 21 | else: 22 | print('Process {0} done'.format(process.arg)) 23 | 24 | 25 | if __name__ == '__main__': 26 | num_workers = cpu_count() 27 | num_workers = 100 28 | 29 | print('Initializing Process Pool - {0} workers'.format(num_workers)) 30 | pool = futures.ProcessPoolExecutor(max_workers=num_workers) 31 | 32 | params = { 33 | "--actor-lr": [0.01, 0.05, 0.1, 0.15], 34 | "--critic-lr": [0.01, 0.05, 0.1, 0.15] 35 | } 36 | 37 | 38 | hyperparams_names = list(params.keys()) 39 | hyperparams = list(itertools.product(*params.values())) 40 | print("Number of run needed:", len(hyperparams)) 41 | 42 | for hyperparam in hyperparams: 43 | args = "" 44 | for index, value in enumerate(hyperparam): 45 | args += hyperparams_names[index] + ' ' + str(value) + " " 46 | 47 | start_process(shlex.split(args)) 48 | -------------------------------------------------------------------------------- /replay_buffer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data structure for implementing experience replay 3 | 4 | Author: Patrick Emami 5 | """ 6 | from collections import deque 7 | import random 8 | import numpy as np 9 | 10 | class ReplayBuffer(object): 11 | 12 | def __init__(self, buffer_size, random_seed=123): 13 | """ 14 | The right side of the deque 
contains the most recent experiences 15 | """ 16 | self.buffer_size = buffer_size 17 | self.count = 0 18 | self.buffer = deque() 19 | random.seed(random_seed) 20 | 21 | def add(self, state, action, reward, done, state2): 22 | experience = (state, action, reward, done, state2) 23 | if self.count < self.buffer_size: 24 | self.buffer.append(experience) 25 | self.count += 1 26 | else: 27 | self.buffer.popleft() 28 | self.buffer.append(experience) 29 | 30 | def size(self): 31 | return self.count 32 | 33 | def sample_batch(self, batch_size): 34 | if self.count < batch_size: 35 | batch = random.sample(self.buffer, self.count) 36 | else: 37 | batch = random.sample(self.buffer, batch_size) 38 | 39 | s_batch = np.array([_[0] for _ in batch]) 40 | a_batch = np.array([_[1] for _ in batch]) 41 | r_batch = np.array([_[2] for _ in batch]) 42 | t_batch = np.array([_[3] for _ in batch]) 43 | s2_batch = np.array([_[4] for _ in batch]) 44 | 45 | return s_batch, a_batch, r_batch, t_batch, s2_batch 46 | 47 | def clear(self): 48 | self.buffer.clear() 49 | self.count = 0 50 | -------------------------------------------------------------------------------- /summaries/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Coac/CommNet-BiCnet/b7a1e3184c9881c9957d0cfe3b160797a6bd7cd6/summaries/.gitkeep -------------------------------------------------------------------------------- /train_bicnet.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of DDPG - Deep Deterministic Policy Gradient https://github.com/pemami4911/deep-rl 3 | Modified by Coac for BiCNet implementation https://github.com/Coac/CommNet-BiCnet 4 | """ 5 | import argparse 6 | import pprint as pp 7 | from datetime import datetime 8 | 9 | import numpy as np 10 | import tensorflow as tf 11 | # from comm_net import CommNet 12 | from bicnet import BiCNet as CommNet 13 | from guessing_sum_env import * 14 | from replay_buffer import ReplayBuffer 15 | 16 | HIDDEN_VECTOR_LEN = 1 17 | NUM_AGENTS = 2 18 | VECTOR_OBS_LEN = 1 19 | OUTPUT_LEN = 1 20 | 21 | 22 | # =========================== 23 | # Actor and Critic DNNs 24 | # =========================== 25 | 26 | class ActorNetwork(object): 27 | def __init__(self, sess, state_dim, action_dim, learning_rate, tau, batch_size): 28 | self.sess = sess 29 | self.s_dim = state_dim 30 | self.a_dim = action_dim 31 | self.learning_rate = learning_rate 32 | self.tau = tau 33 | self.batch_size = batch_size 34 | 35 | self.inputs, self.out = self.create_actor_network("actor_network") 36 | self.network_params = tf.trainable_variables() 37 | 38 | self.target_inputs, self.target_out = self.create_actor_network("target_actor_network") 39 | self.target_network_params = tf.trainable_variables()[ 40 | len(self.network_params):] 41 | 42 | with tf.name_scope("actor_update_target_network_params"): 43 | self.update_target_network_params = \ 44 | [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) + 45 | tf.multiply(self.target_network_params[i], 1. 
- self.tau)) 46 | for i in range(len(self.target_network_params))] 47 | 48 | self.action_gradient = tf.placeholder(tf.float32, (NUM_AGENTS, None, NUM_AGENTS, OUTPUT_LEN), name="action_gradient") 49 | 50 | 51 | with tf.name_scope("actor_gradients"): 52 | grads = [] 53 | for i in range(NUM_AGENTS): 54 | for j in range(NUM_AGENTS): 55 | grads.append(tf.gradients(self.out[:, j], self.network_params, -self.action_gradient[j][:, i])) 56 | grads = np.array(grads) 57 | self.unnormalized_actor_gradients = [tf.reduce_sum(list(grads[:, i]), axis=0) for i in range(len(self.network_params))] 58 | self.actor_gradients = list(map(lambda x: tf.div(x, self.batch_size), self.unnormalized_actor_gradients)) 59 | 60 | self.optimize = tf.train.AdamOptimizer(self.learning_rate) 61 | self.optimize = self.optimize.apply_gradients(zip(self.actor_gradients, self.network_params)) 62 | 63 | self.num_trainable_vars = len(self.network_params) + len(self.target_network_params) 64 | 65 | def create_actor_network(self, name): 66 | inputs = tf.placeholder(tf.float32, shape=(None, NUM_AGENTS, VECTOR_OBS_LEN), name="actor_inputs") 67 | out = CommNet.actor_build_network(name, inputs) 68 | return inputs, out 69 | 70 | def train(self, inputs, action_gradient): 71 | self.sess.run(self.optimize, feed_dict={ 72 | self.inputs: inputs, 73 | self.action_gradient: action_gradient 74 | }) 75 | 76 | def predict(self, inputs): 77 | return self.sess.run(self.out, feed_dict={ 78 | self.inputs: inputs 79 | }) 80 | 81 | def predict_target(self, inputs): 82 | return self.sess.run(self.target_out, feed_dict={ 83 | self.target_inputs: inputs 84 | }) 85 | 86 | def update_target_network(self): 87 | self.sess.run(self.update_target_network_params) 88 | 89 | def get_num_trainable_vars(self): 90 | return self.num_trainable_vars 91 | 92 | 93 | class CriticNetwork(object): 94 | """ 95 | Input to the network is the state and action, output is Q(s,a). 96 | The action must be obtained from the output of the Actor network. 97 | 98 | """ 99 | 100 | def __init__(self, sess, state_dim, action_dim, learning_rate, tau, gamma, num_actor_vars): 101 | self.sess = sess 102 | self.s_dim = state_dim 103 | self.a_dim = action_dim 104 | self.learning_rate = learning_rate 105 | self.tau = tau 106 | self.gamma = gamma 107 | 108 | self.inputs, self.action, self.out = self.create_critic_network("critic_network") 109 | self.network_params = tf.trainable_variables()[num_actor_vars:] 110 | 111 | self.target_inputs, self.target_action, self.target_out = self.create_critic_network("target_critic_network") 112 | self.target_network_params = tf.trainable_variables()[(len(self.network_params) + num_actor_vars):] 113 | 114 | with tf.name_scope("critic_update_target_network_params"): 115 | self.update_target_network_params = \ 116 | [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) 117 | + tf.multiply(self.target_network_params[i], 1. 
- self.tau)) 118 | for i in range(len(self.target_network_params))] 119 | 120 | self.predicted_q_value = tf.placeholder(tf.float32, (None, NUM_AGENTS, 1), name="predicted_q_value") 121 | 122 | M = tf.to_float(tf.shape(self.out)[0]) 123 | # Li = (Yi - Qi)^2 124 | # L = Sum(Li) 125 | self.loss = tf.squeeze(1.0/M * tf.reduce_sum(tf.reduce_sum(tf.square(self.predicted_q_value - self.out), axis=1), axis=0), name="critic_loss") 126 | 127 | self.optimize = tf.train.AdamOptimizer( 128 | self.learning_rate).minimize(self.loss) 129 | 130 | # self.action_grads = tf.gradients(self.out, self.action, name="action_grads") 131 | self.action_grads = [tf.gradients(self.out[:, i], self.action) for i in range(NUM_AGENTS)] 132 | self.action_grads = tf.stack(tf.squeeze(self.action_grads, 1)) 133 | 134 | def create_critic_network(self, name): 135 | inputs = tf.placeholder(tf.float32, shape=(None, NUM_AGENTS, VECTOR_OBS_LEN), name="critic_inputs") 136 | action = tf.placeholder(tf.float32, shape=(None, NUM_AGENTS, OUTPUT_LEN), name="critic_action") 137 | 138 | out = CommNet.critic_build_network(name, inputs, action) 139 | return inputs, action, out 140 | 141 | def train(self, inputs, action, predicted_q_value): 142 | return self.sess.run([self.out, self.optimize, self.loss], feed_dict={ 143 | self.inputs: inputs, 144 | self.action: action, 145 | self.predicted_q_value: predicted_q_value 146 | }) 147 | 148 | def predict(self, inputs, action): 149 | return self.sess.run(self.out, feed_dict={ 150 | self.inputs: inputs, 151 | self.action: action 152 | }) 153 | 154 | def predict_target(self, inputs, action): 155 | return self.sess.run(self.target_out, feed_dict={ 156 | self.target_inputs: inputs, 157 | self.target_action: action 158 | }) 159 | 160 | def action_gradients(self, inputs, actions): 161 | return self.sess.run(self.action_grads, feed_dict={ 162 | self.inputs: inputs, 163 | self.action: actions 164 | }) 165 | 166 | def update_target_network(self): 167 | self.sess.run(self.update_target_network_params) 168 | 169 | 170 | # =========================== 171 | # Tensorflow Summary Ops 172 | # =========================== 173 | 174 | def build_summaries(): 175 | episode_reward = tf.Variable(0., name="episode_reward") 176 | tf.summary.scalar("Reward", episode_reward) 177 | episode_ave_max_q = tf.Variable(0., name="episode_ave_max_q") 178 | tf.summary.scalar("Qmax Value", episode_ave_max_q) 179 | loss = tf.Variable(0., name="critic_loss") 180 | tf.summary.scalar("Critic_loss", loss) 181 | 182 | summary_vars = [episode_reward, episode_ave_max_q, loss] 183 | summary_ops = tf.summary.merge_all() 184 | 185 | return summary_ops, summary_vars 186 | 187 | 188 | # =========================== 189 | # Agent Training 190 | # =========================== 191 | 192 | def train(sess, env, args, actor, critic): 193 | summary_ops, summary_vars = build_summaries() 194 | 195 | sess.run(tf.global_variables_initializer()) 196 | writer = tf.summary.FileWriter(args['summary_dir'] + " actor_lr" + str(args['actor_lr']) + " critic_lr" + str(args["critic_lr"]), sess.graph) 197 | 198 | actor.update_target_network() 199 | critic.update_target_network() 200 | 201 | replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed'])) 202 | 203 | for i in range(int(args['max_episodes'])): 204 | state = env.reset() 205 | 206 | ep_reward = 0 207 | ep_ave_max_q = 0 208 | 209 | for j in range(int(args['max_episode_len'])): 210 | action = actor.predict([state])[0] 211 | 212 | state2, reward, done, info = env.step(action) 213 | 
replay_buffer.add(state, action, reward, done, state2) 214 | 215 | if replay_buffer.size() > int(args['minibatch_size']): 216 | s_batch, a_batch, r_batch, t_batch, s2_batch = \ 217 | replay_buffer.sample_batch(int(args['minibatch_size'])) 218 | 219 | # TODO 220 | # Calculate targets 221 | # target_q = critic.predict_target( 222 | # s2_batch, actor.predict_target(s2_batch)) 223 | 224 | target_q = tf.zeros((1)) 225 | 226 | # Update the critic given the targets 227 | predicted_q_value, _, loss = critic.train(s_batch, a_batch, 228 | np.reshape(r_batch, (int(args['minibatch_size']), NUM_AGENTS, 1))) 229 | 230 | ep_ave_max_q += np.amax(predicted_q_value) 231 | 232 | # Update the actor policy using the sampled gradient 233 | a_outs = actor.predict(s_batch) 234 | grads = critic.action_gradients(s_batch, a_outs) 235 | actor.train(s_batch, grads) 236 | 237 | actor.update_target_network() 238 | critic.update_target_network() 239 | 240 | replay_buffer.clear() 241 | 242 | # Log 243 | summary_str = sess.run(summary_ops, feed_dict={ 244 | summary_vars[0]: np.mean(r_batch), 245 | summary_vars[1]: ep_ave_max_q / float(j + 1), 246 | summary_vars[2]: loss 247 | }) 248 | 249 | writer.add_summary(summary_str, i) 250 | writer.flush() 251 | 252 | print('| Reward: {:.4f} | Episode: {:d} | Qmax: {:.4f}'.format(np.mean(r_batch), 253 | i, (ep_ave_max_q / float(j + 1)))) 254 | 255 | state = state2 256 | ep_reward += reward 257 | 258 | if done: 259 | break 260 | 261 | 262 | def main(args=None): 263 | args = parse_arg(args or None) 264 | 265 | tf.reset_default_graph() 266 | config = tf.ConfigProto() 267 | config.gpu_options.allow_growth = True 268 | with tf.Session(config=config) as sess: 269 | env = GuessingSumEnv(NUM_AGENTS) 270 | env.seed(0) 271 | 272 | np.random.seed(int(args['random_seed'])) 273 | tf.set_random_seed(int(args['random_seed'])) 274 | env.seed(int(args['random_seed'])) 275 | 276 | state_dim = (NUM_AGENTS, VECTOR_OBS_LEN) 277 | action_dim = (NUM_AGENTS, OUTPUT_LEN) 278 | 279 | actor = ActorNetwork(sess, state_dim, action_dim, 280 | float(args['actor_lr']), float(args['tau']), 281 | int(args['minibatch_size'])) 282 | 283 | critic = CriticNetwork(sess, state_dim, action_dim, 284 | float(args['critic_lr']), float(args['tau']), 285 | float(args['gamma']), 286 | actor.get_num_trainable_vars()) 287 | 288 | train(sess, env, args, actor, critic) 289 | 290 | 291 | def parse_arg(args=None): 292 | parser = argparse.ArgumentParser(description='provide arguments for DDPG agent') 293 | 294 | # agent parameters 295 | parser.add_argument('--actor-lr', help='actor network learning rate', default=0.1) 296 | parser.add_argument('--critic-lr', help='critic network learning rate', default=0.1) 297 | parser.add_argument('--gamma', help='discount factor for critic updates', default=0.99) 298 | parser.add_argument('--tau', help='soft target update parameter', default=0.001) 299 | parser.add_argument('--buffer-size', help='max size of the replay buffer', default=1000000) 300 | parser.add_argument('--minibatch-size', help='size of minibatch for minibatch-SGD', default=1024) 301 | 302 | # run parameters 303 | parser.add_argument('--random-seed', help='random seed for repeatability', default=1234) 304 | parser.add_argument('--max-episodes', help='max num of episodes to do while training', default=9999999999999) 305 | parser.add_argument('--max-episode-len', help='max length of 1 episode', default=1000) 306 | parser.add_argument('--summary-dir', help='directory for storing tensorboard info', 307 | default="summaries/" + 
datetime.now().strftime('%d-%m-%y %H%M')) 308 | 309 | if args is not None: 310 | args = vars(parser.parse_args(args)) 311 | else: 312 | args = vars(parser.parse_args()) 313 | 314 | pp.pprint(args) 315 | 316 | return args 317 | 318 | 319 | if __name__ == '__main__': 320 | main() 321 | -------------------------------------------------------------------------------- /train_comm_net.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of DDPG - Deep Deterministic Policy Gradient https://github.com/pemami4911/deep-rl 3 | Modified by Coac for CommNet implementation https://github.com/Coac/CommNet-BiCnet 4 | """ 5 | import argparse 6 | import pprint as pp 7 | from datetime import datetime 8 | 9 | import numpy as np 10 | import tensorflow as tf 11 | from comm_net import CommNet 12 | # from bicnet import BiCNet as CommNet 13 | from guessing_sum_env import * 14 | from replay_buffer import ReplayBuffer 15 | 16 | HIDDEN_VECTOR_LEN = 1 17 | NUM_AGENTS = 2 18 | VECTOR_OBS_LEN = 1 19 | OUTPUT_LEN = 1 20 | 21 | 22 | # =========================== 23 | # Actor and Critic DNNs 24 | # =========================== 25 | 26 | class ActorNetwork(object): 27 | def __init__(self, sess, state_dim, action_dim, learning_rate, tau, batch_size): 28 | self.sess = sess 29 | self.s_dim = state_dim 30 | self.a_dim = action_dim 31 | self.learning_rate = learning_rate 32 | self.tau = tau 33 | self.batch_size = batch_size 34 | 35 | self.inputs, self.out = self.create_actor_network("actor_network") 36 | self.network_params = tf.trainable_variables() 37 | 38 | self.target_inputs, self.target_out = self.create_actor_network("target_actor_network") 39 | self.target_network_params = tf.trainable_variables()[ 40 | len(self.network_params):] 41 | 42 | with tf.name_scope("actor_update_target_network_params"): 43 | self.update_target_network_params = \ 44 | [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) + 45 | tf.multiply(self.target_network_params[i], 1. 
- self.tau)) 46 | for i in range(len(self.target_network_params))] 47 | 48 | self.action_gradient = tf.placeholder(tf.float32, (None, self.a_dim[0], self.a_dim[1]), name="action_gradient") 49 | 50 | with tf.name_scope("actor_gradients"): 51 | self.unnormalized_actor_gradients = tf.gradients(self.out, self.network_params, -self.action_gradient) 52 | self.actor_gradients = list(map(lambda x: tf.div(x, self.batch_size), self.unnormalized_actor_gradients)) 53 | 54 | self.optimize = tf.train.AdamOptimizer(self.learning_rate) 55 | self.optimize = self.optimize.apply_gradients(zip(self.actor_gradients, self.network_params)) 56 | 57 | self.num_trainable_vars = len(self.network_params) + len(self.target_network_params) 58 | 59 | def create_actor_network(self, name): 60 | inputs = tf.placeholder(tf.float32, shape=(None, NUM_AGENTS, VECTOR_OBS_LEN), name="actor_inputs") 61 | out = CommNet.actor_build_network(name, inputs) 62 | return inputs, out 63 | 64 | def train(self, inputs, action_gradient): 65 | self.sess.run(self.optimize, feed_dict={ 66 | self.inputs: inputs, 67 | self.action_gradient: action_gradient 68 | }) 69 | 70 | def predict(self, inputs): 71 | return self.sess.run(self.out, feed_dict={ 72 | self.inputs: inputs 73 | }) 74 | 75 | def predict_target(self, inputs): 76 | return self.sess.run(self.target_out, feed_dict={ 77 | self.target_inputs: inputs 78 | }) 79 | 80 | def update_target_network(self): 81 | self.sess.run(self.update_target_network_params) 82 | 83 | def get_num_trainable_vars(self): 84 | return self.num_trainable_vars 85 | 86 | 87 | class CriticNetwork(object): 88 | """ 89 | Input to the network is the state and action, output is Q(s,a). 90 | The action must be obtained from the output of the Actor network. 91 | 92 | """ 93 | 94 | def __init__(self, sess, state_dim, action_dim, learning_rate, tau, gamma, num_actor_vars): 95 | self.sess = sess 96 | self.s_dim = state_dim 97 | self.a_dim = action_dim 98 | self.learning_rate = learning_rate 99 | self.tau = tau 100 | self.gamma = gamma 101 | 102 | self.inputs, self.action, self.out = self.create_critic_network("critic_network") 103 | self.network_params = tf.trainable_variables()[num_actor_vars:] 104 | 105 | self.target_inputs, self.target_action, self.target_out = self.create_critic_network("target_critic_network") 106 | self.target_network_params = tf.trainable_variables()[(len(self.network_params) + num_actor_vars):] 107 | 108 | with tf.name_scope("critic_update_target_network_params"): 109 | self.update_target_network_params = \ 110 | [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) 111 | + tf.multiply(self.target_network_params[i], 1. 
- self.tau)) 112 | for i in range(len(self.target_network_params))] 113 | 114 | self.predicted_q_value = tf.placeholder(tf.float32, (None, 1), name="predicted_q_value") 115 | 116 | self.loss = tf.losses.mean_squared_error(self.predicted_q_value, self.out) 117 | 118 | self.optimize = tf.train.AdamOptimizer( 119 | self.learning_rate).minimize(self.loss) 120 | 121 | self.action_grads = tf.gradients(self.out, self.action, name="action_grads") 122 | 123 | def create_critic_network(self, name): 124 | inputs = tf.placeholder(tf.float32, shape=(None, NUM_AGENTS, VECTOR_OBS_LEN), name="critic_inputs") 125 | action = tf.placeholder(tf.float32, shape=(None, NUM_AGENTS, OUTPUT_LEN), name="critic_action") 126 | 127 | out = CommNet.critic_build_network(name, inputs, action) 128 | return inputs, action, out 129 | 130 | def train(self, inputs, action, predicted_q_value): 131 | return self.sess.run([self.out, self.optimize, self.loss], feed_dict={ 132 | self.inputs: inputs, 133 | self.action: action, 134 | self.predicted_q_value: predicted_q_value 135 | }) 136 | 137 | def predict(self, inputs, action): 138 | return self.sess.run(self.out, feed_dict={ 139 | self.inputs: inputs, 140 | self.action: action 141 | }) 142 | 143 | def predict_target(self, inputs, action): 144 | return self.sess.run(self.target_out, feed_dict={ 145 | self.target_inputs: inputs, 146 | self.target_action: action 147 | }) 148 | 149 | def action_gradients(self, inputs, actions): 150 | return self.sess.run(self.action_grads, feed_dict={ 151 | self.inputs: inputs, 152 | self.action: actions 153 | }) 154 | 155 | def update_target_network(self): 156 | self.sess.run(self.update_target_network_params) 157 | 158 | 159 | # =========================== 160 | # Tensorflow Summary Ops 161 | # =========================== 162 | 163 | def build_summaries(): 164 | episode_reward = tf.Variable(0., name="episode_reward") 165 | tf.summary.scalar("Reward", episode_reward) 166 | episode_ave_max_q = tf.Variable(0., name="episode_ave_max_q") 167 | tf.summary.scalar("Qmax Value", episode_ave_max_q) 168 | loss = tf.Variable(0., name="critic_loss") 169 | tf.summary.scalar("Critic_loss", loss) 170 | 171 | summary_vars = [episode_reward, episode_ave_max_q, loss] 172 | summary_ops = tf.summary.merge_all() 173 | 174 | return summary_ops, summary_vars 175 | 176 | 177 | # =========================== 178 | # Agent Training 179 | # =========================== 180 | 181 | def train(sess, env, args, actor, critic): 182 | summary_ops, summary_vars = build_summaries() 183 | 184 | sess.run(tf.global_variables_initializer()) 185 | writer = tf.summary.FileWriter(args['summary_dir'] + " actor_lr" + str(args['actor_lr']) + " critic_lr" + str(args["critic_lr"]), sess.graph) 186 | 187 | actor.update_target_network() 188 | critic.update_target_network() 189 | 190 | replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed'])) 191 | 192 | for i in range(int(args['max_episodes'])): 193 | state = env.reset() 194 | 195 | ep_reward = 0 196 | ep_ave_max_q = 0 197 | 198 | for j in range(int(args['max_episode_len'])): 199 | action = actor.predict([state])[0] 200 | 201 | state2, reward, done, info = env.step(action) 202 | reward = np.sum(reward) / NUM_AGENTS 203 | 204 | replay_buffer.add(state, action, reward, done, state2) 205 | 206 | if replay_buffer.size() > int(args['minibatch_size']): 207 | s_batch, a_batch, r_batch, t_batch, s2_batch = \ 208 | replay_buffer.sample_batch(int(args['minibatch_size'])) 209 | 210 | # TODO 211 | # Calculate targets 212 | # target_q = 
critic.predict_target( 213 | # s2_batch, actor.predict_target(s2_batch)) 214 | 215 | target_q = tf.zeros((1)) 216 | 217 | # Update the critic given the targets 218 | predicted_q_value, _, loss = critic.train(s_batch, a_batch, 219 | np.reshape(r_batch, (int(args['minibatch_size']), 1))) 220 | 221 | ep_ave_max_q += np.amax(predicted_q_value) 222 | 223 | # Update the actor policy using the sampled gradient 224 | a_outs = actor.predict(s_batch) 225 | grads = critic.action_gradients(s_batch, a_outs) 226 | actor.train(s_batch, grads[0]) 227 | 228 | actor.update_target_network() 229 | critic.update_target_network() 230 | 231 | replay_buffer.clear() 232 | 233 | # Log 234 | summary_str = sess.run(summary_ops, feed_dict={ 235 | summary_vars[0]: np.mean(r_batch), 236 | summary_vars[1]: ep_ave_max_q / float(j + 1), 237 | summary_vars[2]: loss 238 | }) 239 | 240 | writer.add_summary(summary_str, i) 241 | writer.flush() 242 | 243 | print('| Reward: {:.4f} | Episode: {:d} | Qmax: {:.4f}'.format(np.mean(r_batch), 244 | i, (ep_ave_max_q / float(j + 1)))) 245 | 246 | state = state2 247 | ep_reward += reward 248 | 249 | if done: 250 | break 251 | 252 | 253 | def main(args=None): 254 | args = parse_arg(args or None) 255 | 256 | tf.reset_default_graph() 257 | config = tf.ConfigProto() 258 | config.gpu_options.allow_growth = True 259 | with tf.Session(config=config) as sess: 260 | env = GuessingSumEnv(NUM_AGENTS) 261 | env.seed(0) 262 | 263 | np.random.seed(int(args['random_seed'])) 264 | tf.set_random_seed(int(args['random_seed'])) 265 | env.seed(int(args['random_seed'])) 266 | 267 | state_dim = (NUM_AGENTS, VECTOR_OBS_LEN) 268 | action_dim = (NUM_AGENTS, OUTPUT_LEN) 269 | 270 | actor = ActorNetwork(sess, state_dim, action_dim, 271 | float(args['actor_lr']), float(args['tau']), 272 | int(args['minibatch_size'])) 273 | 274 | critic = CriticNetwork(sess, state_dim, action_dim, 275 | float(args['critic_lr']), float(args['tau']), 276 | float(args['gamma']), 277 | actor.get_num_trainable_vars()) 278 | 279 | train(sess, env, args, actor, critic) 280 | 281 | 282 | def parse_arg(args=None): 283 | parser = argparse.ArgumentParser(description='provide arguments for DDPG agent') 284 | 285 | # agent parameters 286 | parser.add_argument('--actor-lr', help='actor network learning rate', default=0.1) 287 | parser.add_argument('--critic-lr', help='critic network learning rate', default=0.1) 288 | parser.add_argument('--gamma', help='discount factor for critic updates', default=0.99) 289 | parser.add_argument('--tau', help='soft target update parameter', default=0.001) 290 | parser.add_argument('--buffer-size', help='max size of the replay buffer', default=1000000) 291 | parser.add_argument('--minibatch-size', help='size of minibatch for minibatch-SGD', default=1024) 292 | 293 | # run parameters 294 | parser.add_argument('--random-seed', help='random seed for repeatability', default=1234) 295 | parser.add_argument('--max-episodes', help='max num of episodes to do while training', default=9999999999999) 296 | parser.add_argument('--max-episode-len', help='max length of 1 episode', default=1000) 297 | parser.add_argument('--summary-dir', help='directory for storing tensorboard info', 298 | default="summaries/" + datetime.now().strftime('%d-%m-%y %H%M')) 299 | 300 | if args is not None: 301 | args = vars(parser.parse_args(args)) 302 | else: 303 | args = vars(parser.parse_args()) 304 | 305 | pp.pprint(args) 306 | 307 | return args 308 | 309 | 310 | if __name__ == '__main__': 311 | main() 312 | 
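Both training scripts leave the bootstrap target as a TODO (`target_q = tf.zeros((1))`) and fit the critic directly to the sampled rewards. A hedged sketch of the standard DDPG target that the commented-out lines point at, written against the variables and shapes of the training loop in `train_comm_net.py` (this is not part of the original code):

```
# Inside the minibatch update: r_batch, t_batch, s2_batch come from
# replay_buffer.sample_batch(); gamma is args['gamma'].
target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch))   # shape (batch, 1)
done_mask = 1.0 - t_batch.reshape(-1, 1).astype(np.float32)                  # 0 where the episode ended
y = r_batch.reshape(-1, 1) + float(args['gamma']) * done_mask * target_q     # Bellman backup
predicted_q_value, _, loss = critic.train(s_batch, a_batch, y)
```

In the guessing-sum environment every episode terminates after one step, so `done_mask` is all zeros and `y` collapses to the reward, which is exactly what the existing `critic.train(s_batch, a_batch, np.reshape(r_batch, ...))` call already does.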
--------------------------------------------------------------------------------
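One closing note on the two trainers: the `update_target_network_params` ops implement the soft (Polyak) target update θ_target ← τ·θ + (1 − τ)·θ_target, with τ = 0.001 by default. The same rule in plain NumPy, with made-up parameter values:

```
import numpy as np

tau = 0.001                              # matches the --tau default in both trainers
theta = np.array([0.5, -1.2])            # online-network weights (made-up values)
theta_target = np.array([0.4, -1.0])     # target-network weights (made-up values)

theta_target = tau * theta + (1.0 - tau) * theta_target
print(theta_target)                      # nudged slightly toward the online weights
```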