├── .gitignore
├── README.md
├── bicnet.py
├── comm_net.py
├── docs
    └── 2_agents_commnet.png
├── guessing_sum_env.py
├── hypersearch.py
├── replay_buffer.py
├── summaries
    └── .gitkeep
├── train_bicnet.py
└── train_comm_net.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | 
107 | .idea
108 | summaries/*
109 | !summaries/.gitkeep
110 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # CommNet-BiCnet
 2 | [CommNet](https://arxiv.org/abs/1605.07736) and [BiCnet](https://arxiv.org/abs/1703.10069) implementation in tensorflow
 3 | 
 4 | ## Training
 5 | Train CommNet using DDPG algorithm
 6 | ```
 7 | python train_comm_net.py
 8 | ```
 9 | 
10 | ## Hypersearch
11 | To find the optimal hyperparameters such as `actor_lr` or `critic_lr`, a simple grid search has been implemented. It launches multiple instances of the trainer in parallel based on the number of CPU cores.
12 | ```
13 | python hypersearch.py
14 | ```
15 | 
16 | ## Guessing sum environment
17 | It is a simple game described in the [BiCnet](https://arxiv.org/abs/1703.10069) paper for testing if the communication works. The environment implements the crucial methods of the core gym interface from OpenAI
18 | 
19 | Each agent receives a scalar sampled between `[−10, 10]` under a truncated Gaussian. Each agent needs to output the sum of all inputs received among the agents. An agent gets a normalized reward between `[0, 1]` based on the absolute difference between the sum and its output.
20 | 
21 | ## Results
22 | ### Training CommNet in the Guessing sum env with 2 agents
23 | ![2_agents_commnet_training_reward](docs/2_agents_commnet.png)
24 | 


--------------------------------------------------------------------------------
/bicnet.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import tensorflow as tf
 3 | from guessing_sum_env import *
 4 | 
 5 | # TODO use the parameters of train_ddpg
 6 | HIDDEN_VECTOR_LEN = 1
 7 | NUM_AGENTS = 2
 8 | VECTOR_OBS_LEN = 1
 9 | OUTPUT_LEN = 1
10 | 
11 | 
12 | class BiCNet:
13 |     @staticmethod
14 |     def base_build_network(observation):
15 |         encoded = BiCNet.shared_dense_layer("encoder", observation, HIDDEN_VECTOR_LEN)
16 | 
17 |         hidden_agents = tf.unstack(encoded, NUM_AGENTS, 1)
18 | 
19 |         lstm_fw_cell = tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_VECTOR_LEN, forget_bias=1.0, name="lstm_fw_cell")
20 |         lstm_bw_cell = tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_VECTOR_LEN, forget_bias=1.0, name="lstm_bw_cell")
21 |         outputs, _, _ = tf.nn.static_bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, hidden_agents, dtype=tf.float32)
22 |         with tf.variable_scope("bidirectional_rnn", reuse=tf.AUTO_REUSE):
23 |             tf.summary.histogram("lstm_fw_cell/kernel", tf.get_variable("fw/lstm_fw_cell/kernel"))
24 |             tf.summary.histogram("lstm_bw_cell/kernel", tf.get_variable("bw/lstm_bw_cell/kernel"))
25 | 
26 |         outputs = tf.stack(outputs, 1)
27 |         return outputs
28 | 
29 |     @staticmethod
30 |     def actor_build_network(name, observation):
31 |         with tf.variable_scope(name):
32 |             outputs = BiCNet.base_build_network(observation)
33 |             return BiCNet.shared_dense_layer("output_layer", outputs, OUTPUT_LEN)
34 | 
35 | 
36 |     @staticmethod
37 |     def shared_dense_layer(name, observation, output_len):
38 |         H = []
39 |         with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
40 |             for j in range(NUM_AGENTS):
41 |                 agent_obs = observation[:, j]
42 |                 agent_encoded = tf.layers.dense(agent_obs, output_len, name="dense")
43 |                 tf.summary.histogram(name + "/dense/kernel", tf.get_variable("dense/kernel"))
44 |                 H.append(agent_encoded)
45 |             H = tf.stack(H, 1)
46 |         return H
47 | 
48 |     @staticmethod
49 |     def critic_build_network(name, observation, action):
50 |         with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
51 |             outputs = BiCNet.base_build_network(tf.concat([observation, action], 2))
52 |             outputs = BiCNet.shared_dense_layer("output_layer", outputs, 1)
53 |             return outputs
54 | 
55 | if __name__ == '__main__':
56 |     tf.set_random_seed(42)
57 | 
58 |     tf.reset_default_graph()
59 | 
60 |     config = tf.ConfigProto()
61 |     config.gpu_options.allow_growth = True
62 |     with tf.Session(config=config) as sess:
63 |         BATCH_SIZE = 10
64 | 
65 |         observation = tf.placeholder(tf.float32, shape=(None, NUM_AGENTS, VECTOR_OBS_LEN), name="observation")
66 |         actions = tf.placeholder(tf.float32, shape=(None, NUM_AGENTS, OUTPUT_LEN), name="actions")
67 | 
68 |         actor_out = BiCNet.actor_build_network("actor_network", observation)
69 |         critic_out = BiCNet.critic_build_network("critic_network", observation, actions)
70 | 
71 |         sess.run(tf.global_variables_initializer())
72 | 
73 |         feed_dict = {observation: np.random.random_sample((BATCH_SIZE, NUM_AGENTS, VECTOR_OBS_LEN))}
74 |         print(sess.run(actor_out, feed_dict=feed_dict).shape, "==", (BATCH_SIZE, NUM_AGENTS, OUTPUT_LEN), "== (BATCH_SIZE, NUM_AGENTS, OUTPUT_LEN)")
75 | 
76 |         feed_dict = {observation: np.random.random_sample((BATCH_SIZE, NUM_AGENTS, VECTOR_OBS_LEN)),
77 |                      actions: np.random.random_sample((BATCH_SIZE, NUM_AGENTS, OUTPUT_LEN))}
78 |         print(sess.run(critic_out, feed_dict=feed_dict).shape, "==", (BATCH_SIZE, NUM_AGENTS, 1), "== (BATCH_SIZE, NUM_AGENTS, 1)")
79 | 
80 |         feed_dict = {observation: np.random.random_sample((1, NUM_AGENTS, VECTOR_OBS_LEN))}
81 |         print(sess.run(actor_out, feed_dict=feed_dict).shape, "==", (1, NUM_AGENTS, OUTPUT_LEN), "== (BATCH_SIZE, NUM_AGENTS, OUTPUT_LEN)")
82 | 
83 |         feed_dict = {observation: np.random.random_sample((1, NUM_AGENTS, VECTOR_OBS_LEN)),
84 |                      actions: np.random.random_sample((1, NUM_AGENTS, OUTPUT_LEN))}
85 |         print(sess.run(critic_out, feed_dict=feed_dict).shape, "==", (1, NUM_AGENTS, 1), "== (BATCH_SIZE, NUM_AGENTS, 1)")
86 | 


--------------------------------------------------------------------------------
/comm_net.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import tensorflow as tf
  3 | from guessing_sum_env import *
  4 | 
  5 | # TODO use the parameters of train_ddpg
  6 | HIDDEN_VECTOR_LEN = 1
  7 | NUM_AGENTS = 2
  8 | VECTOR_OBS_LEN = 1
  9 | OUTPUT_LEN = 1
 10 | 
 11 | 
 12 | class CommNet:
 13 |     @staticmethod
 14 |     def base_build_network(observation):
 15 |         # H0 = CommNet.encoder(observation)
 16 |         H0 = observation
 17 |         C0 = tf.zeros(tf.shape(H0), name="C0")
 18 |         H1, C1 = CommNet.comm_step("comm_step1", H0, C0)
 19 |         H2, _ = CommNet.comm_step("comm_step2", H1, C1, H0)
 20 |         # H3, _ = CommNet.comm_step("comm_step3", H2, C2, H0)
 21 |         return H2
 22 | 
 23 |     @staticmethod
 24 |     def actor_build_network(name, observation):
 25 |         with tf.variable_scope(name):
 26 |             H = CommNet.base_build_network(observation)
 27 |             return CommNet.actor_output_layer(H)
 28 | 
 29 |     @staticmethod
 30 |     def critic_build_network(name, observation, action):
 31 |         with tf.variable_scope(name):
 32 |             H = CommNet.base_build_network(observation)
 33 |             return CommNet.critic_output_layer(H, action)
 34 | 
 35 |     @staticmethod
 36 |     def encoder(s):
 37 |         H = []
 38 |         with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
 39 |             for j in range(NUM_AGENTS):
 40 |                 encoded = tf.layers.dense(tf.reshape(s[j], (1, VECTOR_OBS_LEN)), HIDDEN_VECTOR_LEN, name="dense")
 41 |                 H.append(tf.squeeze(encoded))
 42 |             H = tf.stack(H)
 43 |             H = tf.reshape(H, (NUM_AGENTS, HIDDEN_VECTOR_LEN))
 44 | 
 45 |         return H
 46 | 
 47 |     @staticmethod
 48 |     def module(h, c):
 49 |         with tf.variable_scope("module", reuse=tf.AUTO_REUSE):
 50 |             w_H = tf.get_variable(name='w_H', shape=HIDDEN_VECTOR_LEN,
 51 |                                   initializer=tf.contrib.layers.xavier_initializer())
 52 |             w_C = tf.get_variable(name='w_C', shape=HIDDEN_VECTOR_LEN,
 53 |                                   initializer=tf.contrib.layers.xavier_initializer())
 54 | 
 55 |             tf.summary.histogram('w_H', w_H)
 56 |             tf.summary.histogram('w_C', w_C)
 57 | 
 58 |             return tf.tanh(tf.multiply(w_H, h) + tf.multiply(w_C, c))
 59 | 
 60 |     @staticmethod
 61 |     def comm_step(name, H, C, H0_skip_con=None):
 62 |         batch_size = tf.shape(H)[0]
 63 |         with tf.variable_scope(name):
 64 |             next_H = tf.zeros(shape=(batch_size, 0, HIDDEN_VECTOR_LEN))
 65 |             for j in range(NUM_AGENTS):
 66 |                 h = H[:, j]
 67 |                 c = C[:, j]
 68 | 
 69 |                 next_h = CommNet.module(h, c)  # shape (BATCH_SIZE, HIDDEN_VECTOR_LEN)
 70 |                 next_H = tf.concat([next_H, tf.reshape(next_h, (batch_size, 1, HIDDEN_VECTOR_LEN))], 1)
 71 | 
 72 |             next_H = tf.identity(next_H, "H")
 73 | 
 74 |             if H0_skip_con is not None:
 75 |                 next_H = tf.add(next_H, H0_skip_con)
 76 | 
 77 |             if NUM_AGENTS > 1:
 78 |                 next_C = tf.zeros(shape=(batch_size, 0, HIDDEN_VECTOR_LEN))
 79 |                 for j1 in range(NUM_AGENTS):
 80 |                     next_c = []
 81 |                     for j2 in range(NUM_AGENTS):
 82 |                         if j1 != j2:
 83 |                             next_c.append(next_H[:, j2])
 84 |                     next_c = tf.reduce_mean(tf.stack(next_c), 0)
 85 |                     next_C = tf.concat([next_C, tf.reshape(next_c, (batch_size, 1, HIDDEN_VECTOR_LEN))], 1)
 86 |             else:
 87 |                 next_C = C
 88 | 
 89 |             return next_H, tf.identity(next_C, "C")
 90 | 
 91 |     @staticmethod
 92 |     def actor_output_layer(H):
 93 |         with tf.variable_scope("actor_output"):
 94 |             w_out = tf.get_variable(name='w_out', shape=(HIDDEN_VECTOR_LEN, OUTPUT_LEN),
 95 |                                     initializer=tf.contrib.layers.xavier_initializer())
 96 |             b_out = tf.get_variable(name='b_out', shape=OUTPUT_LEN, initializer=tf.zeros_initializer())
 97 | 
 98 |             tf.summary.histogram('w_out', w_out)
 99 |             tf.summary.histogram('b_out', b_out)
100 | 
101 |             batch_size = tf.shape(H)[0]
102 | 
103 |             actions = []
104 |             for j in range(NUM_AGENTS):
105 |                 h = tf.slice(H, [0, j, 0], [batch_size, 1, HIDDEN_VECTOR_LEN])
106 |                 w_out_batch = tf.tile(tf.expand_dims(w_out, axis=0), [batch_size, 1, 1])
107 |                 action =  tf.squeeze(tf.matmul(h, w_out_batch) + b_out, [1])
108 | 
109 |                 actions.append(action)
110 |             actions = tf.stack(actions, name="actions", axis=1)
111 | 
112 |         return actions
113 | 
114 |     @staticmethod
115 |     def critic_output_layer(H, action):
116 |         with tf.variable_scope("critic_output", reuse=tf.AUTO_REUSE):
117 |             baseline = tf.layers.dense(inputs=tf.concat([H, action], 2),
118 |                                        units=1,
119 |                                        activation=tf.tanh,
120 |                                        kernel_initializer=tf.contrib.layers.xavier_initializer())
121 |             baseline = tf.squeeze(baseline, [2])
122 |             baseline = tf.layers.dense(inputs=baseline,
123 |                                        units=1,
124 |                                        kernel_initializer=tf.contrib.layers.xavier_initializer())
125 |             tf.summary.histogram("w_baseline", tf.get_variable("dense/kernel"))
126 | 
127 |             return baseline
128 | 
129 | 
130 | if __name__ == '__main__':
131 |     tf.set_random_seed(42)
132 | 
133 |     tf.reset_default_graph()
134 | 
135 |     config = tf.ConfigProto()
136 |     config.gpu_options.allow_growth = True
137 |     with tf.Session(config=config) as sess:
138 |         BATCH_SIZE = 10
139 | 
140 |         observation = tf.placeholder(tf.float32, shape=(None, NUM_AGENTS, VECTOR_OBS_LEN))
141 |         actions = tf.placeholder(tf.float32, shape=(None, NUM_AGENTS, OUTPUT_LEN))
142 | 
143 |         actor_out = CommNet.actor_build_network("actor_network", observation)
144 |         critic_out = CommNet.critic_build_network("critic_network", observation, actions)
145 | 
146 |         sess.run(tf.global_variables_initializer())
147 | 
148 |         feed_dict = {observation: np.random.random_sample((BATCH_SIZE, NUM_AGENTS, VECTOR_OBS_LEN))}
149 |         print(sess.run(actor_out, feed_dict=feed_dict).shape, "==", (BATCH_SIZE, NUM_AGENTS, OUTPUT_LEN))
150 | 
151 |         feed_dict = {observation: np.random.random_sample((BATCH_SIZE, NUM_AGENTS, VECTOR_OBS_LEN)),
152 |                      actions: np.random.random_sample((BATCH_SIZE, NUM_AGENTS, OUTPUT_LEN))}
153 |         print(sess.run(critic_out, feed_dict=feed_dict).shape, "==", (BATCH_SIZE, 1))
154 | 
155 |         feed_dict = {observation: np.random.random_sample((1, NUM_AGENTS, VECTOR_OBS_LEN))}
156 |         print(sess.run(actor_out, feed_dict=feed_dict).shape, "==", (1, NUM_AGENTS, OUTPUT_LEN))
157 | 
158 |         feed_dict = {observation: np.random.random_sample((1, NUM_AGENTS, VECTOR_OBS_LEN)),
159 |                      actions: np.random.random_sample((1, NUM_AGENTS, OUTPUT_LEN))}
160 |         print(sess.run(critic_out, feed_dict=feed_dict).shape, "==", (1, 1))
161 | 


--------------------------------------------------------------------------------
/docs/2_agents_commnet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Coac/CommNet-BiCnet/b7a1e3184c9881c9957d0cfe3b160797a6bd7cd6/docs/2_agents_commnet.png


--------------------------------------------------------------------------------
/guessing_sum_env.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | 
 4 | class GuessingSumEnv:
 5 |     def __init__(self, num_agents=5):
 6 |         self.num_agents = num_agents
 7 |         self.sum = 0
 8 |         self.scale = 10.0
 9 |         self.sum_scale = self.num_agents * self.scale
10 | 
11 |     def step(self, actions):
12 |         if actions.shape != (self.num_agents, 1):
13 |             raise Exception('got input shape ', actions.shape, ' instead of ', (self.num_agents, 1))
14 | 
15 |         observations = None
16 |         rewards = -np.abs(actions - self.sum) # [-Inf ; 0]
17 | 
18 |         normalized_rewards = (np.maximum(rewards, -self.sum_scale) + self.sum_scale) / self.sum_scale # [0 ; 1]
19 | 
20 |         done = True
21 |         info = None
22 | 
23 |         return observations, normalized_rewards, done, info
24 | 
25 |     def reset(self):
26 |         observations = np.clip(np.random.normal(size=(self.num_agents, 1)), -self.scale, self.scale)
27 |         self.sum = np.sum(observations)
28 |         return observations
29 | 
30 |     def render(self, mode='human'):
31 |         return
32 | 
33 |     def close(self):
34 |         return
35 | 
36 |     def seed(self, seed=None):
37 |         np.random.seed(seed)
38 |         return
39 | 
40 | 
41 | if __name__ == '__main__':
42 |     env = GuessingSumEnv()
43 |     env.seed(0)
44 | 
45 |     print('obs:', env.reset())
46 |     actions = np.random.normal(size=(env.num_agents, 1))
47 |     print('actions:', actions)
48 |     print('rewards:', env.step(actions))
49 | 


--------------------------------------------------------------------------------
/hypersearch.py:
--------------------------------------------------------------------------------
 1 | from concurrent import futures
 2 | from multiprocessing import cpu_count
 3 | import train_comm_net
 4 | import itertools
 5 | import shlex
 6 | 
 7 | def start_process(args):
 8 |     process = pool.submit(train_comm_net.main, args)
 9 |     process.arg = args
10 |     process.add_done_callback(done_callback)
11 |     return False
12 | 
13 | 
14 | def done_callback(process):
15 |     if process.cancelled():
16 |         print('Process {0} was cancelled'.format(process.arg))
17 |     elif process.done():
18 |         error = process.exception()
19 |         if error:
20 |             print('Process {0} - {1} '.format(process.arg, error))
21 |         else:
22 |             print('Process {0} done'.format(process.arg))
23 | 
24 | 
25 | if __name__ == '__main__':
26 |     num_workers = cpu_count()
27 |     num_workers = 100
28 | 
29 |     print('Initializing Process Pool - {0} workers'.format(num_workers))
30 |     pool = futures.ProcessPoolExecutor(max_workers=num_workers)
31 | 
32 |     params = {
33 |         "--actor-lr": [0.01, 0.05, 0.1, 0.15],
34 |         "--critic-lr": [0.01, 0.05, 0.1, 0.15]
35 |     }
36 | 
37 | 
38 |     hyperparams_names = list(params.keys())
39 |     hyperparams = list(itertools.product(*params.values()))
40 |     print("Number of run needed:", len(hyperparams))
41 | 
42 |     for hyperparam in hyperparams:
43 |         args = ""
44 |         for index, value in enumerate(hyperparam):
45 |             args += hyperparams_names[index] + ' ' + str(value) + " "
46 | 
47 |         start_process(shlex.split(args))
48 | 


--------------------------------------------------------------------------------
/replay_buffer.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Data structure for implementing experience replay
 3 | 
 4 | Author: Patrick Emami
 5 | """
 6 | from collections import deque
 7 | import random
 8 | import numpy as np
 9 | 
10 | class ReplayBuffer(object):
11 | 
12 |     def __init__(self, buffer_size, random_seed=123):
13 |         """
14 |         The right side of the deque contains the most recent experiences
15 |         """
16 |         self.buffer_size = buffer_size
17 |         self.count = 0
18 |         self.buffer = deque()
19 |         random.seed(random_seed)
20 | 
21 |     def add(self, state, action, reward, done, state2):
22 |         experience = (state, action, reward, done, state2)
23 |         if self.count < self.buffer_size:
24 |             self.buffer.append(experience)
25 |             self.count += 1
26 |         else:
27 |             self.buffer.popleft()
28 |             self.buffer.append(experience)
29 | 
30 |     def size(self):
31 |         return self.count
32 | 
33 |     def sample_batch(self, batch_size):
34 |         if self.count < batch_size:
35 |             batch = random.sample(self.buffer, self.count)
36 |         else:
37 |             batch = random.sample(self.buffer, batch_size)
38 | 
39 |         s_batch = np.array([_[0] for _ in batch])
40 |         a_batch = np.array([_[1] for _ in batch])
41 |         r_batch = np.array([_[2] for _ in batch])
42 |         t_batch = np.array([_[3] for _ in batch])
43 |         s2_batch = np.array([_[4] for _ in batch])
44 | 
45 |         return s_batch, a_batch, r_batch, t_batch, s2_batch
46 | 
47 |     def clear(self):
48 |         self.buffer.clear()
49 |         self.count = 0
50 | 


--------------------------------------------------------------------------------
/summaries/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Coac/CommNet-BiCnet/b7a1e3184c9881c9957d0cfe3b160797a6bd7cd6/summaries/.gitkeep


--------------------------------------------------------------------------------
/train_bicnet.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Implementation of DDPG - Deep Deterministic Policy Gradient https://github.com/pemami4911/deep-rl
  3 | Modified by Coac for BiCNet implementation https://github.com/Coac/CommNet-BiCnet
  4 | """
  5 | import argparse
  6 | import pprint as pp
  7 | from datetime import datetime
  8 | 
  9 | import numpy as np
 10 | import tensorflow as tf
 11 | # from comm_net import CommNet
 12 | from bicnet import BiCNet as CommNet
 13 | from guessing_sum_env import *
 14 | from replay_buffer import ReplayBuffer
 15 | 
 16 | HIDDEN_VECTOR_LEN = 1
 17 | NUM_AGENTS = 2
 18 | VECTOR_OBS_LEN = 1
 19 | OUTPUT_LEN = 1
 20 | 
 21 | 
 22 | # ===========================
 23 | #   Actor and Critic DNNs
 24 | # ===========================
 25 | 
 26 | class ActorNetwork(object):
 27 |     def __init__(self, sess, state_dim, action_dim, learning_rate, tau, batch_size):
 28 |         self.sess = sess
 29 |         self.s_dim = state_dim
 30 |         self.a_dim = action_dim
 31 |         self.learning_rate = learning_rate
 32 |         self.tau = tau
 33 |         self.batch_size = batch_size
 34 | 
 35 |         self.inputs, self.out = self.create_actor_network("actor_network")
 36 |         self.network_params = tf.trainable_variables()
 37 | 
 38 |         self.target_inputs, self.target_out = self.create_actor_network("target_actor_network")
 39 |         self.target_network_params = tf.trainable_variables()[
 40 |                                      len(self.network_params):]
 41 | 
 42 |         with tf.name_scope("actor_update_target_network_params"):
 43 |             self.update_target_network_params = \
 44 |                 [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) +
 45 |                                                       tf.multiply(self.target_network_params[i], 1. - self.tau))
 46 |                  for i in range(len(self.target_network_params))]
 47 | 
 48 |         self.action_gradient = tf.placeholder(tf.float32, (NUM_AGENTS, None, NUM_AGENTS, OUTPUT_LEN), name="action_gradient")
 49 | 
 50 | 
 51 |         with tf.name_scope("actor_gradients"):
 52 |             grads = []
 53 |             for i in range(NUM_AGENTS):
 54 |                 for j in range(NUM_AGENTS):
 55 |                     grads.append(tf.gradients(self.out[:, j], self.network_params, -self.action_gradient[j][:, i]))
 56 |             grads = np.array(grads)
 57 |             self.unnormalized_actor_gradients = [tf.reduce_sum(list(grads[:, i]), axis=0) for i in range(len(self.network_params))]
 58 |             self.actor_gradients = list(map(lambda x: tf.div(x, self.batch_size), self.unnormalized_actor_gradients))
 59 | 
 60 |         self.optimize = tf.train.AdamOptimizer(self.learning_rate)
 61 |         self.optimize = self.optimize.apply_gradients(zip(self.actor_gradients, self.network_params))
 62 | 
 63 |         self.num_trainable_vars = len(self.network_params) + len(self.target_network_params)
 64 | 
 65 |     def create_actor_network(self, name):
 66 |         inputs = tf.placeholder(tf.float32, shape=(None, NUM_AGENTS, VECTOR_OBS_LEN), name="actor_inputs")
 67 |         out = CommNet.actor_build_network(name, inputs)
 68 |         return inputs, out
 69 | 
 70 |     def train(self, inputs, action_gradient):
 71 |         self.sess.run(self.optimize, feed_dict={
 72 |             self.inputs: inputs,
 73 |             self.action_gradient: action_gradient
 74 |         })
 75 | 
 76 |     def predict(self, inputs):
 77 |         return self.sess.run(self.out, feed_dict={
 78 |             self.inputs: inputs
 79 |         })
 80 | 
 81 |     def predict_target(self, inputs):
 82 |         return self.sess.run(self.target_out, feed_dict={
 83 |             self.target_inputs: inputs
 84 |         })
 85 | 
 86 |     def update_target_network(self):
 87 |         self.sess.run(self.update_target_network_params)
 88 | 
 89 |     def get_num_trainable_vars(self):
 90 |         return self.num_trainable_vars
 91 | 
 92 | 
 93 | class CriticNetwork(object):
 94 |     """
 95 |     Input to the network is the state and action, output is Q(s,a).
 96 |     The action must be obtained from the output of the Actor network.
 97 | 
 98 |     """
 99 | 
100 |     def __init__(self, sess, state_dim, action_dim, learning_rate, tau, gamma, num_actor_vars):
101 |         self.sess = sess
102 |         self.s_dim = state_dim
103 |         self.a_dim = action_dim
104 |         self.learning_rate = learning_rate
105 |         self.tau = tau
106 |         self.gamma = gamma
107 | 
108 |         self.inputs, self.action, self.out = self.create_critic_network("critic_network")
109 |         self.network_params = tf.trainable_variables()[num_actor_vars:]
110 | 
111 |         self.target_inputs, self.target_action, self.target_out = self.create_critic_network("target_critic_network")
112 |         self.target_network_params = tf.trainable_variables()[(len(self.network_params) + num_actor_vars):]
113 | 
114 |         with tf.name_scope("critic_update_target_network_params"):
115 |             self.update_target_network_params = \
116 |                 [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau)
117 |                                                       + tf.multiply(self.target_network_params[i], 1. - self.tau))
118 |                  for i in range(len(self.target_network_params))]
119 | 
120 |         self.predicted_q_value = tf.placeholder(tf.float32, (None, NUM_AGENTS, 1), name="predicted_q_value")
121 | 
122 |         M = tf.to_float(tf.shape(self.out)[0])
123 |         # Li = (Yi - Qi)^2
124 |         # L = Sum(Li)
125 |         self.loss = tf.squeeze(1.0/M * tf.reduce_sum(tf.reduce_sum(tf.square(self.predicted_q_value - self.out), axis=1), axis=0), name="critic_loss")
126 | 
127 |         self.optimize = tf.train.AdamOptimizer(
128 |             self.learning_rate).minimize(self.loss)
129 | 
130 |         # self.action_grads = tf.gradients(self.out, self.action, name="action_grads")
131 |         self.action_grads = [tf.gradients(self.out[:, i], self.action) for i in range(NUM_AGENTS)]
132 |         self.action_grads = tf.stack(tf.squeeze(self.action_grads, 1))
133 | 
134 |     def create_critic_network(self, name):
135 |         inputs = tf.placeholder(tf.float32, shape=(None, NUM_AGENTS, VECTOR_OBS_LEN), name="critic_inputs")
136 |         action = tf.placeholder(tf.float32, shape=(None, NUM_AGENTS, OUTPUT_LEN), name="critic_action")
137 | 
138 |         out = CommNet.critic_build_network(name, inputs, action)
139 |         return inputs, action, out
140 | 
141 |     def train(self, inputs, action, predicted_q_value):
142 |         return self.sess.run([self.out, self.optimize, self.loss], feed_dict={
143 |             self.inputs: inputs,
144 |             self.action: action,
145 |             self.predicted_q_value: predicted_q_value
146 |         })
147 | 
148 |     def predict(self, inputs, action):
149 |         return self.sess.run(self.out, feed_dict={
150 |             self.inputs: inputs,
151 |             self.action: action
152 |         })
153 | 
154 |     def predict_target(self, inputs, action):
155 |         return self.sess.run(self.target_out, feed_dict={
156 |             self.target_inputs: inputs,
157 |             self.target_action: action
158 |         })
159 | 
160 |     def action_gradients(self, inputs, actions):
161 |         return self.sess.run(self.action_grads, feed_dict={
162 |             self.inputs: inputs,
163 |             self.action: actions
164 |         })
165 | 
166 |     def update_target_network(self):
167 |         self.sess.run(self.update_target_network_params)
168 | 
169 | 
170 | # ===========================
171 | #   Tensorflow Summary Ops
172 | # ===========================
173 | 
174 | def build_summaries():
175 |     episode_reward = tf.Variable(0., name="episode_reward")
176 |     tf.summary.scalar("Reward", episode_reward)
177 |     episode_ave_max_q = tf.Variable(0., name="episode_ave_max_q")
178 |     tf.summary.scalar("Qmax Value", episode_ave_max_q)
179 |     loss = tf.Variable(0., name="critic_loss")
180 |     tf.summary.scalar("Critic_loss", loss)
181 | 
182 |     summary_vars = [episode_reward, episode_ave_max_q, loss]
183 |     summary_ops = tf.summary.merge_all()
184 | 
185 |     return summary_ops, summary_vars
186 | 
187 | 
188 | # ===========================
189 | #   Agent Training
190 | # ===========================
191 | 
192 | def train(sess, env, args, actor, critic):
193 |     summary_ops, summary_vars = build_summaries()
194 | 
195 |     sess.run(tf.global_variables_initializer())
196 |     writer = tf.summary.FileWriter(args['summary_dir'] +  " actor_lr" + str(args['actor_lr']) + " critic_lr" + str(args["critic_lr"]), sess.graph)
197 | 
198 |     actor.update_target_network()
199 |     critic.update_target_network()
200 | 
201 |     replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed']))
202 | 
203 |     for i in range(int(args['max_episodes'])):
204 |         state = env.reset()
205 | 
206 |         ep_reward = 0
207 |         ep_ave_max_q = 0
208 | 
209 |         for j in range(int(args['max_episode_len'])):
210 |             action = actor.predict([state])[0]
211 | 
212 |             state2, reward, done, info = env.step(action)
213 |             replay_buffer.add(state, action, reward, done, state2)
214 | 
215 |             if replay_buffer.size() > int(args['minibatch_size']):
216 |                 s_batch, a_batch, r_batch, t_batch, s2_batch = \
217 |                     replay_buffer.sample_batch(int(args['minibatch_size']))
218 | 
219 |                 # TODO
220 |                 # Calculate targets
221 |                 # target_q = critic.predict_target(
222 |                 #     s2_batch, actor.predict_target(s2_batch))
223 | 
224 |                 target_q = tf.zeros((1))
225 | 
226 |                 # Update the critic given the targets
227 |                 predicted_q_value, _, loss = critic.train(s_batch, a_batch,
228 |                                                           np.reshape(r_batch, (int(args['minibatch_size']), NUM_AGENTS, 1)))
229 | 
230 |                 ep_ave_max_q += np.amax(predicted_q_value)
231 | 
232 |                 # Update the actor policy using the sampled gradient
233 |                 a_outs = actor.predict(s_batch)
234 |                 grads = critic.action_gradients(s_batch, a_outs)
235 |                 actor.train(s_batch, grads)
236 | 
237 |                 actor.update_target_network()
238 |                 critic.update_target_network()
239 | 
240 |                 replay_buffer.clear()
241 | 
242 |                 # Log
243 |                 summary_str = sess.run(summary_ops, feed_dict={
244 |                     summary_vars[0]: np.mean(r_batch),
245 |                     summary_vars[1]: ep_ave_max_q / float(j + 1),
246 |                     summary_vars[2]: loss
247 |                 })
248 | 
249 |                 writer.add_summary(summary_str, i)
250 |                 writer.flush()
251 | 
252 |                 print('| Reward: {:.4f} | Episode: {:d} | Qmax: {:.4f}'.format(np.mean(r_batch),
253 |                                                                                i, (ep_ave_max_q / float(j + 1))))
254 | 
255 |             state = state2
256 |             ep_reward += reward
257 | 
258 |             if done:
259 |                 break
260 | 
261 | 
262 | def main(args=None):
263 |     args = parse_arg(args or None)
264 | 
265 |     tf.reset_default_graph()
266 |     config = tf.ConfigProto()
267 |     config.gpu_options.allow_growth = True
268 |     with tf.Session(config=config) as sess:
269 |         env = GuessingSumEnv(NUM_AGENTS)
270 |         env.seed(0)
271 | 
272 |         np.random.seed(int(args['random_seed']))
273 |         tf.set_random_seed(int(args['random_seed']))
274 |         env.seed(int(args['random_seed']))
275 | 
276 |         state_dim = (NUM_AGENTS, VECTOR_OBS_LEN)
277 |         action_dim = (NUM_AGENTS, OUTPUT_LEN)
278 | 
279 |         actor = ActorNetwork(sess, state_dim, action_dim,
280 |                              float(args['actor_lr']), float(args['tau']),
281 |                              int(args['minibatch_size']))
282 | 
283 |         critic = CriticNetwork(sess, state_dim, action_dim,
284 |                                float(args['critic_lr']), float(args['tau']),
285 |                                float(args['gamma']),
286 |                                actor.get_num_trainable_vars())
287 | 
288 |         train(sess, env, args, actor, critic)
289 | 
290 | 
291 | def parse_arg(args=None):
292 |     parser = argparse.ArgumentParser(description='provide arguments for DDPG agent')
293 | 
294 |     # agent parameters
295 |     parser.add_argument('--actor-lr', help='actor network learning rate', default=0.1)
296 |     parser.add_argument('--critic-lr', help='critic network learning rate', default=0.1)
297 |     parser.add_argument('--gamma', help='discount factor for critic updates', default=0.99)
298 |     parser.add_argument('--tau', help='soft target update parameter', default=0.001)
299 |     parser.add_argument('--buffer-size', help='max size of the replay buffer', default=1000000)
300 |     parser.add_argument('--minibatch-size', help='size of minibatch for minibatch-SGD', default=1024)
301 | 
302 |     # run parameters
303 |     parser.add_argument('--random-seed', help='random seed for repeatability', default=1234)
304 |     parser.add_argument('--max-episodes', help='max num of episodes to do while training', default=9999999999999)
305 |     parser.add_argument('--max-episode-len', help='max length of 1 episode', default=1000)
306 |     parser.add_argument('--summary-dir', help='directory for storing tensorboard info',
307 |                         default="summaries/" + datetime.now().strftime('%d-%m-%y %H%M'))
308 | 
309 |     if args is not None:
310 |         args = vars(parser.parse_args(args))
311 |     else:
312 |         args = vars(parser.parse_args())
313 | 
314 |     pp.pprint(args)
315 | 
316 |     return args
317 | 
318 | 
319 | if __name__ == '__main__':
320 |     main()
321 | 


--------------------------------------------------------------------------------
/train_comm_net.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Implementation of DDPG - Deep Deterministic Policy Gradient https://github.com/pemami4911/deep-rl
  3 | Modified by Coac for CommNet implementation https://github.com/Coac/CommNet-BiCnet
  4 | """
  5 | import argparse
  6 | import pprint as pp
  7 | from datetime import datetime
  8 | 
  9 | import numpy as np
 10 | import tensorflow as tf
 11 | from comm_net import CommNet
 12 | # from bicnet import BiCNet as CommNet
 13 | from guessing_sum_env import *
 14 | from replay_buffer import ReplayBuffer
 15 | 
 16 | HIDDEN_VECTOR_LEN = 1
 17 | NUM_AGENTS = 2
 18 | VECTOR_OBS_LEN = 1
 19 | OUTPUT_LEN = 1
 20 | 
 21 | 
 22 | # ===========================
 23 | #   Actor and Critic DNNs
 24 | # ===========================
 25 | 
 26 | class ActorNetwork(object):
 27 |     def __init__(self, sess, state_dim, action_dim, learning_rate, tau, batch_size):
 28 |         self.sess = sess
 29 |         self.s_dim = state_dim
 30 |         self.a_dim = action_dim
 31 |         self.learning_rate = learning_rate
 32 |         self.tau = tau
 33 |         self.batch_size = batch_size
 34 | 
 35 |         self.inputs, self.out = self.create_actor_network("actor_network")
 36 |         self.network_params = tf.trainable_variables()
 37 | 
 38 |         self.target_inputs, self.target_out = self.create_actor_network("target_actor_network")
 39 |         self.target_network_params = tf.trainable_variables()[
 40 |                                      len(self.network_params):]
 41 | 
 42 |         with tf.name_scope("actor_update_target_network_params"):
 43 |             self.update_target_network_params = \
 44 |                 [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) +
 45 |                                                       tf.multiply(self.target_network_params[i], 1. - self.tau))
 46 |                  for i in range(len(self.target_network_params))]
 47 | 
 48 |         self.action_gradient = tf.placeholder(tf.float32, (None, self.a_dim[0], self.a_dim[1]), name="action_gradient")
 49 | 
 50 |         with tf.name_scope("actor_gradients"):
 51 |             self.unnormalized_actor_gradients = tf.gradients(self.out, self.network_params, -self.action_gradient)
 52 |             self.actor_gradients = list(map(lambda x: tf.div(x, self.batch_size), self.unnormalized_actor_gradients))
 53 | 
 54 |         self.optimize = tf.train.AdamOptimizer(self.learning_rate)
 55 |         self.optimize = self.optimize.apply_gradients(zip(self.actor_gradients, self.network_params))
 56 | 
 57 |         self.num_trainable_vars = len(self.network_params) + len(self.target_network_params)
 58 | 
 59 |     def create_actor_network(self, name):
 60 |         inputs = tf.placeholder(tf.float32, shape=(None, NUM_AGENTS, VECTOR_OBS_LEN), name="actor_inputs")
 61 |         out = CommNet.actor_build_network(name, inputs)
 62 |         return inputs, out
 63 | 
 64 |     def train(self, inputs, action_gradient):
 65 |         self.sess.run(self.optimize, feed_dict={
 66 |             self.inputs: inputs,
 67 |             self.action_gradient: action_gradient
 68 |         })
 69 | 
 70 |     def predict(self, inputs):
 71 |         return self.sess.run(self.out, feed_dict={
 72 |             self.inputs: inputs
 73 |         })
 74 | 
 75 |     def predict_target(self, inputs):
 76 |         return self.sess.run(self.target_out, feed_dict={
 77 |             self.target_inputs: inputs
 78 |         })
 79 | 
 80 |     def update_target_network(self):
 81 |         self.sess.run(self.update_target_network_params)
 82 | 
 83 |     def get_num_trainable_vars(self):
 84 |         return self.num_trainable_vars
 85 | 
 86 | 
 87 | class CriticNetwork(object):
 88 |     """
 89 |     Input to the network is the state and action, output is Q(s,a).
 90 |     The action must be obtained from the output of the Actor network.
 91 | 
 92 |     """
 93 | 
 94 |     def __init__(self, sess, state_dim, action_dim, learning_rate, tau, gamma, num_actor_vars):
 95 |         self.sess = sess
 96 |         self.s_dim = state_dim
 97 |         self.a_dim = action_dim
 98 |         self.learning_rate = learning_rate
 99 |         self.tau = tau
100 |         self.gamma = gamma
101 | 
102 |         self.inputs, self.action, self.out = self.create_critic_network("critic_network")
103 |         self.network_params = tf.trainable_variables()[num_actor_vars:]
104 | 
105 |         self.target_inputs, self.target_action, self.target_out = self.create_critic_network("target_critic_network")
106 |         self.target_network_params = tf.trainable_variables()[(len(self.network_params) + num_actor_vars):]
107 | 
108 |         with tf.name_scope("critic_update_target_network_params"):
109 |             self.update_target_network_params = \
110 |                 [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau)
111 |                                                       + tf.multiply(self.target_network_params[i], 1. - self.tau))
112 |                  for i in range(len(self.target_network_params))]
113 | 
114 |         self.predicted_q_value = tf.placeholder(tf.float32, (None, 1), name="predicted_q_value")
115 | 
116 |         self.loss = tf.losses.mean_squared_error(self.predicted_q_value, self.out)
117 | 
118 |         self.optimize = tf.train.AdamOptimizer(
119 |             self.learning_rate).minimize(self.loss)
120 | 
121 |         self.action_grads = tf.gradients(self.out, self.action, name="action_grads")
122 | 
123 |     def create_critic_network(self, name):
124 |         inputs = tf.placeholder(tf.float32, shape=(None, NUM_AGENTS, VECTOR_OBS_LEN), name="critic_inputs")
125 |         action = tf.placeholder(tf.float32, shape=(None, NUM_AGENTS, OUTPUT_LEN), name="critic_action")
126 | 
127 |         out = CommNet.critic_build_network(name, inputs, action)
128 |         return inputs, action, out
129 | 
130 |     def train(self, inputs, action, predicted_q_value):
131 |         return self.sess.run([self.out, self.optimize, self.loss], feed_dict={
132 |             self.inputs: inputs,
133 |             self.action: action,
134 |             self.predicted_q_value: predicted_q_value
135 |         })
136 | 
137 |     def predict(self, inputs, action):
138 |         return self.sess.run(self.out, feed_dict={
139 |             self.inputs: inputs,
140 |             self.action: action
141 |         })
142 | 
143 |     def predict_target(self, inputs, action):
144 |         return self.sess.run(self.target_out, feed_dict={
145 |             self.target_inputs: inputs,
146 |             self.target_action: action
147 |         })
148 | 
149 |     def action_gradients(self, inputs, actions):
150 |         return self.sess.run(self.action_grads, feed_dict={
151 |             self.inputs: inputs,
152 |             self.action: actions
153 |         })
154 | 
155 |     def update_target_network(self):
156 |         self.sess.run(self.update_target_network_params)
157 | 
158 | 
159 | # ===========================
160 | #   Tensorflow Summary Ops
161 | # ===========================
162 | 
163 | def build_summaries():
164 |     episode_reward = tf.Variable(0., name="episode_reward")
165 |     tf.summary.scalar("Reward", episode_reward)
166 |     episode_ave_max_q = tf.Variable(0., name="episode_ave_max_q")
167 |     tf.summary.scalar("Qmax Value", episode_ave_max_q)
168 |     loss = tf.Variable(0., name="critic_loss")
169 |     tf.summary.scalar("Critic_loss", loss)
170 | 
171 |     summary_vars = [episode_reward, episode_ave_max_q, loss]
172 |     summary_ops = tf.summary.merge_all()
173 | 
174 |     return summary_ops, summary_vars
175 | 
176 | 
177 | # ===========================
178 | #   Agent Training
179 | # ===========================
180 | 
181 | def train(sess, env, args, actor, critic):
182 |     summary_ops, summary_vars = build_summaries()
183 | 
184 |     sess.run(tf.global_variables_initializer())
185 |     writer = tf.summary.FileWriter(args['summary_dir'] +  " actor_lr" + str(args['actor_lr']) + " critic_lr" + str(args["critic_lr"]), sess.graph)
186 | 
187 |     actor.update_target_network()
188 |     critic.update_target_network()
189 | 
190 |     replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed']))
191 | 
192 |     for i in range(int(args['max_episodes'])):
193 |         state = env.reset()
194 | 
195 |         ep_reward = 0
196 |         ep_ave_max_q = 0
197 | 
198 |         for j in range(int(args['max_episode_len'])):
199 |             action = actor.predict([state])[0]
200 | 
201 |             state2, reward, done, info = env.step(action)
202 |             reward = np.sum(reward) / NUM_AGENTS
203 | 
204 |             replay_buffer.add(state, action, reward, done, state2)
205 | 
206 |             if replay_buffer.size() > int(args['minibatch_size']):
207 |                 s_batch, a_batch, r_batch, t_batch, s2_batch = \
208 |                     replay_buffer.sample_batch(int(args['minibatch_size']))
209 | 
210 |                 # TODO
211 |                 # Calculate targets
212 |                 # target_q = critic.predict_target(
213 |                 #     s2_batch, actor.predict_target(s2_batch))
214 | 
215 |                 target_q = tf.zeros((1))
216 | 
217 |                 # Update the critic given the targets
218 |                 predicted_q_value, _, loss = critic.train(s_batch, a_batch,
219 |                                                           np.reshape(r_batch, (int(args['minibatch_size']), 1)))
220 | 
221 |                 ep_ave_max_q += np.amax(predicted_q_value)
222 | 
223 |                 # Update the actor policy using the sampled gradient
224 |                 a_outs = actor.predict(s_batch)
225 |                 grads = critic.action_gradients(s_batch, a_outs)
226 |                 actor.train(s_batch, grads[0])
227 | 
228 |                 actor.update_target_network()
229 |                 critic.update_target_network()
230 | 
231 |                 replay_buffer.clear()
232 | 
233 |                 # Log
234 |                 summary_str = sess.run(summary_ops, feed_dict={
235 |                     summary_vars[0]: np.mean(r_batch),
236 |                     summary_vars[1]: ep_ave_max_q / float(j + 1),
237 |                     summary_vars[2]: loss
238 |                 })
239 | 
240 |                 writer.add_summary(summary_str, i)
241 |                 writer.flush()
242 | 
243 |                 print('| Reward: {:.4f} | Episode: {:d} | Qmax: {:.4f}'.format(np.mean(r_batch),
244 |                                                                                i, (ep_ave_max_q / float(j + 1))))
245 | 
246 |             state = state2
247 |             ep_reward += reward
248 | 
249 |             if done:
250 |                 break
251 | 
252 | 
253 | def main(args=None):
254 |     args = parse_arg(args or None)
255 | 
256 |     tf.reset_default_graph()
257 |     config = tf.ConfigProto()
258 |     config.gpu_options.allow_growth = True
259 |     with tf.Session(config=config) as sess:
260 |         env = GuessingSumEnv(NUM_AGENTS)
261 |         env.seed(0)
262 | 
263 |         np.random.seed(int(args['random_seed']))
264 |         tf.set_random_seed(int(args['random_seed']))
265 |         env.seed(int(args['random_seed']))
266 | 
267 |         state_dim = (NUM_AGENTS, VECTOR_OBS_LEN)
268 |         action_dim = (NUM_AGENTS, OUTPUT_LEN)
269 | 
270 |         actor = ActorNetwork(sess, state_dim, action_dim,
271 |                              float(args['actor_lr']), float(args['tau']),
272 |                              int(args['minibatch_size']))
273 | 
274 |         critic = CriticNetwork(sess, state_dim, action_dim,
275 |                                float(args['critic_lr']), float(args['tau']),
276 |                                float(args['gamma']),
277 |                                actor.get_num_trainable_vars())
278 | 
279 |         train(sess, env, args, actor, critic)
280 | 
281 | 
282 | def parse_arg(args=None):
283 |     parser = argparse.ArgumentParser(description='provide arguments for DDPG agent')
284 | 
285 |     # agent parameters
286 |     parser.add_argument('--actor-lr', help='actor network learning rate', default=0.1)
287 |     parser.add_argument('--critic-lr', help='critic network learning rate', default=0.1)
288 |     parser.add_argument('--gamma', help='discount factor for critic updates', default=0.99)
289 |     parser.add_argument('--tau', help='soft target update parameter', default=0.001)
290 |     parser.add_argument('--buffer-size', help='max size of the replay buffer', default=1000000)
291 |     parser.add_argument('--minibatch-size', help='size of minibatch for minibatch-SGD', default=1024)
292 | 
293 |     # run parameters
294 |     parser.add_argument('--random-seed', help='random seed for repeatability', default=1234)
295 |     parser.add_argument('--max-episodes', help='max num of episodes to do while training', default=9999999999999)
296 |     parser.add_argument('--max-episode-len', help='max length of 1 episode', default=1000)
297 |     parser.add_argument('--summary-dir', help='directory for storing tensorboard info',
298 |                         default="summaries/" + datetime.now().strftime('%d-%m-%y %H%M'))
299 | 
300 |     if args is not None:
301 |         args = vars(parser.parse_args(args))
302 |     else:
303 |         args = vars(parser.parse_args())
304 | 
305 |     pp.pprint(args)
306 | 
307 |     return args
308 | 
309 | 
310 | if __name__ == '__main__':
311 |     main()
312 | 


--------------------------------------------------------------------------------