├── .gitignore ├── .gitmodules ├── Actor.py ├── Critic.py ├── FCNN.py ├── README.md ├── env.py ├── maddpg.py ├── memory.py ├── noise.py ├── rollout.py ├── train.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | __*/ 2 | *.ipynb 3 | *.pyc 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "PointEnvironment"] 2 | path = PointEnvironment 3 | url = https://github.com/aarg-kcis/PointEnvironment 4 | -------------------------------------------------------------------------------- /Actor.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from tensorflow import multiply as mul 4 | 5 | from tensorflow.train import AdamOptimizer as Adam 6 | from tensorflow.initializers import truncated_normal as TN 7 | 8 | from FCNN import FCNN 9 | 10 | 11 | class Actor: 12 | def __init__(self, sess, input_t, _id, **params): 13 | self.session = sess 14 | self._id = _id 15 | self.__dict__.update(params) 16 | self.__dict__.update(input_t) 17 | self.obs = input_t["o{}".format(_id)] 18 | self.u = input_t["a{}".format(_id)] 19 | self.generate_networks() 20 | self.define_operations() 21 | 22 | def generate_networks(self, load_from_ckpt=False): 23 | pi_input = tf.concat([self.obs, self.g], axis=1) 24 | # MAIN ACTOR NETWORK 25 | self.pi = FCNN(pi_input, self.u.shape[-1], self.n_layers, self.n_units, 26 | tf.nn.relu, tf.nn.tanh, name="pi_{}".format(self._id), 27 | w_init=TN(stddev=1e-1)) 28 | # TARGET ACTOR NETWORK 29 | self.PI = FCNN(pi_input, self.u.shape[-1], self.n_layers, self.n_units, 30 | tf.nn.relu, tf.nn.tanh, name="t_pi_{}".format(self._id)) 31 | 32 | def define_operations(self): 33 | with tf.name_scope("actor_ops"): 34 | # GRADIENT OF ACTIONS WRT ACTOR PARAMS TIMES NEGATIVE GRADIENT OF 35 | # VALUE FUNCTION WRT ACTIONS 36 | grads = tf.gradients(self.pi.nn, self.pi.net_params, -self.dqdu) 37 | # APPLY GRADIENTS TO ACTOR NETWORK 38 | self.optimize = Adam(self.lr, name="pi_adam")\ 39 | .apply_gradients(zip(grads, self.pi.net_params)) 40 | # UPDATE TARGET OP 41 | net_param_pairs = zip(self.pi.net_params, self.PI.net_params) 42 | with tf.name_scope("update_target_pi"): 43 | self.updt_PI = [j.assign(mul(self.tau, i)+mul((1-self.tau), j)) 44 | for i, j in net_param_pairs] 45 | 46 | def predict(self, obs, g): 47 | return self.session.run(self.pi.nn, 48 | feed_dict={self.obs: obs, self.g: g}) 49 | 50 | def predict_target(self, obs, g): 51 | return self.session.run(self.PI.nn, 52 | feed_dict={self.obs: obs, self.g: g}) 53 | 54 | def train(self, obs, g, dqdu): 55 | return self.session.run(self.optimize, 56 | feed_dict={self.obs: obs, self.g: g, 57 | self.dqdu: dqdu}) 58 | 59 | def update_target(self): 60 | self.session.run(self.updt_PI) 61 | -------------------------------------------------------------------------------- /Critic.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from tensorflow import square as sq 4 | from tensorflow import multiply as mul 5 | from tensorflow import reduce_mean as rmean 6 | 7 | from tensorflow.train import AdamOptimizer as Adam 8 | 9 | from FCNN import FCNN 10 | 11 | 12 | class Critic: 13 | def __init__(self, sess, input_t, **params): 14 | self.session = sess 15 | self.__dict__.update(params) 16 | self.__dict__.update(input_t) 17 | 
self.generate_networks() 18 | self.define_operations() 19 | 20 | def generate_networks(self): 21 | q_input = tf.concat([self.x, self.g, self.a1, self.a2, self.a3], axis=1) 22 | # MAIN CRITIC NETWORK 23 | self.q = FCNN(q_input, 1, self.n_layers, self.n_units, 24 | tf.nn.relu, name="q") 25 | # TARGET CRITIC NETWORK 26 | self.Q = FCNN(q_input, 1, self.n_layers, self.n_units, 27 | tf.nn.relu, name="t_q") 28 | 29 | def define_operations(self): 30 | with tf.name_scope("critic_ops"): 31 | # LOSS 32 | loss = tf.sqrt(rmean(sq(self.p - self.q.nn))) 33 | # MINIMIZE LOSS OP 34 | self.minimize = Adam(self.lr, name="q_adam")\ 35 | .minimize(loss, var_list=self.q.net_params) 36 | # ACTION GRADIENTS 37 | self.a_grads1 = tf.gradients(self.q.nn, self.a1, name="dq_da1") 38 | self.a_grads2 = tf.gradients(self.q.nn, self.a2, name="dq_da2") 39 | self.a_grads3 = tf.gradients(self.q.nn, self.a3, name="dq_da3") 40 | self.a_grad_ops = [self.a_grads1, self.a_grads2, self.a_grads3] 41 | # UPDATE TARGET OP 42 | net_param_pairs = zip(self.q.net_params, self.Q.net_params) 43 | with tf.name_scope("update_target_q"): 44 | self.updt_Q = [j.assign(mul(self.tau, i)+mul((1-self.tau), j)) 45 | for i, j in net_param_pairs] 46 | 47 | def predict(self, x, g, a1, a2, a3): 48 | feed_vals = {self.x: x, self.g: g, self.a1: a1, 49 | self.a2: a2, self.a3: a3} 50 | return self.session.run(self.q.nn, feed_dict=feed_vals) 51 | 52 | def predict_target(self, x, g, a1, a2, a3): 53 | feed_vals = {self.x: x, self.g: g, self.a1: a1, 54 | self.a2: a2, self.a3: a3} 55 | return self.session.run(self.Q.nn, feed_dict=feed_vals) 56 | 57 | def train(self, x, g, a1, a2, a3, p): 58 | feed_vals = {self.x: x, self.g: g, self.a1: a1, 59 | self.a2: a2, self.a3: a3, self.p: p} 60 | return self.session.run([self.q.nn, self.minimize], feed_dict=feed_vals) 61 | 62 | def get_action_grads(self, x, g, a1, a2, a3): 63 | feed_vals = {self.x: x, self.g: g, self.a1: a1, 64 | self.a2: a2, self.a3: a3} 65 | r = self.session.run(self.a_grad_ops, feed_dict=feed_vals) 66 | 67 | return r 68 | 69 | def update_target(self): 70 | self.session.run(self.updt_Q) 71 | -------------------------------------------------------------------------------- /FCNN.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow import GraphKeys 3 | from tensorflow.layers import dense 4 | from tensorflow.initializers import truncated_normal as TN 5 | TVARS = GraphKeys.TRAINABLE_VARIABLES 6 | 7 | 8 | class FCNN: 9 | def __init__(self, _input, op_dim, n_layers, n_units, activation, 10 | op_act=None, name="FCNN", w_init=None, from_ckpt=False): 11 | print("CREATING NETWORK NAMED {}".format(name)) 12 | self._input = _input 13 | self.op_dim = op_dim 14 | self.n_layers = n_layers 15 | self.n_units = n_units 16 | self.activation = activation 17 | self.scope = name 18 | self.op_act = op_act 19 | self.w_init = w_init if w_init is not None else TN(stddev=1e-1) 20 | self.make() 21 | 22 | def make(self): 23 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE): 24 | _input = self._input 25 | for i in range(0, self.n_layers-1): 26 | op = dense(_input, self.n_units, 27 | kernel_initializer=self.w_init, 28 | name="layer_{}".format(i)) 29 | _input = self.activation(op) 30 | op = dense(_input, self.op_dim, 31 | kernel_initializer=self.w_init, 32 | name="layer_{}".format(i+1)) 33 | self.nn = op if self.op_act is None else self.op_act(op) 34 | self.net_params = tf.get_collection(TVARS, scope=self.scope) 35 | for i in self.net_params: 36 | 
tf.summary.histogram(i.name.replace(":", "_"), i) 37 | 38 | def __call__(self, sess, inputs): 39 | return sess.run(self.nn, inputs) 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Formation Control Using MARL 2 | 3 | [Video of the output](https://www.youtube.com/watch?v=_w3tvqFUYxA) 4 | 5 | ## Some notes about this project 6 | 7 | This project aims to train agents that cooperatively learn a policy to move to 8 | a goal position while staying in formation. 9 | 10 | The rewards are sparse and twofold. 11 | - `R_form`: agents receive this reward when the desired formation is achieved. 12 | - `R_goal`: agents receive this reward when they have moved to the desired 13 | goal. This reward is only given when the agents reach the goal position 14 | in the desired formation. 15 | 16 | ### The Environment 17 | 18 | - The environment inherits from the classic control gym envs. 19 | - The environment is fully observable by the critic but only partially 20 | observable by the agents. 21 | - A function converts the state of the environment into the observations of 22 | the individual agents. 23 | 24 | #### State 25 | The state of the environment is a vector containing the 26 | following. It is observed only by the central critic. 27 | 28 | 29 | 1. Position vectors of the agents from the centroid of the current formation 30 | 2. Headings of the individual agents, represented as unit vectors 31 | (these are the same as the world-frame headings) 32 | 33 | #### Agent Observations 34 | This is a dictionary of vectors representing the state observed by each 35 | agent in its own coordinate frame. 36 | 37 | #### Goal 38 | A triangle is sampled as the goal formation. 39 | The goal is given as the relative position vectors of the agents from the 40 | centroid of the sampled goal formation. 41 | 42 | 43 | Each episode has a goal: a random formation specified by the side lengths of a triangle.
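As a concrete illustration of the goal described above, this is roughly how a goal formation is sampled (a condensed sketch of `get_form_goal` in `env.py`; the standalone helper name `sample_goal_formation` is used here only for illustration):

```python
import numpy as np

def sample_goal_formation():
    # Sample three side lengths in [0.7, 1.5) and resample until they
    # satisfy the triangle inequality.
    while True:
        a, b, c = sorted(np.random.random(3) * 0.8 + 0.7)
        if a + b > c:
            break
    # Place one side on the x-axis and recover the third vertex from the
    # side lengths.
    h = (c**2 - b**2 + a**2) / (2 * a)
    vertices = np.array([[0, 0], [a, 0], [h, (c**2 - h**2) ** 0.5]])
    # The goal is each vertex expressed relative to the formation centroid,
    # flattened into a single 6-dimensional vector.
    return (vertices - vertices.mean(axis=0)).ravel()
```

`compute_reward` then compares the sorted pairwise distances between the agents against the sampled side lengths (within a tolerance of 0.15 in world units) to decide whether `R_form` has been earned.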
44 | 45 | The main training algorithm uses DDPG + HER. 46 | 47 | 48 | 49 | ## To do list: 50 | - [ ] Make multiple agents 51 | - [x] Central critic 52 | - [ ] HER transitions 53 | - [ ] Rollouts 54 | - [ ] Training code 55 | - [ ] Update code for maddpg 56 | - [ ] Exploration through critic 57 | 58 | ## Setting up 59 | -------------------------------------------------------------------------------- /env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym.spaces import Box 3 | from gym.envs.registration import register 4 | from gym.envs.classic_control import rendering 5 | 6 | import numpy as np 7 | from numpy.linalg import norm 8 | from numpy.random import random 9 | 10 | from PointEnvironment.Pose import Pose 11 | from PointEnvironment.Agent import Agent 12 | 13 | from util import error, log 14 | 15 | class FormEnv(gym.Env): 16 | cossin = staticmethod(lambda x: np.array([np.cos(x), np.sin(x)])) 17 | 18 | def __init__(self, config=None): 19 | self.configure_defaults() 20 | if config is not None: 21 | self.__dict__.update(config) 22 | self.goal = None 23 | self.viewer = None 24 | 25 | def configure_defaults(self): 26 | self.dt = 1e-2 27 | self.num_iter = 50 28 | self.max_episode_steps = 25 29 | self.step_penalty = 1.0 30 | self.max_reward = 2.0 31 | self.action_low = np.array([0.0, -np.pi/4]) 32 | self.action_high = np.array([0.4, np.pi/4]) 33 | self.action_space = Box(self.action_low, self.action_high, dtype="f") 34 | self.w_limits = np.array([10, 10]) 35 | self.s_limits = np.array([600, 600]) 36 | self.scale = self.s_limits/self.w_limits 37 | self.scale = self.scale[0] 38 | self.agent_radius = 0.15 # in meters 39 | self.agents = [Agent(i) for i in range(3)] 40 | 41 | def init_viewer(self): 42 | self.viewer = rendering.Viewer(*self.s_limits) 43 | lx = [rendering.Line((0, pt), (self.s_limits[1], pt)) for pt in 44 | np.arange(0, self.s_limits[0], self.scale)] 45 | ly = [rendering.Line((pt, 0), (pt, self.s_limits[0])) for pt in 46 | np.arange(0, self.s_limits[1], self.scale)] 47 | [self.viewer.add_geom(i) for i in lx+ly] 48 | # GOAL MARKER 49 | circle = rendering.make_circle(radius=0.15*self.scale) 50 | circle.set_color(0.3, 0.82, 0.215) 51 | self.goal_tf = rendering.Transform() 52 | circle.add_attr(self.goal_tf) 53 | self.viewer.add_geom(circle) 54 | # AGENT MARKERS 55 | self.agent_tfs = [] 56 | a_rad_px = self.agent_radius * self.scale 57 | verx = [a_rad_px*FormEnv.cossin(np.radians(i)) for i in [0, 140, -140]] 58 | for i in self.agents: 59 | agent = rendering.FilledPolygon([tuple(j) for j in verx]) 60 | agent.set_color(0.15, 0.235, 0.459) 61 | agent_tf = rendering.Transform() 62 | agent.add_attr(agent_tf) 63 | self.agent_tfs.append(agent_tf) 64 | self.viewer.add_geom(agent) 65 | # CENTROID MARKER 66 | circle = rendering.make_circle(radius=0.05*self.scale) 67 | circle.set_color(0.9, 0.3, 0.23) 68 | self.centroid_tf = rendering.Transform() 69 | circle.add_attr(self.centroid_tf) 70 | self.viewer.add_geom(circle) 71 | 72 | def get_form_goal(self): 73 | sides = sorted(np.random.random(3)*0.8 + 0.7) 74 | if np.array([np.sum(sides) - 2*x > 0 for x in sides]).all(): 75 | self.goal_sides = sides 76 | a, b, c = sides 77 | coordinates = [[0, 0], [a, 0]] 78 | h = (c**2 - b**2 + a**2)/(2*a) 79 | coordinates.append([h, (c**2 - h**2)**.5]) 80 | centroid = np.mean(coordinates, axis=0) 81 | return np.hstack([i - centroid for i in coordinates]) 82 | return self.get_form_goal() 83 | 84 | def sample_pose(self, limits=None): 85 | if 
limits is None: 86 | x, y = random(2)*self.w_limits - self.w_limits/2 87 | else: 88 | x, y = random(2)*limits - limits/2 89 | theta = (random()*2 - 1)*np.pi 90 | return Pose(x=x, y=y, t=theta) 91 | 92 | def reset(self): 93 | poses = self.get_form_goal().reshape((3,-1)) 94 | # log.out(poses) 95 | [a.reset(Pose(*poses[i])) for i, a in enumerate(self.agents)] 96 | self.goal = self.get_form_goal() 97 | self.goal_changed = True 98 | return self.compute_obs() 99 | 100 | def render(self, mode='human'): 101 | if self.goal is None: 102 | return None 103 | if self.viewer is None: 104 | self.init_viewer() 105 | for agent, agent_tf in zip(self.agents, self.agent_tfs): 106 | agent_tf.set_translation(*(agent.pose.tolist()[:-1] + 107 | self.w_limits//2)*self.scale) 108 | agent_tf.set_rotation(agent.pose.theta) 109 | centroid = np.mean([a.pose.tolist()[:-1] for a in self.agents], 0) 110 | self.centroid_tf.set_translation(*(centroid+self.w_limits//2) 111 | * self.scale) 112 | return self.viewer.render(return_rgb_array=mode == 'rgb_array') 113 | 114 | def compute_obs(self): 115 | obs = {j.id: np.hstack([np.hstack(j.pose.getPoseInFrame(i.pose)) 116 | for i in self.agents if i.id != j.id]) for j in self.agents} 117 | f_c = np.mean([a.pose.tolist()[:-1] for a in self.agents], axis=0) 118 | cst = np.hstack([a.pose.tolist()[:-1] - f_c for a in self.agents]) 119 | hed = np.hstack(FormEnv.cossin(a.pose.theta) for a in self.agents) 120 | return obs, cst, hed 121 | 122 | def step(self, actions): 123 | # print(actions.values()) 124 | assert self.goal is not None 125 | for agent_id, action in actions.items(): 126 | [self.agents[agent_id].step(action) for _ in range(self.num_iter)] 127 | new_obs = self.compute_obs() 128 | return (*new_obs, *self.compute_reward()) 129 | 130 | def compute_reward(self): 131 | reward, done = -self.step_penalty, False 132 | sides = sorted([Pose.dist(i.pose, j.pose) for i in self.agents 133 | for j in self.agents if i.id < j.id]) 134 | # log.out(sides) 135 | if (np.abs(sides - np.array(self.goal_sides)) < 0.15).all(): 136 | error.out("\nHURRAY\n{}\n{}\n{}\n{}\n".format(self.goal_sides, sides, [i.pose for i in self.agents], sides - np.array(self.goal_sides))) 137 | reward, done = self.max_reward, True 138 | return reward, done, {"success": done} 139 | 140 | -------------------------------------------------------------------------------- /maddpg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | from Actor import Actor 6 | from noise import Noise 7 | from Critic import Critic 8 | from memory import Memory 9 | 10 | from util import error 11 | 12 | class MADDPG: 13 | 14 | def __init__(self, sess, scale_u, params): 15 | self.sess = sess 16 | self.scale_u = scale_u 17 | self.__dict__.update(params) 18 | # CREATE INPUT PLACEHOLDERS 19 | self.create_input_placeholders() 20 | # INITIALIZE ACTOR & CRITIC MODELS 21 | self.agents = [Actor(self.sess, self.inputs, i, **self.actor_params) 22 | for i in [1, 2, 3]] 23 | self.critic = Critic(self.sess, self.inputs, **self.critic_params) 24 | # INITIALIZE EXPLORATION MODEL 25 | self.noise_params = {k: np.fromstring(v, sep=",", dtype="f") 26 | for k, v in self.noise_params.items()} 27 | self.noise = [Noise(**self.noise_params) for _ in range(3)] 28 | # INITIALIZE REPLAY BUFFER 29 | self.memory = Memory(self.memory_size) 30 | # AVERAGE AGENT POLICIES 31 | avg_pi = [tf.reduce_mean(i, axis=0) for i in zip(*[x.pi.net_params for x in self.agents])] 32 | self.avg_op = 
[tf.assign(i, j) for x in self.agents for i, j in zip(x.pi.net_params, avg_pi)] 33 | 34 | def create_input_placeholders(self): 35 | self.inputs = {} 36 | ph = lambda s, n: tf.placeholder(tf.float32, shape=s, name=n) 37 | with tf.name_scope("inputs"): 38 | self.inputs["g"] = ph((None, 6), "goal") 39 | self.inputs["d"] = ph((None, 8), "done") 40 | self.inputs["p"] = ph((None, 1), "pred_q") 41 | self.inputs["r"] = ph((None, 8), "reward") 42 | self.inputs["x"] = ph((None, 12), "state") 43 | self.inputs["o1"] = ph((None, 8), "obs1") 44 | self.inputs["o2"] = ph((None, 8), "obs2") 45 | self.inputs["o3"] = ph((None, 8), "obs3") 46 | self.inputs["a1"] = ph((None, 2), "act1") 47 | self.inputs["a2"] = ph((None, 2), "act2") 48 | self.inputs["a3"] = ph((None, 2), "act3") 49 | self.inputs["dqdu"] = ph((None, 2), "dqdu") 50 | 51 | 52 | def step(self, obs, goal, state=None, explore=True): 53 | q = 0. 54 | sh = (1, -1) 55 | obs = {i: j.reshape(sh) for i, j in obs.items()} 56 | goal = goal.reshape(sh) 57 | state = state.reshape(sh) 58 | if explore: 59 | u = [x.predict(obs[i], goal) + self.noise[i]() 60 | for i, x in enumerate(self.agents)] 61 | # print(u) 62 | else: 63 | u = [x.predict_target(obs[i], goal) for i, x in enumerate(self.agents)] 64 | if state is not None: 65 | q = self.critic.predict_target(state, goal, *u) 66 | u = [x.reshape((1, -1)) for x in u] 67 | return [self.scale_u(x) for x in u], u, float(q) 68 | 69 | def remember(self, experience): 70 | self.memory.add(experience) 71 | 72 | def train(self): 73 | # check if the memory contains enough experiences 74 | if self.memory.size < 2*self.b_size: 75 | return 76 | x, g, o1, o2, o3, a1, a2, a3, r, d, x2, ag, o21, o22, o23 = self.get_batch() 77 | # print(a1) 78 | # HER TRANSACTIONS 79 | 80 | # her_idxs = np.where(np.random.random(self.b_size).reshape((-1, 1)) < 0.80) 81 | # g[her_idxs] = ag[her_idxs] 82 | # r[her_idxs] = 2. 
83 | # d[her_idxs] = True 84 | obs = [o1, o2, o3] 85 | n_o = [o21, o22, o23] 86 | n_u = [j.predict_target(n_o[i], g) for i, j in enumerate(self.agents)] 87 | n_q = self.critic.predict_target(x2, g, *n_u) 88 | t_q = r + self.gamma*n_q*(1 - d) 89 | self.critic.train(x, g, a1, a2, a3, t_q) 90 | grad = self.critic.get_action_grads(x, g, a1, a2, a3) 91 | [j.train(obs[i], g, grad[i][0]) for i, j in enumerate(self.agents)] 92 | # self.update_targets() 93 | 94 | def get_batch(self): 95 | batch = self.memory.sample(self.b_size) 96 | return [np.vstack([experience[j] for experience in batch]) for j in range(15)] 97 | 98 | def update_targets(self): 99 | self.critic.update_target() 100 | [x.update_target() for x in self.agents] 101 | self.sess.run(self.avg_op) 102 | -------------------------------------------------------------------------------- /memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import deque 3 | 4 | class Memory: 5 | def __init__(self, maxlen, seed=None): 6 | self.data = deque(maxlen=maxlen) 7 | if seed is not None: 8 | np.random.seed(seed) 9 | 10 | def add(self, *args): 11 | self.data.append(*args) 12 | 13 | def sample(self, num): 14 | if num > self.size: 15 | raise ValueError("Memory size: {}, but requested: {}" 16 | .format(self.size, num)) 17 | samples = np.random.choice(self.size, num, False) 18 | batches = [self.data[i] for i in samples] 19 | return batches 20 | 21 | @property 22 | def size(self): 23 | return len(self.data) 24 | -------------------------------------------------------------------------------- /noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from util import info 3 | 4 | class Noise(object): 5 | 6 | def __init__(self, delta, sigma, ou_a, ou_mu): 7 | # Noise parameters 8 | self.delta = delta 9 | self.sigma = sigma 10 | self.ou_a = ou_a 11 | self.ou_mu = ou_mu 12 | self.ou_lvl = np.zeros(self.ou_mu.shape) 13 | 14 | def brownian_motion_log_returns(self): 15 | sqrt_delta_sigma = np.sqrt(self.delta) * self.sigma 16 | return np.random.normal(loc=0, scale=sqrt_delta_sigma, size=None) 17 | 18 | def __call__(self): 19 | drift = self.ou_a * (self.ou_mu - self.ou_lvl) * self.delta 20 | randomness = self.brownian_motion_log_returns() 21 | self.ou_lvl += drift + randomness 22 | # info.out("{} {}".format(self.ou_lvl, id(self))) 23 | return self.ou_lvl 24 | -------------------------------------------------------------------------------- /rollout.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import tensorflow as tf 4 | from util import log, eva 5 | 6 | 7 | class RolloutGenerator: 8 | """ 9 | Class for generating a rollout of trajectory by the agent 10 | args: 11 | env: gym env 12 | agent: agent for performing rollout 13 | config: rollout configuration 14 | checkpoint(opt): perform rollout from a saved policy 15 | """ 16 | 17 | def __init__(self, env, agent, config: dict, _eval=False, summarize=False): 18 | self.env = env 19 | self.agent = agent 20 | self.eval = _eval 21 | self.best_score = 0. 
22 | self.__dict__.update(config) 23 | self.saver = tf.train.Saver() 24 | self.p_ckpt = "__checkpoints/{}_{}" 25 | self.name = ["*TRAINING*", "EVALUATION"][int(_eval)] 26 | if "periodic_ckpt" not in self.__dict__: 27 | self.periodic_ckpt = False 28 | if "save_best" not in self.__dict__: 29 | self.save_best = False 30 | self.reset() 31 | metrics = ["", "EPISODE.", "REWARD.", "TIMESTEPS.", "AVG_Q.", ""] 32 | self.logstr = "||".join(i.replace(".", ": {}") for i in metrics) 33 | self.logger = eva if self.eval else log 34 | self.logger.out("INITIALIZED {} ROLLOUT GENERATOR".format(self.name)) 35 | 36 | def reset(self): 37 | self.q_total = 0. 38 | self.r_total = 0. 39 | self.t_steps = 0 40 | self.episode = 0 41 | self.successes = 0 42 | 43 | def generate_rollout(self): 44 | t = 0 45 | done = False 46 | episodic_q = 0. 47 | episodic_r = 0. 48 | obs, xs, xh = self.env.reset() 49 | x = np.hstack([xs, xh]) 50 | g = self.env.goal 51 | while not done and t < self.env.max_episode_steps: 52 | a, u, q = self.agent.step(obs, g, x, (not self.eval)) 53 | # log.out(a) 54 | obs2, xs2, xh2, r, done, info = self.env.step(dict(enumerate(a))) 55 | x2 = np.hstack([xs2, xh2]) 56 | self.agent.remember([x, g, *obs.values(), *a, r, done, x2, xs2, *obs2.values()]) 57 | x = x2 58 | 59 | # Render if required 60 | if "render" in self.__dict__ and self.render: 61 | self.env.render() 62 | 63 | # Update stats 64 | t += 1 65 | episodic_r += float(r) 66 | episodic_q += float(q) 67 | 68 | # Train agent if required 69 | if not self.eval: 70 | [self.agent.train() for _ in range(self.train_cycles_per_ts)] 71 | self.agent.update_targets() 72 | else: 73 | if "step_sleep" in self.__dict__: 74 | time.sleep(self.step_sleep) 75 | self.episode += 1 76 | self.update_stats(episodic_q, episodic_r, t) 77 | self.successes += 1 if done else 0 78 | self.logger.out(self.logstr.format(self.episode, episodic_r, t, episodic_q/t)) 79 | self.create_checkpoint() 80 | 81 | def create_checkpoint(self): 82 | if self.periodic_ckpt and self.episode % self.periodic_ckpt == 0: 83 | log.out("Creating periodic checkpoint") 84 | self.saver.save(self.agent.sess, 85 | self.p_ckpt.format("P", self.episode)) 86 | if self.eval and self.done() and self.save_best and self.successes > self.best_score: 87 | log.out("New best score: {}".format(self.successes)) 88 | self.best_score = self.successes 89 | self.saver.save(self.agent.sess, 90 | self.p_ckpt.format("B", self.episode)) 91 | 92 | def update_stats(self, eps_q, eps_r, t): 93 | self.q_total += eps_q 94 | self.r_total += eps_r 95 | self.t_steps += t 96 | self.mean_eq = self.q_total/self.episode 97 | self.mean_er = self.r_total/self.episode 98 | 99 | 100 | def done(self): 101 | done = self.n_episodes <= self.episode 102 | if done and self.eval: 103 | print("\n") 104 | return done 105 | 106 | # def summarize(self): 107 | # if self.summarizer is None: 108 | # return 109 | # summarizer.value.add(tag="{}/") 110 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from env import FormEnv 5 | from maddpg import MADDPG 6 | from rollout import RolloutGenerator 7 | 8 | env = FormEnv() 9 | 10 | def scale_action_gen(env, u_min, u_max): 11 | def scale_action(u): 12 | u = np.clip(u, u_min, u_max) 13 | # print("clipped ", u) 14 | zo = (u - u_min)/(u_max - u_min) 15 | return zo * (env.action_high - env.action_low) + env.action_low 16 | return scale_action 
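# scale_action_gen returns a closure that clips the raw policy output to
# [u_min, u_max] and then linearly rescales it into the environment's action
# bounds (with the defaults in env.py: linear velocity in [0, 0.4] and
# angular velocity in [-pi/4, pi/4]).
# Rough usage sketch (values are illustrative only):
#   scale = scale_action_gen(env, -np.ones(2), np.ones(2))
#   scale(np.zeros(2))  # -> midpoint of the bounds, i.e. approx. [0.2, 0.0]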
17 | 18 | sess = tf.Session() 19 | actor_params = {"n_layers": 2, "n_units": 128, "tau": 0.01, "lr": 1e-4} 20 | critic_params = {"n_layers": 3, "n_units": 128, "tau": 0.01, "lr": 1e-3} 21 | noise_params = {"delta": "0.5,0.2", "sigma": "0.5,0.7", 22 | "ou_a": "0.6,0.6", "ou_mu": "0.5,0.0"} 23 | params = {"actor_params": actor_params, "b_size": 64, 24 | "critic_params": critic_params, "gamma": 0.99, 25 | "noise_params": noise_params, "memory_size": 50000} 26 | agent = MADDPG(sess, scale_action_gen(env, -np.ones(2), np.ones(2)), params) 27 | 28 | 29 | sess.run(tf.global_variables_initializer()) 30 | train_rollouts = RolloutGenerator(env, agent, {"render": 1, "n_episodes": 100000, "periodic_ckpt": 50, 31 | "train_cycles_per_ts": 10}) 32 | eval_rollouts = RolloutGenerator(env, agent, {"render": 1, "n_episodes": 20, "save_best": True}, True) 33 | 34 | while not train_rollouts.done(): 35 | train_rollouts.generate_rollout() 36 | if train_rollouts.episode % 20 == 0: 37 | eval_rollouts.reset() 38 | while not eval_rollouts.done(): 39 | eval_rollouts.generate_rollout() -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | class TextFormatter: 2 | 3 | COLORCODE = { 4 | 'k': 0, # black 5 | 'r': 1, # red 6 | 'g': 2, # green 7 | 'y': 3, # yellow 8 | 'b': 4, # blue 9 | 'm': 5, # magenta 10 | 'c': 6, # cyan 11 | 'w': 7 # white 12 | } 13 | 14 | STYLECODE = { 15 | 'b': 1, # bold 16 | 'f': 2, # faint 17 | 'i': 3, # italic 18 | 'u': 4, # underline 19 | 'x': 5, # blinking 20 | 'y': 6, # fast blinking 21 | 'r': 7, # reverse 22 | 'h': 8, # hide 23 | 's': 9, # strikethrough 24 | } 25 | 26 | # constructor 27 | def __init__(self, fg, bg=None, st=None): 28 | self.prop = {} 29 | self.prop["fg"] = 30 + self.COLORCODE[fg] 30 | self.prop["bg"] = 40 + self.COLORCODE[bg] if bg is not None else None 31 | self.prop["st"] = self.STYLECODE[st] if st is not None else None 32 | 33 | # formatting function 34 | def format(self, string): 35 | w = [self.prop['st'], self.prop['fg'], self.prop['bg']] 36 | w = [str(x) for x in w if x is not None] 37 | # return formatted string 38 | return '\x1b[%sm%s\x1b[0m' % (';'.join(w), string) if w else string 39 | 40 | # output formatted string 41 | def out(self, string): 42 | print(self.format(string)) 43 | 44 | error = TextFormatter("r", st="b") 45 | eva = TextFormatter("m") 46 | info = TextFormatter("m") 47 | log = TextFormatter("b", st="b") 48 | 49 | --------------------------------------------------------------------------------
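A note on the unchecked "HER transitions" item in the README: the relabeling is only stubbed out as comments in `maddpg.py`'s `train()`. Below is a minimal sketch of what that relabeling could look like, mirroring those commented-out lines rather than a full HER implementation (the helper name `her_relabel` is hypothetical; `relabel_prob=0.8` and `max_reward=2.0` are taken from the comments and from `FormEnv.configure_defaults`):

```python
import numpy as np

def her_relabel(g, ag, r, d, relabel_prob=0.8, max_reward=2.0):
    """Hindsight relabeling sketch: for a random subset of transitions,
    pretend the achieved formation `ag` was the intended goal `g`."""
    g, r, d = g.copy(), r.copy(), d.copy()
    idxs = np.where(np.random.random(len(g)) < relabel_prob)[0]
    g[idxs] = ag[idxs]      # substitute the achieved goal
    r[idxs] = max_reward    # reward the transition as a success
    d[idxs] = 1.0           # treat the relabeled transition as terminal
    return g, r, d
```

In `train()`, this would be applied to `g`, `ag`, `r`, and `d` right after `get_batch()`, before the target Q-value is computed.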