├── .gitignore ├── .gitmodules ├── Actor.py ├── Critic.py ├── FCNN.py ├── README.md ├── env.py ├── maddpg.py ├── memory.py ├── noise.py ├── rollout.py ├── train.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | __*/ 2 | *.ipynb 3 | *.pyc 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "PointEnvironment"] 2 | path = PointEnvironment 3 | url = https://github.com/aarg-kcis/PointEnvironment 4 | -------------------------------------------------------------------------------- /Actor.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from tensorflow import multiply as mul 4 | 5 | from tensorflow.train import AdamOptimizer as Adam 6 | from tensorflow.initializers import truncated_normal as TN 7 | 8 | from FCNN import FCNN 9 | 10 | 11 | class Actor: 12 | def __init__(self, sess, input_t, _id, **params): 13 | self.session = sess 14 | self._id = _id 15 | self.__dict__.update(params) 16 | self.__dict__.update(input_t) 17 | self.obs = input_t["o{}".format(_id)] 18 | self.u = input_t["a{}".format(_id)] 19 | self.generate_networks() 20 | self.define_operations() 21 | 22 | def generate_networks(self, load_from_ckpt=False): 23 | pi_input = tf.concat([self.obs, self.g], axis=1) 24 | # MAIN ACTOR NETWORK 25 | self.pi = FCNN(pi_input, self.u.shape[-1], self.n_layers, self.n_units, 26 | tf.nn.relu, tf.nn.tanh, name="pi_{}".format(self._id), 27 | w_init=TN(stddev=1e-1)) 28 | # TARGET ACTOR NETWORK 29 | self.PI = FCNN(pi_input, self.u.shape[-1], self.n_layers, self.n_units, 30 | tf.nn.relu, tf.nn.tanh, name="t_pi_{}".format(self._id)) 31 | 32 | def define_operations(self): 33 | with tf.name_scope("actor_ops"): 34 | # GRADIENT OF ACTIONS WRT ACTOR PARAMS TIMES NEGATIVE GRADIENT OF 35 | # VALUE FUNCTION WRT ACTIONS 36 | grads = tf.gradients(self.pi.nn, self.pi.net_params, -self.dqdu) 37 | # APPLY GRADIENTS TO ACTOR NETWORK 38 | self.optimize = Adam(self.lr, name="pi_adam")\ 39 | .apply_gradients(zip(grads, self.pi.net_params)) 40 | # UPDATE TARGET OP 41 | net_param_pairs = zip(self.pi.net_params, self.PI.net_params) 42 | with tf.name_scope("update_target_pi"): 43 | self.updt_PI = [j.assign(mul(self.tau, i)+mul((1-self.tau), j)) 44 | for i, j in net_param_pairs] 45 | 46 | def predict(self, obs, g): 47 | return self.session.run(self.pi.nn, 48 | feed_dict={self.obs: obs, self.g: g}) 49 | 50 | def predict_target(self, obs, g): 51 | return self.session.run(self.PI.nn, 52 | feed_dict={self.obs: obs, self.g: g}) 53 | 54 | def train(self, obs, g, dqdu): 55 | return self.session.run(self.optimize, 56 | feed_dict={self.obs: obs, self.g: g, 57 | self.dqdu: dqdu}) 58 | 59 | def update_target(self): 60 | self.session.run(self.updt_PI) 61 | -------------------------------------------------------------------------------- /Critic.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from tensorflow import square as sq 4 | from tensorflow import multiply as mul 5 | from tensorflow import reduce_mean as rmean 6 | 7 | from tensorflow.train import AdamOptimizer as Adam 8 | 9 | from FCNN import FCNN 10 | 11 | 12 | class Critic: 13 | def __init__(self, sess, input_t, **params): 14 | self.session = sess 15 | self.__dict__.update(params) 16 | self.__dict__.update(input_t) 17 | 
self.generate_networks() 18 | self.define_operations() 19 | 20 | def generate_networks(self): 21 | q_input = tf.concat([self.x, self.g, self.a1, self.a2, self.a3], axis=1) 22 | # MAIN CRITIC NETWORK 23 | self.q = FCNN(q_input, 1, self.n_layers, self.n_units, 24 | tf.nn.relu, name="q") 25 | # TARGET CRITIC NETWORK 26 | self.Q = FCNN(q_input, 1, self.n_layers, self.n_units, 27 | tf.nn.relu, name="t_q") 28 | 29 | def define_operations(self): 30 | with tf.name_scope("critic_ops"): 31 | # LOSS 32 | loss = tf.sqrt(rmean(sq(self.p - self.q.nn))) 33 | # MINIMIZE LOSS OP 34 | self.minimize = Adam(self.lr, name="q_adam")\ 35 | .minimize(loss, var_list=self.q.net_params) 36 | # ACTION GRADIENTS 37 | self.a_grads1 = tf.gradients(self.q.nn, self.a1, name="dq_da1") 38 | self.a_grads2 = tf.gradients(self.q.nn, self.a2, name="dq_da2") 39 | self.a_grads3 = tf.gradients(self.q.nn, self.a3, name="dq_da3") 40 | self.a_grad_ops = [self.a_grads1, self.a_grads2, self.a_grads3] 41 | # UPDATE TARGET OP 42 | net_param_pairs = zip(self.q.net_params, self.Q.net_params) 43 | with tf.name_scope("update_target_q"): 44 | self.updt_Q = [j.assign(mul(self.tau, i)+mul((1-self.tau), j)) 45 | for i, j in net_param_pairs] 46 | 47 | def predict(self, x, g, a1, a2, a3): 48 | feed_vals = {self.x: x, self.g: g, self.a1: a1, 49 | self.a2: a2, self.a3: a3} 50 | return self.session.run(self.q.nn, feed_dict=feed_vals) 51 | 52 | def predict_target(self, x, g, a1, a2, a3): 53 | feed_vals = {self.x: x, self.g: g, self.a1: a1, 54 | self.a2: a2, self.a3: a3} 55 | return self.session.run(self.Q.nn, feed_dict=feed_vals) 56 | 57 | def train(self, x, g, a1, a2, a3, p): 58 | feed_vals = {self.x: x, self.g: g, self.a1: a1, 59 | self.a2: a2, self.a3: a3, self.p: p} 60 | return self.session.run([self.q.nn, self.minimize], feed_dict=feed_vals) 61 | 62 | def get_action_grads(self, x, g, a1, a2, a3): 63 | feed_vals = {self.x: x, self.g: g, self.a1: a1, 64 | self.a2: a2, self.a3: a3} 65 | r = self.session.run(self.a_grad_ops, feed_dict=feed_vals) 66 | 67 | return r 68 | 69 | def update_target(self): 70 | self.session.run(self.updt_Q) 71 | -------------------------------------------------------------------------------- /FCNN.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow import GraphKeys 3 | from tensorflow.layers import dense 4 | from tensorflow.initializers import truncated_normal as TN 5 | TVARS = GraphKeys.TRAINABLE_VARIABLES 6 | 7 | 8 | class FCNN: 9 | def __init__(self, _input, op_dim, n_layers, n_units, activation, 10 | op_act=None, name="FCNN", w_init=None, from_ckpt=False): 11 | print("CREATING NETWORK NAMED {}".format(name)) 12 | self._input = _input 13 | self.op_dim = op_dim 14 | self.n_layers = n_layers 15 | self.n_units = n_units 16 | self.activation = activation 17 | self.scope = name 18 | self.op_act = op_act 19 | self.w_init = w_init if w_init is not None else TN(stddev=1e-1) 20 | self.make() 21 | 22 | def make(self): 23 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE): 24 | _input = self._input 25 | for i in range(0, self.n_layers-1): 26 | op = dense(_input, self.n_units, 27 | kernel_initializer=self.w_init, 28 | name="layer_{}".format(i)) 29 | _input = self.activation(op) 30 | op = dense(_input, self.op_dim, 31 | kernel_initializer=self.w_init, 32 | name="layer_{}".format(i+1)) 33 | self.nn = op if self.op_act is None else self.op_act(op) 34 | self.net_params = tf.get_collection(TVARS, scope=self.scope) 35 | for i in self.net_params: 36 | 
tf.summary.histogram(i.name.replace(":", "_"), i) 37 | 38 | def __call__(self, sess, inputs): 39 | return sess.run(self.nn, inputs) 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Formation Control Using MARL 2 | 3 | [Video of the output](https://www.youtube.com/watch?v=_w3tvqFUYxA) 4 | 5 | ## Some notes about this project 6 | 7 | This project aims to train agents that cooperatively learn a policy to move to 8 | a goal position while staying in formation. 9 | 10 | The rewards are sparse and twofold. 11 | - `R_form`: agents receive this reward when the desired formation is achieved. 12 | - `R_goal`: agents receive this reward when they have moved to the desired 13 | goal. This reward is only given when the agents reach the goal position 14 | in the desired formation. 15 | 16 | ### The Environment 17 | 18 | - The environment inherits from the classic control gym envs. 19 | - The environment is fully observable by the critic but only partially 20 | observable by the agents. 21 | - A function converts the state of the environment into the observations of 22 | the individual agents. 23 | 24 | #### State 25 | The state of the environment is a vector containing the 26 | following. It is observed only by the central critic. 27 | 28 | 29 | 1. Position vectors of the agents from the centroid of the current formation 30 | 2. Headings of the individual agents, represented as unit vectors 31 | (these are the same as the world-frame headings) 32 | 33 | #### Agent Observations 34 | This is a dictionary of vectors representing the state observed by each 35 | agent in its own coordinate frame. 36 | 37 | #### Goal 38 | A triangle is sampled as the goal formation. 39 | The goal is given as the relative position vectors of the agents from the 40 | centroid of the sampled goal formation. 41 | 42 | 43 | Each episode has a goal: a random formation specified by the side lengths of a triangle.
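As a concrete illustration of the goal described above, this is roughly how a goal formation is sampled (a condensed sketch of `get_form_goal` in `env.py`; the standalone helper name `sample_goal_formation` is used here only for illustration):

```python
import numpy as np

def sample_goal_formation():
    # Sample three side lengths in [0.7, 1.5) and resample until they
    # satisfy the triangle inequality.
    while True:
        a, b, c = sorted(np.random.random(3) * 0.8 + 0.7)
        if a + b > c:
            break
    # Place one side on the x-axis and recover the third vertex from the
    # side lengths.
    h = (c**2 - b**2 + a**2) / (2 * a)
    vertices = np.array([[0, 0], [a, 0], [h, (c**2 - h**2) ** 0.5]])
    # The goal is each vertex expressed relative to the formation centroid,
    # flattened into a single 6-dimensional vector.
    return (vertices - vertices.mean(axis=0)).ravel()
```

`compute_reward` then compares the sorted pairwise distances between the agents against the sampled side lengths (within a tolerance of 0.15 in world units) to decide whether `R_form` has been earned.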
44 | 45 | The main training algorithm uses DDPG + HER. 46 | 47 | 48 | 49 | ## To do list: 50 | - [ ] Make multiple agents 51 | - [x] Central critic 52 | - [ ] HER transitions 53 | - [ ] Rollouts 54 | - [ ] Training code 55 | - [ ] Update code for maddpg 56 | - [ ] Exploration through critic 57 | 58 | ## Setting up 59 | -------------------------------------------------------------------------------- /env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym.spaces import Box 3 | from gym.envs.registration import register 4 | from gym.envs.classic_control import rendering 5 | 6 | import numpy as np 7 | from numpy.linalg import norm 8 | from numpy.random import random 9 | 10 | from PointEnvironment.Pose import Pose 11 | from PointEnvironment.Agent import Agent 12 | 13 | from util import error, log 14 | 15 | class FormEnv(gym.Env): 16 | cossin = staticmethod(lambda x: np.array([np.cos(x), np.sin(x)])) 17 | 18 | def __init__(self, config=None): 19 | self.configure_defaults() 20 | if config is not None: 21 | self.__dict__.update(config) 22 | self.goal = None 23 | self.viewer = None 24 | 25 | def configure_defaults(self): 26 | self.dt = 1e-2 27 | self.num_iter = 50 28 | self.max_episode_steps = 25 29 | self.step_penalty = 1.0 30 | self.max_reward = 2.0 31 | self.action_low = np.array([0.0, -np.pi/4]) 32 | self.action_high = np.array([0.4, np.pi/4]) 33 | self.action_space = Box(self.action_low, self.action_high, dtype="f") 34 | self.w_limits = np.array([10, 10]) 35 | self.s_limits = np.array([600, 600]) 36 | self.scale = self.s_limits/self.w_limits 37 | self.scale = self.scale[0] 38 | self.agent_radius = 0.15 # in meters 39 | self.agents = [Agent(i) for i in range(3)] 40 | 41 | def init_viewer(self): 42 | self.viewer = rendering.Viewer(*self.s_limits) 43 | lx = [rendering.Line((0, pt), (self.s_limits[1], pt)) for pt in 44 | np.arange(0, self.s_limits[0], self.scale)] 45 | ly = [rendering.Line((pt, 0), (pt, self.s_limits[0])) for pt in 46 | np.arange(0, self.s_limits[1], self.scale)] 47 | [self.viewer.add_geom(i) for i in lx+ly] 48 | # GOAL MARKER 49 | circle = rendering.make_circle(radius=0.15*self.scale) 50 | circle.set_color(0.3, 0.82, 0.215) 51 | self.goal_tf = rendering.Transform() 52 | circle.add_attr(self.goal_tf) 53 | self.viewer.add_geom(circle) 54 | # AGENT MARKERS 55 | self.agent_tfs = [] 56 | a_rad_px = self.agent_radius * self.scale 57 | verx = [a_rad_px*FormEnv.cossin(np.radians(i)) for i in [0, 140, -140]] 58 | for i in self.agents: 59 | agent = rendering.FilledPolygon([tuple(j) for j in verx]) 60 | agent.set_color(0.15, 0.235, 0.459) 61 | agent_tf = rendering.Transform() 62 | agent.add_attr(agent_tf) 63 | self.agent_tfs.append(agent_tf) 64 | self.viewer.add_geom(agent) 65 | # CENTROID MARKER 66 | circle = rendering.make_circle(radius=0.05*self.scale) 67 | circle.set_color(0.9, 0.3, 0.23) 68 | self.centroid_tf = rendering.Transform() 69 | circle.add_attr(self.centroid_tf) 70 | self.viewer.add_geom(circle) 71 | 72 | def get_form_goal(self): 73 | sides = sorted(np.random.random(3)*0.8 + 0.7) 74 | if np.array([np.sum(sides) - 2*x > 0 for x in sides]).all(): 75 | self.goal_sides = sides 76 | a, b, c = sides 77 | coordinates = [[0, 0], [a, 0]] 78 | h = (c**2 - b**2 + a**2)/(2*a) 79 | coordinates.append([h, (c**2 - h**2)**.5]) 80 | centroid = np.mean(coordinates, axis=0) 81 | return np.hstack([i - centroid for i in coordinates]) 82 | return self.get_form_goal() 83 | 84 | def sample_pose(self, limits=None): 85 | if 
limits is None: 86 | x, y = random(2)*self.w_limits - self.w_limits/2 87 | else: 88 | x, y = random(2)*limits - limits/2 89 | theta = (random()*2 - 1)*np.pi 90 | return Pose(x=x, y=y, t=theta) 91 | 92 | def reset(self): 93 | poses = self.get_form_goal().reshape((3,-1)) 94 | # log.out(poses) 95 | [a.reset(Pose(*poses[i])) for i, a in enumerate(self.agents)] 96 | self.goal = self.get_form_goal() 97 | self.goal_changed = True 98 | return self.compute_obs() 99 | 100 | def render(self, mode='human'): 101 | if self.goal is None: 102 | return None 103 | if self.viewer is None: 104 | self.init_viewer() 105 | for agent, agent_tf in zip(self.agents, self.agent_tfs): 106 | agent_tf.set_translation(*(agent.pose.tolist()[:-1] + 107 | self.w_limits//2)*self.scale) 108 | agent_tf.set_rotation(agent.pose.theta) 109 | centroid = np.mean([a.pose.tolist()[:-1] for a in self.agents], 0) 110 | self.centroid_tf.set_translation(*(centroid+self.w_limits//2) 111 | * self.scale) 112 | return self.viewer.render(return_rgb_array=mode == 'rgb_array') 113 | 114 | def compute_obs(self): 115 | obs = {j.id: np.hstack([np.hstack(j.pose.getPoseInFrame(i.pose)) 116 | for i in self.agents if i.id != j.id]) for j in self.agents} 117 | f_c = np.mean([a.pose.tolist()[:-1] for a in self.agents], axis=0) 118 | cst = np.hstack([a.pose.tolist()[:-1] - f_c for a in self.agents]) 119 | hed = np.hstack(FormEnv.cossin(a.pose.theta) for a in self.agents) 120 | return obs, cst, hed 121 | 122 | def step(self, actions): 123 | # print(actions.values()) 124 | assert self.goal is not None 125 | for agent_id, action in actions.items(): 126 | [self.agents[agent_id].step(action) for _ in range(self.num_iter)] 127 | new_obs = self.compute_obs() 128 | return (*new_obs, *self.compute_reward()) 129 | 130 | def compute_reward(self): 131 | reward, done = -self.step_penalty, False 132 | sides = sorted([Pose.dist(i.pose, j.pose) for i in self.agents 133 | for j in self.agents if i.id < j.id]) 134 | # log.out(sides) 135 | if (np.abs(sides - np.array(self.goal_sides)) < 0.15).all(): 136 | error.out("\nHURRAY\n{}\n{}\n{}\n{}\n".format(self.goal_sides, sides, [i.pose for i in self.agents], sides - np.array(self.goal_sides))) 137 | reward, done = self.max_reward, True 138 | return reward, done, {"success": done} 139 | 140 | -------------------------------------------------------------------------------- /maddpg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | from Actor import Actor 6 | from noise import Noise 7 | from Critic import Critic 8 | from memory import Memory 9 | 10 | from util import error 11 | 12 | class MADDPG: 13 | 14 | def __init__(self, sess, scale_u, params): 15 | self.sess = sess 16 | self.scale_u = scale_u 17 | self.__dict__.update(params) 18 | # CREATE INPUT PLACEHOLDERS 19 | self.create_input_placeholders() 20 | # INITIALIZE ACTOR & CRITIC MODELS 21 | self.agents = [Actor(self.sess, self.inputs, i, **self.actor_params) 22 | for i in [1, 2, 3]] 23 | self.critic = Critic(self.sess, self.inputs, **self.critic_params) 24 | # INITIALIZE EXPLORATION MODEL 25 | self.noise_params = {k: np.fromstring(v, sep=",", dtype="f") 26 | for k, v in self.noise_params.items()} 27 | self.noise = [Noise(**self.noise_params) for _ in range(3)] 28 | # INITIALIZE REPLAY BUFFER 29 | self.memory = Memory(self.memory_size) 30 | # AVERAGE AGENT POLICIES 31 | avg_pi = [tf.reduce_mean(i, axis=0) for i in zip(*[x.pi.net_params for x in self.agents])] 32 | self.avg_op = 
[tf.assign(i, j) for x in self.agents for i, j in zip(x.pi.net_params, avg_pi)] 33 | 34 | def create_input_placeholders(self): 35 | self.inputs = {} 36 | ph = lambda s, n: tf.placeholder(tf.float32, shape=s, name=n) 37 | with tf.name_scope("inputs"): 38 | self.inputs["g"] = ph((None, 6), "goal") 39 | self.inputs["d"] = ph((None, 8), "done") 40 | self.inputs["p"] = ph((None, 1), "pred_q") 41 | self.inputs["r"] = ph((None, 8), "reward") 42 | self.inputs["x"] = ph((None, 12), "state") 43 | self.inputs["o1"] = ph((None, 8), "obs1") 44 | self.inputs["o2"] = ph((None, 8), "obs2") 45 | self.inputs["o3"] = ph((None, 8), "obs3") 46 | self.inputs["a1"] = ph((None, 2), "act1") 47 | self.inputs["a2"] = ph((None, 2), "act2") 48 | self.inputs["a3"] = ph((None, 2), "act3") 49 | self.inputs["dqdu"] = ph((None, 2), "dqdu") 50 | 51 | 52 | def step(self, obs, goal, state=None, explore=True): 53 | q = 0. 54 | sh = (1, -1) 55 | obs = {i: j.reshape(sh) for i, j in obs.items()} 56 | goal = goal.reshape(sh) 57 | state = state.reshape(sh) 58 | if explore: 59 | u = [x.predict(obs[i], goal) + self.noise[i]() 60 | for i, x in enumerate(self.agents)] 61 | # print(u) 62 | else: 63 | u = [x.predict_target(obs[i], goal) for i, x in enumerate(self.agents)] 64 | if state is not None: 65 | q = self.critic.predict_target(state, goal, *u) 66 | u = [x.reshape((1, -1)) for x in u] 67 | return [self.scale_u(x) for x in u], u, float(q) 68 | 69 | def remember(self, experience): 70 | self.memory.add(experience) 71 | 72 | def train(self): 73 | # check if the memory contains enough experiences 74 | if self.memory.size < 2*self.b_size: 75 | return 76 | x, g, o1, o2, o3, a1, a2, a3, r, d, x2, ag, o21, o22, o23 = self.get_batch() 77 | # print(a1) 78 | # HER TRANSACTIONS 79 | 80 | # her_idxs = np.where(np.random.random(self.b_size).reshape((-1, 1)) < 0.80) 81 | # g[her_idxs] = ag[her_idxs] 82 | # r[her_idxs] = 2. 
83 | # d[her_idxs] = True 84 | obs = [o1, o2, o3] 85 | n_o = [o21, o22, o23] 86 | n_u = [j.predict_target(n_o[i], g) for i, j in enumerate(self.agents)] 87 | n_q = self.critic.predict_target(x2, g, *n_u) 88 | t_q = r + self.gamma*n_q*(1 - d) 89 | self.critic.train(x, g, a1, a2, a3, t_q) 90 | grad = self.critic.get_action_grads(x, g, a1, a2, a3) 91 | [j.train(obs[i], g, grad[i][0]) for i, j in enumerate(self.agents)] 92 | # self.update_targets() 93 | 94 | def get_batch(self): 95 | batch = self.memory.sample(self.b_size) 96 | return [np.vstack([experience[j] for experience in batch]) for j in range(15)] 97 | 98 | def update_targets(self): 99 | self.critic.update_target() 100 | [x.update_target() for x in self.agents] 101 | self.sess.run(self.avg_op) 102 | -------------------------------------------------------------------------------- /memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import deque 3 | 4 | class Memory: 5 | def __init__(self, maxlen, seed=None): 6 | self.data = deque(maxlen=maxlen) 7 | if seed is not None: 8 | np.random.seed(seed) 9 | 10 | def add(self, *args): 11 | self.data.append(*args) 12 | 13 | def sample(self, num): 14 | if num > self.size: 15 | raise ValueError("Memory size: {}, but requested: {}" 16 | .format(self.size, num)) 17 | samples = np.random.choice(self.size, num, False) 18 | batches = [self.data[i] for i in samples] 19 | return batches 20 | 21 | @property 22 | def size(self): 23 | return len(self.data) 24 | -------------------------------------------------------------------------------- /noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from util import info 3 | 4 | class Noise(object): 5 | 6 | def __init__(self, delta, sigma, ou_a, ou_mu): 7 | # Noise parameters 8 | self.delta = delta 9 | self.sigma = sigma 10 | self.ou_a = ou_a 11 | self.ou_mu = ou_mu 12 | self.ou_lvl = np.zeros(self.ou_mu.shape) 13 | 14 | def brownian_motion_log_returns(self): 15 | sqrt_delta_sigma = np.sqrt(self.delta) * self.sigma 16 | return np.random.normal(loc=0, scale=sqrt_delta_sigma, size=None) 17 | 18 | def __call__(self): 19 | drift = self.ou_a * (self.ou_mu - self.ou_lvl) * self.delta 20 | randomness = self.brownian_motion_log_returns() 21 | self.ou_lvl += drift + randomness 22 | # info.out("{} {}".format(self.ou_lvl, id(self))) 23 | return self.ou_lvl 24 | -------------------------------------------------------------------------------- /rollout.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import tensorflow as tf 4 | from util import log, eva 5 | 6 | 7 | class RolloutGenerator: 8 | """ 9 | Class for generating a rollout of trajectory by the agent 10 | args: 11 | env: gym env 12 | agent: agent for performing rollout 13 | config: rollout configuration 14 | checkpoint(opt): perform rollout from a saved policy 15 | """ 16 | 17 | def __init__(self, env, agent, config: dict, _eval=False, summarize=False): 18 | self.env = env 19 | self.agent = agent 20 | self.eval = _eval 21 | self.best_score = 0. 
22 | self.__dict__.update(config) 23 | self.saver = tf.train.Saver() 24 | self.p_ckpt = "__checkpoints/{}_{}" 25 | self.name = ["*TRAINING*", "EVALUATION"][int(_eval)] 26 | if "periodic_ckpt" not in self.__dict__: 27 | self.periodic_ckpt = False 28 | if "save_best" not in self.__dict__: 29 | self.save_best = False 30 | self.reset() 31 | metrics = ["", "EPISODE.", "REWARD.", "TIMESTEPS.", "AVG_Q.", ""] 32 | self.logstr = "||".join(i.replace(".", ": {}") for i in metrics) 33 | self.logger = eva if self.eval else log 34 | self.logger.out("INITIALIZED {} ROLLOUT GENERATOR".format(self.name)) 35 | 36 | def reset(self): 37 | self.q_total = 0. 38 | self.r_total = 0. 39 | self.t_steps = 0 40 | self.episode = 0 41 | self.successes = 0 42 | 43 | def generate_rollout(self): 44 | t = 0 45 | done = False 46 | episodic_q = 0. 47 | episodic_r = 0. 48 | obs, xs, xh = self.env.reset() 49 | x = np.hstack([xs, xh]) 50 | g = self.env.goal 51 | while not done and t < self.env.max_episode_steps: 52 | a, u, q = self.agent.step(obs, g, x, (not self.eval)) 53 | # log.out(a) 54 | obs2, xs2, xh2, r, done, info = self.env.step(dict(enumerate(a))) 55 | x2 = np.hstack([xs2, xh2]) 56 | self.agent.remember([x, g, *obs.values(), *a, r, done, x2, xs2, *obs2.values()]) 57 | x = x2 58 | 59 | # Render if required 60 | if "render" in self.__dict__ and self.render: 61 | self.env.render() 62 | 63 | # Update stats 64 | t += 1 65 | episodic_r += float(r) 66 | episodic_q += float(q) 67 | 68 | # Train agent if required 69 | if not self.eval: 70 | [self.agent.train() for _ in range(self.train_cycles_per_ts)] 71 | self.agent.update_targets() 72 | else: 73 | if "step_sleep" in self.__dict__: 74 | time.sleep(self.step_sleep) 75 | self.episode += 1 76 | self.update_stats(episodic_q, episodic_r, t) 77 | self.successes += 1 if done else 0 78 | self.logger.out(self.logstr.format(self.episode, episodic_r, t, episodic_q/t)) 79 | self.create_checkpoint() 80 | 81 | def create_checkpoint(self): 82 | if self.periodic_ckpt and self.episode % self.periodic_ckpt == 0: 83 | log.out("Creating periodic checkpoint") 84 | self.saver.save(self.agent.sess, 85 | self.p_ckpt.format("P", self.episode)) 86 | if self.eval and self.done() and self.save_best and self.successes > self.best_score: 87 | log.out("New best score: {}".format(self.successes)) 88 | self.best_score = self.successes 89 | self.saver.save(self.agent.sess, 90 | self.p_ckpt.format("B", self.episode)) 91 | 92 | def update_stats(self, eps_q, eps_r, t): 93 | self.q_total += eps_q 94 | self.r_total += eps_r 95 | self.t_steps += t 96 | self.mean_eq = self.q_total/self.episode 97 | self.mean_er = self.r_total/self.episode 98 | 99 | 100 | def done(self): 101 | done = self.n_episodes <= self.episode 102 | if done and self.eval: 103 | print("\n") 104 | return done 105 | 106 | # def summarize(self): 107 | # if self.summarizer is None: 108 | # return 109 | # summarizer.value.add(tag="{}/") 110 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from env import FormEnv 5 | from maddpg import MADDPG 6 | from rollout import RolloutGenerator 7 | 8 | env = FormEnv() 9 | 10 | def scale_action_gen(env, u_min, u_max): 11 | def scale_action(u): 12 | u = np.clip(u, u_min, u_max) 13 | # print("clipped ", u) 14 | zo = (u - u_min)/(u_max - u_min) 15 | return zo * (env.action_high - env.action_low) + env.action_low 16 | return scale_action 
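# scale_action_gen returns a closure that clips the raw policy output to
# [u_min, u_max] and then linearly rescales it into the environment's action
# bounds (with the defaults in env.py: linear velocity in [0, 0.4] and
# angular velocity in [-pi/4, pi/4]).
# Rough usage sketch (values are illustrative only):
#   scale = scale_action_gen(env, -np.ones(2), np.ones(2))
#   scale(np.zeros(2))  # -> midpoint of the bounds, i.e. approx. [0.2, 0.0]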
17 | 18 | sess = tf.Session() 19 | actor_params = {"n_layers": 2, "n_units": 128, "tau": 0.01, "lr": 1e-4} 20 | critic_params = {"n_layers": 3, "n_units": 128, "tau": 0.01, "lr": 1e-3} 21 | noise_params = {"delta": "0.5,0.2", "sigma": "0.5,0.7", 22 | "ou_a": "0.6,0.6", "ou_mu": "0.5,0.0"} 23 | params = {"actor_params": actor_params, "b_size": 64, 24 | "critic_params": critic_params, "gamma": 0.99, 25 | "noise_params": noise_params, "memory_size": 50000} 26 | agent = MADDPG(sess, scale_action_gen(env, -np.ones(2), np.ones(2)), params) 27 | 28 | 29 | sess.run(tf.global_variables_initializer()) 30 | train_rollouts = RolloutGenerator(env, agent, {"render": 1, "n_episodes": 100000, "periodic_ckpt": 50, 31 | "train_cycles_per_ts": 10}) 32 | eval_rollouts = RolloutGenerator(env, agent, {"render": 1, "n_episodes": 20, "save_best": True}, True) 33 | 34 | while not train_rollouts.done(): 35 | train_rollouts.generate_rollout() 36 | if train_rollouts.episode % 20 == 0: 37 | eval_rollouts.reset() 38 | while not eval_rollouts.done(): 39 | eval_rollouts.generate_rollout() -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | class TextFormatter: 2 | 3 | COLORCODE = { 4 | 'k': 0, # black 5 | 'r': 1, # red 6 | 'g': 2, # green 7 | 'y': 3, # yellow 8 | 'b': 4, # blue 9 | 'm': 5, # magenta 10 | 'c': 6, # cyan 11 | 'w': 7 # white 12 | } 13 | 14 | STYLECODE = { 15 | 'b': 1, # bold 16 | 'f': 2, # faint 17 | 'i': 3, # italic 18 | 'u': 4, # underline 19 | 'x': 5, # blinking 20 | 'y': 6, # fast blinking 21 | 'r': 7, # reverse 22 | 'h': 8, # hide 23 | 's': 9, # strikethrough 24 | } 25 | 26 | # constructor 27 | def __init__(self, fg, bg=None, st=None): 28 | self.prop = {} 29 | self.prop["fg"] = 30 + self.COLORCODE[fg] 30 | self.prop["bg"] = 40 + self.COLORCODE[bg] if bg is not None else None 31 | self.prop["st"] = self.STYLECODE[st] if st is not None else None 32 | 33 | # formatting function 34 | def format(self, string): 35 | w = [self.prop['st'], self.prop['fg'], self.prop['bg']] 36 | w = [str(x) for x in w if x is not None] 37 | # return formatted string 38 | return '\x1b[%sm%s\x1b[0m' % (';'.join(w), string) if w else string 39 | 40 | # output formatted string 41 | def out(self, string): 42 | print(self.format(string)) 43 | 44 | error = TextFormatter("r", st="b") 45 | eva = TextFormatter("m") 46 | info = TextFormatter("m") 47 | log = TextFormatter("b", st="b") 48 | 49 | --------------------------------------------------------------------------------
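A note on the unchecked "HER transitions" item in the README: the relabeling is only stubbed out as comments in `maddpg.py`'s `train()`. Below is a minimal sketch of what that relabeling could look like, mirroring those commented-out lines rather than a full HER implementation (the helper name `her_relabel` is hypothetical; `relabel_prob=0.8` and `max_reward=2.0` are taken from the comments and from `FormEnv.configure_defaults`):

```python
import numpy as np

def her_relabel(g, ag, r, d, relabel_prob=0.8, max_reward=2.0):
    """Hindsight relabeling sketch: for a random subset of transitions,
    pretend the achieved formation `ag` was the intended goal `g`."""
    g, r, d = g.copy(), r.copy(), d.copy()
    idxs = np.where(np.random.random(len(g)) < relabel_prob)[0]
    g[idxs] = ag[idxs]      # substitute the achieved goal
    r[idxs] = max_reward    # reward the transition as a success
    d[idxs] = 1.0           # treat the relabeled transition as terminal
    return g, r, d
```

In `train()`, this would be applied to `g`, `ag`, `r`, and `d` right after `get_batch()`, before the target Q-value is computed.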