├── src
    ├── __init__.py
    ├── ddpg
    │   ├── __init__.py
    │   ├── tensorflow_grad_inverter.py
    │   ├── batch_norm.py
    │   ├── actor_net.py
    │   ├── critic_net.py
    │   ├── actor_net_bn.py
    │   ├── critic_net_bn.py
    │   └── agent.py
    ├── util
    │   ├── __init__.py
    │   ├── my_plotlib.py
    │   ├── timer.py
    │   ├── agent_data.py
    │   ├── data.py
    │   └── data_graph.py
    ├── wolp_agent.py
    └── main.py
├── .gitignore
└── README.md

/src/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import inspect
4 | 
5 | 
6 | cmd_subfolder = os.path.split(inspect.getfile(inspect.currentframe()))[0]
7 | if cmd_subfolder not in sys.path:
8 |     sys.path.insert(0, cmd_subfolder)
9 | 
--------------------------------------------------------------------------------
/src/ddpg/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import inspect
4 | 
5 | 
6 | cmd_subfolder = os.path.split(inspect.getfile(inspect.currentframe()))[0]
7 | if cmd_subfolder not in sys.path:
8 |     sys.path.insert(0, cmd_subfolder)
9 | 
--------------------------------------------------------------------------------
/src/util/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import inspect
4 | 
5 | 
6 | cmd_subfolder = os.path.split(inspect.getfile(inspect.currentframe()))[0]
7 | if cmd_subfolder not in sys.path:
8 |     sys.path.insert(0, cmd_subfolder)
9 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | # tests
3 | tests
4 | test_*.py
5 | .cache
6 | 
7 | # build dirs
8 | __pycache__
9 | 
10 | # .py files
11 | clipboard.py
12 | example.py
13 | 
14 | # .pyc files
15 | *.pyc
16 | 
17 | 
18 | # results
19 | results
20 | data
21 | default_name
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Deep-Reinforcement-Learning-in-Large-Discrete-Action-Spaces
2 | Link to [paper](https://arxiv.org/abs/1512.07679)
3 | 
4 | Implementation of the algorithm in Python 3, using TensorFlow and OpenAI Gym.
5 | 
6 | 
7 | 
8 | This paper introduces the Wolpertinger training algorithm, which extends the Deep Deterministic Policy Gradient (DDPG) training algorithm introduced in [this](https://arxiv.org/abs/1509.02971) paper. I extended stevenpjg's implementation of the DDPG algorithm, found [here](https://github.com/stevenpjg/ddpg-aigym), which is licensed under the MIT license.
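At a glance, the Wolpertinger policy asks the DDPG actor for a continuous "proto-action", looks up its k nearest neighbours in the discretized action set (done with pyflann in `src/wolp_agent.py`), and then lets the critic pick the highest-valued of those candidates. A minimal sketch of that selection step, using brute-force nearest neighbours instead of FLANN and illustrative function names rather than the classes in this repository:

```python
import numpy as np

def wolpertinger_action(actor, critic, state, discrete_actions, k=10):
    # 1. continuous proto-action from the actor network
    proto_action = actor(state)
    # 2. k nearest discrete actions (brute force here; the repo uses a FLANN k-d tree)
    distances = np.linalg.norm(discrete_actions - proto_action, axis=1)
    candidates = discrete_actions[np.argsort(distances)[:k]]
    # 3. score the candidates with the critic and keep the best one
    q_values = [critic(state, action) for action in candidates]
    return candidates[int(np.argmax(q_values))]
```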
9 | -------------------------------------------------------------------------------- /src/ddpg/tensorflow_grad_inverter.py: -------------------------------------------------------------------------------- 1 | #Reference: 2 | #https://github.com/MOCR/ 3 | 4 | import tensorflow as tf 5 | 6 | 7 | 8 | class grad_inverter: 9 | def __init__(self, action_bounds): 10 | 11 | self.sess = tf.InteractiveSession() 12 | 13 | self.action_size = len(action_bounds[0]) 14 | 15 | self.action_input = tf.placeholder(tf.float32, [None, self.action_size]) 16 | self.pmax = tf.constant(action_bounds[0], dtype = tf.float32) 17 | self.pmin = tf.constant(action_bounds[1], dtype = tf.float32) 18 | self.prange = tf.constant([x - y for x, y in zip(action_bounds[0],action_bounds[1])], dtype = tf.float32) 19 | self.pdiff_max = tf.div(-self.action_input+self.pmax, self.prange) 20 | self.pdiff_min = tf.div(self.action_input - self.pmin, self.prange) 21 | self.zeros_act_grad_filter = tf.zeros([self.action_size]) 22 | self.act_grad = tf.placeholder(tf.float32, [None, self.action_size]) 23 | self.grad_inverter = tf.where(tf.greater(self.act_grad, self.zeros_act_grad_filter), tf.multiply(self.act_grad, self.pdiff_max), tf.multiply(self.act_grad, self.pdiff_min)) 24 | 25 | def invert(self, grad, action): 26 | 27 | 28 | return self.sess.run(self.grad_inverter, feed_dict = {self.action_input: action, self.act_grad: grad[0]}) 29 | -------------------------------------------------------------------------------- /src/ddpg/batch_norm.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | decay = 0.95 3 | TAU = 0.001 4 | 5 | 6 | class Batch_norm: 7 | def __init__(self, inputs, size, is_training, sess, parForTarget=None, bn_param=None): 8 | 9 | self.sess = sess 10 | self.scale = tf.Variable(tf.random_uniform([size], 0.9, 1.1)) 11 | self.beta = tf.Variable(tf.random_uniform([size], -0.03, 0.03)) 12 | self.pop_mean = tf.Variable(tf.random_uniform([size], -0.03, 0.03), trainable=False) 13 | self.pop_var = tf.Variable(tf.random_uniform([size], 0.9, 1.1), trainable=False) 14 | self.batch_mean, self.batch_var = tf.nn.moments(inputs, [0]) 15 | self.train_mean = tf.assign(self.pop_mean, self.pop_mean * 16 | decay + self.batch_mean * (1 - decay)) 17 | self.train_var = tf.assign(self.pop_var, self.pop_var * 18 | decay + self.batch_var * (1 - decay)) 19 | 20 | def training(): 21 | return tf.nn.batch_normalization(inputs, 22 | self.batch_mean, self.batch_var, self.beta, self.scale, 0.0000001) 23 | 24 | def testing(): 25 | return tf.nn.batch_normalization(inputs, 26 | self.pop_mean, self.pop_var, self.beta, self.scale, 0.0000001) 27 | 28 | if parForTarget != None: 29 | self.parForTarget = parForTarget 30 | self.updateScale = self.scale.assign( 31 | self.scale * (1 - TAU) + self.parForTarget.scale * TAU) 32 | self.updateBeta = self.beta.assign(self.beta * (1 - TAU) + self.parForTarget.beta * TAU) 33 | self.updateTarget = tf.group(self.updateScale, self.updateBeta) 34 | 35 | self.bnorm = tf.cond(is_training, training, testing) 36 | -------------------------------------------------------------------------------- /src/util/my_plotlib.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | class Line: 6 | def __init__(self, x, y, line_width=1, line_color='black', text='', style='-'): 7 | self.x = x 8 | self.y = y 9 | self.width = line_width 10 | self.color = line_color 11 | self.text = text 12 | 
self.style = style 13 | 14 | def plot(self, fig=None, k=2): 15 | if fig is None: 16 | plt.figure() 17 | plt.grid(True) 18 | plt.ylabel("y") 19 | plt.xlabel("x") 20 | 21 | max_y, min_y = self.y_range() 22 | 23 | plt.plot(self.x, self.y, self.color, linewidth=self.width, linestyle=self.style) 24 | plt.text(0.05 * len(self.y), k * 0.1 * (max_y - min_y), 25 | self.text, color=self.color) 26 | 27 | if fig is None: 28 | plt.show() 29 | 30 | def y_range(self): 31 | return np.amin(self.y), np.amax(self.y) 32 | 33 | 34 | class Function(Line): 35 | 36 | def __init__(self, x, func, line_width=1, line_color='black', text='', style='-'): 37 | y = [func(i) for i in x] 38 | super().__init__(x, y, line_width, line_color, text, style) 39 | 40 | 41 | class Constant(Line): 42 | 43 | def __init__(self, x, c, line_width=1, line_color='black', text='', style='-'): 44 | x = [x[0], x[len(x) - 1]] 45 | y = [c] * len(x) 46 | super().__init__(x, y, line_width, line_color, text, style) 47 | 48 | 49 | def plot_lines(lines, seps=None, grid_flag=True): 50 | fig = plt.figure() 51 | plt.grid(grid_flag) 52 | plt.ylabel("y") 53 | plt.xlabel("x") 54 | max_y = [] 55 | min_y = [] 56 | count = 0 57 | for line in lines: 58 | count += 1 59 | line.plot(fig=fig, k=count) 60 | temp = line.y_range() 61 | min_y.append(temp[0]) 62 | max_y.append(temp[1]) 63 | 64 | min_y = np.amin(min_y) 65 | max_y = np.amax(max_y) 66 | 67 | if seps is not None: 68 | for s in seps: 69 | plt.plot([s - 0.001, s + 0.001], [min_y, max_y], 'r', linewidth=0.5) 70 | 71 | plt.show() 72 | -------------------------------------------------------------------------------- /src/util/timer.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | 3 | 4 | class Timer: 5 | 6 | def __init__(self, one_hot=True): 7 | self.reset() 8 | self.one_hot = one_hot 9 | 10 | def reset(self): 11 | self.now = Timer._get_current_milis() 12 | 13 | def reset_one_hot(self): 14 | if self.one_hot: 15 | self.reset() 16 | 17 | def get_time(self, reset=False): 18 | return Timer._get_current_milis() - self.now 19 | 20 | @staticmethod 21 | def _get_current_milis(): 22 | return int(round(time() * 1000)) 23 | 24 | 25 | class Time_stats: 26 | 27 | def __init__(self, name, fields, one_active=True): 28 | self.name = name 29 | self.count = 0 30 | self.one_active = one_active 31 | self.values = {} 32 | self.timers = {} 33 | for str in fields: 34 | self.values[str] = 0 35 | self.timers[str] = Timer() 36 | 37 | def start(self, field): 38 | self.timers[field].reset() 39 | 40 | def add_time(self, field): 41 | self.values[field] += self.timers[field].get_time() 42 | if self.one_active: 43 | self.reset_timers() 44 | 45 | def increase_count(self, n=1): 46 | self.count += n 47 | 48 | def set_count(self, n): 49 | self.count = n 50 | 51 | def get_count(self): 52 | return self.count 53 | 54 | def reset_timers(self): 55 | for key in self.timers.keys(): 56 | self.start(key) 57 | 58 | def reset_values(self): 59 | for key in self.values.keys(): 60 | self.values[key] = 0 61 | 62 | def get_total(self): 63 | total = 0 64 | for key in self.values.keys(): 65 | total += self.values[key] 66 | return total 67 | 68 | def print_stats(self): 69 | print('\nName: {}\tCount: {}'.format(self.name, self.count)) 70 | print('key\t\tabs\t\tavg/unit\t% of total') 71 | print('-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-') 72 | 73 | keys = list(self.values.keys()) 74 | keys.sort() 75 | total_time = max(self.get_total(), 1) 76 | count = max(self.count, 1) 77 | for key in 
keys: 78 | temp = self.values[key] 79 | avg = temp / count 80 | total = 100 * temp / total_time 81 | print('{}\t\t{}\t\t{:6.2f}\t\t{:6.2f}'.format( 82 | key, temp, avg, total)) 83 | 84 | total_time = self.get_total() 85 | print('Total\t\t{}\t\t{:6.2f}\t\t 100.0'.format(total_time, total_time / count)) 86 | -------------------------------------------------------------------------------- /src/wolp_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pyflann 3 | 4 | from ddpg import agent 5 | 6 | 7 | class WolpertingerAgent(agent.DDPGAgent): 8 | 9 | def __init__(self, env, max_actions=1e6, k_nearest_neighbors=10): 10 | super().__init__(env) 11 | if self.continious_action_space: 12 | self.actions = np.linspace(self.low, self.high, max_actions) 13 | else: 14 | self.actions = np.arange(self.low, self.high) 15 | # self.actions = list(self.actions) 16 | self.k_nearest_neighbors = k_nearest_neighbors 17 | print('wolpertinger agent init') 18 | print('max actions = ', max_actions) 19 | print('k nearest neighbors =', k_nearest_neighbors) 20 | # init flann 21 | self.actions.shape = (len(self.actions), self.action_space_size) 22 | self.flann = pyflann.FLANN() 23 | params = self.flann.build_index(self.actions, algorithm='kdtree') 24 | print('flann init with params->', params) 25 | 26 | def get_name(self): 27 | return 'Wolp_v1_k' + str(self.k_nearest_neighbors) + '_' + super().get_name() 28 | 29 | def act(self, state): 30 | proto_action = super().act(state) 31 | if self.k_nearest_neighbors <= 1: 32 | return proto_action 33 | 34 | if len(proto_action) > 1: 35 | return 0 36 | res = np.array([]) 37 | for i in range(len(proto_action)): 38 | res = np.append(res, self.wolp_action(state[i], proto_action[i])) 39 | res.shape = (len(res), 1) 40 | return res 41 | else: 42 | return self.wolp_action(state, proto_action) 43 | 44 | def wolp_action(self, state, proto_action): 45 | debug = False 46 | actions = self.nearest_neighbors(proto_action)[0] 47 | if debug: 48 | print('--\nproto action', proto_action, 'state', state) 49 | states = np.tile(state, [len(actions), 1]) 50 | actions_evaluation = self.critic_net.evaluate_critic(states, actions) 51 | if debug: 52 | print('action evalueations', actions_evaluation.shape) 53 | if debug: 54 | for i in range(len(actions)): 55 | print(actions[i], 'v', actions_evaluation[i]) 56 | 57 | max_index = np.argmax(actions_evaluation) 58 | max = actions_evaluation[max_index] 59 | if debug: 60 | print('max', max, '->', max_index) 61 | if debug: 62 | print('result action', actions[max_index]) 63 | # if debug: 64 | # exit() 65 | return actions[max_index] 66 | 67 | def nearest_neighbors(self, proto_action): 68 | results, dists = self.flann.nn_index( 69 | proto_action, self.k_nearest_neighbors) # checks=params["checks"] 70 | return self.actions[results] 71 | -------------------------------------------------------------------------------- /src/util/agent_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from my_plotlib import * 3 | from data import * 4 | import data_graph 5 | import gym 6 | from gym.spaces import Box, Discrete 7 | 8 | 9 | def get_action_space(env): 10 | low = 0 11 | high = 0 12 | if isinstance(env.action_space, Box): 13 | low = env.action_space.low[0] 14 | high = env.action_space.high[0] 15 | else: 16 | low = 0 17 | high = env.action_space.n 18 | 19 | return low, high 20 | 21 | 22 | def plot_rewards(fd): 23 | data = fd.get_data('rewards') 24 
| 25 | data_graph.plot_data(data, batch_size=-1, file_name='rewards') 26 | 27 | 28 | def plot_actions(fd, episodes=None, action_space=None): 29 | lines = [] 30 | 31 | data = [] 32 | seps = [] 33 | if episodes is None: 34 | data = fd.get_data('actions') 35 | else: 36 | for ep in episodes: 37 | data.extend(fd.get_episode_data('actions', ep)) 38 | seps.append(len(data) - 0.5) 39 | 40 | if len(seps) == 1: 41 | seps = [] 42 | x = np.arange(len(data)) 43 | if action_space is not None: 44 | lines.extend((Constant(x, k, line_color='#a0a0a0') for k in action_space)) 45 | 46 | lines.append(Line(x, data, line_color='-o')) 47 | plot_lines(lines, seps, grid_flag=action_space is None) 48 | 49 | 50 | def plot_states(fd, episodes=None): 51 | lines = [] 52 | 53 | data = {'s0': [], 54 | 's1': [], 55 | 's2': [], 56 | 's3': [], 57 | 'actions': []} 58 | seps = [] 59 | if episodes is None: 60 | data['s0'] = fd.get_data('state_0') 61 | data['s1'] = fd.get_data('state_1') 62 | data['s2'] = fd.get_data('state_2') 63 | data['s3'] = fd.get_data('state_3') 64 | data['actions'] = fd.get_data('actions') 65 | else: 66 | for ep in episodes: 67 | data['s0'].extend(fd.get_episode_data('state_0', ep)) 68 | data['s1'].extend(fd.get_episode_data('state_1', ep)) 69 | data['s2'].extend(fd.get_episode_data('state_2', ep)) 70 | data['s3'].extend(fd.get_episode_data('state_3', ep)) 71 | data['actions'].extend(fd.get_episode_data('actions', ep)) 72 | seps.append(len(data) - 0.5) 73 | 74 | if len(seps) == 1: 75 | seps = [] 76 | x = np.arange(len(data['s0'])) 77 | 78 | # print(data['s0']) 79 | 80 | lines.append(Line(x, data['s0'], line_color='b', text='s0')) 81 | lines.append(Line(x, data['s1'], line_color='g', text='s1')) 82 | lines.append(Line(x, data['s2'], line_color='r', text='s2')) 83 | lines.append(Line(x, data['s3'], line_color='m', text='s3')) 84 | lines.append(Line(x, data['actions'], line_color='black', text='actions', style=':')) 85 | 86 | plot_lines(lines, seps) 87 | 88 | 89 | class Agent_data(Data): 90 | 91 | def get_episodes_with_reward_greater_than(self, th): 92 | return np.where(self.get_data('rewards') >= th)[0] 93 | 94 | def find_episode(self, ep): 95 | done = self.get_data('done') 96 | eps = np.where(done == 1)[0] 97 | return eps[ep - 1] + 1 if ep > 0 else 0, eps[min(ep, len(done))] 98 | 99 | def get_episode_data(self, field, ep): 100 | s, e = self.find_episode(ep) 101 | data = self.get_data(field) 102 | if field == 'rewards': 103 | return data[ep] 104 | else: 105 | return data[s: e + 1] 106 | 107 | def get_full_episode_data(self, ep): 108 | start, end = self.find_episode(ep) 109 | clone = self.get_empty_clone() 110 | for key in self.get_keys(): 111 | clone.set_data(key, self.get_data(key)[start: end + 1]) 112 | 113 | r = self.get_data('rewards')[ep] 114 | clone.set_data('rewards', np.array([r])) 115 | return clone 116 | -------------------------------------------------------------------------------- /src/ddpg/actor_net.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import math 4 | 5 | LEARNING_RATE = 0.0001 6 | BATCH_SIZE = 64 7 | TAU = 0.001 8 | 9 | 10 | class ActorNet: 11 | """ Actor Network Model of DDPG Algorithm """ 12 | 13 | def __init__(self, num_states, num_actions): 14 | self.g = tf.Graph() 15 | with self.g.as_default(): 16 | self.sess = tf.InteractiveSession() 17 | 18 | # actor network model parameters: 19 | self.W1_a, self.B1_a, self.W2_a, self.B2_a, self.W3_a, self.B3_a,\ 20 | self.actor_state_in, 
self.actor_model = self.create_actor_net( 21 | num_states, num_actions) 22 | 23 | # target actor network model parameters: 24 | self.t_W1_a, self.t_B1_a, self.t_W2_a, self.t_B2_a, self.t_W3_a, self.t_B3_a,\ 25 | self.t_actor_state_in, self.t_actor_model = self.create_actor_net( 26 | num_states, num_actions) 27 | 28 | # cost of actor network: 29 | # gets input from action_gradient computed in critic network file 30 | self.q_gradient_input = tf.placeholder("float", [None, num_actions]) 31 | self.actor_parameters = [self.W1_a, self.B1_a, 32 | self.W2_a, self.B2_a, self.W3_a, self.B3_a] 33 | self.parameters_gradients = tf.gradients( 34 | self.actor_model, self.actor_parameters, -self.q_gradient_input) # /BATCH_SIZE) 35 | self.optimizer = tf.train.AdamOptimizer(LEARNING_RATE).apply_gradients( 36 | zip(self.parameters_gradients, self.actor_parameters)) 37 | # initialize all tensor variable parameters: 38 | self.sess.run(tf.global_variables_initializer()) 39 | 40 | self.update_target_actor_op = [ 41 | self.t_W1_a.assign(TAU * self.W1_a + (1 - TAU) * self.t_W1_a), 42 | self.t_B1_a.assign(TAU * self.B1_a + (1 - TAU) * self.t_B1_a), 43 | self.t_W2_a.assign(TAU * self.W2_a + (1 - TAU) * self.t_W2_a), 44 | self.t_B2_a.assign(TAU * self.B2_a + (1 - TAU) * self.t_B2_a), 45 | self.t_W3_a.assign(TAU * self.W3_a + (1 - TAU) * self.t_W3_a), 46 | self.t_B3_a.assign(TAU * self.B3_a + (1 - TAU) * self.t_B3_a)] 47 | # To make sure actor and target have same intial parmameters copy the parameters: 48 | # copy target parameters 49 | self.sess.run([ 50 | self.t_W1_a.assign(self.W1_a), 51 | self.t_B1_a.assign(self.B1_a), 52 | self.t_W2_a.assign(self.W2_a), 53 | self.t_B2_a.assign(self.B2_a), 54 | self.t_W3_a.assign(self.W3_a), 55 | self.t_B3_a.assign(self.B3_a)]) 56 | 57 | def create_actor_net(self, num_states=4, num_actions=1): 58 | """ Network that takes states and return action """ 59 | N_HIDDEN_1 = 400 60 | N_HIDDEN_2 = 300 61 | actor_state_in = tf.placeholder("float", [None, num_states]) 62 | W1_a = tf.Variable(tf.random_uniform( 63 | [num_states, N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 64 | B1_a = tf.Variable(tf.random_uniform( 65 | [N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 66 | W2_a = tf.Variable(tf.random_uniform( 67 | [N_HIDDEN_1, N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1), 1 / math.sqrt(N_HIDDEN_1))) 68 | B2_a = tf.Variable(tf.random_uniform( 69 | [N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1), 1 / math.sqrt(N_HIDDEN_1))) 70 | W3_a = tf.Variable(tf.random_uniform([N_HIDDEN_2, num_actions], -0.003, 0.003)) 71 | B3_a = tf.Variable(tf.random_uniform([num_actions], -0.003, 0.003)) 72 | 73 | H1_a = tf.nn.softplus(tf.matmul(actor_state_in, W1_a) + B1_a) 74 | H2_a = tf.nn.tanh(tf.matmul(H1_a, W2_a) + B2_a) 75 | actor_model = tf.matmul(H2_a, W3_a) + B3_a 76 | return W1_a, B1_a, W2_a, B2_a, W3_a, B3_a, actor_state_in, actor_model 77 | 78 | def evaluate_actor(self, state_t): 79 | return self.sess.run(self.actor_model, feed_dict={self.actor_state_in: state_t}) 80 | 81 | def evaluate_target_actor(self, state_t_1): 82 | return self.sess.run(self.t_actor_model, feed_dict={self.t_actor_state_in: state_t_1}) 83 | 84 | def train_actor(self, actor_state_in, q_gradient_input): 85 | self.sess.run(self.optimizer, feed_dict={ 86 | self.actor_state_in: actor_state_in, self.q_gradient_input: q_gradient_input}) 87 | 88 | def update_target_actor(self): 89 | self.sess.run(self.update_target_actor_op) 90 | -------------------------------------------------------------------------------- 
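The update_target_actor() op above never retrains the target network from scratch: each target parameter is nudged a small step toward the learned parameter, theta_target <- TAU * theta + (1 - TAU) * theta_target, with TAU = 0.001 (critic_net.py does the same for the critic). A minimal numpy sketch of that soft-update rule, with illustrative names rather than the tensors defined in actor_net.py:

import numpy as np

TAU = 0.001

def soft_update(target_params, learned_params, tau=TAU):
    # exponential moving average: the target slowly tracks the learned network
    return [tau * w + (1.0 - tau) * t for w, t in zip(learned_params, target_params)]

# example: after every training step
# target_weights = soft_update(target_weights, learned_weights)
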
/src/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | 4 | from util import * 5 | 6 | from wolp_agent import * 7 | from ddpg.agent import DDPGAgent 8 | from util.data import Data 9 | from util.data import Timer 10 | 11 | time_now = -1 12 | 13 | 14 | def run(episodes=[2500], collecting_data=True): 15 | 16 | experiment = ('CartPole-v1', 17 | 'InvertedPendulum-v1', 18 | 'LunarLanderContinuous-v2')[1] 19 | env = gym.make(experiment) 20 | 21 | print(env.observation_space) 22 | print(env.action_space) 23 | 24 | steps = env.spec.timestep_limit 25 | 26 | # agent = DDPGAgent(env) 27 | max_actions = 1e2 28 | agent = WolpertingerAgent(env, k_nearest_neighbors=int(0.1 * max_actions), 29 | max_actions=max_actions) 30 | 31 | # file_name = "results/data_" + agent.get_name() + str(episodes) + ".txt" 32 | file_name = "data_" + str(episodes) + '_' + agent.get_name() 33 | print(file_name) 34 | result_fetcher = Data(file_name) 35 | 36 | result_fetcher.add_arrays(['rewards', 'count', 'actions', 'done']) 37 | result_fetcher.add_arrays(['state_' + str(i) for i in range(agent.observation_space_size)]) 38 | 39 | result_fetcher.add_timers(['render', 'act', 'step', 'saving'], 'run_') 40 | result_fetcher.add_timer('t_run_observe', one_hot=False) 41 | agent.add_data_fetch(result_fetcher) 42 | 43 | timer = Timer() 44 | 45 | for i in range(episodes): 46 | timer.reset() 47 | observation = env.reset() 48 | # for i in range(agent.observation_space_size): 49 | # result_fetcher.add_to_array('state_' + str(i), observation[i]) 50 | 51 | total_reward = 0 52 | print('Episode ', i, '/', episodes - 1, 'started...', end='') 53 | for t in range(steps): 54 | 55 | result_fetcher.reset_timers() 56 | 57 | if not collecting_data: 58 | env.render() 59 | 60 | result_fetcher.sample_timer('render') # ------ 61 | 62 | action = agent.act(observation) 63 | 64 | result_fetcher.add_to_array('actions', action) # ------- 65 | 66 | result_fetcher.sample_timer('act') # ------ 67 | 68 | for i in range(agent.observation_space_size): 69 | result_fetcher.add_to_array('state_' + str(i), observation[i]) 70 | prev_observation = observation 71 | observation, reward, done, info = env.step(action) 72 | 73 | episode = {'obs': prev_observation, 74 | 'action': action, 75 | 'reward': reward, 76 | 'obs2': observation, 77 | 'done': done, 78 | 't': t} 79 | 80 | result_fetcher.sample_timer('step') # ------ 81 | result_fetcher.add_to_array('count', 1) 82 | 83 | # print('\n' + str(episode['obs'])) 84 | result_fetcher.start_timer('observe') 85 | agent.observe(episode) 86 | result_fetcher.sample_timer('observe') # ------ 87 | 88 | total_reward += reward 89 | result_fetcher.add_to_array('done', 1 if done else 0) 90 | if done or (t == steps - 1): 91 | t += 1 92 | result_fetcher.add_to_array('rewards', total_reward) # ------ 93 | 94 | time_passed = timer.get_time() 95 | print('Reward:', total_reward, 'Steps:', t, 't:', 96 | time_passed, '({}/step)'.format(round(time_passed / t))) 97 | 98 | if not collecting_data: 99 | # save_episode(episode_history) 100 | pass 101 | else: 102 | pass 103 | # if i % 100 == 0: 104 | # result_fetcher.async_save() 105 | result_fetcher.sample_timer('saving') # ------ 106 | break 107 | # end of episodes 108 | 109 | result_fetcher.async_save() 110 | # result_fetcher.print_data() 111 | 112 | result_fetcher.print_times(groups=['run_']) 113 | result_fetcher.print_times(groups=['agent_'], total_time_field='count') 114 | 115 | 116 | def save_episode(episode, 
overwrite=True): 117 | from pathlib import Path 118 | import datetime 119 | from os import makedirs 120 | 121 | string = str(episode).replace('},', '},\n') 122 | 123 | if overwrite: 124 | file = open('results/last_episode', 'w') 125 | file.write(string) 126 | file.close() 127 | else: 128 | now = datetime.datetime.now() 129 | 130 | dir_name = "results/%s-%s-%s" % (now.day, now.month, now.year) 131 | file = Path(dir_name) 132 | if not file.is_dir(): 133 | makedirs(dir_name) 134 | 135 | counter = 0 136 | while True: 137 | file_name = dir_name + '/episode_%d.txt' % (counter) 138 | file = Path(file_name) 139 | if file.is_file(): 140 | print(file_name + " exists") 141 | counter += 1 142 | else: 143 | file = open(file_name, 'w') 144 | file.write(string) 145 | file.close() 146 | break 147 | 148 | 149 | if __name__ == '__main__': 150 | run() 151 | -------------------------------------------------------------------------------- /src/util/data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import threading 3 | import pickle 4 | from timer import * 5 | 6 | 7 | def save_dictionary(dict, path): 8 | with open('results/obj/' + path + '.pkl', 'wb') as f: 9 | pickle.dump(dict, f, 0) 10 | 11 | 12 | class Data: 13 | 14 | def __init__(self, name='default_name'): 15 | self.name = name 16 | self.data = {} 17 | self.timers = {} 18 | 19 | def _add(self, field_name, timer, timer_one_hot=True): 20 | self.data[field_name] = np.array([]) 21 | if timer: 22 | self.timers[field_name] = Timer(timer_one_hot) 23 | 24 | def add_array(self, field_name): 25 | self._add(field_name, False) 26 | # self.data[field_name] = np.array([]) 27 | 28 | def add_arrays(self, fields, prefix=''): 29 | for f in fields: 30 | self.add_array(prefix + f) 31 | 32 | def add_to_array(self, field_name, value, abs_name=False): 33 | if abs_name: 34 | self.data[field_name] = np.append(self.data[field_name], value) 35 | else: 36 | fields = self.get_keys(field_name) 37 | for f in fields: 38 | self.data[f] = np.append(self.data[f], value) 39 | 40 | def add_timer(self, field_name, one_hot=True): 41 | self._add(field_name, True, one_hot) 42 | # self.add_array(name) 43 | # self.timers[name] = Timer() 44 | 45 | def add_timers(self, names, prefix='', one_hot=True): 46 | for f in names: 47 | self.add_timer(prefix + f, one_hot) 48 | 49 | def start_timer(self, field_name): 50 | fields = self.get_keys(field_name) 51 | for f in fields: 52 | self.timers[f].reset() 53 | 54 | def sample_timer(self, field_name, abs_name=False): 55 | if abs_name: 56 | self.data[field_name] = np.append( 57 | self.data[field_name], self.timers[field_name].get_time()) 58 | else: 59 | fields = self.get_keys(field_name) 60 | timer_keys = self.timers.keys() 61 | for f in fields: 62 | if f in timer_keys: 63 | self.data[f] = np.append(self.data[f], self.timers[f].get_time()) 64 | 65 | self.reset_timers_one_hot() 66 | 67 | def reset_timers(self): 68 | for t in self.timers: 69 | self.timers[t].reset() 70 | 71 | def reset_timers_one_hot(self): 72 | for t in self.timers: 73 | self.timers[t].reset_one_hot() 74 | 75 | def set_data(self, field_name, data): 76 | self.data[field_name] = data 77 | 78 | def get_data(self, field_name): 79 | return self.data[field_name] 80 | 81 | def print_data(self, field_name=''): 82 | keys = list(self.get_keys(field_name)) 83 | keys.sort() 84 | for key in keys: 85 | print(key, self.data[key].shape, self.data[key]) 86 | 87 | def print_fields(self): 88 | for k in self.get_keys(): 89 | print(k) 90 | 91 | def 
load(self, path=None): 92 | if path is None: 93 | path = self.name 94 | with open('results/obj/' + path + '.pkl', 'rb') as f: 95 | self.data = pickle.load(f) 96 | 97 | def async_save(self): 98 | thread = save_fulldata(self) 99 | thread.start() 100 | 101 | def print_times(self, other_keys=None, groups=None, total_time_field=None): 102 | final_keys = [] 103 | if (other_keys is None) and (groups is None): 104 | final_keys = self.timers.keys() 105 | else: 106 | if other_keys is not None: 107 | final_keys.extend(other_keys) 108 | 109 | if groups is not None: 110 | timers = self.timers.keys() 111 | for g in groups: 112 | for t in timers: 113 | if g in t: 114 | final_keys.append(t) 115 | 116 | if (final_keys is None) or (len(final_keys) == 0): 117 | print("No items found to be printed") 118 | return 119 | 120 | times = {} 121 | total_time = 0 122 | samples = [] 123 | 124 | for key in final_keys: 125 | times[key] = np.sum(self.get_data(key)) 126 | total_time += times[key] 127 | 128 | samples.append(len(self.get_data(key))) 129 | 130 | count = max(samples) 131 | if total_time_field is not None: 132 | count = np.sum(self.get_data(total_time_field)) 133 | 134 | print('\n\nName: {}\tCount: {} Group:{}'.format(self.name, count, groups)) 135 | print('key\t\tabs\t\tavg/unit\t% of total') 136 | print('-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-') 137 | 138 | keys = list(final_keys) 139 | keys.sort() 140 | max_key_len = 5 141 | for key in keys: 142 | if max_key_len < len(key): 143 | max_key_len = len(key) 144 | 145 | for key in keys: 146 | temp = times[key] 147 | avg = temp / count 148 | total = 100 * temp / total_time 149 | print('{}{}\t\t{}\t\t{:6.2f}\t\t{:6.2f}'.format( 150 | key, '.' * (max_key_len - len(key)), temp, avg, total)) 151 | 152 | print('Total{}\t\t{}\t\t{:6.2f}\t\t 100.0'.format( 153 | '.' 
* (max_key_len - 5), total_time, total_time / count)) 154 | 155 | def get_keys(self, key=''): 156 | res = [] 157 | for k in self.data.keys(): 158 | # if k.find(key) >= 0: 159 | if key in k: 160 | res.append(k) 161 | 162 | return res 163 | 164 | def get_empty_clone(self): 165 | res = Fulldata(self.name + '_clone') 166 | res.add_arrays(self.get_keys()) 167 | return res 168 | 169 | 170 | class save_fulldata(threading.Thread): 171 | def __init__(self, fd): 172 | threading.Thread.__init__(self) 173 | self.dict = fd.data 174 | self.path = fd.name 175 | 176 | def run(self): 177 | save_dictionary(self.dict, self.path) 178 | -------------------------------------------------------------------------------- /src/util/data_graph.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import math 4 | import ntpath 5 | 6 | BATCH_RATIO = 0.01 7 | EXTENSIONS = ['.txt'] 8 | DIRECTORY = "/../../results" 9 | # DIRECTORY = "/../../data/saved_ddpg_new" 10 | 11 | 12 | def plot_file(file_name): 13 | data = np.loadtxt(file_name) 14 | plot_data(data, file_name=file_name) 15 | 16 | 17 | def plot_data(data, batch_size=-1, file_name="data"): 18 | 19 | data_size = data.shape[0] 20 | if batch_size == -1: 21 | batch_size = max(int(data_size * BATCH_RATIO), 1) 22 | if BATCH_RATIO == 1: 23 | batch_size = 1 24 | batches = math.ceil(data_size / batch_size) 25 | 26 | avg = np.average(data) 27 | 28 | final_data = np.zeros((batches, 4)) 29 | 30 | for i in range(batches): 31 | temp = data[i * batch_size: int(min((i + 1) * batch_size, data_size))] 32 | 33 | final_data[i] = [np.amax(temp), np.average(temp), np.amin(temp), avg] 34 | 35 | # if not i == 0: 36 | # final_data[i, 3] = final_data[i, 1] - final_data[i - 1, 1] 37 | 38 | x_axis = batch_size * np.arange(0, final_data.shape[0]) 39 | 40 | plt.figure() 41 | plt.subplot(211) 42 | 43 | line_widths = [1, 2, 1, 0.5] 44 | line_colors = ['r', 'g', 'b', 'm'] 45 | texts = ['max', 'data', 'min', 'avg=' + str(avg)] 46 | for i in range(4): # derivative out 47 | if batch_size == 1 and ((not i == 1) or (not i == 3)): 48 | continue 49 | 50 | index = int((i + 5) * 0.1 * len(final_data[:, i])) 51 | plt.plot(x_axis, final_data[:, i], line_colors[i], linewidth=line_widths[i]) 52 | plt.text(0.05 * len(final_data[:, i]), (i + 1) * 0.1 * np.amax(final_data[:, 0]), 53 | texts[i], color=line_colors[i]) 54 | 55 | # plt.annotate(texts[i], xy=(x_axis[index], final_data[index, i]), 56 | # xytext=(x_axis[index], final_data[index, i] + int(np.amax(final_data) * 0.4)), 57 | # arrowprops=dict(facecolor=line_colors[i], shrink=0.05)) 58 | 59 | # plt.plot(x_axis, final_data[:, 0], 'r', linewidth = 1) 60 | # plt.plot(x_axis, final_data[:, 1], 'g') 61 | # plt.plot(x_axis, final_data[:, 2], 'b', linewidth = 1) 62 | # plt.plot(x_axis, final_data[:, 3], 'm--', linewidth = 0.5) 63 | 64 | plt.grid(True) 65 | plt.title(ntpath.basename(file_name) + "(" + str(batch_size) + " batch size)") 66 | plt.ylabel("Reward") 67 | plt.xlabel("Episode") 68 | 69 | # reduced_data, ignored = ignore_low_values(data) 70 | # reduced_data, ignored = ignore_starting_rewards(data) 71 | reduced_data, ignored = data, 0 72 | STAT_GROUPS = 20 73 | MAX_VALUE = np.amax(reduced_data) 74 | # statistics 75 | stats = np.zeros((STAT_GROUPS)) 76 | for i in reduced_data: 77 | index = int(i / ((MAX_VALUE + 1) / STAT_GROUPS)) 78 | stats[index] += 1 79 | 80 | #stats *=100/len(data) 81 | x_axis = ((MAX_VALUE + 1) / STAT_GROUPS) * np.arange(STAT_GROUPS) 82 | 
plt.subplot(212) 83 | plt.plot(x_axis, stats, 'go-') 84 | # plt.axis([0, MAX_VALUE+1]) 85 | plt.yscale("log") 86 | plt.grid(True) 87 | # plt.title("Statistics histogram") 88 | # plt.ylabel("%(ign "+ str(round(100*ignored/len(data)))+ '%)') 89 | plt.ylabel("Samples") 90 | plt.xlabel("Value") 91 | 92 | plt.show() 93 | 94 | # unstested 95 | 96 | 97 | def plot_surface(X, Y, Z): 98 | from mpl_toolkits.mplot3d import Axes3D 99 | import matplotlib.pyplot as plt 100 | from matplotlib import cm 101 | from matplotlib.ticker import LinearLocator, FormatStrFormatter 102 | import numpy as np 103 | 104 | fig = plt.figure(figsize=plt.figaspect(0.5)) 105 | ax = fig.add_subplot(1, 2, 1) 106 | 107 | t = plt.imshow(Z) 108 | 109 | t.set_cmap(cm.coolwarm) 110 | plt.colorbar() 111 | ax = fig.add_subplot(1, 2, 2, projection='3d') 112 | # Plot the surface. 113 | surf = ax.plot_surface(X, Y, Z, cmap=cm.coolwarm, 114 | linewidth=0, antialiased=False) 115 | # Add a color bar which maps values to colors. 116 | # fig.colorbars(surf, shrink=0.5, aspect=5) 117 | plt.tight_layout() 118 | plt.show() 119 | 120 | 121 | def ignore_starting_rewards(data, threshold=200): 122 | index = 0 123 | for i in range(len(data)): 124 | if data[i] >= threshold: 125 | index = i 126 | break 127 | return data[index:], index 128 | 129 | 130 | def ignore_low_values(data, threshold=200): 131 | res = np.extract(data > 200, data) 132 | return res, len(data) - len(res) 133 | 134 | 135 | def set_patameters_and_get_files(): 136 | import argparse as arg 137 | 138 | parser = arg.ArgumentParser(description="Plot given reward files") 139 | parser.add_argument("file", type=str, nargs='*', 140 | help="files to be plotted") 141 | parser.add_argument("-r", "--ratio", type=float, 142 | help="batch to sample size ratio. Default: 0.01") 143 | parser.add_argument("-f", "--directory", type=str, 144 | help="specify taret directory. Default: /") 145 | 146 | directory = parser.parse_args().directory 147 | if directory is not None: 148 | global DIRECTORY 149 | if directory[0] != '/': 150 | directory = '/' + directory 151 | DIRECTORY = directory 152 | # parser.add_argument("-e", "--extensions", type=str , 153 | # help="Extension to b searched. 
Default: .txt") 154 | # 155 | # ext = parser.parse_args().extensions 156 | # if ext is not None: 157 | # global EXTENSIONS 158 | # EXTENSIONS = ext 159 | 160 | files = parser.parse_args().file 161 | if len(files) == 0: 162 | files = get_all_txt_files() 163 | 164 | rat = parser.parse_args().ratio 165 | if rat is not None: 166 | global BATCH_RATIO 167 | BATCH_RATIO = rat 168 | 169 | print(parser.parse_args().ratio) 170 | return files 171 | 172 | 173 | def get_all_txt_files(): 174 | from os import listdir 175 | from os.path import isfile, join, dirname, realpath, splitext 176 | 177 | mypath = dirname(realpath(__file__)) + DIRECTORY 178 | onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))] 179 | txtfiles = [] 180 | for f in onlyfiles: 181 | if splitext(f)[1] in EXTENSIONS: 182 | txtfiles.append(mypath + "/" + f) 183 | return txtfiles 184 | -------------------------------------------------------------------------------- /src/ddpg/critic_net.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import math 4 | 5 | TAU = 0.001 6 | LEARNING_RATE = 0.001 7 | BATCH_SIZE = 64 8 | 9 | 10 | class CriticNet: 11 | """ Critic Q value model of the DDPG algorithm """ 12 | 13 | def __init__(self, num_states, num_actions): 14 | 15 | self.g = tf.Graph() 16 | with self.g.as_default(): 17 | self.sess = tf.InteractiveSession() 18 | 19 | # critic_q_model parameters: 20 | self.W1_c, self.B1_c, self.W2_c, self.W2_action_c, self.B2_c, self.W3_c, self.B3_c,\ 21 | self.critic_q_model, self.critic_state_in, self.critic_action_in = self.create_critic_net( 22 | num_states, num_actions) 23 | 24 | # create target_q_model: 25 | self.t_W1_c, self.t_B1_c, self.t_W2_c, self.t_W2_action_c, self.t_B2_c, self.t_W3_c, self.t_B3_c,\ 26 | self.t_critic_q_model, self.t_critic_state_in, self.t_critic_action_in = self.create_critic_net( 27 | num_states, num_actions) 28 | 29 | self.q_value_in = tf.placeholder("float", [None, 1]) # supervisor 30 | #self.l2_regularizer_loss = tf.nn.l2_loss(self.W1_c)+tf.nn.l2_loss(self.W2_c)+ tf.nn.l2_loss(self.W2_action_c) + tf.nn.l2_loss(self.W3_c)+tf.nn.l2_loss(self.B1_c)+tf.nn.l2_loss(self.B2_c)+tf.nn.l2_loss(self.B3_c) 31 | self.l2_regularizer_loss = 0.0001 * \ 32 | tf.reduce_sum(tf.pow(self.W2_c, 2)) + 0.0001 * tf.reduce_sum(tf.pow(self.B2_c, 2)) 33 | self.cost = tf.pow(self.critic_q_model - self.q_value_in, 2) / BATCH_SIZE + \ 34 | self.l2_regularizer_loss # /tf.to_float(tf.shape(self.q_value_in)[0]) 35 | self.optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(self.cost) 36 | 37 | # action gradient to be used in actor network: 38 | # self.action_gradients=tf.gradients(self.critic_q_model,self.critic_action_in) 39 | # from simple actor net: 40 | self.act_grad_v = tf.gradients(self.critic_q_model, self.critic_action_in) 41 | # this is just divided by batch size 42 | self.action_gradients = [self.act_grad_v[0] / 43 | tf.to_float(tf.shape(self.act_grad_v[0])[0])] 44 | # from simple actor net: 45 | self.check_fl = self.action_gradients 46 | 47 | # initialize all tensor variable parameters: 48 | self.sess.run(tf.global_variables_initializer()) 49 | 50 | self.update_target_critic_op = [ 51 | self.t_W1_c.assign(TAU * self.W1_c + (1 - TAU) * self.t_W1_c), 52 | self.t_B1_c.assign(TAU * self.B1_c + (1 - TAU) * self.t_B1_c), 53 | self.t_W2_c.assign(TAU * self.W2_c + (1 - TAU) * self.t_W2_c), 54 | self.t_W2_action_c.assign(TAU * self.W2_action_c + (1 - TAU) * self.t_W2_action_c), 55 | 
self.t_B2_c.assign(TAU * self.B2_c + (1 - TAU) * self.t_B2_c), 56 | self.t_W3_c.assign(TAU * self.W3_c + (1 - TAU) * self.t_W3_c), 57 | self.t_B3_c.assign(TAU * self.B3_c + (1 - TAU) * self.t_B3_c) 58 | ] 59 | # To make sure critic and target have same parmameters copy the parameters: 60 | # copy target parameters 61 | self.sess.run([ 62 | self.t_W1_c.assign(self.W1_c), 63 | self.t_B1_c.assign(self.B1_c), 64 | self.t_W2_c.assign(self.W2_c), 65 | self.t_W2_action_c.assign(self.W2_action_c), 66 | self.t_B2_c.assign(self.B2_c), 67 | self.t_W3_c.assign(self.W3_c), 68 | self.t_B3_c.assign(self.B3_c) 69 | ]) 70 | 71 | def create_critic_net(self, num_states=4, num_actions=1): 72 | N_HIDDEN_1 = 400 73 | N_HIDDEN_2 = 300 74 | critic_state_in = tf.placeholder("float", [None, num_states]) 75 | critic_action_in = tf.placeholder("float", [None, num_actions]) 76 | 77 | W1_c = tf.Variable(tf.random_uniform( 78 | [num_states, N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 79 | B1_c = tf.Variable(tf.random_uniform( 80 | [N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 81 | W2_c = tf.Variable(tf.random_uniform( 82 | [N_HIDDEN_1, N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1 + num_actions), 1 / math.sqrt(N_HIDDEN_1 + num_actions))) 83 | W2_action_c = tf.Variable(tf.random_uniform( 84 | [num_actions, N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1 + num_actions), 1 / math.sqrt(N_HIDDEN_1 + num_actions))) 85 | B2_c = tf.Variable(tf.random_uniform( 86 | [N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1 + num_actions), 1 / math.sqrt(N_HIDDEN_1 + num_actions))) 87 | W3_c = tf.Variable(tf.random_uniform([N_HIDDEN_2, 1], -0.003, 0.003)) 88 | B3_c = tf.Variable(tf.random_uniform([1], -0.003, 0.003)) 89 | 90 | H1_c = tf.nn.softplus(tf.matmul(critic_state_in, W1_c) + B1_c) 91 | H2_c = tf.nn.tanh(tf.matmul(H1_c, W2_c) + tf.matmul(critic_action_in, W2_action_c) + B2_c) 92 | 93 | critic_q_model = tf.matmul(H2_c, W3_c) + B3_c 94 | 95 | return W1_c, B1_c, W2_c, W2_action_c, B2_c, W3_c, B3_c, critic_q_model, critic_state_in, critic_action_in 96 | 97 | def train_critic(self, state_t_batch, action_batch, y_i_batch): 98 | self.sess.run(self.optimizer, feed_dict={ 99 | self.critic_state_in: state_t_batch, self.critic_action_in: action_batch, self.q_value_in: y_i_batch}) 100 | 101 | def evaluate_target_critic(self, state_t_1, action_t_1): 102 | return self.sess.run(self.t_critic_q_model, feed_dict={self.t_critic_state_in: state_t_1, self.t_critic_action_in: action_t_1}) 103 | 104 | def evaluate_critic(self, state_1, action_1): 105 | return self.sess.run(self.critic_q_model, feed_dict={self.critic_state_in: state_1, self.critic_action_in: action_1}) 106 | 107 | def compute_delQ_a(self, state_t, action_t): 108 | # print '\n' 109 | # print 'check grad number' 110 | # ch= self.sess.run(self.check_fl, feed_dict={self.critic_state_in: state_t,self.critic_action_in: action_t}) 111 | # print len(ch) 112 | # print len(ch[0]) 113 | # raw_input("Press Enter to continue...") 114 | return self.sess.run(self.action_gradients, feed_dict={self.critic_state_in: state_t, self.critic_action_in: action_t}) 115 | 116 | def update_target_critic(self): 117 | self.sess.run(self.update_target_critic_op) 118 | -------------------------------------------------------------------------------- /src/ddpg/actor_net_bn.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | import math 4 | import batch_norm 5 | import numpy as np 6 | LEARNING_RATE = 0.0001 7 | TAU = 0.001 8 | 
BATCH_SIZE = 64 9 | N_HIDDEN_1 = 400 10 | N_HIDDEN_2 = 300 11 | 12 | 13 | class ActorNet_bn: 14 | """ Actor Network Model with Batch Normalization of DDPG Algorithm """ 15 | 16 | def __init__(self, num_states, num_actions): 17 | tf.reset_default_graph() 18 | self.g = tf.Graph() 19 | with self.g.as_default(): 20 | self.sess = tf.InteractiveSession() 21 | 22 | # actor network model parameters: 23 | self.actor_state_in = tf.placeholder("float", [None, num_states]) 24 | self.W1_a = tf.Variable(tf.random_uniform( 25 | [num_states, N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 26 | self.B1_a = tf.Variable(tf.random_uniform( 27 | [N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 28 | self.W2_a = tf.Variable(tf.random_uniform( 29 | [N_HIDDEN_1, N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1), 1 / math.sqrt(N_HIDDEN_1))) 30 | self.B2_a = tf.Variable(tf.random_uniform( 31 | [N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1), 1 / math.sqrt(N_HIDDEN_1))) 32 | self.W3_a = tf.Variable(tf.random_uniform([N_HIDDEN_2, num_actions], -0.003, 0.003)) 33 | self.B3_a = tf.Variable(tf.random_uniform([num_actions], -0.003, 0.003)) 34 | 35 | self.is_training = tf.placeholder(tf.bool, []) 36 | self.H1_t = tf.matmul(self.actor_state_in, self.W1_a) 37 | self.H1_a_bn = batch_norm(self.H1_t, N_HIDDEN_1, self.is_training, self.sess) 38 | self.H1_a = tf.nn.softplus(self.H1_a_bn.bnorm) + self.B1_a 39 | 40 | self.H2_t = tf.matmul(self.H1_a, self.W2_a) 41 | self.H2_a_bn = batch_norm(self.H2_t, N_HIDDEN_2, self.is_training, self.sess) 42 | self.H2_a = tf.nn.tanh(self.H2_a_bn.bnorm) + self.B2_a 43 | self.actor_model = tf.matmul(self.H2_a, self.W3_a) + self.B3_a 44 | 45 | # target actor network model parameters: 46 | self.t_actor_state_in = tf.placeholder("float", [None, num_states]) 47 | self.t_W1_a = tf.Variable(tf.random_uniform( 48 | [num_states, N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 49 | self.t_B1_a = tf.Variable(tf.random_uniform( 50 | [N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 51 | self.t_W2_a = tf.Variable(tf.random_uniform( 52 | [N_HIDDEN_1, N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1), 1 / math.sqrt(N_HIDDEN_1))) 53 | self.t_B2_a = tf.Variable(tf.random_uniform( 54 | [N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1), 1 / math.sqrt(N_HIDDEN_1))) 55 | self.t_W3_a = tf.Variable(tf.random_uniform([N_HIDDEN_2, num_actions], -0.003, 0.003)) 56 | self.t_B3_a = tf.Variable(tf.random_uniform([num_actions], -0.003, 0.003)) 57 | 58 | self.t_is_training = tf.placeholder(tf.bool, []) 59 | self.t_H1_t = tf.matmul(self.t_actor_state_in, self.t_W1_a) 60 | self.t_H1_a_bn = batch_norm(self.t_H1_t, N_HIDDEN_1, 61 | self.t_is_training, self.sess, self.H1_a_bn) 62 | self.t_H1_a = tf.nn.softplus(self.t_H1_a_bn.bnorm) + self.t_B1_a 63 | 64 | self.t_H2_t = tf.matmul(self.t_H1_a, self.t_W2_a) 65 | self.t_H2_a_bn = batch_norm(self.t_H2_t, N_HIDDEN_2, 66 | self.t_is_training, self.sess, self.H2_a_bn) 67 | self.t_H2_a = tf.nn.tanh(self.t_H2_a_bn.bnorm) + self.t_B2_a 68 | self.t_actor_model = tf.matmul(self.t_H2_a, self.t_W3_a) + self.t_B3_a 69 | 70 | # cost of actor network: 71 | # gets input from action_gradient computed in critic network file 72 | self.q_gradient_input = tf.placeholder("float", [None, num_actions]) 73 | self.actor_parameters = [self.W1_a, self.B1_a, self.W2_a, self.B2_a, self.W3_a, 74 | self.B3_a, self.H1_a_bn.scale, self.H1_a_bn.beta, self.H2_a_bn.scale, self.H2_a_bn.beta] 75 | # /BATCH_SIZE) changed -self.q_gradient to - 76 | self.parameters_gradients = 
tf.gradients( 77 | self.actor_model, self.actor_parameters, -self.q_gradient_input) 78 | 79 | self.optimizer = tf.train.AdamOptimizer( 80 | learning_rate=LEARNING_RATE, epsilon=1e-08).apply_gradients(zip(self.parameters_gradients, self.actor_parameters)) 81 | # initialize all tensor variable parameters: 82 | self.sess.run(tf.initialize_all_variables()) 83 | 84 | # To make sure actor and target have same intial parmameters copy the parameters: 85 | # copy target parameters 86 | self.sess.run([ 87 | self.t_W1_a.assign(self.W1_a), 88 | self.t_B1_a.assign(self.B1_a), 89 | self.t_W2_a.assign(self.W2_a), 90 | self.t_B2_a.assign(self.B2_a), 91 | self.t_W3_a.assign(self.W3_a), 92 | self.t_B3_a.assign(self.B3_a)]) 93 | 94 | def evaluate_actor(self, state_t): 95 | return self.sess.run(self.actor_model, feed_dict={self.actor_state_in: state_t, self.is_training: False}) 96 | 97 | def evaluate_target_actor(self, state_t_1): 98 | return self.sess.run(self.t_actor_model, feed_dict={self.t_actor_state_in: state_t_1, self.t_is_training: False}) 99 | 100 | def train_actor(self, actor_state_in, q_gradient_input): 101 | self.sess.run([self.optimizer, self.H1_a_bn.train_mean, self.H1_a_bn.train_var, self.H2_a_bn.train_mean, self.H2_a_bn.train_var, self.t_H1_a_bn.train_mean, self.t_H1_a_bn.train_var, self.t_H2_a_bn.train_mean, 102 | self.t_H2_a_bn.train_var], feed_dict={self.actor_state_in: actor_state_in, self.t_actor_state_in: actor_state_in, self.q_gradient_input: q_gradient_input, self.is_training: True, self.t_is_training: True}) 103 | 104 | def update_target_actor(self): 105 | self.sess.run([ 106 | self.t_W1_a.assign(TAU * self.W1_a + (1 - TAU) * self.t_W1_a), 107 | self.t_B1_a.assign(TAU * self.B1_a + (1 - TAU) * self.t_B1_a), 108 | self.t_W2_a.assign(TAU * self.W2_a + (1 - TAU) * self.t_W2_a), 109 | self.t_B2_a.assign(TAU * self.B2_a + (1 - TAU) * self.t_B2_a), 110 | self.t_W3_a.assign(TAU * self.W3_a + (1 - TAU) * self.t_W3_a), 111 | self.t_B3_a.assign(TAU * self.B3_a + (1 - TAU) * self.t_B3_a), 112 | self.t_H1_a_bn.updateTarget, 113 | self.t_H2_a_bn.updateTarget, 114 | ]) 115 | -------------------------------------------------------------------------------- /src/ddpg/critic_net_bn.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import math 3 | from batch_norm import * 4 | import numpy as np 5 | LEARNING_RATE = 0.001 6 | TAU = 0.001 7 | BATCH_SIZE = 64 8 | N_HIDDEN_1 = 400 9 | N_HIDDEN_2 = 300 10 | 11 | 12 | class CriticNet_bn: 13 | """ Critic Q value model with batch normalization of the DDPG algorithm """ 14 | 15 | def __init__(self, num_states, num_actions): 16 | 17 | tf.reset_default_graph() 18 | self.g = tf.Graph() 19 | with self.g.as_default(): 20 | self.sess = tf.InteractiveSession() 21 | 22 | # Critic Q Network: 23 | self.critic_state_in = tf.placeholder("float", [None, num_states]) 24 | self.critic_action_in = tf.placeholder("float", [None, num_actions]) 25 | self.W1_c = tf.Variable(tf.random_uniform( 26 | [num_states, N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 27 | self.B1_c = tf.Variable(tf.random_uniform( 28 | [N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 29 | self.W2_c = tf.Variable(tf.random_uniform( 30 | [N_HIDDEN_1, N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1 + num_actions), 1 / math.sqrt(N_HIDDEN_1 + num_actions))) 31 | self.B2_c = tf.Variable(tf.random_uniform( 32 | [N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1 + num_actions), 1 / math.sqrt(N_HIDDEN_1 + num_actions))) 33 | 
self.W2_action_c = tf.Variable(tf.random_uniform( 34 | [num_actions, N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1 + num_actions), 1 / math.sqrt(N_HIDDEN_1 + num_actions))) 35 | self.W3_c = tf.Variable(tf.random_uniform([N_HIDDEN_2, 1], -0.003, 0.003)) 36 | self.B3_c = tf.Variable(tf.random_uniform([1], -0.003, 0.003)) 37 | 38 | self.is_training = tf.placeholder(tf.bool, []) 39 | self.H1_t = tf.matmul(self.critic_state_in, self.W1_c) 40 | self.H1_c_bn = batch_norm(self.H1_t, N_HIDDEN_1, self.is_training, self.sess) 41 | 42 | self.H1_c = tf.nn.softplus(self.H1_c_bn.bnorm) + self.B1_c 43 | 44 | self.H2_t = tf.matmul(self.H1_c, self.W2_c) + \ 45 | tf.matmul(self.critic_action_in, self.W2_action_c) 46 | self.H2_c_bn = batch_norm(self.H2_t, N_HIDDEN_2, self.is_training, self.sess) 47 | self.H2_c = tf.nn.tanh(self.H2_c_bn.bnorm) + self.B2_c 48 | 49 | self.critic_q_model = tf.matmul(self.H2_c, self.W3_c) + self.B3_c 50 | 51 | # Target Critic Q Network: 52 | self.t_critic_state_in = tf.placeholder("float", [None, num_states]) 53 | self.t_critic_action_in = tf.placeholder("float", [None, num_actions]) 54 | self.t_W1_c = tf.Variable(tf.random_uniform( 55 | [num_states, N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 56 | self.t_B1_c = tf.Variable(tf.random_uniform( 57 | [N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 58 | self.t_W2_c = tf.Variable(tf.random_uniform( 59 | [N_HIDDEN_1, N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1 + num_actions), 1 / math.sqrt(N_HIDDEN_1 + num_actions))) 60 | self.t_W2_action_c = tf.Variable(tf.random_uniform( 61 | [num_actions, N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1 + num_actions), 1 / math.sqrt(N_HIDDEN_1 + num_actions))) 62 | self.t_B2_c = tf.Variable(tf.random_uniform( 63 | [N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1 + num_actions), 1 / math.sqrt(N_HIDDEN_1 + num_actions))) 64 | self.t_W3_c = tf.Variable(tf.random_uniform([N_HIDDEN_2, 1], -0.003, 0.003)) 65 | self.t_B3_c = tf.Variable(tf.random_uniform([1], -0.003, 0.003)) 66 | 67 | self.t_H1_t = tf.matmul(self.t_critic_state_in, self.t_W1_c) 68 | self.t_H1_c_bn = batch_norm(self.t_H1_t, N_HIDDEN_1, 69 | self.is_training, self.sess, self.H1_c_bn) 70 | self.t_H1_c = tf.nn.softplus(self.t_H1_c_bn.bnorm) + self.t_B1_c 71 | 72 | self.t_H2_t = tf.matmul(self.t_H1_c, self.t_W2_c) + \ 73 | tf.matmul(self.t_critic_action_in, self.t_W2_action_c) 74 | self.t_H2_c_bn = batch_norm(self.t_H2_t, N_HIDDEN_2, 75 | self.is_training, self.sess, self.H2_c_bn) 76 | self.t_H2_c = tf.nn.tanh(self.t_H2_c_bn.bnorm) + self.t_B2_c 77 | 78 | self.t_critic_q_model = tf.matmul(self.t_H2_c, self.t_W3_c) + self.t_B3_c 79 | 80 | self.q_value_in = tf.placeholder("float", [None, 1]) # supervisor 81 | #self.l2_regularizer_loss = tf.nn.l2_loss(self.W1_c)+tf.nn.l2_loss(self.W2_c)+ tf.nn.l2_loss(self.W2_action_c) + tf.nn.l2_loss(self.W3_c)+tf.nn.l2_loss(self.B1_c)+tf.nn.l2_loss(self.B2_c)+tf.nn.l2_loss(self.B3_c) 82 | self.l2_regularizer_loss = 0.0001 * tf.reduce_sum(tf.pow(self.W2_c, 2)) 83 | self.cost = tf.pow(self.critic_q_model - self.q_value_in, 2) / BATCH_SIZE + \ 84 | self.l2_regularizer_loss # /tf.to_float(tf.shape(self.q_value_in)[0]) 85 | self.optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(self.cost) 86 | self.act_grad_v = tf.gradients(self.critic_q_model, self.critic_action_in) 87 | # this is just divided by batch size 88 | self.action_gradients = [self.act_grad_v[0] / 89 | tf.to_float(tf.shape(self.act_grad_v[0])[0])] 90 | # from simple actor net: 91 | self.check_fl = self.action_gradients 
92 | 93 | # initialize all tensor variable parameters: 94 | self.sess.run(tf.initialize_all_variables()) 95 | 96 | # To initialize critic and target with the same values: 97 | # copy target parameters 98 | self.sess.run([ 99 | self.t_W1_c.assign(self.W1_c), 100 | self.t_B1_c.assign(self.B1_c), 101 | self.t_W2_c.assign(self.W2_c), 102 | self.t_W2_action_c.assign(self.W2_action_c), 103 | self.t_B2_c.assign(self.B2_c), 104 | self.t_W3_c.assign(self.W3_c), 105 | self.t_B3_c.assign(self.B3_c) 106 | ]) 107 | 108 | def train_critic(self, state_t_batch, action_batch, y_i_batch): 109 | self.sess.run([self.optimizer, self.H1_c_bn.train_mean, self.H1_c_bn.train_var, self.H2_c_bn.train_mean, self.H2_c_bn.train_var, self.t_H1_c_bn.train_mean, self.t_H1_c_bn.train_var, self.t_H2_c_bn.train_mean, self.t_H2_c_bn.train_var], feed_dict={ 110 | self.critic_state_in: state_t_batch, self.t_critic_state_in: state_t_batch, self.critic_action_in: action_batch, self.t_critic_action_in: action_batch, self.q_value_in: y_i_batch, self.is_training: True}) 111 | 112 | def evaluate_target_critic(self, state_t_1, action_t_1): 113 | return self.sess.run(self.t_critic_q_model, feed_dict={self.t_critic_state_in: state_t_1, self.t_critic_action_in: action_t_1, self.is_training: False}) 114 | 115 | def compute_delQ_a(self, state_t, action_t): 116 | return self.sess.run(self.action_gradients, feed_dict={self.critic_state_in: state_t, self.critic_action_in: action_t, self.is_training: False}) 117 | 118 | def update_target_critic(self): 119 | self.sess.run([ 120 | self.t_W1_c.assign(TAU * self.W1_c + (1 - TAU) * self.t_W1_c), 121 | self.t_B1_c.assign(TAU * self.B1_c + (1 - TAU) * self.t_B1_c), 122 | self.t_W2_c.assign(TAU * self.W2_c + (1 - TAU) * self.t_W2_c), 123 | self.t_W2_action_c.assign(TAU * self.W2_action_c + (1 - TAU) * self.t_W2_action_c), 124 | self.t_B2_c.assign(TAU * self.B2_c + (1 - TAU) * self.t_B2_c), 125 | self.t_W3_c.assign(TAU * self.W3_c + (1 - TAU) * self.t_W3_c), 126 | self.t_B3_c.assign(TAU * self.B3_c + (1 - TAU) * self.t_B3_c), 127 | self.t_H1_c_bn.updateTarget, 128 | self.t_H2_c_bn.updateTarget 129 | ]) 130 | -------------------------------------------------------------------------------- /src/ddpg/agent.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | from util import * 5 | 6 | import gym 7 | from gym.spaces import Box, Discrete 8 | 9 | from actor_net import ActorNet 10 | from critic_net import CriticNet 11 | from actor_net_bn import ActorNet_bn 12 | from critic_net_bn import CriticNet_bn 13 | from tensorflow_grad_inverter import grad_inverter 14 | 15 | from collections import deque 16 | 17 | MAX_ACTION_SPACE_SIZE = 1e6 18 | 19 | 20 | class Agent: 21 | 22 | def __init__(self, env): 23 | # checking state space 24 | if isinstance(env.observation_space, Box): 25 | self.observation_space_size = env.observation_space.shape[0] 26 | else: 27 | self.observation_space_size = env.observation_space.n 28 | 29 | # checking action space 30 | if isinstance(env.action_space, Box): 31 | self.action_space_size = env.action_space.shape[0] 32 | self.continious_action_space = True 33 | self.low = env.action_space.low 34 | self.high = env.action_space.high 35 | else: 36 | self.action_space_size = env.action_space.n 37 | self.continious_action_space = False 38 | self.low = 0 39 | self.high = env.action_space.n 40 | 41 | def act(self, state): 42 | pass 43 | 44 | def observe(self, episode): 45 | pass 46 | 47 | # shaping input states and 
actions 48 | def _np_shaping(self, array, is_state): 49 | 50 | number_of_elements = array.shape[0] if len(array.shape) > 1 else 1 51 | size_of_element = self.observation_space_size if is_state else self.action_space_size 52 | 53 | res = np.array(array) 54 | res.shape = (number_of_elements, size_of_element) 55 | return res 56 | 57 | def get_name(self): 58 | return 'Agent' 59 | 60 | 61 | class RandomAgent(Agent): 62 | 63 | def act(self, state): 64 | if self.continious_action_space: 65 | res = self.low + (self.high - self.low) * np.random.uniform(size=len(self.low)) 66 | return res 67 | else: 68 | return random.randint(self.low, self.high - 1) 69 | 70 | def get_name(self): 71 | return 'Random' + super().get_name() 72 | 73 | 74 | class DiscreteRandomAgent(RandomAgent): 75 | 76 | def __init__(self, env, max_actions=10): 77 | super().__init__(env) 78 | if self.continious_action_space: 79 | self.discrete_actions = np.linspace(self.low, self.high, max_actions) 80 | else: 81 | self.discrete_actions = np.arange(self.low, self.high) 82 | self.discrete_actions = list(self.discrete_actions) 83 | 84 | def act(self, state): 85 | return random.sample(self.discrete_actions, 1)[0] 86 | 87 | def get_name(self): 88 | return 'Discrete' + super().get_name() 89 | 90 | 91 | class DDPGAgent(Agent): 92 | ''' stevenpjg's implementation of DDPG algorithm ''' 93 | 94 | REPLAY_MEMORY_SIZE = 10000 95 | BATCH_SIZE = 64 96 | GAMMA = 0.99 97 | 98 | def __init__(self, env, is_batch_norm=False, is_grad_inverter=True): 99 | super().__init__(env) 100 | if is_batch_norm: 101 | self.critic_net = CriticNet_bn(self.observation_space_size, 102 | self.action_space_size) 103 | self.actor_net = ActorNet_bn(self.observation_space_size, 104 | self.action_space_size) 105 | 106 | else: 107 | self.critic_net = CriticNet(self.observation_space_size, 108 | self.action_space_size) 109 | self.actor_net = ActorNet(self.observation_space_size, 110 | self.action_space_size) 111 | 112 | self.is_grad_inverter = is_grad_inverter 113 | self.replay_memory = deque() 114 | 115 | self.time_step = 0 116 | 117 | action_max = np.array(env.action_space.high).tolist() 118 | action_min = np.array(env.action_space.low).tolist() 119 | action_bounds = [action_max, action_min] 120 | self.grad_inv = grad_inverter(action_bounds) 121 | 122 | def add_data_fetch(self, df): 123 | self.data_fetch = df 124 | self.data_fetch.add_timers(['ev_p_t', 'ev_q_t', 'y', 125 | 'train_q', 'train_p', 126 | 'up_q_t', 'up_p_t'], prefix='t_agent_training_') 127 | 128 | def get_name(self): 129 | return 'DDPG' + super().get_name() 130 | 131 | def act(self, state): 132 | state = self._np_shaping(state, True) 133 | return self.actor_net.evaluate_actor(state).astype(float) 134 | 135 | def observe(self, episode): 136 | episode['obs'] = self._np_shaping(episode['obs'], True) 137 | episode['action'] = self._np_shaping(episode['action'], False) 138 | episode['obs2'] = self._np_shaping(episode['obs2'], True) 139 | self.add_experience(episode) 140 | 141 | def add_experience(self, episode): 142 | self.replay_memory.append(episode) 143 | 144 | self.time_step += 1 145 | if len(self.replay_memory) > type(self).REPLAY_MEMORY_SIZE: 146 | self.replay_memory.popleft() 147 | 148 | if len(self.replay_memory) > type(self).BATCH_SIZE: 149 | res = self.train() 150 | return res 151 | else: 152 | return None 153 | 154 | def minibatches(self): 155 | batch = random.sample(self.replay_memory, type(self).BATCH_SIZE) 156 | # state t 157 | state = self._np_shaping(np.array([item['obs'] for item in batch]), True) 158 | 
# action 159 | action = self._np_shaping(np.array([item['action'] for item in batch]), False) 160 | # reward 161 | reward = np.array([item['reward'] for item in batch]) 162 | # state t+1 163 | state_2 = self._np_shaping(np.array([item['obs2'] for item in batch]), True) 164 | # doneA 165 | done = np.array([item['done'] for item in batch]) 166 | 167 | return state, action, reward, state_2, done 168 | 169 | def train(self): 170 | # sample a random minibatch of N transitions from R 171 | state, action, reward, state_2, done = self.minibatches() 172 | 173 | actual_batch_size = len(state) 174 | 175 | self.data_fetch.reset_timers() 176 | target_action = self.actor_net.evaluate_target_actor(state) 177 | self.data_fetch.sample_timer('ev_p_t') # ------ 178 | 179 | # Q'(s_i+1,a_i+1) 180 | q_t = self.critic_net.evaluate_target_critic(state_2, target_action) 181 | self.data_fetch.sample_timer('ev_q_t') # ------ 182 | 183 | y = [] # fix initialization of y 184 | for i in range(0, actual_batch_size): 185 | 186 | if done[i]: 187 | y.append(reward[i]) 188 | else: 189 | y.append(reward[i] + type(self).GAMMA * q_t[i][0]) # q_t+1 instead of q_t 190 | 191 | y = np.reshape(np.array(y), [len(y), 1]) 192 | self.data_fetch.sample_timer('y') # ------ 193 | 194 | # Update critic by minimizing the loss 195 | self.critic_net.train_critic(state, action, y) 196 | self.data_fetch.sample_timer('train_q') # ------ 197 | # Update actor proportional to the gradients: 198 | # action_for_delQ = self.act(state) # was self.evaluate_actor instead of self.act 199 | action_for_delQ = self.actor_net.evaluate_actor(state) # dont need wolp action 200 | 201 | if self.is_grad_inverter: 202 | del_Q_a = self.critic_net.compute_delQ_a(state, action_for_delQ) # /BATCH_SIZE 203 | del_Q_a = self.grad_inv.invert(del_Q_a, action_for_delQ) 204 | else: 205 | del_Q_a = self.critic_net.compute_delQ_a(state, action_for_delQ)[0] # /BATCH_SIZE 206 | 207 | # train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters: 208 | self.actor_net.train_actor(state, del_Q_a) 209 | self.data_fetch.sample_timer('train_p') # ------ 210 | 211 | # Update target Critic and actor network 212 | self.critic_net.update_target_critic() 213 | self.data_fetch.sample_timer('up_q_t') # ------ 214 | self.actor_net.update_target_actor() 215 | self.data_fetch.sample_timer('up_p_t') # ------ 216 | --------------------------------------------------------------------------------
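Putting the pieces together, main.py drives the agent with the standard Gym loop: the agent proposes an action for the current observation, the environment steps, and the resulting transition is handed to agent.observe(), which stores it in the replay memory and runs a DDPG training step once enough samples have accumulated. A condensed sketch of that loop, mirroring main.py but leaving out its bookkeeping and timers (environment name and episode count are placeholders; InvertedPendulum-v1, as used in main.py, requires MuJoCo and the older Gym API this code base targets):

import gym

from util.data import Data
from wolp_agent import WolpertingerAgent

env = gym.make('InvertedPendulum-v1')
agent = WolpertingerAgent(env, max_actions=100, k_nearest_neighbors=10)
agent.add_data_fetch(Data('example_run'))  # the agent logs its training timers here

for episode in range(10):
    observation = env.reset()
    done = False
    t = 0
    while not done:
        action = agent.act(observation)
        next_observation, reward, done, info = env.step(action)
        agent.observe({'obs': observation, 'action': action, 'reward': reward,
                       'obs2': next_observation, 'done': done, 't': t})
        observation = next_observation
        t += 1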