├── src
    ├── __init__.py
    ├── ddpg
    │   ├── __init__.py
    │   ├── tensorflow_grad_inverter.py
    │   ├── batch_norm.py
    │   ├── actor_net.py
    │   ├── critic_net.py
    │   ├── actor_net_bn.py
    │   ├── critic_net_bn.py
    │   └── agent.py
    ├── util
    │   ├── __init__.py
    │   ├── my_plotlib.py
    │   ├── timer.py
    │   ├── agent_data.py
    │   ├── data.py
    │   └── data_graph.py
    ├── wolp_agent.py
    └── main.py
├── .gitignore
└── README.md

/src/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import inspect
4 | 
5 | 
6 | cmd_subfolder = os.path.split(inspect.getfile(inspect.currentframe()))[0]
7 | if cmd_subfolder not in sys.path:
8 |     sys.path.insert(0, cmd_subfolder)
9 | 
--------------------------------------------------------------------------------
/src/ddpg/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import inspect
4 | 
5 | 
6 | cmd_subfolder = os.path.split(inspect.getfile(inspect.currentframe()))[0]
7 | if cmd_subfolder not in sys.path:
8 |     sys.path.insert(0, cmd_subfolder)
9 | 
--------------------------------------------------------------------------------
/src/util/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import inspect
4 | 
5 | 
6 | cmd_subfolder = os.path.split(inspect.getfile(inspect.currentframe()))[0]
7 | if cmd_subfolder not in sys.path:
8 |     sys.path.insert(0, cmd_subfolder)
9 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | # tests
3 | tests
4 | test_*.py
5 | .cache
6 | 
7 | # build dirs
8 | __pycache__
9 | 
10 | # .py files
11 | clipboard.py
12 | example.py
13 | 
14 | # .pyc files
15 | *.pyc
16 | 
17 | 
18 | # results
19 | results
20 | data
21 | default_name
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Deep-Reinforcement-Learning-in-Large-Discrete-Action-Spaces
2 | Link to [paper](https://arxiv.org/abs/1512.07679)
3 | 
4 | Implementation of the algorithm in Python 3, using TensorFlow and OpenAI Gym.
5 | 
6 | 
7 | 
8 | This paper introduces the Wolpertinger training algorithm, which extends the Deep Deterministic Policy Gradient (DDPG) training algorithm introduced in [this](https://arxiv.org/abs/1509.02971) paper. I extended stevenpjg's implementation of the DDPG algorithm, found [here](https://github.com/stevenpjg/ddpg-aigym), which is licensed under the MIT license.
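At a glance, the Wolpertinger policy asks the DDPG actor for a continuous "proto-action", looks up its k nearest neighbours in the discretized action set (done with pyflann in `src/wolp_agent.py`), and then lets the critic pick the highest-valued of those candidates. A minimal sketch of that selection step, using brute-force nearest neighbours instead of FLANN and illustrative function names rather than the classes in this repository:

```python
import numpy as np

def wolpertinger_action(actor, critic, state, discrete_actions, k=10):
    # 1. continuous proto-action from the actor network
    proto_action = actor(state)
    # 2. k nearest discrete actions (brute force here; the repo uses a FLANN k-d tree)
    distances = np.linalg.norm(discrete_actions - proto_action, axis=1)
    candidates = discrete_actions[np.argsort(distances)[:k]]
    # 3. score the candidates with the critic and keep the best one
    q_values = [critic(state, action) for action in candidates]
    return candidates[int(np.argmax(q_values))]
```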
9 | -------------------------------------------------------------------------------- /src/ddpg/tensorflow_grad_inverter.py: -------------------------------------------------------------------------------- 1 | #Reference: 2 | #https://github.com/MOCR/ 3 | 4 | import tensorflow as tf 5 | 6 | 7 | 8 | class grad_inverter: 9 | def __init__(self, action_bounds): 10 | 11 | self.sess = tf.InteractiveSession() 12 | 13 | self.action_size = len(action_bounds[0]) 14 | 15 | self.action_input = tf.placeholder(tf.float32, [None, self.action_size]) 16 | self.pmax = tf.constant(action_bounds[0], dtype = tf.float32) 17 | self.pmin = tf.constant(action_bounds[1], dtype = tf.float32) 18 | self.prange = tf.constant([x - y for x, y in zip(action_bounds[0],action_bounds[1])], dtype = tf.float32) 19 | self.pdiff_max = tf.div(-self.action_input+self.pmax, self.prange) 20 | self.pdiff_min = tf.div(self.action_input - self.pmin, self.prange) 21 | self.zeros_act_grad_filter = tf.zeros([self.action_size]) 22 | self.act_grad = tf.placeholder(tf.float32, [None, self.action_size]) 23 | self.grad_inverter = tf.where(tf.greater(self.act_grad, self.zeros_act_grad_filter), tf.multiply(self.act_grad, self.pdiff_max), tf.multiply(self.act_grad, self.pdiff_min)) 24 | 25 | def invert(self, grad, action): 26 | 27 | 28 | return self.sess.run(self.grad_inverter, feed_dict = {self.action_input: action, self.act_grad: grad[0]}) 29 | -------------------------------------------------------------------------------- /src/ddpg/batch_norm.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | decay = 0.95 3 | TAU = 0.001 4 | 5 | 6 | class Batch_norm: 7 | def __init__(self, inputs, size, is_training, sess, parForTarget=None, bn_param=None): 8 | 9 | self.sess = sess 10 | self.scale = tf.Variable(tf.random_uniform([size], 0.9, 1.1)) 11 | self.beta = tf.Variable(tf.random_uniform([size], -0.03, 0.03)) 12 | self.pop_mean = tf.Variable(tf.random_uniform([size], -0.03, 0.03), trainable=False) 13 | self.pop_var = tf.Variable(tf.random_uniform([size], 0.9, 1.1), trainable=False) 14 | self.batch_mean, self.batch_var = tf.nn.moments(inputs, [0]) 15 | self.train_mean = tf.assign(self.pop_mean, self.pop_mean * 16 | decay + self.batch_mean * (1 - decay)) 17 | self.train_var = tf.assign(self.pop_var, self.pop_var * 18 | decay + self.batch_var * (1 - decay)) 19 | 20 | def training(): 21 | return tf.nn.batch_normalization(inputs, 22 | self.batch_mean, self.batch_var, self.beta, self.scale, 0.0000001) 23 | 24 | def testing(): 25 | return tf.nn.batch_normalization(inputs, 26 | self.pop_mean, self.pop_var, self.beta, self.scale, 0.0000001) 27 | 28 | if parForTarget != None: 29 | self.parForTarget = parForTarget 30 | self.updateScale = self.scale.assign( 31 | self.scale * (1 - TAU) + self.parForTarget.scale * TAU) 32 | self.updateBeta = self.beta.assign(self.beta * (1 - TAU) + self.parForTarget.beta * TAU) 33 | self.updateTarget = tf.group(self.updateScale, self.updateBeta) 34 | 35 | self.bnorm = tf.cond(is_training, training, testing) 36 | -------------------------------------------------------------------------------- /src/util/my_plotlib.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | class Line: 6 | def __init__(self, x, y, line_width=1, line_color='black', text='', style='-'): 7 | self.x = x 8 | self.y = y 9 | self.width = line_width 10 | self.color = line_color 11 | self.text = text 12 | 
self.style = style 13 | 14 | def plot(self, fig=None, k=2): 15 | if fig is None: 16 | plt.figure() 17 | plt.grid(True) 18 | plt.ylabel("y") 19 | plt.xlabel("x") 20 | 21 | max_y, min_y = self.y_range() 22 | 23 | plt.plot(self.x, self.y, self.color, linewidth=self.width, linestyle=self.style) 24 | plt.text(0.05 * len(self.y), k * 0.1 * (max_y - min_y), 25 | self.text, color=self.color) 26 | 27 | if fig is None: 28 | plt.show() 29 | 30 | def y_range(self): 31 | return np.amin(self.y), np.amax(self.y) 32 | 33 | 34 | class Function(Line): 35 | 36 | def __init__(self, x, func, line_width=1, line_color='black', text='', style='-'): 37 | y = [func(i) for i in x] 38 | super().__init__(x, y, line_width, line_color, text, style) 39 | 40 | 41 | class Constant(Line): 42 | 43 | def __init__(self, x, c, line_width=1, line_color='black', text='', style='-'): 44 | x = [x[0], x[len(x) - 1]] 45 | y = [c] * len(x) 46 | super().__init__(x, y, line_width, line_color, text, style) 47 | 48 | 49 | def plot_lines(lines, seps=None, grid_flag=True): 50 | fig = plt.figure() 51 | plt.grid(grid_flag) 52 | plt.ylabel("y") 53 | plt.xlabel("x") 54 | max_y = [] 55 | min_y = [] 56 | count = 0 57 | for line in lines: 58 | count += 1 59 | line.plot(fig=fig, k=count) 60 | temp = line.y_range() 61 | min_y.append(temp[0]) 62 | max_y.append(temp[1]) 63 | 64 | min_y = np.amin(min_y) 65 | max_y = np.amax(max_y) 66 | 67 | if seps is not None: 68 | for s in seps: 69 | plt.plot([s - 0.001, s + 0.001], [min_y, max_y], 'r', linewidth=0.5) 70 | 71 | plt.show() 72 | -------------------------------------------------------------------------------- /src/util/timer.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | 3 | 4 | class Timer: 5 | 6 | def __init__(self, one_hot=True): 7 | self.reset() 8 | self.one_hot = one_hot 9 | 10 | def reset(self): 11 | self.now = Timer._get_current_milis() 12 | 13 | def reset_one_hot(self): 14 | if self.one_hot: 15 | self.reset() 16 | 17 | def get_time(self, reset=False): 18 | return Timer._get_current_milis() - self.now 19 | 20 | @staticmethod 21 | def _get_current_milis(): 22 | return int(round(time() * 1000)) 23 | 24 | 25 | class Time_stats: 26 | 27 | def __init__(self, name, fields, one_active=True): 28 | self.name = name 29 | self.count = 0 30 | self.one_active = one_active 31 | self.values = {} 32 | self.timers = {} 33 | for str in fields: 34 | self.values[str] = 0 35 | self.timers[str] = Timer() 36 | 37 | def start(self, field): 38 | self.timers[field].reset() 39 | 40 | def add_time(self, field): 41 | self.values[field] += self.timers[field].get_time() 42 | if self.one_active: 43 | self.reset_timers() 44 | 45 | def increase_count(self, n=1): 46 | self.count += n 47 | 48 | def set_count(self, n): 49 | self.count = n 50 | 51 | def get_count(self): 52 | return self.count 53 | 54 | def reset_timers(self): 55 | for key in self.timers.keys(): 56 | self.start(key) 57 | 58 | def reset_values(self): 59 | for key in self.values.keys(): 60 | self.values[key] = 0 61 | 62 | def get_total(self): 63 | total = 0 64 | for key in self.values.keys(): 65 | total += self.values[key] 66 | return total 67 | 68 | def print_stats(self): 69 | print('\nName: {}\tCount: {}'.format(self.name, self.count)) 70 | print('key\t\tabs\t\tavg/unit\t% of total') 71 | print('-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-') 72 | 73 | keys = list(self.values.keys()) 74 | keys.sort() 75 | total_time = max(self.get_total(), 1) 76 | count = max(self.count, 1) 77 | for key in 
keys: 78 | temp = self.values[key] 79 | avg = temp / count 80 | total = 100 * temp / total_time 81 | print('{}\t\t{}\t\t{:6.2f}\t\t{:6.2f}'.format( 82 | key, temp, avg, total)) 83 | 84 | total_time = self.get_total() 85 | print('Total\t\t{}\t\t{:6.2f}\t\t 100.0'.format(total_time, total_time / count)) 86 | -------------------------------------------------------------------------------- /src/wolp_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pyflann 3 | 4 | from ddpg import agent 5 | 6 | 7 | class WolpertingerAgent(agent.DDPGAgent): 8 | 9 | def __init__(self, env, max_actions=1e6, k_nearest_neighbors=10): 10 | super().__init__(env) 11 | if self.continious_action_space: 12 | self.actions = np.linspace(self.low, self.high, max_actions) 13 | else: 14 | self.actions = np.arange(self.low, self.high) 15 | # self.actions = list(self.actions) 16 | self.k_nearest_neighbors = k_nearest_neighbors 17 | print('wolpertinger agent init') 18 | print('max actions = ', max_actions) 19 | print('k nearest neighbors =', k_nearest_neighbors) 20 | # init flann 21 | self.actions.shape = (len(self.actions), self.action_space_size) 22 | self.flann = pyflann.FLANN() 23 | params = self.flann.build_index(self.actions, algorithm='kdtree') 24 | print('flann init with params->', params) 25 | 26 | def get_name(self): 27 | return 'Wolp_v1_k' + str(self.k_nearest_neighbors) + '_' + super().get_name() 28 | 29 | def act(self, state): 30 | proto_action = super().act(state) 31 | if self.k_nearest_neighbors <= 1: 32 | return proto_action 33 | 34 | if len(proto_action) > 1: 35 | return 0 36 | res = np.array([]) 37 | for i in range(len(proto_action)): 38 | res = np.append(res, self.wolp_action(state[i], proto_action[i])) 39 | res.shape = (len(res), 1) 40 | return res 41 | else: 42 | return self.wolp_action(state, proto_action) 43 | 44 | def wolp_action(self, state, proto_action): 45 | debug = False 46 | actions = self.nearest_neighbors(proto_action)[0] 47 | if debug: 48 | print('--\nproto action', proto_action, 'state', state) 49 | states = np.tile(state, [len(actions), 1]) 50 | actions_evaluation = self.critic_net.evaluate_critic(states, actions) 51 | if debug: 52 | print('action evalueations', actions_evaluation.shape) 53 | if debug: 54 | for i in range(len(actions)): 55 | print(actions[i], 'v', actions_evaluation[i]) 56 | 57 | max_index = np.argmax(actions_evaluation) 58 | max = actions_evaluation[max_index] 59 | if debug: 60 | print('max', max, '->', max_index) 61 | if debug: 62 | print('result action', actions[max_index]) 63 | # if debug: 64 | # exit() 65 | return actions[max_index] 66 | 67 | def nearest_neighbors(self, proto_action): 68 | results, dists = self.flann.nn_index( 69 | proto_action, self.k_nearest_neighbors) # checks=params["checks"] 70 | return self.actions[results] 71 | -------------------------------------------------------------------------------- /src/util/agent_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from my_plotlib import * 3 | from data import * 4 | import data_graph 5 | import gym 6 | from gym.spaces import Box, Discrete 7 | 8 | 9 | def get_action_space(env): 10 | low = 0 11 | high = 0 12 | if isinstance(env.action_space, Box): 13 | low = env.action_space.low[0] 14 | high = env.action_space.high[0] 15 | else: 16 | low = 0 17 | high = env.action_space.n 18 | 19 | return low, high 20 | 21 | 22 | def plot_rewards(fd): 23 | data = fd.get_data('rewards') 24 
| 25 | data_graph.plot_data(data, batch_size=-1, file_name='rewards') 26 | 27 | 28 | def plot_actions(fd, episodes=None, action_space=None): 29 | lines = [] 30 | 31 | data = [] 32 | seps = [] 33 | if episodes is None: 34 | data = fd.get_data('actions') 35 | else: 36 | for ep in episodes: 37 | data.extend(fd.get_episode_data('actions', ep)) 38 | seps.append(len(data) - 0.5) 39 | 40 | if len(seps) == 1: 41 | seps = [] 42 | x = np.arange(len(data)) 43 | if action_space is not None: 44 | lines.extend((Constant(x, k, line_color='#a0a0a0') for k in action_space)) 45 | 46 | lines.append(Line(x, data, line_color='-o')) 47 | plot_lines(lines, seps, grid_flag=action_space is None) 48 | 49 | 50 | def plot_states(fd, episodes=None): 51 | lines = [] 52 | 53 | data = {'s0': [], 54 | 's1': [], 55 | 's2': [], 56 | 's3': [], 57 | 'actions': []} 58 | seps = [] 59 | if episodes is None: 60 | data['s0'] = fd.get_data('state_0') 61 | data['s1'] = fd.get_data('state_1') 62 | data['s2'] = fd.get_data('state_2') 63 | data['s3'] = fd.get_data('state_3') 64 | data['actions'] = fd.get_data('actions') 65 | else: 66 | for ep in episodes: 67 | data['s0'].extend(fd.get_episode_data('state_0', ep)) 68 | data['s1'].extend(fd.get_episode_data('state_1', ep)) 69 | data['s2'].extend(fd.get_episode_data('state_2', ep)) 70 | data['s3'].extend(fd.get_episode_data('state_3', ep)) 71 | data['actions'].extend(fd.get_episode_data('actions', ep)) 72 | seps.append(len(data) - 0.5) 73 | 74 | if len(seps) == 1: 75 | seps = [] 76 | x = np.arange(len(data['s0'])) 77 | 78 | # print(data['s0']) 79 | 80 | lines.append(Line(x, data['s0'], line_color='b', text='s0')) 81 | lines.append(Line(x, data['s1'], line_color='g', text='s1')) 82 | lines.append(Line(x, data['s2'], line_color='r', text='s2')) 83 | lines.append(Line(x, data['s3'], line_color='m', text='s3')) 84 | lines.append(Line(x, data['actions'], line_color='black', text='actions', style=':')) 85 | 86 | plot_lines(lines, seps) 87 | 88 | 89 | class Agent_data(Data): 90 | 91 | def get_episodes_with_reward_greater_than(self, th): 92 | return np.where(self.get_data('rewards') >= th)[0] 93 | 94 | def find_episode(self, ep): 95 | done = self.get_data('done') 96 | eps = np.where(done == 1)[0] 97 | return eps[ep - 1] + 1 if ep > 0 else 0, eps[min(ep, len(done))] 98 | 99 | def get_episode_data(self, field, ep): 100 | s, e = self.find_episode(ep) 101 | data = self.get_data(field) 102 | if field == 'rewards': 103 | return data[ep] 104 | else: 105 | return data[s: e + 1] 106 | 107 | def get_full_episode_data(self, ep): 108 | start, end = self.find_episode(ep) 109 | clone = self.get_empty_clone() 110 | for key in self.get_keys(): 111 | clone.set_data(key, self.get_data(key)[start: end + 1]) 112 | 113 | r = self.get_data('rewards')[ep] 114 | clone.set_data('rewards', np.array([r])) 115 | return clone 116 | -------------------------------------------------------------------------------- /src/ddpg/actor_net.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import math 4 | 5 | LEARNING_RATE = 0.0001 6 | BATCH_SIZE = 64 7 | TAU = 0.001 8 | 9 | 10 | class ActorNet: 11 | """ Actor Network Model of DDPG Algorithm """ 12 | 13 | def __init__(self, num_states, num_actions): 14 | self.g = tf.Graph() 15 | with self.g.as_default(): 16 | self.sess = tf.InteractiveSession() 17 | 18 | # actor network model parameters: 19 | self.W1_a, self.B1_a, self.W2_a, self.B2_a, self.W3_a, self.B3_a,\ 20 | self.actor_state_in, 
self.actor_model = self.create_actor_net( 21 | num_states, num_actions) 22 | 23 | # target actor network model parameters: 24 | self.t_W1_a, self.t_B1_a, self.t_W2_a, self.t_B2_a, self.t_W3_a, self.t_B3_a,\ 25 | self.t_actor_state_in, self.t_actor_model = self.create_actor_net( 26 | num_states, num_actions) 27 | 28 | # cost of actor network: 29 | # gets input from action_gradient computed in critic network file 30 | self.q_gradient_input = tf.placeholder("float", [None, num_actions]) 31 | self.actor_parameters = [self.W1_a, self.B1_a, 32 | self.W2_a, self.B2_a, self.W3_a, self.B3_a] 33 | self.parameters_gradients = tf.gradients( 34 | self.actor_model, self.actor_parameters, -self.q_gradient_input) # /BATCH_SIZE) 35 | self.optimizer = tf.train.AdamOptimizer(LEARNING_RATE).apply_gradients( 36 | zip(self.parameters_gradients, self.actor_parameters)) 37 | # initialize all tensor variable parameters: 38 | self.sess.run(tf.global_variables_initializer()) 39 | 40 | self.update_target_actor_op = [ 41 | self.t_W1_a.assign(TAU * self.W1_a + (1 - TAU) * self.t_W1_a), 42 | self.t_B1_a.assign(TAU * self.B1_a + (1 - TAU) * self.t_B1_a), 43 | self.t_W2_a.assign(TAU * self.W2_a + (1 - TAU) * self.t_W2_a), 44 | self.t_B2_a.assign(TAU * self.B2_a + (1 - TAU) * self.t_B2_a), 45 | self.t_W3_a.assign(TAU * self.W3_a + (1 - TAU) * self.t_W3_a), 46 | self.t_B3_a.assign(TAU * self.B3_a + (1 - TAU) * self.t_B3_a)] 47 | # To make sure actor and target have same intial parmameters copy the parameters: 48 | # copy target parameters 49 | self.sess.run([ 50 | self.t_W1_a.assign(self.W1_a), 51 | self.t_B1_a.assign(self.B1_a), 52 | self.t_W2_a.assign(self.W2_a), 53 | self.t_B2_a.assign(self.B2_a), 54 | self.t_W3_a.assign(self.W3_a), 55 | self.t_B3_a.assign(self.B3_a)]) 56 | 57 | def create_actor_net(self, num_states=4, num_actions=1): 58 | """ Network that takes states and return action """ 59 | N_HIDDEN_1 = 400 60 | N_HIDDEN_2 = 300 61 | actor_state_in = tf.placeholder("float", [None, num_states]) 62 | W1_a = tf.Variable(tf.random_uniform( 63 | [num_states, N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 64 | B1_a = tf.Variable(tf.random_uniform( 65 | [N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 66 | W2_a = tf.Variable(tf.random_uniform( 67 | [N_HIDDEN_1, N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1), 1 / math.sqrt(N_HIDDEN_1))) 68 | B2_a = tf.Variable(tf.random_uniform( 69 | [N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1), 1 / math.sqrt(N_HIDDEN_1))) 70 | W3_a = tf.Variable(tf.random_uniform([N_HIDDEN_2, num_actions], -0.003, 0.003)) 71 | B3_a = tf.Variable(tf.random_uniform([num_actions], -0.003, 0.003)) 72 | 73 | H1_a = tf.nn.softplus(tf.matmul(actor_state_in, W1_a) + B1_a) 74 | H2_a = tf.nn.tanh(tf.matmul(H1_a, W2_a) + B2_a) 75 | actor_model = tf.matmul(H2_a, W3_a) + B3_a 76 | return W1_a, B1_a, W2_a, B2_a, W3_a, B3_a, actor_state_in, actor_model 77 | 78 | def evaluate_actor(self, state_t): 79 | return self.sess.run(self.actor_model, feed_dict={self.actor_state_in: state_t}) 80 | 81 | def evaluate_target_actor(self, state_t_1): 82 | return self.sess.run(self.t_actor_model, feed_dict={self.t_actor_state_in: state_t_1}) 83 | 84 | def train_actor(self, actor_state_in, q_gradient_input): 85 | self.sess.run(self.optimizer, feed_dict={ 86 | self.actor_state_in: actor_state_in, self.q_gradient_input: q_gradient_input}) 87 | 88 | def update_target_actor(self): 89 | self.sess.run(self.update_target_actor_op) 90 | -------------------------------------------------------------------------------- 
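The update_target_actor() op above never retrains the target network from scratch: each target parameter is nudged a small step toward the learned parameter, theta_target <- TAU * theta + (1 - TAU) * theta_target, with TAU = 0.001 (critic_net.py does the same for the critic). A minimal numpy sketch of that soft-update rule, with illustrative names rather than the tensors defined in actor_net.py:

import numpy as np

TAU = 0.001

def soft_update(target_params, learned_params, tau=TAU):
    # exponential moving average: the target slowly tracks the learned network
    return [tau * w + (1.0 - tau) * t for w, t in zip(learned_params, target_params)]

# example: after every training step
# target_weights = soft_update(target_weights, learned_weights)
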
/src/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | 4 | from util import * 5 | 6 | from wolp_agent import * 7 | from ddpg.agent import DDPGAgent 8 | from util.data import Data 9 | from util.data import Timer 10 | 11 | time_now = -1 12 | 13 | 14 | def run(episodes=[2500], collecting_data=True): 15 | 16 | experiment = ('CartPole-v1', 17 | 'InvertedPendulum-v1', 18 | 'LunarLanderContinuous-v2')[1] 19 | env = gym.make(experiment) 20 | 21 | print(env.observation_space) 22 | print(env.action_space) 23 | 24 | steps = env.spec.timestep_limit 25 | 26 | # agent = DDPGAgent(env) 27 | max_actions = 1e2 28 | agent = WolpertingerAgent(env, k_nearest_neighbors=int(0.1 * max_actions), 29 | max_actions=max_actions) 30 | 31 | # file_name = "results/data_" + agent.get_name() + str(episodes) + ".txt" 32 | file_name = "data_" + str(episodes) + '_' + agent.get_name() 33 | print(file_name) 34 | result_fetcher = Data(file_name) 35 | 36 | result_fetcher.add_arrays(['rewards', 'count', 'actions', 'done']) 37 | result_fetcher.add_arrays(['state_' + str(i) for i in range(agent.observation_space_size)]) 38 | 39 | result_fetcher.add_timers(['render', 'act', 'step', 'saving'], 'run_') 40 | result_fetcher.add_timer('t_run_observe', one_hot=False) 41 | agent.add_data_fetch(result_fetcher) 42 | 43 | timer = Timer() 44 | 45 | for i in range(episodes): 46 | timer.reset() 47 | observation = env.reset() 48 | # for i in range(agent.observation_space_size): 49 | # result_fetcher.add_to_array('state_' + str(i), observation[i]) 50 | 51 | total_reward = 0 52 | print('Episode ', i, '/', episodes - 1, 'started...', end='') 53 | for t in range(steps): 54 | 55 | result_fetcher.reset_timers() 56 | 57 | if not collecting_data: 58 | env.render() 59 | 60 | result_fetcher.sample_timer('render') # ------ 61 | 62 | action = agent.act(observation) 63 | 64 | result_fetcher.add_to_array('actions', action) # ------- 65 | 66 | result_fetcher.sample_timer('act') # ------ 67 | 68 | for i in range(agent.observation_space_size): 69 | result_fetcher.add_to_array('state_' + str(i), observation[i]) 70 | prev_observation = observation 71 | observation, reward, done, info = env.step(action) 72 | 73 | episode = {'obs': prev_observation, 74 | 'action': action, 75 | 'reward': reward, 76 | 'obs2': observation, 77 | 'done': done, 78 | 't': t} 79 | 80 | result_fetcher.sample_timer('step') # ------ 81 | result_fetcher.add_to_array('count', 1) 82 | 83 | # print('\n' + str(episode['obs'])) 84 | result_fetcher.start_timer('observe') 85 | agent.observe(episode) 86 | result_fetcher.sample_timer('observe') # ------ 87 | 88 | total_reward += reward 89 | result_fetcher.add_to_array('done', 1 if done else 0) 90 | if done or (t == steps - 1): 91 | t += 1 92 | result_fetcher.add_to_array('rewards', total_reward) # ------ 93 | 94 | time_passed = timer.get_time() 95 | print('Reward:', total_reward, 'Steps:', t, 't:', 96 | time_passed, '({}/step)'.format(round(time_passed / t))) 97 | 98 | if not collecting_data: 99 | # save_episode(episode_history) 100 | pass 101 | else: 102 | pass 103 | # if i % 100 == 0: 104 | # result_fetcher.async_save() 105 | result_fetcher.sample_timer('saving') # ------ 106 | break 107 | # end of episodes 108 | 109 | result_fetcher.async_save() 110 | # result_fetcher.print_data() 111 | 112 | result_fetcher.print_times(groups=['run_']) 113 | result_fetcher.print_times(groups=['agent_'], total_time_field='count') 114 | 115 | 116 | def save_episode(episode, 
overwrite=True): 117 | from pathlib import Path 118 | import datetime 119 | from os import makedirs 120 | 121 | string = str(episode).replace('},', '},\n') 122 | 123 | if overwrite: 124 | file = open('results/last_episode', 'w') 125 | file.write(string) 126 | file.close() 127 | else: 128 | now = datetime.datetime.now() 129 | 130 | dir_name = "results/%s-%s-%s" % (now.day, now.month, now.year) 131 | file = Path(dir_name) 132 | if not file.is_dir(): 133 | makedirs(dir_name) 134 | 135 | counter = 0 136 | while True: 137 | file_name = dir_name + '/episode_%d.txt' % (counter) 138 | file = Path(file_name) 139 | if file.is_file(): 140 | print(file_name + " exists") 141 | counter += 1 142 | else: 143 | file = open(file_name, 'w') 144 | file.write(string) 145 | file.close() 146 | break 147 | 148 | 149 | if __name__ == '__main__': 150 | run() 151 | -------------------------------------------------------------------------------- /src/util/data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import threading 3 | import pickle 4 | from timer import * 5 | 6 | 7 | def save_dictionary(dict, path): 8 | with open('results/obj/' + path + '.pkl', 'wb') as f: 9 | pickle.dump(dict, f, 0) 10 | 11 | 12 | class Data: 13 | 14 | def __init__(self, name='default_name'): 15 | self.name = name 16 | self.data = {} 17 | self.timers = {} 18 | 19 | def _add(self, field_name, timer, timer_one_hot=True): 20 | self.data[field_name] = np.array([]) 21 | if timer: 22 | self.timers[field_name] = Timer(timer_one_hot) 23 | 24 | def add_array(self, field_name): 25 | self._add(field_name, False) 26 | # self.data[field_name] = np.array([]) 27 | 28 | def add_arrays(self, fields, prefix=''): 29 | for f in fields: 30 | self.add_array(prefix + f) 31 | 32 | def add_to_array(self, field_name, value, abs_name=False): 33 | if abs_name: 34 | self.data[field_name] = np.append(self.data[field_name], value) 35 | else: 36 | fields = self.get_keys(field_name) 37 | for f in fields: 38 | self.data[f] = np.append(self.data[f], value) 39 | 40 | def add_timer(self, field_name, one_hot=True): 41 | self._add(field_name, True, one_hot) 42 | # self.add_array(name) 43 | # self.timers[name] = Timer() 44 | 45 | def add_timers(self, names, prefix='', one_hot=True): 46 | for f in names: 47 | self.add_timer(prefix + f, one_hot) 48 | 49 | def start_timer(self, field_name): 50 | fields = self.get_keys(field_name) 51 | for f in fields: 52 | self.timers[f].reset() 53 | 54 | def sample_timer(self, field_name, abs_name=False): 55 | if abs_name: 56 | self.data[field_name] = np.append( 57 | self.data[field_name], self.timers[field_name].get_time()) 58 | else: 59 | fields = self.get_keys(field_name) 60 | timer_keys = self.timers.keys() 61 | for f in fields: 62 | if f in timer_keys: 63 | self.data[f] = np.append(self.data[f], self.timers[f].get_time()) 64 | 65 | self.reset_timers_one_hot() 66 | 67 | def reset_timers(self): 68 | for t in self.timers: 69 | self.timers[t].reset() 70 | 71 | def reset_timers_one_hot(self): 72 | for t in self.timers: 73 | self.timers[t].reset_one_hot() 74 | 75 | def set_data(self, field_name, data): 76 | self.data[field_name] = data 77 | 78 | def get_data(self, field_name): 79 | return self.data[field_name] 80 | 81 | def print_data(self, field_name=''): 82 | keys = list(self.get_keys(field_name)) 83 | keys.sort() 84 | for key in keys: 85 | print(key, self.data[key].shape, self.data[key]) 86 | 87 | def print_fields(self): 88 | for k in self.get_keys(): 89 | print(k) 90 | 91 | def 
load(self, path=None): 92 | if path is None: 93 | path = self.name 94 | with open('results/obj/' + path + '.pkl', 'rb') as f: 95 | self.data = pickle.load(f) 96 | 97 | def async_save(self): 98 | thread = save_fulldata(self) 99 | thread.start() 100 | 101 | def print_times(self, other_keys=None, groups=None, total_time_field=None): 102 | final_keys = [] 103 | if (other_keys is None) and (groups is None): 104 | final_keys = self.timers.keys() 105 | else: 106 | if other_keys is not None: 107 | final_keys.extend(other_keys) 108 | 109 | if groups is not None: 110 | timers = self.timers.keys() 111 | for g in groups: 112 | for t in timers: 113 | if g in t: 114 | final_keys.append(t) 115 | 116 | if (final_keys is None) or (len(final_keys) == 0): 117 | print("No items found to be printed") 118 | return 119 | 120 | times = {} 121 | total_time = 0 122 | samples = [] 123 | 124 | for key in final_keys: 125 | times[key] = np.sum(self.get_data(key)) 126 | total_time += times[key] 127 | 128 | samples.append(len(self.get_data(key))) 129 | 130 | count = max(samples) 131 | if total_time_field is not None: 132 | count = np.sum(self.get_data(total_time_field)) 133 | 134 | print('\n\nName: {}\tCount: {} Group:{}'.format(self.name, count, groups)) 135 | print('key\t\tabs\t\tavg/unit\t% of total') 136 | print('-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-') 137 | 138 | keys = list(final_keys) 139 | keys.sort() 140 | max_key_len = 5 141 | for key in keys: 142 | if max_key_len < len(key): 143 | max_key_len = len(key) 144 | 145 | for key in keys: 146 | temp = times[key] 147 | avg = temp / count 148 | total = 100 * temp / total_time 149 | print('{}{}\t\t{}\t\t{:6.2f}\t\t{:6.2f}'.format( 150 | key, '.' * (max_key_len - len(key)), temp, avg, total)) 151 | 152 | print('Total{}\t\t{}\t\t{:6.2f}\t\t 100.0'.format( 153 | '.' 
* (max_key_len - 5), total_time, total_time / count)) 154 | 155 | def get_keys(self, key=''): 156 | res = [] 157 | for k in self.data.keys(): 158 | # if k.find(key) >= 0: 159 | if key in k: 160 | res.append(k) 161 | 162 | return res 163 | 164 | def get_empty_clone(self): 165 | res = Fulldata(self.name + '_clone') 166 | res.add_arrays(self.get_keys()) 167 | return res 168 | 169 | 170 | class save_fulldata(threading.Thread): 171 | def __init__(self, fd): 172 | threading.Thread.__init__(self) 173 | self.dict = fd.data 174 | self.path = fd.name 175 | 176 | def run(self): 177 | save_dictionary(self.dict, self.path) 178 | -------------------------------------------------------------------------------- /src/util/data_graph.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import math 4 | import ntpath 5 | 6 | BATCH_RATIO = 0.01 7 | EXTENSIONS = ['.txt'] 8 | DIRECTORY = "/../../results" 9 | # DIRECTORY = "/../../data/saved_ddpg_new" 10 | 11 | 12 | def plot_file(file_name): 13 | data = np.loadtxt(file_name) 14 | plot_data(data, file_name=file_name) 15 | 16 | 17 | def plot_data(data, batch_size=-1, file_name="data"): 18 | 19 | data_size = data.shape[0] 20 | if batch_size == -1: 21 | batch_size = max(int(data_size * BATCH_RATIO), 1) 22 | if BATCH_RATIO == 1: 23 | batch_size = 1 24 | batches = math.ceil(data_size / batch_size) 25 | 26 | avg = np.average(data) 27 | 28 | final_data = np.zeros((batches, 4)) 29 | 30 | for i in range(batches): 31 | temp = data[i * batch_size: int(min((i + 1) * batch_size, data_size))] 32 | 33 | final_data[i] = [np.amax(temp), np.average(temp), np.amin(temp), avg] 34 | 35 | # if not i == 0: 36 | # final_data[i, 3] = final_data[i, 1] - final_data[i - 1, 1] 37 | 38 | x_axis = batch_size * np.arange(0, final_data.shape[0]) 39 | 40 | plt.figure() 41 | plt.subplot(211) 42 | 43 | line_widths = [1, 2, 1, 0.5] 44 | line_colors = ['r', 'g', 'b', 'm'] 45 | texts = ['max', 'data', 'min', 'avg=' + str(avg)] 46 | for i in range(4): # derivative out 47 | if batch_size == 1 and ((not i == 1) or (not i == 3)): 48 | continue 49 | 50 | index = int((i + 5) * 0.1 * len(final_data[:, i])) 51 | plt.plot(x_axis, final_data[:, i], line_colors[i], linewidth=line_widths[i]) 52 | plt.text(0.05 * len(final_data[:, i]), (i + 1) * 0.1 * np.amax(final_data[:, 0]), 53 | texts[i], color=line_colors[i]) 54 | 55 | # plt.annotate(texts[i], xy=(x_axis[index], final_data[index, i]), 56 | # xytext=(x_axis[index], final_data[index, i] + int(np.amax(final_data) * 0.4)), 57 | # arrowprops=dict(facecolor=line_colors[i], shrink=0.05)) 58 | 59 | # plt.plot(x_axis, final_data[:, 0], 'r', linewidth = 1) 60 | # plt.plot(x_axis, final_data[:, 1], 'g') 61 | # plt.plot(x_axis, final_data[:, 2], 'b', linewidth = 1) 62 | # plt.plot(x_axis, final_data[:, 3], 'm--', linewidth = 0.5) 63 | 64 | plt.grid(True) 65 | plt.title(ntpath.basename(file_name) + "(" + str(batch_size) + " batch size)") 66 | plt.ylabel("Reward") 67 | plt.xlabel("Episode") 68 | 69 | # reduced_data, ignored = ignore_low_values(data) 70 | # reduced_data, ignored = ignore_starting_rewards(data) 71 | reduced_data, ignored = data, 0 72 | STAT_GROUPS = 20 73 | MAX_VALUE = np.amax(reduced_data) 74 | # statistics 75 | stats = np.zeros((STAT_GROUPS)) 76 | for i in reduced_data: 77 | index = int(i / ((MAX_VALUE + 1) / STAT_GROUPS)) 78 | stats[index] += 1 79 | 80 | #stats *=100/len(data) 81 | x_axis = ((MAX_VALUE + 1) / STAT_GROUPS) * np.arange(STAT_GROUPS) 82 | 
plt.subplot(212) 83 | plt.plot(x_axis, stats, 'go-') 84 | # plt.axis([0, MAX_VALUE+1]) 85 | plt.yscale("log") 86 | plt.grid(True) 87 | # plt.title("Statistics histogram") 88 | # plt.ylabel("%(ign "+ str(round(100*ignored/len(data)))+ '%)') 89 | plt.ylabel("Samples") 90 | plt.xlabel("Value") 91 | 92 | plt.show() 93 | 94 | # unstested 95 | 96 | 97 | def plot_surface(X, Y, Z): 98 | from mpl_toolkits.mplot3d import Axes3D 99 | import matplotlib.pyplot as plt 100 | from matplotlib import cm 101 | from matplotlib.ticker import LinearLocator, FormatStrFormatter 102 | import numpy as np 103 | 104 | fig = plt.figure(figsize=plt.figaspect(0.5)) 105 | ax = fig.add_subplot(1, 2, 1) 106 | 107 | t = plt.imshow(Z) 108 | 109 | t.set_cmap(cm.coolwarm) 110 | plt.colorbar() 111 | ax = fig.add_subplot(1, 2, 2, projection='3d') 112 | # Plot the surface. 113 | surf = ax.plot_surface(X, Y, Z, cmap=cm.coolwarm, 114 | linewidth=0, antialiased=False) 115 | # Add a color bar which maps values to colors. 116 | # fig.colorbars(surf, shrink=0.5, aspect=5) 117 | plt.tight_layout() 118 | plt.show() 119 | 120 | 121 | def ignore_starting_rewards(data, threshold=200): 122 | index = 0 123 | for i in range(len(data)): 124 | if data[i] >= threshold: 125 | index = i 126 | break 127 | return data[index:], index 128 | 129 | 130 | def ignore_low_values(data, threshold=200): 131 | res = np.extract(data > 200, data) 132 | return res, len(data) - len(res) 133 | 134 | 135 | def set_patameters_and_get_files(): 136 | import argparse as arg 137 | 138 | parser = arg.ArgumentParser(description="Plot given reward files") 139 | parser.add_argument("file", type=str, nargs='*', 140 | help="files to be plotted") 141 | parser.add_argument("-r", "--ratio", type=float, 142 | help="batch to sample size ratio. Default: 0.01") 143 | parser.add_argument("-f", "--directory", type=str, 144 | help="specify taret directory. Default: /") 145 | 146 | directory = parser.parse_args().directory 147 | if directory is not None: 148 | global DIRECTORY 149 | if directory[0] != '/': 150 | directory = '/' + directory 151 | DIRECTORY = directory 152 | # parser.add_argument("-e", "--extensions", type=str , 153 | # help="Extension to b searched. 
Default: .txt") 154 | # 155 | # ext = parser.parse_args().extensions 156 | # if ext is not None: 157 | # global EXTENSIONS 158 | # EXTENSIONS = ext 159 | 160 | files = parser.parse_args().file 161 | if len(files) == 0: 162 | files = get_all_txt_files() 163 | 164 | rat = parser.parse_args().ratio 165 | if rat is not None: 166 | global BATCH_RATIO 167 | BATCH_RATIO = rat 168 | 169 | print(parser.parse_args().ratio) 170 | return files 171 | 172 | 173 | def get_all_txt_files(): 174 | from os import listdir 175 | from os.path import isfile, join, dirname, realpath, splitext 176 | 177 | mypath = dirname(realpath(__file__)) + DIRECTORY 178 | onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))] 179 | txtfiles = [] 180 | for f in onlyfiles: 181 | if splitext(f)[1] in EXTENSIONS: 182 | txtfiles.append(mypath + "/" + f) 183 | return txtfiles 184 | -------------------------------------------------------------------------------- /src/ddpg/critic_net.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import math 4 | 5 | TAU = 0.001 6 | LEARNING_RATE = 0.001 7 | BATCH_SIZE = 64 8 | 9 | 10 | class CriticNet: 11 | """ Critic Q value model of the DDPG algorithm """ 12 | 13 | def __init__(self, num_states, num_actions): 14 | 15 | self.g = tf.Graph() 16 | with self.g.as_default(): 17 | self.sess = tf.InteractiveSession() 18 | 19 | # critic_q_model parameters: 20 | self.W1_c, self.B1_c, self.W2_c, self.W2_action_c, self.B2_c, self.W3_c, self.B3_c,\ 21 | self.critic_q_model, self.critic_state_in, self.critic_action_in = self.create_critic_net( 22 | num_states, num_actions) 23 | 24 | # create target_q_model: 25 | self.t_W1_c, self.t_B1_c, self.t_W2_c, self.t_W2_action_c, self.t_B2_c, self.t_W3_c, self.t_B3_c,\ 26 | self.t_critic_q_model, self.t_critic_state_in, self.t_critic_action_in = self.create_critic_net( 27 | num_states, num_actions) 28 | 29 | self.q_value_in = tf.placeholder("float", [None, 1]) # supervisor 30 | #self.l2_regularizer_loss = tf.nn.l2_loss(self.W1_c)+tf.nn.l2_loss(self.W2_c)+ tf.nn.l2_loss(self.W2_action_c) + tf.nn.l2_loss(self.W3_c)+tf.nn.l2_loss(self.B1_c)+tf.nn.l2_loss(self.B2_c)+tf.nn.l2_loss(self.B3_c) 31 | self.l2_regularizer_loss = 0.0001 * \ 32 | tf.reduce_sum(tf.pow(self.W2_c, 2)) + 0.0001 * tf.reduce_sum(tf.pow(self.B2_c, 2)) 33 | self.cost = tf.pow(self.critic_q_model - self.q_value_in, 2) / BATCH_SIZE + \ 34 | self.l2_regularizer_loss # /tf.to_float(tf.shape(self.q_value_in)[0]) 35 | self.optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(self.cost) 36 | 37 | # action gradient to be used in actor network: 38 | # self.action_gradients=tf.gradients(self.critic_q_model,self.critic_action_in) 39 | # from simple actor net: 40 | self.act_grad_v = tf.gradients(self.critic_q_model, self.critic_action_in) 41 | # this is just divided by batch size 42 | self.action_gradients = [self.act_grad_v[0] / 43 | tf.to_float(tf.shape(self.act_grad_v[0])[0])] 44 | # from simple actor net: 45 | self.check_fl = self.action_gradients 46 | 47 | # initialize all tensor variable parameters: 48 | self.sess.run(tf.global_variables_initializer()) 49 | 50 | self.update_target_critic_op = [ 51 | self.t_W1_c.assign(TAU * self.W1_c + (1 - TAU) * self.t_W1_c), 52 | self.t_B1_c.assign(TAU * self.B1_c + (1 - TAU) * self.t_B1_c), 53 | self.t_W2_c.assign(TAU * self.W2_c + (1 - TAU) * self.t_W2_c), 54 | self.t_W2_action_c.assign(TAU * self.W2_action_c + (1 - TAU) * self.t_W2_action_c), 55 | 
self.t_B2_c.assign(TAU * self.B2_c + (1 - TAU) * self.t_B2_c), 56 | self.t_W3_c.assign(TAU * self.W3_c + (1 - TAU) * self.t_W3_c), 57 | self.t_B3_c.assign(TAU * self.B3_c + (1 - TAU) * self.t_B3_c) 58 | ] 59 | # To make sure critic and target have same parmameters copy the parameters: 60 | # copy target parameters 61 | self.sess.run([ 62 | self.t_W1_c.assign(self.W1_c), 63 | self.t_B1_c.assign(self.B1_c), 64 | self.t_W2_c.assign(self.W2_c), 65 | self.t_W2_action_c.assign(self.W2_action_c), 66 | self.t_B2_c.assign(self.B2_c), 67 | self.t_W3_c.assign(self.W3_c), 68 | self.t_B3_c.assign(self.B3_c) 69 | ]) 70 | 71 | def create_critic_net(self, num_states=4, num_actions=1): 72 | N_HIDDEN_1 = 400 73 | N_HIDDEN_2 = 300 74 | critic_state_in = tf.placeholder("float", [None, num_states]) 75 | critic_action_in = tf.placeholder("float", [None, num_actions]) 76 | 77 | W1_c = tf.Variable(tf.random_uniform( 78 | [num_states, N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 79 | B1_c = tf.Variable(tf.random_uniform( 80 | [N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 81 | W2_c = tf.Variable(tf.random_uniform( 82 | [N_HIDDEN_1, N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1 + num_actions), 1 / math.sqrt(N_HIDDEN_1 + num_actions))) 83 | W2_action_c = tf.Variable(tf.random_uniform( 84 | [num_actions, N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1 + num_actions), 1 / math.sqrt(N_HIDDEN_1 + num_actions))) 85 | B2_c = tf.Variable(tf.random_uniform( 86 | [N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1 + num_actions), 1 / math.sqrt(N_HIDDEN_1 + num_actions))) 87 | W3_c = tf.Variable(tf.random_uniform([N_HIDDEN_2, 1], -0.003, 0.003)) 88 | B3_c = tf.Variable(tf.random_uniform([1], -0.003, 0.003)) 89 | 90 | H1_c = tf.nn.softplus(tf.matmul(critic_state_in, W1_c) + B1_c) 91 | H2_c = tf.nn.tanh(tf.matmul(H1_c, W2_c) + tf.matmul(critic_action_in, W2_action_c) + B2_c) 92 | 93 | critic_q_model = tf.matmul(H2_c, W3_c) + B3_c 94 | 95 | return W1_c, B1_c, W2_c, W2_action_c, B2_c, W3_c, B3_c, critic_q_model, critic_state_in, critic_action_in 96 | 97 | def train_critic(self, state_t_batch, action_batch, y_i_batch): 98 | self.sess.run(self.optimizer, feed_dict={ 99 | self.critic_state_in: state_t_batch, self.critic_action_in: action_batch, self.q_value_in: y_i_batch}) 100 | 101 | def evaluate_target_critic(self, state_t_1, action_t_1): 102 | return self.sess.run(self.t_critic_q_model, feed_dict={self.t_critic_state_in: state_t_1, self.t_critic_action_in: action_t_1}) 103 | 104 | def evaluate_critic(self, state_1, action_1): 105 | return self.sess.run(self.critic_q_model, feed_dict={self.critic_state_in: state_1, self.critic_action_in: action_1}) 106 | 107 | def compute_delQ_a(self, state_t, action_t): 108 | # print '\n' 109 | # print 'check grad number' 110 | # ch= self.sess.run(self.check_fl, feed_dict={self.critic_state_in: state_t,self.critic_action_in: action_t}) 111 | # print len(ch) 112 | # print len(ch[0]) 113 | # raw_input("Press Enter to continue...") 114 | return self.sess.run(self.action_gradients, feed_dict={self.critic_state_in: state_t, self.critic_action_in: action_t}) 115 | 116 | def update_target_critic(self): 117 | self.sess.run(self.update_target_critic_op) 118 | -------------------------------------------------------------------------------- /src/ddpg/actor_net_bn.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | import math 4 | import batch_norm 5 | import numpy as np 6 | LEARNING_RATE = 0.0001 7 | TAU = 0.001 8 | 
BATCH_SIZE = 64 9 | N_HIDDEN_1 = 400 10 | N_HIDDEN_2 = 300 11 | 12 | 13 | class ActorNet_bn: 14 | """ Actor Network Model with Batch Normalization of DDPG Algorithm """ 15 | 16 | def __init__(self, num_states, num_actions): 17 | tf.reset_default_graph() 18 | self.g = tf.Graph() 19 | with self.g.as_default(): 20 | self.sess = tf.InteractiveSession() 21 | 22 | # actor network model parameters: 23 | self.actor_state_in = tf.placeholder("float", [None, num_states]) 24 | self.W1_a = tf.Variable(tf.random_uniform( 25 | [num_states, N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 26 | self.B1_a = tf.Variable(tf.random_uniform( 27 | [N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 28 | self.W2_a = tf.Variable(tf.random_uniform( 29 | [N_HIDDEN_1, N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1), 1 / math.sqrt(N_HIDDEN_1))) 30 | self.B2_a = tf.Variable(tf.random_uniform( 31 | [N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1), 1 / math.sqrt(N_HIDDEN_1))) 32 | self.W3_a = tf.Variable(tf.random_uniform([N_HIDDEN_2, num_actions], -0.003, 0.003)) 33 | self.B3_a = tf.Variable(tf.random_uniform([num_actions], -0.003, 0.003)) 34 | 35 | self.is_training = tf.placeholder(tf.bool, []) 36 | self.H1_t = tf.matmul(self.actor_state_in, self.W1_a) 37 | self.H1_a_bn = batch_norm(self.H1_t, N_HIDDEN_1, self.is_training, self.sess) 38 | self.H1_a = tf.nn.softplus(self.H1_a_bn.bnorm) + self.B1_a 39 | 40 | self.H2_t = tf.matmul(self.H1_a, self.W2_a) 41 | self.H2_a_bn = batch_norm(self.H2_t, N_HIDDEN_2, self.is_training, self.sess) 42 | self.H2_a = tf.nn.tanh(self.H2_a_bn.bnorm) + self.B2_a 43 | self.actor_model = tf.matmul(self.H2_a, self.W3_a) + self.B3_a 44 | 45 | # target actor network model parameters: 46 | self.t_actor_state_in = tf.placeholder("float", [None, num_states]) 47 | self.t_W1_a = tf.Variable(tf.random_uniform( 48 | [num_states, N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 49 | self.t_B1_a = tf.Variable(tf.random_uniform( 50 | [N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 51 | self.t_W2_a = tf.Variable(tf.random_uniform( 52 | [N_HIDDEN_1, N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1), 1 / math.sqrt(N_HIDDEN_1))) 53 | self.t_B2_a = tf.Variable(tf.random_uniform( 54 | [N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1), 1 / math.sqrt(N_HIDDEN_1))) 55 | self.t_W3_a = tf.Variable(tf.random_uniform([N_HIDDEN_2, num_actions], -0.003, 0.003)) 56 | self.t_B3_a = tf.Variable(tf.random_uniform([num_actions], -0.003, 0.003)) 57 | 58 | self.t_is_training = tf.placeholder(tf.bool, []) 59 | self.t_H1_t = tf.matmul(self.t_actor_state_in, self.t_W1_a) 60 | self.t_H1_a_bn = batch_norm(self.t_H1_t, N_HIDDEN_1, 61 | self.t_is_training, self.sess, self.H1_a_bn) 62 | self.t_H1_a = tf.nn.softplus(self.t_H1_a_bn.bnorm) + self.t_B1_a 63 | 64 | self.t_H2_t = tf.matmul(self.t_H1_a, self.t_W2_a) 65 | self.t_H2_a_bn = batch_norm(self.t_H2_t, N_HIDDEN_2, 66 | self.t_is_training, self.sess, self.H2_a_bn) 67 | self.t_H2_a = tf.nn.tanh(self.t_H2_a_bn.bnorm) + self.t_B2_a 68 | self.t_actor_model = tf.matmul(self.t_H2_a, self.t_W3_a) + self.t_B3_a 69 | 70 | # cost of actor network: 71 | # gets input from action_gradient computed in critic network file 72 | self.q_gradient_input = tf.placeholder("float", [None, num_actions]) 73 | self.actor_parameters = [self.W1_a, self.B1_a, self.W2_a, self.B2_a, self.W3_a, 74 | self.B3_a, self.H1_a_bn.scale, self.H1_a_bn.beta, self.H2_a_bn.scale, self.H2_a_bn.beta] 75 | # /BATCH_SIZE) changed -self.q_gradient to - 76 | self.parameters_gradients = 
tf.gradients( 77 | self.actor_model, self.actor_parameters, -self.q_gradient_input) 78 | 79 | self.optimizer = tf.train.AdamOptimizer( 80 | learning_rate=LEARNING_RATE, epsilon=1e-08).apply_gradients(zip(self.parameters_gradients, self.actor_parameters)) 81 | # initialize all tensor variable parameters: 82 | self.sess.run(tf.initialize_all_variables()) 83 | 84 | # To make sure actor and target have same intial parmameters copy the parameters: 85 | # copy target parameters 86 | self.sess.run([ 87 | self.t_W1_a.assign(self.W1_a), 88 | self.t_B1_a.assign(self.B1_a), 89 | self.t_W2_a.assign(self.W2_a), 90 | self.t_B2_a.assign(self.B2_a), 91 | self.t_W3_a.assign(self.W3_a), 92 | self.t_B3_a.assign(self.B3_a)]) 93 | 94 | def evaluate_actor(self, state_t): 95 | return self.sess.run(self.actor_model, feed_dict={self.actor_state_in: state_t, self.is_training: False}) 96 | 97 | def evaluate_target_actor(self, state_t_1): 98 | return self.sess.run(self.t_actor_model, feed_dict={self.t_actor_state_in: state_t_1, self.t_is_training: False}) 99 | 100 | def train_actor(self, actor_state_in, q_gradient_input): 101 | self.sess.run([self.optimizer, self.H1_a_bn.train_mean, self.H1_a_bn.train_var, self.H2_a_bn.train_mean, self.H2_a_bn.train_var, self.t_H1_a_bn.train_mean, self.t_H1_a_bn.train_var, self.t_H2_a_bn.train_mean, 102 | self.t_H2_a_bn.train_var], feed_dict={self.actor_state_in: actor_state_in, self.t_actor_state_in: actor_state_in, self.q_gradient_input: q_gradient_input, self.is_training: True, self.t_is_training: True}) 103 | 104 | def update_target_actor(self): 105 | self.sess.run([ 106 | self.t_W1_a.assign(TAU * self.W1_a + (1 - TAU) * self.t_W1_a), 107 | self.t_B1_a.assign(TAU * self.B1_a + (1 - TAU) * self.t_B1_a), 108 | self.t_W2_a.assign(TAU * self.W2_a + (1 - TAU) * self.t_W2_a), 109 | self.t_B2_a.assign(TAU * self.B2_a + (1 - TAU) * self.t_B2_a), 110 | self.t_W3_a.assign(TAU * self.W3_a + (1 - TAU) * self.t_W3_a), 111 | self.t_B3_a.assign(TAU * self.B3_a + (1 - TAU) * self.t_B3_a), 112 | self.t_H1_a_bn.updateTarget, 113 | self.t_H2_a_bn.updateTarget, 114 | ]) 115 | -------------------------------------------------------------------------------- /src/ddpg/critic_net_bn.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import math 3 | from batch_norm import * 4 | import numpy as np 5 | LEARNING_RATE = 0.001 6 | TAU = 0.001 7 | BATCH_SIZE = 64 8 | N_HIDDEN_1 = 400 9 | N_HIDDEN_2 = 300 10 | 11 | 12 | class CriticNet_bn: 13 | """ Critic Q value model with batch normalization of the DDPG algorithm """ 14 | 15 | def __init__(self, num_states, num_actions): 16 | 17 | tf.reset_default_graph() 18 | self.g = tf.Graph() 19 | with self.g.as_default(): 20 | self.sess = tf.InteractiveSession() 21 | 22 | # Critic Q Network: 23 | self.critic_state_in = tf.placeholder("float", [None, num_states]) 24 | self.critic_action_in = tf.placeholder("float", [None, num_actions]) 25 | self.W1_c = tf.Variable(tf.random_uniform( 26 | [num_states, N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 27 | self.B1_c = tf.Variable(tf.random_uniform( 28 | [N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 29 | self.W2_c = tf.Variable(tf.random_uniform( 30 | [N_HIDDEN_1, N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1 + num_actions), 1 / math.sqrt(N_HIDDEN_1 + num_actions))) 31 | self.B2_c = tf.Variable(tf.random_uniform( 32 | [N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1 + num_actions), 1 / math.sqrt(N_HIDDEN_1 + num_actions))) 33 | 
self.W2_action_c = tf.Variable(tf.random_uniform( 34 | [num_actions, N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1 + num_actions), 1 / math.sqrt(N_HIDDEN_1 + num_actions))) 35 | self.W3_c = tf.Variable(tf.random_uniform([N_HIDDEN_2, 1], -0.003, 0.003)) 36 | self.B3_c = tf.Variable(tf.random_uniform([1], -0.003, 0.003)) 37 | 38 | self.is_training = tf.placeholder(tf.bool, []) 39 | self.H1_t = tf.matmul(self.critic_state_in, self.W1_c) 40 | self.H1_c_bn = batch_norm(self.H1_t, N_HIDDEN_1, self.is_training, self.sess) 41 | 42 | self.H1_c = tf.nn.softplus(self.H1_c_bn.bnorm) + self.B1_c 43 | 44 | self.H2_t = tf.matmul(self.H1_c, self.W2_c) + \ 45 | tf.matmul(self.critic_action_in, self.W2_action_c) 46 | self.H2_c_bn = batch_norm(self.H2_t, N_HIDDEN_2, self.is_training, self.sess) 47 | self.H2_c = tf.nn.tanh(self.H2_c_bn.bnorm) + self.B2_c 48 | 49 | self.critic_q_model = tf.matmul(self.H2_c, self.W3_c) + self.B3_c 50 | 51 | # Target Critic Q Network: 52 | self.t_critic_state_in = tf.placeholder("float", [None, num_states]) 53 | self.t_critic_action_in = tf.placeholder("float", [None, num_actions]) 54 | self.t_W1_c = tf.Variable(tf.random_uniform( 55 | [num_states, N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 56 | self.t_B1_c = tf.Variable(tf.random_uniform( 57 | [N_HIDDEN_1], -1 / math.sqrt(num_states), 1 / math.sqrt(num_states))) 58 | self.t_W2_c = tf.Variable(tf.random_uniform( 59 | [N_HIDDEN_1, N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1 + num_actions), 1 / math.sqrt(N_HIDDEN_1 + num_actions))) 60 | self.t_W2_action_c = tf.Variable(tf.random_uniform( 61 | [num_actions, N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1 + num_actions), 1 / math.sqrt(N_HIDDEN_1 + num_actions))) 62 | self.t_B2_c = tf.Variable(tf.random_uniform( 63 | [N_HIDDEN_2], -1 / math.sqrt(N_HIDDEN_1 + num_actions), 1 / math.sqrt(N_HIDDEN_1 + num_actions))) 64 | self.t_W3_c = tf.Variable(tf.random_uniform([N_HIDDEN_2, 1], -0.003, 0.003)) 65 | self.t_B3_c = tf.Variable(tf.random_uniform([1], -0.003, 0.003)) 66 | 67 | self.t_H1_t = tf.matmul(self.t_critic_state_in, self.t_W1_c) 68 | self.t_H1_c_bn = batch_norm(self.t_H1_t, N_HIDDEN_1, 69 | self.is_training, self.sess, self.H1_c_bn) 70 | self.t_H1_c = tf.nn.softplus(self.t_H1_c_bn.bnorm) + self.t_B1_c 71 | 72 | self.t_H2_t = tf.matmul(self.t_H1_c, self.t_W2_c) + \ 73 | tf.matmul(self.t_critic_action_in, self.t_W2_action_c) 74 | self.t_H2_c_bn = batch_norm(self.t_H2_t, N_HIDDEN_2, 75 | self.is_training, self.sess, self.H2_c_bn) 76 | self.t_H2_c = tf.nn.tanh(self.t_H2_c_bn.bnorm) + self.t_B2_c 77 | 78 | self.t_critic_q_model = tf.matmul(self.t_H2_c, self.t_W3_c) + self.t_B3_c 79 | 80 | self.q_value_in = tf.placeholder("float", [None, 1]) # supervisor 81 | #self.l2_regularizer_loss = tf.nn.l2_loss(self.W1_c)+tf.nn.l2_loss(self.W2_c)+ tf.nn.l2_loss(self.W2_action_c) + tf.nn.l2_loss(self.W3_c)+tf.nn.l2_loss(self.B1_c)+tf.nn.l2_loss(self.B2_c)+tf.nn.l2_loss(self.B3_c) 82 | self.l2_regularizer_loss = 0.0001 * tf.reduce_sum(tf.pow(self.W2_c, 2)) 83 | self.cost = tf.pow(self.critic_q_model - self.q_value_in, 2) / BATCH_SIZE + \ 84 | self.l2_regularizer_loss # /tf.to_float(tf.shape(self.q_value_in)[0]) 85 | self.optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(self.cost) 86 | self.act_grad_v = tf.gradients(self.critic_q_model, self.critic_action_in) 87 | # this is just divided by batch size 88 | self.action_gradients = [self.act_grad_v[0] / 89 | tf.to_float(tf.shape(self.act_grad_v[0])[0])] 90 | # from simple actor net: 91 | self.check_fl = self.action_gradients 
92 | 93 | # initialize all tensor variable parameters: 94 | self.sess.run(tf.initialize_all_variables()) 95 | 96 | # To initialize critic and target with the same values: 97 | # copy target parameters 98 | self.sess.run([ 99 | self.t_W1_c.assign(self.W1_c), 100 | self.t_B1_c.assign(self.B1_c), 101 | self.t_W2_c.assign(self.W2_c), 102 | self.t_W2_action_c.assign(self.W2_action_c), 103 | self.t_B2_c.assign(self.B2_c), 104 | self.t_W3_c.assign(self.W3_c), 105 | self.t_B3_c.assign(self.B3_c) 106 | ]) 107 | 108 | def train_critic(self, state_t_batch, action_batch, y_i_batch): 109 | self.sess.run([self.optimizer, self.H1_c_bn.train_mean, self.H1_c_bn.train_var, self.H2_c_bn.train_mean, self.H2_c_bn.train_var, self.t_H1_c_bn.train_mean, self.t_H1_c_bn.train_var, self.t_H2_c_bn.train_mean, self.t_H2_c_bn.train_var], feed_dict={ 110 | self.critic_state_in: state_t_batch, self.t_critic_state_in: state_t_batch, self.critic_action_in: action_batch, self.t_critic_action_in: action_batch, self.q_value_in: y_i_batch, self.is_training: True}) 111 | 112 | def evaluate_target_critic(self, state_t_1, action_t_1): 113 | return self.sess.run(self.t_critic_q_model, feed_dict={self.t_critic_state_in: state_t_1, self.t_critic_action_in: action_t_1, self.is_training: False}) 114 | 115 | def compute_delQ_a(self, state_t, action_t): 116 | return self.sess.run(self.action_gradients, feed_dict={self.critic_state_in: state_t, self.critic_action_in: action_t, self.is_training: False}) 117 | 118 | def update_target_critic(self): 119 | self.sess.run([ 120 | self.t_W1_c.assign(TAU * self.W1_c + (1 - TAU) * self.t_W1_c), 121 | self.t_B1_c.assign(TAU * self.B1_c + (1 - TAU) * self.t_B1_c), 122 | self.t_W2_c.assign(TAU * self.W2_c + (1 - TAU) * self.t_W2_c), 123 | self.t_W2_action_c.assign(TAU * self.W2_action_c + (1 - TAU) * self.t_W2_action_c), 124 | self.t_B2_c.assign(TAU * self.B2_c + (1 - TAU) * self.t_B2_c), 125 | self.t_W3_c.assign(TAU * self.W3_c + (1 - TAU) * self.t_W3_c), 126 | self.t_B3_c.assign(TAU * self.B3_c + (1 - TAU) * self.t_B3_c), 127 | self.t_H1_c_bn.updateTarget, 128 | self.t_H2_c_bn.updateTarget 129 | ]) 130 | -------------------------------------------------------------------------------- /src/ddpg/agent.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | from util import * 5 | 6 | import gym 7 | from gym.spaces import Box, Discrete 8 | 9 | from actor_net import ActorNet 10 | from critic_net import CriticNet 11 | from actor_net_bn import ActorNet_bn 12 | from critic_net_bn import CriticNet_bn 13 | from tensorflow_grad_inverter import grad_inverter 14 | 15 | from collections import deque 16 | 17 | MAX_ACTION_SPACE_SIZE = 1e6 18 | 19 | 20 | class Agent: 21 | 22 | def __init__(self, env): 23 | # checking state space 24 | if isinstance(env.observation_space, Box): 25 | self.observation_space_size = env.observation_space.shape[0] 26 | else: 27 | self.observation_space_size = env.observation_space.n 28 | 29 | # checking action space 30 | if isinstance(env.action_space, Box): 31 | self.action_space_size = env.action_space.shape[0] 32 | self.continious_action_space = True 33 | self.low = env.action_space.low 34 | self.high = env.action_space.high 35 | else: 36 | self.action_space_size = env.action_space.n 37 | self.continious_action_space = False 38 | self.low = 0 39 | self.high = env.action_space.n 40 | 41 | def act(self, state): 42 | pass 43 | 44 | def observe(self, episode): 45 | pass 46 | 47 | # shaping input states and 
actions 48 | def _np_shaping(self, array, is_state): 49 | 50 | number_of_elements = array.shape[0] if len(array.shape) > 1 else 1 51 | size_of_element = self.observation_space_size if is_state else self.action_space_size 52 | 53 | res = np.array(array) 54 | res.shape = (number_of_elements, size_of_element) 55 | return res 56 | 57 | def get_name(self): 58 | return 'Agent' 59 | 60 | 61 | class RandomAgent(Agent): 62 | 63 | def act(self, state): 64 | if self.continious_action_space: 65 | res = self.low + (self.high - self.low) * np.random.uniform(size=len(self.low)) 66 | return res 67 | else: 68 | return random.randint(self.low, self.high - 1) 69 | 70 | def get_name(self): 71 | return 'Random' + super().get_name() 72 | 73 | 74 | class DiscreteRandomAgent(RandomAgent): 75 | 76 | def __init__(self, env, max_actions=10): 77 | super().__init__(env) 78 | if self.continious_action_space: 79 | self.discrete_actions = np.linspace(self.low, self.high, max_actions) 80 | else: 81 | self.discrete_actions = np.arange(self.low, self.high) 82 | self.discrete_actions = list(self.discrete_actions) 83 | 84 | def act(self, state): 85 | return random.sample(self.discrete_actions, 1)[0] 86 | 87 | def get_name(self): 88 | return 'Discrete' + super().get_name() 89 | 90 | 91 | class DDPGAgent(Agent): 92 | ''' stevenpjg's implementation of DDPG algorithm ''' 93 | 94 | REPLAY_MEMORY_SIZE = 10000 95 | BATCH_SIZE = 64 96 | GAMMA = 0.99 97 | 98 | def __init__(self, env, is_batch_norm=False, is_grad_inverter=True): 99 | super().__init__(env) 100 | if is_batch_norm: 101 | self.critic_net = CriticNet_bn(self.observation_space_size, 102 | self.action_space_size) 103 | self.actor_net = ActorNet_bn(self.observation_space_size, 104 | self.action_space_size) 105 | 106 | else: 107 | self.critic_net = CriticNet(self.observation_space_size, 108 | self.action_space_size) 109 | self.actor_net = ActorNet(self.observation_space_size, 110 | self.action_space_size) 111 | 112 | self.is_grad_inverter = is_grad_inverter 113 | self.replay_memory = deque() 114 | 115 | self.time_step = 0 116 | 117 | action_max = np.array(env.action_space.high).tolist() 118 | action_min = np.array(env.action_space.low).tolist() 119 | action_bounds = [action_max, action_min] 120 | self.grad_inv = grad_inverter(action_bounds) 121 | 122 | def add_data_fetch(self, df): 123 | self.data_fetch = df 124 | self.data_fetch.add_timers(['ev_p_t', 'ev_q_t', 'y', 125 | 'train_q', 'train_p', 126 | 'up_q_t', 'up_p_t'], prefix='t_agent_training_') 127 | 128 | def get_name(self): 129 | return 'DDPG' + super().get_name() 130 | 131 | def act(self, state): 132 | state = self._np_shaping(state, True) 133 | return self.actor_net.evaluate_actor(state).astype(float) 134 | 135 | def observe(self, episode): 136 | episode['obs'] = self._np_shaping(episode['obs'], True) 137 | episode['action'] = self._np_shaping(episode['action'], False) 138 | episode['obs2'] = self._np_shaping(episode['obs2'], True) 139 | self.add_experience(episode) 140 | 141 | def add_experience(self, episode): 142 | self.replay_memory.append(episode) 143 | 144 | self.time_step += 1 145 | if len(self.replay_memory) > type(self).REPLAY_MEMORY_SIZE: 146 | self.replay_memory.popleft() 147 | 148 | if len(self.replay_memory) > type(self).BATCH_SIZE: 149 | res = self.train() 150 | return res 151 | else: 152 | return None 153 | 154 | def minibatches(self): 155 | batch = random.sample(self.replay_memory, type(self).BATCH_SIZE) 156 | # state t 157 | state = self._np_shaping(np.array([item['obs'] for item in batch]), True) 158 | 
# action 159 | action = self._np_shaping(np.array([item['action'] for item in batch]), False) 160 | # reward 161 | reward = np.array([item['reward'] for item in batch]) 162 | # state t+1 163 | state_2 = self._np_shaping(np.array([item['obs2'] for item in batch]), True) 164 | # doneA 165 | done = np.array([item['done'] for item in batch]) 166 | 167 | return state, action, reward, state_2, done 168 | 169 | def train(self): 170 | # sample a random minibatch of N transitions from R 171 | state, action, reward, state_2, done = self.minibatches() 172 | 173 | actual_batch_size = len(state) 174 | 175 | self.data_fetch.reset_timers() 176 | target_action = self.actor_net.evaluate_target_actor(state) 177 | self.data_fetch.sample_timer('ev_p_t') # ------ 178 | 179 | # Q'(s_i+1,a_i+1) 180 | q_t = self.critic_net.evaluate_target_critic(state_2, target_action) 181 | self.data_fetch.sample_timer('ev_q_t') # ------ 182 | 183 | y = [] # fix initialization of y 184 | for i in range(0, actual_batch_size): 185 | 186 | if done[i]: 187 | y.append(reward[i]) 188 | else: 189 | y.append(reward[i] + type(self).GAMMA * q_t[i][0]) # q_t+1 instead of q_t 190 | 191 | y = np.reshape(np.array(y), [len(y), 1]) 192 | self.data_fetch.sample_timer('y') # ------ 193 | 194 | # Update critic by minimizing the loss 195 | self.critic_net.train_critic(state, action, y) 196 | self.data_fetch.sample_timer('train_q') # ------ 197 | # Update actor proportional to the gradients: 198 | # action_for_delQ = self.act(state) # was self.evaluate_actor instead of self.act 199 | action_for_delQ = self.actor_net.evaluate_actor(state) # dont need wolp action 200 | 201 | if self.is_grad_inverter: 202 | del_Q_a = self.critic_net.compute_delQ_a(state, action_for_delQ) # /BATCH_SIZE 203 | del_Q_a = self.grad_inv.invert(del_Q_a, action_for_delQ) 204 | else: 205 | del_Q_a = self.critic_net.compute_delQ_a(state, action_for_delQ)[0] # /BATCH_SIZE 206 | 207 | # train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters: 208 | self.actor_net.train_actor(state, del_Q_a) 209 | self.data_fetch.sample_timer('train_p') # ------ 210 | 211 | # Update target Critic and actor network 212 | self.critic_net.update_target_critic() 213 | self.data_fetch.sample_timer('up_q_t') # ------ 214 | self.actor_net.update_target_actor() 215 | self.data_fetch.sample_timer('up_p_t') # ------ 216 | --------------------------------------------------------------------------------
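Putting the pieces together, main.py drives the agent with the standard Gym loop: the agent proposes an action for the current observation, the environment steps, and the resulting transition is handed to agent.observe(), which stores it in the replay memory and runs a DDPG training step once enough samples have accumulated. A condensed sketch of that loop, mirroring main.py but leaving out its bookkeeping and timers (environment name and episode count are placeholders; InvertedPendulum-v1, as used in main.py, requires MuJoCo and the older Gym API this code base targets):

import gym

from util.data import Data
from wolp_agent import WolpertingerAgent

env = gym.make('InvertedPendulum-v1')
agent = WolpertingerAgent(env, max_actions=100, k_nearest_neighbors=10)
agent.add_data_fetch(Data('example_run'))  # the agent logs its training timers here

for episode in range(10):
    observation = env.reset()
    done = False
    t = 0
    while not done:
        action = agent.act(observation)
        next_observation, reward, done, info = env.step(action)
        agent.observe({'obs': observation, 'action': action, 'reward': reward,
                       'obs2': next_observation, 'done': done, 't': t})
        observation = next_observation
        t += 1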