├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── _config.yml ├── agent ├── __init__.py ├── ddpg.py └── reinforce.py ├── mechanism ├── __init__.py ├── ou_process.py └── replay_buffer.py ├── model ├── __init__.py ├── ddpg_actor.py ├── ddpg_critic.py ├── ddpg_model.py └── reinforce_model.py ├── run_ddpg.bat ├── run_ddpg.py └── run_reinforce.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # gym video 104 | /CartPole-v0* 105 | /Pendulum-v0* 106 | 107 | # test scripts 108 | test* 109 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 ligh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TensorAgent 2 | Author: Guohao Li 3 | 4 | Email: lightaime@gmail.com 5 | 6 | Deep reinforcement learning agents implemented by tensorflow 7 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/TensorAgent/7176d7fa5cbc20d3d31e9c01f6c1424bd3501ecc/__init__.py -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /agent/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/TensorAgent/7176d7fa5cbc20d3d31e9c01f6c1424bd3501ecc/agent/__init__.py -------------------------------------------------------------------------------- /agent/ddpg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Agent(object): 4 | def __init__(self, model, replay_buffer, exploration_noise, discout_factor, verbose=False): 5 | self.model = model 6 | self.replay_buffer = replay_buffer 7 | self.exploration_noise = exploration_noise 8 | self.discout_factor = discout_factor 9 | self.verbose = verbose 10 | 11 | def predict_action(self, observation): 12 | return self.model.predict_action(observation) 13 | 14 | def select_action(self, observation, p=None): 15 | pred_action = self.predict_action(observation) 16 | noise = self.exploration_noise.return_noise() 17 | if p is not None: 18 | return pred_action * p + noise * (1 - p) 19 | else: 20 | return pred_action + noise 21 | 22 | def store_transition(self, transition): 23 | self.replay_buffer.store_transition(transition) 24 | 25 | def init_process(self): 26 | self.exploration_noise.init_process() 27 | 28 | def get_transition_batch(self): 29 | batch = self.replay_buffer.get_batch() 30 | transpose_batch = list(zip(*batch)) 31 | s_batch = np.vstack(transpose_batch[0]) 32 | a_batch = np.vstack(transpose_batch[1]) 33 | r_batch = np.vstack(transpose_batch[2]) 34 | next_s_batch = np.vstack(transpose_batch[3]) 35 | done_batch = np.vstack(transpose_batch[4]) 36 | return s_batch, a_batch, r_batch, next_s_batch, done_batch 37 | 38 | def preprocess_batch(self, s_batch, a_batch, r_batch, next_s_batch, done_batch): 39 | target_actor_net_pred_action = self.model.actor.predict_action_target_net(next_s_batch) 40 | target_critic_net_pred_q = self.model.critic.predict_q_target_net(next_s_batch, target_actor_net_pred_action) 41 | y_batch = r_batch + self.discout_factor * target_critic_net_pred_q * (1 - done_batch) 42 | return s_batch, a_batch, y_batch 43 | 44 | def train_model(self): 45 | s_batch, a_batch, r_batch, next_s_batch, done_batch = self.get_transition_batch() 46 | self.model.update(*self.preprocess_batch(s_batch, a_batch, r_batch, next_s_batch, done_batch)) 47 | 48 | 49 | -------------------------------------------------------------------------------- /agent/reinforce.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | class Agent(object): 5 | def __init__(self, model, 
discout_factor, verbose=False): 6 | self.model = model 7 | self.discout_factor = discout_factor 8 | self.verbose = verbose 9 | self.sess = tf.Session() 10 | self.sess.run(tf.global_variables_initializer()) 11 | self.state_rollout = [] 12 | self.action_rollout = [] 13 | self.reward_rollout = [] 14 | self.done_rollout = [] 15 | 16 | def state_append(self, state): 17 | self.state_rollout.append(state) 18 | 19 | def action_append(self, action): 20 | self.action_rollout.append(action) 21 | 22 | def reward_append(self, reward): 23 | self.reward_rollout.append(reward) 24 | 25 | def predict_policy(self, observation): 26 | return self.model.predict_policy([observation], self.sess) 27 | 28 | def train_model(self): 29 | w = np.array([]) 30 | b = np.array([]) 31 | for i, sar in enumerate(zip(self.state_rollout, self.action_rollout, self.reward_rollout)): 32 | s, a, r = sar 33 | _, total_loss, policy_loss, base_line_loss = self.model.update([s], 34 | [a], 35 | [[sum(self.discout_factor**i_ * rwd for i_, rwd in enumerate(self.reward_rollout[i:]))]], 36 | self.sess) 37 | if i%10 == 0: 38 | print(base_line_loss) 39 | if self.verbose: 40 | if i%10 == 0: 41 | print(base_line_loss) 42 | print(total_loss) 43 | w_p, b_p = w.copy(), b.copy() 44 | w, b = model.run_layer_weight() 45 | if i > 0: 46 | print(w-w_p) 47 | print(b-b_p) 48 | 49 | def clear_rollout(self): 50 | del self.state_rollout[:] 51 | del self.action_rollout[:] 52 | del self.reward_rollout[:] 53 | 54 | -------------------------------------------------------------------------------- /mechanism/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/TensorAgent/7176d7fa5cbc20d3d31e9c01f6c1424bd3501ecc/mechanism/__init__.py -------------------------------------------------------------------------------- /mechanism/ou_process.py: -------------------------------------------------------------------------------- 1 | ''' 2 | refer to openai 3 | https://github.com/rll/rllab/blob/master/rllab/exploration_strategies/ou_strategy.py 4 | ''' 5 | 6 | import numpy as np 7 | 8 | class OU_Process(object): 9 | def __init__(self, action_dim, theta=0.15, mu=0, sigma=0.2): 10 | self.action_dim = action_dim 11 | self.theta = theta 12 | self.mu = mu 13 | self.sigma = sigma 14 | self.current_x = None 15 | 16 | self.init_process() 17 | 18 | def init_process(self): 19 | self.current_x = np.ones(self.action_dim) * self.mu 20 | 21 | def update_process(self): 22 | dx = self.theta * (self.mu - self.current_x) + self.sigma * np.random.randn(self.action_dim) 23 | self.current_x = self.current_x + dx 24 | 25 | def return_noise(self): 26 | self.update_process() 27 | return self.current_x 28 | 29 | if __name__ == "__main__": 30 | ou = OU_Process(3, theta=0.15, mu=0, sigma=0.2) 31 | states = [] 32 | for i in range(10000): 33 | states.append(ou.return_noise()[0]) 34 | import matplotlib.pyplot as plt 35 | 36 | plt.plot(states) 37 | plt.show() 38 | 39 | -------------------------------------------------------------------------------- /mechanism/replay_buffer.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import random 3 | 4 | class Replay_Buffer(object): 5 | def __init__(self, buffer_size=10e6, batch_size=1): 6 | self.buffer_size = buffer_size 7 | self.batch_size = batch_size 8 | self.memory = deque(maxlen=buffer_size) 9 | 10 | def __call__(self): 11 | return self.memory 12 | 13 | def store_transition(self, transition): 14 | 
self.memory.append(transition) 15 | 16 | def store_transitions(self, transitions): 17 | self.memory.extend(transitions) 18 | 19 | def get_batch(self, batch_size=None): 20 | b_s = batch_size or self.batch_size 21 | cur_men_size = len(self.memory) 22 | if cur_men_size < b_s: 23 | return random.sample(list(self.memory), cur_men_size) 24 | else: 25 | return random.sample(list(self.memory), b_s) 26 | 27 | def memory_state(self): 28 | return {"buffer_size": self.buffer_size, 29 | "current_size": len(self.memory), 30 | "full": len(self.memory)==self.buffer_size} 31 | 32 | def empty_transition(self): 33 | self.memory.clear() 34 | 35 | if __name__ == '__main__': 36 | import numpy as np 37 | replay_buffer = Replay_Buffer(buffer_size=4) 38 | print(replay_buffer.memory_state()) 39 | replay_buffer.store_transition([1, 2, 3, 4, False]) 40 | print(replay_buffer.memory_state()) 41 | replay_buffer.store_transition([2, 2, 3, 4, False]) 42 | print(replay_buffer.memory_state()) 43 | replay_buffer.store_transition([3, 2, 3, 4, True]) 44 | print(replay_buffer.memory_state()) 45 | print(replay_buffer()) 46 | 47 | replay_buffer.store_transition([4, 2, 3, 4, True]) 48 | print(replay_buffer.memory_state()) 49 | print(replay_buffer()) 50 | 51 | replay_buffer.store_transitions([[5, 2, 3, 4, False], 52 | [6, 2, 3, 4, True]]) 53 | print(replay_buffer.memory_state()) 54 | print(replay_buffer()) 55 | 56 | batch = replay_buffer.get_batch(3) 57 | print("batch", batch) 58 | transpose_batch = list(zip(*batch)) 59 | print("transpose_batch", transpose_batch) 60 | s_batch = np.array(transpose_batch[0]) 61 | a_batch = list(transpose_batch[1]) 62 | r_batch = list(transpose_batch[2]) 63 | next_s_batch = list(transpose_batch[3]) 64 | done_batch = np.array(transpose_batch[4]) 65 | print("s_batch", s_batch) 66 | print("a_batch", a_batch) 67 | print("r_batch", r_batch) 68 | print("next_s_batch", next_s_batch) 69 | print("done_batch", done_batch) 70 | print((1-done_batch)*s_batch) 71 | 72 | replay_buffer.empty_transition() 73 | print(replay_buffer.memory_state()) 74 | print(replay_buffer()) 75 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/TensorAgent/7176d7fa5cbc20d3d31e9c01f6c1424bd3501ecc/model/__init__.py -------------------------------------------------------------------------------- /model/ddpg_actor.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from math import sqrt 3 | 4 | class DDPG_Actor(object): 5 | def __init__(self, state_dim, action_dim, optimizer=None, learning_rate=0.001, tau=0.001, scope="", sess=None): 6 | self.scope = scope 7 | self.sess = sess 8 | self.state_dim = state_dim 9 | self.action_dim = action_dim 10 | self.learning_rate = learning_rate 11 | self.l2_reg = 0.01 12 | self.optimizer = optimizer or tf.train.AdamOptimizer(self.learning_rate) 13 | self.tau = tau 14 | self.h1_dim = 400 15 | self.h2_dim = 300 16 | # self.h3_dim = 200 17 | self.activation = tf.nn.relu 18 | self.kernel_initializer = tf.contrib.layers.variance_scaling_initializer() 19 | # fan-out uniform initializer which is different from original paper 20 | self.kernel_initializer_1 = tf.random_uniform_initializer(minval=-1/sqrt(self.h1_dim), maxval=1/sqrt(self.h1_dim)) 21 | self.kernel_initializer_2 = tf.random_uniform_initializer(minval=-1/sqrt(self.h2_dim), maxval=1/sqrt(self.h2_dim)) 22 | 
self.kernel_initializer_3 = tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3) 23 | self.kernel_regularizer = tf.contrib.layers.l2_regularizer(self.l2_reg) 24 | 25 | with tf.name_scope("actor_input"): 26 | self.input_state = tf.placeholder(tf.float32, shape=[None, self.state_dim], name="states") 27 | 28 | with tf.name_scope("actor_label"): 29 | self.actions_grad = tf.placeholder(tf.float32, shape=[None, self.action_dim], name="actions_grad") 30 | 31 | self.source_var_scope = "ddpg/" + "actor_net" 32 | with tf.variable_scope(self.source_var_scope): 33 | self.action_output = self.__create_actor_network() 34 | 35 | self.target_var_scope = "ddpg/" + "actor_target_net" 36 | with tf.variable_scope(self.target_var_scope): 37 | self.target_net_actions_output = self.__create_target_network() 38 | 39 | with tf.name_scope("compute_policy_gradients"): 40 | self.__create_loss() 41 | 42 | self.train_op_scope = "actor_train_op" 43 | with tf.variable_scope(self.train_op_scope): 44 | self.__create_train_op() 45 | 46 | with tf.name_scope("actor_target_update_train_op"): 47 | self.__create_update_target_net_op() 48 | 49 | self.__create_get_layer_weight_op_source() 50 | self.__create_get_layer_weight_op_target() 51 | 52 | def __create_actor_network(self): 53 | h1 = tf.layers.dense(self.input_state, 54 | units=self.h1_dim, 55 | activation=self.activation, 56 | kernel_initializer=self.kernel_initializer_1, 57 | # kernel_initializer=self.kernel_initializer, 58 | kernel_regularizer=self.kernel_regularizer, 59 | name="hidden_1") 60 | 61 | h2 = tf.layers.dense(h1, 62 | units=self.h2_dim, 63 | activation=self.activation, 64 | kernel_initializer=self.kernel_initializer_2, 65 | # kernel_initializer=self.kernel_initializer, 66 | kernel_regularizer=self.kernel_regularizer, 67 | name="hidden_2") 68 | 69 | # h3 = tf.layers.dense(h2, 70 | # units=self.h3_dim, 71 | # activation=self.activation, 72 | # kernel_initializer=self.kernel_initializer, 73 | # kernel_regularizer=self.kernel_regularizer, 74 | # name="hidden_3") 75 | 76 | action_output = tf.layers.dense(h2, 77 | units=self.action_dim, 78 | activation=tf.nn.tanh, 79 | # activation=tf.nn.tanh, 80 | kernel_initializer=self.kernel_initializer_3, 81 | # kernel_initializer=self.kernel_initializer, 82 | kernel_regularizer=self.kernel_regularizer, 83 | use_bias=False, 84 | name="action_outputs") 85 | 86 | return action_output 87 | 88 | def __create_target_network(self): 89 | # get source variales and initialize 90 | source_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.source_var_scope) 91 | self.sess.run(tf.variables_initializer(source_vars)) 92 | 93 | # create target network and initialize it by source network 94 | action_output = self.__create_actor_network() 95 | target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.target_var_scope) 96 | 97 | target_init_op_list = [target_vars[i].assign(source_vars[i]) for i in range(len(source_vars))] 98 | self.sess.run(target_init_op_list) 99 | 100 | return action_output 101 | 102 | def __create_loss(self): 103 | source_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.source_var_scope) 104 | self.policy_gradient = tf.gradients(self.action_output, source_vars, -self.actions_grad) 105 | self.grads_and_vars = zip(self.policy_gradient, source_vars) 106 | 107 | def __create_train_op(self): 108 | self.train_policy_op = self.optimizer.apply_gradients(self.grads_and_vars, global_step=tf.contrib.framework.get_global_step()) 109 | train_op_vars = 
tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope= self.scope + "/" + self.train_op_scope) # to do: remove prefix 110 | train_op_vars.extend(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.train_op_scope)) 111 | self.sess.run(tf.variables_initializer(train_op_vars)) 112 | 113 | def __create_update_target_net_op(self): 114 | source_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.source_var_scope) 115 | target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.target_var_scope) 116 | update_target_net_op_list = [target_vars[i].assign(self.tau*source_vars[i] + (1-self.tau)*target_vars[i]) for i in range(len(source_vars))] 117 | 118 | # source_net_dict = {var.name[len(self.source_var_scope):]: var for var in source_vars} 119 | # target_net_dict = {var.name[len(self.target_var_scope):]: var for var in target_vars} 120 | # keys = source_net_dict.keys() 121 | # update_target_net_op_list = [target_net_dict[key].assign((1-self.tau)*target_net_dict[key]+self.tau*source_net_dict[key]) \ 122 | # for key in keys] 123 | 124 | # for s_v, t_v in zip(source_vars, target_vars): 125 | # update_target_net_op_list.append(t_v.assign(self.tau*s_v - (1-self.tau)*t_v)) 126 | 127 | self.update_target_net_op = tf.group(*update_target_net_op_list) 128 | 129 | def predict_action_source_net(self, feed_state, sess=None): 130 | sess = sess or self.sess 131 | return sess.run(self.action_output, {self.input_state: feed_state}) 132 | 133 | def predict_action_target_net(self, feed_state, sess=None): 134 | sess = sess or self.sess 135 | return sess.run(self.target_net_actions_output, {self.input_state: feed_state}) 136 | 137 | def update_source_actor_net(self, feed_state, actions_grad, sess=None): 138 | sess = sess or self.sess 139 | batch_size = len(actions_grad) 140 | return sess.run([self. 
train_policy_op], 141 | {self.input_state: feed_state, 142 | self.actions_grad: actions_grad/batch_size}) 143 | 144 | def update_target_actor_net(self, sess=None): 145 | sess = sess or self.sess 146 | return sess.run(self.update_target_net_op) 147 | 148 | def __create_get_layer_weight_op_source(self): 149 | with tf.variable_scope(self.source_var_scope, reuse=True): 150 | self.h1_weight_source = tf.get_variable("hidden_1/kernel") 151 | self.h1_bias_source = tf.get_variable("hidden_1/bias") 152 | 153 | def run_layer_weight_source(self, sess=None): 154 | sess = sess or self.sess 155 | return sess.run([self.h1_weight_source, self.h1_bias_source]) 156 | 157 | def __create_get_layer_weight_op_target(self): 158 | with tf.variable_scope(self.target_var_scope, reuse=True): 159 | self.h1_weight_target = tf.get_variable("hidden_1/kernel") 160 | self.h1_bias_target = tf.get_variable("hidden_1/bias") 161 | 162 | def run_layer_weight_target(self, sess=None): 163 | sess = sess or self.sess 164 | return sess.run([self.h1_weight_target, self.h1_bias_target]) 165 | 166 | 167 | if __name__ == '__main__': 168 | import numpy as np 169 | state_dim = 40 170 | action_dim = 3 171 | learning_rate = np.random.rand(1) 172 | print("learning_rate: ", learning_rate) 173 | tau = np.random.rand(1) 174 | print("tau: ", tau) 175 | sess = tf.Session() 176 | actor = DDPG_Actor(state_dim, action_dim, sess=sess, tau=tau, learning_rate=learning_rate[0]) 177 | # actor = DDPG_Actor(state_dim, action_dim, sess=sess, tau=tau) 178 | random_state = np.random.normal(size=state_dim) 179 | print("random_state", random_state) 180 | 181 | # check forward 182 | action = actor.predict_action_source_net([random_state], sess) 183 | print("predict action", action) 184 | 185 | # check update_source_net 186 | h1_weight, h1_bias = actor.run_layer_weight_source(sess) 187 | random_actions_grad = np.random.normal(size=action_dim) 188 | actor.update_source_actor_net([random_state], [random_actions_grad], sess) 189 | h1_weight_trained, h1_bias_trained = actor.run_layer_weight_source(sess) 190 | print("h1_weight_difference", (h1_weight_trained-h1_weight)) 191 | print("h1_bias_difference", (h1_bias_trained-h1_bias)) 192 | 193 | # check update target net 194 | h1_weight_target, h1_bias_target = actor.run_layer_weight_target(sess) 195 | actor.update_target_actor_net(sess) 196 | h1_weight_trained_target, h1_bias_trained_target = actor.run_layer_weight_target(sess) 197 | print("source_target_differece_weight", (h1_weight_trained - h1_weight_trained_target)) 198 | print("source_target_differece_bias", (h1_bias_trained - h1_bias_trained_target)) 199 | print("weight_error", h1_weight_trained_target - tau*h1_weight_trained + (1-tau)*h1_weight_target) 200 | print("bias_error", h1_bias_trained_target - tau*h1_bias_trained + (1-tau)*h1_bias_target) 201 | print(np.sum(np.abs(h1_weight_trained_target - tau*h1_weight_trained + (1-tau)*h1_weight_target))) 202 | print(np.sum(np.abs(h1_bias_trained_target - tau*h1_bias_trained + (1-tau)*h1_bias_target))) 203 | -------------------------------------------------------------------------------- /model/ddpg_critic.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from math import sqrt 3 | 4 | class DDPG_Critic(object): 5 | def __init__(self, state_dim, action_dim, optimizer=None, learning_rate=0.001, tau=0.001, scope="", sess=None): 6 | self.scope = scope 7 | self.sess = sess 8 | self.state_dim = state_dim 9 | self.action_dim = action_dim 10 | 
self.learning_rate = learning_rate 11 | self.l2_reg = 0.01 12 | self.optimizer = optimizer or tf.train.AdamOptimizer(self.learning_rate) 13 | self.tau = tau 14 | self.h1_dim = 400 15 | self.h2_dim = 100 16 | self.h3_dim = 300 17 | self.activation = tf.nn.relu 18 | self.kernel_initializer = tf.contrib.layers.variance_scaling_initializer() 19 | # fan-out uniform initializer which is different from original paper 20 | self.kernel_initializer_1 = tf.random_uniform_initializer(minval=-1/sqrt(self.h1_dim), maxval=1/sqrt(self.h1_dim)) 21 | self.kernel_initializer_2 = tf.random_uniform_initializer(minval=-1/sqrt(self.h2_dim), maxval=1/sqrt(self.h2_dim)) 22 | self.kernel_initializer_3 = tf.random_uniform_initializer(minval=-1/sqrt(self.h3_dim), maxval=1/sqrt(self.h3_dim)) 23 | self.kernel_initializer_4 = tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3) 24 | self.kernel_regularizer = tf.contrib.layers.l2_regularizer(self.l2_reg) 25 | 26 | with tf.name_scope("critic_input"): 27 | self.input_state = tf.placeholder(tf.float32, shape=[None, self.state_dim], name="states") 28 | self.input_action = tf.placeholder(tf.float32, shape=[None, self.action_dim], name="actions") 29 | 30 | with tf.name_scope("critic_label"): 31 | self.y = tf.placeholder(tf.float32, shape=[None, 1], name="y") 32 | 33 | self.source_var_scope = "ddpg/" + "critic_net" 34 | with tf.variable_scope(self.source_var_scope): 35 | self.q_output = self.__create_critic_network() 36 | 37 | self.target_var_scope = "ddpg/" + "critic_target_net" 38 | with tf.variable_scope(self.target_var_scope): 39 | self.target_net_q_output = self.__create_target_network() 40 | 41 | with tf.name_scope("compute_critic_loss"): 42 | self.__create_loss() 43 | 44 | self.train_op_scope = "critic_train_op" 45 | with tf.variable_scope(self.train_op_scope): 46 | self.__create_train_op() 47 | 48 | with tf.name_scope("critic_target_update_train_op"): 49 | self.__create_update_target_net_op() 50 | 51 | with tf.name_scope("get_action_grad_op"): 52 | self.__create_get_action_grad_op() 53 | 54 | self.__create_get_layer_weight_op_source() 55 | self.__create_get_layer_weight_op_target() 56 | 57 | def __create_critic_network(self): 58 | h1 = tf.layers.dense(self.input_state, 59 | units=self.h1_dim, 60 | activation=self.activation, 61 | kernel_initializer=self.kernel_initializer_1, 62 | # kernel_initializer=self.kernel_initializer, 63 | kernel_regularizer=self.kernel_regularizer, 64 | name="hidden_1") 65 | 66 | # h1_with_action = tf.concat([h1, self.input_action], 1, name="hidden_1_with_action") 67 | 68 | h2 = tf.layers.dense(self.input_action, 69 | units=self.h2_dim, 70 | activation=self.activation, 71 | kernel_initializer=self.kernel_initializer_2, 72 | # kernel_initializer=self.kernel_initializer, 73 | kernel_regularizer=self.kernel_regularizer, 74 | name="hidden_2") 75 | 76 | h_concat = tf.concat([h1, h2], 1, name="h_concat") 77 | 78 | h3 = tf.layers.dense(h_concat, 79 | units=self.h3_dim, 80 | activation=self.activation, 81 | kernel_initializer=self.kernel_initializer_3, 82 | # kernel_initializer=self.kernel_initializer, 83 | kernel_regularizer=self.kernel_regularizer, 84 | name="hidden_3") 85 | 86 | # h2_with_action = tf.concat([h2, self.input_action], 1, name="hidden_3_with_action") 87 | 88 | q_output = tf.layers.dense(h3, 89 | units=1, 90 | # activation=tf.nn.sigmoid, 91 | activation = None, 92 | kernel_initializer=self.kernel_initializer_4, 93 | # kernel_initializer=self.kernel_initializer, 94 | kernel_regularizer=self.kernel_regularizer, 95 | name="q_output") 96 
| 97 | return q_output 98 | 99 | def __create_target_network(self): 100 | # get source variales and initialize 101 | source_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.source_var_scope) 102 | self.sess.run(tf.variables_initializer(source_vars)) 103 | 104 | # create target network and initialize it by source network 105 | q_output = self.__create_critic_network() 106 | target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.target_var_scope) 107 | 108 | target_init_op_list = [target_vars[i].assign(source_vars[i]) for i in range(len(source_vars))] 109 | self.sess.run(target_init_op_list) 110 | 111 | return q_output 112 | 113 | def __create_loss(self): 114 | self.loss = tf.losses.mean_squared_error(self.y, self.q_output) 115 | 116 | def __create_train_op(self): 117 | self.train_q_op = self.optimizer.minimize(self.loss) 118 | train_op_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope= self.scope + "/" + self.train_op_scope) # to do: remove prefix 119 | train_op_vars.extend(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.train_op_scope)) 120 | self.sess.run(tf.variables_initializer(train_op_vars)) 121 | 122 | def __create_update_target_net_op(self): 123 | source_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.source_var_scope) 124 | target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.target_var_scope) 125 | update_target_net_op_list = [target_vars[i].assign(self.tau*source_vars[i] + (1-self.tau)*target_vars[i]) for i in range(len(source_vars))] 126 | # source_net_dict = {var.name[len(self.source_var_scope):]: var for var in source_vars} 127 | # target_net_dict = {var.name[len(self.target_var_scope):]: var for var in target_vars} 128 | # keys = source_net_dict.keys() 129 | # update_target_net_op_list = [target_net_dict[key].assign((1-self.tau)*target_net_dict[key]+self.tau*source_net_dict[key]) \ 130 | # for key in keys] 131 | 132 | # for s_v, t_v in zip(source_vars, target_vars): 133 | # update_target_net_op_list.append(t_v.assign(self.tau*s_v - (1-self.tau)*t_v)) 134 | 135 | self.update_target_net_op = tf.group(*update_target_net_op_list) 136 | 137 | def __create_get_action_grad_op(self): 138 | self.get_action_grad_op = tf.gradients(self.q_output, self.input_action) 139 | 140 | def predict_q_source_net(self, feed_state, feed_action, sess=None): 141 | sess = sess or self.sess 142 | return sess.run(self.q_output, {self.input_state: feed_state, 143 | self.input_action: feed_action}) 144 | 145 | def predict_q_target_net(self, feed_state, feed_action, sess=None): 146 | sess = sess or self.sess 147 | return sess.run(self.target_net_q_output, {self.input_state: feed_state, 148 | self.input_action: feed_action}) 149 | 150 | def update_source_critic_net(self, feed_state, feed_action, feed_y, sess=None): 151 | sess = sess or self.sess 152 | return sess.run([self.train_q_op], 153 | {self.input_state: feed_state, 154 | self.input_action: feed_action, 155 | self.y: feed_y}) 156 | 157 | def update_target_critic_net(self, sess=None): 158 | sess = sess or self.sess 159 | return sess.run(self.update_target_net_op) 160 | 161 | def get_action_grads(self, feed_state, feed_action, sess=None): 162 | sess = sess or self.sess 163 | return (sess.run(self.get_action_grad_op, {self.input_state: feed_state, 164 | self.input_action: feed_action}))[0] 165 | 166 | def __create_get_layer_weight_op_source(self): 167 | with tf.variable_scope(self.source_var_scope, reuse=True): 168 | self.h1_weight_source = 
tf.get_variable("hidden_1/kernel") 169 | self.h1_bias_source = tf.get_variable("hidden_1/bias") 170 | 171 | def run_layer_weight_source(self, sess=None): 172 | sess = sess or self.sess 173 | return sess.run([self.h1_weight_source, self.h1_bias_source]) 174 | 175 | def __create_get_layer_weight_op_target(self): 176 | with tf.variable_scope(self.target_var_scope, reuse=True): 177 | self.h1_weight_target = tf.get_variable("hidden_1/kernel") 178 | self.h1_bias_target = tf.get_variable("hidden_1/bias") 179 | 180 | def run_layer_weight_target(self, sess=None): 181 | sess = sess or self.sess 182 | return sess.run([self.h1_weight_target, self.h1_bias_target]) 183 | 184 | if __name__ == '__main__': 185 | import numpy as np 186 | state_dim = 40 187 | action_dim = 3 188 | learning_rate = np.random.rand(1) 189 | print("learning_rate: ", learning_rate) 190 | tau = np.random.rand(1) 191 | print("tau: ", tau) 192 | sess = tf.Session() 193 | critic = DDPG_Critic(state_dim, action_dim, sess=sess, tau=tau, learning_rate=learning_rate[0]) 194 | # critic = DDPG_Actor(state_dim, action_dim, sess=sess, tau=tau) 195 | random_state = np.random.normal(size=state_dim) 196 | print("random_state", random_state) 197 | 198 | random_action = np.random.random(size=action_dim) 199 | print("random_action", random_action) 200 | 201 | # check forward 202 | target_q = critic.predict_q_target_net([random_state], [random_action], sess) 203 | print("predict target q", target_q) 204 | 205 | # check update_source_net 206 | y = target_q[0] + 1 207 | h1_weight, h1_bias = critic.run_layer_weight_source(sess) 208 | random_actions_grad = np.random.normal(size=action_dim) 209 | critic.update_source_critic_net([random_state], [random_action], [y], sess) 210 | h1_weight_trained, h1_bias_trained = critic.run_layer_weight_source(sess) 211 | print("h1_weight_difference", (h1_weight_trained-h1_weight)) 212 | print("h1_bias_difference", (h1_bias_trained-h1_bias)) 213 | 214 | # check update target net 215 | h1_weight_target, h1_bias_target = critic.run_layer_weight_target(sess) 216 | critic.update_target_critic_net(sess) 217 | h1_weight_trained_target, h1_bias_trained_target = critic.run_layer_weight_target(sess) 218 | print("source_target_differece_weight", (h1_weight_trained - h1_weight_trained_target)) 219 | print("source_target_differece_bias", (h1_bias_trained - h1_bias_trained_target)) 220 | print("weight_error", h1_weight_trained_target - tau*h1_weight_trained + (1-tau)*h1_weight_target) 221 | print("bias_error", h1_bias_trained_target - tau*h1_bias_trained + (1-tau)*h1_bias_target) 222 | print(np.sum(np.abs(h1_weight_trained_target - tau*h1_weight_trained + (1-tau)*h1_weight_target))) 223 | print(np.sum(np.abs(h1_bias_trained_target - tau*h1_bias_trained + (1-tau)*h1_bias_target))) 224 | 225 | # check get action grad 226 | random_action_for_grad = np.random.random(size=action_dim) 227 | print("random_actions_grad", random_action_for_grad) 228 | action_grad = critic.get_action_grads([random_state], [random_action_for_grad], sess) 229 | # print("action_grad", action_grad) 230 | for i in action_grad: 231 | print(i) 232 | -------------------------------------------------------------------------------- /model/ddpg_model.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | lib_path = os.path.abspath(os.path.dirname(__file__)) 3 | sys.path.append(lib_path) 4 | 5 | import tensorflow as tf 6 | from ddpg_actor import DDPG_Actor 7 | from ddpg_critic import DDPG_Critic 8 | 9 | 10 | class 
Model(object): 11 | def __init__(self, 12 | state_dim, 13 | action_dim, 14 | optimizer=None, 15 | actor_learning_rate=1e-4, 16 | critic_learning_rate=1e-3, 17 | tau = 0.001, 18 | sess=None): 19 | self.state_dim = state_dim 20 | self.action_dim = action_dim 21 | self.actor_learning_rate = actor_learning_rate 22 | self.critic_learning_rate = critic_learning_rate 23 | self.tau = tau 24 | 25 | #tf.reset_default_graph() 26 | self.sess = sess or tf.Session() 27 | 28 | self.global_step = tf.Variable(0, name="global_step", trainable=False) 29 | global_step_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="global_step") 30 | self.sess.run(tf.variables_initializer(global_step_vars)) 31 | 32 | self.actor_scope = "actor_net" 33 | with tf.name_scope(self.actor_scope): 34 | self.actor = DDPG_Actor(self.state_dim, 35 | self.action_dim, 36 | learning_rate=self.actor_learning_rate, 37 | tau=self.tau, 38 | scope=self.actor_scope, 39 | sess=self.sess) 40 | 41 | self.critic_scope = "critic_net" 42 | with tf.name_scope(self.critic_scope): 43 | self.critic = DDPG_Critic(self.state_dim, 44 | self.action_dim, 45 | learning_rate=self.critic_learning_rate, 46 | tau=self.tau, 47 | scope=self.critic_scope, 48 | sess=self.sess) 49 | 50 | def update(self, state_batch, action_batch, y_batch, sess=None): 51 | sess = sess or self.sess 52 | self.critic.update_source_critic_net(state_batch, action_batch, y_batch, sess) 53 | action_batch_for_grad = self.actor.predict_action_source_net(state_batch, sess) 54 | action_grad_batch = self.critic.get_action_grads(state_batch, action_batch_for_grad, sess) 55 | self.actor.update_source_actor_net(state_batch, action_grad_batch, sess) 56 | 57 | self.critic.update_target_critic_net(sess) 58 | self.actor.update_target_actor_net(sess) 59 | 60 | def predict_action(self, observation, sess=None): 61 | sess = sess or self.sess 62 | return self.actor.predict_action_source_net(observation, sess) 63 | 64 | if __name__ == '__main__': 65 | import numpy as np 66 | state_dim = 40 67 | action_dim = 3 68 | actor_learning_rate = np.random.rand(1) 69 | print("actor_learning_rate: ", actor_learning_rate) 70 | critic_learning_rate = np.random.rand(1) 71 | print("critic_learning_rate: ", critic_learning_rate) 72 | tau = np.random.rand(1) 73 | print("tau: ", tau) 74 | sess = tf.Session() 75 | model = Model(state_dim, 76 | action_dim, 77 | tau=tau, 78 | actor_learning_rate=actor_learning_rate[0], 79 | critic_learning_rate=critic_learning_rate[0], 80 | sess=sess) 81 | random_state = np.random.normal(size=state_dim) 82 | print("random_state", random_state) 83 | 84 | random_action = np.random.random(size=action_dim) 85 | print("random_action", random_action) 86 | 87 | # check prediction 88 | pred_action = model.predict_action(random_state) 89 | print("predict_action", pred_action) 90 | 91 | # check forward 92 | target_q = model.critic.predict_q_target_net([random_state], [random_action], sess) 93 | print("predict target q", target_q) 94 | y = target_q[0] + 1 95 | 96 | model.update([random_state], [random_action], [y]) 97 | -------------------------------------------------------------------------------- /model/reinforce_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | class Model(object): 4 | def __init__(self, state_dim, action_dim, entropy_beta=1e-3, optimizer=None, learning_rate=0.001): 5 | self.state_dim = state_dim 6 | self.action_dim = action_dim 7 | self.entropy_beta = entropy_beta 8 | self.learning_rate = 
learning_rate 9 | tf.reset_default_graph() 10 | self.global_step = tf.Variable(0, name="global_step", trainable=False) 11 | self.optimizer = optimizer or tf.train.RMSPropOptimizer(self.learning_rate) 12 | 13 | with tf.name_scope("model_input"): 14 | self.input_state = tf.placeholder(tf.float32, shape=[None, self.state_dim], name="states") 15 | 16 | with tf.name_scope("model_target"): 17 | self.taken_actions = tf.placeholder(tf.int32, shape=[None, 1], name="taken_actions") 18 | self.future_rewards = tf.placeholder(tf.float32, shape=[None, 1], name="future_rewards") 19 | 20 | with tf.name_scope("model"): 21 | self.__create_model() 22 | 23 | def __create_policy_network(self): 24 | with tf.variable_scope("shared_network"): 25 | h1 = tf.layers.dense(self.input_state, 26 | units=32, 27 | activation=tf.nn.relu, 28 | kernel_initializer=tf.contrib.layers.xavier_initializer(), 29 | name="hidden_1") 30 | 31 | h2 = tf.layers.dense(h1, 32 | units=32, 33 | activation=tf.nn.relu, 34 | kernel_initializer=tf.contrib.layers.xavier_initializer(), 35 | name="hidden_2") 36 | 37 | with tf.variable_scope("policy_network"): 38 | self.policy_outputs = tf.layers.dense(h2, 39 | units=self.action_dim, 40 | activation=tf.nn.softmax, 41 | kernel_initializer=tf.contrib.layers.xavier_initializer(), 42 | name="policy_outputs") 43 | 44 | def __creat_base_line_network(self): 45 | with tf.variable_scope("shared_network"): 46 | h1 = tf.layers.dense(self.input_state, 47 | units=32, 48 | activation=tf.nn.relu, 49 | kernel_initializer=tf.contrib.layers.xavier_initializer(), 50 | name="hidden_1", 51 | reuse=True) 52 | 53 | h2 = tf.layers.dense(h1, 54 | units=32, 55 | activation=tf.nn.relu, 56 | kernel_initializer=tf.contrib.layers.xavier_initializer(), 57 | name="hidden_2", 58 | reuse=True) 59 | 60 | with tf.variable_scope("base_line_network"): 61 | self.base_line_outputs = tf.layers.dense(h2, 62 | units=1, 63 | activation=None, 64 | kernel_initializer=tf.contrib.layers.xavier_initializer(), 65 | name="base_line_outputs") 66 | 67 | def __create_loss(self): 68 | with tf.name_scope("compute_policy_gradients"): 69 | self.log_probs = tf.log(tf.clip_by_value(self.policy_outputs, 1e-20, 1.0)) 70 | 71 | # entropy loss of exploration 72 | self.entropy_loss = -tf.reduce_sum(self.policy_outputs * self.log_probs, reduction_indices=1) 73 | self.policy_loss = -tf.reduce_sum(tf.multiply(self.log_probs, tf.squeeze(tf.cast(tf.one_hot(self.taken_actions, self.action_dim), tf.float32))) * \ 74 | (self.future_rewards - tf.stop_gradient(self.base_line_outputs)), reduction_indices=1) 75 | # self.policy_loss = -tf.reduce_sum(tf.multiply(self.log_probs, tf.squeeze(tf.cast(tf.one_hot(self.taken_actions, self.action_dim), tf.float32))) * \ 76 | # (self.future_rewards), reduction_indices=1) 77 | td = self.base_line_outputs - self.future_rewards 78 | td = tf.clip_by_value(td, -5.0, 5.0) 79 | # self.base_line_loss = tf.nn.l2_loss(self.base_line_outputs - self.future_rewards) 80 | self.base_line_loss = tf.reduce_sum(td**2 / 2, reduction_indices=1) 81 | self.total_loss = tf.add_n([self.policy_loss, 82 | self.entropy_beta * self.entropy_loss, 83 | self.base_line_loss]) 84 | 85 | def __create_train_policy_op(self): 86 | self.train_policy_op = self.optimizer.minimize(self.policy_loss, 87 | global_step=tf.contrib.framework.get_global_step()) 88 | 89 | def __create_train_op(self): 90 | self.train_op = self.optimizer.minimize(self.total_loss, 91 | global_step=tf.contrib.framework.get_global_step()) 92 | 93 | def __create_model(self): 94 | 
self.__create_policy_network() 95 | self.__creat_base_line_network() 96 | self.__create_loss() 97 | self.__create_train_policy_op() 98 | self.__create_train_op() 99 | self.__create_get_layer_weight_op() 100 | 101 | def predict_policy(self, feed_state, sess=None): 102 | sess = sess or tf.get_default_session() 103 | return sess.run(self.policy_outputs, {self.input_state: feed_state}) 104 | 105 | def update_policy(self, feed_state, feed_taken_actions, feed_future_rewards, sess=None): 106 | sess = sess or tf.get_default_session() 107 | return sess.run([self. train_policy_op, self.policy_loss], 108 | {self.input_state: feed_state, 109 | self.taken_actions: feed_taken_actions, 110 | self.future_rewards: feed_future_rewards}) 111 | 112 | def predict_baseline(self, feed_state, sess=None): 113 | sess = sess or tf.get_default_session() 114 | return sess.run(self.base_line_outputs, {self.input_state: feed_state}) 115 | 116 | def update_baseline(self, feed_state, feed_future_rewards, sess=None): 117 | sess = sess or tf.get_default_session() 118 | return sess.run(self.base_line_loss, {self.input_state: feed_state, 119 | self.future_rewards: feed_future_rewards}) 120 | 121 | def update(self, feed_state, feed_taken_actions, feed_future_rewards, sess=None): 122 | sess = sess or tf.get_default_session() 123 | return sess.run([self.train_op, self.total_loss, self.policy_loss, self.base_line_loss], 124 | {self.input_state: feed_state, 125 | self.taken_actions: feed_taken_actions, 126 | self.future_rewards: feed_future_rewards}) 127 | 128 | def __create_get_layer_weight_op(self): 129 | with tf.name_scope("model"): 130 | with tf.variable_scope("shared_network", reuse=True): 131 | self.h1_weiget = tf.get_variable("hidden_1/kernel") 132 | self.h1_bias = tf.get_variable("hidden_1/bias") 133 | 134 | def run_layer_weight(self, sess=None): 135 | sess = sess or tf.get_default_session() 136 | return sess.run([self.h1_weiget, self.h1_bias]) 137 | 138 | 139 | if __name__ == '__main__': 140 | model = Model(40, 5) 141 | -------------------------------------------------------------------------------- /run_ddpg.bat: -------------------------------------------------------------------------------- 1 | mode con: cols=80 lines=100 2 | 3 | 4 | if NOT "%ComputerName%" == "PC-KW-60002" ( 5 | set CUDA_VISIBLE_DEVICES=0 & activate tensorflow & python run_ddpg.py 6 | ) else ( 7 | set CUDA_VISIBLE_DEVICES=0 & activate deep-fpv-racer & python run_ddpg.py & set /p temp="Hit enter to exit" 8 | ) -------------------------------------------------------------------------------- /run_ddpg.py: -------------------------------------------------------------------------------- 1 | from model.ddpg_model import Model 2 | from agent.ddpg import Agent 3 | from mechanism.replay_buffer import Replay_Buffer 4 | from mechanism.ou_process import OU_Process 5 | from gym import wrappers 6 | import gym 7 | import numpy as np 8 | 9 | ENV_NAME = 'Pendulum-v0' 10 | EPISODES = 100000 11 | MAX_EXPLORE_EPS = 100 12 | TEST_EPS = 1 13 | BATCH_SIZE = 64 14 | BUFFER_SIZE = 1e6 15 | WARM_UP_MEN = 5 * BATCH_SIZE 16 | DISCOUNT_FACTOR = 0.99 17 | ACTOR_LEARNING_RATE = 1e-4 18 | CRITIC_LEARNING_RATE = 1e-3 19 | TAU = 0.001 20 | 21 | def main(): 22 | env = gym.make(ENV_NAME) 23 | env = wrappers.Monitor(env, ENV_NAME+"experiment-1", force=True) 24 | state_dim = env.observation_space.shape[0] 25 | action_dim = env.action_space.shape[0] 26 | model = Model(state_dim, 27 | action_dim, 28 | actor_learning_rate=ACTOR_LEARNING_RATE, 29 | critic_learning_rate=CRITIC_LEARNING_RATE, 
30 | tau=TAU) 31 | replay_buffer = Replay_Buffer(buffer_size=int(BUFFER_SIZE) ,batch_size=BATCH_SIZE) 32 | exploration_noise = OU_Process(action_dim) 33 | agent = Agent(model, replay_buffer, exploration_noise, discout_factor=DISCOUNT_FACTOR) 34 | 35 | action_mean = 0 36 | i = 0 37 | for episode in range(EPISODES): 38 | state = env.reset() 39 | agent.init_process() 40 | # Training: 41 | for step in range(env.spec.timestep_limit): 42 | # env.render() 43 | state = np.reshape(state, (1, -1)) 44 | if episode < MAX_EXPLORE_EPS: 45 | p = episode / MAX_EXPLORE_EPS 46 | action = np.clip(agent.select_action(state, p), -1.0, 1.0) 47 | else: 48 | action = agent.predict_action(state) 49 | action_ = action * 2 50 | next_state, reward, done, _ = env.step(action_) 51 | next_state = np.reshape(next_state, (1, -1)) 52 | agent.store_transition([state, action, reward, next_state, done]) 53 | if agent.replay_buffer.memory_state()["current_size"] > WARM_UP_MEN: 54 | agent.train_model() 55 | else: 56 | i += 1 57 | action_mean = action_mean + (action - action_mean) / i 58 | print("running action mean: {}".format(action_mean)) 59 | state = next_state 60 | if done: 61 | break 62 | 63 | # Testing: 64 | if episode % 2 == 0 and episode > 10: 65 | total_reward = 0 66 | for i in range(TEST_EPS): 67 | state = env.reset() 68 | for j in range(env.spec.timestep_limit): 69 | # env.render() 70 | state = np.reshape(state, (1, 3)) 71 | action = agent.predict_action(state) 72 | action_ = action * 2 73 | state, reward, done, _ = env.step(action_) 74 | total_reward += reward 75 | if done: 76 | break 77 | avg_reward = total_reward/TEST_EPS 78 | print("episode: {}, Evaluation Average Reward: {}".format(episode, avg_reward)) 79 | 80 | if __name__ == '__main__': 81 | main() 82 | -------------------------------------------------------------------------------- /run_reinforce.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import wrappers 3 | import numpy as np 4 | import random 5 | from model.reinforce_model import Model 6 | from agent.reinforce import Agent 7 | 8 | import logging 9 | 10 | # config logger 11 | logger = logging.getLogger(__name__) 12 | logger.setLevel(logging.INFO) 13 | 14 | # make environment 15 | env_name = 'CartPole-v0' 16 | env = gym.make(env_name) 17 | env = wrappers.Monitor(env, env_name+"experiment-1", force=True) 18 | logger.info("{} is made".format(env_name)) 19 | action_dim = env.action_space.n 20 | state_dim = env.observation_space.shape[0] 21 | logger.info("action dimension of env is {}".format(action_dim)) 22 | logger.info("state dimension of env is {}".format(state_dim)) 23 | 24 | # parameters 25 | MAX_EPISODE = 100000 26 | MAX_STEP = 1000 27 | DISCOUNT_FACTOR = 0.97 28 | ENTROPY_BETA = 1e-3 29 | LEARNING_RATE = 0.001 30 | VERBOSE = False 31 | 32 | model = Model(state_dim, action_dim, entropy_beta=ENTROPY_BETA, learning_rate=LEARNING_RATE) 33 | agent = Agent(model, DISCOUNT_FACTOR, VERBOSE) 34 | last_100_epi_red = [] 35 | for i_episode in xrange(MAX_EPISODE): 36 | observation = env.reset() 37 | episode_reward = 0 38 | for t in xrange(MAX_STEP): 39 | agent.state_append(observation) 40 | # env.render() 41 | p = agent.predict_policy(observation) 42 | action = np.random.choice(action_dim, 1, p=p[0]) 43 | observation, reward, done, info = env.step(action[0]) 44 | episode_reward += reward 45 | 46 | if done and episode_reward != 200: 47 | reward = -10 48 | elif done and episode_reward == 200: 49 | reward = 10 50 | print("positive done!") 51 | 
agent.action_append(action) 52 | agent.reward_append(reward) 53 | 54 | if done: 55 | last_100_epi_red.insert(0, episode_reward) 56 | if len(last_100_epi_red) > 100: 57 | last_100_epi_red.pop() 58 | logger.info("episode {} finished after {} timesteps with total reward {}".format(i_episode, t+1, episode_reward)) 59 | avg_reward = sum(last_100_epi_red) / float(len(last_100_epi_red)) 60 | logger.info("last 100 episodes average reward is {}".format(sum(last_100_epi_red) / float(len(last_100_epi_red)))) 61 | if avg_reward >= 195.0: 62 | print("problem solved!") 63 | exit() 64 | break 65 | agent.train_model() 66 | agent.clear_rollout() 67 | --------------------------------------------------------------------------------
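A note on the math the code above implements. agent/ddpg.py builds the critic target as y = r + γ·(1 − done)·Q′(s′, μ′(s′)) in preprocess_batch, and both DDPG networks track their targets with the soft update θ′ ← τ·θ + (1 − τ)·θ′. In agent/reinforce.py, train_model recomputes the discounted return Σ_k γ^k · r_{t+k} with an inline sum for every step t, which is O(T²) per rollout; the same returns can be obtained in a single reverse pass. A minimal NumPy sketch of that reverse pass — the function name compute_returns is illustrative and not part of this repository:

import numpy as np

def compute_returns(rewards, gamma):
    # Discounted returns G_t = sum_k gamma**k * r_{t+k}, accumulated back to front.
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

# With DISCOUNT_FACTOR = 0.97 as in run_reinforce.py:
print(compute_returns([1.0, 1.0, 1.0], gamma=0.97))  # [2.9109, 1.97, 1.0]

Returns computed this way match, value for value, what Agent.train_model in agent/reinforce.py passes to Model.update as the future_rewards feed, one scalar per visited state.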