├── .gitignore
├── A2C
│   ├── A2C_Continuous.py
│   └── A2C_Discrete.py
├── A3C
│   ├── A3C_Continuous.py
│   └── A3C_Discrete.py
├── DDPG
│   └── DDPG_Continuous.py
├── DQN
│   └── DQN_Discrete.py
├── DRQN
│   └── DRQN_Discrete.py
├── DoubleDQN
│   └── DoubleDQN_Discrete.py
├── DuelingDQN
│   └── DuelingDQN_Discrete.py
├── DuelingDoubleDQN
│   └── DuelingDoubleDQN_Discrete.py
├── LICENSE
├── PPO
│   ├── PPO_Continuous.py
│   └── PPO_Discrete.py
├── README.md
└── assets
    ├── .DS_Store
    ├── cartpolev1.svg
    ├── discrete_reward_plot.png
    └── logo.png
/.gitignore:
--------------------------------------------------------------------------------
1 | **/__pycache__
2 | **/.DS_Store
3 | **/.vscode/
4 | **/wandb
--------------------------------------------------------------------------------
/A2C/A2C_Continuous.py:
--------------------------------------------------------------------------------
1 | import wandb
2 | import tensorflow as tf
3 | from tensorflow.keras.layers import Input, Dense, Lambda
4 |
5 | import gym
6 | import argparse
7 | import numpy as np
8 |
9 | tf.keras.backend.set_floatx('float64')
10 |
11 | wandb.init(name='A2C', project="deep-rl-tf2")
12 |
13 | parser = argparse.ArgumentParser()
14 | parser.add_argument('--gamma', type=float, default=0.99)
15 | parser.add_argument('--update_interval', type=int, default=5)
16 | parser.add_argument('--actor_lr', type=float, default=0.0005)
17 | parser.add_argument('--critic_lr', type=float, default=0.001)
18 |
19 | args = parser.parse_args()
20 |
21 |
22 | class Actor:
23 | def __init__(self, state_dim, action_dim, action_bound, std_bound):
24 | self.state_dim = state_dim
25 | self.action_dim = action_dim
26 | self.action_bound = action_bound
27 | self.std_bound = std_bound
28 | self.model = self.create_model()
29 | self.opt = tf.keras.optimizers.Adam(args.actor_lr)
30 |
31 | def create_model(self):
32 | state_input = Input((self.state_dim,))
33 | dense_1 = Dense(32, activation='relu')(state_input)
34 | dense_2 = Dense(32, activation='relu')(dense_1)
35 | out_mu = Dense(self.action_dim, activation='tanh')(dense_2)
36 | mu_output = Lambda(lambda x: x * self.action_bound)(out_mu)
37 | std_output = Dense(self.action_dim, activation='softplus')(dense_2)
38 | return tf.keras.models.Model(state_input, [mu_output, std_output])
39 |
40 | def get_action(self, state):
41 | state = np.reshape(state, [1, self.state_dim])
42 | mu, std = self.model.predict(state)
43 | mu, std = mu[0], std[0]
44 | return np.random.normal(mu, std, size=self.action_dim)
45 |
46 | def log_pdf(self, mu, std, action):
47 | std = tf.clip_by_value(std, self.std_bound[0], self.std_bound[1])
48 | var = std ** 2
49 | log_policy_pdf = -0.5 * (action - mu) ** 2 / \
50 | var - 0.5 * tf.math.log(var * 2 * np.pi)
51 | return tf.reduce_sum(log_policy_pdf, 1, keepdims=True)
52 |
53 | def compute_loss(self, mu, std, actions, advantages):
54 | log_policy_pdf = self.log_pdf(mu, std, actions)
55 | loss_policy = log_policy_pdf * advantages
56 | return tf.reduce_sum(-loss_policy)
57 |
58 | def train(self, states, actions, advantages):
59 | with tf.GradientTape() as tape:
60 | mu, std = self.model(states, training=True)
61 | loss = self.compute_loss(mu, std, actions, advantages)
62 | grads = tape.gradient(loss, self.model.trainable_variables)
63 | self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
64 | return loss
65 |
66 |
67 | class Critic:
68 | def __init__(self, state_dim):
69 | self.state_dim = state_dim
70 | self.model = self.create_model()
71 | self.opt = tf.keras.optimizers.Adam(args.critic_lr)
72 |
73 | def create_model(self):
74 | return tf.keras.Sequential([
75 | Input((self.state_dim,)),
76 | Dense(32, activation='relu'),
77 | Dense(32, activation='relu'),
78 | Dense(16, activation='relu'),
79 | Dense(1, activation='linear')
80 | ])
81 |
82 | def compute_loss(self, v_pred, td_targets):
83 | mse = tf.keras.losses.MeanSquaredError()
84 | return mse(td_targets, v_pred)
85 |
86 | def train(self, states, td_targets):
87 | with tf.GradientTape() as tape:
88 | v_pred = self.model(states, training=True)
89 | assert v_pred.shape == td_targets.shape
90 | loss = self.compute_loss(v_pred, tf.stop_gradient(td_targets))
91 | grads = tape.gradient(loss, self.model.trainable_variables)
92 | self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
93 | return loss
94 |
95 |
96 | class Agent:
97 | def __init__(self, env):
98 | self.env = env
99 | self.state_dim = self.env.observation_space.shape[0]
100 | self.action_dim = self.env.action_space.shape[0]
101 | self.action_bound = self.env.action_space.high[0]
102 | self.std_bound = [1e-2, 1.0]
103 |
104 | self.actor = Actor(self.state_dim, self.action_dim,
105 | self.action_bound, self.std_bound)
106 | self.critic = Critic(self.state_dim)
107 |
108 | def td_target(self, reward, next_state, done):
109 | if done:
110 | return reward
111 | v_value = self.critic.model.predict(
112 | np.reshape(next_state, [1, self.state_dim]))
113 | return np.reshape(reward + args.gamma * v_value[0], [1, 1])
114 |
115 |     def advantage(self, td_targets, baselines):
116 | return td_targets - baselines
117 |
118 | def list_to_batch(self, list):
119 | batch = list[0]
120 | for elem in list[1:]:
121 | batch = np.append(batch, elem, axis=0)
122 | return batch
123 |
124 | def train(self, max_episodes=1000):
125 | for ep in range(max_episodes):
126 | state_batch = []
127 | action_batch = []
128 | td_target_batch = []
129 |             advantage_batch = []
130 | episode_reward, done = 0, False
131 |
132 | state = self.env.reset()
133 |
134 | while not done:
135 | # self.env.render()
136 | action = self.actor.get_action(state)
137 | action = np.clip(action, -self.action_bound, self.action_bound)
138 |
139 | next_state, reward, done, _ = self.env.step(action)
140 |
141 | state = np.reshape(state, [1, self.state_dim])
142 | action = np.reshape(action, [1, self.action_dim])
143 | next_state = np.reshape(next_state, [1, self.state_dim])
144 | reward = np.reshape(reward, [1, 1])
145 |
146 | td_target = self.td_target((reward+8)/8, next_state, done)
147 |                 advantage = self.advantage(
148 | td_target, self.critic.model.predict(state))
149 |
150 | state_batch.append(state)
151 | action_batch.append(action)
152 | td_target_batch.append(td_target)
153 |                 advantage_batch.append(advantage)
154 |
155 | if len(state_batch) >= args.update_interval or done:
156 | states = self.list_to_batch(state_batch)
157 | actions = self.list_to_batch(action_batch)
158 | td_targets = self.list_to_batch(td_target_batch)
159 |                     advantages = self.list_to_batch(advantage_batch)
160 |
161 | actor_loss = self.actor.train(states, actions, advantages)
162 | critic_loss = self.critic.train(states, td_targets)
163 |
164 | state_batch = []
165 | action_batch = []
166 | td_target_batch = []
167 |                     advantage_batch = []
168 |
169 | episode_reward += reward[0][0]
170 | state = next_state[0]
171 |
172 | print('EP{} EpisodeReward={}'.format(ep, episode_reward))
173 | wandb.log({'Reward': episode_reward})
174 |
175 |
176 | def main():
177 | env_name = 'Pendulum-v0'
178 | env = gym.make(env_name)
179 | agent = Agent(env)
180 | agent.train()
181 |
182 |
183 | if __name__ == "__main__":
184 | main()
185 |
--------------------------------------------------------------------------------
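
A note on the continuous A2C actor above: it parameterizes a diagonal Gaussian policy whose mean is a tanh head scaled by action_bound and whose std is a softplus head. Actor.log_pdf sums, over action dimensions, log N(a; mu, sigma^2) = -(a - mu)^2 / (2 * sigma^2) - 0.5 * log(2 * pi * sigma^2) after clipping sigma to std_bound, and the actor loss is -sum_t advantage_t * log pi(a_t | s_t). A standalone sanity check of that closed form (illustrative values, not part of the repo; scipy is an extra dependency used only for comparison):

    # Standalone check of the Gaussian log-density computed by Actor.log_pdf (illustrative values).
    import numpy as np
    from scipy.stats import norm

    mu, std, action = 0.3, 0.5, np.array([0.1])
    manual = -0.5 * (action - mu) ** 2 / std ** 2 - 0.5 * np.log(std ** 2 * 2 * np.pi)
    assert np.isclose(manual.sum(), norm(mu, std).logpdf(action).sum())
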
/A2C/A2C_Discrete.py:
--------------------------------------------------------------------------------
1 | import wandb
2 | import tensorflow as tf
3 | from tensorflow.keras.layers import Input, Dense
4 |
5 | import gym
6 | import argparse
7 | import numpy as np
8 |
9 | tf.keras.backend.set_floatx('float64')
10 |
11 | wandb.init(name='A2C', project="deep-rl-tf2")
12 |
13 | parser = argparse.ArgumentParser()
14 | parser.add_argument('--gamma', type=float, default=0.99)
15 | parser.add_argument('--update_interval', type=int, default=5)
16 | parser.add_argument('--actor_lr', type=float, default=0.0005)
17 | parser.add_argument('--critic_lr', type=float, default=0.001)
18 |
19 | args = parser.parse_args()
20 |
21 |
22 | class Actor:
23 | def __init__(self, state_dim, action_dim):
24 | self.state_dim = state_dim
25 | self.action_dim = action_dim
26 | self.model = self.create_model()
27 | self.opt = tf.keras.optimizers.Adam(args.actor_lr)
28 |
29 | def create_model(self):
30 | return tf.keras.Sequential([
31 | Input((self.state_dim,)),
32 | Dense(32, activation='relu'),
33 | Dense(16, activation='relu'),
34 | Dense(self.action_dim, activation='softmax')
35 | ])
36 |
37 | def compute_loss(self, actions, logits, advantages):
38 | ce_loss = tf.keras.losses.SparseCategoricalCrossentropy(
39 |             from_logits=False)  # the actor's final layer is softmax, so these are probabilities
40 | actions = tf.cast(actions, tf.int32)
41 | policy_loss = ce_loss(
42 | actions, logits, sample_weight=tf.stop_gradient(advantages))
43 | return policy_loss
44 |
45 | def train(self, states, actions, advantages):
46 | with tf.GradientTape() as tape:
47 | logits = self.model(states, training=True)
48 | loss = self.compute_loss(
49 | actions, logits, advantages)
50 | grads = tape.gradient(loss, self.model.trainable_variables)
51 | self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
52 | return loss
53 |
54 |
55 | class Critic:
56 | def __init__(self, state_dim):
57 | self.state_dim = state_dim
58 | self.model = self.create_model()
59 | self.opt = tf.keras.optimizers.Adam(args.critic_lr)
60 |
61 | def create_model(self):
62 | return tf.keras.Sequential([
63 | Input((self.state_dim,)),
64 | Dense(32, activation='relu'),
65 | Dense(16, activation='relu'),
66 | Dense(16, activation='relu'),
67 | Dense(1, activation='linear')
68 | ])
69 |
70 | def compute_loss(self, v_pred, td_targets):
71 | mse = tf.keras.losses.MeanSquaredError()
72 | return mse(td_targets, v_pred)
73 |
74 | def train(self, states, td_targets):
75 | with tf.GradientTape() as tape:
76 | v_pred = self.model(states, training=True)
77 | assert v_pred.shape == td_targets.shape
78 | loss = self.compute_loss(v_pred, tf.stop_gradient(td_targets))
79 | grads = tape.gradient(loss, self.model.trainable_variables)
80 | self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
81 | return loss
82 |
83 |
84 | class Agent:
85 | def __init__(self, env):
86 | self.env = env
87 | self.state_dim = self.env.observation_space.shape[0]
88 | self.action_dim = self.env.action_space.n
89 | self.actor = Actor(self.state_dim, self.action_dim)
90 | self.critic = Critic(self.state_dim)
91 |
92 | def td_target(self, reward, next_state, done):
93 | if done:
94 | return reward
95 | v_value = self.critic.model.predict(
96 | np.reshape(next_state, [1, self.state_dim]))
97 | return np.reshape(reward + args.gamma * v_value[0], [1, 1])
98 |
99 |     def advantage(self, td_targets, baselines):
100 | return td_targets - baselines
101 |
102 | def list_to_batch(self, list):
103 | batch = list[0]
104 | for elem in list[1:]:
105 | batch = np.append(batch, elem, axis=0)
106 | return batch
107 |
108 | def train(self, max_episodes=1000):
109 | for ep in range(max_episodes):
110 | state_batch = []
111 | action_batch = []
112 | td_target_batch = []
113 |             advantage_batch = []
114 | episode_reward, done = 0, False
115 |
116 | state = self.env.reset()
117 |
118 | while not done:
119 | # self.env.render()
120 | probs = self.actor.model.predict(
121 | np.reshape(state, [1, self.state_dim]))
122 | action = np.random.choice(self.action_dim, p=probs[0])
123 |
124 | next_state, reward, done, _ = self.env.step(action)
125 |
126 | state = np.reshape(state, [1, self.state_dim])
127 | action = np.reshape(action, [1, 1])
128 | next_state = np.reshape(next_state, [1, self.state_dim])
129 | reward = np.reshape(reward, [1, 1])
130 |
131 | td_target = self.td_target(reward * 0.01, next_state, done)
132 |                 advantage = self.advantage(
133 | td_target, self.critic.model.predict(state))
134 |
135 | state_batch.append(state)
136 | action_batch.append(action)
137 | td_target_batch.append(td_target)
138 |                 advantage_batch.append(advantage)
139 |
140 | if len(state_batch) >= args.update_interval or done:
141 | states = self.list_to_batch(state_batch)
142 | actions = self.list_to_batch(action_batch)
143 | td_targets = self.list_to_batch(td_target_batch)
144 |                     advantages = self.list_to_batch(advantage_batch)
145 |
146 | actor_loss = self.actor.train(states, actions, advantages)
147 | critic_loss = self.critic.train(states, td_targets)
148 |
149 | state_batch = []
150 | action_batch = []
151 | td_target_batch = []
152 |                     advantage_batch = []
153 |
154 | episode_reward += reward[0][0]
155 | state = next_state[0]
156 |
157 | print('EP{} EpisodeReward={}'.format(ep, episode_reward))
158 | wandb.log({'Reward': episode_reward})
159 |
160 |
161 | def main():
162 | env_name = 'CartPole-v1'
163 | env = gym.make(env_name)
164 | agent = Agent(env)
165 | agent.train()
166 |
167 |
168 | if __name__ == "__main__":
169 | main()
170 |
--------------------------------------------------------------------------------
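
A note on the discrete A2C actor loss above: SparseCategoricalCrossentropy with sample_weight=advantages works out to the batch mean of -advantage_t * log pi(a_t | s_t), i.e. the advantage-weighted policy gradient, while the critic regresses V(s) toward the one-step target r + gamma * V(s'). A standalone sketch that spells the actor loss out by hand (illustrative values, not part of the repo):

    # Hand-rolled equivalent of Actor.compute_loss (illustrative values).
    import tensorflow as tf

    probs = tf.constant([[0.7, 0.3], [0.2, 0.8]], dtype=tf.float64)   # actor softmax outputs
    actions = tf.constant([0, 1])                                     # actions actually taken
    advantages = tf.constant([1.5, -0.5], dtype=tf.float64)           # td_target - V(s)

    picked = tf.reduce_sum(probs * tf.one_hot(actions, 2, dtype=tf.float64), axis=1)
    loss = tf.reduce_mean(-tf.math.log(picked) * advantages)  # matches the Keras loss up to its batch-mean reduction
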
/A3C/A3C_Continuous.py:
--------------------------------------------------------------------------------
1 | import wandb
2 | import tensorflow as tf
3 | from tensorflow.keras.layers import Input, Dense, Lambda
4 |
5 | import gym
6 | import argparse
7 | import numpy as np
8 | from threading import Thread
9 | from multiprocessing import cpu_count
10 | tf.keras.backend.set_floatx('float64')
11 | wandb.init(name='A3C', project="deep-rl-tf2")
12 |
13 | parser = argparse.ArgumentParser()
14 | parser.add_argument('--gamma', type=float, default=0.99)
15 | parser.add_argument('--update_interval', type=int, default=5)
16 | parser.add_argument('--actor_lr', type=float, default=0.0005)
17 | parser.add_argument('--critic_lr', type=float, default=0.001)
18 |
19 | args = parser.parse_args()
20 |
21 | CUR_EPISODE = 0
22 |
23 | class Actor:
24 | def __init__(self, state_dim, action_dim, action_bound, std_bound):
25 | self.state_dim = state_dim
26 | self.action_dim = action_dim
27 | self.action_bound = action_bound
28 | self.std_bound = std_bound
29 | self.model = self.create_model()
30 | self.opt = tf.keras.optimizers.Adam(args.actor_lr)
31 | self.entropy_beta = 0.01
32 |
33 | def create_model(self):
34 | state_input = Input((self.state_dim,))
35 | dense_1 = Dense(32, activation='relu')(state_input)
36 | dense_2 = Dense(32, activation='relu')(dense_1)
37 | out_mu = Dense(self.action_dim, activation='tanh')(dense_2)
38 | mu_output = Lambda(lambda x: x * self.action_bound)(out_mu)
39 | std_output = Dense(self.action_dim, activation='softplus')(dense_2)
40 | return tf.keras.models.Model(state_input, [mu_output, std_output])
41 |
42 | def get_action(self, state):
43 | state = np.reshape(state, [1, self.state_dim])
44 | mu, std = self.model.predict(state)
45 | mu, std = mu[0], std[0]
46 | return np.random.normal(mu, std, size=self.action_dim)
47 |
48 | def log_pdf(self, mu, std, action):
49 | std = tf.clip_by_value(std, self.std_bound[0], self.std_bound[1])
50 | var = std ** 2
51 | log_policy_pdf = -0.5 * (action - mu) ** 2 / \
52 | var - 0.5 * tf.math.log(var * 2 * np.pi)
53 | return tf.reduce_sum(log_policy_pdf, 1, keepdims=True)
54 |
55 | def compute_loss(self, mu, std, actions, advantages):
56 | log_policy_pdf = self.log_pdf(mu, std, actions)
57 | loss_policy = log_policy_pdf * advantages
58 | return tf.reduce_sum(-loss_policy)
59 |
60 | def train(self, states, actions, advantages):
61 | with tf.GradientTape() as tape:
62 | mu, std = self.model(states, training=True)
63 | loss = self.compute_loss(mu, std, actions, advantages)
64 | grads = tape.gradient(loss, self.model.trainable_variables)
65 | self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
66 | return loss
67 |
68 |
69 | class Critic:
70 | def __init__(self, state_dim):
71 | self.state_dim = state_dim
72 | self.model = self.create_model()
73 | self.opt = tf.keras.optimizers.Adam(args.critic_lr)
74 |
75 | def create_model(self):
76 | return tf.keras.Sequential([
77 | Input((self.state_dim,)),
78 | Dense(32, activation='relu'),
79 | Dense(32, activation='relu'),
80 | Dense(16, activation='relu'),
81 | Dense(1, activation='linear')
82 | ])
83 |
84 | def compute_loss(self, v_pred, td_targets):
85 | mse = tf.keras.losses.MeanSquaredError()
86 | return mse(td_targets, v_pred)
87 |
88 | def train(self, states, td_targets):
89 | with tf.GradientTape() as tape:
90 | v_pred = self.model(states, training=True)
91 | assert v_pred.shape == td_targets.shape
92 | loss = self.compute_loss(v_pred, tf.stop_gradient(td_targets))
93 | grads = tape.gradient(loss, self.model.trainable_variables)
94 | self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
95 | return loss
96 |
97 |
98 | class Agent:
99 | def __init__(self, env_name):
100 | env = gym.make(env_name)
101 | self.env_name = env_name
102 | self.state_dim = env.observation_space.shape[0]
103 | self.action_dim = env.action_space.shape[0]
104 | self.action_bound = env.action_space.high[0]
105 | self.std_bound = [1e-2, 1.0]
106 |
107 | self.global_actor = Actor(
108 | self.state_dim, self.action_dim, self.action_bound, self.std_bound)
109 | self.global_critic = Critic(self.state_dim)
110 | self.num_workers = cpu_count()
111 |
112 | def train(self, max_episodes=1000):
113 | workers = []
114 |
115 | for i in range(self.num_workers):
116 | env = gym.make(self.env_name)
117 | workers.append(WorkerAgent(
118 | env, self.global_actor, self.global_critic, max_episodes))
119 |
120 | for worker in workers:
121 | worker.start()
122 |
123 | for worker in workers:
124 | worker.join()
125 |
126 |
127 | class WorkerAgent(Thread):
128 | def __init__(self, env, global_actor, global_critic, max_episodes):
129 | Thread.__init__(self)
130 | self.env = env
131 | self.state_dim = self.env.observation_space.shape[0]
132 | self.action_dim = self.env.action_space.shape[0]
133 | self.action_bound = self.env.action_space.high[0]
134 | self.std_bound = [1e-2, 1.0]
135 |
136 | self.max_episodes = max_episodes
137 | self.global_actor = global_actor
138 | self.global_critic = global_critic
139 | self.actor = Actor(self.state_dim, self.action_dim,
140 | self.action_bound, self.std_bound)
141 | self.critic = Critic(self.state_dim)
142 |
143 | self.actor.model.set_weights(self.global_actor.model.get_weights())
144 | self.critic.model.set_weights(self.global_critic.model.get_weights())
145 |
146 | def n_step_td_target(self, rewards, next_v_value, done):
147 | td_targets = np.zeros_like(rewards)
148 | cumulative = 0
149 | if not done:
150 | cumulative = next_v_value
151 |
152 | for k in reversed(range(0, len(rewards))):
153 | cumulative = args.gamma * cumulative + rewards[k]
154 | td_targets[k] = cumulative
155 | return td_targets
156 |
157 |     def advantage(self, td_targets, baselines):
158 | return td_targets - baselines
159 |
160 | def list_to_batch(self, list):
161 | batch = list[0]
162 | for elem in list[1:]:
163 | batch = np.append(batch, elem, axis=0)
164 | return batch
165 |
166 | def train(self):
167 | global CUR_EPISODE
168 |
169 | while self.max_episodes >= CUR_EPISODE:
170 | state_batch = []
171 | action_batch = []
172 | reward_batch = []
173 | episode_reward, done = 0, False
174 |
175 | state = self.env.reset()
176 |
177 | while not done:
178 | # self.env.render()
179 | action = self.actor.get_action(state)
180 | action = np.clip(action, -self.action_bound, self.action_bound)
181 |
182 | next_state, reward, done, _ = self.env.step(action)
183 |
184 | state = np.reshape(state, [1, self.state_dim])
185 |                 action = np.reshape(action, [1, self.action_dim])
186 | next_state = np.reshape(next_state, [1, self.state_dim])
187 | reward = np.reshape(reward, [1, 1])
188 |
189 | state_batch.append(state)
190 | action_batch.append(action)
191 | reward_batch.append(reward)
192 |
193 | if len(state_batch) >= args.update_interval or done:
194 | states = self.list_to_batch(state_batch)
195 | actions = self.list_to_batch(action_batch)
196 | rewards = self.list_to_batch(reward_batch)
197 |
198 | next_v_value = self.critic.model.predict(next_state)
199 | td_targets = self.n_step_td_target(
200 | (rewards+8)/8, next_v_value, done)
201 | advantages = td_targets - self.critic.model.predict(states)
202 |
203 | actor_loss = self.global_actor.train(
204 | states, actions, advantages)
205 | critic_loss = self.global_critic.train(
206 | states, td_targets)
207 |
208 | self.actor.model.set_weights(
209 | self.global_actor.model.get_weights())
210 | self.critic.model.set_weights(
211 | self.global_critic.model.get_weights())
212 |
213 | state_batch = []
214 | action_batch = []
215 | reward_batch = []
216 | td_target_batch = []
217 |                     advantage_batch = []
218 |
219 | episode_reward += reward[0][0]
220 | state = next_state[0]
221 |
222 | print('EP{} EpisodeReward={}'.format(CUR_EPISODE, episode_reward))
223 | wandb.log({'Reward': episode_reward})
224 | CUR_EPISODE += 1
225 |
226 | def run(self):
227 | self.train()
228 |
229 |
230 | def main():
231 | env_name = 'Pendulum-v0'
232 | agent = Agent(env_name)
233 | agent.train()
234 |
235 |
236 | if __name__ == "__main__":
237 | main()
238 |
--------------------------------------------------------------------------------
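
A note on WorkerAgent.n_step_td_target above: the backward recursion turns a short rollout into n-step returns bootstrapped with the critic's value of the final next_state (or 0 when the episode ended). Also worth noting that, unlike A3C_Discrete.py below, the workers here update the shared global networks without a threading.Lock. A standalone check of the recursion (illustrative values, not part of the repo):

    # Standalone check of the backward n-step return recursion (illustrative values).
    import numpy as np

    gamma = 0.99
    rewards = np.array([1.0, 1.0, 1.0])
    next_v_value = 5.0            # critic's V(s_{t+n}); ignored when the episode is done
    done = False

    cumulative = 0.0 if done else next_v_value
    td_targets = np.zeros_like(rewards)
    for k in reversed(range(len(rewards))):
        cumulative = gamma * cumulative + rewards[k]
        td_targets[k] = cumulative
    # td_targets[0] == r_0 + gamma*r_1 + gamma^2*r_2 + gamma^3*V(s_3)
    assert np.isclose(td_targets[0], 1 + 0.99 * 1 + 0.99 ** 2 * 1 + 0.99 ** 3 * 5.0)
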
/A3C/A3C_Discrete.py:
--------------------------------------------------------------------------------
1 | import wandb
2 | import tensorflow as tf
3 | from tensorflow.keras.layers import Input, Dense
4 |
5 | import gym
6 | import argparse
7 | import numpy as np
8 | from threading import Thread, Lock
9 | from multiprocessing import cpu_count
10 | tf.keras.backend.set_floatx('float64')
11 | wandb.init(name='A3C', project="deep-rl-tf2")
12 |
13 | parser = argparse.ArgumentParser()
14 | parser.add_argument('--gamma', type=float, default=0.99)
15 | parser.add_argument('--update_interval', type=int, default=5)
16 | parser.add_argument('--actor_lr', type=float, default=0.0005)
17 | parser.add_argument('--critic_lr', type=float, default=0.001)
18 |
19 | args = parser.parse_args()
20 |
21 | CUR_EPISODE = 0
22 |
23 |
24 | class Actor:
25 | def __init__(self, state_dim, action_dim):
26 | self.state_dim = state_dim
27 | self.action_dim = action_dim
28 | self.model = self.create_model()
29 | self.opt = tf.keras.optimizers.Adam(args.actor_lr)
30 | self.entropy_beta = 0.01
31 |
32 | def create_model(self):
33 | return tf.keras.Sequential([
34 | Input((self.state_dim,)),
35 | Dense(32, activation='relu'),
36 | Dense(16, activation='relu'),
37 | Dense(self.action_dim, activation='softmax')
38 | ])
39 |
40 | def compute_loss(self, actions, logits, advantages):
41 | ce_loss = tf.keras.losses.SparseCategoricalCrossentropy(
42 |             from_logits=False)  # the actor's final layer is softmax, so it emits probabilities
43 |         entropy_loss = tf.keras.losses.CategoricalCrossentropy(
44 |             from_logits=False)
45 | actions = tf.cast(actions, tf.int32)
46 | policy_loss = ce_loss(
47 | actions, logits, sample_weight=tf.stop_gradient(advantages))
48 | entropy = entropy_loss(logits, logits)
49 | return policy_loss - self.entropy_beta * entropy
50 |
51 | def train(self, states, actions, advantages):
52 | with tf.GradientTape() as tape:
53 | logits = self.model(states, training=True)
54 | loss = self.compute_loss(
55 | actions, logits, advantages)
56 | grads = tape.gradient(loss, self.model.trainable_variables)
57 | self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
58 | return loss
59 |
60 |
61 | class Critic:
62 | def __init__(self, state_dim):
63 | self.state_dim = state_dim
64 | self.model = self.create_model()
65 | self.opt = tf.keras.optimizers.Adam(args.critic_lr)
66 |
67 | def create_model(self):
68 | return tf.keras.Sequential([
69 | Input((self.state_dim,)),
70 | Dense(32, activation='relu'),
71 | Dense(16, activation='relu'),
72 | Dense(16, activation='relu'),
73 | Dense(1, activation='linear')
74 | ])
75 |
76 | def compute_loss(self, v_pred, td_targets):
77 | mse = tf.keras.losses.MeanSquaredError()
78 | return mse(td_targets, v_pred)
79 |
80 | def train(self, states, td_targets):
81 | with tf.GradientTape() as tape:
82 | v_pred = self.model(states, training=True)
83 | assert v_pred.shape == td_targets.shape
84 | loss = self.compute_loss(v_pred, tf.stop_gradient(td_targets))
85 | grads = tape.gradient(loss, self.model.trainable_variables)
86 | self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
87 | return loss
88 |
89 |
90 | class Agent:
91 | def __init__(self, env_name):
92 | env = gym.make(env_name)
93 | self.env_name = env_name
94 | self.state_dim = env.observation_space.shape[0]
95 | self.action_dim = env.action_space.n
96 |
97 | self.global_actor = Actor(self.state_dim, self.action_dim)
98 | self.global_critic = Critic(self.state_dim)
99 | self.num_workers = cpu_count()
100 |
101 | def train(self, max_episodes=1000):
102 | workers = []
103 |
104 | for i in range(self.num_workers):
105 | env = gym.make(self.env_name)
106 | workers.append(WorkerAgent(
107 | env, self.global_actor, self.global_critic, max_episodes))
108 |
109 | for worker in workers:
110 | worker.start()
111 |
112 | for worker in workers:
113 | worker.join()
114 |
115 |
116 | class WorkerAgent(Thread):
117 | def __init__(self, env, global_actor, global_critic, max_episodes):
118 | Thread.__init__(self)
119 | self.lock = Lock()
120 | self.env = env
121 | self.state_dim = self.env.observation_space.shape[0]
122 | self.action_dim = self.env.action_space.n
123 |
124 | self.max_episodes = max_episodes
125 | self.global_actor = global_actor
126 | self.global_critic = global_critic
127 | self.actor = Actor(self.state_dim, self.action_dim)
128 | self.critic = Critic(self.state_dim)
129 |
130 | self.actor.model.set_weights(self.global_actor.model.get_weights())
131 | self.critic.model.set_weights(self.global_critic.model.get_weights())
132 |
133 | def n_step_td_target(self, rewards, next_v_value, done):
134 | td_targets = np.zeros_like(rewards)
135 | cumulative = 0
136 | if not done:
137 | cumulative = next_v_value
138 |
139 | for k in reversed(range(0, len(rewards))):
140 | cumulative = args.gamma * cumulative + rewards[k]
141 | td_targets[k] = cumulative
142 | return td_targets
143 |
144 |     def advantage(self, td_targets, baselines):
145 | return td_targets - baselines
146 |
147 | def list_to_batch(self, list):
148 | batch = list[0]
149 | for elem in list[1:]:
150 | batch = np.append(batch, elem, axis=0)
151 | return batch
152 |
153 | def train(self):
154 | global CUR_EPISODE
155 |
156 | while self.max_episodes >= CUR_EPISODE:
157 | state_batch = []
158 | action_batch = []
159 | reward_batch = []
160 | episode_reward, done = 0, False
161 |
162 | state = self.env.reset()
163 |
164 | while not done:
165 | # self.env.render()
166 | probs = self.actor.model.predict(
167 | np.reshape(state, [1, self.state_dim]))
168 | action = np.random.choice(self.action_dim, p=probs[0])
169 |
170 | next_state, reward, done, _ = self.env.step(action)
171 |
172 | state = np.reshape(state, [1, self.state_dim])
173 | action = np.reshape(action, [1, 1])
174 | next_state = np.reshape(next_state, [1, self.state_dim])
175 | reward = np.reshape(reward, [1, 1])
176 |
177 | state_batch.append(state)
178 | action_batch.append(action)
179 | reward_batch.append(reward)
180 |
181 | if len(state_batch) >= args.update_interval or done:
182 | states = self.list_to_batch(state_batch)
183 | actions = self.list_to_batch(action_batch)
184 | rewards = self.list_to_batch(reward_batch)
185 |
186 | next_v_value = self.critic.model.predict(next_state)
187 | td_targets = self.n_step_td_target(
188 | rewards, next_v_value, done)
189 | advantages = td_targets - self.critic.model.predict(states)
190 |
191 | with self.lock:
192 | actor_loss = self.global_actor.train(
193 | states, actions, advantages)
194 | critic_loss = self.global_critic.train(
195 | states, td_targets)
196 |
197 | self.actor.model.set_weights(
198 | self.global_actor.model.get_weights())
199 | self.critic.model.set_weights(
200 | self.global_critic.model.get_weights())
201 |
202 | state_batch = []
203 | action_batch = []
204 | reward_batch = []
205 | td_target_batch = []
206 |                         advantage_batch = []
207 |
208 | episode_reward += reward[0][0]
209 | state = next_state[0]
210 |
211 | print('EP{} EpisodeReward={}'.format(CUR_EPISODE, episode_reward))
212 | wandb.log({'Reward': episode_reward})
213 | CUR_EPISODE += 1
214 |
215 | def run(self):
216 | self.train()
217 |
218 |
219 | def main():
220 | env_name = 'CartPole-v1'
221 | agent = Agent(env_name)
222 | agent.train()
223 |
224 |
225 | if __name__ == "__main__":
226 | main()
227 |
--------------------------------------------------------------------------------
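
A note on the entropy bonus in Actor.compute_loss above: feeding the policy distribution to categorical cross-entropy as both target and prediction yields its entropy, -sum(p * log p), which is scaled by entropy_beta and subtracted so the policy is discouraged from collapsing too early. A standalone check (illustrative values, not part of the repo):

    # Standalone check: cross-entropy of a distribution with itself is its entropy (illustrative values).
    import numpy as np
    import tensorflow as tf

    probs = tf.constant([[0.7, 0.2, 0.1]], dtype=tf.float64)   # a softmax output of the actor
    cce = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
    entropy = cce(probs, probs).numpy()                        # == -sum(p * log p)
    assert np.isclose(entropy, -np.sum(probs.numpy() * np.log(probs.numpy())))
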
/DDPG/DDPG_Continuous.py:
--------------------------------------------------------------------------------
1 | import wandb
2 | import tensorflow as tf
3 | from tensorflow.keras.layers import Input, Dense, Lambda, concatenate
4 |
5 | import gym
6 | import argparse
7 | import numpy as np
8 | import random
9 | from collections import deque
10 |
11 | tf.keras.backend.set_floatx('float64')
12 | wandb.init(name='DDPG', project="deep-rl-tf2")
13 |
14 | parser = argparse.ArgumentParser()
15 | parser.add_argument('--gamma', type=float, default=0.99)
16 | parser.add_argument('--actor_lr', type=float, default=0.0005)
17 | parser.add_argument('--critic_lr', type=float, default=0.001)
18 | parser.add_argument('--batch_size', type=int, default=64)
19 | parser.add_argument('--tau', type=float, default=0.05)
20 | parser.add_argument('--train_start', type=int, default=2000)
21 |
22 | args = parser.parse_args()
23 |
24 | class ReplayBuffer:
25 | def __init__(self, capacity=20000):
26 | self.buffer = deque(maxlen=capacity)
27 |
28 | def put(self, state, action, reward, next_state, done):
29 | self.buffer.append([state, action, reward, next_state, done])
30 |
31 | def sample(self):
32 | sample = random.sample(self.buffer, args.batch_size)
33 | states, actions, rewards, next_states, done = map(np.asarray, zip(*sample))
34 | states = np.array(states).reshape(args.batch_size, -1)
35 | next_states = np.array(next_states).reshape(args.batch_size, -1)
36 | return states, actions, rewards, next_states, done
37 |
38 | def size(self):
39 | return len(self.buffer)
40 |
41 | class Actor:
42 | def __init__(self, state_dim, action_dim, action_bound):
43 | self.state_dim = state_dim
44 | self.action_dim = action_dim
45 | self.action_bound = action_bound
46 | self.model = self.create_model()
47 | self.opt = tf.keras.optimizers.Adam(args.actor_lr)
48 |
49 | def create_model(self):
50 | return tf.keras.Sequential([
51 | Input((self.state_dim,)),
52 | Dense(32, activation='relu'),
53 | Dense(32, activation='relu'),
54 | Dense(self.action_dim, activation='tanh'),
55 | Lambda(lambda x: x * self.action_bound)
56 | ])
57 |
58 | def train(self, states, q_grads):
59 | with tf.GradientTape() as tape:
60 | grads = tape.gradient(self.model(states), self.model.trainable_variables, -q_grads)
61 | self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
62 |
63 | def predict(self, state):
64 | return self.model.predict(state)
65 |
66 | def get_action(self, state):
67 | state = np.reshape(state, [1, self.state_dim])
68 | return self.model.predict(state)[0]
69 |
70 |
71 |
72 | class Critic:
73 | def __init__(self, state_dim, action_dim):
74 | self.state_dim = state_dim
75 | self.action_dim = action_dim
76 | self.model = self.create_model()
77 | self.opt = tf.keras.optimizers.Adam(args.critic_lr)
78 |
79 | def create_model(self):
80 | state_input = Input((self.state_dim,))
81 | s1 = Dense(64, activation='relu')(state_input)
82 | s2 = Dense(32, activation='relu')(s1)
83 | action_input = Input((self.action_dim,))
84 | a1 = Dense(32, activation='relu')(action_input)
85 | c1 = concatenate([s2, a1], axis=-1)
86 | c2 = Dense(16, activation='relu')(c1)
87 | output = Dense(1, activation='linear')(c2)
88 | return tf.keras.Model([state_input, action_input], output)
89 |
90 | def predict(self, inputs):
91 | return self.model.predict(inputs)
92 |
93 | def q_grads(self, states, actions):
94 | actions = tf.convert_to_tensor(actions)
95 | with tf.GradientTape() as tape:
96 | tape.watch(actions)
97 | q_values = self.model([states, actions])
98 | q_values = tf.squeeze(q_values)
99 | return tape.gradient(q_values, actions)
100 |
101 | def compute_loss(self, v_pred, td_targets):
102 | mse = tf.keras.losses.MeanSquaredError()
103 | return mse(td_targets, v_pred)
104 |
105 | def train(self, states, actions, td_targets):
106 | with tf.GradientTape() as tape:
107 | v_pred = self.model([states, actions], training=True)
108 | assert v_pred.shape == td_targets.shape
109 | loss = self.compute_loss(v_pred, tf.stop_gradient(td_targets))
110 | grads = tape.gradient(loss, self.model.trainable_variables)
111 | self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
112 | return loss
113 |
114 |
115 | class Agent:
116 | def __init__(self, env):
117 | self.env = env
118 | self.state_dim = self.env.observation_space.shape[0]
119 | self.action_dim = self.env.action_space.shape[0]
120 | self.action_bound = self.env.action_space.high[0]
121 |
122 | self.buffer = ReplayBuffer()
123 |
124 | self.actor = Actor(self.state_dim, self.action_dim, self.action_bound)
125 | self.critic = Critic(self.state_dim, self.action_dim)
126 |
127 | self.target_actor = Actor(self.state_dim, self.action_dim, self.action_bound)
128 | self.target_critic = Critic(self.state_dim, self.action_dim)
129 |
130 | actor_weights = self.actor.model.get_weights()
131 | critic_weights = self.critic.model.get_weights()
132 | self.target_actor.model.set_weights(actor_weights)
133 | self.target_critic.model.set_weights(critic_weights)
134 |
135 |
136 | def target_update(self):
137 | actor_weights = self.actor.model.get_weights()
138 | t_actor_weights = self.target_actor.model.get_weights()
139 | critic_weights = self.critic.model.get_weights()
140 | t_critic_weights = self.target_critic.model.get_weights()
141 |
142 | for i in range(len(actor_weights)):
143 | t_actor_weights[i] = args.tau * actor_weights[i] + (1 - args.tau) * t_actor_weights[i]
144 |
145 | for i in range(len(critic_weights)):
146 | t_critic_weights[i] = args.tau * critic_weights[i] + (1 - args.tau) * t_critic_weights[i]
147 |
148 | self.target_actor.model.set_weights(t_actor_weights)
149 | self.target_critic.model.set_weights(t_critic_weights)
150 |
151 |
152 | def td_target(self, rewards, q_values, dones):
153 | targets = np.asarray(q_values)
154 | for i in range(q_values.shape[0]):
155 | if dones[i]:
156 | targets[i] = rewards[i]
157 | else:
158 |                 targets[i] = rewards[i] + args.gamma * q_values[i]
159 | return targets
160 |
161 | def list_to_batch(self, list):
162 | batch = list[0]
163 | for elem in list[1:]:
164 | batch = np.append(batch, elem, axis=0)
165 | return batch
166 |
167 | def ou_noise(self, x, rho=0.15, mu=0, dt=1e-1, sigma=0.2, dim=1):
168 | return x + rho * (mu-x) * dt + sigma * np.sqrt(dt) * np.random.normal(size=dim)
169 |
170 | def replay(self):
171 | for _ in range(10):
172 | states, actions, rewards, next_states, dones = self.buffer.sample()
173 | target_q_values = self.target_critic.predict([next_states, self.target_actor.predict(next_states)])
174 | td_targets = self.td_target(rewards, target_q_values, dones)
175 |
176 | self.critic.train(states, actions, td_targets)
177 |
178 | s_actions = self.actor.predict(states)
179 | s_grads = self.critic.q_grads(states, s_actions)
180 | grads = np.array(s_grads).reshape((-1, self.action_dim))
181 | self.actor.train(states, grads)
182 | self.target_update()
183 |
184 | def train(self, max_episodes=1000):
185 | for ep in range(max_episodes):
186 | episode_reward, done = 0, False
187 |
188 | state = self.env.reset()
189 | bg_noise = np.zeros(self.action_dim)
190 | while not done:
191 | # self.env.render()
192 | action = self.actor.get_action(state)
193 | noise = self.ou_noise(bg_noise, dim=self.action_dim)
194 | action = np.clip(action + noise, -self.action_bound, self.action_bound)
195 |
196 | next_state, reward, done, _ = self.env.step(action)
197 | self.buffer.put(state, action, (reward+8)/8, next_state, done)
198 | bg_noise = noise
199 | episode_reward += reward
200 | state = next_state
201 | if self.buffer.size() >= args.batch_size and self.buffer.size() >= args.train_start:
202 | self.replay()
203 | print('EP{} EpisodeReward={}'.format(ep, episode_reward))
204 | wandb.log({'Reward': episode_reward})
205 |
206 |
207 | def main():
208 | env_name = 'Pendulum-v0'
209 | env = gym.make(env_name)
210 | agent = Agent(env)
211 | agent.train()
212 |
213 |
214 | if __name__ == "__main__":
215 | main()
216 |
--------------------------------------------------------------------------------
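
Two small notes on DDPG_Continuous.py above: exploration comes from Ornstein-Uhlenbeck noise added to the deterministic actor output, and the target networks track their online counterparts through the Polyak average theta_target <- tau * theta + (1 - tau) * theta_target in Agent.target_update. A standalone sketch of that soft update (stand-in weights, not part of the repo):

    # Standalone sketch of the soft ("Polyak") target update used in Agent.target_update (stand-in weights).
    import numpy as np

    tau = 0.05
    online_weights = [np.ones((2, 2)), np.zeros(2)]    # stand-ins for model.get_weights()
    target_weights = [np.zeros((2, 2)), np.ones(2)]

    target_weights = [tau * w + (1 - tau) * tw
                      for w, tw in zip(online_weights, target_weights)]
    # each target parameter moves a fraction tau toward the online network on every call
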
/DQN/DQN_Discrete.py:
--------------------------------------------------------------------------------
1 | import wandb
2 | import tensorflow as tf
3 | from tensorflow.keras.layers import Input, Dense
4 | from tensorflow.keras.optimizers import Adam
5 |
6 | import gym
7 | import argparse
8 | import numpy as np
9 | from collections import deque
10 | import random
11 |
12 | tf.keras.backend.set_floatx('float64')
13 | wandb.init(name='DQN', project="deep-rl-tf2")
14 |
15 | parser = argparse.ArgumentParser()
16 | parser.add_argument('--gamma', type=float, default=0.95)
17 | parser.add_argument('--lr', type=float, default=0.005)
18 | parser.add_argument('--batch_size', type=int, default=32)
19 | parser.add_argument('--eps', type=float, default=1.0)
20 | parser.add_argument('--eps_decay', type=float, default=0.995)
21 | parser.add_argument('--eps_min', type=float, default=0.01)
22 |
23 | args = parser.parse_args()
24 |
25 | class ReplayBuffer:
26 | def __init__(self, capacity=10000):
27 | self.buffer = deque(maxlen=capacity)
28 |
29 | def put(self, state, action, reward, next_state, done):
30 | self.buffer.append([state, action, reward, next_state, done])
31 |
32 | def sample(self):
33 | sample = random.sample(self.buffer, args.batch_size)
34 | states, actions, rewards, next_states, done = map(np.asarray, zip(*sample))
35 | states = np.array(states).reshape(args.batch_size, -1)
36 | next_states = np.array(next_states).reshape(args.batch_size, -1)
37 | return states, actions, rewards, next_states, done
38 |
39 | def size(self):
40 | return len(self.buffer)
41 |
42 | class ActionStateModel:
43 |     def __init__(self, state_dim, action_dim):
44 |         self.state_dim = state_dim
45 |         self.action_dim = action_dim
46 | self.epsilon = args.eps
47 |
48 | self.model = self.create_model()
49 |
50 | def create_model(self):
51 | model = tf.keras.Sequential([
52 | Input((self.state_dim,)),
53 | Dense(32, activation='relu'),
54 | Dense(16, activation='relu'),
55 | Dense(self.action_dim)
56 | ])
57 | model.compile(loss='mse', optimizer=Adam(args.lr))
58 | return model
59 |
60 | def predict(self, state):
61 | return self.model.predict(state)
62 |
63 | def get_action(self, state):
64 | state = np.reshape(state, [1, self.state_dim])
65 | self.epsilon *= args.eps_decay
66 | self.epsilon = max(self.epsilon, args.eps_min)
67 | q_value = self.predict(state)[0]
68 | if np.random.random() < self.epsilon:
69 | return random.randint(0, self.action_dim-1)
70 | return np.argmax(q_value)
71 |
72 | def train(self, states, targets):
73 | self.model.fit(states, targets, epochs=1, verbose=0)
74 |
75 |
76 | class Agent:
77 | def __init__(self, env):
78 | self.env = env
79 | self.state_dim = self.env.observation_space.shape[0]
80 | self.action_dim = self.env.action_space.n
81 |
82 | self.model = ActionStateModel(self.state_dim, self.action_dim)
83 | self.target_model = ActionStateModel(self.state_dim, self.action_dim)
84 | self.target_update()
85 |
86 | self.buffer = ReplayBuffer()
87 |
88 | def target_update(self):
89 | weights = self.model.model.get_weights()
90 | self.target_model.model.set_weights(weights)
91 |
92 | def replay(self):
93 | for _ in range(10):
94 | states, actions, rewards, next_states, done = self.buffer.sample()
95 | targets = self.target_model.predict(states)
96 | next_q_values = self.target_model.predict(next_states).max(axis=1)
97 | targets[range(args.batch_size), actions] = rewards + (1-done) * next_q_values * args.gamma
98 | self.model.train(states, targets)
99 |
100 | def train(self, max_episodes=1000):
101 | for ep in range(max_episodes):
102 | done, total_reward = False, 0
103 | state = self.env.reset()
104 | while not done:
105 | action = self.model.get_action(state)
106 | next_state, reward, done, _ = self.env.step(action)
107 | self.buffer.put(state, action, reward*0.01, next_state, done)
108 | total_reward += reward
109 | state = next_state
110 | if self.buffer.size() >= args.batch_size:
111 | self.replay()
112 | self.target_update()
113 | print('EP{} EpisodeReward={}'.format(ep, total_reward))
114 | wandb.log({'Reward': total_reward})
115 |
116 |
117 | def main():
118 | env = gym.make('CartPole-v1')
119 | agent = Agent(env)
120 | agent.train(max_episodes=1000)
121 |
122 | if __name__ == "__main__":
123 | main()
124 |
--------------------------------------------------------------------------------
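
A note on Agent.replay above: the fitted targets start as the target network's own predictions and only the entry for the action actually taken is overwritten with r + gamma * (1 - done) * max_a Q_target(s', a), so the MSE fit effectively updates just that action's Q-value. A standalone illustration with made-up numbers (not part of the repo):

    # Standalone illustration of the Q-learning target construction in Agent.replay (made-up values).
    import numpy as np

    gamma = 0.95
    q_current = np.array([[0.2, 0.5], [0.1, 0.3]])   # target_model.predict(states)
    q_next_max = np.array([0.7, 0.4])                # target_model.predict(next_states).max(axis=1)
    actions = np.array([1, 0])
    rewards = np.array([0.01, 0.01])
    done = np.array([0, 1])

    targets = q_current.copy()
    targets[range(2), actions] = rewards + (1 - done) * q_next_max * gamma
    # only the entries of the taken actions change; the rest keep their predicted
    # values, so the MSE fit leaves them essentially untouched
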
/DRQN/DRQN_Discrete.py:
--------------------------------------------------------------------------------
1 | import wandb
2 | import tensorflow as tf
3 | from tensorflow.keras.layers import Input, Dense, LSTM
4 | from tensorflow.keras.optimizers import Adam
5 |
6 | import gym
7 | import argparse
8 | import numpy as np
9 | from collections import deque
10 | import random
11 |
12 | tf.keras.backend.set_floatx('float64')
13 | wandb.init(name='DRQN', project="deep-rl-tf2")
14 |
15 | parser = argparse.ArgumentParser()
16 | parser.add_argument('--gamma', type=float, default=0.95)
17 | parser.add_argument('--lr', type=float, default=0.005)
18 | parser.add_argument('--batch_size', type=int, default=32)
19 | parser.add_argument('--time_steps', type=int, default=4)
20 | parser.add_argument('--eps', type=float, default=1.0)
21 | parser.add_argument('--eps_decay', type=float, default=0.995)
22 | parser.add_argument('--eps_min', type=float, default=0.01)
23 |
24 | args = parser.parse_args()
25 |
26 | class ReplayBuffer:
27 | def __init__(self, capacity=10000):
28 | self.buffer = deque(maxlen=capacity)
29 |
30 | def put(self, state, action, reward, next_state, done):
31 | self.buffer.append([state, action, reward, next_state, done])
32 |
33 | def sample(self):
34 | sample = random.sample(self.buffer, args.batch_size)
35 | states, actions, rewards, next_states, done = map(np.asarray, zip(*sample))
36 | states = np.array(states).reshape(args.batch_size, args.time_steps, -1)
37 | next_states = np.array(next_states).reshape(args.batch_size, args.time_steps, -1)
38 | return states, actions, rewards, next_states, done
39 |
40 | def size(self):
41 | return len(self.buffer)
42 |
43 | class ActionStateModel:
44 |     def __init__(self, state_dim, action_dim):
45 |         self.state_dim = state_dim
46 |         self.action_dim = action_dim
47 | self.epsilon = args.eps
48 |
49 | self.opt = Adam(args.lr)
50 | self.compute_loss = tf.keras.losses.MeanSquaredError()
51 | self.model = self.create_model()
52 |
53 | def create_model(self):
54 | return tf.keras.Sequential([
55 | Input((args.time_steps, self.state_dim)),
56 | LSTM(32, activation='tanh'),
57 | Dense(16, activation='relu'),
58 | Dense(self.action_dim)
59 | ])
60 |
61 | def predict(self, state):
62 | return self.model.predict(state)
63 |
64 | def get_action(self, state):
65 | state = np.reshape(state, [1, args.time_steps, self.state_dim])
66 | self.epsilon *= args.eps_decay
67 | self.epsilon = max(self.epsilon, args.eps_min)
68 | q_value = self.predict(state)[0]
69 | if np.random.random() < self.epsilon:
70 | return random.randint(0, self.action_dim-1)
71 | return np.argmax(q_value)
72 |
73 | def train(self, states, targets):
74 | targets = tf.stop_gradient(targets)
75 | with tf.GradientTape() as tape:
76 | logits = self.model(states, training=True)
77 | assert targets.shape == logits.shape
78 | loss = self.compute_loss(targets, logits)
79 | grads = tape.gradient(loss, self.model.trainable_variables)
80 | self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
81 |
82 |
83 |
84 | class Agent:
85 | def __init__(self, env):
86 | self.env = env
87 | self.state_dim = self.env.observation_space.shape[0]
88 | self.action_dim = self.env.action_space.n
89 |
90 | self.states = np.zeros([args.time_steps, self.state_dim])
91 |
92 | self.model = ActionStateModel(self.state_dim, self.action_dim)
93 | self.target_model = ActionStateModel(self.state_dim, self.action_dim)
94 | self.target_update()
95 |
96 | self.buffer = ReplayBuffer()
97 |
98 | def target_update(self):
99 | weights = self.model.model.get_weights()
100 | self.target_model.model.set_weights(weights)
101 |
102 | def replay(self):
103 | for _ in range(10):
104 | states, actions, rewards, next_states, done = self.buffer.sample()
105 | targets = self.target_model.predict(states)
106 | next_q_values = self.target_model.predict(next_states).max(axis=1)
107 | targets[range(args.batch_size), actions] = rewards + (1-done) * next_q_values * args.gamma
108 | self.model.train(states, targets)
109 |
110 | def update_states(self, next_state):
111 | self.states = np.roll(self.states, -1, axis=0)
112 | self.states[-1] = next_state
113 |
114 | def train(self, max_episodes=1000):
115 | for ep in range(max_episodes):
116 | done, total_reward = False, 0
117 | self.states = np.zeros([args.time_steps, self.state_dim])
118 | self.update_states(self.env.reset())
119 | while not done:
120 | action = self.model.get_action(self.states)
121 | next_state, reward, done, _ = self.env.step(action)
122 | prev_states = self.states
123 | self.update_states(next_state)
124 | self.buffer.put(prev_states, action, reward*0.01, self.states, done)
125 | total_reward += reward
126 |
127 | if self.buffer.size() >= args.batch_size:
128 | self.replay()
129 | self.target_update()
130 | print('EP{} EpisodeReward={}'.format(ep, total_reward))
131 | wandb.log({'Reward': total_reward})
132 |
133 |
134 | def main():
135 | env = gym.make('CartPole-v1')
136 | agent = Agent(env)
137 | agent.train(max_episodes=1000)
138 |
139 | if __name__ == "__main__":
140 | main()
141 |
--------------------------------------------------------------------------------
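
A note on the recurrent variant above: instead of a single observation, the agent feeds the LSTM a rolling window of the last time_steps observations, maintained by Agent.update_states with np.roll. A standalone illustration of that window (made-up observations, not part of the repo):

    # Standalone illustration of the observation window kept by Agent.update_states (made-up values).
    import numpy as np

    time_steps, state_dim = 4, 2
    states = np.zeros([time_steps, state_dim])

    for obs in [np.array([t + 1.0, 0.0]) for t in range(6)]:
        states = np.roll(states, -1, axis=0)   # drop the oldest observation
        states[-1] = obs                       # append the newest one

    # after six steps the window holds observations 3..6 (it was zero-padded early on)
    assert np.allclose(states[:, 0], [3.0, 4.0, 5.0, 6.0])
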
/DoubleDQN/DoubleDQN_Discrete.py:
--------------------------------------------------------------------------------
1 | import wandb
2 | import tensorflow as tf
3 | from tensorflow.keras.layers import Input, Dense
4 | from tensorflow.keras.optimizers import Adam
5 |
6 | import gym
7 | import argparse
8 | import numpy as np
9 | from collections import deque
10 | import random
11 |
12 | tf.keras.backend.set_floatx('float64')
13 | wandb.init(name='DoubleDQN', project="deep-rl-tf2")
14 |
15 | parser = argparse.ArgumentParser()
16 | parser.add_argument('--gamma', type=float, default=0.95)
17 | parser.add_argument('--lr', type=float, default=0.005)
18 | parser.add_argument('--batch_size', type=int, default=32)
19 | parser.add_argument('--eps', type=float, default=1.0)
20 | parser.add_argument('--eps_decay', type=float, default=0.995)
21 | parser.add_argument('--eps_min', type=float, default=0.01)
22 |
23 | args = parser.parse_args()
24 |
25 | class ReplayBuffer:
26 | def __init__(self, capacity=10000):
27 | self.buffer = deque(maxlen=capacity)
28 |
29 | def put(self, state, action, reward, next_state, done):
30 | self.buffer.append([state, action, reward, next_state, done])
31 |
32 | def sample(self):
33 | sample = random.sample(self.buffer, args.batch_size)
34 | states, actions, rewards, next_states, done = map(np.asarray, zip(*sample))
35 | states = np.array(states).reshape(args.batch_size, -1)
36 | next_states = np.array(next_states).reshape(args.batch_size, -1)
37 | return states, actions, rewards, next_states, done
38 |
39 | def size(self):
40 | return len(self.buffer)
41 |
42 | class ActionStateModel:
43 |     def __init__(self, state_dim, action_dim):
44 |         self.state_dim = state_dim
45 |         self.action_dim = action_dim
46 | self.epsilon = args.eps
47 |
48 | self.model = self.create_model()
49 |
50 | def create_model(self):
51 | model = tf.keras.Sequential([
52 | Input((self.state_dim,)),
53 | Dense(32, activation='relu'),
54 | Dense(16, activation='relu'),
55 | Dense(self.action_dim)
56 | ])
57 | model.compile(loss='mse', optimizer=Adam(args.lr))
58 | return model
59 |
60 | def predict(self, state):
61 | return self.model.predict(state)
62 |
63 | def get_action(self, state):
64 | state = np.reshape(state, [1, self.state_dim])
65 | self.epsilon *= args.eps_decay
66 | self.epsilon = max(self.epsilon, args.eps_min)
67 | q_value = self.predict(state)[0]
68 | if np.random.random() < self.epsilon:
69 | return random.randint(0, self.action_dim-1)
70 | return np.argmax(q_value)
71 |
72 | def train(self, states, targets):
73 | self.model.fit(states, targets, epochs=1, verbose=0)
74 |
75 |
76 | class Agent:
77 | def __init__(self, env):
78 | self.env = env
79 | self.state_dim = self.env.observation_space.shape[0]
80 | self.action_dim = self.env.action_space.n
81 |
82 | self.model = ActionStateModel(self.state_dim, self.action_dim)
83 | self.target_model = ActionStateModel(self.state_dim, self.action_dim)
84 | self.target_update()
85 |
86 | self.buffer = ReplayBuffer()
87 |
88 | def target_update(self):
89 | weights = self.model.model.get_weights()
90 | self.target_model.model.set_weights(weights)
91 |
92 | def replay(self):
93 | for _ in range(10):
94 | states, actions, rewards, next_states, done = self.buffer.sample()
95 | targets = self.target_model.predict(states)
96 | next_q_values = self.target_model.predict(next_states)[range(args.batch_size),np.argmax(self.model.predict(next_states), axis=1)]
97 | targets[range(args.batch_size), actions] = rewards + (1-done) * next_q_values * args.gamma
98 | self.model.train(states, targets)
99 |
100 | def train(self, max_episodes=1000):
101 | for ep in range(max_episodes):
102 | done, total_reward = False, 0
103 | state = self.env.reset()
104 | while not done:
105 | action = self.model.get_action(state)
106 | next_state, reward, done, _ = self.env.step(action)
107 | self.buffer.put(state, action, reward*0.01, next_state, done)
108 | total_reward += reward
109 | state = next_state
110 |
111 | if self.buffer.size() >= args.batch_size:
112 | self.replay()
113 | self.target_update()
114 | print('EP{} EpisodeReward={}'.format(ep, total_reward))
115 | wandb.log({'Reward': total_reward})
116 |
117 |
118 | def main():
119 | env = gym.make('CartPole-v1')
120 | agent = Agent(env)
121 | agent.train(max_episodes=1000)
122 |
123 | if __name__ == "__main__":
124 | main()
125 |
--------------------------------------------------------------------------------
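
A note on the Double DQN target in Agent.replay above: the online network chooses the greedy next action and the target network evaluates it, which curbs the overestimation that comes from letting a single network both select and evaluate, as vanilla DQN does. A standalone illustration with made-up Q-values (not part of the repo):

    # Standalone illustration of the Double DQN target (made-up Q-values).
    import numpy as np

    q_online_next = np.array([[0.9, 0.1], [0.2, 0.8]])   # model.predict(next_states)
    q_target_next = np.array([[0.5, 0.7], [0.6, 0.3]])   # target_model.predict(next_states)

    best_actions = np.argmax(q_online_next, axis=1)             # [0, 1], picked by the online net
    next_q_values = q_target_next[range(2), best_actions]       # [0.5, 0.3], evaluated by the target net
    # vanilla DQN would use q_target_next.max(axis=1) == [0.7, 0.6] instead,
    # which tends to overestimate action values
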
/DuelingDQN/DuelingDQN_Discrete.py:
--------------------------------------------------------------------------------
1 | import wandb
2 | import tensorflow as tf
3 | from tensorflow.keras.layers import Input, Dense, Add
4 | from tensorflow.keras.optimizers import Adam
5 |
6 | import gym
7 | import argparse
8 | import numpy as np
9 | from collections import deque
10 | import random
11 |
12 | tf.keras.backend.set_floatx('float64')
13 | wandb.init(name='DuelingDQN', project="deep-rl-tf2")
14 |
15 | parser = argparse.ArgumentParser()
16 | parser.add_argument('--gamma', type=float, default=0.95)
17 | parser.add_argument('--lr', type=float, default=0.005)
18 | parser.add_argument('--batch_size', type=int, default=32)
19 | parser.add_argument('--eps', type=float, default=1.0)
20 | parser.add_argument('--eps_decay', type=float, default=0.995)
21 | parser.add_argument('--eps_min', type=float, default=0.01)
22 |
23 | args = parser.parse_args()
24 |
25 | class ReplayBuffer:
26 | def __init__(self, capacity=10000):
27 | self.buffer = deque(maxlen=capacity)
28 |
29 | def put(self, state, action, reward, next_state, done):
30 | self.buffer.append([state, action, reward, next_state, done])
31 |
32 | def sample(self):
33 | sample = random.sample(self.buffer, args.batch_size)
34 | states, actions, rewards, next_states, done = map(np.asarray, zip(*sample))
35 | states = np.array(states).reshape(args.batch_size, -1)
36 | next_states = np.array(next_states).reshape(args.batch_size, -1)
37 | return states, actions, rewards, next_states, done
38 |
39 | def size(self):
40 | return len(self.buffer)
41 |
42 | class ActionStateModel:
43 |     def __init__(self, state_dim, action_dim):
44 |         self.state_dim = state_dim
45 |         self.action_dim = action_dim
46 | self.epsilon = args.eps
47 |
48 | self.model = self.create_model()
49 |
50 | def create_model(self):
51 | backbone = tf.keras.Sequential([
52 | Input((self.state_dim,)),
53 | Dense(32, activation='relu'),
54 | Dense(16, activation='relu')
55 | ])
56 | state_input = Input((self.state_dim,))
57 | backbone_1 = Dense(32, activation='relu')(state_input)
58 | backbone_2 = Dense(16, activation='relu')(backbone_1)
59 | value_output = Dense(1)(backbone_2)
60 | advantage_output = Dense(self.action_dim)(backbone_2)
61 | output = Add()([value_output, advantage_output])
62 | model = tf.keras.Model(state_input, output)
63 | model.compile(loss='mse', optimizer=Adam(args.lr))
64 | return model
65 |
66 | def predict(self, state):
67 | return self.model.predict(state)
68 |
69 | def get_action(self, state):
70 | state = np.reshape(state, [1, self.state_dim])
71 | self.epsilon *= args.eps_decay
72 | self.epsilon = max(self.epsilon, args.eps_min)
73 | q_value = self.predict(state)[0]
74 | if np.random.random() < self.epsilon:
75 | return random.randint(0, self.action_dim-1)
76 | return np.argmax(q_value)
77 |
78 | def train(self, states, targets):
79 | self.model.fit(states, targets, epochs=1, verbose=0)
80 |
81 |
82 | class Agent:
83 | def __init__(self, env):
84 | self.env = env
85 | self.state_dim = self.env.observation_space.shape[0]
86 | self.action_dim = self.env.action_space.n
87 |
88 | self.model = ActionStateModel(self.state_dim, self.action_dim)
89 | self.target_model = ActionStateModel(self.state_dim, self.action_dim)
90 | self.target_update()
91 |
92 | self.buffer = ReplayBuffer()
93 |
94 | def target_update(self):
95 | weights = self.model.model.get_weights()
96 | self.target_model.model.set_weights(weights)
97 |
98 | def replay(self):
99 | for _ in range(10):
100 | states, actions, rewards, next_states, done = self.buffer.sample()
101 | targets = self.target_model.predict(states)
102 | next_q_values = self.target_model.predict(next_states).max(axis=1)
103 | targets[range(args.batch_size), actions] = rewards + (1-done) * next_q_values * args.gamma
104 | self.model.train(states, targets)
105 |
106 | def train(self, max_episodes=1000):
107 | for ep in range(max_episodes):
108 | done, total_reward = False, 0
109 | state = self.env.reset()
110 | while not done:
111 | action = self.model.get_action(state)
112 | next_state, reward, done, _ = self.env.step(action)
113 | self.buffer.put(state, action, reward*0.01, next_state, done)
114 | total_reward += reward
115 | state = next_state
116 |
117 | if self.buffer.size() >= args.batch_size:
118 | self.replay()
119 | self.target_update()
120 | print('EP{} EpisodeReward={}'.format(ep, total_reward))
121 | wandb.log({'Reward': total_reward})
122 |
123 |
124 | def main():
125 | env = gym.make('CartPole-v1')
126 | agent = Agent(env)
127 | agent.train(max_episodes=1000)
128 |
129 | if __name__ == "__main__":
130 | main()
131 |
--------------------------------------------------------------------------------
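
A note on the dueling head above: create_model adds the value and advantage streams directly (Q = V + A). The dueling architecture as usually described also subtracts the mean advantage so that V and A stay identifiable; the plain sum still yields usable Q-values, which is why training works here anyway. A minimal sketch of the mean-centered aggregation, assuming the same functional-style head and made-up dimensions (not part of the repo):

    # Sketch of the mean-centered dueling aggregation: Q = V + (A - mean(A)).
    import tensorflow as tf
    from tensorflow.keras.layers import Input, Dense, Lambda, Add

    state_dim, action_dim = 4, 2                      # made-up dimensions
    state_input = Input((state_dim,))
    hidden = Dense(16, activation='relu')(Dense(32, activation='relu')(state_input))
    value = Dense(1)(hidden)
    advantage = Dense(action_dim)(hidden)
    centered = Lambda(lambda a: a - tf.reduce_mean(a, axis=1, keepdims=True))(advantage)
    q_output = Add()([value, centered])               # broadcasts V across the action dimension
    model = tf.keras.Model(state_input, q_output)
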
/DuelingDoubleDQN/DuelingDoubleDQN_Discrete.py:
--------------------------------------------------------------------------------
1 | import wandb
2 | import tensorflow as tf
3 | from tensorflow.keras.layers import Input, Dense, Add
4 | from tensorflow.keras.optimizers import Adam
5 |
6 | import gym
7 | import argparse
8 | import numpy as np
9 | from collections import deque
10 | import random
11 |
12 | tf.keras.backend.set_floatx('float64')
13 | wandb.init(name='DuelingDoubleDQN', project="deep-rl-tf2")
14 |
15 | parser = argparse.ArgumentParser()
16 | parser.add_argument('--gamma', type=float, default=0.95)
17 | parser.add_argument('--lr', type=float, default=0.005)
18 | parser.add_argument('--batch_size', type=int, default=32)
19 | parser.add_argument('--eps', type=float, default=1.0)
20 | parser.add_argument('--eps_decay', type=float, default=0.995)
21 | parser.add_argument('--eps_min', type=float, default=0.01)
22 |
23 | args = parser.parse_args()
24 |
25 | class ReplayBuffer:
26 | def __init__(self, capacity=10000):
27 | self.buffer = deque(maxlen=capacity)
28 |
29 | def put(self, state, action, reward, next_state, done):
30 | self.buffer.append([state, action, reward, next_state, done])
31 |
32 | def sample(self):
33 | sample = random.sample(self.buffer, args.batch_size)
34 | states, actions, rewards, next_states, done = map(np.asarray, zip(*sample))
35 | states = np.array(states).reshape(args.batch_size, -1)
36 | next_states = np.array(next_states).reshape(args.batch_size, -1)
37 | return states, actions, rewards, next_states, done
38 |
39 | def size(self):
40 | return len(self.buffer)
41 |
42 | class ActionStateModel:
43 |     def __init__(self, state_dim, action_dim):
44 |         self.state_dim = state_dim
45 |         self.action_dim = action_dim
46 | self.epsilon = args.eps
47 |
48 | self.model = self.create_model()
49 |
50 | def create_model(self):
51 |         # Dueling architecture: a shared feature extractor feeds two heads,
52 |         # a state-value stream V(s) and an action-advantage stream A(s, a),
53 |         # which are recombined into Q-values as
54 |         # Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a));
55 |         # subtracting the mean advantage keeps the two streams identifiable.
56 | state_input = Input((self.state_dim,))
57 | backbone_1 = Dense(32, activation='relu')(state_input)
58 | backbone_2 = Dense(16, activation='relu')(backbone_1)
59 | value_output = Dense(1)(backbone_2)
60 | advantage_output = Dense(self.action_dim)(backbone_2)
61 |         output = Add()([value_output, Lambda(lambda a: a - tf.reduce_mean(a, axis=1, keepdims=True))(advantage_output)])
62 | model = tf.keras.Model(state_input, output)
63 | model.compile(loss='mse', optimizer=Adam(args.lr))
64 | return model
65 |
66 | def predict(self, state):
67 | return self.model.predict(state)
68 |
69 | def get_action(self, state):
70 | state = np.reshape(state, [1, self.state_dim])
71 | self.epsilon *= args.eps_decay
72 | self.epsilon = max(self.epsilon, args.eps_min)
73 | q_value = self.predict(state)[0]
74 | if np.random.random() < self.epsilon:
75 | return random.randint(0, self.action_dim-1)
76 | return np.argmax(q_value)
77 |
78 | def train(self, states, targets):
79 | self.model.fit(states, targets, epochs=1, verbose=0)
80 |
81 |
82 | class Agent:
83 | def __init__(self, env):
84 | self.env = env
85 | self.state_dim = self.env.observation_space.shape[0]
86 | self.action_dim = self.env.action_space.n
87 |
88 | self.model = ActionStateModel(self.state_dim, self.action_dim)
89 | self.target_model = ActionStateModel(self.state_dim, self.action_dim)
90 | self.target_update()
91 |
92 | self.buffer = ReplayBuffer()
93 |
94 | def target_update(self):
95 | weights = self.model.model.get_weights()
96 | self.target_model.model.set_weights(weights)
97 |
98 | def replay(self):
99 | for _ in range(10):
100 | states, actions, rewards, next_states, done = self.buffer.sample()
101 | targets = self.target_model.predict(states)
102 | next_q_values = self.target_model.predict(next_states)[range(args.batch_size),np.argmax(self.model.predict(next_states), axis=1)]
103 | targets[range(args.batch_size), actions] = rewards + (1-done) * next_q_values * args.gamma
104 | self.model.train(states, targets)
105 |
106 | def train(self, max_episodes=1000):
107 | for ep in range(max_episodes):
108 | done, total_reward = False, 0
109 | state = self.env.reset()
110 | while not done:
111 | action = self.model.get_action(state)
112 | next_state, reward, done, _ = self.env.step(action)
113 | self.buffer.put(state, action, reward*0.01, next_state, done)
114 | total_reward += reward
115 | state = next_state
116 |
117 | if self.buffer.size() >= args.batch_size:
118 | self.replay()
119 | self.target_update()
120 | print('EP{} EpisodeReward={}'.format(ep, total_reward))
121 | wandb.log({'Reward': total_reward})
122 |
123 |
124 | def main():
125 | env = gym.make('CartPole-v1')
126 | agent = Agent(env)
127 | agent.train(max_episodes=1000)
128 |
129 | if __name__ == "__main__":
130 | main()
131 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/PPO/PPO_Continuous.py:
--------------------------------------------------------------------------------
1 | import wandb
2 | import tensorflow as tf
3 | from tensorflow.keras.layers import Input, Dense, Lambda
4 |
5 | import gym
6 | import argparse
7 | import numpy as np
8 |
9 | tf.keras.backend.set_floatx('float64')
10 |
11 | wandb.init(name='PPO', project="deep-rl-tf2")
12 |
13 | parser = argparse.ArgumentParser()
14 | parser.add_argument('--gamma', type=float, default=0.99)
15 | parser.add_argument('--update_interval', type=int, default=5)
16 | parser.add_argument('--actor_lr', type=float, default=0.0005)
17 | parser.add_argument('--critic_lr', type=float, default=0.001)
18 | parser.add_argument('--clip_ratio', type=float, default=0.1)
19 | parser.add_argument('--lmbda', type=float, default=0.95)
20 | parser.add_argument('--epochs', type=int, default=3)
21 |
22 | args = parser.parse_args()
23 |
24 |
25 | class Actor:
26 | def __init__(self, state_dim, action_dim, action_bound, std_bound):
27 | self.state_dim = state_dim
28 | self.action_dim = action_dim
29 | self.action_bound = action_bound
30 | self.std_bound = std_bound
31 | self.model = self.create_model()
32 | self.opt = tf.keras.optimizers.Adam(args.actor_lr)
33 |
34 | def get_action(self, state):
35 | state = np.reshape(state, [1, self.state_dim])
36 | mu, std = self.model.predict(state)
37 | action = np.random.normal(mu[0], std[0], size=self.action_dim)
38 | action = np.clip(action, -self.action_bound, self.action_bound)
39 | log_policy = self.log_pdf(mu, std, action)
40 |
41 | return log_policy, action
42 |
43 | def log_pdf(self, mu, std, action):
44 | std = tf.clip_by_value(std, self.std_bound[0], self.std_bound[1])
45 | var = std ** 2
46 | log_policy_pdf = -0.5 * (action - mu) ** 2 / \
47 | var - 0.5 * tf.math.log(var * 2 * np.pi)
48 | return tf.reduce_sum(log_policy_pdf, 1, keepdims=True)
49 |
50 | def create_model(self):
51 | state_input = Input((self.state_dim,))
52 | dense_1 = Dense(32, activation='relu')(state_input)
53 | dense_2 = Dense(32, activation='relu')(dense_1)
54 | out_mu = Dense(self.action_dim, activation='tanh')(dense_2)
55 | mu_output = Lambda(lambda x: x * self.action_bound)(out_mu)
56 | std_output = Dense(self.action_dim, activation='softplus')(dense_2)
57 | return tf.keras.models.Model(state_input, [mu_output, std_output])
58 |
59 | def compute_loss(self, log_old_policy, log_new_policy, actions, gaes):
60 | ratio = tf.exp(log_new_policy - tf.stop_gradient(log_old_policy))
61 | gaes = tf.stop_gradient(gaes)
62 | clipped_ratio = tf.clip_by_value(
63 | ratio, 1.0-args.clip_ratio, 1.0+args.clip_ratio)
64 | surrogate = -tf.minimum(ratio * gaes, clipped_ratio * gaes)
65 | return tf.reduce_mean(surrogate)
66 |
67 | def train(self, log_old_policy, states, actions, gaes):
68 | with tf.GradientTape() as tape:
69 | mu, std = self.model(states, training=True)
70 | log_new_policy = self.log_pdf(mu, std, actions)
71 | loss = self.compute_loss(
72 | log_old_policy, log_new_policy, actions, gaes)
73 | grads = tape.gradient(loss, self.model.trainable_variables)
74 | self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
75 | return loss
76 |
77 |
78 | class Critic:
79 | def __init__(self, state_dim):
80 | self.state_dim = state_dim
81 | self.model = self.create_model()
82 | self.opt = tf.keras.optimizers.Adam(args.critic_lr)
83 |
84 | def create_model(self):
85 | return tf.keras.Sequential([
86 | Input((self.state_dim,)),
87 | Dense(32, activation='relu'),
88 | Dense(32, activation='relu'),
89 | Dense(16, activation='relu'),
90 | Dense(1, activation='linear')
91 | ])
92 |
93 | def compute_loss(self, v_pred, td_targets):
94 | mse = tf.keras.losses.MeanSquaredError()
95 | return mse(td_targets, v_pred)
96 |
97 | def train(self, states, td_targets):
98 | with tf.GradientTape() as tape:
99 | v_pred = self.model(states, training=True)
100 | assert v_pred.shape == td_targets.shape
101 | loss = self.compute_loss(v_pred, tf.stop_gradient(td_targets))
102 | grads = tape.gradient(loss, self.model.trainable_variables)
103 | self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
104 | return loss
105 |
106 |
107 | class Agent:
108 | def __init__(self, env):
109 | self.env = env
110 | self.state_dim = self.env.observation_space.shape[0]
111 | self.action_dim = self.env.action_space.shape[0]
112 | self.action_bound = self.env.action_space.high[0]
113 | self.std_bound = [1e-2, 1.0]
114 |
115 |         # The Actor and Critic below each create and manage their own
116 |         # Adam optimizer (see Actor.__init__ and Critic.__init__).
117 | self.actor = Actor(self.state_dim, self.action_dim,
118 | self.action_bound, self.std_bound)
119 | self.critic = Critic(self.state_dim)
120 |
121 | def gae_target(self, rewards, v_values, next_v_value, done):
122 | n_step_targets = np.zeros_like(rewards)
123 | gae = np.zeros_like(rewards)
124 | gae_cumulative = 0
125 | forward_val = 0
126 |
127 | if not done:
128 | forward_val = next_v_value
129 |
130 | for k in reversed(range(0, len(rewards))):
131 | delta = rewards[k] + args.gamma * forward_val - v_values[k]
132 | gae_cumulative = args.gamma * args.lmbda * gae_cumulative + delta
133 | gae[k] = gae_cumulative
134 | forward_val = v_values[k]
135 | n_step_targets[k] = gae[k] + v_values[k]
136 | return gae, n_step_targets
137 |
138 |     def list_to_batch(self, items):
139 |         batch = items[0]
140 |         for elem in items[1:]:
141 |             batch = np.append(batch, elem, axis=0)
142 |         return batch
143 |
144 | def train(self, max_episodes=1000):
145 | for ep in range(max_episodes):
146 | state_batch = []
147 | action_batch = []
148 | reward_batch = []
149 | old_policy_batch = []
150 |
151 | episode_reward, done = 0, False
152 |
153 | state = self.env.reset()
154 |
155 | while not done:
156 | # self.env.render()
157 | log_old_policy, action = self.actor.get_action(state)
158 |
159 | next_state, reward, done, _ = self.env.step(action)
160 |
161 | state = np.reshape(state, [1, self.state_dim])
162 | action = np.reshape(action, [1, 1])
163 | next_state = np.reshape(next_state, [1, self.state_dim])
164 | reward = np.reshape(reward, [1, 1])
165 | log_old_policy = np.reshape(log_old_policy, [1, 1])
166 |
167 | state_batch.append(state)
168 | action_batch.append(action)
169 | reward_batch.append((reward+8)/8)
170 | old_policy_batch.append(log_old_policy)
171 |
172 | if len(state_batch) >= args.update_interval or done:
173 | states = self.list_to_batch(state_batch)
174 | actions = self.list_to_batch(action_batch)
175 | rewards = self.list_to_batch(reward_batch)
176 | old_policys = self.list_to_batch(old_policy_batch)
177 |
178 | v_values = self.critic.model.predict(states)
179 | next_v_value = self.critic.model.predict(next_state)
180 |
181 | gaes, td_targets = self.gae_target(
182 | rewards, v_values, next_v_value, done)
183 |
184 | for epoch in range(args.epochs):
185 | actor_loss = self.actor.train(
186 | old_policys, states, actions, gaes)
187 | critic_loss = self.critic.train(states, td_targets)
188 |
189 | state_batch = []
190 | action_batch = []
191 | reward_batch = []
192 | old_policy_batch = []
193 |
194 | episode_reward += reward[0][0]
195 | state = next_state[0]
196 |
197 | print('EP{} EpisodeReward={}'.format(ep, episode_reward))
198 | wandb.log({'Reward': episode_reward})
199 |
200 |
201 | def main():
202 | env_name = 'Pendulum-v0'
203 | env = gym.make(env_name)
204 | agent = Agent(env)
205 | agent.train()
206 |
207 |
208 | if __name__ == "__main__":
209 | main()
210 |
--------------------------------------------------------------------------------
/PPO/PPO_Discrete.py:
--------------------------------------------------------------------------------
1 | import wandb
2 | import tensorflow as tf
3 | from tensorflow.keras.layers import Input, Dense
4 |
5 | import gym
6 | import argparse
7 | import numpy as np
8 |
9 | tf.keras.backend.set_floatx('float64')
10 |
11 | wandb.init(name='PPO', project="deep-rl-tf2")
12 |
13 | parser = argparse.ArgumentParser()
14 | parser.add_argument('--gamma', type=float, default=0.99)
15 | parser.add_argument('--update_interval', type=int, default=5)
16 | parser.add_argument('--actor_lr', type=float, default=0.0005)
17 | parser.add_argument('--critic_lr', type=float, default=0.001)
18 | parser.add_argument('--clip_ratio', type=float, default=0.1)
19 | parser.add_argument('--lmbda', type=float, default=0.95)
20 | parser.add_argument('--epochs', type=int, default=3)
21 |
22 | args = parser.parse_args()
23 |
24 |
25 | class Actor:
26 | def __init__(self, state_dim, action_dim):
27 | self.state_dim = state_dim
28 | self.action_dim = action_dim
29 | self.model = self.create_model()
30 | self.opt = tf.keras.optimizers.Adam(args.actor_lr)
31 |
32 | def create_model(self):
33 | return tf.keras.Sequential([
34 | Input((self.state_dim,)),
35 | Dense(32, activation='relu'),
36 | Dense(16, activation='relu'),
37 | Dense(self.action_dim, activation='softmax')
38 | ])
39 |
40 | def compute_loss(self, old_policy, new_policy, actions, gaes):
41 | gaes = tf.stop_gradient(gaes)
42 |         old_log_p = tf.math.log(
43 |             tf.reduce_sum(old_policy * actions, axis=1, keepdims=True))
44 |         old_log_p = tf.stop_gradient(old_log_p)
45 |         log_p = tf.math.log(tf.reduce_sum(
46 |             new_policy * actions, axis=1, keepdims=True))
47 | ratio = tf.math.exp(log_p - old_log_p)
48 | clipped_ratio = tf.clip_by_value(
49 | ratio, 1 - args.clip_ratio, 1 + args.clip_ratio)
50 | surrogate = -tf.minimum(ratio * gaes, clipped_ratio * gaes)
51 | return tf.reduce_mean(surrogate)
52 |
53 | def train(self, old_policy, states, actions, gaes):
54 | actions = tf.one_hot(actions, self.action_dim)
55 | actions = tf.reshape(actions, [-1, self.action_dim])
56 | actions = tf.cast(actions, tf.float64)
57 |
58 | with tf.GradientTape() as tape:
59 |             probs = self.model(states, training=True)
60 |             loss = self.compute_loss(old_policy, probs, actions, gaes)
61 | grads = tape.gradient(loss, self.model.trainable_variables)
62 | self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
63 | return loss
64 |
65 |
66 | class Critic:
67 | def __init__(self, state_dim):
68 | self.state_dim = state_dim
69 | self.model = self.create_model()
70 | self.opt = tf.keras.optimizers.Adam(args.critic_lr)
71 |
72 | def create_model(self):
73 | return tf.keras.Sequential([
74 | Input((self.state_dim,)),
75 | Dense(32, activation='relu'),
76 | Dense(16, activation='relu'),
77 | Dense(16, activation='relu'),
78 | Dense(1, activation='linear')
79 | ])
80 |
81 | def compute_loss(self, v_pred, td_targets):
82 | mse = tf.keras.losses.MeanSquaredError()
83 | return mse(td_targets, v_pred)
84 |
85 | def train(self, states, td_targets):
86 | with tf.GradientTape() as tape:
87 | v_pred = self.model(states, training=True)
88 | assert v_pred.shape == td_targets.shape
89 | loss = self.compute_loss(v_pred, tf.stop_gradient(td_targets))
90 | grads = tape.gradient(loss, self.model.trainable_variables)
91 | self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
92 | return loss
93 |
94 |
95 | class Agent:
96 | def __init__(self, env):
97 | self.env = env
98 | self.state_dim = self.env.observation_space.shape[0]
99 | self.action_dim = self.env.action_space.n
100 |
101 | self.actor = Actor(self.state_dim, self.action_dim)
102 | self.critic = Critic(self.state_dim)
103 |
104 | def gae_target(self, rewards, v_values, next_v_value, done):
105 | n_step_targets = np.zeros_like(rewards)
106 | gae = np.zeros_like(rewards)
107 | gae_cumulative = 0
108 | forward_val = 0
109 |
110 | if not done:
111 | forward_val = next_v_value
112 |
113 | for k in reversed(range(0, len(rewards))):
114 | delta = rewards[k] + args.gamma * forward_val - v_values[k]
115 | gae_cumulative = args.gamma * args.lmbda * gae_cumulative + delta
116 | gae[k] = gae_cumulative
117 | forward_val = v_values[k]
118 | n_step_targets[k] = gae[k] + v_values[k]
119 | return gae, n_step_targets
120 |
121 |     def list_to_batch(self, items):
122 |         batch = items[0]
123 |         for elem in items[1:]:
124 |             batch = np.append(batch, elem, axis=0)
125 |         return batch
126 |
127 | def train(self, max_episodes=1000):
128 | for ep in range(max_episodes):
129 | state_batch = []
130 | action_batch = []
131 | reward_batch = []
132 | old_policy_batch = []
133 |
134 | episode_reward, done = 0, False
135 |
136 | state = self.env.reset()
137 |
138 | while not done:
139 | # self.env.render()
140 | probs = self.actor.model.predict(
141 | np.reshape(state, [1, self.state_dim]))
142 | action = np.random.choice(self.action_dim, p=probs[0])
143 |
144 | next_state, reward, done, _ = self.env.step(action)
145 |
146 | state = np.reshape(state, [1, self.state_dim])
147 | action = np.reshape(action, [1, 1])
148 | next_state = np.reshape(next_state, [1, self.state_dim])
149 | reward = np.reshape(reward, [1, 1])
150 |
151 | state_batch.append(state)
152 | action_batch.append(action)
153 | reward_batch.append(reward * 0.01)
154 | old_policy_batch.append(probs)
155 |
156 | if len(state_batch) >= args.update_interval or done:
157 | states = self.list_to_batch(state_batch)
158 | actions = self.list_to_batch(action_batch)
159 | rewards = self.list_to_batch(reward_batch)
160 | old_policys = self.list_to_batch(old_policy_batch)
161 |
162 | v_values = self.critic.model.predict(states)
163 | next_v_value = self.critic.model.predict(next_state)
164 |
165 | gaes, td_targets = self.gae_target(
166 | rewards, v_values, next_v_value, done)
167 |
168 | for epoch in range(args.epochs):
169 | actor_loss = self.actor.train(
170 | old_policys, states, actions, gaes)
171 | critic_loss = self.critic.train(states, td_targets)
172 |
173 | state_batch = []
174 | action_batch = []
175 | reward_batch = []
176 | old_policy_batch = []
177 |
178 | episode_reward += reward[0][0]
179 | state = next_state[0]
180 |
181 | print('EP{} EpisodeReward={}'.format(ep, episode_reward))
182 | wandb.log({'Reward': episode_reward})
183 |
184 |
185 | def main():
186 | env_name = 'CartPole-v1'
187 | env = gym.make(env_name)
188 | agent = Agent(env)
189 | agent.train()
190 |
191 |
192 | if __name__ == "__main__":
193 | main()
194 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |   
2 |
3 |
4 |
5 |
12 |
13 |