├── Black-Box Optimization
│   ├── Evolution_Strategies_parallel+novelty
│   │   ├── ES_baseline_parallel.py
│   │   ├── ES_disc_parallel_novelty.py
│   │   ├── ES_own_conti_parallel.py
│   │   ├── README.md
│   │   └── imgs
│   │       └── pendulum.png
│   ├── Evolutionary_Strategies_Cartpole.ipynb
│   ├── README.md
│   ├── genetic_algorithm_base.py
│   └── imgs
│       └── ga_cartpole.png
├── Categorical_DQN.ipynb
├── ContinousControl
│   ├── A2C_conti_seperate_networks.ipynb
│   ├── A2C_continuous_multienv.ipynb
│   ├── DDPG.py
│   ├── MultiPro.py
│   ├── PPO_conti_gae_curios.ipynb
│   ├── PPO_conti_gae_multi.ipynb
│   ├── PPO_gae_multi.py
│   ├── PPO_test_crawler.ipynb
│   ├── PPO_unity_Crawler.ipynb
│   ├── Parallel_processing.py
│   ├── ROBOSCHOOL_PPO_GAE.ipynb
│   ├── SAC.ipynb
│   ├── SAC_script.py
│   └── TD3_conti.ipynb
├── Cross_entropy
│   ├── Cross_entropy.py
│   ├── README.md
│   └── img
│       └── Cross_entropy.png
├── Deep Q_Learning
│   ├── DQN_Experience_Replay.py
│   ├── Img
│   │   ├── 4k Learning_curve.png
│   │   └── Converging.png
│   └── README.md
├── Double DQN
│   ├── CNN_Double_DQN.py
│   ├── Double_DQN.py
│   ├── Imgs
│   │   ├── 4000_40-40.png
│   │   ├── CNN_pong_converge.png
│   │   └── test.png
│   ├── README.md
│   └── wrapper.py
├── Dueling Deep Q-Network
│   ├── CNN_Dueling_DDQN.py
│   ├── CNN_Dueling_DDQN_PER.py
│   ├── Dueling_DQN.ipynb
│   ├── Img
│   │   ├── Duel_per.png
│   │   └── Dueling_DQN.png
│   ├── PrioritizedExperienceReplay.py
│   ├── Video
│   │   ├── Breakout.mp4
│   │   └── Pong.mp4
│   └── wrapper.py
├── Grid_search_for_Reinforcement_learning.ipynb
├── Noisy_DQN.ipynb
├── Nstep_DQN.ipynb
├── PPO_conti_gae_curio_multi.ipynb
├── PPO_gae_curios.ipynb
├── Paper
│   ├── A3C.pdf
│   ├── DDPG.pdf
│   ├── DQN.pdf
│   ├── Distributional DQN.pdf
│   ├── Double_DQN.pdf
│   ├── Dueling.pdf
│   ├── GAE.pdf
│   ├── Noisy_networks.pdf
│   ├── PPO.pdf
│   ├── SAC_2019.pdf
│   └── TD3.pdf
├── Policy Gradient Algorithms
│   ├── A2C.ipynb
│   ├── A2C_conti_seperate_networks.ipynb
│   ├── A2C_continous_action_space.ipynb
│   ├── A2C_continuous_multienv.ipynb
│   ├── PPO.ipynb
│   ├── Parallel_processing.py
│   ├── PolicyGradient_LSTM.ipynb
│   ├── Policy_Gradien_+_Baseline_mean.ipynb
│   └── REINFORCE
│       ├── Img
│       │   └── Steps_needed.png
│       └── REINFORCE.py
├── Q_Learning
│   ├── FrozenLake_q-table.py
│   ├── Img
│   │   ├── Q_table10000.png
│   │   ├── Q_value.png
│   │   ├── Receivedrewards.png
│   │   └── steps_taken.png
│   ├── Q_Table_E10000_a0.09_g0.9_eps0.9.pkl
│   ├── Q_Table_E3000_a0.09_g0.9_eps0.9.pkl
│   ├── Q_Table_own_example.ipynb
│   ├── Readme.md
│   ├── play_FrozenLake_Q_table.py
│   ├── train_FrozenLake_Qtable.py
│   └── treasure_q.py
├── README.md
├── Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)
│   ├── Temporal_Difference.ipynb
│   └── lab-taxi
│       ├── README.md
│       ├── __pycache__
│       │   ├── agent.cpython-37.pyc
│       │   └── monitor.cpython-37.pyc
│       ├── agent.py
│       ├── main.py
│       └── monitor.py
└── imgs
    └── web-3706562_640.jpg
/Black-Box Optimization/Evolution_Strategies_parallel+novelty/ES_baseline_parallel.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Oct 9 10:24:39 2019
4 |
5 | @author: Z0014354
6 | """
7 |
8 | import numpy as np
9 | import matplotlib.pyplot as plt
10 | import gym
11 | import multiprocessing as mp
12 | import collections
13 | import copy
14 |
15 | ITERS_PER_UPDATE = 10
16 | NOISE_STD = 0.01
17 | LR = 1e-3
18 | PROCESSES_COUNT = 8 # number of worker processes
19 | HIDDEN_SIZE = 4
20 | ENV_NAME = "CartPole-v0"
21 | RewardsItem = collections.namedtuple('RewardsItem', field_names=['seed', 'pos_reward', 'neg_reward', 'steps'])
22 |
23 |
24 |
25 | class Model(object):
26 |
27 | def __init__(self, stateCnt, actionCnt, hidden_size = HIDDEN_SIZE):
28 | # inits zero weights
29 | self.weights = [np.zeros(shape=(stateCnt, hidden_size)), np.zeros(shape=(hidden_size, hidden_size)), np.zeros(shape=(hidden_size,actionCnt))]
30 |
31 | def predict(self, inp):
32 | out = np.expand_dims(inp.flatten(), 0)
33 | out = out / np.linalg.norm(out)
34 | weight_len = len(self.weights)
35 | for idx, layer in enumerate(self.weights):
36 | # hidden activation
37 | if idx < weight_len - 1:
38 | out = self.activation(np.dot(out, layer))
39 | # output activation
40 | else:
41 | out = self.activation(np.dot(out, layer), type_="output_layer")
42 | return out[0]
43 |
44 | def activation(self,x, type_="hidden"):
45 | if type_ == "hidden":
46 | # relu
47 | return np.maximum(x,0)
48 |
49 | # softmax
50 | #return (np.exp(x))/sum(np.exp(x))
51 |
52 | #softplus
53 | #return np.log(1 + np.exp(x))
54 |
55 | #sigmoid
56 | #return 1/(1+np.exp(-x))
57 |
58 | # tanh
59 | #return np.tanh(x)
60 | else:
61 | # softmax
62 | #return (np.exp(x))/sum(np.exp(x))
63 |
64 | # relu
65 | return np.maximum(x,0)
66 |
67 | def get_weights(self):
68 | return self.weights
69 |
70 | def set_weights(self, weights):
71 | self.weights = weights
72 |
73 |
74 | def evaluate(env, brain):
75 | """
76 | Runs an evaluation on the given brain.
77 | """
78 | state = env.reset()
79 | rewards = 0
80 | steps = 0
81 | while True:
82 | state = np.expand_dims(state, axis=0)
83 | action_prob = brain.predict(state)
84 | action = action_prob.argmax() # for discrete action space
85 |
86 | next_state, reward, done, _ = env.step(action)
87 | rewards += reward
88 | steps += 1
89 | state = next_state
90 | if done:
91 | break
92 |
93 | return rewards, steps
94 |
95 |
96 | def sample_noise(brain):
97 | """
98 | Samples Gaussian noise into a positive and a negative noise buffer (mirrored sampling).
99 | """
100 | pos = []
101 | neg = []
102 | for param in brain.get_weights():
103 | noise_t = np.random.normal(size = param.shape)
104 | pos.append(noise_t)
105 | neg.append(-noise_t)
106 | return pos, neg
107 |
108 |
109 | def eval_with_noise(env, brain, noise, noise_std):
110 | """
111 | Evaluates the current brain with added parameter noise
112 |
113 | """
114 |
115 | old_params = copy.deepcopy(brain.get_weights())
116 | new_params = []
117 | for p, p_n in zip(brain.get_weights(), noise):
118 | p += noise_std*p_n
119 | new_params.append(p)
120 | brain.set_weights(new_params)
121 | r, s = evaluate(env, brain)
122 | brain.set_weights(old_params)
123 | return r, s
124 |
125 |
126 | def worker_func(worker_id, params_queue, rewards_queue, noise_std):
127 | #print("worker: {} has started".format(worker_id))
128 | env = gym.make(ENV_NAME)
129 | net = Model(env.observation_space.shape[0], env.action_space.n)
130 |
131 | while True:
132 | params = params_queue.get()
133 | if params is None:
134 | break
135 |
136 | # set parameters received from the queue (equivalent to net.load_state_dict(params))
137 | net.set_weights([param for param in params])
138 |
139 | for _ in range(ITERS_PER_UPDATE):
140 | seed = np.random.randint(low=0, high=65535)
141 | np.random.seed(seed)
142 | noise, neg_noise = sample_noise(net)
143 | pos_reward, pos_steps = eval_with_noise(env, net, noise, noise_std)
144 | neg_reward, neg_steps = eval_with_noise(env, net, neg_noise, noise_std)
145 | #print(_, "\n",noise, pos_reward, neg_reward)
146 |
147 | rewards_queue.put(RewardsItem(seed=seed, pos_reward=pos_reward, neg_reward=neg_reward, steps=pos_steps+neg_steps))
148 |
149 | pass
150 |
151 |
152 | def train_step(brain, batch_noise, batch_rewards, step_idx):
153 | """
154 | Optimizes the weights of the NN based on the rewards and noise gathered
155 | """
156 | # normalize rewards to have zero mean and unit variance
157 | norm_reward = np.array(batch_rewards)
158 | norm_reward -= np.mean(norm_reward)
159 | s = np.std(norm_reward)
160 | if abs(s) > 1e-6:
161 | norm_reward /= s
162 |
163 | weighted_noise = None
164 | for noise, reward in zip(batch_noise, norm_reward):
165 | if weighted_noise is None:
166 | weighted_noise = [reward * p_n for p_n in noise]
167 | else:
168 | for w_n, p_n in zip(weighted_noise, noise):
169 | w_n += reward * p_n
170 |
171 |
172 | for p, p_update in zip(brain.get_weights(), weighted_noise):
173 | update = p_update / (len(batch_rewards)*NOISE_STD)
174 | p += LR * update
175 |
176 |
177 |
178 | if __name__ == "__main__":
179 |
180 | env = gym.make(ENV_NAME)
181 | #env.seed(2)
182 | brain = Model(env.observation_space.shape[0], env.action_space.n)
183 |
184 | iterations = 100 # max iterations to run
185 |
186 | params_queues = [mp.Queue(maxsize=1) for _ in range(PROCESSES_COUNT)]
187 | rewards_queue = mp.Queue(maxsize=ITERS_PER_UPDATE)
188 |
189 |
190 | workers = []
191 |
192 |
193 | for idx, params_queue in enumerate(params_queues):
194 | proc = mp.Process(target=worker_func, args=(idx, params_queue, rewards_queue, NOISE_STD))
195 | proc.start()
196 | workers.append(proc)
197 |
198 | print("All started!")
199 | step_idx = 0
200 | reward_history = []
201 | reward_max =[]
202 | reward_std = []
203 |
204 |
205 | for step_idx in range(iterations):
206 | # broadcasting network params
207 | params = brain.get_weights()
208 | for q in params_queues:
209 | q.put(params)
210 |
211 | batch_noise = []
212 | batch_reward = []
213 | batch_steps_data = []
214 | batch_steps = 0
215 | results = 0
216 | while True:
217 | while not rewards_queue.empty():
218 | reward = rewards_queue.get_nowait()
219 | np.random.seed(reward.seed) # re-seed with the worker's seed so the identical noise can be regenerated here
220 | noise, neg_noise = sample_noise(brain)
221 | batch_noise.append(noise)
222 | batch_reward.append(reward.pos_reward)
223 | batch_noise.append(neg_noise)
224 | batch_reward.append(reward.neg_reward)
225 | results += 1
226 | batch_steps += reward.steps
227 |
228 | if results == PROCESSES_COUNT * ITERS_PER_UPDATE:
229 | break
230 |
231 | step_idx += 1
232 | m_reward = np.mean(batch_reward)
233 | reward_history.append(m_reward)
234 | reward_max.append(np.max(batch_reward))
235 | reward_std.append(np.std(batch_reward))
236 | if m_reward > 199:
237 | print("\nSolved the environment in {} steps".format(step_idx))
238 | break
239 | train_step(brain, batch_noise, batch_reward, step_idx)
240 |
241 | print("\rStep: {}, Mean_Reward: {:.2f}".format(step_idx, m_reward), end = "", flush = True)
242 |
243 |
244 | for worker, p_queue in zip(workers, params_queues):
245 | p_queue.put(None)
246 | worker.join()
247 |
248 | plt.figure(figsize = (11,7))
249 | plt.plot(reward_history, label = "Mean Reward", color = "orange")
250 | plt.plot(reward_max, label = "Max Reward", color = "blue")
251 | plt.plot(reward_std, label = "Reward std", color = "green")
252 | plt.xlabel("Steps")
253 | plt.ylabel("Rewards")
254 | plt.legend()
255 | plt.show()
--------------------------------------------------------------------------------
/Black-Box Optimization/Evolution_Strategies_parallel+novelty/ES_disc_parallel_novelty.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Oct 9 10:24:39 2019
4 |
5 | @author: Z0014354
6 | """
7 | import torch
8 | import torch.nn as nn
9 | import torch.nn.functional as F
10 | from torch.distributions import Normal
11 |
12 | import numpy as np
13 | import matplotlib.pyplot as plt
14 | import gym
15 | import torch.multiprocessing as mp
16 | import collections
17 | from collections import deque
18 | import copy
19 | from tensorboardX import SummaryWriter
20 | from sklearn.neighbors import NearestNeighbors
21 |
22 | ITERS_PER_UPDATE = 10
23 | NOISE_STD = 0.1 # 0.04; a higher std leads to more exploration and more stable learning
24 | LR = 2e-2
25 | PROCESSES_COUNT = 6 # number of worker processes (default 6)
26 | HIDDEN_SIZE = 5 # 6
27 | K_NEIGHBORS = 10
28 | ENV_NAME = "CartPole-v0" #"Alien-ram-v0"
29 | RewardsItem = collections.namedtuple('RewardsItem', field_names=['seed', 'pos_reward', 'neg_reward', 'steps'])
30 |
31 |
32 |
33 | class Model(nn.Module):
34 | def __init__(self, state_size, action_size, idx, hidden_size=HIDDEN_SIZE):
35 | super(Model, self).__init__()
36 | self.idx = idx
37 | self.fc1 = nn.Linear(state_size, hidden_size)
38 | self.fc2 = nn.Linear(hidden_size, hidden_size)
39 | self.fc3 = nn.Linear(hidden_size, action_size)
40 |
41 | def forward(self, x):
42 | x = torch.relu(self.fc1(x))
43 | x = torch.relu(self.fc2(x))
44 | probs = torch.softmax(self.fc3(x), dim=1)
45 | return probs
46 |
47 |
48 | def evaluate(env, brain):
49 | """
50 | Runs an evaluation on the given brain.
51 | """
52 | state = env.reset()
53 | rewards = 0
54 | steps = 0
55 | while True:
56 | state = torch.from_numpy(state).unsqueeze(0).float()
57 | probs = brain(state)
58 | action = probs.max(dim = 1)[1]
59 | next_state, reward, done, _ = env.step(action.data.numpy()[0])
60 | rewards += reward
61 | steps += 1
62 | state = next_state
63 | if done:
64 | break
65 |
66 | return rewards, steps
67 |
68 |
69 | def sample_noise(brain):
70 | """
71 | Samples noise from a normal distribution in the shape of the brain parameters. Outputs two noise lists: +noise and -noise (mirrored sampling for more stable learning).
72 | """
73 | pos = []
74 | neg = []
75 | for param in brain.parameters():
76 | noise_t = torch.tensor(np.random.normal(size = param.data.size()).astype(np.float32))
77 | pos.append(noise_t)
78 | neg.append(-noise_t)
79 | return pos, neg
80 |
81 |
82 | def eval_with_noise(env, brain, noise, noise_std):
83 | """
84 | Evaluates the current brain with added parameter noise
85 |
86 | """
87 | for p, p_n in zip(brain.parameters(), noise):
88 | p.data += noise_std * p_n
89 | r, s = evaluate(env, brain)
90 | for p, p_n in zip(brain.parameters(), noise):
91 | p.data -= noise_std * p_n
92 | return r, s
93 |
94 |
95 | def worker_func(worker_id, params_queue, rewards_queue, noise_std):
96 | """
97 | Worker function that gathers positive and negative rewards for the optimization process and puts them in the rewards_queue together with the noise seed:
98 | >> rewards_queue.put(RewardsItem(seed=seed, pos_reward=pos_reward, neg_reward=neg_reward, steps=pos_steps+neg_steps)) <<
99 | """
100 | #print("worker: {} has started".format(worker_id))
101 | env = gym.make(ENV_NAME)
102 | net = Model(env.observation_space.shape[0], env.action_space.n, "worker")
103 | net.eval()
104 | while True:
105 | params = params_queue.get()
106 | if params is None:
107 | break
108 |
109 | # set parameters received from the queue
110 | net.load_state_dict(params)
111 |
112 | for _ in range(ITERS_PER_UPDATE):
113 | seed = np.random.randint(low=0, high=65535)
114 | np.random.seed(seed)
115 | noise, neg_noise = sample_noise(net)
116 | pos_reward, pos_steps = eval_with_noise(env, net, noise, noise_std)
117 | neg_reward, neg_steps = eval_with_noise(env, net, neg_noise, noise_std)
118 | #print(_, "\n",noise, pos_reward, neg_reward)
119 | rewards_queue.put(RewardsItem(seed=seed, pos_reward=pos_reward, neg_reward=neg_reward, steps=pos_steps+neg_steps))
120 |
121 | pass
122 |
123 |
124 | def train_step(brain, novelty, batch_noise, batch_rewards, step_idx):
125 | """
126 | Optimizes the weights of the NN based on the rewards and noise gathered
127 | """
128 | # normalize rewards to have zero mean and unit variance
129 | norm_reward = np.array(batch_rewards)
130 | norm_reward -= np.mean(norm_reward)
131 | s = np.std(norm_reward)
132 | if abs(s) > 1e-6:
133 | norm_reward /= s
134 |
135 | weighted_noise = None
136 | for noise, reward in zip(batch_noise, norm_reward):
137 | if weighted_noise is None:
138 | weighted_noise = [(W*reward* p_n) + ((1-W)*novelty*p_n) for p_n in noise] # combining reward and novelty
139 | else:
140 | for w_n, p_n in zip(weighted_noise, noise):
141 | w_n += (W*reward* p_n) + ((1-W)*novelty*p_n)
142 |
143 |
144 | for p, p_update in zip(brain.parameters(), weighted_noise):
145 | update = p_update / (len(batch_rewards)*NOISE_STD)
146 | p.data += LR * update
147 |
148 |
149 | def test_current_params(env, net):
150 | """
151 | Runs the current network parameters on the env to visually monitor the progress.
152 | """
153 | state = env.reset()
154 |
155 | while True:
156 | env.render()
157 | state = torch.from_numpy(state).unsqueeze(0).float()
158 | probs = net(state)
159 | action = probs.max(dim = 1)[1]
160 | state, reward, done, _ = env.step(action.data.numpy()[0])
161 |
162 | if done:
163 | break
164 |
165 | def get_behavior_char(env, net):
166 | """
167 | Returns the behavior characterization value b_pi for a network.
168 | Here it is defined as the number of steps the agent takes in one episode (the final state would be another option).
169 |
170 | >>> It is important to find a good behavior characterization; it depends on the environment! <<< -> final state, step count ...
171 |
172 | """
173 | state = env.reset()
174 | step_count = 0
175 | while True:
176 | state = torch.from_numpy(state).unsqueeze(0).float()
177 | probs = net(state)
178 | action = probs.max(dim = 1)[1]
179 | state, reward, done, _ = env.step(action.data.numpy()[0])
180 | step_count += 1
181 | if done:
182 | break
183 | #print(step_count)
184 | return np.array([step_count]) #state
185 |
186 |
187 | def get_kNN(archive, bc, n_neighbors):
188 | """
189 | Finds the k nearest neighbors of a new behavior characterization in the archive and
190 | returns the summed distance between the input behavior characterization and those neighbors.
191 |
192 | """
193 |
194 | archive = np.concatenate(archive)
195 | neigh = NearestNeighbors(n_neighbors=n_neighbors)
196 | neigh.fit(archive)
197 | distances, idx = neigh.kneighbors(X = bc, n_neighbors=n_neighbors)
198 | #k_nearest_neighbors = archive[idx].squeeze(0)
199 |
200 | return sum(distances.squeeze(0))
201 |
202 |
203 |
204 | # =============================================================================
205 | # def calc_novelty(b_pi_theta, archive):
206 | # """
207 | # calculates the novelty of a given arcive of behavior characterizations.
208 | # returns the mean distance between the initial behavior characterizations and all new gathered behavior characterizations.
209 | # """
210 | # # distance loss function:
211 | # distance = nn.MSELoss() #nn.PairwiseDistance()
212 | # # creates arcive vector for distance calc
213 | # archive_v = torch.cat(archive)
214 | # # create a vector of initial behavior characterizations in the shape of the arcive length
215 | # b_pi_theta_v = torch.cat([b_pi_theta for i in range(len(archive))])
216 | #
217 | # return torch.sqrt(distance(b_pi_theta_v, archive_v)).mean()
218 | # =============================================================================
219 |
220 | def calc_noveltiy_distribution(novelties):
221 | """
222 | Calculates, for each model in the meta population, the probability of being selected:
223 | its novelty normalized by the sum of novelties across all policies:
224 |
225 | P(theta_m) for each element in the meta_population M - m element M
226 |
227 | """
228 | probabilities = [round((novel/(sum(novelties))),4) for novel in novelties]
229 | return probabilities
230 |
231 |
232 | if __name__ == "__main__":
233 |
234 | env = gym.make(ENV_NAME)
235 | #env.seed(2)
236 | MPS = 2 # meta population size
237 | meta_population = [Model(env.observation_space.shape[0],env.action_space.n, idx=i) for i in range(MPS)]
238 |
239 | # create archive for models
240 | archive = []
241 | writer = SummaryWriter()
242 | iterations = 300 #1500 # max iterations to run
243 |
244 | delta_reward_buffer = deque(maxlen=10) # buffer that stores recent mean rewards to detect whether rewards stay constant over a defined time horizon ~> local minimum
245 | W = 1
246 |
247 | params_queues = [mp.Queue(maxsize=1) for _ in range(PROCESSES_COUNT)]
248 | rewards_queue = mp.Queue(maxsize=ITERS_PER_UPDATE)
249 | workers = []
250 |
251 | for idx, params_queue in enumerate(params_queues):
252 | proc = mp.Process(target=worker_func, args=(idx, params_queue, rewards_queue, NOISE_STD))
253 | proc.start()
254 | workers.append(proc)
255 |
256 | print("All started!")
257 | step_idx = 0
258 | reward_history = []
259 | reward_max =[]
260 | reward_min = []
261 | reward_std = []
262 |
263 | old_m_reward = 0
264 |
265 | for step_idx in range(iterations):
266 |
267 | ########################## NOVELTY BRAIN SELECTION #############################
268 | # select new network from the meta population based on its probability:
269 | if len(archive) > 0:
270 | novelties = []
271 | S = np.minimum(K_NEIGHBORS, len(archive))
272 | for model in meta_population:
273 | b_pi_theta = torch.from_numpy(get_behavior_char(env, model)).unsqueeze(0).float()
274 | distance = get_kNN(archive, b_pi_theta.numpy(), S)
275 | novelty = distance / S
276 | if novelty <= 1e-3:
277 | novelty = 5e-3
278 | novelties.append(novelty)
279 |
280 | #print("novelties:", novelties)
281 |
282 | probs = calc_noveltiy_distribution(novelties)
283 | #print("probs: ", probs )
284 | probs = np.array(probs)
285 | probs /= probs.sum() # normalize so the probabilities sum to one (np.random.choice would otherwise raise an error due to rounding)
286 | brain_idx = np.random.choice(list(range(MPS)),p=probs) # select new brain based on novelty probabilities
287 | brain = meta_population[brain_idx]
288 | novelty = novelties[brain_idx]
289 | else:
290 | brain_idx = np.random.randint(0, MPS)
291 | brain = meta_population[brain_idx]
292 | novelty = 1
293 | ###################################################################################
294 |
295 | # broadcasting network params
296 | params = brain.state_dict()
297 | for q in params_queues:
298 | q.put(params)
299 |
300 | batch_noise = []
301 | batch_reward = []
302 | batch_steps_data = []
303 | batch_steps = 0
304 | results = 0
305 |
306 | while True:
307 | #print(rewards_queue.qsize())
308 | while not rewards_queue.empty():
309 | reward = rewards_queue.get_nowait()
310 | np.random.seed(reward.seed) # re-seed with the worker's seed so the identical noise can be regenerated here
311 | noise, neg_noise = sample_noise(brain)
312 | batch_noise.append(noise)
313 | batch_reward.append(reward.pos_reward)
314 | batch_noise.append(neg_noise)
315 | batch_reward.append(reward.neg_reward)
316 | results += 1
317 | batch_steps += reward.steps
318 |
319 | if results == PROCESSES_COUNT * ITERS_PER_UPDATE:
320 | break
321 |
322 | step_idx += 1
323 | m_reward = np.mean(batch_reward)
324 |
325 | reward_gradient_mean = np.mean(delta_reward_buffer)
326 | r_koeff = abs(m_reward - reward_gradient_mean)
327 | # if the last few rewards are almost constant -> likely stuck in a local minimum -> decrease W to give novelty (exploration) a higher weight
328 | if r_koeff < 1.5:
329 | W = np.maximum(0, W-0.05)
330 | else:
331 | W = np.minimum(1, W+0.05)
332 | delta_reward_buffer.append(m_reward)
333 | old_m_reward = m_reward
334 |
335 | writer.add_scalar("mean_reward", np.mean(batch_reward), step_idx)
336 | writer.add_scalar("max_reward", np.max(batch_reward), step_idx)
337 | writer.add_scalar("min_reward", np.min(batch_reward), step_idx)
338 | writer.add_scalar("std", np.std(batch_reward), step_idx)
339 | writer.add_scalar("novelty", novelty, step_idx)
340 | writer.add_scalar("novelty_w", W, step_idx)
341 | # =============================================================================
342 | # if m_reward > -250:
343 | # print("\nSolved the environment in {} steps".format(step_idx))
344 | # break
345 | # =============================================================================
346 | train_step(brain, novelty, batch_noise, batch_reward, step_idx)
347 | # select new behavior:
348 | b_pix = torch.from_numpy(get_behavior_char(env, brain)).unsqueeze(0).float()
349 | # append new behavior to specific brain archive
350 | archive.append(b_pix.numpy())
351 |
352 | print("\rStep: {}, Mean_Reward: {:.2f}, Novelty: {:.2f}, W: {:.2f} r_koeff: {:.2f}".format(step_idx, m_reward, novelty, W, r_koeff), end = "", flush = True)
353 |
354 | # if step_idx % 10 == 0:
355 | # test_current_params(env, brain)
356 |
357 | for worker, p_queue in zip(workers, params_queues):
358 | p_queue.put(None)
359 | worker.join()
360 |
--------------------------------------------------------------------------------
/Black-Box Optimization/Evolution_Strategies_parallel+novelty/ES_own_conti_parallel.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Oct 9 10:24:39 2019
4 |
5 | @author: Z0014354
6 | """
7 |
8 | import numpy as np
9 | import matplotlib.pyplot as plt
10 | import gym
11 | import multiprocessing as mp
12 | import collections
13 | import copy
14 |
15 | ITERS_PER_UPDATE = 10
16 | NOISE_STD = 0.1 # 0.04; a higher std leads to more exploration and more stable learning
17 | LR = 2e-2
18 | PROCESSES_COUNT = 6 # number of worker processes (default 6)
19 | HIDDEN_SIZE = 12 # 6
20 | ENV_NAME = "Damper-v0"
21 | RewardsItem = collections.namedtuple('RewardsItem', field_names=['seed', 'pos_reward', 'neg_reward', 'steps'])
22 |
23 |
24 |
25 | class Model(object):
26 |
27 | def __init__(self, stateCnt, actionCnt, hidden_size = HIDDEN_SIZE):
28 | # inits random uniform weights in [-1, 1]
29 | self.weights = [np.random.uniform(-1,1,size=(stateCnt, hidden_size)), np.random.uniform(-1,1, size=(hidden_size, hidden_size)), np.random.uniform(-1,1,size=(hidden_size,actionCnt))]
30 |
31 | def predict(self, inp):
32 | out = np.expand_dims(inp.flatten(), 0)
33 | #out = out / np.linalg.norm(out)
34 | weight_len = len(self.weights)
35 | for idx, layer in enumerate(self.weights):
36 | # hidden activation
37 | if idx < weight_len - 1:
38 | out = self.activation(np.dot(out, layer))
39 | # output activation
40 | else:
41 | out = self.activation(np.dot(out, layer), type_="output_layer")
42 | return out[0]
43 |
44 | def activation(self,x, type_="hidden"):
45 | if type_ == "hidden":
46 | # relu
47 | return np.maximum(x,0)
48 |
49 | # softmax
50 | #return (np.exp(x))/sum(np.exp(x))
51 |
52 | #softplus
53 | #return np.log(1 + np.exp(x))
54 |
55 | #sigmoid
56 | #return 1/(1+np.exp(-x))
57 |
58 | # tanh
59 | #return np.tanh(x)
60 | else:
61 | # tanh
62 | return np.tanh(x)
63 |
64 | # relu
65 | #return np.maximum(x,0)
66 |
67 | def get_weights(self):
68 | return self.weights
69 |
70 | def set_weights(self, weights):
71 | self.weights = weights
72 |
73 |
74 | def evaluate(env, brain):
75 | """
76 | Runs an evaluation on the given brain.
77 | """
78 | state = env.reset()
79 | rewards = 0
80 | steps = 0
81 | while True:
82 | state = np.expand_dims(state, axis=0)
83 | #print("State:", state)
84 | action_mean = brain.predict(state)
85 | action = np.random.normal(action_mean, scale=0.01)
86 | action = np.clip(action, -1, 1) # clip to the normalized action range [-1, 1]
87 | next_state, reward, done, _ = env.step(action)
88 | rewards += reward
89 | steps += 1
90 | state = next_state
91 | if done:
92 | break
93 |
94 | return rewards, steps
95 |
96 |
97 | def sample_noise(brain):
98 | """
99 | Samples noise from a normal distribution in the shape of the brain parameters. Outputs two noise lists: +noise and -noise (mirrored sampling for more stable learning).
100 | """
101 | pos = []
102 | neg = []
103 | for param in brain.get_weights():
104 | noise_t = np.random.normal(size = param.shape)
105 | pos.append(noise_t)
106 | neg.append(-noise_t)
107 | return pos, neg
108 |
109 |
110 | def eval_with_noise(env, brain, noise, noise_std):
111 | """
112 | Evaluates the current brain with added parameter noise
113 |
114 | """
115 | old_params = copy.deepcopy(brain.get_weights())
116 | new_params = []
117 | for p, p_n in zip(brain.get_weights(), noise):
118 | p += noise_std*p_n
119 | new_params.append(p)
120 | brain.set_weights(new_params)
121 | r, s = evaluate(env, brain)
122 | brain.set_weights(old_params)
123 | return r, s
124 |
125 |
126 | def worker_func(worker_id, params_queue, rewards_queue, noise_std):
127 | """
128 | Worker function that gathers positive and negative rewards for the optimization process and puts them in the rewards_queue together with the noise seed:
129 | >> rewards_queue.put(RewardsItem(seed=seed, pos_reward=pos_reward, neg_reward=neg_reward, steps=pos_steps+neg_steps)) <<
130 | """
131 | #print("worker: {} has started".format(worker_id))
132 | env = gym.make(ENV_NAME)
133 | net = Model(env.observation_space.shape[0], env.action_space.shape[0])
134 |
135 | while True:
136 | params = params_queue.get()
137 | if params is None:
138 | break
139 |
140 | # set parameters received from the queue (equivalent to net.load_state_dict(params))
141 | net.set_weights([param for param in params])
142 |
143 | for _ in range(ITERS_PER_UPDATE):
144 | seed = np.random.randint(low=0, high=65535)
145 | np.random.seed(seed)
146 | noise, neg_noise = sample_noise(net)
147 | pos_reward, pos_steps = eval_with_noise(env, net, noise, noise_std)
148 | neg_reward, neg_steps = eval_with_noise(env, net, neg_noise, noise_std)
149 | #print(_, "\n",noise, pos_reward, neg_reward)
150 | rewards_queue.put(RewardsItem(seed=seed, pos_reward=pos_reward, neg_reward=neg_reward, steps=pos_steps+neg_steps))
151 |
152 | pass
153 |
154 |
155 | def train_step(brain, batch_noise, batch_rewards, step_idx):
156 | """
157 | Optimizes the weights of the NN based on the rewards and noise gathered
158 | """
159 | # normalize rewards to have zero mean and unit variance
160 | norm_reward = np.array(batch_rewards)
161 | norm_reward -= np.mean(norm_reward)
162 | s = np.std(norm_reward)
163 | if abs(s) > 1e-6:
164 | norm_reward /= s
165 |
166 | weighted_noise = None
167 | for noise, reward in zip(batch_noise, norm_reward):
168 | if weighted_noise is None:
169 | weighted_noise = [reward * p_n for p_n in noise]
170 | else:
171 | for w_n, p_n in zip(weighted_noise, noise):
172 | w_n += reward * p_n
173 |
174 |
175 | for p, p_update in zip(brain.get_weights(), weighted_noise):
176 | update = p_update / (len(batch_rewards)*NOISE_STD)
177 | p += LR * update
178 |
179 |
180 | def test_current_params(env, brain):
181 | """
182 | Runs the current network parameters on the env to visually monitor the progress.
183 | """
184 | state = env.reset()
185 |
186 | while True:
187 | env.render()
188 | state = np.expand_dims(state, axis=0)
189 | action_mean = brain.predict(state)
190 | action = np.random.normal(action_mean, scale=0.01)
191 | action = np.clip(action, -1, 1) # clip to the normalized action range [-1, 1]
192 | state, reward, done, _ = env.step(action)
193 |
194 | if done:
195 | break
196 |
197 |
198 | if __name__ == "__main__":
199 |
200 | env = gym.make(ENV_NAME)
201 | #env.seed(2)
202 | brain = Model(env.observation_space.shape[0], env.action_space.shape[0])
203 |
204 | iterations = 100 #1500 # max iterations to run
205 |
206 | params_queues = [mp.Queue(maxsize=1) for _ in range(PROCESSES_COUNT)]
207 | rewards_queue = mp.Queue(maxsize=ITERS_PER_UPDATE)
208 | workers = []
209 |
210 | for idx, params_queue in enumerate(params_queues):
211 | proc = mp.Process(target=worker_func, args=(idx, params_queue, rewards_queue, NOISE_STD))
212 | proc.start()
213 | workers.append(proc)
214 |
215 | print("All started!")
216 | step_idx = 0
217 | reward_history = []
218 | reward_max =[]
219 | reward_min = []
220 | reward_std = []
221 |
222 | for step_idx in range(iterations):
223 | # broadcasting network params
224 | params = brain.get_weights()
225 | for q in params_queues:
226 | q.put(params)
227 |
228 | batch_noise = []
229 | batch_reward = []
230 | batch_steps_data = []
231 | batch_steps = 0
232 | results = 0
233 |
234 | while True:
235 | while not rewards_queue.empty():
236 | reward = rewards_queue.get_nowait()
237 | np.random.seed(reward.seed) # re-seed with the worker's seed so the identical noise can be regenerated here
238 | noise, neg_noise = sample_noise(brain)
239 | batch_noise.append(noise)
240 | batch_reward.append(reward.pos_reward)
241 | batch_noise.append(neg_noise)
242 | batch_reward.append(reward.neg_reward)
243 | results += 1
244 | batch_steps += reward.steps
245 |
246 | if results == PROCESSES_COUNT * ITERS_PER_UPDATE:
247 | break
248 |
249 | step_idx += 1
250 | m_reward = np.mean(batch_reward)
251 | reward_history.append(m_reward)
252 | reward_max.append(np.max(batch_reward))
253 | reward_min.append(np.min(batch_reward))
254 | reward_std.append(np.std(batch_reward))
255 | # =============================================================================
256 | # if m_reward > -250:
257 | # print("\nSolved the environment in {} steps".format(step_idx))
258 | # break
259 | # =============================================================================
260 | train_step(brain, batch_noise, batch_reward, step_idx)
261 |
262 | print("\rStep: {}, Mean_Reward: {:.2f}".format(step_idx, m_reward), end = "", flush = True)
263 |
264 | if step_idx % 10 == 0:
265 | test_current_params(env, brain)
266 |
267 | for worker, p_queue in zip(workers, params_queues):
268 | p_queue.put(None)
269 | worker.join()
270 |
271 | plt.figure(figsize = (11,7))
272 | plt.plot(reward_history, label = "Mean Reward", color = "green")
273 | plt.plot(reward_max, label = "Max Reward", color = "blue")
274 | plt.plot(reward_min, label = "Min Reward", color = "red")
275 | plt.plot(reward_std, label = "Reward std", color = "orange")
276 | plt.xlabel("Steps")
277 | plt.ylabel("Rewards")
278 | plt.legend()
279 | plt.show()
--------------------------------------------------------------------------------
/Black-Box Optimization/Evolution_Strategies_parallel+novelty/README.md:
--------------------------------------------------------------------------------
1 | # Evolution Strategy algorithms
2 |
3 |
4 |
5 | This folder contains three different Evolution Strategy algorithms (the update rule they share is sketched below the list):
6 |
7 | - [ES_baseline_parallel.py](ES_baseline_parallel.py)
8 |
9 | Baseline Evolution Strategy algorithm for discrete action spaces that solves the CartPole environment.
10 | Code is based on this [paper](https://arxiv.org/abs/1703.03864) and on this [book chapter](https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/tree/master/Chapter16).
11 | Written solely in NumPy!
12 |
13 |
14 | - [ES_disc_parallel_novelty.py](ES_disc_parallel_novelty.py)
15 |
16 | Evolution Strategy algorithm for discrete action spaces with a novelty search for extra exploration.
17 | Code is based on the [EvoStrategy paper](https://arxiv.org/abs/1703.03864), the [novelty-seeking-agents paper](http://papers.nips.cc/paper/7750-improving-exploration-in-evolution-strategies-for-deep-reinforcement-learning-via-a-population-of-novelty-seeking-agents) and [book chapter 16](https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/tree/master/Chapter16).
18 | Written in PyTorch.
19 |
20 | - [ES_own_conti_parallel.py](ES_own_conti_parallel.py)
21 |
22 | Evolution Strategy algorithm for continuous action spaces with parallelized workers.
23 | Code is based on the [EvoStrategy paper](https://arxiv.org/abs/1703.03864) and [book chapter 16](https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/tree/master/Chapter16).
24 | Written solely in NumPy.
25 |
26 |
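All three scripts use the same core update from the ES paper linked above: sample mirrored Gaussian noise around the current weights, evaluate both perturbations, and move the weights along the reward-weighted noise. Below is a minimal single-process sketch of that update (illustrative only; `evaluate` is an assumed helper that returns the episode return for a given set of weights):

```python
import numpy as np

def es_update(weights, evaluate, noise_std=0.1, lr=1e-2, samples=50):
    """One ES step with mirrored sampling over a list of numpy weight arrays."""
    batch_noise, batch_rewards = [], []
    for _ in range(samples):
        noise = [np.random.normal(size=w.shape) for w in weights]
        for sign in (+1.0, -1.0):                    # mirrored sampling
            perturbed = [w + sign * noise_std * n for w, n in zip(weights, noise)]
            batch_rewards.append(evaluate(perturbed))
            batch_noise.append([sign * n for n in noise])
    rewards = np.array(batch_rewards, dtype=np.float64)
    rewards -= rewards.mean()                        # normalize to zero mean ...
    if rewards.std() > 1e-6:
        rewards /= rewards.std()                     # ... and unit variance
    # gradient estimate per layer: sum_j r_j * eps_j / (N * sigma)
    for i, w in enumerate(weights):
        grad = sum(r * n[i] for r, n in zip(rewards, batch_noise))
        w += lr * grad / (len(rewards) * noise_std)
    return weights
```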
27 |
28 | Evolution Strategies solving Pendulum:
29 |
30 |
31 | 
32 |
33 |
--------------------------------------------------------------------------------
/Black-Box Optimization/Evolution_Strategies_parallel+novelty/imgs/pendulum.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Black-Box Optimization/Evolution_Strategies_parallel+novelty/imgs/pendulum.png
--------------------------------------------------------------------------------
/Black-Box Optimization/README.md:
--------------------------------------------------------------------------------
1 | # Black-Box Optimization
2 |
3 | ### Evolution Strategy
4 | #### Baseline implementation [ES_cartpole](https://github.com/BY571/Reinforcement-Learning/blob/master/Black-Box%20Optimization/Evolutionary_Strategies_Cartpole.ipynb)
5 |
6 |
7 |
8 | ### Genetic Algorithms
9 |
10 | #### Baseline implementation [GA_cartpole](https://github.com/BY571/Reinforcement-Learning/blob/master/Black-Box%20Optimization/genetic_algorithm_base.py)
11 | - run `python genetic_algorithm_base.py` with the flags `--noise`, `--ps`, and `--pc` (see the mutation/selection sketch below):
12 |   - `--noise`: standard deviation of the Gaussian noise that is added as the mutation of the neural network weights, default = 0.01
13 |   - `--ps`: the population size, default = 50
14 |   - `--pc`: the parents count, i.e. the number of top performers that build the new population, default = 10
15 |
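The training loop in `genetic_algorithm_base.py` keeps the single best network unchanged (elitism) and refills the population by copying a random parent from the top `--pc` performers and adding Gaussian parameter noise of scale `--noise`. A minimal sketch of one generation (illustrative; `evaluate` is an assumed helper that returns the episode reward of a network):

```python
import copy
import numpy as np
import torch

def next_generation(population, evaluate, parents_count=10, noise_std=0.05):
    """population: list of (net, fitness) tuples; returns the next generation."""
    population.sort(key=lambda p: p[1], reverse=True)    # best fitness first
    new_population = [population[0]]                     # keep the elite unchanged
    while len(new_population) < len(population):
        parent = population[np.random.randint(parents_count)][0]
        child = copy.deepcopy(parent)
        for param in child.parameters():                 # mutate: add Gaussian noise to every weight
            param.data += noise_std * torch.randn_like(param.data)
        new_population.append((child, evaluate(child)))
    return new_population
```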
16 | Example performance with noise_std = 0.05, ps=30, pc=10
17 |
18 | 
19 |
--------------------------------------------------------------------------------
/Black-Box Optimization/genetic_algorithm_base.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import torch
3 | import torch.nn as nn
4 | from torch.utils.tensorboard import SummaryWriter
5 | import numpy as np
6 | import argparse
7 | import gym
8 |
9 |
10 | class Network(nn.Module):
11 | def __init__(self,state_size,action_size,hidden_layer,seed):
12 | super(Network, self).__init__()
13 | self.state_size = state_size
14 | self.action_size = action_size
15 | self.hidden_layer = hidden_layer
16 |
17 | self.net = nn.Sequential(
18 | nn.Linear(self.state_size, self.hidden_layer),
19 | nn.ReLU(),
20 | nn.Linear(self.hidden_layer, self.hidden_layer),
21 | nn.ReLU(),
22 | nn.Linear(self.hidden_layer, self.action_size),
23 | nn.Softmax(dim = 1))
24 |
25 | def forward(self, x):
26 | return self.net(x)
27 |
28 | def evaluate(env, net):
29 | """
30 | Plays a round of the game and returns the obtained reward
31 | """
32 | state = env.reset()
33 | rewards = 0
34 | while True:
35 | state = torch.from_numpy(state).unsqueeze(0).float()
36 | action_prob = net(state)
37 | action = action_prob.max(dim=1)[1] #argmax
38 | next_state, reward, done, info = env.step(action.data.numpy()[0])
39 | rewards += reward
40 | state = next_state
41 | if done:
42 | break
43 | return rewards
44 |
45 | def mutate_parent(net):
46 | """
47 | Mutates the parent neural nets by adding noise sampled by a normal distribution.
48 |
49 | """
50 | new_net = copy.deepcopy(net)
51 | for parameter in new_net.parameters():
52 | noise = torch.tensor(np.random.normal(size=parameter.data.size()).astype(np.float32))
53 | parameter.data += NOISE_STD * noise
54 | return new_net
55 |
56 |
57 | if __name__ == "__main__":
58 | # parse input values like
59 | # - Noise standard deviation [NOISE_STD]
60 | # - Population size [POPULATION_SIZE]
61 | # - Parents count [PARENTS_COUNT]
62 |
63 | parser = argparse.ArgumentParser(description = "Noise, Population size, Parents count")
64 | parser.add_argument("--noise",type = float,default=1e-2)
65 | parser.add_argument( "--ps",type=int,default=50)
66 | parser.add_argument( "--pc",type=int,default=10)
67 |
68 | args = parser.parse_args()
69 | NOISE_STD = args.noise
70 | POPULATION_SIZE = args.ps
71 | PARENTS_COUNT = args.pc
72 |
73 | #print(f"Noise: {NOISE_STD}, PopS: {POPULATION_SIZE}, PARENTS_COUNT: {PARENTS_COUNT}")
74 | np.random.seed(seed=42)
75 | torch.manual_seed(42)
76 | writer = SummaryWriter(comment="-CartPole")
77 | env = gym.make("CartPole-v0")
78 | gen_idx = 0
79 | state_size = env.observation_space.shape[0]
80 | action_size = env.action_space.n
81 |
82 | nets = [Network(state_size, action_size, hidden_layer=32, seed=3) for _ in range(POPULATION_SIZE)]
83 | population = [(net, evaluate(env, net)) for net in nets]
84 |
85 | while True:
86 | population.sort(key=lambda p: p[1], reverse=True) # sorts the fitness from highest to lowest
87 | rewards = [p[1] for p in population[:PARENTS_COUNT]] # takes the fitness of the top x-parents
88 | reward_mean = np.mean(rewards)
89 | reward_max = np.max(rewards)
90 | reward_std = np.std(rewards)
91 |
92 | writer.add_scalar("reward_mean", reward_mean, gen_idx)
93 | writer.add_scalar("reward_max", reward_max, gen_idx)
94 | writer.add_scalar("reward_std", reward_std, gen_idx)
95 | print(f"Generation: {gen_idx} | Reward_mean: {reward_mean} | Reward_max: {reward_max} | Reward_std: {reward_std}")
96 |
97 | if reward_mean > 199:
98 | print("Solved the environment in {} generations".format(gen_idx))
99 | writer.close() # close the writer only once, when the environment is solved
100 | break
101 |
102 | prev_population = population
103 | population = [population[0]] # list of the nets
104 |
105 | for _ in range(POPULATION_SIZE-1):
106 | parent_idx = np.random.randint(0, PARENTS_COUNT) #sample the new population from the top x-parents
107 | parent = prev_population[parent_idx][0]
108 | net = mutate_parent(parent)
109 | population.append((net, evaluate(env, net)))
110 |
111 | gen_idx += 1
112 |
113 |
114 |
--------------------------------------------------------------------------------
/Black-Box Optimization/imgs/ga_cartpole.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Black-Box Optimization/imgs/ga_cartpole.png
--------------------------------------------------------------------------------
/ContinousControl/DDPG.py:
--------------------------------------------------------------------------------
1 | import random
2 | from tensorboardX import SummaryWriter
3 | import torch
4 | import torch.nn as nn
5 | import torch.optim as optim
6 | import numpy as np
7 | import roboschool
8 | import gym
9 | from gym import wrappers
10 | import pybullet_envs
11 | import time
12 |
13 | class NormalizedActions(gym.ActionWrapper):
14 |
15 | def _action(self, action):
16 | low_bound = self.action_space.low
17 | upper_bound = self.action_space.high
18 |
19 | action = low_bound + (action + 1.0) * 0.5 * (upper_bound - low_bound)
20 | action = np.clip(action, low_bound, upper_bound)
21 |
22 | return action
23 |
24 | def _reverse_action(self, action):
25 | low_bound = self.action_space.low
26 | upper_bound = self.action_space.high
27 |
28 | action = 2 * (action - low_bound) / (upper_bound - low_bound) - 1
29 | action = np.clip(action, low_bound, upper_bound)
30 |
31 | return action
32 |
33 | class ReplayBuffer:
34 | def __init__(self, capacity):
35 | self.capacity = capacity
36 | self.buffer = []
37 | self.position = 0
38 |
39 | def push(self, state, action, reward, next_state, done):
40 | if len(self.buffer) < self.capacity:
41 | self.buffer.append(None)
42 | self.buffer[self.position] = (state, action, reward, next_state, done)
43 | self.position = (self.position + 1) % self.capacity
44 |
45 | def sample(self, batch_size):
46 | batch = random.sample(self.buffer, batch_size)
47 | state, action, reward, next_state, done = map(np.stack, zip(*batch))
48 | return state, action, reward, next_state, done
49 |
50 | def __len__(self):
51 | return len(self.buffer)
52 |
53 | class OUNoise(object):
54 | def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):
55 | self.mu = mu # mean the process is pulled back to; 0.0 means no noise in the long run
56 | self.theta = theta
57 | self.sigma = max_sigma # scale (standard deviation) of the noise
58 | self.max_sigma = max_sigma
59 | self.min_sigma = min_sigma
60 | self.decay_period = decay_period
61 | self.action_dim = action_space.shape[0]
62 | self.low = action_space.low
63 | self.high = action_space.high
64 | self.reset()
65 |
66 | def reset(self):
67 | self.state = np.ones(self.action_dim) * self.mu
68 |
69 | def evolve_state(self):
70 | x = self.state
71 | dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
72 | self.state = x + dx
73 | return self.state
74 |
75 | def get_action(self, action, t=0):
76 | ou_state = self.evolve_state()
77 | self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
78 | return np.clip(action + ou_state, self.low, self.high), ou_state
79 |
80 | #https://github.com/vitchyr/rlkit/blob/master/rlkit/exploration_strategies/ou_strategy.py
81 |
82 |
83 | class Actor(nn.Module):
84 | def __init__(self, input_shape, action_shape):
85 | super(Actor, self).__init__()
86 | self.actor = nn.Sequential(nn.Linear(input_shape[0],400),
87 | nn.LayerNorm(400),
88 | nn.ReLU(),
89 | nn.Linear(400,300),
90 | nn.LayerNorm(300),
91 | nn.ReLU(),
92 | nn.Linear(300,action_shape[0]),
93 | nn.Tanh())
94 | def forward(self, x):
95 | state = torch.FloatTensor(x).to(device)
96 | return self.actor(state)
97 |
98 | class Critic(nn.Module):
99 | def __init__(self, input_shape, action_shape):
100 | super(Critic, self).__init__()
101 |
102 | self.critic1 = nn.Sequential(nn.Linear(input_shape[0],400),
103 | #nn.LayerNorm(256),
104 | nn.ReLU())
105 | self.critic2 = nn.Sequential(nn.Linear(400+ action_shape[0], 300),
106 | #nn.LayerNorm(256),
107 | nn.ReLU(),
108 | nn.Linear(300,1))
109 | def forward(self,state, action):
110 | x = self.critic1(state)
111 | comb = torch.cat([x,action], dim = 1)
112 | return self.critic2(comb)
113 |
114 | def update_and_optimize(batch_size):
115 | state, action, reward, next_state, done = replay_buffer.sample(batch_size)
116 | state_v = torch.FloatTensor(state).to(device) # shape[batch_size,3]
117 | action_v = torch.FloatTensor(action).to(device) # shape[batch_size,1]
118 | reward_v = torch.FloatTensor(reward).unsqueeze(1).to(device) # shape [batch_size,1]
119 | next_state_v = torch.FloatTensor(next_state).to(device) # shape [batch_size,3]
120 | done_v = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(device) # shape [batch_size,1]
121 |
122 | # update critic:
123 | critic_optim.zero_grad()
124 | Q_v = critic(state_v, action_v)
125 | next_action = target_actor(next_state).to(device)
126 | target_Q = target_critic(next_state_v, next_action.detach())
127 | discounted_target_Q = (reward_v + 0.99 * target_Q * (1.0 - done_v)).to(device)
128 | loss = critic_loss(Q_v, discounted_target_Q.detach())
129 | writer.add_scalar("Critic loss", loss, frame_idx)
130 | writer.add_scalar("Target_Q", target_Q.mean(), frame_idx)
131 | loss.backward()
132 | critic_optim.step()
133 |
134 | # update actor:
135 | actor_optim.zero_grad()
136 | current_action = actor(state_v.cpu())
137 | actor_loss = -critic(state_v, current_action.to(device)).mean()
138 | writer.add_scalar("Actor loss", actor_loss, frame_idx)
139 | actor_loss.backward()
140 | actor_optim.step()
141 |
142 | # Soft update of the target networks
143 | soft_tau = 0.01
144 | for target_param, param in zip(target_critic.parameters(), critic.parameters()):
145 | target_param.data.copy_(
146 | target_param.data * (1.0 - soft_tau) + param.data * soft_tau
147 | )
148 |
149 | for target_param, param in zip(target_actor.parameters(), actor.parameters()):
150 | target_param.data.copy_(
151 | target_param.data * (1.0 - soft_tau) + param.data * soft_tau
152 | )
153 |
154 |
155 | if __name__ == "__main__":
156 | start = time.time()
157 | # use cuda
158 | use_cuda = torch.cuda.is_available()
159 | device = torch.device("cuda" if use_cuda else "cpu")
160 |
161 | print("Device: ",device)
162 | ENV_ID = "HalfCheetahBulletEnv-v0" #HalfCheetahBulletEnv-v0 #MinitaurBulletEnv-v0
163 | env = gym.make(ENV_ID)
164 | #env = gym.make("RoboschoolHalfCheetah-v1") #RoboschoolHalfCheetah-v1
165 | env = wrappers.Monitor(env, "Saved_Videos/", resume=True, force = True, video_callable=lambda episode_id: episode_id% 5 ==0)
166 | #, video_callable=lambda x: True, force=True
167 | env = NormalizedActions(env)
168 |
169 | action_space = env.action_space.shape
170 | observation_space = env.observation_space.shape
171 |
172 | critic = Critic(observation_space, action_space).to(device)
173 | actor = Actor(observation_space, action_space).to(device)
174 | target_actor = Actor(observation_space, action_space).to(device) # separate target networks (aliasing the online nets would make the soft update a no-op)
175 | target_critic = Critic(observation_space, action_space).to(device)
176 | target_actor.load_state_dict(actor.state_dict())
177 | target_critic.load_state_dict(critic.state_dict())
178 | critic_optim = optim.Adam(critic.parameters(), lr = 0.001, weight_decay=1e-2)
179 | actor_optim = optim.Adam(actor.parameters(), lr = 0.0001)
180 |
181 | critic_loss = nn.MSELoss()
182 |
183 | replay_buffer_size = 1000000
184 | replay_buffer = ReplayBuffer(replay_buffer_size)
185 |
186 | writer = SummaryWriter()
187 |
188 | noise = OUNoise(env.action_space)
189 | batch_size = 128
190 | max_frames = 80000 #100000~32 min --300000 ~47 min
191 | frame_idx = 0
192 | rewards = []
193 |
194 | while frame_idx < max_frames:
195 | state = env.reset()
196 | noise.reset()
197 | ou_states = []
198 | episode_reward = 0
199 | done = False
200 | step = 0
201 | print("Training Progress: {:.2f}".format(frame_idx/max_frames *100))
202 | while not done:
203 | action = actor(state)
204 | action, ou_state = noise.get_action(action.cpu().detach().numpy(), frame_idx) #step
205 | ou_states.append(ou_state)
206 |
207 | next_state, reward, done, _ = env.step(action)
208 |
209 |
210 |
211 | replay_buffer.push(state, action, reward, next_state, done)
212 | if len(replay_buffer) > batch_size:# and frame_idx % 10 == 0:
213 | update_and_optimize(batch_size)
214 |
215 | state = next_state
216 | episode_reward += reward
217 | frame_idx += 1
218 | step += 1
219 |
220 |
221 | if done:
222 | writer.add_scalar("Rewards", episode_reward, frame_idx)
223 | writer.add_scalar("Steps", step, frame_idx)
224 | writer.add_scalar("OU_state", np.array(ou_states).mean(), frame_idx)
225 |
226 | end = time.time()
227 | writer.close()
228 | print("------------------------------\nTraining for {:.2f} minutes".format((end-start)/60))
229 |
--------------------------------------------------------------------------------
/ContinousControl/MultiPro.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Dec 4 10:31:28 2019
4 |
5 | @author: Z0014354
6 |
7 | """
8 |
9 | from multiprocessing import Process, Pipe
10 | import numpy as np
11 |
12 | def worker(remote, parent_remote, env_fn_wrapper):
13 | parent_remote.close()
14 | env = env_fn_wrapper.x()
15 | while True:
16 | cmd, data = remote.recv()
17 | if cmd == 'step':
18 | ob, reward, done, info = env.step(data)
19 | if done:
20 | ob = env.reset()
21 | remote.send((ob, reward, done, info))
22 | elif cmd == 'reset':
23 | ob = env.reset()
24 | remote.send(ob)
25 | elif cmd == 'reset_task':
26 | ob = env.reset_task()
27 | remote.send(ob)
28 | elif cmd == 'close':
29 | remote.close()
30 | break
31 | elif cmd == 'get_spaces':
32 | remote.send((env.observation_space, env.action_space))
33 | else:
34 | raise NotImplementedError
35 |
36 | class CloudpickleWrapper(object):
37 | """
38 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
39 | """
40 | def __init__(self, x):
41 | self.x = x
42 | def __getstate__(self):
43 | import cloudpickle
44 | return cloudpickle.dumps(self.x)
45 | def __setstate__(self, ob):
46 | import pickle
47 | self.x = pickle.loads(ob)
48 |
49 |
50 | class VecEnv(object):
51 | """
52 | An abstract asynchronous, vectorized environment.
53 | """
54 | def __init__(self, num_envs, observation_space, action_space):
55 | self.num_envs = num_envs
56 | self.observation_space = observation_space
57 | self.action_space = action_space
58 |
59 | def reset(self):
60 | """
61 | Reset all the environments and return an array of
62 | observations, or a tuple of observation arrays.
63 | If step_async is still doing work, that work will
64 | be cancelled and step_wait() should not be called
65 | until step_async() is invoked again.
66 | """
67 | pass
68 |
69 | def step_async(self, actions):
70 | """
71 | Tell all the environments to start taking a step
72 | with the given actions.
73 | Call step_wait() to get the results of the step.
74 | You should not call this if a step_async run is
75 | already pending.
76 | """
77 | pass
78 |
79 | def step_wait(self):
80 | """
81 | Wait for the step taken with step_async().
82 | Returns (obs, rews, dones, infos):
83 | - obs: an array of observations, or a tuple of
84 | arrays of observations.
85 | - rews: an array of rewards
86 | - dones: an array of "episode done" booleans
87 | - infos: a sequence of info objects
88 | """
89 | pass
90 |
91 | def close(self):
92 | """
93 | Clean up the environments' resources.
94 | """
95 | pass
96 |
97 | def step(self, actions):
98 | self.step_async(actions)
99 | return self.step_wait()
100 |
101 | class SubprocVecEnv(VecEnv):
102 | def __init__(self, env_fns, spaces=None):
103 | """
104 | envs: list of gym environments to run in subprocesses
105 | """
106 | self.waiting = False
107 | self.closed = False
108 | nenvs = len(env_fns)
109 | self.nenvs = nenvs
110 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
111 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
112 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
113 | for p in self.ps:
114 | p.daemon = True # if the main process crashes, we should not cause things to hang
115 | p.start()
116 | for remote in self.work_remotes:
117 | remote.close()
118 |
119 | self.remotes[0].send(('get_spaces', None))
120 | observation_space, action_space = self.remotes[0].recv()
121 | VecEnv.__init__(self, len(env_fns), observation_space, action_space)
122 |
123 | def step_async(self, actions):
124 | for remote, action in zip(self.remotes, actions):
125 | remote.send(('step', action))
126 | self.waiting = True
127 |
128 | def step_wait(self):
129 | results = [remote.recv() for remote in self.remotes]
130 | self.waiting = False
131 | obs, rews, dones, infos = zip(*results)
132 | return np.stack(obs), np.stack(rews), np.stack(dones), infos
133 |
134 | def reset(self):
135 | for remote in self.remotes:
136 | remote.send(('reset', None))
137 | return np.stack([remote.recv() for remote in self.remotes])
138 |
139 | def reset_task(self):
140 | for remote in self.remotes:
141 | remote.send(('reset_task', None))
142 | return np.stack([remote.recv() for remote in self.remotes])
143 |
144 | def close(self):
145 | if self.closed:
146 | return
147 | if self.waiting:
148 | for remote in self.remotes:
149 | remote.recv()
150 | for remote in self.remotes:
151 | remote.send(('close', None))
152 | for p in self.ps:
153 | p.join()
154 | self.closed = True
155 |
156 | def __len__(self):
157 | return self.nenvs
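# Usage sketch (illustrative only; assumes gym and cloudpickle are installed,
# and "CartPole-v0" is just an example environment):
#
#     import gym
#     from MultiPro import SubprocVecEnv
#
#     envs = SubprocVecEnv([lambda: gym.make("CartPole-v0") for _ in range(4)])
#     obs = envs.reset()                                    # stacked, shape (4, obs_dim)
#     for _ in range(10):
#         actions = [envs.action_space.sample() for _ in range(len(envs))]
#         obs, rewards, dones, infos = envs.step(actions)   # workers reset automatically on done
#     envs.close()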
--------------------------------------------------------------------------------
/ContinousControl/PPO_gae_multi.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Dec 4 10:33:09 2019
4 |
5 | @author: Z0014354
6 |
7 | PPO with GAE implementation by Sebastian Dittert
8 | """
9 |
10 | import gym
11 | import math
12 | import torch
13 | import torch.nn as nn
14 | import torch.optim as optim
15 | from torch.distributions import Normal
16 | import torch.nn.functional as F
17 | import numpy as np
18 | import matplotlib.pyplot as plt
19 | from torch.nn.utils import clip_grad_norm_
20 | from collections import deque
21 | from tensorboardX import SummaryWriter
22 | import MultiPro
23 | import argparse
24 | import time
25 |
26 |
27 | def hidden_init(layer):
28 | fan_in = layer.weight.data.size()[0]
29 | lim = 1. / np.sqrt(fan_in)
30 | return (-lim, lim)
31 |
32 | class Critic(nn.Module):
33 | def __init__(self, input_shape, hidden_size):
34 | super(Critic, self).__init__()
35 | self.layer1 = nn.Linear(input_shape, hidden_size)
36 | self.layer2 = nn.Linear(hidden_size, hidden_size)
37 | self.layer3 = nn.Linear(hidden_size, 1)
38 | self.reset_parameters()
39 |
40 | def forward(self,x):
41 | x = torch.tanh(self.layer1(x))
42 | x = torch.tanh(self.layer2(x))
43 | x = self.layer3(x)
44 |
45 | return x
46 |
47 | def reset_parameters(self):
48 | nn.init.xavier_uniform_(self.layer1.weight)
49 | nn.init.xavier_uniform_(self.layer2.weight)
50 | #nn.init.xavier_uniform_(self.layer3.weight)
51 |
52 | class Actor(nn.Module):
53 | def __init__(self, input_shape, output_shape, action_high_low, hidden_size):
54 | super(Actor, self).__init__()
55 | self.layer1 = nn.Linear(input_shape, hidden_size)
56 | self.layer2 = nn.Linear(hidden_size,hidden_size)
57 |
58 | self.mean = nn.Linear(hidden_size, output_shape)
59 | self.variance = nn.Linear(hidden_size, output_shape)
60 | self.action_high_low = action_high_low
61 | #self.reset_parameters()
62 |
63 | def forward(self, x):
64 |
65 | x = torch.tanh(self.layer1(x))
66 | head = torch.tanh(self.layer2(x))
67 |
68 | mean = torch.tanh(self.mean(head)) # tanh squashed output to the range of -1..1
69 | variance = F.softplus(self.variance(head)) # log(1 + e^x) has the shape of a smoothed ReLU
70 | sigma = torch.sqrt(variance.cpu())
71 | m = Normal(mean.cpu(), sigma)
72 | actions = m.sample()
73 | logprobs = m.log_prob(actions) #for the optimization step we create a new distribution based on the new mean and variance - still taking the logprobs based on the old actions!
74 |
75 | return actions, logprobs, m
76 |
77 |
78 | def reset_parameters(self):
79 | nn.init.xavier_uniform_(self.layer1.weight)
80 | nn.init.xavier_uniform_(self.layer2.weight)
81 | nn.init.xavier_uniform_(self.mean.weight)
82 | #nn.init.xavier_uniform_(self.variance.weight)
83 |
84 |
85 |
86 | class Agent():
87 | def __init__(self,
88 | state_size,
89 | action_size,
90 | action_high_low,
91 | hidden_size,
92 | LR_A=3e-4,
93 | LR_C=3e-4,
94 | gamma=0.99,
95 | lambda_=0.95,
96 | mini_batch_size=512,
97 | ppo_epochs=5):
98 |
99 | self.state_size = state_size
100 | self.actor = Actor(state_size, action_size, action_high_low, hidden_size).to(device)
101 | self.action_high = action_high_low[0]
102 | self.action_low = action_high_low[1]
103 | self.critic = Critic(state_size, hidden_size).to(device)
104 |
105 | self.gamma = gamma
106 | self.lambda_ = lambda_
107 | self.mini_batch_size = mini_batch_size
108 | self.ppo_epochs = ppo_epochs
109 |
110 |
111 | self.optimizer_a = optim.Adam(params=self.actor.parameters(), lr=LR_A) #RMSprop
112 | self.optimizer_c = optim.Adam(params=self.critic.parameters(), lr=LR_C)
113 |
114 |
115 | def test_net(self, env, count = 10):
116 | """
117 |         Tests the agent's performance with the current weights.
118 | """
119 | rewards = 0.0
120 | steps = 0
121 | entropys = 0.0
122 |
123 | for _ in range(count):
124 | obs = env.reset()
125 |
126 | while True:
127 | obs_v = torch.from_numpy(obs).float()
128 | action, _, dist = self.actor(obs_v.to(device))
129 | entropy = dist.entropy().detach().cpu().numpy()
130 | action = action.cpu().numpy()
131 | action = np.clip(action*self.action_high, self.action_low, self.action_high)
132 | obs, reward, done, info = env.step(action)
133 |
134 | rewards += reward
135 | entropys += entropy.mean()
136 | steps += 1
137 | if done:
138 | break
139 |
140 | return rewards/count, entropys/count, steps/count
141 |
142 |
143 |
144 |
145 | def compute_gae(self, next_value, rewards, masks, values):
146 | """
147 | lambda => 1: high variance, low bias
148 | lambda => 0: low variance, high bias
149 | """
150 |
151 | rewards_batch = list(zip(*rewards))
152 | masks_batch = list(zip(*masks))
153 | values_batch = torch.cat((torch.stack(values, dim=1).squeeze(2), next_value.squeeze(0)),dim=1)
154 |
155 | out_discounted_rewards = []
156 | out_advantage = []
157 | for rewards, masks, values in zip(rewards_batch, masks_batch, values_batch):
158 |
159 | gae = 0
160 | disc_returns = []
161 | advantage = []
162 | for step in reversed(range(len(rewards))):
163 |                 # delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
164 | delta = rewards[step] + self.gamma * values[step + 1] * masks[step] - values[step]
165 |                 # gae_t = delta_t + gamma * lambda * mask_t * gae_{t+1}  (discounted sum of future deltas)
166 | gae = delta + self.gamma * self.lambda_ * masks[step] * gae
167 |
168 | disc_returns.insert(0, gae + values[step]) # adding values since we want the returns and not the advantage yet! A(a,s) = Q"returns" - V(s)
169 | advantage.insert(0, gae)
170 |
171 | out_discounted_rewards.append(disc_returns)
172 | out_advantage.append(advantage)
173 |
174 | return torch.FloatTensor(out_discounted_rewards).flatten().unsqueeze(1), torch.FloatTensor(out_advantage).flatten().unsqueeze(1)
175 |
176 |
177 | def ppo_iter(self, states, actions, log_probs, advantage, discounted_rewards):
178 | batch_size = len(states)
179 |
180 | for i in range(batch_size // self.mini_batch_size):
181 | rand_ids = np.random.randint(0, batch_size, self.mini_batch_size)
182 |
183 | yield states[rand_ids], actions[rand_ids], log_probs[rand_ids], advantage[rand_ids], discounted_rewards[rand_ids]
184 |
185 |
186 |
187 | def ppo_update(self, states, actions, log_probs, advantage, discounted_rewards, eps_clip=0.2):
188 | """
189 |         Runs the clipped PPO update over shuffled mini-batches for both actor and critic.
190 | """
191 |
192 | a_loss_batch = []
193 | c_loss_batch = []
194 |
195 |
196 | for _ in range(self.ppo_epochs):
197 | for states_i, old_actions, old_logprobs, advantage_i, discounted_reward_i in self.ppo_iter(states, actions, log_probs, advantage, discounted_rewards):
198 |
199 | self.optimizer_c.zero_grad()
200 | #train critic
201 | new_value = self.critic(states_i.to(device))
202 |
203 | c_loss = .5 * (discounted_reward_i - new_value).pow(2).mean()
204 | c_loss.backward()
205 | #print("C: ", c_loss)
206 | clip_grad_norm_(self.critic.parameters(),CLIP_GRAD)
207 | self.optimizer_c.step()
208 |
209 | #train actor
210 | self.optimizer_a.zero_grad()
211 | _, _, dist = self.actor(states_i.to(device))
212 | new_logprobs = dist.log_prob(old_actions)
213 | entropy = dist.entropy()
214 |
215 | ratio = torch.exp(new_logprobs - old_logprobs.detach())
216 | surr = ratio * advantage_i
217 | clip = torch.clamp(ratio, 1.0 - eps_clip, 1.0 + eps_clip)
218 | a_loss = torch.min(surr, clip*advantage_i )
219 | a_loss = (- a_loss - ENTROPY_BONUS * entropy).mean()
220 |                 a_loss.backward(retain_graph=True)
221 |                 clip_grad_norm_(self.actor.parameters(), CLIP_GRAD)  # clip after the backward pass so the fresh gradients are bounded
222 |
223 | self.optimizer_a.step()
224 |
225 |                 c_loss_batch.append(c_loss.detach().cpu().numpy())  # move to CPU first so this also works when training on GPU
226 |                 a_loss_batch.append(a_loss.detach().cpu().numpy())
227 |
228 | return np.array(c_loss_batch).mean(), np.array(a_loss_batch).mean()
229 |
230 |
231 |
232 | def main(args):
233 | torch.multiprocessing.freeze_support()
234 | t0 = time.time()
235 | ENV = args.env #"MountainCarContinuous-v0" #Pendulum-v0 LunarLanderContinuous-v0
236 |
237 | env = gym.make(ENV)#Creating the Environment
238 | writer = SummaryWriter("runs/"+args.info)
239 | n_cpu = args.worker
240 |
241 | envs = MultiPro.SubprocVecEnv([lambda: gym.make(ENV) for i in range(n_cpu)])
242 | seed = args.seed
243 |
244 | torch.manual_seed(seed)
245 | torch.cuda.manual_seed(seed)
246 | np.random.seed(seed)
247 | env.seed(seed)
248 |
249 |
250 | state_size = env.observation_space.shape[0]
251 | action_size = env.action_space.shape[0]
252 | action_high_low = (env.action_space.high[0], env.action_space.low[0])
253 |
254 | agent = Agent(state_size, action_size, action_high_low= action_high_low, hidden_size=args.layer_size, LR_A=args.lr, LR_C=args.lr, gamma=args.gamma, lambda_=args.lambda_, mini_batch_size=args.mini_batch_size, ppo_epochs=args.ppo_updates)
255 |
256 | max_episodes = args.ep
257 | plot_rewards = []
258 | max_steps = int(args.max_steps/n_cpu)
259 |
260 | # calc reshape stacking size
261 | shape = (max_steps*n_cpu, state_size)
262 |
263 | for ep in range(max_episodes+1):
264 | states = envs.reset()
265 |
266 | done = False
267 |
268 | state_batch = []
269 | value_batch = []
270 | action_batch = []
271 | logprob_batch = []
272 | rewards_batch = []
273 | masks = []
274 | for step in range(max_steps):
275 |
276 | states = torch.from_numpy(states).float()
277 |
278 | action, logprob, _ = agent.actor(states.to(device))
279 | value = agent.critic(states.to(device))
280 | action_v = action.cpu().numpy()
281 |
282 | action_v = np.clip(action_v*env.action_space.high[0], env.action_space.low[0], env.action_space.high[0])
283 | next_states, reward, done, _ = envs.step(action_v)
284 |
285 | state_batch.append(states)
286 | value_batch.append(value)
287 | logprob_batch.append(logprob)
288 | action_batch.append(action)
289 | rewards_batch.append(torch.from_numpy(reward).float())
290 | masks.append(torch.from_numpy(1 - done).float())
291 |
292 | states = next_states
293 |
294 |
295 | if np.any(done):
296 | states = envs.reset()
297 |
298 | # stack all gathered data
299 |
300 | state_batch = torch.stack(state_batch, dim=1).reshape(shape)
301 | actions_batch = torch.stack(action_batch, dim=1).reshape(max_steps*n_cpu,action_size)
302 | logprob_batch = torch.stack(logprob_batch, dim=1).reshape(max_steps*n_cpu,action_size).detach()
303 |
304 |
305 | # calculate advantage:
306 | next_value = agent.critic(torch.from_numpy(next_states).float())
307 | discounted_rewards, advantage = agent.compute_gae(next_value, rewards_batch, masks, value_batch)
308 |
309 | # normalize advantage:
310 | advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-5)
311 |
312 | c_loss, a_loss = agent.ppo_update(states=state_batch, actions=actions_batch, log_probs=logprob_batch, advantage=advantage.detach() , discounted_rewards=discounted_rewards.detach())
313 | writer.add_scalar("critic_loss", c_loss, ep)
314 | writer.add_scalar("actor_loss", a_loss, ep)
315 |
316 |
317 | if ep != 0 and ep % 5 == 0:
318 | test_rewards, test_entropy, test_steps = agent.test_net(env)
319 | writer.add_scalar("entropy",test_entropy, ep)
320 | writer.add_scalar("max_reward",test_rewards, ep)
321 | plot_rewards.append(test_rewards)
322 |
323 | print("\rEpisode: {} | Ep_Reward: {:.2f} | Average_100: {:.2f}".format(ep, test_rewards, np.mean(plot_rewards[-100:])), end = "", flush = True)
324 |
325 | envs.close()
326 | t1 = time.time()
327 | plt.pause(60)
328 | env.close()
329 | print("training took {} min!".format((t1-t0)/60))
330 |
331 | if __name__ == "__main__":
332 | parser = argparse.ArgumentParser(description="")
333 | parser.add_argument("-env", type=str,default="Pendulum-v0", help="Environment name")
334 | parser.add_argument("-info", type=str, help="Information or name of the run")
335 | parser.add_argument("-ep", type=int, default=200, help="The amount of training episodes, default is 200")
336 | parser.add_argument("-seed", type=int, default=0, help="Seed for the env and torch network weights, default is 0")
337 | parser.add_argument("-lr", type=float, default=5e-4, help="Learning rate of adapting the network weights, default is 5e-4")
338 |     parser.add_argument("-entropy_bonus", type=float, default=1e-3, help="Entropy bonus for exploration, default is 1e-3")
339 | parser.add_argument("-layer_size", type=int, default=64, help="Number of nodes per neural network layer, default is 64")
340 | parser.add_argument("-worker", type=int, default=8, help="Number of parallel worker -default is 8")
341 | parser.add_argument("-lambda_", type=float, default=0.95, help="GAE lambda")
342 | parser.add_argument("-g", "--gamma", type=float, default=0.99, help="discount factor gamma, default is 0.99")
343 | parser.add_argument("-CG", "--clip_grad", type=float, default=0.25, help="Clip the gradients for updating the network parameters, default is 0.25")
344 | parser.add_argument("-ms", "--max_steps", type=int, default=2048, help="Maximum steps that are taken by the agent in the environment before updating")
345 | parser.add_argument("-mbs", "--mini_batch_size", type=int, default=256, help="Mini Batch size for the ppo updates, default is 256")
346 | parser.add_argument("-updates", "--ppo_updates", type=int, default=7, help="Number of PPO updates, default is 7")
347 | args = parser.parse_args()
348 |
349 | device = "cuda" if torch.cuda.is_available() else "cpu"
350 | print("Using: ", device)
351 |
352 | ENTROPY_BONUS = args.entropy_bonus
353 | CLIP_GRAD = args.clip_grad
354 | main(args)
355 |
--------------------------------------------------------------------------------
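`compute_gae()` in the script above implements the standard GAE recursion `delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)` and `gae_t = delta_t + gamma * lambda * mask_t * gae_{t+1}`. The following is a stripped-down, single-trajectory sketch of the same recursion; the function name and the numbers are made up for illustration.

```python
import numpy as np

def gae(rewards, values, next_value, masks, gamma=0.99, lam=0.95):
    """Single-trajectory GAE: returns (discounted returns, advantages)."""
    values = list(values) + [next_value]               # bootstrap with V(s_T)
    gae_t, returns, advantages = 0.0, [], []
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae_t = delta + gamma * lam * masks[t] * gae_t
        advantages.insert(0, gae_t)
        returns.insert(0, gae_t + values[t])           # return = advantage + value baseline
    return np.array(returns), np.array(advantages)

# made-up three-step rollout (mask 0 marks the terminal step)
rets, advs = gae(rewards=[1.0, 1.0, 1.0], values=[0.5, 0.6, 0.7], next_value=0.8, masks=[1, 1, 0])
print(rets, advs)
```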
/ContinousControl/PPO_test_crawler.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from unityagents import UnityEnvironment\n",
10 | "import gym\n",
11 | "import math\n",
12 | "import torch \n",
13 | "import torch.nn as nn\n",
14 | "import torch.optim as optim\n",
15 | "from torch.distributions import Normal\n",
16 | "import torch.nn.functional as F\n",
17 | "import numpy as np\n",
18 | "import matplotlib.pyplot as plt\n",
19 | "from torch.nn.utils import clip_grad_norm_\n",
20 | "from collections import deque"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "name": "stderr",
30 | "output_type": "stream",
31 | "text": [
32 | "INFO:unityagents:\n",
33 | "'Academy' started successfully!\n",
34 | "Unity Academy name: Academy\n",
35 | " Number of Brains: 1\n",
36 | " Number of External Brains : 1\n",
37 | " Lesson number : 0\n",
38 | " Reset Parameters :\n",
39 | "\t\t\n",
40 | "Unity brain name: CrawlerBrain\n",
41 | " Number of Visual Observations (per agent): 0\n",
42 | " Vector Observation space type: continuous\n",
43 | " Vector Observation space size (per agent): 129\n",
44 | " Number of stacked Vector Observation: 1\n",
45 | " Vector Action space type: continuous\n",
46 | " Vector Action space size (per agent): 20\n",
47 | " Vector Action descriptions: , , , , , , , , , , , , , , , , , , , \n"
48 | ]
49 | },
50 | {
51 | "name": "stdout",
52 | "output_type": "stream",
53 | "text": [
54 | "Number of agents: 12\n",
55 | "Size of each action: 20\n"
56 | ]
57 | }
58 | ],
59 | "source": [
60 | "env = UnityEnvironment(file_name='Crawler_Linux/Crawler.x86_64')\n",
61 | "# get the default brain\n",
62 | "brain_name = env.brain_names[0]\n",
63 | "brain = env.brains[brain_name]\n",
64 | "# reset the environment\n",
65 | "env_info = env.reset(train_mode=False)[brain_name]\n",
66 | "\n",
67 | "# number of agents\n",
68 | "num_agents = len(env_info.agents)\n",
69 | "print('Number of agents:', num_agents)\n",
70 | "\n",
71 | "# size of each action\n",
72 | "action_size = brain.vector_action_space_size\n",
73 | "print('Size of each action:', action_size)\n",
74 | "\n",
75 | "# examine the state space \n",
76 | "states_ = env_info.vector_observations\n",
77 | "state_size = states_.shape[1]\n"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 3,
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "class Critic(nn.Module):\n",
87 | " def __init__(self, input_shape, layer_size):\n",
88 | " super(Critic, self).__init__()\n",
89 | " self.net = nn.Sequential(nn.Linear(input_shape, layer_size),\n",
90 | " nn.ReLU(),\n",
91 | " nn.Linear(layer_size,layer_size),\n",
92 | " nn.ReLU(),\n",
93 | " nn.Linear(layer_size, 1))\n",
94 | " \n",
95 | " def forward(self,x):\n",
96 | " x = self.net(x)\n",
97 | " return x\n",
98 | " \n",
99 | "class Actor(nn.Module):\n",
100 | " def __init__(self, input_shape, output_shape,layer_size):\n",
101 | " super(Actor, self).__init__()\n",
102 | " self.net = nn.Sequential(nn.Linear(input_shape, layer_size),\n",
103 | " nn.ReLU(),\n",
104 | " nn.Linear(layer_size,layer_size),\n",
105 | " nn.ReLU(),\n",
106 | " )\n",
107 | " self.mean = nn.Sequential(nn.Linear(layer_size, output_shape),\n",
108 | " nn.Tanh()) # tanh squashed output to the range of -1..1\n",
109 | " self.variance =nn.Sequential(nn.Linear(layer_size, output_shape),\n",
110 | " nn.Softplus()) # log(1 + e^x) has the shape of a smoothed ReLU\n",
111 | " \n",
112 | " def forward(self, x):\n",
113 | " x = self.net(x) \n",
114 | " sigma = torch.sqrt(self.variance(x).cpu())\n",
115 | " m = Normal(self.mean(x).cpu(), sigma)\n",
116 | " actions = m.sample()\n",
117 | " actions = torch.clamp(actions, -1, 1) # usually clipping between -1,1 but pendulum env has action range of -2,2\n",
118 | "\n",
119 | " logprobs = m.log_prob(actions) #for the optimization step we create a new distribution based on the new mean and variance - still taking the logprobs based on the old actions!\n",
120 | "\n",
121 | " \n",
122 | " return actions, logprobs, m\n",
123 | " \n",
124 | "class Agent():\n",
125 | " def __init__(self, state_size, action_size, ppo_epochs, mini_batch_size,\\\n",
126 | " layer_size,lr_a, lr_c, gamma, entropy_beta, clip_grad):\n",
127 | " self.state_size = state_size\n",
128 | " self.action_size = action_size\n",
129 | " \n",
130 | " self.layer_size = layer_size\n",
131 | " self.gamma = gamma\n",
132 | " self.entropy_beta = entropy_beta\n",
133 | " self.clip_grad = clip_grad\n",
134 | " \n",
135 | " self.ppo_epochs = ppo_epochs\n",
136 | " self.mini_batch_size = mini_batch_size\n",
137 | " \n",
138 | " self.actor = Actor(state_size, action_size,layer_size).to(device)\n",
139 | " self.critic = Critic(state_size,layer_size).to(device)\n",
140 | " self.a_optimizer = optim.RMSprop(params = self.actor.parameters(),lr = lr_a)\n",
141 | " self.c_optimizer = optim.RMSprop(params = self.critic.parameters(),lr = lr_c)\n",
142 | " \n",
143 | " def act(self, states):\n",
144 | " self.actor.eval()\n",
145 | " with torch.no_grad():\n",
146 | " actions, logprobs ,_ = self.actor(torch.from_numpy(states).float().to(device))\n",
147 | " self.actor.train()\n",
148 | " return actions.cpu().numpy(), logprobs\n",
149 | " \n",
150 | "\n",
151 | " def compute_returns(self,rewards_tensor, masks_tensor):\n",
152 | " output = []\n",
153 | " for rewards, masks in zip(rewards_tensor, masks_tensor):\n",
154 | " R = 0 \n",
155 | " returns = []\n",
156 | " for step in reversed(range(len(rewards))):\n",
157 | " R = rewards[step] + self.gamma * R * masks[step]\n",
158 | " returns.insert(0, R)\n",
159 | " output.append(returns)\n",
160 | " output = list(zip(*output))\n",
161 | " discounted_rewards = [torch.FloatTensor(i).unsqueeze(1) for i in output]\n",
162 | " return torch.cat(discounted_rewards)\n",
163 | "\n",
164 | "\n",
165 | "\n",
166 | " def ppo_iter(self, states, actions, log_probs, advantage, discounted_rewards):\n",
167 | " batch_size = len(states)#.shape[]\n",
168 | "\n",
169 | " for i in range(batch_size // self.mini_batch_size):\n",
170 | " rand_ids = np.random.randint(0, batch_size, self.mini_batch_size)\n",
171 | "\n",
172 | " yield torch.cat(states)[rand_ids], torch.cat(actions)[rand_ids], torch.cat(log_probs)[rand_ids], advantage[rand_ids], discounted_rewards[rand_ids]\n",
173 | "\n",
174 | "\n",
175 | "\n",
176 | " def ppo_update(self, states, actions, log_probs, advantage, discounted_rewards, eps_clip=0.2):\n",
177 | " \"\"\"\n",
178 | "\n",
179 | " \"\"\"\n",
180 | "\n",
181 | " a_loss_batch = []\n",
182 | " c_loss_batch = []\n",
183 | " entropy_batch = []\n",
184 | "\n",
185 | " for _ in range(self.ppo_epochs):\n",
186 | " for states_i, old_actions, old_logprobs, advantage_i, discounted_reward_i in self.ppo_iter(states, actions, log_probs, advantage, discounted_rewards):\n",
187 | "\n",
188 | " self.c_optimizer.zero_grad()\n",
189 | "                #train critic\n",
190 | " new_value = self.critic(states_i.to(device))\n",
191 | " c_loss = F.mse_loss(new_value, discounted_reward_i).cpu()\n",
192 | " c_loss.backward(retain_graph=True)\n",
193 | " clip_grad_norm_(self.critic.parameters(),self.clip_grad)\n",
194 | " self.c_optimizer.step()\n",
195 | "\n",
196 | " c_loss_batch.append(c_loss.detach().cpu().numpy())\n",
197 | "\n",
198 | "\n",
199 | " #train actor\n",
200 | " self.a_optimizer.zero_grad()\n",
201 | " _, _, dist = self.actor(states_i.to(device))\n",
202 | " new_logprobs = dist.log_prob(old_actions)\n",
203 | " entropy = dist.entropy().mean()\n",
204 | " entropy_batch.append(entropy.detach().cpu().numpy())\n",
205 | "\n",
206 | "\n",
207 | " ratio = torch.exp(new_logprobs - old_logprobs.detach()).cpu()\n",
208 | " surr = ratio * advantage_i.cpu()\n",
209 | " clip = torch.clamp(ratio, 1.0 - eps_clip, 1.0 + eps_clip)\n",
210 | "\n",
211 | " \n",
212 | " a_loss = - (torch.min(surr, clip * advantage_i.cpu() ).mean()) + self.entropy_beta * entropy.cpu()\n",
213 | " a_loss.backward(retain_graph=True)\n",
214 | " clip_grad_norm_(self.actor.parameters(),self.clip_grad)\n",
215 | " self.a_optimizer.step()\n",
216 | "\n",
217 | " a_loss_batch.append(a_loss.detach().cpu().numpy())\n",
218 | "\n",
219 | "\n",
220 | " return np.array(c_loss_batch).mean(), np.array(a_loss_batch).mean(), np.array(entropy_batch).mean()\n",
221 | "\n",
222 | "def list_to_tensor(list_):\n",
223 | " return np.array(list(zip(*list_)))"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "metadata": {},
230 | "outputs": [],
231 | "source": [
232 | "torch.manual_seed(42)\n",
233 | "torch.cuda.manual_seed(42)\n",
234 | "np.random.seed(42)\n",
235 | "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
236 | "agent = Agent(state_size = state_size, action_size = action_size ,ppo_epochs = 5, mini_batch_size = 512,\\\n",
237 | " layer_size = 512 ,lr_a = 1e-4, lr_c = 1e-4, gamma = 0.99 , entropy_beta = 1e-4, clip_grad = 1)\n",
238 | "\n",
239 | "agent.actor.load_state_dict(torch.load(\"Crawler_weights/actor100.pth\"))\n",
240 | "agent.actor.eval()\n",
241 | "\n",
242 | "max_episodes = 0\n",
243 | "\n",
244 | "c_loss_list = []\n",
245 | "a_loss_list = []\n",
246 | "entropy_list = []\n",
247 | "\n",
248 | "\n",
249 | "average_100 = deque(maxlen = 100)\n",
250 | "\n",
251 | "mean_rewards = []\n",
252 | "max_rewards = []\n",
253 | "min_rewards = []\n",
254 | "average_100_rewards = []\n",
255 | "\n",
256 | "max_steps = 2024\n",
257 | "\n",
258 | "for ep in range(max_episodes+1):\n",
259 | " env_info = env.reset(train_mode=False)[brain_name] # reset the environment \n",
260 | " states = env_info.vector_observations # get the current state (for each agent)\n",
261 | " done = False\n",
262 | " \n",
263 | " states_batch = []\n",
264 | " values_batch = []\n",
265 | " actions_batch = []\n",
266 | " logprobs_batch = []\n",
267 | " rewards_batch = []\n",
268 | " masks = []\n",
269 | " scores = np.zeros(num_agents)\n",
270 | " while True:\n",
271 | "\n",
272 | " actions, logprobs = agent.act(states) \n",
273 | " env_info = env.step(actions)[brain_name] # send all actions to tne environment\n",
274 | " next_states = env_info.vector_observations # get next state (for each agent)\n",
275 | " rewards = env_info.rewards # get reward (for each agent)\n",
276 | " dones = env_info.local_done # see if episode finished\n",
277 | " scores += env_info.rewards\n",
278 | " \n",
279 | " states = next_states\n",
280 | "\n",
281 | " if np.any(dones):\n",
282 | " break\n",
283 | "\n",
284 | " \n",
285 | " mean_rewards.append(np.mean(scores))\n",
286 | " min_rewards.append(np.min(scores))\n",
287 | " max_rewards.append(np.max(scores))\n",
288 | " average_100.append(np.mean(scores))\n",
289 | " average_100_rewards.append(np.array(average_100).mean())\n",
290 | " \n",
291 | " print(\"\\rEpisode: {} | mean_reward: {:.2f} | min_reward: {:.2f} | max_reward: {:.2f} | Average_100: {:.2f}\".format(ep, np.mean(scores), np.min(scores), np.max(scores), np.mean(average_100)), end = \"\", flush = True)\n",
292 | " if ep != 0 and ep % 100 == 0:\n",
293 | " print(\"\\rEpisode: {} | mean_reward: {:.2f} | min_reward: {:.2f} | max_reward: {:.2f} | Average_100: {:.2f}\".format(ep, np.mean(scores), np.min(scores), np.max(scores), np.mean(average_100)))\n",
294 | "\n",
295 | " \n",
296 | "\n",
297 | " \n",
298 | "env.close()\n",
299 | "# PLOTTING RESULTS\n",
300 | "\n",
301 | "plt.figure(figsize = (20,7))\n",
302 | "plt.subplot(1,4,1)\n",
303 | "plt.title(\"actor loss\")\n",
304 | "plt.plot(a_loss_list)\n",
305 | "plt.subplot(1,4,2)\n",
306 | "plt.title(\"critic loss\")\n",
307 | "plt.plot(c_loss_list)\n",
308 | "plt.subplot(1,4,3)\n",
309 | "plt.title(\"entropy\")\n",
310 | "plt.plot(entropy_list)\n",
311 | "plt.subplot(1,4,4)\n",
312 | "plt.title(\"rewards\")\n",
313 | "plt.plot(mean_rewards, c = \"b\")\n",
314 | "plt.plot(min_rewards, c = \"y\")\n",
315 | "plt.plot(max_rewards, c = \"r\")\n",
316 | "plt.plot(average_100_rewards, c = \"g\")\n",
317 | "plt.show()"
318 | ]
319 | }
320 | ],
321 | "metadata": {
322 | "kernelspec": {
323 | "display_name": "Python 3",
324 | "language": "python",
325 | "name": "python3"
326 | },
327 | "language_info": {
328 | "codemirror_mode": {
329 | "name": "ipython",
330 | "version": 3
331 | },
332 | "file_extension": ".py",
333 | "mimetype": "text/x-python",
334 | "name": "python",
335 | "nbconvert_exporter": "python",
336 | "pygments_lexer": "ipython3",
337 | "version": "3.6.5"
338 | }
339 | },
340 | "nbformat": 4,
341 | "nbformat_minor": 2
342 | }
343 |
--------------------------------------------------------------------------------
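Both PPO variants in this folder optimize the same clipped surrogate objective. Here is a self-contained sketch of that single computation; the function name and the dummy tensors are illustrative only, not data from the Crawler run.

```python
import torch

def clipped_surrogate(new_logprobs, old_logprobs, advantages, eps_clip=0.2,
                      entropy=None, entropy_beta=1e-3):
    """PPO actor loss: -E[min(r * A, clip(r, 1-eps, 1+eps) * A)] - beta * H."""
    ratio = torch.exp(new_logprobs - old_logprobs.detach())
    surrogate = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - eps_clip, 1.0 + eps_clip) * advantages
    loss = -torch.min(surrogate, clipped).mean()
    if entropy is not None:
        loss = loss - entropy_beta * entropy.mean()    # entropy bonus encourages exploration
    return loss

# dummy tensors with plausible shapes (batch of 8, one action dimension)
new_lp, old_lp, adv = torch.randn(8, 1), torch.randn(8, 1), torch.randn(8, 1)
print(clipped_surrogate(new_lp, old_lp, adv).item())
```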
/ContinousControl/Parallel_processing.py:
--------------------------------------------------------------------------------
1 | from multiprocessing import Process, Pipe
2 | import numpy as np
3 |
4 | def worker(remote, parent_remote, env_fn_wrapper):
5 | parent_remote.close()
6 | env = env_fn_wrapper.x()
7 | while True:
8 | cmd, data = remote.recv()
9 | if cmd == 'step':
10 | ob, reward, done, info = env.step(data)
11 | if done:
12 | ob = env.reset()
13 | remote.send((ob, reward, done, info))
14 | elif cmd == 'reset':
15 | ob = env.reset()
16 | remote.send(ob)
17 | elif cmd == 'reset_task':
18 | ob = env.reset_task()
19 | remote.send(ob)
20 | elif cmd == 'close':
21 | remote.close()
22 | break
23 | elif cmd == 'get_spaces':
24 | remote.send((env.observation_space, env.action_space))
25 | else:
26 | raise NotImplementedError
27 |
28 | class CloudpickleWrapper(object):
29 | """
30 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
31 | """
32 | def __init__(self, x):
33 | self.x = x
34 | def __getstate__(self):
35 | import cloudpickle
36 | return cloudpickle.dumps(self.x)
37 | def __setstate__(self, ob):
38 | import pickle
39 | self.x = pickle.loads(ob)
40 |
41 |
42 | class VecEnv(object):
43 | """
44 | An abstract asynchronous, vectorized environment.
45 | """
46 | def __init__(self, num_envs, observation_space, action_space):
47 | self.num_envs = num_envs
48 | self.observation_space = observation_space
49 | self.action_space = action_space
50 |
51 | def reset(self):
52 | """
53 | Reset all the environments and return an array of
54 | observations, or a tuple of observation arrays.
55 | If step_async is still doing work, that work will
56 | be cancelled and step_wait() should not be called
57 | until step_async() is invoked again.
58 | """
59 | pass
60 |
61 | def step_async(self, actions):
62 | """
63 | Tell all the environments to start taking a step
64 | with the given actions.
65 | Call step_wait() to get the results of the step.
66 | You should not call this if a step_async run is
67 | already pending.
68 | """
69 | pass
70 |
71 | def step_wait(self):
72 | """
73 | Wait for the step taken with step_async().
74 | Returns (obs, rews, dones, infos):
75 | - obs: an array of observations, or a tuple of
76 | arrays of observations.
77 | - rews: an array of rewards
78 | - dones: an array of "episode done" booleans
79 | - infos: a sequence of info objects
80 | """
81 | pass
82 |
83 | def close(self):
84 | """
85 | Clean up the environments' resources.
86 | """
87 | pass
88 |
89 | def step(self, actions):
90 | self.step_async(actions)
91 | return self.step_wait()
92 |
93 | class SubprocVecEnv(VecEnv):
94 | def __init__(self, env_fns, spaces=None):
95 | """
96 | envs: list of gym environments to run in subprocesses
97 | """
98 | self.waiting = False
99 | self.closed = False
100 | nenvs = len(env_fns)
101 | self.nenvs = nenvs
102 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
103 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
104 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
105 | for p in self.ps:
106 | p.daemon = True # if the main process crashes, we should not cause things to hang
107 | p.start()
108 | for remote in self.work_remotes:
109 | remote.close()
110 |
111 | self.remotes[0].send(('get_spaces', None))
112 | observation_space, action_space = self.remotes[0].recv()
113 | VecEnv.__init__(self, len(env_fns), observation_space, action_space)
114 |
115 | def step_async(self, actions):
116 | for remote, action in zip(self.remotes, actions):
117 | remote.send(('step', action))
118 | self.waiting = True
119 |
120 | def step_wait(self):
121 | results = [remote.recv() for remote in self.remotes]
122 | self.waiting = False
123 | obs, rews, dones, infos = zip(*results)
124 | return np.stack(obs), np.stack(rews), np.stack(dones), infos
125 |
126 | def reset(self):
127 | for remote in self.remotes:
128 | remote.send(('reset', None))
129 | return np.stack([remote.recv() for remote in self.remotes])
130 |
131 | def reset_task(self):
132 | for remote in self.remotes:
133 | remote.send(('reset_task', None))
134 | return np.stack([remote.recv() for remote in self.remotes])
135 |
136 | def close(self):
137 | if self.closed:
138 | return
139 | if self.waiting:
140 | for remote in self.remotes:
141 | remote.recv()
142 | for remote in self.remotes:
143 | remote.send(('close', None))
144 | for p in self.ps:
145 | p.join()
146 | self.closed = True
147 |
148 | def __len__(self):
149 | return self.nenvs
--------------------------------------------------------------------------------
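Parallel_processing.py (like MultiPro.py earlier) drives each environment through a `('step' | 'reset' | 'close')` command protocol over a multiprocessing Pipe. The toy sketch below shows just that protocol; `ToyEnv` and `toy_worker` are stand-ins invented for illustration, not part of the repository.

```python
from multiprocessing import Process, Pipe

class ToyEnv:
    """Stand-in for a gym environment, just to show the message protocol."""
    def reset(self):
        return 0
    def step(self, action):
        return action, 1.0, False, {}

def toy_worker(remote, parent_remote):
    parent_remote.close()
    env = ToyEnv()
    while True:
        cmd, data = remote.recv()
        if cmd == 'step':
            remote.send(env.step(data))
        elif cmd == 'reset':
            remote.send(env.reset())
        elif cmd == 'close':
            remote.close()
            break

if __name__ == "__main__":
    parent_end, child_end = Pipe()
    p = Process(target=toy_worker, args=(child_end, parent_end))
    p.daemon = True
    p.start()
    child_end.close()                    # the parent only talks through its own end
    parent_end.send(('reset', None))
    print(parent_end.recv())             # -> 0
    parent_end.send(('step', 3))
    print(parent_end.recv())             # -> (3, 1.0, False, {})
    parent_end.send(('close', None))
    p.join()
```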
/ContinousControl/SAC_script.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Nov 6 12:24:34 2019
4 |
5 | @author: Z0014354
6 | """
7 |
8 |
9 | import numpy as np
10 | import random
11 |
12 | import gym
13 | import gym_cartpole_swingup
14 | from collections import namedtuple, deque
15 | import torch
16 | import torch.nn as nn
17 | import torch.nn.functional as F
18 | from torch.distributions import Normal, MultivariateNormal
19 |
20 | import torch.optim as optim
21 | import time
22 | from tensorboardX import SummaryWriter
23 | import argparse
24 |
25 |
26 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
27 |
28 | def hidden_init(layer):
29 | fan_in = layer.weight.data.size()[0]
30 | lim = 1. / np.sqrt(fan_in)
31 | return (-lim, lim)
32 |
33 | class Actor(nn.Module):
34 | """Actor (Policy) Model."""
35 |
36 | def __init__(self, state_size, action_size, seed, hidden_size=32, init_w=3e-3, log_std_min=-20, log_std_max=2):
37 | """Initialize parameters and build model.
38 | Params
39 | ======
40 | state_size (int): Dimension of each state
41 | action_size (int): Dimension of each action
42 | seed (int): Random seed
43 |             hidden_size (int): Number of nodes in each hidden layer
44 |             init_w (float): Bound for the uniform initialisation of the output-layer weights
45 | """
46 | super(Actor, self).__init__()
47 | self.seed = torch.manual_seed(seed)
48 | self.log_std_min = log_std_min
49 | self.log_std_max = log_std_max
50 |
51 | self.fc1 = nn.Linear(state_size, hidden_size)
52 | self.fc2 = nn.Linear(hidden_size, hidden_size)
53 |
54 | self.mu = nn.Linear(hidden_size, action_size)
55 | self.log_std_linear = nn.Linear(hidden_size, action_size)
56 |
57 |     def reset_parameters(self, init_w=3e-3):
58 | self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
59 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
60 | self.mu.weight.data.uniform_(-init_w, init_w)
61 | self.log_std_linear.weight.data.uniform_(-init_w, init_w)
62 |
63 | def forward(self, state):
64 |
65 | x = F.relu(self.fc1(state), inplace=True)
66 | x = F.relu(self.fc2(x), inplace=True)
67 | mu = self.mu(x)
68 |
69 | log_std = self.log_std_linear(x)
70 | log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max)
71 | return mu, log_std
72 |
73 | def evaluate(self, state, epsilon=1e-6):
74 | mu, log_std = self.forward(state)
75 | std = log_std.exp()
76 | dist = Normal(0, 1)
77 | e = dist.sample().to(device)
78 | action = torch.tanh(mu + e * std)
79 | log_prob = Normal(mu, std).log_prob(mu + e * std) - torch.log(1 - action.pow(2) + epsilon)
80 | #action = torch.clamp(action*action_high, action_low, action_high)
81 | return action, log_prob
82 |
83 |
84 | def get_action(self, state):
85 | """
86 | returns the action based on a squashed gaussian policy. That means the samples are obtained according to:
87 | a(s,e)= tanh(mu(s)+sigma(s)+e)
88 | """
89 | state = torch.FloatTensor(state).to(device)
90 | mu, log_std = self.forward(state)
91 | std = log_std.exp()
92 | dist = Normal(0, 1)
93 | e = dist.sample().to(device)
94 | action = torch.tanh(mu + e * std).cpu()
95 |
96 | return action[0]
97 |
98 |
99 | class Critic(nn.Module):
100 | """Critic (Value) Model."""
101 |
102 | def __init__(self, state_size, action_size, seed, hidden_size=32):
103 | """Initialize parameters and build model.
104 | Params
105 | ======
106 | state_size (int): Dimension of each state
107 | action_size (int): Dimension of each action
108 | seed (int): Random seed
109 | hidden_size (int): Number of nodes in the network layers
110 |
111 | """
112 | super(Critic, self).__init__()
113 | self.seed = torch.manual_seed(seed)
114 | self.fc1 = nn.Linear(state_size+action_size, hidden_size)
115 | self.fc2 = nn.Linear(hidden_size, hidden_size)
116 | self.fc3 = nn.Linear(hidden_size, 1)
117 | self.reset_parameters()
118 |
119 | def reset_parameters(self):
120 | self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
121 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
122 | self.fc3.weight.data.uniform_(-3e-3, 3e-3)
123 |
124 | def forward(self, state, action):
125 | """Build a critic (value) network that maps (state, action) pairs -> Q-values."""
126 | x = torch.cat((state, action), dim=1)
127 | x = F.relu(self.fc1(x))
128 | x = F.relu(self.fc2(x))
129 | return self.fc3(x)
130 |
131 | class Agent():
132 | """Interacts with and learns from the environment."""
133 |
134 | def __init__(self, state_size, action_size, random_seed, hidden_size, action_prior="uniform"):
135 | """Initialize an Agent object.
136 |
137 | Params
138 | ======
139 | state_size (int): dimension of each state
140 | action_size (int): dimension of each action
141 | random_seed (int): random seed
142 | """
143 | self.state_size = state_size
144 | self.action_size = action_size
145 | self.seed = random.seed(random_seed)
146 |
147 | self.target_entropy = -action_size # -dim(A)
148 | self.alpha = 1
149 | self.log_alpha = torch.tensor([0.0], requires_grad=True)
150 | self.alpha_optimizer = optim.Adam(params=[self.log_alpha], lr=LR_ACTOR)
151 | self._action_prior = action_prior
152 |
153 | print("Using: ", device)
154 |
155 | # Actor Network
156 | self.actor_local = Actor(state_size, action_size, random_seed, hidden_size).to(device)
157 | self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)
158 |
159 | # Critic Network (w/ Target Network)
160 | self.critic1 = Critic(state_size, action_size, random_seed, hidden_size).to(device)
161 | self.critic2 = Critic(state_size, action_size, random_seed, hidden_size).to(device)
162 |
163 | self.critic1_target = Critic(state_size, action_size, random_seed,hidden_size).to(device)
164 | self.critic1_target.load_state_dict(self.critic1.state_dict())
165 |
166 | self.critic2_target = Critic(state_size, action_size, random_seed,hidden_size).to(device)
167 | self.critic2_target.load_state_dict(self.critic2.state_dict())
168 |
169 | self.critic1_optimizer = optim.Adam(self.critic1.parameters(), lr=LR_CRITIC, weight_decay=0)
170 | self.critic2_optimizer = optim.Adam(self.critic2.parameters(), lr=LR_CRITIC, weight_decay=0)
171 |
172 | # Replay memory
173 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
174 |
175 |
176 | def step(self, state, action, reward, next_state, done, step):
177 | """Save experience in replay memory, and use random sample from buffer to learn."""
178 | # Save experience / reward
179 | self.memory.add(state, action, reward, next_state, done)
180 |
181 | # Learn, if enough samples are available in memory
182 | if len(self.memory) > BATCH_SIZE:
183 | experiences = self.memory.sample()
184 | self.learn(step, experiences, GAMMA)
185 |
186 |
187 | def act(self, state, add_noise=True):
188 | """Returns actions for given state as per current policy."""
189 | state = torch.from_numpy(state).float().to(device)
190 | action = self.actor_local.get_action(state).detach()
191 | return action
192 |
193 | def learn(self, step, experiences, gamma, d=1):
194 | """Updates actor, critics and entropy_alpha parameters using given batch of experience tuples.
195 | Q_targets = r + γ * (min_critic_target(next_state, actor_target(next_state)) - α *log_pi(next_action|next_state))
196 | Critic_loss = MSE(Q, Q_target)
197 | Actor_loss = α * log_pi(a|s) - Q(s,a)
198 | where:
199 | actor_target(state) -> action
200 | critic_target(state, action) -> Q-value
201 | Params
202 | ======
203 | experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
204 | gamma (float): discount factor
205 | """
206 | states, actions, rewards, next_states, dones = experiences
207 |
208 |
209 | # ---------------------------- update critic ---------------------------- #
210 | # Get predicted next-state actions and Q values from target models
211 | next_action, log_pis_next = self.actor_local.evaluate(next_states)
212 |
213 | Q_target1_next = self.critic1_target(next_states.to(device), next_action.squeeze(0).to(device))
214 | Q_target2_next = self.critic2_target(next_states.to(device), next_action.squeeze(0).to(device))
215 |
216 |         # take the minimum of the two critic targets (clipped double-Q)
217 | Q_target_next = torch.min(Q_target1_next, Q_target2_next)
218 |
219 |         if FIXED_ALPHA is None:
220 | # Compute Q targets for current states (y_i)
221 | Q_targets = rewards + (gamma * (1 - dones) * (Q_target_next - self.alpha * log_pis_next.squeeze(0)))
222 | else:
223 | Q_targets = rewards + (gamma * (1 - dones) * (Q_target_next - FIXED_ALPHA * log_pis_next.squeeze(0)))
224 | # Compute critic loss
225 | Q_1 = self.critic1(states, actions)
226 | Q_2 = self.critic2(states, actions)
227 | critic1_loss = 0.5*F.mse_loss(Q_1, Q_targets.detach())
228 | critic2_loss = 0.5*F.mse_loss(Q_2, Q_targets.detach())
229 | # Update critics
230 | # critic 1
231 | self.critic1_optimizer.zero_grad()
232 | critic1_loss.backward()
233 | self.critic1_optimizer.step()
234 | # critic 2
235 | self.critic2_optimizer.zero_grad()
236 | critic2_loss.backward()
237 | self.critic2_optimizer.step()
238 | if step % d == 0:
239 | # ---------------------------- update actor ---------------------------- #
240 |             if FIXED_ALPHA is None:
241 | alpha = torch.exp(self.log_alpha)
242 | # Compute alpha loss
243 | actions_pred, log_pis = self.actor_local.evaluate(states)
244 | alpha_loss = - (self.log_alpha * (log_pis + self.target_entropy).detach()).mean()
245 | self.alpha_optimizer.zero_grad()
246 | alpha_loss.backward()
247 | self.alpha_optimizer.step()
248 |
249 | self.alpha = alpha
250 | # Compute actor loss
251 | if self._action_prior == "normal":
252 | policy_prior = MultivariateNormal(loc=torch.zeros(self.action_size), scale_tril=torch.ones(self.action_size).unsqueeze(0))
253 | policy_prior_log_probs = policy_prior.log_prob(actions_pred)
254 | elif self._action_prior == "uniform":
255 | policy_prior_log_probs = 0.0
256 |
257 | actor_loss = (alpha * log_pis.squeeze(0) - self.critic1(states, actions_pred.squeeze(0)) - policy_prior_log_probs ).mean()
258 |             else:
259 |                 actions_pred, log_pis = self.actor_local.evaluate(states)  # needed in this branch as well, otherwise they are undefined below
260 |                 if self._action_prior == "normal":
261 |                     policy_prior = MultivariateNormal(loc=torch.zeros(self.action_size), scale_tril=torch.ones(self.action_size).unsqueeze(0))
262 |                     policy_prior_log_probs = policy_prior.log_prob(actions_pred)
263 |                 elif self._action_prior == "uniform":
264 |                     policy_prior_log_probs = 0.0
265 |                 actor_loss = (FIXED_ALPHA * log_pis.squeeze(0) - self.critic1(states, actions_pred.squeeze(0)) - policy_prior_log_probs).mean()
266 | # Minimize the loss
267 | self.actor_optimizer.zero_grad()
268 | actor_loss.backward()
269 | self.actor_optimizer.step()
270 |
271 | # ----------------------- update target networks ----------------------- #
272 | self.soft_update(self.critic1, self.critic1_target, TAU)
273 | self.soft_update(self.critic2, self.critic2_target, TAU)
274 |
275 |
276 |
277 | def soft_update(self, local_model, target_model, tau):
278 | """Soft update model parameters.
279 | θ_target = τ*θ_local + (1 - τ)*θ_target
280 | Params
281 | ======
282 | local_model: PyTorch model (weights will be copied from)
283 | target_model: PyTorch model (weights will be copied to)
284 | tau (float): interpolation parameter
285 | """
286 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
287 | target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
288 |
289 | class ReplayBuffer:
290 | """Fixed-size buffer to store experience tuples."""
291 |
292 | def __init__(self, action_size, buffer_size, batch_size, seed):
293 | """Initialize a ReplayBuffer object.
294 | Params
295 | ======
296 | buffer_size (int): maximum size of buffer
297 | batch_size (int): size of each training batch
298 | """
299 | self.action_size = action_size
300 | self.memory = deque(maxlen=buffer_size) # internal memory (deque)
301 | self.batch_size = batch_size
302 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
303 | self.seed = random.seed(seed)
304 |
305 | def add(self, state, action, reward, next_state, done):
306 | """Add a new experience to memory."""
307 | e = self.experience(state, action, reward, next_state, done)
308 | self.memory.append(e)
309 |
310 | def sample(self):
311 | """Randomly sample a batch of experiences from memory."""
312 | experiences = random.sample(self.memory, k=self.batch_size)
313 |
314 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
315 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(device)
316 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
317 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
318 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)
319 |
320 | return (states, actions, rewards, next_states, dones)
321 |
322 | def __len__(self):
323 | """Return the current size of internal memory."""
324 | return len(self.memory)
325 |
326 |
327 |
328 | def SAC(n_episodes=200, max_t=500, print_every=10):
329 | scores_deque = deque(maxlen=100)
330 | scores = []
331 | average_100_scores = []
332 |
333 | for i_episode in range(1, n_episodes+1):
334 |
335 | state = env.reset()
336 | state = state.reshape((1,state_size))
337 | score = 0
338 | for t in range(max_t):
339 |
340 |
341 | action = agent.act(state)
342 | action_v = action[0].numpy()
343 | action_v = np.clip(action_v*action_high, action_low, action_high)
344 | next_state, reward, done, info = env.step(action_v)
345 | next_state = next_state.reshape((1,state_size))
346 | agent.step(state, action, reward, next_state, done, t)
347 | state = next_state
348 | score += reward
349 |
350 | if done:
351 | break
352 |
353 | scores_deque.append(score)
354 | writer.add_scalar("max_reward", score, i_episode)
355 | average_100_scores.append(np.mean(scores_deque))
356 |
357 | print('\rEpisode {} Reward: {:.2f} Average100 Score: {:.2f}'.format(i_episode, score, np.mean(scores_deque)), end="")
358 | if i_episode % print_every == 0:
359 | print('\rEpisode {} Reward: {:.2f} Average100 Score: {:.2f}'.format(i_episode, score, np.mean(scores_deque)))
360 |
361 |
362 | torch.save(agent.actor_local.state_dict(), args.info + ".pt")
363 | return scores, average_100_scores
364 |
365 |
366 |
367 | def play():
368 | agent.actor_local.eval()
369 | for i_episode in range(1):
370 |
371 | state = env.reset()
372 | state = state.reshape((1,state_size))
373 |
374 | while True:
375 | action = agent.act(state)
376 | action_v = action[0].numpy()
377 | action_v = np.clip(action_v*action_high, action_low, action_high)
378 | next_state, reward, done, info = env.step(action_v)
379 | next_state = next_state.reshape((1,state_size))
380 | state = next_state
381 |
382 | if done:
383 | break
384 |
385 |
386 |
387 | parser = argparse.ArgumentParser(description="")
388 | parser.add_argument("-env", type=str,default="Pendulum-v0", help="Environment name")
389 | parser.add_argument("-info", type=str, help="Information or name of the run")
390 | parser.add_argument("-ep", type=int, default=200, help="The amount of training episodes, default is 200")
391 | parser.add_argument("-seed", type=int, default=0, help="Seed for the env and torch network weights, default is 0")
392 | parser.add_argument("-lr", type=float, default=5e-4, help="Learning rate of adapting the network weights, default is 5e-4")
393 | parser.add_argument("-a", "--alpha", type=float, help="Entropy alpha value; if not chosen, the value is learned by the agent")
394 | parser.add_argument("-layer_size", type=int, default=64, help="Number of nodes per neural network layer, default is 64")
395 | parser.add_argument("-repm", "--replay_memory", type=float, default=1e6, help="Size of the Replay memory, default is 1e6")
396 | parser.add_argument("-bs", "--batch_size", type=int, default=256, help="Batch size, default is 256")
397 | parser.add_argument("-t", "--tau", type=float, default=1e-2, help="Softupdate factor tau, default is 1e-2")
398 | parser.add_argument("-g", "--gamma", type=float, default=0.99, help="discount factor gamma, default is 0.99")
399 | parser.add_argument("--saved_model", type=str, default=None, help="Load a saved model to perform a test run!")
400 | args = parser.parse_args()
401 |
402 |
403 | env_name = args.env
404 | seed = args.seed
405 | n_episodes = args.ep
406 | GAMMA = args.gamma
407 | TAU = args.tau
408 | HIDDEN_SIZE = args.layer_size
409 | BUFFER_SIZE = int(args.replay_memory)
410 | BATCH_SIZE = args.batch_size # minibatch size
411 | LR_ACTOR = args.lr # learning rate of the actor
412 | LR_CRITIC = args.lr # learning rate of the critic
413 | FIXED_ALPHA = args.alpha
414 | saved_model = args.saved_model
415 |
416 | t0 = time.time()
417 | writer = SummaryWriter("runs/"+args.info)
418 | env = gym.make(env_name)
419 | action_high = env.action_space.high[0]
420 | action_low = env.action_space.low[0]
421 | torch.manual_seed(seed)
422 | env.seed(seed)
423 | state_size = env.observation_space.shape[0]
424 | action_size = env.action_space.shape[0]
425 | agent = Agent(state_size=state_size, action_size=action_size, random_seed=seed,hidden_size=HIDDEN_SIZE, action_prior="uniform") #"normal"
426 |
427 | if saved_model is not None:
428 | agent.actor_local.load_state_dict(torch.load(saved_model))
429 | play()
430 | else:
431 | scores, average_100 = SAC(n_episodes=n_episodes, max_t=2300, print_every=10)
432 | t1 = time.time()
433 | env.close()
434 | print("training took {} min!".format((t1-t0)/60))
435 |
436 |
--------------------------------------------------------------------------------
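Two pieces of SAC_script.py that are easy to miss are the Polyak soft update and the learned entropy temperature alpha. Below is a compact sketch of just those two updates, using dummy tensors and small throwaway networks rather than the script's actual models.

```python
import torch
import torch.nn as nn
import torch.optim as optim

def soft_update(local_model, target_model, tau=1e-2):
    """theta_target <- tau * theta_local + (1 - tau) * theta_target (Polyak averaging)."""
    for t_param, l_param in zip(target_model.parameters(), local_model.parameters()):
        t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)

# automatic entropy temperature: alpha is learned so that the policy entropy
# tracks a target of -|A| (negative action dimension)
action_size = 2
target_entropy = -action_size
log_alpha = torch.tensor([0.0], requires_grad=True)
alpha_optimizer = optim.Adam([log_alpha], lr=5e-4)

log_pis = torch.randn(32, 1)                        # dummy log-probabilities of sampled actions
alpha_loss = -(log_alpha * (log_pis + target_entropy).detach()).mean()
alpha_optimizer.zero_grad()
alpha_loss.backward()
alpha_optimizer.step()
alpha = log_alpha.exp()                             # temperature used in the actor/critic losses

# soft update between two small dummy networks
local, target = nn.Linear(4, 1), nn.Linear(4, 1)
target.load_state_dict(local.state_dict())
soft_update(local, target, tau=1e-2)
```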
/Cross_entropy/Cross_entropy.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym
3 | from gym import wrappers
4 | import torch
5 | import torch.nn as nn
6 | from torch import optim
7 | from collections import namedtuple
8 | from tensorboardX import SummaryWriter
9 | # Memory
10 | Episode = namedtuple("Episode", field_names = ["reward","steps"])
11 | EpisodeStep = namedtuple("EpisodeStep", field_names = ["state", "action"])
12 |
13 |
14 | class Network(nn.Module):
15 | def __init__(self, input_shape, output_shape):
16 | super(Network, self).__init__()
17 |
18 |
19 | self.net = nn.Sequential(
20 | nn.Linear(input_shape, 128),
21 | nn.ReLU(),
22 | nn.Linear(128, output_shape)
23 | )
24 |
25 | def forward(self,x):
26 | return self.net(x)
27 |
28 | def filter_batch(batch, percentile = 70):
29 | rewards = list(map(lambda s: s.reward, batch))
30 | reward_bound = np.percentile(rewards, percentile)
31 | reward_mean = float(np.mean(rewards))
32 |
33 | train_obs = []
34 | train_act = []
35 | for example in batch:
36 | if example.reward < reward_bound:
37 | continue
38 | train_obs.extend(map(lambda step: step.state, example.steps))
39 | train_act.extend(map(lambda step: step.action, example.steps))
40 | train_obs_vector = torch.FloatTensor(train_obs)
41 | train_act_vector = torch.LongTensor(train_act)
42 | return train_obs_vector, train_act_vector, reward_bound, reward_mean
43 |
44 | def iterative_batches(env, network, batch_size = 16):
45 | batch = []
46 | episode_reward = 0.0
47 | episode_steps = []
48 | state = env.reset()
49 | softmax = nn.Softmax(dim =1)
50 |
51 | while True:
52 | state_vector = torch.Tensor([state])
53 | action_probs_vector = softmax(network(state_vector))
54 |
55 | action_probs = action_probs_vector.data.numpy()[0]
56 | action = np.random.choice(len(action_probs), p = action_probs)
57 |
58 | next_state, reward, done, _ = env.step(action)
59 | episode_reward += reward
60 | episode_steps.append(EpisodeStep(state = state, action = action))
61 |
62 | if done:
63 | batch.append(Episode(reward = episode_reward, steps = episode_steps))
64 | episode_reward = 0.0
65 | episode_steps = []
66 | next_state = env.reset()
67 | if len(batch) == batch_size:
68 | yield batch
69 | batch = []
70 | state = next_state
71 |
72 | if __name__ == "__main__":
73 | env = gym.make("CartPole-v0")
74 | env = gym.wrappers.Monitor(env, directory = "mon", force = True)
75 | output_shape = env.action_space.n
76 | input_shape = env.observation_space.shape[0]
77 |
78 | network = Network(input_shape = input_shape, output_shape = output_shape)
79 | objective = nn.CrossEntropyLoss()
80 | optimizer = optim.Adam(params = network.parameters(), lr = 0.01)
81 | writer = SummaryWriter()
82 |
83 | for iter_no, batch in enumerate(iterative_batches(env, network)):
84 | state_vector, action_vector, reward_bound, reward_mean = filter_batch(batch)
85 | optimizer.zero_grad()
86 | action_values_vector = network(state_vector)
87 | loss_vector = objective(action_values_vector, action_vector)
88 | loss_vector.backward()
89 | optimizer.step()
90 | print("{}: loss = {}, reward_mean = {}, reward_boundary = {}".format(iter_no, loss_vector.item(), reward_mean, reward_bound))
91 | writer.add_scalar("loss", loss_vector.item(), iter_no)
92 | writer.add_scalar("reward mean", reward_mean, iter_no)
93 | writer.add_scalar("reward boundary", reward_bound, iter_no)
94 | if reward_mean > 199:
95 | print("Solved CartPole Problem!")
96 | break
97 | writer.close()
--------------------------------------------------------------------------------
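`filter_batch()` above keeps only the "elite" episodes whose return reaches the chosen percentile. Here is a standalone sketch of that filtering step with made-up episode returns; the function name is illustrative only.

```python
import numpy as np
from collections import namedtuple

Episode = namedtuple("Episode", field_names=["reward", "steps"])

def elite_episodes(batch, percentile=70):
    """Return the episodes whose total reward reaches the percentile boundary."""
    rewards = [ep.reward for ep in batch]
    reward_bound = np.percentile(rewards, percentile)
    elites = [ep for ep in batch if ep.reward >= reward_bound]
    return elites, reward_bound, float(np.mean(rewards))

# made-up returns; steps left empty since only the filtering is shown here
batch = [Episode(reward=r, steps=[]) for r in (10, 25, 40, 55, 70, 85, 100, 120)]
elites, bound, mean = elite_episodes(batch)
print(len(elites), bound, mean)   # only episodes at or above the 70th percentile survive
```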
/Cross_entropy/README.md:
--------------------------------------------------------------------------------
1 | # Deep Reinforcement Learning with Cross-Entropy
2 | The cross-entropy method implemented on the CartPole problem,
3 | based on the example in the book [Deep Reinforcement Learning Hands-on](https://www.amazon.de/Deep-Reinforcement-Learning-Hands-Q-networks-ebook/dp/B076H9VQH6) by Maxim Lapan.
4 |
5 |
6 | 
7 |
--------------------------------------------------------------------------------
/Cross_entropy/img/Cross_entropy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Cross_entropy/img/Cross_entropy.png
--------------------------------------------------------------------------------
/Deep Q_Learning/DQN_Experience_Replay.py:
--------------------------------------------------------------------------------
1 | import keras
2 | from keras.models import Sequential
3 | from keras.layers import Dense, Dropout
4 | from keras.optimizers import Adam
5 | import numpy as np
6 | from collections import deque
7 | from keras.models import load_model
8 | import random
9 | import matplotlib.pyplot as plt
10 | import gym
11 | from gym import wrappers
12 |
13 |
14 |
15 | class AI():
16 | def __init__(self, state_size, action_size, memory_size, learning_rate, gamma):
17 | self.state_size = state_size
18 | self.action_size = action_size
19 | self.memory = deque(maxlen = memory_size)
20 |
21 |
22 | # HYPERPARAMETER
23 | self.learning_rate = learning_rate
24 | self.gamma = gamma
25 | self.epsilon = 0.5
26 | self.epsilon_start = self.epsilon
27 |
28 | self.brain = self.build_brain()
29 |
30 |
31 | def build_brain(self):
32 | model = Sequential()
33 | model.add(Dense(self.state_size, activation='relu'))
34 | model.add(Dense(25, activation='relu'))
35 | #model.add(Dropout(0.3))
36 | model.add(Dense(25, activation='relu'))
37 | #model.add(Dropout(0.3))
38 | # model.add(Dense(12, activation='relu'))
39 | model.add(Dense(self.action_size, activation='linear'))
40 | model.compile(loss = "mse", optimizer = Adam(lr=self.learning_rate))
41 | return model
42 |
43 |
44 | def load_model(self, name):
45 | """
46 |         Loads an existing model.
47 |         Input: string of the model name (.h5 file)
48 |         """
49 |         self.brain = load_model(name)
50 |         return self.brain
51 |
52 |     def save_learnings(self, model_name):
53 |         """
54 |         Input: string of the model name
55 | """
56 | self.brain.save(model_name+".h5")
57 |
58 | def adapt_epsilon(self,ep):
59 | # Epsilon starts at 0.5 linear increasing to 0.99 by ep 4000:
60 | # linear: epsilon = 0.0001225*ep+self.epsilon_start
61 | # exponent (4000 eps): epsilon = self.epsilon_start + (ep/5714)**2
62 | if ep == 0:
63 | pass
64 | self.epsilon = self.epsilon_start + (ep/5714)**2
65 |
66 | def act(self, state, status = "train"):
67 | if status == "train":
68 | if np.random.rand() > self.epsilon:
69 | return random.randrange(self.action_size)
70 | return np.argmax(self.brain.predict(state)[0])
71 |
72 | def remember(self, state, action, reward, next_state, done):
73 | self.memory.append((state, action, reward, next_state, done))
74 |
75 | def replay(self):
76 | batch_size = 32
77 | if len(self.memory) < batch_size:
78 | return
79 |
80 | samples = random.sample(self.memory, batch_size)
81 | for state, action, reward, next_state, done in samples:
82 | target = reward
83 |
84 | if not done:
85 | target = reward + self.gamma * np.amax(self.brain.predict(next_state)[0]) # Predict the future/target value
86 | #print(target)
87 | Q_target_shape = self.brain.predict(state) # normal Q- Value prediction for the training-shape
88 | Q_target_shape[0][action] = target # replacing the best Q-Value with the target
89 | self.brain.fit(state, Q_target_shape, epochs=1, verbose=0) # training with the new Target value (loss = sum(Q_target-Q)exp2)
90 |
91 |
92 |
93 |
94 | def play(Ep, agent, status = "train"):
95 |
96 | learning_graph = []
97 | env = gym.make("CartPole-v1")
98 | env = wrappers.Monitor(env, "Saved_DQN_ER_Models/", resume=True, video_callable=lambda episode_id: episode_id%250==0)
99 | action_space = env.action_space.n
100 | state_space = env.observation_space.shape[0]
101 | if agent == None:
102 | agent = AI(state_space,action_space,memory_size = 5000,learning_rate = 0.001,gamma = 0.95) #2500 mem
103 | for ep in range(Ep):
104 | state = env.reset()
105 | state = np.reshape(state,[1,state_space])
106 | done = False
107 | score = 0
108 | agent.adapt_epsilon(ep) # Increasing the epsilon linear - adjustable to non linear, log,...
109 | while not done:
110 |
111 | if status == "play":
112 | env.render()
113 | action = agent.act(state, status)
114 | new_state, reward, done, _ = env.step(action)
115 | new_state = np.reshape(new_state,[1,state_space])
116 | agent.remember(state, action, reward, new_state, done)
117 | state = new_state
118 | score +=1
119 | if done:
120 | break
121 | print("Episode {}# Score: {}".format(ep, score + 1))
122 | if ep == 250 or ep % 500 == 0:
123 |             # save the model every 500 episodes for videos
124 | agent.save_learnings(str(ep)+","+str(score))
125 | agent.replay()
126 | learning_graph.append(score)
127 | return learning_graph, agent
128 |
129 | def main():
130 | Episodes = 4001 #4001
131 | graph,agent = play(Episodes,None)
132 | plt.plot(graph)
133 |     plt.xlabel("Episodes")
134 | plt.ylabel("Score")
135 | plt.show()
136 |
137 | print("Do you want to save the model?")
138 | answer = input("Y/N\n")
139 | if answer == "Y":
140 | name = input("give a name for the model: \n")
141 | agent.save_learnings(name)
142 | else:
143 | pass
144 |
145 |
146 |     print("Should the agent be tested?\n")
147 |     n = input("How many episodes should be played? ")
148 | x,y = play(int(n),agent,status = "play")
149 |
150 | if __name__ == "__main__":
151 | main()
152 |
--------------------------------------------------------------------------------
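The `replay()` method above builds its training target by copying the network's current prediction and overwriting only the taken action with `r + gamma * max_a' Q(s', a')` (or just `r` for terminal transitions). A small NumPy sketch of that target construction with made-up Q-value rows; the function name is illustrative.

```python
import numpy as np

def q_target_row(q_state, q_next_state, action, reward, done, gamma=0.95):
    """Copy the current prediction and replace only the taken action with the TD target."""
    target = reward if done else reward + gamma * np.max(q_next_state)
    y = q_state.copy()
    y[action] = target
    return y

# made-up Q-value rows for a 2-action problem
q_s  = np.array([0.2, 0.5])
q_s2 = np.array([0.1, 0.9])
print(q_target_row(q_s, q_s2, action=0, reward=1.0, done=False))  # -> [1.855, 0.5]
```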
/Deep Q_Learning/Img/4k Learning_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Deep Q_Learning/Img/4k Learning_curve.png
--------------------------------------------------------------------------------
/Deep Q_Learning/Img/Converging.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Deep Q_Learning/Img/Converging.png
--------------------------------------------------------------------------------
/Deep Q_Learning/README.md:
--------------------------------------------------------------------------------
1 | # Deep Q_Learning with Experience Replay playing Cart Pole
2 |
3 | [image1]: ./Img/Converging.png "Calculation Equation"
4 | [image2]: ./Img/Q_table10000.png "Calculation Equation"
5 |
6 |
7 | ### Exponential Epsilon:
8 |
9 |
10 |
11 | Learning curve after 4000 episodes with an exponentially increasing epsilon-greedy schedule
12 |
13 | ![alt text][image1]
14 |
15 |
16 |
17 |
18 |
19 | ### Youtube Video:
20 | [Deep Q-Network plays Cart Pole](https://www.youtube.com/watch?v=9g2ZLPs5Rs0)
21 |
22 |
23 |
--------------------------------------------------------------------------------
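The exponential epsilon schedule referenced above corresponds to `adapt_epsilon()` in DQN_Experience_Replay.py, where epsilon is the probability of acting greedily and grows from 0.5 towards roughly 0.99 by episode 4000. A tiny sketch of that schedule:

```python
# epsilon(ep) = 0.5 + (ep / 5714)**2, as used in adapt_epsilon()
epsilon_start = 0.5
schedule = [epsilon_start + (ep / 5714) ** 2 for ep in range(4001)]
print(schedule[0], round(schedule[2000], 3), round(schedule[4000], 3))  # 0.5 0.623 0.99
```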
/Double DQN/CNN_Double_DQN.py:
--------------------------------------------------------------------------------
1 | import math, random
2 | from collections import deque
3 | import cv2
4 |
5 | import gym
6 | from gym import wrappers
7 | import wrapper
8 | import numpy as np
9 |
10 | import torch
11 | import torch.nn as nn
12 | import torch.optim as optim
13 | import torch.autograd as autograd
14 | import torch.nn.functional as F
15 | from IPython.display import clear_output
16 |
17 | import matplotlib.pyplot as plt
18 |
19 | USE_CUDA = torch.cuda.is_available()
20 | Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs)
21 |
22 | class ReplayBuffer(object):
23 | def __init__(self, capacity):
24 | self.buffer = deque(maxlen=capacity)
25 |
26 | def push(self, state, action, reward, next_state, done):
27 | state = np.expand_dims(state, 0)
28 | next_state = np.expand_dims(next_state, 0)
29 |
30 | self.buffer.append((state, action, reward, next_state, done))
31 |
32 | def sample(self, batch_size):
33 | state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))
34 | return np.concatenate(state), action, reward, np.concatenate(next_state), done
35 |
36 | def __len__(self):
37 | return len(self.buffer)
38 |
39 | class CnnDQN(nn.Module):
40 | def __init__(self, input_shape, num_actions):
41 | super(CnnDQN, self).__init__()
42 |
43 | self.input_shape = input_shape
44 | self.num_actions = num_actions
45 |
46 | self.features = nn.Sequential(
47 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
48 | nn.ReLU(),
49 | nn.Conv2d(32, 64, kernel_size=4, stride=2),
50 | nn.ReLU(),
51 | nn.Conv2d(64, 64, kernel_size=3, stride=1),
52 | nn.ReLU()
53 | )
54 |
55 | self.fc = nn.Sequential(
56 | nn.Linear(self.feature_size(), 512),
57 | nn.ReLU(),
58 | nn.Linear(512, self.num_actions)
59 | )
60 |
61 | def forward(self, x):
62 | x = self.features(x)
63 | x = x.view(x.size(0), -1)
64 | x = self.fc(x)
65 | return x
66 |
67 | def feature_size(self):
68 | return self.features(autograd.Variable(torch.zeros(1, *self.input_shape))).view(1, -1).size(1)
69 |
70 | def act(self, state, epsilon,action_space):
71 | if random.random() > epsilon:
72 | with torch.no_grad():
73 | state = Variable(torch.FloatTensor(np.float32(state)).unsqueeze(0))
74 | q_value = self.forward(state)
75 | action = q_value.max(1)[1].data[0] #.max(1) gives the maxvalues--[0] and idx--[1]
76 | else:
77 | action = random.randrange(action_space)
78 | return action
79 |
80 | def update_target(current_model, target_model):
81 | target_model.load_state_dict(current_model.state_dict())
82 |
83 | def save_model(model, idx):
84 |     torch.save(model.state_dict(), "Saved_models/model_{}.pth".format(idx))  # save a checkpoint tagged with the current frame index
85 |
86 | def epsilon_by_frame(frame_idx):
87 | epsilon_start = 1.0
88 | epsilon_final = 0.01 #0.01
89 | epsilon_decay = 30000 #30000
90 | eps = epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)
91 | return eps
92 |
93 | def compute_td_loss(batch_size,current_model,target_model,opti,loss_func,gamma,replay_buffer):
94 | state, action, reward, next_state, done = replay_buffer.sample(batch_size)
95 | # shapes for normal image-- stacked (4,84,84) ...
96 | state = Variable(torch.FloatTensor(np.float32(state))) #shape (1,84,84)
97 | next_state = Variable(torch.FloatTensor(np.float32(next_state))) #shape (1,84,84)
98 | action = Variable(torch.LongTensor(action)) #shape [32] -- has to be long for gather function
99 | reward = Variable(torch.FloatTensor(reward)) #shape [32]
100 | done = Variable(torch.FloatTensor(done)) #shape [32]
101 |
102 | q_values = current_model(state) #shape [32,6]
103 | next_q_values = current_model(next_state) #shape [32,6]
104 | next_q_state_values = target_model(next_state) #shape [32,6]
105 |
106 | q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1) #shape [32] gathers q_values by the index of action
107 | next_q_value = next_q_state_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1) #shape [32] torch.max(nqv,1) gives the maxvalues--[0] and idx--[1]
108 | expected_q_value = reward + gamma * next_q_value * (1 - done) # shape [32]
109 |
110 |
111 | # DeepMind took nn.SmoothL1Loss()
112 | #loss = (q_value - Variable(expected_q_value.data)).pow(2).mean() #standard loss -- .data to get rid of grad_fn=
113 | loss = loss_func(q_value,Variable(expected_q_value.data))
114 |
115 | opti.zero_grad()
116 | loss.backward()
117 | opti.step()
118 | return loss
119 |
120 | def plot(frame_idx, rewards, losses):
121 | plt.close()
122 | plt.figure(figsize=(20,5))
123 | plt.subplot(121)
124 | plt.title("frames {}. reward: {}" .format(frame_idx, np.round(np.mean(rewards[-10:]),2)))
125 | plt.plot(rewards)
126 | plt.subplot(122)
127 | plt.title("loss")
128 | plt.plot(losses)
129 | plt.ylim(0,1)
130 | plt.draw()
131 | plt.pause(0.0001)
132 |
133 | def processing(img):
134 | img = np.expand_dims(cv2.resize(cv2.cvtColor(img, cv2.COLOR_RGB2GRAY), (84,84)),axis= 0)
135 | img = img.astype(np.uint8)
136 | #print(img.dtype)
137 | return img
138 |
139 | def main():
140 | plt.ion()
141 | env = wrapper.make_atari("PongNoFrameskip-v4", monitor=True,epidsode_capture=75)
142 | env = wrapper.wrap_deepmind(env,frame_stack=True, pytorch_img = True)
143 | action_space = env.action_space.n
144 | current_model = CnnDQN(env.observation_space.shape, action_space)#env.observation_space.shape
145 | target_model = CnnDQN(env.observation_space.shape, action_space)
146 |
147 | if USE_CUDA:
148 | current_model = current_model.cuda()
149 | target_model = target_model.cuda()
150 |
151 |     # DeepMind used optim.RMSprop for the original DQN
152 | #opti = optim.Adam(current_model.parameters(), lr=0.0001)
153 | opti = optim.RMSprop(current_model.parameters(), lr=0.0001)
154 | loss_func = nn.SmoothL1Loss()
155 |
156 | replay_initial = 10000
157 | replay_buffer = ReplayBuffer(100000)
158 |
159 | num_frames = 1000000
160 | batch_size = 32
161 | gamma = 0.99
162 |
163 | losses = []
164 | all_rewards = []
165 | episode_reward = 0
166 |
167 | state = env.reset() # shape normal:(1,84,84) -stacked (4,84,84)
168 |     # Manual stacking (alternative to the FrameStack wrapper)
169 | #state = processing(state)
170 | #state = np.stack((state,state,state,state),axis = 1).squeeze(0)
171 | #assert state.shape == (4,84,84)
172 | for frame_idx in range(1, num_frames + 1):
173 |
174 | epsilon = epsilon_by_frame(frame_idx)
175 | print("Training :: Frame {} :: Epsilon {} ".format(frame_idx, round(epsilon,2)))
176 | action = current_model.act(state, epsilon,action_space)
177 | next_state, reward, done, _ = env.step(action)
178 |         # Manual stacking
179 | #next_state = processing(next_state)
180 | #next_state = np.append(next_state, state[1:, :, :],axis= 0)
181 | #assert next_state.shape == (4,84,84)
182 | replay_buffer.push(state, action, reward, next_state, done)
183 |
184 | state = next_state
185 | episode_reward += reward
186 |
187 | if done:
188 | state = env.reset()
189 |             # Manual stacking
190 | #state = processing(state)
191 | #state = np.stack((state,state,state,state),axis = 1).squeeze(0)
192 | all_rewards.append(episode_reward)
193 | episode_reward = 0
194 |
195 | if len(replay_buffer) > replay_initial:
196 | loss = compute_td_loss(batch_size,current_model, target_model,opti,loss_func,gamma,replay_buffer)
197 | losses.append(loss.item())
198 |
199 | if frame_idx % 10000 == 0:
200 | plot(frame_idx, all_rewards, losses)
201 |
202 | if frame_idx % 1000 == 0:
203 | update_target(current_model, target_model)
204 |
205 | #if frame_idx % 100000 ==0:
206 | # save_model(current_model, frame_idx)
207 |
208 | if __name__ == "__main__":
209 | main()
--------------------------------------------------------------------------------
/Double DQN/Double_DQN.py:
--------------------------------------------------------------------------------
1 | import gym
2 | from gym import wrappers
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | import torch.optim as optim
7 | from torch.autograd import Variable
8 |
9 | import numpy as np
10 | from collections import deque
11 | import random
12 | import matplotlib.pyplot as plt
13 | import matplotlib.animation as animation
14 | from matplotlib import style
15 | import time
16 |
17 | class Network(nn.Module):
18 | def __init__(self, input_dim, output_dim):
19 | super(Network,self).__init__()
20 | self.linear1 = nn.Linear(input_dim, 40)
21 | self.linear2 = nn.Linear(40, 40)
22 | self.linear3 = nn.Linear(40, output_dim)
23 |
24 | def forward(self,x):
25 | x = self.linear1(x)
26 | x = F.relu(x)
27 | x = self.linear2(x)
28 | x = F.relu(x)
29 | out = self.linear3(x)
30 | return out
31 |
32 | class Agent:
33 | def __init__(self, state_size, action_size):
34 |
35 | self.state_size = state_size
36 | self.action_size = action_size
37 | self.memory = deque(maxlen=5000)
38 | self.gamma = 0.95 # discount rate
39 | self.epsilon = 0.4 # exploration rate
40 | self.epsilon_start = self.epsilon
41 | self.learning_rate = 0.001
42 | self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") #activate device
43 |
44 | # Our DQN and the Target Network
45 | self.model = Network(state_size, action_size).to(self.device)
46 | self.target_model = Network(state_size, action_size).to(self.device)
47 |
48 | self.criteria = nn.MSELoss()
49 | self.opt = optim.Adam(self.model.parameters(), lr=self.learning_rate)
50 |
51 | def remember(self, state, action, reward, next_state, done):
52 | self.memory.append((state, action, reward, next_state, done))
53 |
54 | def update_target(self):
55 | self.target_model.load_state_dict(self.model.state_dict())
56 |
57 | def adapt_epsilon(self,ep):
58 |         # Epsilon starts at epsilon_start and grows towards ~0.98 over the episodes (here epsilon is the probability of acting greedily):
59 | # linear: epsilon = 0.0001225*ep+self.epsilon_start
60 | # exponent (4000 eps): epsilon = self.epsilon_start + (ep/5714)**2
61 | if ep == 0:
62 | pass
63 | if self.epsilon < 0.98:
64 | self.epsilon = self.epsilon_start + (ep/3800)**2 #4500
65 |
66 | def act(self, state, status = "Train"):
67 | if status == "Play":
68 | self.epsilon = 0.95
69 | if np.random.rand() > self.epsilon:
70 | return random.randrange(self.action_size)
71 |
72 | act_values = self.model(Variable(torch.Tensor(state)).to(self.device)).cpu().data.numpy()
73 | return np.argmax(act_values[0])
74 |
75 | def give_epsilon(self):
76 | return self.epsilon
77 |
78 | def replay(self, batch_size):
79 | if len(self.memory) < batch_size:
80 | return
81 | minibatch = random.sample(self.memory, batch_size)
82 |
83 | for state, action, reward, next_state, done in minibatch:
84 | target = reward
85 | self.model.train()
86 | if not done:
87 | next_state_v = Variable(torch.Tensor(next_state))
88 | target = self.target_model(next_state_v.to(self.device)).cpu() # target has to be on cpu for numpy
89 | target = target.data.numpy()[0]
90 | target_actual = self.target_model(Variable(torch.Tensor(state)).to(self.device)).cpu().data.numpy()
91 | target_actual[0][action] = reward + self.gamma *np.amax(target)
92 |
93 | self.opt.zero_grad()
94 | out = self.model(Variable(torch.Tensor(state)).to(self.device))
95 | loss = self.criteria(out, Variable(torch.Tensor(target_actual)).to(self.device))
96 | loss.backward()
97 | self.opt.step()
98 |
99 |
100 |
101 |
102 | def play(Ep,agent, status = "train"):
103 | # for active plotting:
104 | learning_graph = []
105 | epsilons = []
106 | learning_graph_live = deque(maxlen = 180)
107 | epochs_live = deque(maxlen = 180)
108 | epsilons_live = deque(maxlen = 180)
109 |
110 | batch_size = 64
111 | env = gym.make("CartPole-v1")
112 | env = wrappers.Monitor(env, "Saved_Videos/", resume=True, video_callable=lambda episode_id: episode_id%250==0)
113 | action_space = env.action_space.n
114 | state_space = env.observation_space.shape[0]
115 | if agent == None:
116 | agent = Agent(state_space,action_space)
117 | for ep in range(Ep):
118 | state = env.reset()
119 | state = np.reshape(state,[1,state_space])
120 | done = False
121 | score = 0
122 |         agent.adapt_epsilon(ep) # increase epsilon over the episodes (schedule is adjustable: linear, quadratic, log, ...)
123 | while not done:
124 |
125 | if status == "play":
126 | env.render()
127 | action = agent.act(state, status)
128 | new_state, reward, done, _ = env.step(action)
129 | new_state = np.reshape(new_state,[1,state_space])
130 | agent.remember(state, action, reward, new_state, done)
131 | state = new_state
132 | score +=1
133 |
134 | if done:
135 | break
136 |
137 |
138 |
139 | print("Episode {}# Score: {}# Epsilon {}".format(ep, score + 1,agent.give_epsilon()))
140 | # Update Target Network
141 | if ep % 200 == 0:
142 | agent.update_target()
143 | print("Updated Target Network!")
144 | agent.replay(batch_size)
145 | # Live plot
146 | learning_graph.append(score)
147 | epsilons.append(agent.give_epsilon()*100)
148 | learning_graph_live.append(score)
149 | epochs_live.append(ep)
150 | epsilons_live.append(agent.give_epsilon()*100)
151 |
152 | plt.plot(epochs_live, learning_graph_live,"b")
153 | plt.plot(epochs_live, epsilons_live,"r")
154 | plt.xlabel("Epoch")
155 | plt.ylabel("Score / Epsilon")
156 | plt.title("Score Live Plot")
157 | plt.show()
158 | plt.pause(0.00000001)
159 | plt.clf()
160 |
161 | return learning_graph, epsilons, agent
162 |
163 | def main():
164 | Episodes = 4000 #4001
165 | graph,epsilons,agent = play(Episodes,None, "train")
166 | plt.plot(graph, "b")
167 | plt.plot(epsilons, "r")
168 | plt.xlabel("Episoden")
169 | plt.ylabel("Score / Epsilon")
170 | plt.show()
171 |
172 | print("Do you want to save the model?")
173 | answer = input("Y/N\n")
174 | if answer == "Y":
175 | name = input("give a name for the model: \n")
176 |         torch.save(agent.model.state_dict(), name + ".pth")  # the Agent class defines no save_learnings(), so save the online network weights directly
177 | else:
178 | pass
179 |
180 |
181 | print("Soll der Agent getestet werden?\n")
182 | n = input("Wie viele Episoden sollen gespielt werden?")
183 | x,y, ag = play(int(n),agent,status = "play")
184 |
185 | if __name__ == "__main__":
186 | fig = plt.figure()
187 | plt.ion()
188 | main()
189 |
--------------------------------------------------------------------------------
/Double DQN/Imgs/4000_40-40.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Double DQN/Imgs/4000_40-40.png
--------------------------------------------------------------------------------
/Double DQN/Imgs/CNN_pong_converge.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Double DQN/Imgs/CNN_pong_converge.png
--------------------------------------------------------------------------------
/Double DQN/Imgs/test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Double DQN/Imgs/test.png
--------------------------------------------------------------------------------
/Double DQN/README.md:
--------------------------------------------------------------------------------
1 | # Double Deep Q_Learning with Experience Replay playing Cart Pole
2 |
3 | [image1]: ./Imgs/test.png "Calculation Equation"
4 | [image2]: ./Imgs/Q_table10000.png "Calculation Equation"
5 | [image3]: ./Imgs/CNN_pong_converge.png
6 |
7 | The difference between DQN and Double DQN is that Double DQN decouples action selection from action evaluation: the online network picks the best action for the next state, while a separate target network evaluates that action's value. This reduces the overestimation of Q-values that plain DQN is prone to. A short sketch of both update targets is shown below.
8 | [Paper](https://arxiv.org/abs/1509.06461)
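
As a rough sketch (not the exact code of the scripts in this folder; the tensors below are placeholders), the two update targets differ only in how the action for the next state is chosen:

```python
import torch

batch_size, n_actions, gamma = 32, 6, 0.99
reward = torch.zeros(batch_size)
done = torch.zeros(batch_size)
online_q_next = torch.randn(batch_size, n_actions)   # stands in for model(next_state)
target_q_next = torch.randn(batch_size, n_actions)   # stands in for target_model(next_state)

# plain DQN: the target network both selects and evaluates the next action
dqn_target = reward + gamma * target_q_next.max(1)[0] * (1 - done)

# Double DQN: the online network selects the action, the target network evaluates it
best_action = online_q_next.argmax(1, keepdim=True)
ddqn_target = reward + gamma * target_q_next.gather(1, best_action).squeeze(1) * (1 - done)
```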
9 |
10 | ### Learning Curve:
11 |
12 | Learning curve after 4000 episodes with an exponential epsilon-greedy schedule
13 |
14 | ![alt text][image1]
15 |
16 |
17 |
18 |
19 |
20 | ### Youtube Video:
21 | [Deep Q-Network plays Cart Pole](https://www.youtube.com/watch?v=9g2ZLPs5Rs0)
22 |
23 | ## Training to play pong with an Double Deep Q CNN
24 | I trained a Double Deep Q-Network to play the Atari game Pong. After around 150,000 frames it converged and beat its opponent consistently. The convolutional network was trained purely on visual input: the frames were converted to grayscale and four consecutive frames were stacked so that the network can infer the ball's velocity, which would be much harder to estimate from a single frame. Training used experience replay, and every 1000 frames the target network was updated with the weights of the optimized online model. A minimal sketch of the frame preprocessing is shown below the learning curve.
25 |
26 | ![alt text][image3]
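
A minimal, hedged sketch of the preprocessing described above (grayscale, 84x84 resize, stack of the last four frames); the actual runs use the Atari wrappers in `wrapper.py`, so this is only an illustration:

```python
from collections import deque

import cv2
import numpy as np

def preprocess(frame):
    # RGB frame -> grayscale, resized to 84x84 (uint8)
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    return cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)

frame_stack = deque(maxlen=4)  # rolling window over the last four preprocessed frames

def stacked_observation(frame):
    processed = preprocess(frame)
    if not frame_stack:
        # at episode start, fill the window by repeating the first frame
        frame_stack.extend([processed] * 4)
    else:
        frame_stack.append(processed)
    return np.stack(list(frame_stack), axis=0)  # shape (4, 84, 84), the CNN input
```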
27 |
28 | ### Youtube Video:
29 | [Double Deep Q Network learns to play Pong](https://www.youtube.com/watch?v=I3dTyg_5rFc)
30 |
--------------------------------------------------------------------------------
/Double DQN/wrapper.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from collections import deque
3 | import gym
4 |
5 | from gym import spaces,wrappers
6 | import cv2
7 | cv2.ocl.setUseOpenCL(False)
8 |
9 | class NoopResetEnv(gym.Wrapper):
10 | def __init__(self, env, noop_max=30):
11 | """Sample initial states by taking random number of no-ops on reset.
12 | No-op is assumed to be action 0.
13 | """
14 | gym.Wrapper.__init__(self, env)
15 | self.noop_max = noop_max
16 | self.override_num_noops = None
17 | self.noop_action = 0
18 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
19 |
20 | def reset(self, **kwargs):
21 | """ Do no-op action for a number of steps in [1, noop_max]."""
22 | self.env.reset(**kwargs)
23 | if self.override_num_noops is not None:
24 | noops = self.override_num_noops
25 | else:
26 | noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101
27 | assert noops > 0
28 | obs = None
29 | for _ in range(noops):
30 | obs, _, done, _ = self.env.step(self.noop_action)
31 | if done:
32 | obs = self.env.reset(**kwargs)
33 | return obs
34 |
35 | def step(self, ac):
36 | return self.env.step(ac)
37 |
38 | class FireResetEnv(gym.Wrapper):
39 | def __init__(self, env):
40 | """Take action on reset for environments that are fixed until firing."""
41 | gym.Wrapper.__init__(self, env)
42 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
43 | assert len(env.unwrapped.get_action_meanings()) >= 3
44 |
45 | def reset(self, **kwargs):
46 | self.env.reset(**kwargs)
47 | obs, _, done, _ = self.env.step(1)
48 | if done:
49 | self.env.reset(**kwargs)
50 | obs, _, done, _ = self.env.step(2)
51 | if done:
52 | self.env.reset(**kwargs)
53 | return obs
54 |
55 | def step(self, ac):
56 | return self.env.step(ac)
57 |
58 | class EpisodicLifeEnv(gym.Wrapper):
59 | def __init__(self, env):
60 | """Make end-of-life == end-of-episode, but only reset on true game over.
61 | Done by DeepMind for the DQN and co. since it helps value estimation.
62 | """
63 | gym.Wrapper.__init__(self, env)
64 | self.lives = 0
65 | self.was_real_done = True
66 |
67 | def step(self, action):
68 | obs, reward, done, info = self.env.step(action)
69 | self.was_real_done = done
70 | # check current lives, make loss of life terminal,
71 | # then update lives to handle bonus lives
72 | lives = self.env.unwrapped.ale.lives()
73 | if lives < self.lives and lives > 0:
74 |             # for Qbert we sometimes stay in the lives == 0 condition for a few frames,
75 |             # so it's important to keep lives > 0, so that we only reset once
76 | # the environment advertises done.
77 | done = True
78 | self.lives = lives
79 | return obs, reward, done, info
80 |
81 | def reset(self, **kwargs):
82 | """Reset only when lives are exhausted.
83 | This way all states are still reachable even though lives are episodic,
84 | and the learner need not know about any of this behind-the-scenes.
85 | """
86 | if self.was_real_done:
87 | obs = self.env.reset(**kwargs)
88 | else:
89 | # no-op step to advance from terminal/lost life state
90 | obs, _, _, _ = self.env.step(0)
91 | self.lives = self.env.unwrapped.ale.lives()
92 | return obs
93 |
94 | class MaxAndSkipEnv(gym.Wrapper):
95 | def __init__(self, env, skip=4):
96 | """Return only every `skip`-th frame"""
97 | gym.Wrapper.__init__(self, env)
98 | # most recent raw observations (for max pooling across time steps)
99 | self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8)
100 | self._skip = skip
101 |
102 | def reset(self):
103 | return self.env.reset()
104 |
105 | def step(self, action):
106 | """Repeat action, sum reward, and max over last observations."""
107 | total_reward = 0.0
108 | done = None
109 | for i in range(self._skip):
110 | obs, reward, done, info = self.env.step(action)
111 | if i == self._skip - 2: self._obs_buffer[0] = obs
112 | if i == self._skip - 1: self._obs_buffer[1] = obs
113 | total_reward += reward
114 | if done:
115 | break
116 | # Note that the observation on the done=True frame
117 | # doesn't matter
118 | max_frame = self._obs_buffer.max(axis=0)
119 |
120 | return max_frame, total_reward, done, info
121 |
122 | def reset(self, **kwargs):
123 | return self.env.reset(**kwargs)
124 |
125 | class ClipRewardEnv(gym.RewardWrapper):
126 | def __init__(self, env):
127 | gym.RewardWrapper.__init__(self, env)
128 |
129 | def reward(self, reward):
130 | """Bin reward to {+1, 0, -1} by its sign."""
131 | return np.sign(reward)
132 |
133 | class WarpFrame(gym.ObservationWrapper):
134 | def __init__(self, env):
135 | """Warp frames to 84x84 as done in the Nature paper and later work."""
136 | gym.ObservationWrapper.__init__(self, env)
137 | self.width = 84
138 | self.height = 84
139 | self.observation_space = spaces.Box(low=0, high=255,
140 | shape=(self.height, self.width, 1), dtype=np.uint8)
141 |
142 | def observation(self, frame):
143 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
144 | frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
145 | return frame[:, :, None]
146 |
147 | class FrameStack(gym.Wrapper):
148 | def __init__(self, env, k):
149 | """Stack k last frames.
150 |
151 | Returns lazy array, which is much more memory efficient.
152 |
153 | See Also
154 | --------
155 | baselines.common.atari_wrappers.LazyFrames
156 | """
157 | gym.Wrapper.__init__(self, env)
158 | self.k = k
159 | self.frames = deque([], maxlen=k)
160 | shp = env.observation_space.shape
161 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8)
162 |
163 | def reset(self):
164 | ob = self.env.reset()
165 | for _ in range(self.k):
166 | self.frames.append(ob)
167 | return self._get_ob()
168 |
169 | def step(self, action):
170 | ob, reward, done, info = self.env.step(action)
171 | self.frames.append(ob)
172 | return self._get_ob(), reward, done, info
173 |
174 | def _get_ob(self):
175 | assert len(self.frames) == self.k
176 | return LazyFrames(list(self.frames))
177 |
178 | class ScaledFloatFrame(gym.ObservationWrapper):
179 | def __init__(self, env):
180 | gym.ObservationWrapper.__init__(self, env)
181 |
182 | def observation(self, observation):
183 | # careful! This undoes the memory optimization, use
184 | # with smaller replay buffers only.
185 | return np.array(observation).astype(np.float32) / 255.0
186 |
187 | class LazyFrames(object):
188 | def __init__(self, frames):
189 | """This object ensures that common frames between the observations are only stored once.
190 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
191 | buffers.
192 |
193 | This object should only be converted to numpy array before being passed to the model.
194 |
195 | You'd not believe how complex the previous solution was."""
196 | self._frames = frames
197 | self._out = None
198 |
199 | def _force(self):
200 | if self._out is None:
201 | self._out = np.concatenate(self._frames, axis=2)
202 | self._frames = None
203 | return self._out
204 |
205 | def __array__(self, dtype=None):
206 | out = self._force()
207 | if dtype is not None:
208 | out = out.astype(dtype)
209 | return out
210 |
211 | def __len__(self):
212 | return len(self._force())
213 |
214 | def __getitem__(self, i):
215 | return self._force()[i]
216 |
217 | # EDIT BY ATAMAI
218 | # Preparing image received from environment and adjust it to expected format of Pytorch
219 | # HWC (height x width x channel) becomes CHW
220 | class PytorchImage(gym.ObservationWrapper):
221 | def __init__(self, env):
222 | super(PytorchImage, self).__init__(env)
223 | # we check current shape of observations in environment
224 | current_shape = self.observation_space.shape
225 | # we change order of dimensions - so last one (-1) becomes first
226 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(current_shape[-1], current_shape[0], current_shape[1]))
227 |
228 | def observation(self, observation):
229 | # and finally we change order of dimensions for every single observation
230 | # here transpose method could be also used
231 | return np.swapaxes(observation, 2, 0)
232 |
233 | def make_atari(env_id, monitor = False, epidsode_capture = 75):
234 | env = gym.make(env_id)
235 | if monitor == True:
236 | env = wrappers.Monitor(env, "Videos/", resume=True, force =True, video_callable=lambda episode_id: episode_id%epidsode_capture==0)
237 | assert 'NoFrameskip' in env.spec.id
238 | env = NoopResetEnv(env, noop_max=30)
239 | env = MaxAndSkipEnv(env, skip=4)
240 | return env
241 |
242 | def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False, pytorch_img=False):
243 | """Configure environment for DeepMind-style Atari.
244 | """
245 | if episode_life:
246 | env = EpisodicLifeEnv(env)
247 | if 'FIRE' in env.unwrapped.get_action_meanings():
248 | env = FireResetEnv(env)
249 | env = WarpFrame(env)
250 | if scale:
251 | env = ScaledFloatFrame(env)
252 | if clip_rewards:
253 | env = ClipRewardEnv(env)
254 | if frame_stack:
255 | env = FrameStack(env, 4)
256 | if pytorch_img:
257 | env = PytorchImage(env)
258 | return env
259 |
260 |
--------------------------------------------------------------------------------
/Dueling Deep Q-Network/CNN_Dueling_DDQN.py:
--------------------------------------------------------------------------------
1 | import math, random
2 | from collections import deque
3 | import cv2
4 |
5 | import gym
6 | from gym import wrappers
7 | import wrapper
8 | import numpy as np
9 |
10 | import torch
11 | import torch.nn as nn
12 | import torch.optim as optim
13 | import torch.autograd as autograd
14 | import torch.nn.functional as F
15 | from IPython.display import clear_output
16 |
17 | import matplotlib.pyplot as plt
18 |
19 | USE_CUDA = torch.cuda.is_available()
20 | Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs)
21 |
22 | class ReplayBuffer(object):
23 | def __init__(self, capacity):
24 | self.buffer = deque(maxlen=capacity)
25 |
26 | def push(self, state, action, reward, next_state, done):
27 | state = np.expand_dims(state, 0)
28 | next_state = np.expand_dims(next_state, 0)
29 |
30 | self.buffer.append((state, action, reward, next_state, done))
31 |
32 | def sample(self, batch_size):
33 | state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))
34 | return np.concatenate(state), action, reward, np.concatenate(next_state), done
35 |
36 | def __len__(self):
37 | return len(self.buffer)
38 |
39 | class CnnDQN(nn.Module):
40 | def __init__(self, input_shape, num_actions):
41 | super(CnnDQN, self).__init__()
42 |
43 | self.input_shape = input_shape
44 | self.num_actions = num_actions
45 |
46 | self.convolutional_layers = nn.Sequential(
47 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
48 | nn.ReLU(),
49 | nn.Conv2d(32, 64, kernel_size=4, stride=2),
50 | nn.ReLU(),
51 | nn.Conv2d(64, 64, kernel_size=3, stride=1),
52 | nn.ReLU()
53 | )
54 |
55 | self.value_layer = nn.Sequential(
56 | nn.Linear(self.feature_size(), 512),
57 | nn.ReLU(),
58 | nn.Linear(512, 1)
59 | )
60 | self.advantage_layer = nn.Sequential(
61 | nn.Linear(self.feature_size(), 512),
62 | nn.ReLU(),
63 | nn.Linear(512, self.num_actions)
64 | )
65 |
66 | def forward(self, x):
67 | x = self.convolutional_layers(x)
68 | x = x.view(x.size(0), -1)
69 | value = self.value_layer(x) # shape [1,1]
70 | value = value.expand(x.size(0), self.num_actions) # shape [1,6]
71 | advantage = self.advantage_layer(x) #shape [1,6]
72 | advantage_mean = advantage.mean(1)#shape [1]
73 | advantage_mean = advantage_mean.unsqueeze(1) #shape[1,1]
74 | advantage_mean = advantage_mean.expand(x.size(0), self.num_actions) #shape [1,6]
75 |         Q = value + advantage - advantage_mean  # dueling aggregation: Q(s,a) = V(s) + A(s,a) - mean_a A(s,a)
76 | #print("Q-Values: ",Q)
77 | return Q
78 |
79 | def feature_size(self):
80 | #Calculate the output size of the CNN
81 | return self.convolutional_layers(autograd.Variable(torch.zeros(1, *self.input_shape))).view(1, -1).size(1)
82 |
83 | def act(self, state, epsilon,action_space):
84 | if random.random() > epsilon:
85 | with torch.no_grad():
86 | state = Variable(torch.FloatTensor(np.float32(state)).unsqueeze(0))
87 | q_value = self.forward(state)
88 | action = q_value.max(1)[1].data[0] #.max(1) maxdata: values--[0] and idx--[1]
89 | else:
90 | action = random.randrange(action_space)
91 | return action
92 |
93 | def update_target(current_model, target_model):
94 | target_model.load_state_dict(current_model.state_dict())
95 |
96 | def save_model(model, idx):
97 |     torch.save(model.state_dict(), "Saved_models/model_{}.pth".format(idx))  # save a checkpoint tagged with the current frame index
98 |
99 | def epsilon_by_frame(frame_idx):
100 | epsilon_start = 1.0
101 | epsilon_final = 0.01 #0.01
102 | epsilon_decay = 30000 #30000
103 | eps = epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)
104 | return eps
105 |
106 | def compute_td_loss(batch_size,current_model,target_model,opti,loss_func,gamma,replay_buffer):
107 | state, action, reward, next_state, done = replay_buffer.sample(batch_size)
108 | # shapes for normal image-- stacked (4,84,84) ...
109 | state = Variable(torch.FloatTensor(np.float32(state))) #shape (1,84,84)
110 | next_state = Variable(torch.FloatTensor(np.float32(next_state))) #shape (1,84,84)
111 | action = Variable(torch.LongTensor(action)) #shape [32] -- has to be long for gather function
112 | reward = Variable(torch.FloatTensor(reward)) #shape [32]
113 | done = Variable(torch.FloatTensor(done)) #shape [32]
114 |
115 | q_values = current_model(state) #shape [32,6]
116 | next_q_values = current_model(next_state) #shape [32,6]
117 | next_q_state_values = target_model(next_state) #shape [32,6]
118 |
119 | q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1) #shape [32] gathers q_values by the index of action
120 | next_q_value = next_q_state_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1) #shape [32] torch.max(nqv,1) gives the maxvalues--[0] and idx--[1]
121 | expected_q_value = reward + gamma * next_q_value * (1 - done) # shape [32]
122 |
123 |
124 | # DeepMind took nn.SmoothL1Loss()
125 | #loss = (q_value - Variable(expected_q_value.data)).pow(2).mean() #standard loss -- .data to get rid of grad_fn=
126 | loss = loss_func(q_value,Variable(expected_q_value.data))
127 |
128 | opti.zero_grad()
129 | loss.backward()
130 | opti.step()
131 | return loss
132 |
133 | def plot(frame_idx, rewards, losses):
134 | plt.close()
135 | plt.figure(figsize=(20,5))
136 | plt.subplot(121)
137 | plt.title("frames {}. reward: {}" .format(frame_idx, np.round(np.mean(rewards[-10:]),2)))
138 | plt.plot(rewards)
139 | plt.subplot(122)
140 | plt.title("loss")
141 | plt.plot(losses)
142 | plt.ylim(0,1)
143 | plt.draw()
144 | plt.pause(0.0001)
145 |
146 | def processing(img):
147 | img = np.expand_dims(cv2.resize(cv2.cvtColor(img, cv2.COLOR_RGB2GRAY), (84,84)),axis= 0)
148 | img = img.astype(np.uint8)
149 | #print(img.dtype)
150 | return img
151 |
152 | def main():
153 | plt.ion()
154 | env = wrapper.make_atari("RiverraidNoFrameskip-v4", monitor=True,epidsode_capture=50)
155 | env = wrapper.wrap_deepmind(env,frame_stack=True, pytorch_img = True)
156 | action_space = env.action_space.n
157 | current_model = CnnDQN(env.observation_space.shape, action_space)#env.observation_space.shape
158 | target_model = CnnDQN(env.observation_space.shape, action_space)
159 |
160 | if USE_CUDA:
161 | current_model = current_model.cuda()
162 | target_model = target_model.cuda()
163 |
164 |     # DeepMind used optim.RMSprop for the original DQN
165 | #opti = optim.Adam(current_model.parameters(), lr=0.0001)
166 | opti = optim.RMSprop(current_model.parameters(), lr=0.0001)
167 | loss_func = nn.SmoothL1Loss()
168 |
169 | replay_initial = 10000
170 | replay_buffer = ReplayBuffer(100000)
171 |
172 | num_frames = 1000000
173 | batch_size = 32
174 | gamma = 0.99
175 |
176 | losses = []
177 | all_rewards = []
178 | episode_reward = 0
179 |
180 | state = env.reset() # shape normal:(1,84,84) -stacked (4,84,84)
181 |     # Manual stacking (alternative to the FrameStack wrapper)
182 | #state = processing(state)
183 | #state = np.stack((state,state,state,state),axis = 1).squeeze(0)
184 | #assert state.shape == (4,84,84)
185 | for frame_idx in range(1, num_frames + 1):
186 |
187 | epsilon = epsilon_by_frame(frame_idx)
188 | print("Training :: Frame {} :: Epsilon {} ".format(frame_idx, round(epsilon,2)))
189 | action = current_model.act(state, epsilon,action_space)
190 | next_state, reward, done, _ = env.step(action)
191 |         # Manual stacking
192 | #next_state = processing(next_state)
193 | #next_state = np.append(next_state, state[1:, :, :],axis= 0)
194 | #assert next_state.shape == (4,84,84)
195 | replay_buffer.push(state, action, reward, next_state, done)
196 |
197 | state = next_state
198 | episode_reward += reward
199 |
200 | if done:
201 | state = env.reset()
202 |             # Manual stacking
203 | #state = processing(state)
204 | #state = np.stack((state,state,state,state),axis = 1).squeeze(0)
205 | all_rewards.append(episode_reward)
206 | episode_reward = 0
207 |
208 | if len(replay_buffer) > replay_initial:
209 | loss = compute_td_loss(batch_size,current_model, target_model,opti,loss_func,gamma,replay_buffer)
210 | losses.append(loss.item())
211 |
212 | if frame_idx % 10000 == 0:
213 | plot(frame_idx, all_rewards, losses)
214 |
215 | if frame_idx % 1000 == 0:
216 | update_target(current_model, target_model)
217 |
218 | #if frame_idx % 100000 ==0:
219 | # save_model(current_model, frame_idx)
220 |
221 | if __name__ == "__main__":
222 | main()
--------------------------------------------------------------------------------
/Dueling Deep Q-Network/CNN_Dueling_DDQN_PER.py:
--------------------------------------------------------------------------------
1 | import math, random
2 | from collections import deque
3 | import cv2
4 |
5 | import gym
6 | from gym import wrappers
7 | import wrapper
8 | import numpy as np
9 |
10 | import torch
11 | import torch.nn as nn
12 | import torch.optim as optim
13 | import torch.autograd as autograd
14 | import torch.nn.functional as F
15 | from IPython.display import clear_output
16 |
17 | import matplotlib.pyplot as plt
18 | from PrioritizedExperienceReplay import PrioritizedReplay
19 |
20 | USE_CUDA = torch.cuda.is_available()
21 | Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs)
22 |
23 | class CnnDQN(nn.Module):
24 | def __init__(self, input_shape, num_actions):
25 | super(CnnDQN, self).__init__()
26 |
27 | self.input_shape = input_shape
28 | self.num_actions = num_actions
29 |
30 | self.convolutional_layers = nn.Sequential(
31 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
32 | nn.ReLU(),
33 | nn.Conv2d(32, 64, kernel_size=4, stride=2),
34 | nn.ReLU(),
35 | nn.Conv2d(64, 64, kernel_size=3, stride=1),
36 | nn.ReLU()
37 | )
38 |
39 | self.value_layer = nn.Sequential(
40 | nn.Linear(self.feature_size(), 512),
41 | nn.ReLU(),
42 | nn.Linear(512, 1)
43 | )
44 | self.advantage_layer = nn.Sequential(
45 | nn.Linear(self.feature_size(), 512),
46 | nn.ReLU(),
47 | nn.Linear(512, self.num_actions)
48 | )
49 |
50 | def forward(self, x):
51 | x = self.convolutional_layers(x)
52 | x = x.view(x.size(0), -1)
53 | value = self.value_layer(x) # shape [1,1]
54 | value = value.expand(x.size(0), self.num_actions) # shape [1,6]
55 | advantage = self.advantage_layer(x) #shape [1,6]
56 | advantage_mean = advantage.mean(1)#shape [1]
57 | advantage_mean = advantage_mean.unsqueeze(1) #shape[1,1]
58 | advantage_mean = advantage_mean.expand(x.size(0), self.num_actions) #shape [1,6]
59 |         Q = value + advantage - advantage_mean  # dueling aggregation: Q(s,a) = V(s) + A(s,a) - mean_a A(s,a)
60 | #print("Q-Values: ",Q)
61 | return Q
62 |
63 | def feature_size(self):
64 | #Calculate the output size of the CNN
65 | return self.convolutional_layers(autograd.Variable(torch.zeros(1, *self.input_shape))).view(1, -1).size(1)
66 |
67 | def act(self, state, epsilon,action_space):
68 | if random.random() > epsilon:
69 | with torch.no_grad():
70 | state = Variable(torch.FloatTensor(np.float32(state)).unsqueeze(0))
71 | q_value = self.forward(state)
72 | action = q_value.max(1)[1].data[0] #.max(1) maxdata: values--[0] and idx--[1]
73 | else:
74 | action = random.randrange(action_space)
75 | return action
76 |
77 | def update_target(current_model, target_model):
78 | target_model.load_state_dict(current_model.state_dict())
79 |
80 | def save_model(model, idx):
81 |     torch.save(model.state_dict(), "Saved_models/model_{}.pth".format(idx))  # save a checkpoint tagged with the current frame index
82 |
83 | def epsilon_by_frame(frame_idx):
84 | epsilon_start = 1.0
85 | epsilon_final = 0.01 #0.01
86 | epsilon_decay = 30000 #30000
87 | eps = epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)
88 | return eps
89 |
90 | def compute_td_loss(batch_size,current_model,target_model,opti,loss_func,gamma,PER):
91 | state, action, reward, next_state, done,idx,weights = PER.sample(batch_size)
92 | # shapes for normal image-- stacked (4,84,84) ...
93 | state = Variable(torch.FloatTensor(np.float32(state))) #shape (1,84,84)
94 | next_state = Variable(torch.FloatTensor(np.float32(next_state))) #shape (1,84,84)
95 | action = Variable(torch.LongTensor(action)) #shape [32] -- has to be long for gather function
96 | reward = Variable(torch.FloatTensor(reward)) #shape [32]
97 | done = Variable(torch.FloatTensor(done)) #shape [32]
98 | weights = Variable(torch.FloatTensor(weights)) #shape [32]
99 |
100 | q_values = current_model(state) #shape [32,6]
101 | next_q_values = current_model(next_state) #shape [32,6]
102 | next_q_state_values = target_model(next_state) #shape [32,6]
103 |
104 | q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1) #shape [32] gathers q_values by the index of action
105 | next_q_value = next_q_state_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1) #shape [32] torch.max(nqv,1) gives the maxvalues--[0] and idx--[1]
106 | expected_q_value = reward + gamma * next_q_value * (1 - done) # shape [32]
107 |
108 |
109 | # DeepMind took nn.SmoothL1Loss()
110 | #loss = (q_value - Variable(expected_q_value.data)).pow(2).mean() #standard loss -- .data to get rid of grad_fn=
111 |     loss = loss_func(q_value, Variable(expected_q_value.data)) * weights  # per-sample TD loss scaled by the importance-sampling weights
112 | prios = loss + 1e-5
113 | loss = loss.mean()
114 |
115 |
116 | opti.zero_grad()
117 | loss.backward()
118 | PER.update_priorities(idx, prios.data.cpu().numpy())
119 | opti.step()
120 | return loss
121 |
122 | def plot(frame_idx, rewards, losses):
123 | plt.close()
124 | plt.figure(figsize=(20,5))
125 | plt.subplot(121)
126 | plt.title("frames {}. reward: {}" .format(frame_idx, np.round(np.mean(rewards[-10:]),2)))
127 | plt.plot(rewards)
128 | plt.subplot(122)
129 | plt.title("loss")
130 | plt.plot(losses)
131 | plt.ylim(0,1)
132 | plt.draw()
133 | plt.pause(0.0001)
134 |
135 |
136 | def main():
137 | plt.ion()
138 | env = wrapper.make_atari("BreakoutNoFrameskip-v4", monitor=True,epidsode_capture=50)#Riverraid Frostbite Enduro
139 | env = wrapper.wrap_deepmind(env,frame_stack=True, pytorch_img = True)
140 | action_space = env.action_space.n
141 | current_model = CnnDQN(env.observation_space.shape, action_space)#env.observation_space.shape
142 | target_model = CnnDQN(env.observation_space.shape, action_space)
143 |
144 | if USE_CUDA:
145 | current_model = current_model.cuda()
146 | target_model = target_model.cuda()
147 |
148 |     # DeepMind used optim.RMSprop for the original DQN
149 | #opti = optim.Adam(current_model.parameters(), lr=0.0001)
150 | opti = optim.RMSprop(current_model.parameters(), lr=0.0001)
151 |     loss_func = nn.SmoothL1Loss(reduction="none")  # per-sample losses are needed for the PER weights and priorities
152 |
153 | replay_initial = 10000
154 | PER = PrioritizedReplay(100000,alpha = 0.6,beta_start =0.4,beta_frames=1000000)
155 |
156 | num_frames = 1000000
157 | batch_size = 32
158 | gamma = 0.99
159 |
160 | losses = []
161 | all_rewards = []
162 | episode_reward = 0
163 |
164 | state = env.reset() # shape normal:(1,84,84) -stacked (4,84,84)
165 |     # Manual stacking (alternative to the FrameStack wrapper)
166 | #state = processing(state)
167 | #state = np.stack((state,state,state,state),axis = 1).squeeze(0)
168 | #assert state.shape == (4,84,84)
169 | for frame_idx in range(1, num_frames + 1):
170 |
171 | epsilon = epsilon_by_frame(frame_idx)
172 | action = current_model.act(state, epsilon,action_space)
173 | next_state, reward, done, _ = env.step(action)
174 | print("Training :: Frame {} :: Epsilon {} :: Reward {} ".format(frame_idx, round(epsilon,2),reward))
175 |         # Manual stacking
176 | #next_state = processing(next_state)
177 | #next_state = np.append(next_state, state[1:, :, :],axis= 0)
178 | #assert next_state.shape == (4,84,84)
179 | PER.push(state, action, reward, next_state, done)
180 |
181 | state = next_state
182 | episode_reward += reward
183 |
184 | if done:
185 | state = env.reset()
186 |
187 | all_rewards.append(episode_reward)
188 | episode_reward = 0
189 |
190 | if PER.__len__() > replay_initial:
191 | loss = compute_td_loss(batch_size,current_model, target_model,opti,loss_func,gamma,PER)
192 | losses.append(loss.item())
193 |
194 | if frame_idx % 10000 == 0:
195 | plot(frame_idx, all_rewards, losses)
196 |
197 | if frame_idx % 1000 == 0:
198 | update_target(current_model, target_model)
199 |
200 |
201 | if __name__ == "__main__":
202 | main()
203 |
--------------------------------------------------------------------------------
/Dueling Deep Q-Network/Img/Duel_per.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Dueling Deep Q-Network/Img/Duel_per.png
--------------------------------------------------------------------------------
/Dueling Deep Q-Network/Img/Dueling_DQN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Dueling Deep Q-Network/Img/Dueling_DQN.png
--------------------------------------------------------------------------------
/Dueling Deep Q-Network/PrioritizedExperienceReplay.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | class PrioritizedReplay(object):
4 | def __init__(self, capacity, alpha=0.6,beta_start = 0.4,beta_frames=100000):
5 | self.alpha = alpha
6 | self.beta_start = beta_start
7 | self.beta_frames = beta_frames
8 | self.frame = 1 #for beta calculation
9 | self.capacity = capacity
10 | self.buffer = []
11 | self.pos = 0
12 | self.priorities = np.zeros((capacity,), dtype=np.float32)
13 |
14 | def beta_by_frame(self, frame_idx):
15 | return min(1.0, self.beta_start + frame_idx * (1.0 - self.beta_start) / self.beta_frames)
16 |
17 | def push(self, state, action, reward, next_state, done):
18 | assert state.ndim == next_state.ndim
19 | state = np.expand_dims(state, 0)
20 | next_state = np.expand_dims(next_state, 0)
21 |
22 | max_prio = self.priorities.max() if self.buffer else 1.0 # gives max priority if buffer is not empty else 1
23 |
24 | if len(self.buffer) < self.capacity:
25 | self.buffer.append((state, action, reward, next_state, done))
26 | else:
27 |             # overwrite the entry at self.pos, which cycles through the buffer and therefore
28 |             # points at the oldest stored transition once the buffer is full
29 | self.buffer[self.pos] = (state, action, reward, next_state, done)
30 |
31 | self.priorities[self.pos] = max_prio
32 |         self.pos = (self.pos + 1) % self.capacity # wrap pos around so it always stays within [0, capacity)
33 |
34 | def sample(self, batch_size):
35 | N = len(self.buffer)
36 | if N == self.capacity:
37 | prios = self.priorities
38 | else:
39 | prios = self.priorities[:self.pos]
40 | # calc P = p^a/sum(p^a)
41 | probs = prios ** self.alpha
42 | P = probs/probs.sum()
43 |
44 | indices = np.random.choice(N, batch_size, p=P) # gets the indices depending on the probability p
45 | samples = [self.buffer[idx] for idx in indices]
46 |
47 | beta = self.beta_by_frame(self.frame)
48 | self.frame+=1
49 |
50 |         # min over ALL probabilities (not just the sampled ones) gives the largest possible weight
51 |         P_min = P.min()
52 |         max_weight = (P_min*N)**(-beta)
53 | 
54 |         # importance-sampling weights: w_i = (N * P(i))^(-beta), normalized by the maximum weight
55 |         weights  = (N * P[indices]) ** (-beta)
56 |         weights /= max_weight # normalize so the weights lie in (0, 1]
57 |         weights = np.array(weights, dtype=np.float32) #torch.tensor(weights, device=device, dtype=torch.float)
58 |
59 | #print("Sample-shape befor zipping: ", samples)
60 | states, actions, rewards, next_states, dones = zip(*samples) # example: p = [[1,2,3],[4,5,6]] ,d=zip(*p) -> d = [(1, 4), (2, 5), (3, 6)]
61 | return np.concatenate(states), actions, rewards, np.concatenate(next_states), dones, indices, weights
62 |
63 | def update_priorities(self, batch_indices, batch_priorities):
64 | for idx, prio in zip(batch_indices, batch_priorities):
65 | self.priorities[idx] = prio
66 |
67 | def __len__(self):
68 | return len(self.buffer)
69 |
--------------------------------------------------------------------------------
/Dueling Deep Q-Network/Video/Breakout.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Dueling Deep Q-Network/Video/Breakout.mp4
--------------------------------------------------------------------------------
/Dueling Deep Q-Network/Video/Pong.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Dueling Deep Q-Network/Video/Pong.mp4
--------------------------------------------------------------------------------
/Dueling Deep Q-Network/wrapper.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from collections import deque
3 | import gym
4 |
5 | from gym import spaces,wrappers
6 | import cv2
7 | cv2.ocl.setUseOpenCL(False)
8 |
9 | class NoopResetEnv(gym.Wrapper):
10 | def __init__(self, env, noop_max=30):
11 | """Sample initial states by taking random number of no-ops on reset.
12 | No-op is assumed to be action 0.
13 | """
14 | gym.Wrapper.__init__(self, env)
15 | self.noop_max = noop_max
16 | self.override_num_noops = None
17 | self.noop_action = 0
18 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
19 |
20 | def reset(self, **kwargs):
21 | """ Do no-op action for a number of steps in [1, noop_max]."""
22 | self.env.reset(**kwargs)
23 | if self.override_num_noops is not None:
24 | noops = self.override_num_noops
25 | else:
26 | noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101
27 | assert noops > 0
28 | obs = None
29 | for _ in range(noops):
30 | obs, _, done, _ = self.env.step(self.noop_action)
31 | if done:
32 | obs = self.env.reset(**kwargs)
33 | return obs
34 |
35 | def step(self, ac):
36 | return self.env.step(ac)
37 |
38 | class FireResetEnv(gym.Wrapper):
39 | def __init__(self, env):
40 | """Take action on reset for environments that are fixed until firing."""
41 | gym.Wrapper.__init__(self, env)
42 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
43 | assert len(env.unwrapped.get_action_meanings()) >= 3
44 |
45 | def reset(self, **kwargs):
46 | self.env.reset(**kwargs)
47 | obs, _, done, _ = self.env.step(1)
48 | if done:
49 | self.env.reset(**kwargs)
50 | obs, _, done, _ = self.env.step(2)
51 | if done:
52 | self.env.reset(**kwargs)
53 | return obs
54 |
55 | def step(self, ac):
56 | return self.env.step(ac)
57 |
58 | class EpisodicLifeEnv(gym.Wrapper):
59 | def __init__(self, env):
60 | """Make end-of-life == end-of-episode, but only reset on true game over.
61 | Done by DeepMind for the DQN and co. since it helps value estimation.
62 | """
63 | gym.Wrapper.__init__(self, env)
64 | self.lives = 0
65 | self.was_real_done = True
66 |
67 | def step(self, action):
68 | obs, reward, done, info = self.env.step(action)
69 | self.was_real_done = done
70 | # check current lives, make loss of life terminal,
71 | # then update lives to handle bonus lives
72 | lives = self.env.unwrapped.ale.lives()
73 | if lives < self.lives and lives > 0:
74 |             # for Qbert we sometimes stay in the lives == 0 condition for a few frames,
75 |             # so it's important to keep lives > 0, so that we only reset once
76 | # the environment advertises done.
77 | done = True
78 | self.lives = lives
79 | return obs, reward, done, info
80 |
81 | def reset(self, **kwargs):
82 | """Reset only when lives are exhausted.
83 | This way all states are still reachable even though lives are episodic,
84 | and the learner need not know about any of this behind-the-scenes.
85 | """
86 | if self.was_real_done:
87 | obs = self.env.reset(**kwargs)
88 | else:
89 | # no-op step to advance from terminal/lost life state
90 | obs, _, _, _ = self.env.step(0)
91 | self.lives = self.env.unwrapped.ale.lives()
92 | return obs
93 |
94 | class MaxAndSkipEnv(gym.Wrapper):
95 | def __init__(self, env, skip=4):
96 | """Return only every `skip`-th frame"""
97 | gym.Wrapper.__init__(self, env)
98 | # most recent raw observations (for max pooling across time steps)
99 | self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8)
100 | self._skip = skip
101 |
102 | def reset(self):
103 | return self.env.reset()
104 |
105 | def step(self, action):
106 | """Repeat action, sum reward, and max over last observations."""
107 | total_reward = 0.0
108 | done = None
109 | for i in range(self._skip):
110 | obs, reward, done, info = self.env.step(action)
111 | if i == self._skip - 2: self._obs_buffer[0] = obs
112 | if i == self._skip - 1: self._obs_buffer[1] = obs
113 | total_reward += reward
114 | if done:
115 | break
116 | # Note that the observation on the done=True frame
117 | # doesn't matter
118 | max_frame = self._obs_buffer.max(axis=0)
119 |
120 | return max_frame, total_reward, done, info
121 |
122 | def reset(self, **kwargs):
123 | return self.env.reset(**kwargs)
124 |
125 | class ClipRewardEnv(gym.RewardWrapper):
126 | def __init__(self, env):
127 | gym.RewardWrapper.__init__(self, env)
128 |
129 | def reward(self, reward):
130 | """Bin reward to {+1, 0, -1} by its sign."""
131 | return np.sign(reward)
132 |
133 | class WarpFrame(gym.ObservationWrapper):
134 | def __init__(self, env):
135 | """Warp frames to 84x84 as done in the Nature paper and later work."""
136 | gym.ObservationWrapper.__init__(self, env)
137 | self.width = 84
138 | self.height = 84
139 | self.observation_space = spaces.Box(low=0, high=255,
140 | shape=(self.height, self.width, 1), dtype=np.uint8)
141 |
142 | def observation(self, frame):
143 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
144 | frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
145 | return frame[:, :, None]
146 |
147 | class FrameStack(gym.Wrapper):
148 | def __init__(self, env, k):
149 | """Stack k last frames.
150 |
151 | Returns lazy array, which is much more memory efficient.
152 |
153 | See Also
154 | --------
155 | baselines.common.atari_wrappers.LazyFrames
156 | """
157 | gym.Wrapper.__init__(self, env)
158 | self.k = k
159 | self.frames = deque([], maxlen=k)
160 | shp = env.observation_space.shape
161 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8)
162 |
163 | def reset(self):
164 | ob = self.env.reset()
165 | for _ in range(self.k):
166 | self.frames.append(ob)
167 | return self._get_ob()
168 |
169 | def step(self, action):
170 | ob, reward, done, info = self.env.step(action)
171 | self.frames.append(ob)
172 | return self._get_ob(), reward, done, info
173 |
174 | def _get_ob(self):
175 | assert len(self.frames) == self.k
176 | return LazyFrames(list(self.frames))
177 |
178 | class ScaledFloatFrame(gym.ObservationWrapper):
179 | def __init__(self, env):
180 | gym.ObservationWrapper.__init__(self, env)
181 |
182 | def observation(self, observation):
183 | # careful! This undoes the memory optimization, use
184 | # with smaller replay buffers only.
185 | return np.array(observation).astype(np.float32) / 255.0
186 |
187 | class LazyFrames(object):
188 | def __init__(self, frames):
189 | """This object ensures that common frames between the observations are only stored once.
190 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
191 | buffers.
192 |
193 | This object should only be converted to numpy array before being passed to the model.
194 |
195 | You'd not believe how complex the previous solution was."""
196 | self._frames = frames
197 | self._out = None
198 |
199 | def _force(self):
200 | if self._out is None:
201 | self._out = np.concatenate(self._frames, axis=2)
202 | self._frames = None
203 | return self._out
204 |
205 | def __array__(self, dtype=None):
206 | out = self._force()
207 | if dtype is not None:
208 | out = out.astype(dtype)
209 | return out
210 |
211 | def __len__(self):
212 | return len(self._force())
213 |
214 | def __getitem__(self, i):
215 | return self._force()[i]
216 |
217 | # EDIT BY ATAMAI
218 | # Preparing image received from environment and adjust it to expected format of Pytorch
219 | # HWC (height x width x channel) becomes CHW
220 | class PytorchImage(gym.ObservationWrapper):
221 | def __init__(self, env):
222 | super(PytorchImage, self).__init__(env)
223 | # we check current shape of observations in environment
224 | current_shape = self.observation_space.shape
225 | # we change order of dimensions - so last one (-1) becomes first
226 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(current_shape[-1], current_shape[0], current_shape[1]))
227 |
228 | def observation(self, observation):
229 | # and finally we change order of dimensions for every single observation
230 | # here transpose method could be also used
231 | return np.swapaxes(observation, 2, 0)
232 |
233 | def make_atari(env_id, monitor = False, epidsode_capture = 75):
234 | env = gym.make(env_id)
235 | if monitor == True:
236 | env = wrappers.Monitor(env, "Videos/", resume=True, force =True, video_callable=lambda episode_id: episode_id%epidsode_capture==0)
237 | assert 'NoFrameskip' in env.spec.id
238 | env = NoopResetEnv(env, noop_max=30)
239 | env = MaxAndSkipEnv(env, skip=4)
240 | return env
241 |
242 | def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False, pytorch_img=False):
243 | """Configure environment for DeepMind-style Atari.
244 | """
245 | if episode_life:
246 | env = EpisodicLifeEnv(env)
247 | if 'FIRE' in env.unwrapped.get_action_meanings():
248 | env = FireResetEnv(env)
249 | env = WarpFrame(env)
250 | if scale:
251 | env = ScaledFloatFrame(env)
252 | if clip_rewards:
253 | env = ClipRewardEnv(env)
254 | if frame_stack:
255 | env = FrameStack(env, 4)
256 | if pytorch_img:
257 | env = PytorchImage(env)
258 | return env
259 |
260 |
--------------------------------------------------------------------------------
/Paper/A3C.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/A3C.pdf
--------------------------------------------------------------------------------
/Paper/DDPG.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/DDPG.pdf
--------------------------------------------------------------------------------
/Paper/DQN.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/DQN.pdf
--------------------------------------------------------------------------------
/Paper/Distributional DQN.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/Distributional DQN.pdf
--------------------------------------------------------------------------------
/Paper/Double_DQN.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/Double_DQN.pdf
--------------------------------------------------------------------------------
/Paper/Dueling.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/Dueling.pdf
--------------------------------------------------------------------------------
/Paper/GAE.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/GAE.pdf
--------------------------------------------------------------------------------
/Paper/Noisy_networks.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/Noisy_networks.pdf
--------------------------------------------------------------------------------
/Paper/PPO.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/PPO.pdf
--------------------------------------------------------------------------------
/Paper/SAC_2019.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/SAC_2019.pdf
--------------------------------------------------------------------------------
/Paper/TD3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/TD3.pdf
--------------------------------------------------------------------------------
/Policy Gradient Algorithms/Parallel_processing.py:
--------------------------------------------------------------------------------
1 | from multiprocessing import Process, Pipe
2 | import numpy as np
3 |
4 | def worker(remote, parent_remote, env_fn_wrapper):
5 | parent_remote.close()
6 | env = env_fn_wrapper.x()
7 | while True:
8 | cmd, data = remote.recv()
9 | if cmd == 'step':
10 | ob, reward, done, info = env.step(data)
11 | if done:
12 | ob = env.reset()
13 | remote.send((ob, reward, done, info))
14 | elif cmd == 'reset':
15 | ob = env.reset()
16 | remote.send(ob)
17 | elif cmd == 'reset_task':
18 | ob = env.reset_task()
19 | remote.send(ob)
20 | elif cmd == 'close':
21 | remote.close()
22 | break
23 | elif cmd == 'get_spaces':
24 | remote.send((env.observation_space, env.action_space))
25 | else:
26 | raise NotImplementedError
27 |
28 | class CloudpickleWrapper(object):
29 | """
30 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
31 | """
32 | def __init__(self, x):
33 | self.x = x
34 | def __getstate__(self):
35 | import cloudpickle
36 | return cloudpickle.dumps(self.x)
37 | def __setstate__(self, ob):
38 | import pickle
39 | self.x = pickle.loads(ob)
40 |
41 |
42 | class VecEnv(object):
43 | """
44 | An abstract asynchronous, vectorized environment.
45 | """
46 | def __init__(self, num_envs, observation_space, action_space):
47 | self.num_envs = num_envs
48 | self.observation_space = observation_space
49 | self.action_space = action_space
50 |
51 | def reset(self):
52 | """
53 | Reset all the environments and return an array of
54 | observations, or a tuple of observation arrays.
55 | If step_async is still doing work, that work will
56 | be cancelled and step_wait() should not be called
57 | until step_async() is invoked again.
58 | """
59 | pass
60 |
61 | def step_async(self, actions):
62 | """
63 | Tell all the environments to start taking a step
64 | with the given actions.
65 | Call step_wait() to get the results of the step.
66 | You should not call this if a step_async run is
67 | already pending.
68 | """
69 | pass
70 |
71 | def step_wait(self):
72 | """
73 | Wait for the step taken with step_async().
74 | Returns (obs, rews, dones, infos):
75 | - obs: an array of observations, or a tuple of
76 | arrays of observations.
77 | - rews: an array of rewards
78 | - dones: an array of "episode done" booleans
79 | - infos: a sequence of info objects
80 | """
81 | pass
82 |
83 | def close(self):
84 | """
85 | Clean up the environments' resources.
86 | """
87 | pass
88 |
89 | def step(self, actions):
90 | self.step_async(actions)
91 | return self.step_wait()
92 |
93 | class SubprocVecEnv(VecEnv):
94 | def __init__(self, env_fns, spaces=None):
95 | """
96 | envs: list of gym environments to run in subprocesses
97 | """
98 | self.waiting = False
99 | self.closed = False
100 | nenvs = len(env_fns)
101 | self.nenvs = nenvs
102 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
103 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
104 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
105 | for p in self.ps:
106 | p.daemon = True # if the main process crashes, we should not cause things to hang
107 | p.start()
108 | for remote in self.work_remotes:
109 | remote.close()
110 |
111 | self.remotes[0].send(('get_spaces', None))
112 | observation_space, action_space = self.remotes[0].recv()
113 | VecEnv.__init__(self, len(env_fns), observation_space, action_space)
114 |
115 | def step_async(self, actions):
116 | for remote, action in zip(self.remotes, actions):
117 | remote.send(('step', action))
118 | self.waiting = True
119 |
120 | def step_wait(self):
121 | results = [remote.recv() for remote in self.remotes]
122 | self.waiting = False
123 | obs, rews, dones, infos = zip(*results)
124 | return np.stack(obs), np.stack(rews), np.stack(dones), infos
125 |
126 | def reset(self):
127 | for remote in self.remotes:
128 | remote.send(('reset', None))
129 | return np.stack([remote.recv() for remote in self.remotes])
130 |
131 | def reset_task(self):
132 | for remote in self.remotes:
133 | remote.send(('reset_task', None))
134 | return np.stack([remote.recv() for remote in self.remotes])
135 |
136 | def close(self):
137 | if self.closed:
138 | return
139 | if self.waiting:
140 | for remote in self.remotes:
141 | remote.recv()
142 | for remote in self.remotes:
143 | remote.send(('close', None))
144 | for p in self.ps:
145 | p.join()
146 | self.closed = True
147 |
148 | def __len__(self):
149 | return self.nenvs
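150 | 
151 | # Usage sketch (not part of the original file): run a handful of CartPole
152 | # environments in subprocesses and step them in lockstep with random actions.
153 | if __name__ == "__main__":
154 |     import gym
155 | 
156 |     def make_env():
157 |         return gym.make("CartPole-v0")
158 | 
159 |     envs = SubprocVecEnv([make_env for _ in range(4)])
160 |     obs = envs.reset()                                   # stacked observations, shape (4, 4) for CartPole
161 |     for _ in range(10):
162 |         actions = [envs.action_space.sample() for _ in range(len(envs))]
163 |         obs, rewards, dones, infos = envs.step(actions)  # arrays of length 4, plus a tuple of infos
164 |     envs.close()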
--------------------------------------------------------------------------------
/Policy Gradient Algorithms/REINFORCE/Img/Steps_needed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Policy Gradient Algorithms/REINFORCE/Img/Steps_needed.png
--------------------------------------------------------------------------------
/Policy Gradient Algorithms/REINFORCE/REINFORCE.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch
3 | import torch.optim as optim
4 | from torch.autograd import Variable
5 | import gym
6 | from gym import wrappers
7 | import numpy as np
8 | import matplotlib.pyplot as plt
9 | from torch.distributions import Categorical
10 |
11 |
12 | class Policy(nn.Module):
13 | def __init__(self,input_shape,action_shape):
14 | super().__init__()
15 |
16 | self.model = nn.Sequential(
17 | nn.Linear(input_shape[0],64),
18 | nn.ReLU(),
19 | nn.Linear(64,32),
20 | nn.ReLU(),
21 | nn.Linear(32,action_shape),
22 | nn.Softmax(dim = 1)
23 | )
24 | def forward(self,x):
25 | return self.model(x)
26 |
27 | def action(model, s):
28 |     # simple PyTorch approach for action selection and log-prob calculation
29 | #https://pytorch.org/docs/stable/distributions.html
30 | prob = model(s)
31 | m = Categorical(prob)
32 | a = m.sample()
33 | # log p(a∣π(s))
34 | log_p = m.log_prob(a)
35 | #print(a.item(), log_p)
36 | return a.item(), log_p
37 |
38 |     # naive NumPy approach -- attention: the gradient gets lost by transforming prob to numpy:
39 | #possible_actions = [i for i in range(len(prob.data.detach().numpy()[0]))]
40 | # choose accordingly to probability:
41 | #action = np.random.choice(possible_actions, p = prob.data.detach().numpy()[0])
42 | #calculate the log-prob for the chosen action:
43 | #grad = prob[0][action].grad_fn
44 | #log_prob = np.log(prob.data.detach().numpy()[0][action])
45 | # transform to torch Tensor:
46 | #log_prob = torch.Tensor([log_prob]).unsqueeze(0)
47 | #log_prob = Variable(log_prob,requires_grad=True)
48 | #log_prob.backward()
49 | #print(log_prob)
50 | #print(action,log_prob)
51 | #return action, log_prob
52 |
53 | def policy_optimization(model, optimizer, batch_rewards, log_probs):
54 | R = 0
55 | gamma = 0.99
56 | policy_loss = []
57 | rewards = []
58 | #calc discounted Rewards
59 | for r in batch_rewards[::-1]: # reverses the list of rewards
60 | R = r + gamma * R
61 |         rewards.insert(0, R) # insert the current return at the first position
62 |
63 | rewards = torch.tensor(rewards)
64 |     # standardize the returns to zero mean and unit variance, which stabilizes learning
65 |     # -- note: scaling the returns loses information about rare events with higher rewards when adapting to different environments
66 |     rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-8)  # small constant avoids division by zero
67 | for log_prob, reward in zip(log_probs, rewards):
68 | policy_loss.append(-log_prob * reward) #baseline+
69 |
70 | optimizer.zero_grad()
71 | policy_loss = torch.cat(policy_loss).sum()
72 | policy_loss.backward()
73 | optimizer.step()
74 |
75 | def run(episodes,model,env):
76 | optimizer = optim.Adam(model.parameters(), lr = 1e-2)
77 | rewards = []
78 | steps_taken = []
79 |
80 | for i in range(episodes):
81 | done = False
82 | ep_rewards = 0
83 | batch_rewards = []
84 | log_probs = []
85 | state = env.reset()
86 | steps = 0
87 | while not done:
88 | a, log_p = action(model, torch.Tensor(state).unsqueeze(0))
89 | log_probs.append(log_p)
90 | new_state, reward, done, info = env.step(a)
91 | batch_rewards.append(reward)
92 | ep_rewards += reward
93 | steps +=1
94 |
95 |
96 |
97 | state = new_state
98 |
99 |
100 | rewards.append(ep_rewards)
101 | steps_taken.append(steps)
102 | print("Episode: {} --- Rewards: {} --- Steps: {}".format(i, ep_rewards, steps))
103 |         policy_optimization(model, optimizer, batch_rewards, log_probs)
104 |
105 | return steps_taken
106 |
107 | def main():
108 | USE_CUDA = torch.cuda.is_available()
109 |     Variable = lambda *args, **kwargs: torch.autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else torch.autograd.Variable(*args, **kwargs)
110 |
111 | env = gym.make("Acrobot-v1")
112 | env = wrappers.Monitor(env, "Saved_Videos/", resume=True, force = True, video_callable=lambda episode_id: episode_id%40==0)
113 | obs_shape = env.observation_space.shape
114 | action_shape = env.action_space.n
115 | episodes = 240
116 | model = Policy(obs_shape, action_shape)
117 | steps = run(episodes, model, env)
118 |
119 | plt.plot(steps)
120 | plt.xlabel("Episodes")
121 | plt.ylabel("Steps needed to reach goal")
122 | plt.show()
123 |
124 | if __name__ == "__main__":
125 | #Argparse:
126 | main()
127 |
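128 | # Worked example of the discounted-return loop in policy_optimization: with
129 | # gamma = 0.99 and batch_rewards = [1, 1, 1], the returns become
130 | # [1 + 0.99*(1 + 0.99*1), 1 + 0.99*1, 1] = [2.9701, 1.99, 1.0], which are then
131 | # standardized before weighting the log-probabilities.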
--------------------------------------------------------------------------------
/Q_Learning/FrozenLake_q-table.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import gym
4 | import time
5 |
6 | EPISODES = 5000
7 | TRYS = 100
8 | EPSILON = 0.9 # epsilon greedy
9 | ALPHA = 0.1 # learning rate
10 | GAMMA = 0.9 #discount factor
11 |
12 |
13 |
14 |
15 | def make_Q_table(actions,n_states):
16 | table = pd.DataFrame(
17 | np.zeros((n_states, actions)), columns = list(range(actions))) # q_table initial values
18 | # print(table) # show table
19 | return table
20 |
21 | def choose_action(state, q_table):
22 | state_actions = q_table.iloc[state, :]
23 |     if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()): # act non-greedily, or the state's action values are all still zero
24 |         action_name = np.random.choice(ACTIONS)
25 |     else: # act greedy
26 |         action_name = state_actions.idxmax() # idxmax instead of argmax, since argmax means a different function in newer versions of pandas
27 | #print("Action_choosen: "+str(action_name))
28 | return action_name
29 |
30 |
31 | def RL(ACTIONS,N_SPACE):
32 | q_table = make_Q_table(ACTIONS,N_SPACE)
33 | for episode in range(EPISODES):
34 | S = env.reset()
35 |         for one_try in range(TRYS): # how long one episode lasts at most
36 |
37 | env.render()
38 | A = choose_action(S, q_table)
39 |
40 | S_,R,done,info = env.step(A)
41 | #print(S_)
42 | #time.sleep(1)
43 | q_old = q_table.loc[S, A] #Current Q-Value of the state
44 | q_learned = R + GAMMA * q_table.iloc[S_, :].max()
45 | q_table.loc[S, A] += ALPHA * (q_learned - q_old) # update
46 | S = S_ # move to next state
47 | if done:
48 | print("Episode finished after {} timesteps".format(one_try+1))
49 | break
50 |
51 |
52 | return q_table
53 |
54 |
55 | if __name__ =="__main__":
56 | env = gym.make("FrozenLake-v0")
57 | print(gym.__version__)
58 |
59 | env.reset()
60 | # getting space and action
61 | ACTIONS = env.action_space.n #env.unwrapped.get_action_meanings() to get a list of the action names
62 | N_SPACE = env.observation_space.n
63 | #print(ACTIONS)
64 | #print(N_SPACE)
65 | q_table = RL(ACTIONS,N_SPACE)
66 | print("Q-Table: \n")
67 | print(q_table)
68 |
69 |
70 |
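71 | # Note on choose_action: with EPSILON = 0.9 a random action is taken in roughly
72 | # 10% of the steps (np.random.uniform() > EPSILON) or whenever all Q-values of
73 | # the current state are still zero; otherwise the greedy action (idxmax) is used.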
--------------------------------------------------------------------------------
/Q_Learning/Img/Q_table10000.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Q_Learning/Img/Q_table10000.png
--------------------------------------------------------------------------------
/Q_Learning/Img/Q_value.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Q_Learning/Img/Q_value.png
--------------------------------------------------------------------------------
/Q_Learning/Img/Receivedrewards.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Q_Learning/Img/Receivedrewards.png
--------------------------------------------------------------------------------
/Q_Learning/Img/steps_taken.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Q_Learning/Img/steps_taken.png
--------------------------------------------------------------------------------
/Q_Learning/Q_Table_E10000_a0.09_g0.9_eps0.9.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Q_Learning/Q_Table_E10000_a0.09_g0.9_eps0.9.pkl
--------------------------------------------------------------------------------
/Q_Learning/Q_Table_E3000_a0.09_g0.9_eps0.9.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Q_Learning/Q_Table_E3000_a0.09_g0.9_eps0.9.pkl
--------------------------------------------------------------------------------
/Q_Learning/Readme.md:
--------------------------------------------------------------------------------
1 | [image1]: ./Img/Q_value.png "Calculation Equation"
2 | [image2]: ./Img/Q_table10000.png "Calculation Equation"
3 | [image3]: ./Img/Receivedrewards.png "Calculation Equation"
4 | [image4]: ./Img/steps_taken.png "Calculation Equation"
5 |
6 |
7 |
8 |
9 |
10 | # Q-Learning and Q-Table
11 |
12 | ## Creating the Q-Table
13 | The Q-table is created from the number of states (n_states) and the number of actions (n_actions) and forms a matrix of shape n_states x n_actions.
14 |
15 | This already shows the limitations of plain Q-learning with a Q-table: the number of states has to be finite and not too large, and the state space is not allowed to change during the game.
16 |
17 | ## Calculating the Q-Values
18 |
19 | The Q-values are updated at each step with this formula:
20 |
21 | ![alt text][image1]
22 |
23 | There are some limitations here as well. Since the Q-values depend on the received rewards, and most of the time the only reward is given when reaching the goal state, it has to be possible to reach the goal state by random actions; otherwise the Q-table stays a table of zeros.
24 |
25 | ## Testing on OpenAI Gym's FrozenLake environment
26 | After training for 10,000 epochs, the following Q-table was obtained:
27 |
28 | ![alt text][image2]
29 |
30 | Looking at the received rewards over the epochs, one can see that after epoch ~1500 almost every subsequent try received a reward of 1, i.e., won the game.
31 |
32 | ![alt text][image3]
33 |
34 | The same holds for the steps taken: one can see an increase in the number of steps, which happens because the game no longer gets stopped early by failing.
35 |
36 | ![alt text][image4]
37 |
38 |
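39 | ## The update rule as code
40 | 
41 | A minimal NumPy sketch of the update formula above (the transition values are made up for illustration and are not taken from the training scripts):
42 | 
43 | ```python
44 | import numpy as np
45 | 
46 | ALPHA, GAMMA = 0.1, 0.9              # learning rate and discount factor
47 | n_states, n_actions = 16, 4          # FrozenLake: 4x4 grid, 4 moves
48 | Q = np.zeros((n_states, n_actions))
49 | 
50 | # one observed transition: state S, action A, reward R, next state S_
51 | S, A, R, S_ = 0, 2, 0.0, 4
52 | q_old = Q[S, A]                      # current estimate Q(S, A)
53 | q_target = R + GAMMA * Q[S_].max()   # reward plus discounted best next value
54 | Q[S, A] += ALPHA * (q_target - q_old)
55 | ```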
--------------------------------------------------------------------------------
/Q_Learning/play_FrozenLake_Q_table.py:
--------------------------------------------------------------------------------
1 | import pickle as pkl
2 | import numpy as np
3 | import pandas as pd
4 | import time
5 | import gym
6 | import argparse
7 |
8 | parser = argparse.ArgumentParser()
9 | parser.add_argument("-e", "--Episoden", type=int, help="Number of episodes to play")
10 | parser.add_argument("-v", "--Video", type=bool, help="Should the runs be recorded on video?")
11 | parser.add_argument("-q", "--Q_Table", type=str, help="Name of the Q-table to play with")
12 |
13 |
14 | args = parser.parse_args()
15 |
16 | EPISODES = args.Episoden
17 | TRYS = 100
18 | AUFZEICHNUNG = args.Video
19 | Q_Table_name = args.Q_Table
20 |
21 |
22 |
23 |
24 | def load_Qtable(Q_table):
25 | Q = pd.read_pickle(Q_table)
26 | return Q
27 |
28 |
29 | def choose_action(state,Q_table):
30 | state_actions = Q_table.iloc[state, :]
31 | action_name1 = state_actions.idxmax()
32 | state_actions.pop(action_name1)
33 | action_name2 = state_actions.idxmax()
34 | if (np.random.uniform() > 0.4):
35 |         print("Best action chosen!")
36 |         return action_name1
37 |     else:
38 |         print("Second-best action chosen!")
39 | return action_name2
40 |
41 | def play():
42 | Q_Table = load_Qtable(Q_Table_name)
43 | for episode in range(EPISODES):
44 | S = env.reset()
45 |         for one_try in range(TRYS): # how long one episode lasts at most
46 |
47 | env.render()
48 | A = choose_action(S, Q_Table)
49 |             print("Action chosen: {}".format(A))
50 | S_,R,done,info = env.step(A)
51 | #print(S_)
52 | time.sleep(2)
53 |
54 |             # Adapting for further learning
55 | #print()
56 | #q_old = q_table.loc[S, A] #Current Q-Value of the state
57 | #q_learned = R + GAMMA * q_table.iloc[S_, :].max()
58 | #q_table.loc[S, A] += ALPHA * (q_learned - q_old) # update
59 | #S = S_ # move to next state
60 |
61 | if done:
62 | print("Episode finished after {} timesteps".format(one_try+1))
63 | break
64 |
65 |
66 | if __name__ =="__main__":
67 |
68 |
69 | env = gym.make("FrozenLake-v0")
70 | print(gym.__version__)
71 | env.reset()
72 |
73 | play()
74 |
75 | # 0 - Down
76 | # 1 -
--------------------------------------------------------------------------------
/Q_Learning/train_FrozenLake_Qtable.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import gym
4 | import time
5 | import matplotlib.pyplot as plt
6 | import argparse
7 |
8 |
9 |
10 | def make_Q_table(actions,n_states):
11 | table = pd.DataFrame(
12 | np.zeros((n_states, actions)), columns = list(range(actions))) # q_table initial values
13 | # print(table) # show table
14 | return table
15 |
16 | def choose_action(state, q_table):
17 | state_actions = q_table.iloc[state, :]
18 |     if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()): # act non-greedily, or the state's action values are all still zero
19 |         action_name = np.random.choice(ACTIONS)
20 |     else: # act greedy
21 |         action_name = state_actions.idxmax() # idxmax instead of argmax, since argmax means a different function in newer versions of pandas
22 | #print("Action_choosen: "+str(action_name))
23 | return action_name
24 |
25 |
26 | def RL(ACTIONS,N_SPACE):
27 | q_table = make_Q_table(ACTIONS,N_SPACE)
28 | reward_list = []
29 | try_list = []
30 |
31 | for episode in range(EPISODES):
32 | S = env.reset()
33 | rewards = 0
34 | steps = 0
35 |         for one_try in range(TRYS): # how long one episode lasts at most
36 |
37 | env.render()
38 | A = choose_action(S, q_table)
39 |
40 | S_,R,done,info = env.step(A)
41 | #print(S_)
42 | #time.sleep(1)
43 | print()
44 | q_old = q_table.loc[S, A] #Current Q-Value of the state
45 | q_learned = R + GAMMA * q_table.iloc[S_, :].max()
46 | q_table.loc[S, A] += ALPHA * (q_learned - q_old) # update
47 | S = S_ # move to next state
48 | rewards += R
49 | steps = one_try
50 | if done:
51 | print("Episode finished after {} timesteps".format(one_try+1))
52 | steps = one_try+1
53 | break
54 | reward_list.append(rewards)
55 | try_list.append(steps+1)
56 |
57 |
58 | return q_table,reward_list,try_list
59 |
60 |
61 |
62 |
63 |
64 |
65 | parser = argparse.ArgumentParser()
66 | parser.add_argument("-e", "--Episoden", type=int, help="Number of episodes to train")
67 | parser.add_argument("-a", "--Alpha", type=float, help="Learning rate ~0.1")
68 | parser.add_argument("-g", "--Gamma", type=float, help="Discount factor ~0.9")
69 | parser.add_argument("-eps", "--Epsilon", type=float, help="Epsilon for the epsilon-greedy decision process ~0.9")
70 |
71 | args = parser.parse_args()
72 |
73 | EPISODES = args.Episoden
74 | TRYS = 100
75 | EPSILON = args.Epsilon # epsilon greedy
76 | ALPHA = args.Alpha # learning rate
77 | GAMMA = args.Gamma #discount factor
78 |
79 | if __name__ =="__main__":
80 |
81 |
82 | env = gym.make("FrozenLake-v0")
83 | print(gym.__version__)
84 | env.reset()
85 | # getting space and action
86 | ACTIONS = env.action_space.n #env.unwrapped.get_action_meanings() to get a list of the action names
87 | N_SPACE = env.observation_space.n
88 | #print(ACTIONS)
89 | #print(N_SPACE)
90 | q_table,rlist,steps = RL(ACTIONS,N_SPACE)
91 |
92 | plt.plot(rlist)
93 | plt.title("Received Rewards")
94 | plt.xlabel("Epochs")
95 | plt.ylabel("Rewards")
96 | plt.show()
97 |
98 | plt.plot(steps)
99 | plt.title("Needed steps to finish one episode")
100 | plt.xlabel("Epochs")
101 | plt.ylabel("Steps")
102 | plt.show()
103 |
104 |
105 |
106 |
107 |
108 | print("Q-Table: \n")
109 | print(q_table)
110 |
111 | print("\nDo you want to save the Q-Table? \n")
112 | answer = input("[y/n]")
113 |
114 | if answer == "y":
115 | q_table.to_pickle("./Q_Table_E{}_a{}_g{}_eps{}.pkl".format(EPISODES,ALPHA,GAMMA,EPSILON))
116 | else:
117 | pass
118 |
119 |
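120 | 
121 | # Example invocation (the values are just an illustration; they match the pickled
122 | # table Q_Table_E3000_a0.09_g0.9_eps0.9.pkl shipped in this folder):
123 | #   python train_FrozenLake_Qtable.py -e 3000 -a 0.09 -g 0.9 -eps 0.9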
--------------------------------------------------------------------------------
/Q_Learning/treasure_q.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import time
4 |
5 | np.random.seed(2) # reproducible
6 |
7 |
8 | N_STATES = 6 # the length of the 1 dimensional world
9 | ACTIONS = ['left', 'right'] # available actions
10 | EPSILON = 0.9   # epsilon for the greedy policy
11 | ALPHA = 0.1 # learning rate
12 | GAMMA = 0.9 # discount factor
13 | MAX_EPISODES = 18 # maximum episodes
14 | FRESH_TIME = 0.3 # fresh time for one move
15 |
16 |
17 | def build_q_table(n_states, actions):
18 | table = pd.DataFrame(
19 | np.zeros((n_states, len(actions))), # q_table initial values
20 |         columns=actions,    # the actions' names
21 | )
22 | # print(table) # show table
23 | return table
24 |
25 |
26 | def choose_action(state, q_table):
27 | # This is how to choose an action
28 | state_actions = q_table.iloc[state, :]
29 |     if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()):  # act non-greedily, or the state's action values are all still zero
30 |         action_name = np.random.choice(ACTIONS)
31 |     else:   # act greedy
32 |         action_name = state_actions.idxmax()    # idxmax instead of argmax, since argmax means a different function in newer versions of pandas
33 | return action_name
34 |
35 |
36 | def get_env_feedback(S, A):
37 |     # This is how the agent interacts with the environment
38 | if A == 'right': # move right
39 | if S == N_STATES - 2: # terminate
40 | S_ = 'terminal'
41 | R = 1
42 | else:
43 | S_ = S + 1
44 | R = 0
45 | else: # move left
46 | R = 0
47 | if S == 0:
48 | S_ = S # reach the wall
49 | else:
50 | S_ = S - 1
51 | return S_, R
52 |
53 |
54 | def update_env(S, episode, step_counter):
55 |     # This is how the environment gets updated
56 |     env_list = ['-']*(N_STATES-1) + ['T']   # '-----T' is our environment
57 | if S == 'terminal':
58 | interaction = 'Episode %s: total_steps = %s' % (episode+1, step_counter)
59 | print('\r{}'.format(interaction), end='')
60 | time.sleep(2)
61 | print('\r ', end='')
62 | else:
63 | env_list[S] = 'o'
64 | interaction = ''.join(env_list)
65 | print('\r{}'.format(interaction), end='')
66 | time.sleep(FRESH_TIME)
67 |
68 |
69 | def rl():
70 | # main part of RL loop
71 | q_table = build_q_table(N_STATES, ACTIONS)
72 | for episode in range(MAX_EPISODES):
73 | step_counter = 0
74 | S = 0
75 | is_terminated = False
76 | update_env(S, episode, step_counter)
77 | while not is_terminated:
78 |
79 | A = choose_action(S, q_table)
80 | S_, R = get_env_feedback(S, A) # take action & get next state and reward
81 | q_predict = q_table.loc[S, A]
82 | if S_ != 'terminal':
83 | q_target = R + GAMMA * q_table.iloc[S_, :].max() # next state is not terminal
84 | else:
85 | q_target = R # next state is terminal
86 | is_terminated = True # terminate this episode
87 |
88 | q_table.loc[S, A] += ALPHA * (q_target - q_predict) # update
89 | S = S_ # move to next state
90 |
91 | update_env(S, episode, step_counter+1)
92 | step_counter += 1
93 | return q_table
94 |
95 |
96 | if __name__ == "__main__":
97 | q_table = rl()
98 | print('\r\nQ-table:\n')
99 | print(q_table)
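100 | 
101 | # Environment recap: the agent 'o' starts at the left end of the 1-D corridor
102 | # '-----T' (N_STATES = 6) and only receives a reward of 1 when it steps right
103 | # onto the treasure 'T' from the state directly next to it; every other move yields 0.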
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Deep-Reinforcement-Learning
2 |
3 |
4 | 
5 |
6 | Collection of Deep Reinforcement Learning Algorithms in PyTorch.
7 |
8 |
9 | Below is a list of Jupyter Notebooks with implementations.
10 |
11 | # Value-Based / Off-Policy Methods
12 | ## Discrete Action Space
13 |
14 | - [Q-Learning](Q_Learning) [Source/Paper](/Paper/DQN.pdf)
15 |
16 | - [DQN](https://github.com/BY571/Reinforcement-Learning/tree/master/Deep%20Q_Learning) [Paper](/Paper/DQN.pdf)
17 |
18 | - [Double DQN](https://github.com/BY571/Reinforcement-Learning/tree/master/Double%20DQN) [Paper](/Paper/Double_DQN.pdf)
19 |
20 | - [Dueling DQN](https://github.com/BY571/DQN-Atari-Agents) [Paper](/Paper/Dueling.pdf)
21 |
22 | - [N-Step DQN](https://github.com/BY571/DQN-Atari-Agents)
23 |
24 | - [Noisy DQN](https://github.com/BY571/DQN-Atari-Agents)
25 | [Paper](/Paper/Noisy_networks.pdf)
26 |
27 | - [Rainbow](https://github.com/BY571/DQN-Atari-Agents)
28 | [Paper](https://arxiv.org/pdf/1710.02298.pdf)
29 |
30 | ## Distributional RL
31 |
32 | - [Categorical DQN - C51](https://github.com/BY571/DQN-Atari-Agents) [Paper](https://github.com/BY571/Reinforcement-Learning/blob/master/Paper/Distributional%20DQN.pdf)
33 |
34 | - [QR-DQN](https://github.com/BY571/QR-DQN)
35 |
36 | - [IQN](https://github.com/BY571/IQN-and-Extensions)
37 |
38 | - [FQF](https://github.com/BY571/FQF-and-Extensions)
39 |
40 |
41 | ## Continuous Action Space
42 |
43 | - [NAF - Normalized Advantage Function](https://github.com/BY571/Normalized-Advantage-Function-NAF-)
44 |
45 | - [Soft-DQN] TODO
46 | _________________________________________________
47 | # Policy-Based / On-Policy Methods
48 | ## Discrete Action Space
49 |
50 |
51 | - [Sarsa](https://github.com/BY571/Reinforcement-Learning/blob/master/Temporal%20Difference%20(Sarsa%2C%20Sarsamax%2C%20Expeted%20Sarsa)/Temporal_Difference.ipynb)
52 | [Source/Paper]
53 |
54 |
55 | - [Vanilla Policy Gradient](https://github.com/BY571/Reinforcement-Learning/blob/master/Policy%20Gradient%20Algorithms/Policy_Gradien_%2B_Baseline_mean.ipynb) [+LSTM](https://github.com/BY571/Reinforcement-Learning/blob/master/Policy%20Gradient%20Algorithms/PolicyGradient_LSTM.ipynb)
56 | [Source/Paper]
57 |
58 |
59 | - A2C
60 | [Paper](/Paper/A3C.pdf)
61 |
62 | - A2C with gae* [TODO]
63 |
64 | - A2C multi environment
65 |
66 |
67 | - PPO
68 | [Paper](/Paper/PPO.pdf)
69 |
70 | - PPO with gae*
71 |
72 | - [PPO with gae and curiosity driven exploration (single, digit inputs)](https://github.com/BY571/Reinforcement-Learning/blob/master/PPO_gae_curios.ipynb) [Paper](/Paper/)
73 |
74 | - PPO multi environment
75 |
76 |
77 | ## Continuous Action Space
78 |
79 | - [A2C](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/A2C_conti_seperate_networks.ipynb)
80 |
81 | - A2C with gae* [TODO]
82 |
83 | - [A2C multi environment](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/A2C_continuous_multienv.ipynb)
84 |
85 |
86 | - [PPO](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/PPO_unity_Crawler.ipynb)
87 |
88 | - [PPO with gae*](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/ROBOSCHOOL_PPO_GAE.ipynb) | [PPO with gae multi](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/PPO_conti_gae_multi.ipynb)
89 |
90 | - [PPO+curiosity&single](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/PPO_conti_gae_curios.ipynb) | [+curiosity&multi](https://github.com/BY571/Reinforcement-Learning/blob/master/PPO_conti_gae_curio_multi.ipynb)
91 |
92 | - [PPO multi environment](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/PPO_unity_Crawler.ipynb)
93 |
94 |
95 |
96 |
97 | gae* = Generalized Advantage Estimation [Source](/Paper/GAE.pdf)
98 |
99 | ______________________________________________
100 |
101 | # Actor-Critic Algorithms
102 |
103 | - [DDPG](https://github.com/BY571/Udacity-DRL-Nanodegree-P2)
104 | [Source/Paper]
105 |
106 | - [D4PG](https://github.com/BY571/D4PG)
107 | [Source/Paper](https://arxiv.org/pdf/1804.08617.pdf)
108 |
109 | - [Twin Delayed DDPG (TD3)](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/TD3_conti.ipynb)
110 | [Paper](https://github.com/BY571/Reinforcement-Learning/blob/master/Paper/TD3.pdf)
111 |
112 | - [Soft Actor Critic (SAC-newest 2019 version)](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/SAC.ipynb)
113 | [Paper](https://github.com/BY571/Reinforcement-Learning/blob/master/Paper/SAC_2019.pdf)
114 |
115 | ________________________________________________
116 |
117 | # Upside-Down-Reinforcement-Learning
118 | Discrete and continuous action space implementation of [⅂ꓤ](https://github.com/BY571/Upside-Down-Reinforcement-Learning)
119 |
120 | ________________________________________________
121 | # Munchausen Reinforcement Learning
122 |
123 | Implementations of Munchausen RL
124 |
125 | - [M-DQN](https://github.com/BY571/Munchausen-RL)
126 |
127 | - [M-IQN](https://github.com/BY571/IQN-and-Extensions)
128 |
129 | - [M-FQF](https://github.com/BY571/FQF-and-Extensions)
130 |
131 | - [M-SAC](https://github.com/BY571/Soft-Actor-Critic-and-Extensions)
132 |
133 |
134 | ________________________________________________
135 |
136 | # Model-Based RL
137 |
138 | __________________________________________________
139 |
140 | # Black-Box Optimization
141 |
142 | - [Evolution Strategies]() [with multiprocessing](https://github.com/BY571/Reinforcement-Learning/blob/master/Black-Box%20Optimization/Evolution_Strategies_parallel+novelty/README.md) [and novelty search](https://github.com/BY571/Reinforcement-Learning/blob/master/Black-Box%20Optimization/Evolution_Strategies_parallel+novelty/README.md)
143 |
144 | - [Genetic Algorithm - GARNE](https://github.com/BY571/GARNE-Genetic-Algorithm-with-Recurrent-Network-and-Novelty-Exploration/blob/master/README.md)
145 | - Genetic Algorithm implementation with LSTM, Multiprocessing over several CPUs and Novelty Search for Exploration
146 | __________________________________________
147 | # Multi-Agent Deep Reinforcement Learning
148 |
149 | - [Multi-Agent-DDPG](https://github.com/BY571/Udacity-DRL-Nanodegree-P3-Multiagent-RL-)
150 |
151 | # Hyperparameter Tuning
152 |
153 | Gridsearch
154 |
155 | Random Forest [TODO]
156 |
157 | Genetic Algorithm [TODO]
158 |
159 | ________________________________________________
160 |
161 |
162 |
--------------------------------------------------------------------------------
/Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/README.md:
--------------------------------------------------------------------------------
1 | # Taxi Problem
2 |
3 | ### Getting Started
4 |
5 | Read the description of the environment in subsection 3.1 of [this paper](https://arxiv.org/pdf/cs/9905014.pdf). You can verify that the description in the paper matches the OpenAI Gym environment by peeking at the code [here](https://github.com/openai/gym/blob/master/gym/envs/toy_text/taxi.py).
6 |
7 |
8 | ### Instructions
9 |
10 | The repository contains three files:
11 | - `agent.py`: Develop your reinforcement learning agent here. This is the only file that you should modify.
12 | - `monitor.py`: The `interact` function tests how well your agent learns from interaction with the environment.
13 | - `main.py`: Run this file in the terminal to check the performance of your agent.
14 |
15 | Begin by running the following command in the terminal:
16 | ```
17 | python main.py
18 | ```
19 |
20 | When you run `main.py`, the agent that you specify in `agent.py` interacts with the environment for 20,000 episodes. The details of the interaction are specified in `monitor.py`, which returns two variables: `avg_rewards` and `best_avg_reward`.
21 | - `avg_rewards` is a deque where `avg_rewards[i]` is the average (undiscounted) return collected by the agent from episodes `i+1` to episode `i+100`, inclusive. So, for instance, `avg_rewards[0]` is the average return collected by the agent over the first 100 episodes.
22 | - `best_avg_reward` is the largest entry in `avg_rewards`. This is the final score that you should use when determining how well your agent performed in the task.
23 |
24 | Your assignment is to modify the `agent.py` file to improve the agent's performance.
25 | - Use the `__init__()` method to define any needed instance variables. Currently, we define the number of actions available to the agent (`nA`) and initialize the action values (`Q`) to an empty dictionary of arrays. Feel free to add more instance variables; for example, you may find it useful to define the value of epsilon if the agent uses an epsilon-greedy policy for selecting actions.
26 | - The `select_action()` method accepts the environment state as input and returns the agent's choice of action. The default code that we have provided randomly selects an action.
27 | - The `step()` method accepts a (`state`, `action`, `reward`, `next_state`) tuple as input, along with the `done` variable, which is `True` if the episode has ended. The default code (which you should certainly change!) increments the action value of the previous state-action pair by 1. You should change this method to use the sampled tuple of experience to update the agent's knowledge of the problem.
28 |
29 | Once you have modified the function, you need only run `python main.py` to test your new agent.
30 |
31 | OpenAI Gym [defines "solving"](https://gym.openai.com/envs/Taxi-v1/) this task as getting an average return of 9.7 over 100 consecutive trials.
32 |
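33 | As a starting point, a Sarsamax (Q-learning) style update for `step()` could look roughly like the sketch below. This is only one possible approach, not the reference solution; it assumes `self.Q`, `self.lr` (learning rate), and `self.gamma` (discount factor) are set up in `__init__()` and that NumPy is imported as `np`.
34 | 
35 | ```python
36 | def step(self, state, action, reward, next_state, done):
37 |     # bootstrap from the greedy value of the next state; drop the bootstrap at episode end
38 |     target = reward + self.gamma * (1 - done) * np.max(self.Q[next_state])
39 |     self.Q[state][action] += self.lr * (target - self.Q[state][action])
40 | ```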
--------------------------------------------------------------------------------
/Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/__pycache__/agent.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/__pycache__/agent.cpython-37.pyc
--------------------------------------------------------------------------------
/Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/__pycache__/monitor.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/__pycache__/monitor.cpython-37.pyc
--------------------------------------------------------------------------------
/Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/agent.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from collections import defaultdict
3 |
4 |
5 | class Agent:
6 |
7 | def __init__(self, nA=6, learning_rate = 0.02, gamma = 0.9):
8 | """ Initialize agent.
9 |
10 | Params
11 | ======
12 | - nA: number of actions available to the agent
13 | """
14 | self.nA = nA
15 | self.Q = defaultdict(lambda: np.zeros(self.nA))
16 | self.lr = learning_rate
17 | self.gamma = gamma
18 |
19 |
20 | def probabilities(self,q, epsilon):
21 | probs = np.ones(self.nA) * epsilon/self.nA
22 | best_action = np.argmax(q)
23 | probs[best_action] = (1 - epsilon) + epsilon/self.nA
24 | return probs
25 |
26 | def select_action(self, state, epsilon):
27 | """ Given the state, select an action.
28 |
29 | Params
30 | ======
31 | - state: the current state of the environment
32 |
33 | Returns
34 | =======
35 | - action: an integer, compatible with the task's action space
36 | """
37 | action = np.random.choice(np.arange(self.nA), p = self.probabilities(self.Q[state], epsilon)) \
38 | if state in self.Q else np.random.choice(np.arange(self.nA))
39 | return action
40 |
41 |
42 |
43 | def step(self, state, action, reward, next_state, done):
44 | """ Update the agent's knowledge, using the most recently sampled tuple.
45 |
46 | Params
47 | ======
48 | - state: the previous state of the environment
49 | - action: the agent's previous choice of action
50 | - reward: last reward received
51 | - next_state: the current state of the environment
52 | - done: whether the episode is complete (True or False)
53 | """
54 | Q_target = np.max(self.Q[next_state])
55 | self.Q[state][action] = self.Q[state][action] + self.lr * (reward + self.gamma*(1-done)*Q_target - self.Q[state][action] )
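56 | 
57 | # Example of the epsilon-greedy distribution produced by probabilities():
58 | # with nA = 6 and epsilon = 0.1, every non-greedy action gets 0.1/6 ~ 0.0167
59 | # probability and the greedy action gets 0.9 + 0.1/6 ~ 0.9167.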
--------------------------------------------------------------------------------
/Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/main.py:
--------------------------------------------------------------------------------
1 | from agent import Agent
2 | from monitor import interact
3 | import gym
4 | import numpy as np
5 |
6 | env = gym.make('Taxi-v2')
7 | agent = Agent()
8 | avg_rewards, best_avg_reward = interact(env, agent)
--------------------------------------------------------------------------------
/Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/monitor.py:
--------------------------------------------------------------------------------
1 | from collections import deque
2 | import sys
3 | import math
4 | import numpy as np
5 |
6 | def interact(env, agent, num_episodes=20000, window=100,epsilon_start = 1, epsilon_decay = 0.9999, epsilon_min = 0.01):
7 | """ Monitor agent's performance.
8 |
9 | Params
10 | ======
11 | - env: instance of OpenAI Gym's Taxi-v1 environment
12 | - agent: instance of class Agent (see Agent.py for details)
13 | - num_episodes: number of episodes of agent-environment interaction
14 | - window: number of episodes to consider when calculating average rewards
15 |
16 | Returns
17 | =======
18 | - avg_rewards: deque containing average rewards
19 | - best_avg_reward: largest value in the avg_rewards deque
20 | """
21 | # initialize average rewards
22 | avg_rewards = deque(maxlen=num_episodes)
23 | # initialize best average reward
24 | best_avg_reward = -math.inf
25 | # initialize monitor for most recent rewards
26 | samp_rewards = deque(maxlen=window)
27 | # for each episode
28 | epsilon = epsilon_start
29 | for i_episode in range(1, num_episodes+1):
30 | # begin the episode
31 | state = env.reset()
32 | # initialize the sampled reward
33 | samp_reward = 0
34 | while True:
35 | epsilon = max(epsilon*epsilon_decay,epsilon_min)
36 | # agent selects an action
37 | action = agent.select_action(state, epsilon) #
38 | # agent performs the selected action
39 | next_state, reward, done, _ = env.step(action)
40 | # agent performs internal updates based on sampled experience
41 | agent.step(state, action, reward, next_state, done)
42 | # update the sampled reward
43 | samp_reward += reward
44 | # update the state (s <- s') to next time step
45 | state = next_state
46 | if done:
47 | # save final sampled reward
48 | samp_rewards.append(samp_reward)
49 | break
50 | if (i_episode >= 100):
51 | # get average reward from last 100 episodes
52 | avg_reward = np.mean(samp_rewards)
53 | # append to deque
54 | avg_rewards.append(avg_reward)
55 | # update best average reward
56 | if avg_reward > best_avg_reward:
57 | best_avg_reward = avg_reward
58 | # monitor progress
59 | print("\rEpisode {}/{} || Best average reward {} || Epsilon {}".format(i_episode, num_episodes, best_avg_reward, epsilon), end="")
60 | sys.stdout.flush()
61 | # check if task is solved (according to OpenAI Gym)
62 | if best_avg_reward >= 9.7:
63 | print('\nEnvironment solved in {} episodes.'.format(i_episode), end="")
64 | break
65 | if i_episode == num_episodes: print('\n')
66 | return avg_rewards, best_avg_reward
--------------------------------------------------------------------------------
/imgs/web-3706562_640.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/imgs/web-3706562_640.jpg
--------------------------------------------------------------------------------