├── Black-Box Optimization ├── Evolution_Strategies_parallel+novelty │ ├── ES_baseline_parallel.py │ ├── ES_disc_parallel_novelty.py │ ├── ES_own_conti_parallel.py │ ├── README.md │ └── imgs │ │ └── pendulum.png ├── Evolutionary_Strategies_Cartpole.ipynb ├── README.md ├── genetic_algorithm_base.py └── imgs │ └── ga_cartpole.png ├── Categorical_DQN.ipynb ├── ContinousControl ├── A2C_conti_seperate_networks.ipynb ├── A2C_continuous_multienv.ipynb ├── DDPG.py ├── MultiPro.py ├── PPO_conti_gae_curios.ipynb ├── PPO_conti_gae_multi.ipynb ├── PPO_gae_multi.py ├── PPO_test_crawler.ipynb ├── PPO_unity_Crawler.ipynb ├── Parallel_processing.py ├── ROBOSCHOOL_PPO_GAE.ipynb ├── SAC.ipynb ├── SAC_script.py └── TD3_conti.ipynb ├── Cross_entropy ├── Cross_entropy.py ├── README.md └── img │ └── Cross_entropy.png ├── Deep Q_Learning ├── DQN_Experience_Replay.py ├── Img │ ├── 4k Learning_curve.png │ └── Converging.png └── README.md ├── Double DQN ├── CNN_Double_DQN.py ├── Double_DQN.py ├── Imgs │ ├── 4000_40-40.png │ ├── CNN_pong_converge.png │ └── test.png ├── README.md └── wrapper.py ├── Dueling Deep Q-Network ├── CNN_Dueling_DDQN.py ├── CNN_Dueling_DDQN_PER.py ├── Dueling_DQN.ipynb ├── Img │ ├── Duel_per.png │ └── Dueling_DQN.png ├── PrioritizedExperienceReplay.py ├── Video │ ├── Breakout.mp4 │ └── Pong.mp4 └── wrapper.py ├── Grid_search_for_Reinforcement_learning.ipynb ├── Noisy_DQN.ipynb ├── Nstep_DQN.ipynb ├── PPO_conti_gae_curio_multi.ipynb ├── PPO_gae_curios.ipynb ├── Paper ├── A3C.pdf ├── DDPG.pdf ├── DQN.pdf ├── Distributional DQN.pdf ├── Double_DQN.pdf ├── Dueling.pdf ├── GAE.pdf ├── Noisy_networks.pdf ├── PPO.pdf ├── SAC_2019.pdf └── TD3.pdf ├── Policy Gradient Algorithms ├── A2C.ipynb ├── A2C_conti_seperate_networks.ipynb ├── A2C_continous_action_space.ipynb ├── A2C_continuous_multienv.ipynb ├── PPO.ipynb ├── Parallel_processing.py ├── PolicyGradient_LSTM.ipynb ├── Policy_Gradien_+_Baseline_mean.ipynb └── REINFORCE │ ├── Img │ └── Steps_needed.png │ └── REINFORCE.py ├── Q_Learning ├── FrozenLake_q-table.py ├── Img │ ├── Q_table10000.png │ ├── Q_value.png │ ├── Receivedrewards.png │ └── steps_taken.png ├── Q_Table_E10000_a0.09_g0.9_eps0.9.pkl ├── Q_Table_E3000_a0.09_g0.9_eps0.9.pkl ├── Q_Table_own_example.ipynb ├── Readme.md ├── play_FrozenLake_Q_table.py ├── train_FrozenLake_Qtable.py └── treasure_q.py ├── README.md ├── Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa) ├── Temporal_Difference.ipynb └── lab-taxi │ ├── README.md │ ├── __pycache__ │ ├── agent.cpython-37.pyc │ └── monitor.cpython-37.pyc │ ├── agent.py │ ├── main.py │ └── monitor.py └── imgs └── web-3706562_640.jpg /Black-Box Optimization/Evolution_Strategies_parallel+novelty/ES_baseline_parallel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Oct 9 10:24:39 2019 4 | 5 | @author: Z0014354 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import gym 11 | import multiprocessing as mp 12 | import collections 13 | import copy 14 | 15 | ITERS_PER_UPDATE = 10 16 | NOISE_STD = 0.01 17 | LR = 1e-3 18 | PROCESSES_COUNT = 8 # amount of worker 19 | HIDDEN_SIZE = 4 20 | ENV_NAME = "CartPole-v0" 21 | RewardsItem = collections.namedtuple('RewardsItem', field_names=['seed', 'pos_reward', 'neg_reward', 'steps']) 22 | 23 | 24 | 25 | class Model(object): 26 | 27 | def __init__(self, stateCnt, actionCnt, hidden_size = HIDDEN_SIZE): 28 | # inits zero weights 29 | self.weights = [np.zeros(shape=(stateCnt, hidden_size)), 
np.zeros(shape=(hidden_size, hidden_size)), np.zeros(shape=(hidden_size,actionCnt))] 30 | 31 | def predict(self, inp): 32 | out = np.expand_dims(inp.flatten(), 0) 33 | out = out / np.linalg.norm(out) 34 | weight_len = len(self.weights) 35 | for idx, layer in enumerate(self.weights): 36 | # hidden activation 37 | if idx < weight_len - 1: 38 | out = self.activation(np.dot(out, layer)) 39 | # outout activation 40 | else: 41 | out = self.activation(np.dot(out, layer), type_="output_layer") 42 | return out[0] 43 | 44 | def activation(self,x, type_="hidden"): 45 | if type_ == "hidden": 46 | # relu 47 | return np.maximum(x,0) 48 | 49 | # softmax 50 | #return (np.exp(x))/sum(np.exp(x)) 51 | 52 | #softplus 53 | #return np.log(1 + np.exp(x)) 54 | 55 | #sigmoid 56 | #return 1/(1+np.exp(-x)) 57 | 58 | # tanh 59 | #return np.tanh(x) 60 | else: 61 | # softnmax 62 | #return (np.exp(x))/sum(np.exp(x)) 63 | 64 | # relu 65 | return np.maximum(x,0) 66 | 67 | def get_weights(self): 68 | return self.weights 69 | 70 | def set_weights(self, weights): 71 | self.weights = weights 72 | 73 | 74 | def evaluate(env, brain): 75 | """ 76 | Runs an evaluation on the given brain. 77 | """ 78 | state = env.reset() 79 | rewards = 0 80 | steps = 0 81 | while True: 82 | state = np.expand_dims(state, axis=0) 83 | action_prob = brain.predict(state) 84 | action = action_prob.argmax() # for discrete action space 85 | 86 | next_state, reward, done, _ = env.step(action) 87 | rewards += reward 88 | steps += 1 89 | state = next_state 90 | if done: 91 | break 92 | 93 | return rewards, steps 94 | 95 | 96 | def sample_noise(brain): 97 | """ 98 | Sampling noise to a positive and negative noise buffer. 99 | """ 100 | pos = [] 101 | neg = [] 102 | for param in brain.get_weights(): 103 | noise_t = np.random.normal(size = param.shape) 104 | pos.append(noise_t) 105 | neg.append(-noise_t) 106 | return pos, neg 107 | 108 | 109 | def eval_with_noise(env, brain, noise, noise_std): 110 | """ 111 | Evaluates the current brain with added parameter noise 112 | 113 | """ 114 | 115 | old_params = copy.deepcopy(brain.get_weights()) 116 | new_params = [] 117 | for p, p_n in zip(brain.get_weights(), noise): 118 | p += noise_std*p_n 119 | new_params.append(p) 120 | brain.set_weights(new_params) 121 | r, s = evaluate(env, brain) 122 | brain.set_weights(old_params) 123 | return r, s 124 | 125 | 126 | def worker_func(worker_id, params_queue, rewards_queue, noise_std): 127 | #print("worker: {} has started".format(worker_id)) 128 | env = gym.make(ENV_NAME) 129 | net = Model(env.observation_space.shape[0], env.action_space.n) 130 | 131 | while True: 132 | params = params_queue.get() 133 | if params is None: 134 | break 135 | 136 | # set parameters of the queue - equal to: net.load_state_dict(params) 137 | net.set_weights([param for param in params]) 138 | 139 | for _ in range(ITERS_PER_UPDATE): 140 | seed = np.random.randint(low=0, high=65535) 141 | np.random.seed(seed) 142 | noise, neg_noise = sample_noise(net) 143 | pos_reward, pos_steps = eval_with_noise(env, net, noise, noise_std) 144 | neg_reward, neg_steps = eval_with_noise(env, net, neg_noise, noise_std) 145 | #print(_, "\n",noise, pos_reward, neg_reward) 146 | 147 | rewards_queue.put(RewardsItem(seed=seed, pos_reward=pos_reward, neg_reward=neg_reward, steps=pos_steps+neg_steps)) 148 | 149 | pass 150 | 151 | 152 | def train_step(brain, batch_noise, batch_rewards, step_idx): 153 | """ 154 | Optimizes the weights of the NN based on the rewards and noise gathered 155 | """ 156 | # normalize rewards to have 
zero mean and unit variance 157 | norm_reward = np.array(batch_reward) 158 | norm_reward -= np.mean(norm_reward) 159 | s = np.std(norm_reward) 160 | if abs(s) > 1e-6: 161 | norm_reward /= s 162 | 163 | weighted_noise = None 164 | for noise, reward in zip(batch_noise, norm_reward): 165 | if weighted_noise is None: 166 | weighted_noise = [reward * p_n for p_n in noise] 167 | else: 168 | for w_n, p_n in zip(weighted_noise, noise): 169 | w_n += reward * p_n 170 | 171 | 172 | for p, p_update in zip(brain.get_weights(), weighted_noise): 173 | update = p_update / (len(batch_reward)*NOISE_STD) 174 | p += LR * update 175 | 176 | 177 | 178 | if __name__ == "__main__": 179 | 180 | env = gym.make(ENV_NAME) 181 | #env.seed(2) 182 | brain = Model(env.observation_space.shape[0], env.action_space.n) 183 | 184 | iterations = 100 # max iterations to run 185 | 186 | params_queues = [mp.Queue(maxsize=1) for _ in range(PROCESSES_COUNT)] 187 | rewards_queue = mp.Queue(maxsize=ITERS_PER_UPDATE) 188 | 189 | 190 | workers = [] 191 | 192 | 193 | for idx, params_queue in enumerate(params_queues): 194 | proc = mp.Process(target=worker_func, args=(idx, params_queue, rewards_queue, NOISE_STD)) 195 | proc.start() 196 | workers.append(proc) 197 | 198 | print("All started!") 199 | step_idx = 0 200 | reward_history = [] 201 | reward_max =[] 202 | reward_std = [] 203 | 204 | 205 | for step_idx in range(iterations): 206 | # broadcasting network params 207 | params = brain.get_weights() 208 | for q in params_queues: 209 | q.put(params) 210 | 211 | batch_noise = [] 212 | batch_reward = [] 213 | batch_steps_data = [] 214 | batch_steps = 0 215 | results = 0 216 | while True: 217 | while not rewards_queue.empty(): 218 | reward = rewards_queue.get_nowait() 219 | np.random.seed(reward.seed) # sets the seed of the current worker rewards 220 | noise, neg_noise = sample_noise(brain) 221 | batch_noise.append(noise) 222 | batch_reward.append(reward.pos_reward) 223 | batch_noise.append(neg_noise) 224 | batch_reward.append(reward.neg_reward) 225 | results += 1 226 | batch_steps += reward.steps 227 | 228 | if results == PROCESSES_COUNT * ITERS_PER_UPDATE: 229 | break 230 | 231 | step_idx += 1 232 | m_reward = np.mean(batch_reward) 233 | reward_history.append(m_reward) 234 | reward_max.append(np.max(batch_reward)) 235 | reward_std.append(np.std(batch_reward)) 236 | if m_reward > 199: 237 | print("\nSolved the environment in {} steps".format(step_idx)) 238 | break 239 | train_step(brain, batch_noise, batch_reward, step_idx) 240 | 241 | print("\rStep: {}, Mean_Reward: {:.2f}".format(step_idx, m_reward), end = "", flush = True) 242 | 243 | 244 | for worker, p_queue in zip(workers, params_queues): 245 | p_queue.put(None) 246 | worker.join() 247 | 248 | plt.figure(figsize = (11,7)) 249 | plt.plot(reward_history, label = "Mean Reward", color = "orange") 250 | plt.plot(reward_max, label = "Max Reward", color = "blue") 251 | plt.plot(reward_std, label = "Reward std", color = "green") 252 | plt.xlabel("Steps") 253 | plt.ylabel("Rewards") 254 | plt.legend() 255 | plt.show() -------------------------------------------------------------------------------- /Black-Box Optimization/Evolution_Strategies_parallel+novelty/ES_disc_parallel_novelty.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Oct 9 10:24:39 2019 4 | 5 | @author: Z0014354 6 | """ 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from torch.distributions import Normal 
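# ---------------------------------------------------------------------------
# Hedged sketch (added for illustration, not part of the original scripts):
# the train_step() functions in this folder implement mirrored-sampling ES.
# Each noise sample eps is evaluated at theta + sigma*eps and theta - sigma*eps,
# the collected rewards are normalized, and the parameters move roughly along
#     theta <- theta + lr / (N * sigma) * sum_i R_i * eps_i
# A minimal, self-contained numpy version of that update, assuming `weights`
# is a list of numpy arrays and each batch_noise[i] is a matching list:
import numpy as np

def es_update(weights, batch_noise, batch_rewards, lr=1e-3, noise_std=0.01):
    """Apply one mirrored-sampling ES update in place (illustrative only)."""
    rewards = np.asarray(batch_rewards, dtype=np.float64)
    rewards = rewards - rewards.mean()
    std = rewards.std()
    if std > 1e-6:
        rewards = rewards / std
    for layer_idx, param in enumerate(weights):
        # reward-weighted sum of the noise that was added to this layer
        grad = sum(r * noise[layer_idx] for r, noise in zip(rewards, batch_noise))
        param += lr * grad / (len(batch_rewards) * noise_std)
# ---------------------------------------------------------------------------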
11 | 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | import gym 15 | import torch.multiprocessing as mp 16 | import collections 17 | from collections import deque 18 | import copy 19 | from tensorboardX import SummaryWriter 20 | from sklearn.neighbors import NearestNeighbors 21 | 22 | ITERS_PER_UPDATE = 10 23 | NOISE_STD = 0.1 #0.04 higher std leeds to better exploration - more stable learning 24 | LR = 2e-2 25 | PROCESSES_COUNT = 6 # amount of worker default 6 26 | HIDDEN_SIZE = 5 # 6 27 | K_NEIGHBORS = 10 28 | ENV_NAME = "CartPole-v0" #"Alien-ram-v0" 29 | RewardsItem = collections.namedtuple('RewardsItem', field_names=['seed', 'pos_reward', 'neg_reward', 'steps']) 30 | 31 | 32 | 33 | class Model(nn.Module): 34 | def __init__(self, state_size, action_size, idx, hidden_size=HIDDEN_SIZE): 35 | super(Model, self).__init__() 36 | self.idx = idx 37 | self.fc1 = nn.Linear(state_size, hidden_size) 38 | self.fc2 = nn.Linear(hidden_size, hidden_size) 39 | self.fc3 = nn.Linear(hidden_size, action_size) 40 | 41 | def forward(self, x): 42 | x = torch.relu(self.fc1(x)) 43 | x = torch.relu(self.fc2(x)) 44 | probs = torch.softmax(self.fc3(x), dim=1) 45 | return probs 46 | 47 | 48 | def evaluate(env, brain): 49 | """ 50 | Runs an evaluation on the given brain. 51 | """ 52 | state = env.reset() 53 | rewards = 0 54 | steps = 0 55 | while True: 56 | state = torch.from_numpy(state).unsqueeze(0).float() 57 | probs = brain(state) 58 | action = probs.max(dim = 1)[1] 59 | next_state, reward, done, _ = env.step(action.data.numpy()[0]) 60 | rewards += reward 61 | steps += 1 62 | state = next_state 63 | if done: 64 | break 65 | 66 | return rewards, steps 67 | 68 | 69 | def sample_noise(brain): 70 | """ 71 | Samples noise from a normal distribution in the shape of the brain parameters. Output are two noisy parameters: + noise and - noise (for better and more stable learning!) 
72 | """ 73 | pos = [] 74 | neg = [] 75 | for param in brain.parameters(): 76 | noise_t = torch.tensor(np.random.normal(size = param.data.size()).astype(np.float32)) 77 | pos.append(noise_t) 78 | neg.append(-noise_t) 79 | return pos, neg 80 | 81 | 82 | def eval_with_noise(env, brain, noise, noise_std): 83 | """ 84 | Evaluates the current brain with added parameter noise 85 | 86 | """ 87 | for p, p_n in zip(brain.parameters(), noise): 88 | p.data += noise_std * p_n 89 | r, s = evaluate(env, brain) 90 | for p, p_n in zip(brain.parameters(), noise): 91 | p.data -= noise_std * p_n 92 | return r, s 93 | 94 | 95 | def worker_func(worker_id, params_queue, rewards_queue, noise_std): 96 | """ 97 | Worker function that gathers pos and negative rewards for the optimization process and puts them in the rewards_queue with the network parameter seed: 98 | >> rewards_queue.put(RewardsItem(seed=seed, pos_reward=pos_reward, neg_reward=neg_reward, steps=pos_steps+neg_steps)) << 99 | """ 100 | #print("worker: {} has started".format(worker_id)) 101 | env = gym.make(ENV_NAME) 102 | net = Model(env.observation_space.shape[0], env.action_space.n, "worker") 103 | net.eval() 104 | while True: 105 | params = params_queue.get() 106 | if params is None: 107 | break 108 | 109 | # set parameters of the queue 110 | net.load_state_dict(params) 111 | 112 | for _ in range(ITERS_PER_UPDATE): 113 | seed = np.random.randint(low=0, high=65535) 114 | np.random.seed(seed) 115 | noise, neg_noise = sample_noise(net) 116 | pos_reward, pos_steps = eval_with_noise(env, net, noise, noise_std) 117 | neg_reward, neg_steps = eval_with_noise(env, net, neg_noise, noise_std) 118 | #print(_, "\n",noise, pos_reward, neg_reward) 119 | rewards_queue.put(RewardsItem(seed=seed, pos_reward=pos_reward, neg_reward=neg_reward, steps=pos_steps+neg_steps)) 120 | 121 | pass 122 | 123 | 124 | def train_step(brain, novelty, batch_noise, batch_rewards, step_idx): 125 | """ 126 | Optimizes the weights of the NN based on the rewards and noise gathered 127 | """ 128 | # normalize rewards to have zero mean and unit variance 129 | norm_reward = np.array(batch_reward) 130 | norm_reward -= np.mean(norm_reward) 131 | s = np.std(norm_reward) 132 | if abs(s) > 1e-6: 133 | norm_reward /= s 134 | 135 | weighted_noise = None 136 | for noise, reward in zip(batch_noise, norm_reward): 137 | if weighted_noise is None: 138 | weighted_noise = [(W*reward* p_n) + ((1-W)*novelty*p_n) for p_n in noise] # combining reward and novelty 139 | else: 140 | for w_n, p_n in zip(weighted_noise, noise): 141 | w_n += (W*reward* p_n) + ((1-W)*novelty*p_n) 142 | 143 | 144 | for p, p_update in zip(brain.parameters(), weighted_noise): 145 | update = p_update / (len(batch_reward)*NOISE_STD) 146 | p.data += LR * update 147 | 148 | 149 | def test_current_params(env, net): 150 | """ 151 | Runs the current network parameters on the env to visually monitor the progress. 152 | """ 153 | state = env.reset() 154 | 155 | while True: 156 | env.render() 157 | state = torch.from_numpy(state).unsqueeze(0).float() 158 | probs = brain(state) 159 | action = probs.max(dim = 1)[1] 160 | state, reward, done, _ = env.step(action.data.numpy()[0]) 161 | 162 | if done: 163 | break 164 | 165 | def get_behavior_char(env, net): 166 | """ 167 | Returns the initial behavior characterization value b_pi0 for a network. 168 | The value is defined in this case as the final state of agent in the environment. 169 | 170 | >>> Important to find a good behavior characterization. Depents on the environment! 
<<< -> final state, step count ... 171 | 172 | """ 173 | state = env.reset() 174 | step_count = 0 175 | while True: 176 | state = torch.from_numpy(state).unsqueeze(0).float() 177 | probs = brain(state) 178 | action = probs.max(dim = 1)[1] 179 | state, reward, done, _ = env.step(action.data.numpy()[0]) 180 | step_count += 1 181 | if done: 182 | break 183 | #print(step_count) 184 | return np.array([step_count]) #state 185 | 186 | 187 | def get_kNN(archive, bc, n_neighbors): 188 | """ 189 | Searches and samples the K-nearest-neighbors from the archive and a new behavior characterization 190 | returns the summed distance between input behavior characterization and the bc in the archive 191 | 192 | """ 193 | 194 | archive = np.concatenate(archive) 195 | neigh = NearestNeighbors(n_neighbors=n_neighbors) 196 | neigh.fit(archive) 197 | distances, idx = neigh.kneighbors(X = bc, n_neighbors=n_neighbors) 198 | #k_nearest_neighbors = archive[idx].squeeze(0) 199 | 200 | return sum(distances.squeeze(0)) 201 | 202 | 203 | 204 | # ============================================================================= 205 | # def calc_novelty(b_pi_theta, archive): 206 | # """ 207 | # calculates the novelty of a given arcive of behavior characterizations. 208 | # returns the mean distance between the initial behavior characterizations and all new gathered behavior characterizations. 209 | # """ 210 | # # distance loss function: 211 | # distance = nn.MSELoss() #nn.PairwiseDistance() 212 | # # creates arcive vector for distance calc 213 | # archive_v = torch.cat(archive) 214 | # # create a vector of initial behavior characterizations in the shape of the arcive length 215 | # b_pi_theta_v = torch.cat([b_pi_theta for i in range(len(archive))]) 216 | # 217 | # return torch.sqrt(distance(b_pi_theta_v, archive_v)).mean() 218 | # ============================================================================= 219 | 220 | def calc_noveltiy_distribution(novelties): 221 | """ 222 | Calculates the probabilities of each model parameters of being selected as its 223 | novelty normalized by the sum of novelty across all policies: 224 | 225 | P(theta_m) for each element in the meta_population M - m element M 226 | 227 | """ 228 | probabilities = [round((novel/(sum(novelties))),4) for novel in novelties] 229 | return probabilities 230 | 231 | 232 | if __name__ == "__main__": 233 | 234 | env = gym.make(ENV_NAME) 235 | #env.seed(2) 236 | MPS = 2 # meta population size 237 | meta_population = [Model(env.observation_space.shape[0],env.action_space.n, idx=i) for i in range(MPS)] 238 | 239 | # create arcive for models 240 | archive = [] 241 | writer = SummaryWriter() 242 | iterations = 300 #1500 # max iterations to run 243 | 244 | delta_reward_buffer = deque(maxlen=10) # buffer to store the reward gradients to see if rewards stay constant over a defined time horizont ~> local min 245 | W = 1 246 | 247 | params_queues = [mp.Queue(maxsize=1) for _ in range(PROCESSES_COUNT)] 248 | rewards_queue = mp.Queue(maxsize=ITERS_PER_UPDATE) 249 | workers = [] 250 | 251 | for idx, params_queue in enumerate(params_queues): 252 | proc = mp.Process(target=worker_func, args=(idx, params_queue, rewards_queue, NOISE_STD)) 253 | proc.start() 254 | workers.append(proc) 255 | 256 | print("All started!") 257 | step_idx = 0 258 | reward_history = [] 259 | reward_max =[] 260 | reward_min = [] 261 | reward_std = [] 262 | 263 | old_m_reward = 0 264 | 265 | for step_idx in range(iterations): 266 | 267 | ########################## NOVELTY BRAIN SELECTION 
############################# 268 | # select new network from the meta population based on its probability: 269 | if len(archive) > 0: 270 | novelties = [] 271 | S = np.minimum(K_NEIGHBORS, len(archive)) 272 | for model in meta_population: 273 | b_pi_theta = torch.from_numpy(get_behavior_char(env, model)).unsqueeze(0).float() 274 | distance = get_kNN(archive, b_pi_theta.numpy(), S) 275 | novelty = distance / S 276 | if novelty <= 1e-3: 277 | novelty = 5e-3 278 | novelties.append(novelty) 279 | 280 | #print("novelties:", novelties) 281 | 282 | probs = calc_noveltiy_distribution(novelties) 283 | #print("probs: ", probs ) 284 | probs = np.array(probs) 285 | probs /= probs.sum() # norm so that sum up to one - does without as well but np gives error because of rounding 286 | brain_idx = np.random.choice(list(range(MPS)),p=probs) # select new brain based on novelty probabilities 287 | brain = meta_population[brain_idx] 288 | novelty = novelties[brain_idx] 289 | else: 290 | brain_idx = np.random.randint(0, MPS) 291 | brain = meta_population[brain_idx] 292 | novelty = 1 293 | ################################################################################### 294 | 295 | # broadcasting network params 296 | params = brain.state_dict() 297 | for q in params_queues: 298 | q.put(params) 299 | 300 | batch_noise = [] 301 | batch_reward = [] 302 | batch_steps_data = [] 303 | batch_steps = 0 304 | results = 0 305 | 306 | while True: 307 | #print(rewards_queue.qsize()) 308 | while not rewards_queue.empty(): 309 | reward = rewards_queue.get_nowait() 310 | np.random.seed(reward.seed) # sets the seed of the current worker rewards 311 | noise, neg_noise = sample_noise(brain) 312 | batch_noise.append(noise) 313 | batch_reward.append(reward.pos_reward) 314 | batch_noise.append(neg_noise) 315 | batch_reward.append(reward.neg_reward) 316 | results += 1 317 | batch_steps += reward.steps 318 | 319 | if results == PROCESSES_COUNT * ITERS_PER_UPDATE: 320 | break 321 | 322 | step_idx += 1 323 | m_reward = np.mean(batch_reward) 324 | 325 | reward_gradient_mean = np.mean(delta_reward_buffer) 326 | r_koeff = abs(m_reward - reward_gradient_mean) 327 | # if last few rewards are almost konstant -> stuck in loc minima -> decrease W for exploration: higher novelty weight 328 | if r_koeff < 1.5: 329 | W = np.maximum(0, W-0.05) 330 | else: 331 | W = np.minimum(1, W+0.05) 332 | delta_reward_buffer.append(m_reward) 333 | old_m_reward = m_reward 334 | 335 | writer.add_scalar("mean_reward", np.mean(batch_reward), step_idx) 336 | writer.add_scalar("max_reward", np.max(batch_reward), step_idx) 337 | writer.add_scalar("min_reward", np.min(batch_reward), step_idx) 338 | writer.add_scalar("std", np.std(batch_reward), step_idx) 339 | writer.add_scalar("novelty", novelty, step_idx) 340 | writer.add_scalar("novelty_w", W, step_idx) 341 | # ============================================================================= 342 | # if m_reward > -250: 343 | # print("\nSolved the environment in {} steps".format(step_idx)) 344 | # break 345 | # ============================================================================= 346 | train_step(brain, novelty, batch_noise, batch_reward, step_idx) 347 | # select new behavior: 348 | b_pix = torch.from_numpy(get_behavior_char(env, brain)).unsqueeze(0).float() 349 | # append new behavior to specific brain archive 350 | archive.append(b_pix.numpy()) 351 | 352 | print("\rStep: {}, Mean_Reward: {:.2f}, Novelty: {:.2f}, W: {:.2f} r_koeff: {:.2f}".format(step_idx, m_reward, novelty, W, r_koeff), end = "", flush = 
True) 353 | 354 | # if step_idx % 10 == 0: 355 | # test_current_params(env, brain) 356 | 357 | for worker, p_queue in zip(workers, params_queues): 358 | p_queue.put(None) 359 | worker.join() 360 | -------------------------------------------------------------------------------- /Black-Box Optimization/Evolution_Strategies_parallel+novelty/ES_own_conti_parallel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Oct 9 10:24:39 2019 4 | 5 | @author: Z0014354 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import gym 11 | import multiprocessing as mp 12 | import collections 13 | import copy 14 | 15 | ITERS_PER_UPDATE = 10 16 | NOISE_STD = 0.1 #0.04 higher std leeds to better exploration - more stable learning 17 | LR = 2e-2 18 | PROCESSES_COUNT = 6 # amount of worker default 6 19 | HIDDEN_SIZE = 12 # 6 20 | ENV_NAME = "Damper-v0" 21 | RewardsItem = collections.namedtuple('RewardsItem', field_names=['seed', 'pos_reward', 'neg_reward', 'steps']) 22 | 23 | 24 | 25 | class Model(object): 26 | 27 | def __init__(self, stateCnt, actionCnt, hidden_size = HIDDEN_SIZE): 28 | # inits zero weights 29 | self.weights = [np.random.uniform(-1,1,size=(stateCnt, hidden_size)), np.random.uniform(-1,1, size=(hidden_size, hidden_size)), np.random.uniform(-1,1,size=(hidden_size,actionCnt))] 30 | 31 | def predict(self, inp): 32 | out = np.expand_dims(inp.flatten(), 0) 33 | #out = out / np.linalg.norm(out) 34 | weight_len = len(self.weights) 35 | for idx, layer in enumerate(self.weights): 36 | # hidden activation 37 | if idx < weight_len - 1: 38 | out = self.activation(np.dot(out, layer)) 39 | # outout activation 40 | else: 41 | out = self.activation(np.dot(out, layer), type_="output_layer") 42 | return out[0] 43 | 44 | def activation(self,x, type_="hidden"): 45 | if type_ == "hidden": 46 | # relu 47 | return np.maximum(x,0) 48 | 49 | # softmax 50 | #return (np.exp(x))/sum(np.exp(x)) 51 | 52 | #softplus 53 | #return np.log(1 + np.exp(x)) 54 | 55 | #sigmoid 56 | #return 1/(1+np.exp(-x)) 57 | 58 | # tanh 59 | #return np.tanh(x) 60 | else: 61 | # tanh 62 | return np.tanh(x) 63 | 64 | # relu 65 | #return np.maximum(x,0) 66 | 67 | def get_weights(self): 68 | return self.weights 69 | 70 | def set_weights(self, weights): 71 | self.weights = weights 72 | 73 | 74 | def evaluate(env, brain): 75 | """ 76 | Runs an evaluation on the given brain. 77 | """ 78 | state = env.reset() 79 | rewards = 0 80 | steps = 0 81 | while True: 82 | state = np.expand_dims(state, axis=0) 83 | #print("State:", state) 84 | action_mean = brain.predict(state) 85 | action = np.random.normal(action_mean, scale=0.01) 86 | action = np.clip(action, -1, 1) # pendulums action range is between -2,2 87 | next_state, reward, done, _ = env.step(action) 88 | rewards += reward 89 | steps += 1 90 | state = next_state 91 | if done: 92 | break 93 | 94 | return rewards, steps 95 | 96 | 97 | def sample_noise(brain): 98 | """ 99 | Samples noise from a normal distribution in the shape of the brain parameters. Output are two noisy parameters: + noise and - noise (for better and more stable learning!) 
100 | """ 101 | pos = [] 102 | neg = [] 103 | for param in brain.get_weights(): 104 | noise_t = np.random.normal(size = param.shape) 105 | pos.append(noise_t) 106 | neg.append(-noise_t) 107 | return pos, neg 108 | 109 | 110 | def eval_with_noise(env, brain, noise, noise_std): 111 | """ 112 | Evaluates the current brain with added parameter noise 113 | 114 | """ 115 | old_params = copy.deepcopy(brain.get_weights()) 116 | new_params = [] 117 | for p, p_n in zip(brain.get_weights(), noise): 118 | p += noise_std*p_n 119 | new_params.append(p) 120 | brain.set_weights(new_params) 121 | r, s = evaluate(env, brain) 122 | brain.set_weights(old_params) 123 | return r, s 124 | 125 | 126 | def worker_func(worker_id, params_queue, rewards_queue, noise_std): 127 | """ 128 | Worker function that gathers pos and negative rewards for the optimization process and puts them in the rewards_queue with the network parameter seed: 129 | >> rewards_queue.put(RewardsItem(seed=seed, pos_reward=pos_reward, neg_reward=neg_reward, steps=pos_steps+neg_steps)) << 130 | """ 131 | #print("worker: {} has started".format(worker_id)) 132 | env = gym.make(ENV_NAME) 133 | net = Model(env.observation_space.shape[0], env.action_space.shape[0]) 134 | 135 | while True: 136 | params = params_queue.get() 137 | if params is None: 138 | break 139 | 140 | # set parameters of the queue - equal to: net.load_state_dict(params) 141 | net.set_weights([param for param in params]) 142 | 143 | for _ in range(ITERS_PER_UPDATE): 144 | seed = np.random.randint(low=0, high=65535) 145 | np.random.seed(seed) 146 | noise, neg_noise = sample_noise(net) 147 | pos_reward, pos_steps = eval_with_noise(env, net, noise, noise_std) 148 | neg_reward, neg_steps = eval_with_noise(env, net, neg_noise, noise_std) 149 | #print(_, "\n",noise, pos_reward, neg_reward) 150 | rewards_queue.put(RewardsItem(seed=seed, pos_reward=pos_reward, neg_reward=neg_reward, steps=pos_steps+neg_steps)) 151 | 152 | pass 153 | 154 | 155 | def train_step(brain, batch_noise, batch_rewards, step_idx): 156 | """ 157 | Optimizes the weights of the NN based on the rewards and noise gathered 158 | """ 159 | # normalize rewards to have zero mean and unit variance 160 | norm_reward = np.array(batch_reward) 161 | norm_reward -= np.mean(norm_reward) 162 | s = np.std(norm_reward) 163 | if abs(s) > 1e-6: 164 | norm_reward /= s 165 | 166 | weighted_noise = None 167 | for noise, reward in zip(batch_noise, norm_reward): 168 | if weighted_noise is None: 169 | weighted_noise = [reward * p_n for p_n in noise] 170 | else: 171 | for w_n, p_n in zip(weighted_noise, noise): 172 | w_n += reward * p_n 173 | 174 | 175 | for p, p_update in zip(brain.get_weights(), weighted_noise): 176 | update = p_update / (len(batch_reward)*NOISE_STD) 177 | p += LR * update 178 | 179 | 180 | def test_current_params(env, brain): 181 | """ 182 | Runs the current network parameters on the env to visually monitor the progress. 
183 | """ 184 | state = env.reset() 185 | 186 | while True: 187 | env.render() 188 | state = np.expand_dims(state, axis=0) 189 | action_mean = brain.predict(state) 190 | action = np.random.normal(action_mean, scale=0.01) 191 | action = np.clip(action, -1, 1) # pendulums action range is between -2,2 192 | state, reward, done, _ = env.step(action) 193 | 194 | if done: 195 | break 196 | 197 | 198 | if __name__ == "__main__": 199 | 200 | env = gym.make(ENV_NAME) 201 | #env.seed(2) 202 | brain = Model(env.observation_space.shape[0], env.action_space.shape[0]) 203 | 204 | iterations = 100 #1500 # max iterations to run 205 | 206 | params_queues = [mp.Queue(maxsize=1) for _ in range(PROCESSES_COUNT)] 207 | rewards_queue = mp.Queue(maxsize=ITERS_PER_UPDATE) 208 | workers = [] 209 | 210 | for idx, params_queue in enumerate(params_queues): 211 | proc = mp.Process(target=worker_func, args=(idx, params_queue, rewards_queue, NOISE_STD)) 212 | proc.start() 213 | workers.append(proc) 214 | 215 | print("All started!") 216 | step_idx = 0 217 | reward_history = [] 218 | reward_max =[] 219 | reward_min = [] 220 | reward_std = [] 221 | 222 | for step_idx in range(iterations): 223 | # broadcasting network params 224 | params = brain.get_weights() 225 | for q in params_queues: 226 | q.put(params) 227 | 228 | batch_noise = [] 229 | batch_reward = [] 230 | batch_steps_data = [] 231 | batch_steps = 0 232 | results = 0 233 | 234 | while True: 235 | while not rewards_queue.empty(): 236 | reward = rewards_queue.get_nowait() 237 | np.random.seed(reward.seed) # sets the seed of the current worker rewards 238 | noise, neg_noise = sample_noise(brain) 239 | batch_noise.append(noise) 240 | batch_reward.append(reward.pos_reward) 241 | batch_noise.append(neg_noise) 242 | batch_reward.append(reward.neg_reward) 243 | results += 1 244 | batch_steps += reward.steps 245 | 246 | if results == PROCESSES_COUNT * ITERS_PER_UPDATE: 247 | break 248 | 249 | step_idx += 1 250 | m_reward = np.mean(batch_reward) 251 | reward_history.append(m_reward) 252 | reward_max.append(np.max(batch_reward)) 253 | reward_min.append(np.min(batch_reward)) 254 | reward_std.append(np.std(batch_reward)) 255 | # ============================================================================= 256 | # if m_reward > -250: 257 | # print("\nSolved the environment in {} steps".format(step_idx)) 258 | # break 259 | # ============================================================================= 260 | train_step(brain, batch_noise, batch_reward, step_idx) 261 | 262 | print("\rStep: {}, Mean_Reward: {:.2f}".format(step_idx, m_reward), end = "", flush = True) 263 | 264 | if step_idx % 10 == 0: 265 | test_current_params(env, brain) 266 | 267 | for worker, p_queue in zip(workers, params_queues): 268 | p_queue.put(None) 269 | worker.join() 270 | 271 | plt.figure(figsize = (11,7)) 272 | plt.plot(reward_history, label = "Mean Reward", color = "green") 273 | plt.plot(reward_max, label = "Max Reward", color = "blue") 274 | plt.plot(reward_min, label = "Min Reward", color = "red") 275 | plt.plot(reward_std, label = "Reward std", color = "orange") 276 | plt.xlabel("Steps") 277 | plt.ylabel("Rewards") 278 | plt.legend() 279 | plt.show() -------------------------------------------------------------------------------- /Black-Box Optimization/Evolution_Strategies_parallel+novelty/README.md: -------------------------------------------------------------------------------- 1 | # Evolution Strategy algorithms 2 | 3 | 4 | 5 | This folder contains 3 different Evolutionary Strategy algorithms: 
6 | 7 | - [ES_baseline_parallel.py](ES_baseline_parallel.py) 8 | 9 | A baseline Evolution Strategy algorithm for discrete action spaces that solves the CartPole environment.
10 | The code is based on this [paper](https://arxiv.org/abs/1703.03864) and on this [book chapter](https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/tree/master/Chapter16). 11 | Written solely in NumPy! 12 | 13 | 14 | - [ES_disc_parallel_novelty.py](ES_disc_parallel_novelty.py) 15 | 16 | An Evolution Strategy algorithm for discrete action spaces with novelty search for extra exploration.
17 | The code is based on the [EvoStrategy paper](https://arxiv.org/abs/1703.03864), the [novelty-seeking agents paper](http://papers.nips.cc/paper/7750-improving-exploration-in-evolution-strategies-for-deep-reinforcement-learning-via-a-population-of-novelty-seeking-agents) and [book chapter 16](https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/tree/master/Chapter16). 18 | Written in PyTorch. 19 | 20 | - [ES_conti_parallel_novelty.py](ES_conti_parallel_novelty.py) 21 | 22 | An Evolution Strategy algorithm for continuous action spaces with novelty search for extra exploration.
23 | Code is based on [Paper EvoStrategy](https://arxiv.org/abs/1703.03864), [Paper novelty seeking agents](http://papers.nips.cc/paper/7750-improving-exploration-in-evolution-strategies-for-deep-reinforcement-learning-via-a-population-of-novelty-seeking-agents) and the [book chapter 16](https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/tree/master/Chapter16). 24 | Written with pytorch. 25 | 26 | 27 | 28 | Evolution Strategies solving Pendulum: 29 | 30 | 31 | ![alt_text](imgs/pendulum.png) 32 | 33 | -------------------------------------------------------------------------------- /Black-Box Optimization/Evolution_Strategies_parallel+novelty/imgs/pendulum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Black-Box Optimization/Evolution_Strategies_parallel+novelty/imgs/pendulum.png -------------------------------------------------------------------------------- /Black-Box Optimization/README.md: -------------------------------------------------------------------------------- 1 | # Black-Box Optimization 2 | 3 | ### Evolution Strategy 4 | #### Baseline implementation [ES_cartpole](https://github.com/BY571/Reinforcement-Learning/blob/master/Black-Box%20Optimization/Evolutionary_Strategies_Cartpole.ipynb) 5 | 6 | 7 | 8 | ### Genetic Algorithms 9 | 10 | #### Baseline implementation [GA_cartpole](https://github.com/BY571/Reinforcement-Learning/blob/master/Black-Box%20Optimization/genetic_algorithm_base.py) 11 | - run 'python genetic_algorithm_base.py' with the flags: `--noise`, `--ps`, `--pc` as: 12 | - `--noise`(std) that is added as the mutation of the neural network weights, default = 0.05 13 | - `--ps` as the population size, default = 50 14 | - `--pc` as the parents count or amount of top performer that build the new population, default = 10 15 | 16 | Example performance with noise_std = 0.05, ps=30, pc=10 17 | 18 | ![graph](imgs/ga_cartpole.png) 19 | -------------------------------------------------------------------------------- /Black-Box Optimization/genetic_algorithm_base.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | import torch.nn as nn 4 | from torch.utils.tensorboard import SummaryWriter 5 | import numpy as np 6 | import argparse 7 | import gym 8 | 9 | 10 | class Network(nn.Module): 11 | def __init__(self,state_size,action_size,hidden_layer,seed): 12 | super(Network, self).__init__() 13 | self.state_size = state_size 14 | self.action_size = action_size 15 | self.hidden_layer = hidden_layer 16 | 17 | self.net = nn.Sequential( 18 | nn.Linear(self.state_size, self.hidden_layer), 19 | nn.ReLU(), 20 | nn.Linear(self.hidden_layer, self.hidden_layer), 21 | nn.ReLU(), 22 | nn.Linear(self.hidden_layer, self.action_size), 23 | nn.Softmax(dim = 1)) 24 | 25 | def forward(self, x): 26 | return self.net(x) 27 | 28 | def evaluate(env, net): 29 | """ 30 | Plays a round of the game and returns the obtained reward 31 | """ 32 | state = env.reset() 33 | rewards = 0 34 | while True: 35 | state = torch.from_numpy(state).unsqueeze(0).float() 36 | action_prob = net(state) 37 | action = action_prob.max(dim=1)[1] #argmax 38 | next_state, reward, done, info = env.step(action.data.numpy()[0]) 39 | rewards += reward 40 | state = next_state 41 | if done: 42 | break 43 | return rewards 44 | 45 | def mutate_parent(net): 46 | """ 47 | Mutates the parent 
neural nets by adding noise sampled by a normal distribution. 48 | 49 | """ 50 | new_net = copy.deepcopy(net) 51 | for parameter in new_net.parameters(): 52 | noise = torch.tensor(np.random.normal(size=parameter.data.size()).astype(np.float32)) 53 | parameter.data += NOISE_STD * noise 54 | return new_net 55 | 56 | 57 | if __name__ == "__main__": 58 | # parse input values like 59 | # - Noise standard deviation [NOISE_STD] 60 | # - Population size [POPULATION_SIZE] 61 | # - Parents count [PARENTS_COUNT] 62 | 63 | parser = argparse.ArgumentParser(description = "Noise, Population size, Parents count") 64 | parser.add_argument("--noise",type = float,default=1e-2) 65 | parser.add_argument( "--ps",type=int,default=50) 66 | parser.add_argument( "--pc",type=int,default=10) 67 | 68 | args = parser.parse_args() 69 | NOISE_STD = args.noise 70 | POPULATION_SIZE = args.ps 71 | PARENTS_COUNT = args.pc 72 | 73 | #print(f"Noise: {NOISE_STD}, PopS: {POPULATION_SIZE}, PARENTS_COUNT: {PARENTS_COUNT}") 74 | np.random.seed(seed=42) 75 | torch.manual_seed(42) 76 | writer = SummaryWriter(comment="-CartPole") 77 | env = gym.make("CartPole-v0") 78 | gen_idx = 0 79 | state_size = env.observation_space.shape[0] 80 | action_size = env.action_space.n 81 | 82 | nets = [Network(state_size, action_size, hidden_layer=32, seed=3) for _ in range(POPULATION_SIZE)] 83 | population = [(net, evaluate(env, net)) for net in nets] 84 | 85 | while True: 86 | population.sort(key=lambda p: p[1], reverse=True) # sorts the fitness from highest to lowest 87 | rewards = [p[1] for p in population[:PARENTS_COUNT]] # takes the fitness of the top x-parents 88 | reward_mean = np.mean(rewards) 89 | reward_max = np.max(rewards) 90 | reward_std = np.std(rewards) 91 | 92 | writer.add_scalar("reward_mean", reward_mean, gen_idx) 93 | writer.add_scalar("reward_max", reward_max, gen_idx) 94 | writer.add_scalar("reward_std", reward_std, gen_idx) 95 | print(f"Generation: {gen_idx} | Reward_mean: {reward_mean} | Reward_max: {reward_max} | Reward_std: {reward_std}") 96 | 97 | if reward_mean > 199: 98 | print("Solved the environment in {} generations".format(gen_idx)) 99 | break 100 | writer.close() 101 | 102 | prev_population = population 103 | population = [population[0]] # list of the nets 104 | 105 | for _ in range(POPULATION_SIZE-1): 106 | parent_idx = np.random.randint(0, PARENTS_COUNT) #sample the new population from the top x-parents 107 | parent = prev_population[parent_idx][0] 108 | net = mutate_parent(parent) 109 | population.append((net, evaluate(env, net))) 110 | 111 | gen_idx += 1 112 | 113 | 114 | -------------------------------------------------------------------------------- /Black-Box Optimization/imgs/ga_cartpole.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Black-Box Optimization/imgs/ga_cartpole.png -------------------------------------------------------------------------------- /ContinousControl/DDPG.py: -------------------------------------------------------------------------------- 1 | import random 2 | from tensorboardX import SummaryWriter 3 | import torch 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | import numpy as np 7 | import roboschool 8 | import gym 9 | from gym import wrappers 10 | import pybullet_envs 11 | import time 12 | 13 | class NormalizedActions(gym.ActionWrapper): 14 | 15 | def _action(self, action): 16 | low_bound = 
self.action_space.low 17 | upper_bound = self.action_space.high 18 | 19 | action = low_bound + (action + 1.0) * 0.5 * (upper_bound - low_bound) 20 | action = np.clip(action, low_bound, upper_bound) 21 | 22 | return action 23 | 24 | def _reverse_action(self, action): 25 | low_bound = self.action_space.low 26 | upper_bound = self.action_space.high 27 | 28 | action = 2 * (action - low_bound) / (upper_bound - low_bound) - 1 29 | action = np.clip(action, low_bound, upper_bound) 30 | 31 | return action 32 | 33 | class ReplayBuffer: 34 | def __init__(self, capacity): 35 | self.capacity = capacity 36 | self.buffer = [] 37 | self.position = 0 38 | 39 | def push(self, state, action, reward, next_state, done): 40 | if len(self.buffer) < self.capacity: 41 | self.buffer.append(None) 42 | self.buffer[self.position] = (state, action, reward, next_state, done) 43 | self.position = (self.position + 1) % self.capacity 44 | 45 | def sample(self, batch_size): 46 | batch = random.sample(self.buffer, batch_size) 47 | state, action, reward, next_state, done = map(np.stack, zip(*batch)) 48 | return state, action, reward, next_state, done 49 | 50 | def __len__(self): 51 | return len(self.buffer) 52 | 53 | class OUNoise(object): 54 | def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000): 55 | self.mu = mu # mean value -> as "goal state" 0.0 in the sense of no noise 56 | self.theta = theta 57 | self.sigma = max_sigma # variance of the noise 58 | self.max_sigma = max_sigma 59 | self.min_sigma = min_sigma 60 | self.decay_period = decay_period 61 | self.action_dim = action_space.shape[0] 62 | self.low = action_space.low 63 | self.high = action_space.high 64 | self.reset() 65 | 66 | def reset(self): 67 | self.state = np.ones(self.action_dim) * self.mu 68 | 69 | def evolve_state(self): 70 | x = self.state 71 | dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim) 72 | self.state = x + dx 73 | return self.state 74 | 75 | def get_action(self, action, t=0): 76 | ou_state = self.evolve_state() 77 | self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period) 78 | return np.clip(action + ou_state, self.low, self.high), ou_state 79 | 80 | #https://github.com/vitchyr/rlkit/blob/master/rlkit/exploration_strategies/ou_strategy.py 81 | 82 | 83 | class Actor(nn.Module): 84 | def __init__(self, input_shape, action_shape): 85 | super(Actor, self).__init__() 86 | self.actor = nn.Sequential(nn.Linear(input_shape[0],400), 87 | nn.LayerNorm(400), 88 | nn.ReLU(), 89 | nn.Linear(400,300), 90 | nn.LayerNorm(300), 91 | nn.ReLU(), 92 | nn.Linear(300,action_shape[0]), 93 | nn.Tanh()) 94 | def forward(self, x): 95 | state = torch.FloatTensor(x).to(device) 96 | return self.actor(state) 97 | 98 | class Critic(nn.Module): 99 | def __init__(self, input_shape, action_shape): 100 | super(Critic, self).__init__() 101 | 102 | self.critic1 = nn.Sequential(nn.Linear(input_shape[0],400), 103 | #nn.LayerNorm(256), 104 | nn.ReLU()) 105 | self.critic2 = nn.Sequential(nn.Linear(400+ action_shape[0], 300), 106 | #nn.LayerNorm(256), 107 | nn.ReLU(), 108 | nn.Linear(300,1)) 109 | def forward(self,state, action): 110 | x = self.critic1(state) 111 | comb = torch.cat([x,action], dim = 1) 112 | return self.critic2(comb) 113 | 114 | def update_and_optimize(batch_size): 115 | state, action, reward, next_state, done = replay_buffer.sample(batch_size) 116 | state_v = torch.FloatTensor(state).to(device) # shape[batch_size,3] 117 | action_v = 
torch.FloatTensor(action).to(device) # shape[batch_size,1] 118 | reward_v = torch.FloatTensor(reward).unsqueeze(1).to(device) # shape [batch_size,1] 119 | next_state_v = torch.FloatTensor(next_state).to(device) # shape [batch_size,3] 120 | done_v = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(device) # shape [batch_size,1] 121 | 122 | # update critic: 123 | critic_optim.zero_grad() 124 | Q_v = critic(state_v, action_v) 125 | next_action = target_actor(next_state).to(device) 126 | target_Q = target_critic(next_state_v, next_action.detach()) 127 | discounted_target_Q = (reward_v + 0.99 * target_Q * (1.0 - done_v)).to(device) 128 | loss = critic_loss(Q_v, discounted_target_Q.detach()) 129 | writer.add_scalar("Critic loss", loss, frame_idx) 130 | writer.add_scalar("Target_Q", target_Q.mean(), frame_idx) 131 | loss.backward() 132 | critic_optim.step() 133 | 134 | # update actor: 135 | actor_optim.zero_grad() 136 | current_action = actor(state_v.cpu()) 137 | actor_loss = -critic(state_v, current_action.to(device)).mean() 138 | writer.add_scalar("Actor loss", actor_loss, frame_idx) 139 | actor_loss.backward() 140 | actor_optim.step() 141 | 142 | # Softupdate 143 | soft_tau = 0.01 144 | for target_param, param in zip(target_critic.parameters(), critic.parameters()): 145 | target_param.data.copy_( 146 | target_param.data * (1.0 - soft_tau) + param.data * soft_tau 147 | ) 148 | 149 | for target_param, param in zip(target_actor.parameters(), actor.parameters()): 150 | target_param.data.copy_( 151 | target_param.data * (1.0 - soft_tau) + param.data * soft_tau 152 | ) 153 | 154 | 155 | if __name__ == "__main__": 156 | start = time.time() 157 | # use cuda 158 | use_cuda = torch.cuda.is_available() 159 | device = torch.device("cuda" if use_cuda else "cpu") 160 | 161 | print("Device: ",device) 162 | ENV_ID = "HalfCheetahBulletEnv-v0" #HalfCheetahBulletEnv-v0 #MinitaurBulletEnv-v0 163 | env = gym.make(ENV_ID) 164 | #env = gym.make("RoboschoolHalfCheetah-v1") #RoboschoolHalfCheetah-v1 165 | env = wrappers.Monitor(env, "Saved_Videos/", resume=True, force = True, video_callable=lambda episode_id: episode_id% 5 ==0) 166 | #, video_callable=lambda x: True, force=True 167 | env = NormalizedActions(env) 168 | 169 | action_space = env.action_space.shape 170 | observation_space = env.observation_space.shape 171 | 172 | critic = Critic(observation_space, action_space).to(device) 173 | actor = Actor(observation_space, action_space).to(device) 174 | target_actor = actor 175 | target_critic = critic 176 | target_actor.load_state_dict(actor.state_dict()) 177 | target_critic.load_state_dict(critic.state_dict()) 178 | critic_optim = optim.Adam(critic.parameters(), lr = 0.001, weight_decay=1e-2) 179 | actor_optim = optim.Adam(actor.parameters(), lr = 0.0001) 180 | 181 | critic_loss = nn.MSELoss() 182 | 183 | replay_buffer_size = 1000000 184 | replay_buffer = ReplayBuffer(replay_buffer_size) 185 | 186 | writer = SummaryWriter() 187 | 188 | noise = OUNoise(env.action_space) 189 | batch_size = 128 190 | max_frames = 80000 #100000~32 min --300000 ~47 min 191 | frame_idx = 0 192 | rewards = [] 193 | 194 | while frame_idx < max_frames: 195 | state = env.reset() 196 | noise.reset() 197 | ou_states = [] 198 | episode_reward = 0 199 | done = False 200 | step = 0 201 | print("Training Progress: {:.2f}".format(frame_idx/max_frames *100)) 202 | while not done: 203 | action = actor(state) 204 | action, ou_state = noise.get_action(action.cpu().detach().numpy(), frame_idx) #step 205 | ou_states.append(ou_state) 206 | 207 | 
next_state, reward, done, _ = env.step(action) 208 | 209 | 210 | 211 | replay_buffer.push(state, action, reward, next_state, done) 212 | if len(replay_buffer) > batch_size:# and frame_idx % 10 == 0: 213 | update_and_optimize(batch_size) 214 | 215 | state = next_state 216 | episode_reward += reward 217 | frame_idx += 1 218 | step += 1 219 | 220 | 221 | if done: 222 | writer.add_scalar("Rewards", episode_reward, frame_idx) 223 | writer.add_scalar("Steps", step, frame_idx) 224 | writer.add_scalar("OU_state", np.array(ou_states).mean(), frame_idx) 225 | 226 | end = time.time() 227 | writer.close() 228 | print("------------------------------\nTraining for {:.2f} minutes".format((end-start)/60)) 229 | -------------------------------------------------------------------------------- /ContinousControl/MultiPro.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Dec 4 10:31:28 2019 4 | 5 | @author: Z0014354 6 | 7 | """ 8 | 9 | from multiprocessing import Process, Pipe 10 | import numpy as np 11 | 12 | def worker(remote, parent_remote, env_fn_wrapper): 13 | parent_remote.close() 14 | env = env_fn_wrapper.x() 15 | while True: 16 | cmd, data = remote.recv() 17 | if cmd == 'step': 18 | ob, reward, done, info = env.step(data) 19 | if done: 20 | ob = env.reset() 21 | remote.send((ob, reward, done, info)) 22 | elif cmd == 'reset': 23 | ob = env.reset() 24 | remote.send(ob) 25 | elif cmd == 'reset_task': 26 | ob = env.reset_task() 27 | remote.send(ob) 28 | elif cmd == 'close': 29 | remote.close() 30 | break 31 | elif cmd == 'get_spaces': 32 | remote.send((env.observation_space, env.action_space)) 33 | else: 34 | raise NotImplementedError 35 | 36 | class CloudpickleWrapper(object): 37 | """ 38 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) 39 | """ 40 | def __init__(self, x): 41 | self.x = x 42 | def __getstate__(self): 43 | import cloudpickle 44 | return cloudpickle.dumps(self.x) 45 | def __setstate__(self, ob): 46 | import pickle 47 | self.x = pickle.loads(ob) 48 | 49 | 50 | class VecEnv(object): 51 | """ 52 | An abstract asynchronous, vectorized environment. 53 | """ 54 | def __init__(self, num_envs, observation_space, action_space): 55 | self.num_envs = num_envs 56 | self.observation_space = observation_space 57 | self.action_space = action_space 58 | 59 | def reset(self): 60 | """ 61 | Reset all the environments and return an array of 62 | observations, or a tuple of observation arrays. 63 | If step_async is still doing work, that work will 64 | be cancelled and step_wait() should not be called 65 | until step_async() is invoked again. 66 | """ 67 | pass 68 | 69 | def step_async(self, actions): 70 | """ 71 | Tell all the environments to start taking a step 72 | with the given actions. 73 | Call step_wait() to get the results of the step. 74 | You should not call this if a step_async run is 75 | already pending. 76 | """ 77 | pass 78 | 79 | def step_wait(self): 80 | """ 81 | Wait for the step taken with step_async(). 82 | Returns (obs, rews, dones, infos): 83 | - obs: an array of observations, or a tuple of 84 | arrays of observations. 85 | - rews: an array of rewards 86 | - dones: an array of "episode done" booleans 87 | - infos: a sequence of info objects 88 | """ 89 | pass 90 | 91 | def close(self): 92 | """ 93 | Clean up the environments' resources. 
94 | """ 95 | pass 96 | 97 | def step(self, actions): 98 | self.step_async(actions) 99 | return self.step_wait() 100 | 101 | class SubprocVecEnv(VecEnv): 102 | def __init__(self, env_fns, spaces=None): 103 | """ 104 | envs: list of gym environments to run in subprocesses 105 | """ 106 | self.waiting = False 107 | self.closed = False 108 | nenvs = len(env_fns) 109 | self.nenvs = nenvs 110 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) 111 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 112 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 113 | for p in self.ps: 114 | p.daemon = True # if the main process crashes, we should not cause things to hang 115 | p.start() 116 | for remote in self.work_remotes: 117 | remote.close() 118 | 119 | self.remotes[0].send(('get_spaces', None)) 120 | observation_space, action_space = self.remotes[0].recv() 121 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 122 | 123 | def step_async(self, actions): 124 | for remote, action in zip(self.remotes, actions): 125 | remote.send(('step', action)) 126 | self.waiting = True 127 | 128 | def step_wait(self): 129 | results = [remote.recv() for remote in self.remotes] 130 | self.waiting = False 131 | obs, rews, dones, infos = zip(*results) 132 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 133 | 134 | def reset(self): 135 | for remote in self.remotes: 136 | remote.send(('reset', None)) 137 | return np.stack([remote.recv() for remote in self.remotes]) 138 | 139 | def reset_task(self): 140 | for remote in self.remotes: 141 | remote.send(('reset_task', None)) 142 | return np.stack([remote.recv() for remote in self.remotes]) 143 | 144 | def close(self): 145 | if self.closed: 146 | return 147 | if self.waiting: 148 | for remote in self.remotes: 149 | remote.recv() 150 | for remote in self.remotes: 151 | remote.send(('close', None)) 152 | for p in self.ps: 153 | p.join() 154 | self.closed = True 155 | 156 | def __len__(self): 157 | return self.nenvs -------------------------------------------------------------------------------- /ContinousControl/PPO_gae_multi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Dec 4 10:33:09 2019 4 | 5 | @author: Z0014354 6 | 7 | PPO with GAE implementation of Sebastian Dittert 8 | """ 9 | 10 | import gym 11 | import math 12 | import torch 13 | import torch.nn as nn 14 | import torch.optim as optim 15 | from torch.distributions import Normal 16 | import torch.nn.functional as F 17 | import numpy as np 18 | import matplotlib.pyplot as plt 19 | from torch.nn.utils import clip_grad_norm_ 20 | from collections import deque 21 | from tensorboardX import SummaryWriter 22 | import MultiPro 23 | import argparse 24 | import time 25 | 26 | 27 | def hidden_init(layer): 28 | fan_in = layer.weight.data.size()[0] 29 | lim = 1. 
/ np.sqrt(fan_in) 30 | return (-lim, lim) 31 | 32 | class Critic(nn.Module): 33 | def __init__(self, input_shape, hidden_size): 34 | super(Critic, self).__init__() 35 | self.layer1 = nn.Linear(input_shape, hidden_size) 36 | self.layer2 = nn.Linear(hidden_size, hidden_size) 37 | self.layer3 = nn.Linear(hidden_size, 1) 38 | self.reset_parameters() 39 | 40 | def forward(self,x): 41 | x = torch.tanh(self.layer1(x)) 42 | x = torch.tanh(self.layer2(x)) 43 | x = self.layer3(x) 44 | 45 | return x 46 | 47 | def reset_parameters(self): 48 | nn.init.xavier_uniform_(self.layer1.weight) 49 | nn.init.xavier_uniform_(self.layer2.weight) 50 | #nn.init.xavier_uniform_(self.layer3.weight) 51 | 52 | class Actor(nn.Module): 53 | def __init__(self, input_shape, output_shape, action_high_low, hidden_size): 54 | super(Actor, self).__init__() 55 | self.layer1 = nn.Linear(input_shape, hidden_size) 56 | self.layer2 = nn.Linear(hidden_size,hidden_size) 57 | 58 | self.mean = nn.Linear(hidden_size, output_shape) 59 | self.variance = nn.Linear(hidden_size, output_shape) 60 | self.action_high_low = action_high_low 61 | #self.reset_parameters() 62 | 63 | def forward(self, x): 64 | 65 | x = torch.tanh(self.layer1(x)) 66 | head = torch.tanh(self.layer2(x)) 67 | 68 | mean = torch.tanh(self.mean(head)) # tanh squashed output to the range of -1..1 69 | variance = F.softplus(self.variance(head)) # log(1 + e^x) has the shape of a smoothed ReLU 70 | sigma = torch.sqrt(variance.cpu()) 71 | m = Normal(mean.cpu(), sigma) 72 | actions = m.sample() 73 | logprobs = m.log_prob(actions) #for the optimization step we create a new distribution based on the new mean and variance - still taking the logprobs based on the old actions! 74 | 75 | return actions, logprobs, m 76 | 77 | 78 | def reset_parameters(self): 79 | nn.init.xavier_uniform_(self.layer1.weight) 80 | nn.init.xavier_uniform_(self.layer2.weight) 81 | nn.init.xavier_uniform_(self.mean.weight) 82 | #nn.init.xavier_uniform_(self.variance.weight) 83 | 84 | 85 | 86 | class Agent(): 87 | def __init__(self, 88 | state_size, 89 | action_size, 90 | action_high_low, 91 | hidden_size, 92 | LR_A=3e-4, 93 | LR_C=3e-4, 94 | gamma=0.99, 95 | lambda_=0.95, 96 | mini_batch_size=512, 97 | ppo_epochs=5): 98 | 99 | self.state_size = state_size 100 | self.actor = Actor(state_size, action_size, action_high_low, hidden_size).to(device) 101 | self.action_high = action_high_low[0] 102 | self.action_low = action_high_low[1] 103 | self.critic = Critic(state_size, hidden_size).to(device) 104 | 105 | self.gamma = gamma 106 | self.lambda_ = lambda_ 107 | self.mini_batch_size = mini_batch_size 108 | self.ppo_epochs = ppo_epochs 109 | 110 | 111 | self.optimizer_a = optim.Adam(params=self.actor.parameters(), lr=LR_A) #RMSprop 112 | self.optimizer_c = optim.Adam(params=self.critic.parameters(), lr=LR_C) 113 | 114 | 115 | def test_net(self, env, count = 10): 116 | """ 117 | Tests the agents performance with current weights. 
118 | """ 119 | rewards = 0.0 120 | steps = 0 121 | entropys = 0.0 122 | 123 | for _ in range(count): 124 | obs = env.reset() 125 | 126 | while True: 127 | obs_v = torch.from_numpy(obs).float() 128 | action, _, dist = self.actor(obs_v.to(device)) 129 | entropy = dist.entropy().detach().cpu().numpy() 130 | action = action.cpu().numpy() 131 | action = np.clip(action*self.action_high, self.action_low, self.action_high) 132 | obs, reward, done, info = env.step(action) 133 | 134 | rewards += reward 135 | entropys += entropy.mean() 136 | steps += 1 137 | if done: 138 | break 139 | 140 | return rewards/count, entropys/count, steps/count 141 | 142 | 143 | 144 | 145 | def compute_gae(self, next_value, rewards, masks, values): 146 | """ 147 | lambda => 1: high variance, low bias 148 | lambda => 0: low variance, high bias 149 | """ 150 | 151 | rewards_batch = list(zip(*rewards)) 152 | masks_batch = list(zip(*masks)) 153 | values_batch = torch.cat((torch.stack(values, dim=1).squeeze(2), next_value.squeeze(0)),dim=1) 154 | 155 | out_discounted_rewards = [] 156 | out_advantage = [] 157 | for rewards, masks, values in zip(rewards_batch, masks_batch, values_batch): 158 | 159 | gae = 0 160 | disc_returns = [] 161 | advantage = [] 162 | for step in reversed(range(len(rewards))): 163 | # d = r_t +gamma*V(s_t+1) - V(s) 164 | delta = rewards[step] + self.gamma * values[step + 1] * masks[step] - values[step] 165 | # sum(lambda*gamma)^t* delta_t+1 166 | gae = delta + self.gamma * self.lambda_ * masks[step] * gae 167 | 168 | disc_returns.insert(0, gae + values[step]) # adding values since we want the returns and not the advantage yet! A(a,s) = Q"returns" - V(s) 169 | advantage.insert(0, gae) 170 | 171 | out_discounted_rewards.append(disc_returns) 172 | out_advantage.append(advantage) 173 | 174 | return torch.FloatTensor(out_discounted_rewards).flatten().unsqueeze(1), torch.FloatTensor(out_advantage).flatten().unsqueeze(1) 175 | 176 | 177 | def ppo_iter(self, states, actions, log_probs, advantage, discounted_rewards): 178 | batch_size = len(states) 179 | 180 | for i in range(batch_size // self.mini_batch_size): 181 | rand_ids = np.random.randint(0, batch_size, self.mini_batch_size) 182 | 183 | yield states[rand_ids], actions[rand_ids], log_probs[rand_ids], advantage[rand_ids], discounted_rewards[rand_ids] 184 | 185 | 186 | 187 | def ppo_update(self, states, actions, log_probs, advantage, discounted_rewards, eps_clip=0.2): 188 | """ 189 | 190 | """ 191 | 192 | a_loss_batch = [] 193 | c_loss_batch = [] 194 | 195 | 196 | for _ in range(self.ppo_epochs): 197 | for states_i, old_actions, old_logprobs, advantage_i, discounted_reward_i in self.ppo_iter(states, actions, log_probs, advantage, discounted_rewards): 198 | 199 | self.optimizer_c.zero_grad() 200 | #train critic 201 | new_value = self.critic(states_i.to(device)) 202 | 203 | c_loss = .5 * (discounted_reward_i - new_value).pow(2).mean() 204 | c_loss.backward() 205 | #print("C: ", c_loss) 206 | clip_grad_norm_(self.critic.parameters(),CLIP_GRAD) 207 | self.optimizer_c.step() 208 | 209 | #train actor 210 | self.optimizer_a.zero_grad() 211 | _, _, dist = self.actor(states_i.to(device)) 212 | new_logprobs = dist.log_prob(old_actions) 213 | entropy = dist.entropy() 214 | 215 | ratio = torch.exp(new_logprobs - old_logprobs.detach()) 216 | surr = ratio * advantage_i 217 | clip = torch.clamp(ratio, 1.0 - eps_clip, 1.0 + eps_clip) 218 | a_loss = torch.min(surr, clip*advantage_i ) 219 | a_loss = (- a_loss - ENTROPY_BONUS * entropy).mean() 220 | 
clip_grad_norm_(self.actor.parameters(),CLIP_GRAD) 221 | a_loss.backward(retain_graph=True) 222 | 223 | self.optimizer_a.step() 224 | 225 | c_loss_batch.append(c_loss.detach().numpy()) 226 | a_loss_batch.append(a_loss.detach().numpy()) 227 | 228 | return np.array(c_loss_batch).mean(), np.array(a_loss_batch).mean() 229 | 230 | 231 | 232 | def main(args): 233 | torch.multiprocessing.freeze_support() 234 | t0 = time.time() 235 | ENV = args.env #"MountainCarContinuous-v0" #Pendulum-v0 LunarLanderContinuous-v0 236 | 237 | env = gym.make(ENV)#Creating the Environment 238 | writer = SummaryWriter("runs/"+args.info) 239 | n_cpu = args.worker 240 | 241 | envs = MultiPro.SubprocVecEnv([lambda: gym.make(ENV) for i in range(n_cpu)]) 242 | seed = args.seed 243 | 244 | torch.manual_seed(seed) 245 | torch.cuda.manual_seed(seed) 246 | np.random.seed(seed) 247 | env.seed(seed) 248 | 249 | 250 | state_size = env.observation_space.shape[0] 251 | action_size = env.action_space.shape[0] 252 | action_high_low = (env.action_space.high[0], env.action_space.low[0]) 253 | 254 | agent = Agent(state_size, action_size, action_high_low= action_high_low, hidden_size=args.layer_size, LR_A=args.lr, LR_C=args.lr, gamma=args.gamma, lambda_=args.lambda_, mini_batch_size=args.mini_batch_size, ppo_epochs=args.ppo_updates) 255 | 256 | max_episodes = args.ep 257 | plot_rewards = [] 258 | max_steps = int(args.max_steps/n_cpu) 259 | 260 | # calc reshape stacking size 261 | shape = (max_steps*n_cpu, state_size) 262 | 263 | for ep in range(max_episodes+1): 264 | states = envs.reset() 265 | 266 | done = False 267 | 268 | state_batch = [] 269 | value_batch = [] 270 | action_batch = [] 271 | logprob_batch = [] 272 | rewards_batch = [] 273 | masks = [] 274 | for step in range(max_steps): 275 | 276 | states = torch.from_numpy(states).float() 277 | 278 | action, logprob, _ = agent.actor(states.to(device)) 279 | value = agent.critic(states.to(device)) 280 | action_v = action.cpu().numpy() 281 | 282 | action_v = np.clip(action_v*env.action_space.high[0], env.action_space.low[0], env.action_space.high[0]) 283 | next_states, reward, done, _ = envs.step(action_v) 284 | 285 | state_batch.append(states) 286 | value_batch.append(value) 287 | logprob_batch.append(logprob) 288 | action_batch.append(action) 289 | rewards_batch.append(torch.from_numpy(reward).float()) 290 | masks.append(torch.from_numpy(1 - done).float()) 291 | 292 | states = next_states 293 | 294 | 295 | if np.any(done): 296 | states = envs.reset() 297 | 298 | # stack all gathered data 299 | 300 | state_batch = torch.stack(state_batch, dim=1).reshape(shape) 301 | actions_batch = torch.stack(action_batch, dim=1).reshape(max_steps*n_cpu,action_size) 302 | logprob_batch = torch.stack(logprob_batch, dim=1).reshape(max_steps*n_cpu,action_size).detach() 303 | 304 | 305 | # calculate advantage: 306 | next_value = agent.critic(torch.from_numpy(next_states).float()) 307 | discounted_rewards, advantage = agent.compute_gae(next_value, rewards_batch, masks, value_batch) 308 | 309 | # normalize advantage: 310 | advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-5) 311 | 312 | c_loss, a_loss = agent.ppo_update(states=state_batch, actions=actions_batch, log_probs=logprob_batch, advantage=advantage.detach() , discounted_rewards=discounted_rewards.detach()) 313 | writer.add_scalar("critic_loss", c_loss, ep) 314 | writer.add_scalar("actor_loss", a_loss, ep) 315 | 316 | 317 | if ep != 0 and ep % 5 == 0: 318 | test_rewards, test_entropy, test_steps = agent.test_net(env) 319 | 
writer.add_scalar("entropy",test_entropy, ep) 320 | writer.add_scalar("max_reward",test_rewards, ep) 321 | plot_rewards.append(test_rewards) 322 | 323 | print("\rEpisode: {} | Ep_Reward: {:.2f} | Average_100: {:.2f}".format(ep, test_rewards, np.mean(plot_rewards[-100:])), end = "", flush = True) 324 | 325 | envs.close() 326 | t1 = time.time() 327 | plt.pause(60) 328 | env.close() 329 | print("training took {} min!".format((t1-t0)/60)) 330 | 331 | if __name__ == "__main__": 332 | parser = argparse.ArgumentParser(description="") 333 | parser.add_argument("-env", type=str,default="Pendulum-v0", help="Environment name") 334 | parser.add_argument("-info", type=str, help="Information or name of the run") 335 | parser.add_argument("-ep", type=int, default=200, help="The amount of training episodes, default is 200") 336 | parser.add_argument("-seed", type=int, default=0, help="Seed for the env and torch network weights, default is 0") 337 | parser.add_argument("-lr", type=float, default=5e-4, help="Learning rate of adapting the network weights, default is 5e-4") 338 | parser.add_argument("-entropy_bonus", type=float, default=1e-3, help="Entropy bonus for exploration - default is 1e-2") 339 | parser.add_argument("-layer_size", type=int, default=64, help="Number of nodes per neural network layer, default is 64") 340 | parser.add_argument("-worker", type=int, default=8, help="Number of parallel worker -default is 8") 341 | parser.add_argument("-lambda_", type=float, default=0.95, help="GAE lambda") 342 | parser.add_argument("-g", "--gamma", type=float, default=0.99, help="discount factor gamma, default is 0.99") 343 | parser.add_argument("-CG", "--clip_grad", type=float, default=0.25, help="Clip the gradients for updating the network parameters, default is 0.25") 344 | parser.add_argument("-ms", "--max_steps", type=int, default=2048, help="Maximum steps that are taken by the agent in the environment before updating") 345 | parser.add_argument("-mbs", "--mini_batch_size", type=int, default=256, help="Mini Batch size for the ppo updates, default is 256") 346 | parser.add_argument("-updates", "--ppo_updates", type=int, default=7, help="Number of PPO updates, default is 7") 347 | args = parser.parse_args() 348 | 349 | device = "cuda" if torch.cuda.is_available() else "cpu" 350 | print("Using: ", device) 351 | 352 | ENTROPY_BONUS = args.entropy_bonus 353 | CLIP_GRAD = args.clip_grad 354 | main(args) 355 | -------------------------------------------------------------------------------- /ContinousControl/PPO_test_crawler.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from unityagents import UnityEnvironment\n", 10 | "import gym\n", 11 | "import math\n", 12 | "import torch \n", 13 | "import torch.nn as nn\n", 14 | "import torch.optim as optim\n", 15 | "from torch.distributions import Normal\n", 16 | "import torch.nn.functional as F\n", 17 | "import numpy as np\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "from torch.nn.utils import clip_grad_norm_\n", 20 | "from collections import deque" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stderr", 30 | "output_type": "stream", 31 | "text": [ 32 | "INFO:unityagents:\n", 33 | "'Academy' started successfully!\n", 34 | "Unity Academy name: Academy\n", 35 | " Number of Brains: 1\n", 36 | " Number of 
External Brains : 1\n", 37 | " Lesson number : 0\n", 38 | " Reset Parameters :\n", 39 | "\t\t\n", 40 | "Unity brain name: CrawlerBrain\n", 41 | " Number of Visual Observations (per agent): 0\n", 42 | " Vector Observation space type: continuous\n", 43 | " Vector Observation space size (per agent): 129\n", 44 | " Number of stacked Vector Observation: 1\n", 45 | " Vector Action space type: continuous\n", 46 | " Vector Action space size (per agent): 20\n", 47 | " Vector Action descriptions: , , , , , , , , , , , , , , , , , , , \n" 48 | ] 49 | }, 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "Number of agents: 12\n", 55 | "Size of each action: 20\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "env = UnityEnvironment(file_name='Crawler_Linux/Crawler.x86_64')\n", 61 | "# get the default brain\n", 62 | "brain_name = env.brain_names[0]\n", 63 | "brain = env.brains[brain_name]\n", 64 | "# reset the environment\n", 65 | "env_info = env.reset(train_mode=False)[brain_name]\n", 66 | "\n", 67 | "# number of agents\n", 68 | "num_agents = len(env_info.agents)\n", 69 | "print('Number of agents:', num_agents)\n", 70 | "\n", 71 | "# size of each action\n", 72 | "action_size = brain.vector_action_space_size\n", 73 | "print('Size of each action:', action_size)\n", 74 | "\n", 75 | "# examine the state space \n", 76 | "states_ = env_info.vector_observations\n", 77 | "state_size = states_.shape[1]\n" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 3, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "class Critic(nn.Module):\n", 87 | " def __init__(self, input_shape, layer_size):\n", 88 | " super(Critic, self).__init__()\n", 89 | " self.net = nn.Sequential(nn.Linear(input_shape, layer_size),\n", 90 | " nn.ReLU(),\n", 91 | " nn.Linear(layer_size,layer_size),\n", 92 | " nn.ReLU(),\n", 93 | " nn.Linear(layer_size, 1))\n", 94 | " \n", 95 | " def forward(self,x):\n", 96 | " x = self.net(x)\n", 97 | " return x\n", 98 | " \n", 99 | "class Actor(nn.Module):\n", 100 | " def __init__(self, input_shape, output_shape,layer_size):\n", 101 | " super(Actor, self).__init__()\n", 102 | " self.net = nn.Sequential(nn.Linear(input_shape, layer_size),\n", 103 | " nn.ReLU(),\n", 104 | " nn.Linear(layer_size,layer_size),\n", 105 | " nn.ReLU(),\n", 106 | " )\n", 107 | " self.mean = nn.Sequential(nn.Linear(layer_size, output_shape),\n", 108 | " nn.Tanh()) # tanh squashed output to the range of -1..1\n", 109 | " self.variance =nn.Sequential(nn.Linear(layer_size, output_shape),\n", 110 | " nn.Softplus()) # log(1 + e^x) has the shape of a smoothed ReLU\n", 111 | " \n", 112 | " def forward(self, x):\n", 113 | " x = self.net(x) \n", 114 | " sigma = torch.sqrt(self.variance(x).cpu())\n", 115 | " m = Normal(self.mean(x).cpu(), sigma)\n", 116 | " actions = m.sample()\n", 117 | " actions = torch.clamp(actions, -1, 1) # usually clipping between -1,1 but pendulum env has action range of -2,2\n", 118 | "\n", 119 | " logprobs = m.log_prob(actions) #for the optimization step we create a new distribution based on the new mean and variance - still taking the logprobs based on the old actions!\n", 120 | "\n", 121 | " \n", 122 | " return actions, logprobs, m\n", 123 | " \n", 124 | "class Agent():\n", 125 | " def __init__(self, state_size, action_size, ppo_epochs, mini_batch_size,\\\n", 126 | " layer_size,lr_a, lr_c, gamma, entropy_beta, clip_grad):\n", 127 | " self.state_size = state_size\n", 128 | " self.action_size = action_size\n", 129 | " \n", 130 | " self.layer_size = 
layer_size\n", 131 | " self.gamma = gamma\n", 132 | " self.entropy_beta = entropy_beta\n", 133 | " self.clip_grad = clip_grad\n", 134 | " \n", 135 | " self.ppo_epochs = ppo_epochs\n", 136 | " self.mini_batch_size = mini_batch_size\n", 137 | " \n", 138 | " self.actor = Actor(state_size, action_size,layer_size).to(device)\n", 139 | " self.critic = Critic(state_size,layer_size).to(device)\n", 140 | " self.a_optimizer = optim.RMSprop(params = self.actor.parameters(),lr = lr_a)\n", 141 | " self.c_optimizer = optim.RMSprop(params = self.critic.parameters(),lr = lr_c)\n", 142 | " \n", 143 | " def act(self, states):\n", 144 | " self.actor.eval()\n", 145 | " with torch.no_grad():\n", 146 | " actions, logprobs ,_ = self.actor(torch.from_numpy(states).float().to(device))\n", 147 | " self.actor.train()\n", 148 | " return actions.cpu().numpy(), logprobs\n", 149 | " \n", 150 | "\n", 151 | " def compute_returns(self,rewards_tensor, masks_tensor):\n", 152 | " output = []\n", 153 | " for rewards, masks in zip(rewards_tensor, masks_tensor):\n", 154 | " R = 0 \n", 155 | " returns = []\n", 156 | " for step in reversed(range(len(rewards))):\n", 157 | " R = rewards[step] + self.gamma * R * masks[step]\n", 158 | " returns.insert(0, R)\n", 159 | " output.append(returns)\n", 160 | " output = list(zip(*output))\n", 161 | " discounted_rewards = [torch.FloatTensor(i).unsqueeze(1) for i in output]\n", 162 | " return torch.cat(discounted_rewards)\n", 163 | "\n", 164 | "\n", 165 | "\n", 166 | " def ppo_iter(self, states, actions, log_probs, advantage, discounted_rewards):\n", 167 | " batch_size = len(states)#.shape[]\n", 168 | "\n", 169 | " for i in range(batch_size // self.mini_batch_size):\n", 170 | " rand_ids = np.random.randint(0, batch_size, self.mini_batch_size)\n", 171 | "\n", 172 | " yield torch.cat(states)[rand_ids], torch.cat(actions)[rand_ids], torch.cat(log_probs)[rand_ids], advantage[rand_ids], discounted_rewards[rand_ids]\n", 173 | "\n", 174 | "\n", 175 | "\n", 176 | " def ppo_update(self, states, actions, log_probs, advantage, discounted_rewards, eps_clip=0.2):\n", 177 | " \"\"\"\n", 178 | "\n", 179 | " \"\"\"\n", 180 | "\n", 181 | " a_loss_batch = []\n", 182 | " c_loss_batch = []\n", 183 | " entropy_batch = []\n", 184 | "\n", 185 | " for _ in range(self.ppo_epochs):\n", 186 | " for states_i, old_actions, old_logprobs, advantage_i, discounted_reward_i in self.ppo_iter(states, actions, log_probs, advantage, discounted_rewards):\n", 187 | "\n", 188 | " self.c_optimizer.zero_grad()\n", 189 | " #tran critic\n", 190 | " new_value = self.critic(states_i.to(device))\n", 191 | " c_loss = F.mse_loss(new_value, discounted_reward_i).cpu()\n", 192 | " c_loss.backward(retain_graph=True)\n", 193 | " clip_grad_norm_(self.critic.parameters(),self.clip_grad)\n", 194 | " self.c_optimizer.step()\n", 195 | "\n", 196 | " c_loss_batch.append(c_loss.detach().cpu().numpy())\n", 197 | "\n", 198 | "\n", 199 | " #train actor\n", 200 | " self.a_optimizer.zero_grad()\n", 201 | " _, _, dist = self.actor(states_i.to(device))\n", 202 | " new_logprobs = dist.log_prob(old_actions)\n", 203 | " entropy = dist.entropy().mean()\n", 204 | " entropy_batch.append(entropy.detach().cpu().numpy())\n", 205 | "\n", 206 | "\n", 207 | " ratio = torch.exp(new_logprobs - old_logprobs.detach()).cpu()\n", 208 | " surr = ratio * advantage_i.cpu()\n", 209 | " clip = torch.clamp(ratio, 1.0 - eps_clip, 1.0 + eps_clip)\n", 210 | "\n", 211 | " \n", 212 | " a_loss = - (torch.min(surr, clip * advantage_i.cpu() ).mean()) + self.entropy_beta * entropy.cpu()\n", 213 
| " a_loss.backward(retain_graph=True)\n", 214 | " clip_grad_norm_(self.actor.parameters(),self.clip_grad)\n", 215 | " self.a_optimizer.step()\n", 216 | "\n", 217 | " a_loss_batch.append(a_loss.detach().cpu().numpy())\n", 218 | "\n", 219 | "\n", 220 | " return np.array(c_loss_batch).mean(), np.array(a_loss_batch).mean(), np.array(entropy_batch).mean()\n", 221 | "\n", 222 | "def list_to_tensor(list_):\n", 223 | " return np.array(list(zip(*list_)))" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "torch.manual_seed(42)\n", 233 | "torch.cuda.manual_seed(42)\n", 234 | "np.random.seed(42)\n", 235 | "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", 236 | "agent = Agent(state_size = state_size, action_size = action_size ,ppo_epochs = 5, mini_batch_size = 512,\\\n", 237 | " layer_size = 512 ,lr_a = 1e-4, lr_c = 1e-4, gamma = 0.99 , entropy_beta = 1e-4, clip_grad = 1)\n", 238 | "\n", 239 | "agent.actor.load_state_dict(torch.load(\"Crawler_weights/actor100.pth\"))\n", 240 | "agent.actor.eval()\n", 241 | "\n", 242 | "max_episodes = 0\n", 243 | "\n", 244 | "c_loss_list = []\n", 245 | "a_loss_list = []\n", 246 | "entropy_list = []\n", 247 | "\n", 248 | "\n", 249 | "average_100 = deque(maxlen = 100)\n", 250 | "\n", 251 | "mean_rewards = []\n", 252 | "max_rewards = []\n", 253 | "min_rewards = []\n", 254 | "average_100_rewards = []\n", 255 | "\n", 256 | "max_steps = 2024\n", 257 | "\n", 258 | "for ep in range(max_episodes+1):\n", 259 | " env_info = env.reset(train_mode=False)[brain_name] # reset the environment \n", 260 | " states = env_info.vector_observations # get the current state (for each agent)\n", 261 | " done = False\n", 262 | " \n", 263 | " states_batch = []\n", 264 | " values_batch = []\n", 265 | " actions_batch = []\n", 266 | " logprobs_batch = []\n", 267 | " rewards_batch = []\n", 268 | " masks = []\n", 269 | " scores = np.zeros(num_agents)\n", 270 | " while True:\n", 271 | "\n", 272 | " actions, logprobs = agent.act(states) \n", 273 | " env_info = env.step(actions)[brain_name] # send all actions to tne environment\n", 274 | " next_states = env_info.vector_observations # get next state (for each agent)\n", 275 | " rewards = env_info.rewards # get reward (for each agent)\n", 276 | " dones = env_info.local_done # see if episode finished\n", 277 | " scores += env_info.rewards\n", 278 | " \n", 279 | " states = next_states\n", 280 | "\n", 281 | " if np.any(dones):\n", 282 | " break\n", 283 | "\n", 284 | " \n", 285 | " mean_rewards.append(np.mean(scores))\n", 286 | " min_rewards.append(np.min(scores))\n", 287 | " max_rewards.append(np.max(scores))\n", 288 | " average_100.append(np.mean(scores))\n", 289 | " average_100_rewards.append(np.array(average_100).mean())\n", 290 | " \n", 291 | " print(\"\\rEpisode: {} | mean_reward: {:.2f} | min_reward: {:.2f} | max_reward: {:.2f} | Average_100: {:.2f}\".format(ep, np.mean(scores), np.min(scores), np.max(scores), np.mean(average_100)), end = \"\", flush = True)\n", 292 | " if ep != 0 and ep % 100 == 0:\n", 293 | " print(\"\\rEpisode: {} | mean_reward: {:.2f} | min_reward: {:.2f} | max_reward: {:.2f} | Average_100: {:.2f}\".format(ep, np.mean(scores), np.min(scores), np.max(scores), np.mean(average_100)))\n", 294 | "\n", 295 | " \n", 296 | "\n", 297 | " \n", 298 | "env.close()\n", 299 | "# PLOTTING RESULTS\n", 300 | "\n", 301 | "plt.figure(figsize = (20,7))\n", 302 | "plt.subplot(1,4,1)\n", 303 | "plt.title(\"actor loss\")\n", 304 | 
"plt.plot(a_loss_list)\n", 305 | "plt.subplot(1,4,2)\n", 306 | "plt.title(\"critic loss\")\n", 307 | "plt.plot(c_loss_list)\n", 308 | "plt.subplot(1,4,3)\n", 309 | "plt.title(\"entropy\")\n", 310 | "plt.plot(entropy_list)\n", 311 | "plt.subplot(1,4,4)\n", 312 | "plt.title(\"rewards\")\n", 313 | "plt.plot(mean_rewards, c = \"b\")\n", 314 | "plt.plot(min_rewards, c = \"y\")\n", 315 | "plt.plot(max_rewards, c = \"r\")\n", 316 | "plt.plot(average_100_rewards, c = \"g\")\n", 317 | "plt.show()" 318 | ] 319 | } 320 | ], 321 | "metadata": { 322 | "kernelspec": { 323 | "display_name": "Python 3", 324 | "language": "python", 325 | "name": "python3" 326 | }, 327 | "language_info": { 328 | "codemirror_mode": { 329 | "name": "ipython", 330 | "version": 3 331 | }, 332 | "file_extension": ".py", 333 | "mimetype": "text/x-python", 334 | "name": "python", 335 | "nbconvert_exporter": "python", 336 | "pygments_lexer": "ipython3", 337 | "version": "3.6.5" 338 | } 339 | }, 340 | "nbformat": 4, 341 | "nbformat_minor": 2 342 | } 343 | -------------------------------------------------------------------------------- /ContinousControl/Parallel_processing.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process, Pipe 2 | import numpy as np 3 | 4 | def worker(remote, parent_remote, env_fn_wrapper): 5 | parent_remote.close() 6 | env = env_fn_wrapper.x() 7 | while True: 8 | cmd, data = remote.recv() 9 | if cmd == 'step': 10 | ob, reward, done, info = env.step(data) 11 | if done: 12 | ob = env.reset() 13 | remote.send((ob, reward, done, info)) 14 | elif cmd == 'reset': 15 | ob = env.reset() 16 | remote.send(ob) 17 | elif cmd == 'reset_task': 18 | ob = env.reset_task() 19 | remote.send(ob) 20 | elif cmd == 'close': 21 | remote.close() 22 | break 23 | elif cmd == 'get_spaces': 24 | remote.send((env.observation_space, env.action_space)) 25 | else: 26 | raise NotImplementedError 27 | 28 | class CloudpickleWrapper(object): 29 | """ 30 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) 31 | """ 32 | def __init__(self, x): 33 | self.x = x 34 | def __getstate__(self): 35 | import cloudpickle 36 | return cloudpickle.dumps(self.x) 37 | def __setstate__(self, ob): 38 | import pickle 39 | self.x = pickle.loads(ob) 40 | 41 | 42 | class VecEnv(object): 43 | """ 44 | An abstract asynchronous, vectorized environment. 45 | """ 46 | def __init__(self, num_envs, observation_space, action_space): 47 | self.num_envs = num_envs 48 | self.observation_space = observation_space 49 | self.action_space = action_space 50 | 51 | def reset(self): 52 | """ 53 | Reset all the environments and return an array of 54 | observations, or a tuple of observation arrays. 55 | If step_async is still doing work, that work will 56 | be cancelled and step_wait() should not be called 57 | until step_async() is invoked again. 58 | """ 59 | pass 60 | 61 | def step_async(self, actions): 62 | """ 63 | Tell all the environments to start taking a step 64 | with the given actions. 65 | Call step_wait() to get the results of the step. 66 | You should not call this if a step_async run is 67 | already pending. 68 | """ 69 | pass 70 | 71 | def step_wait(self): 72 | """ 73 | Wait for the step taken with step_async(). 74 | Returns (obs, rews, dones, infos): 75 | - obs: an array of observations, or a tuple of 76 | arrays of observations. 
77 | - rews: an array of rewards 78 | - dones: an array of "episode done" booleans 79 | - infos: a sequence of info objects 80 | """ 81 | pass 82 | 83 | def close(self): 84 | """ 85 | Clean up the environments' resources. 86 | """ 87 | pass 88 | 89 | def step(self, actions): 90 | self.step_async(actions) 91 | return self.step_wait() 92 | 93 | class SubprocVecEnv(VecEnv): 94 | def __init__(self, env_fns, spaces=None): 95 | """ 96 | envs: list of gym environments to run in subprocesses 97 | """ 98 | self.waiting = False 99 | self.closed = False 100 | nenvs = len(env_fns) 101 | self.nenvs = nenvs 102 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) 103 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 104 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 105 | for p in self.ps: 106 | p.daemon = True # if the main process crashes, we should not cause things to hang 107 | p.start() 108 | for remote in self.work_remotes: 109 | remote.close() 110 | 111 | self.remotes[0].send(('get_spaces', None)) 112 | observation_space, action_space = self.remotes[0].recv() 113 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 114 | 115 | def step_async(self, actions): 116 | for remote, action in zip(self.remotes, actions): 117 | remote.send(('step', action)) 118 | self.waiting = True 119 | 120 | def step_wait(self): 121 | results = [remote.recv() for remote in self.remotes] 122 | self.waiting = False 123 | obs, rews, dones, infos = zip(*results) 124 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 125 | 126 | def reset(self): 127 | for remote in self.remotes: 128 | remote.send(('reset', None)) 129 | return np.stack([remote.recv() for remote in self.remotes]) 130 | 131 | def reset_task(self): 132 | for remote in self.remotes: 133 | remote.send(('reset_task', None)) 134 | return np.stack([remote.recv() for remote in self.remotes]) 135 | 136 | def close(self): 137 | if self.closed: 138 | return 139 | if self.waiting: 140 | for remote in self.remotes: 141 | remote.recv() 142 | for remote in self.remotes: 143 | remote.send(('close', None)) 144 | for p in self.ps: 145 | p.join() 146 | self.closed = True 147 | 148 | def __len__(self): 149 | return self.nenvs -------------------------------------------------------------------------------- /ContinousControl/SAC_script.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Nov 6 12:24:34 2019 4 | 5 | @author: Z0014354 6 | """ 7 | 8 | 9 | import numpy as np 10 | import random 11 | 12 | import gym 13 | import gym_cartpole_swingup 14 | from collections import namedtuple, deque 15 | import torch 16 | import torch.nn as nn 17 | import torch.nn.functional as F 18 | from torch.distributions import Normal, MultivariateNormal 19 | 20 | import torch.optim as optim 21 | import time 22 | from tensorboardX import SummaryWriter 23 | import argparse 24 | 25 | 26 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 27 | 28 | def hidden_init(layer): 29 | fan_in = layer.weight.data.size()[0] 30 | lim = 1. / np.sqrt(fan_in) 31 | return (-lim, lim) 32 | 33 | class Actor(nn.Module): 34 | """Actor (Policy) Model.""" 35 | 36 | def __init__(self, state_size, action_size, seed, hidden_size=32, init_w=3e-3, log_std_min=-20, log_std_max=2): 37 | """Initialize parameters and build model. 
38 | Params 39 | ====== 40 | state_size (int): Dimension of each state 41 | action_size (int): Dimension of each action 42 | seed (int): Random seed 43 | fc1_units (int): Number of nodes in first hidden layer 44 | fc2_units (int): Number of nodes in second hidden layer 45 | """ 46 | super(Actor, self).__init__() 47 | self.seed = torch.manual_seed(seed) 48 | self.log_std_min = log_std_min 49 | self.log_std_max = log_std_max 50 | 51 | self.fc1 = nn.Linear(state_size, hidden_size) 52 | self.fc2 = nn.Linear(hidden_size, hidden_size) 53 | 54 | self.mu = nn.Linear(hidden_size, action_size) 55 | self.log_std_linear = nn.Linear(hidden_size, action_size) 56 | 57 | def reset_parameters(self): 58 | self.fc1.weight.data.uniform_(*hidden_init(self.fc1)) 59 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2)) 60 | self.mu.weight.data.uniform_(-init_w, init_w) 61 | self.log_std_linear.weight.data.uniform_(-init_w, init_w) 62 | 63 | def forward(self, state): 64 | 65 | x = F.relu(self.fc1(state), inplace=True) 66 | x = F.relu(self.fc2(x), inplace=True) 67 | mu = self.mu(x) 68 | 69 | log_std = self.log_std_linear(x) 70 | log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) 71 | return mu, log_std 72 | 73 | def evaluate(self, state, epsilon=1e-6): 74 | mu, log_std = self.forward(state) 75 | std = log_std.exp() 76 | dist = Normal(0, 1) 77 | e = dist.sample().to(device) 78 | action = torch.tanh(mu + e * std) 79 | log_prob = Normal(mu, std).log_prob(mu + e * std) - torch.log(1 - action.pow(2) + epsilon) 80 | #action = torch.clamp(action*action_high, action_low, action_high) 81 | return action, log_prob 82 | 83 | 84 | def get_action(self, state): 85 | """ 86 | returns the action based on a squashed gaussian policy. That means the samples are obtained according to: 87 | a(s,e)= tanh(mu(s)+sigma(s)+e) 88 | """ 89 | state = torch.FloatTensor(state).to(device) 90 | mu, log_std = self.forward(state) 91 | std = log_std.exp() 92 | dist = Normal(0, 1) 93 | e = dist.sample().to(device) 94 | action = torch.tanh(mu + e * std).cpu() 95 | 96 | return action[0] 97 | 98 | 99 | class Critic(nn.Module): 100 | """Critic (Value) Model.""" 101 | 102 | def __init__(self, state_size, action_size, seed, hidden_size=32): 103 | """Initialize parameters and build model. 104 | Params 105 | ====== 106 | state_size (int): Dimension of each state 107 | action_size (int): Dimension of each action 108 | seed (int): Random seed 109 | hidden_size (int): Number of nodes in the network layers 110 | 111 | """ 112 | super(Critic, self).__init__() 113 | self.seed = torch.manual_seed(seed) 114 | self.fc1 = nn.Linear(state_size+action_size, hidden_size) 115 | self.fc2 = nn.Linear(hidden_size, hidden_size) 116 | self.fc3 = nn.Linear(hidden_size, 1) 117 | self.reset_parameters() 118 | 119 | def reset_parameters(self): 120 | self.fc1.weight.data.uniform_(*hidden_init(self.fc1)) 121 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2)) 122 | self.fc3.weight.data.uniform_(-3e-3, 3e-3) 123 | 124 | def forward(self, state, action): 125 | """Build a critic (value) network that maps (state, action) pairs -> Q-values.""" 126 | x = torch.cat((state, action), dim=1) 127 | x = F.relu(self.fc1(x)) 128 | x = F.relu(self.fc2(x)) 129 | return self.fc3(x) 130 | 131 | class Agent(): 132 | """Interacts with and learns from the environment.""" 133 | 134 | def __init__(self, state_size, action_size, random_seed, hidden_size, action_prior="uniform"): 135 | """Initialize an Agent object. 
136 | 137 | Params 138 | ====== 139 | state_size (int): dimension of each state 140 | action_size (int): dimension of each action 141 | random_seed (int): random seed 142 | """ 143 | self.state_size = state_size 144 | self.action_size = action_size 145 | self.seed = random.seed(random_seed) 146 | 147 | self.target_entropy = -action_size # -dim(A) 148 | self.alpha = 1 149 | self.log_alpha = torch.tensor([0.0], requires_grad=True) 150 | self.alpha_optimizer = optim.Adam(params=[self.log_alpha], lr=LR_ACTOR) 151 | self._action_prior = action_prior 152 | 153 | print("Using: ", device) 154 | 155 | # Actor Network 156 | self.actor_local = Actor(state_size, action_size, random_seed, hidden_size).to(device) 157 | self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) 158 | 159 | # Critic Network (w/ Target Network) 160 | self.critic1 = Critic(state_size, action_size, random_seed, hidden_size).to(device) 161 | self.critic2 = Critic(state_size, action_size, random_seed, hidden_size).to(device) 162 | 163 | self.critic1_target = Critic(state_size, action_size, random_seed,hidden_size).to(device) 164 | self.critic1_target.load_state_dict(self.critic1.state_dict()) 165 | 166 | self.critic2_target = Critic(state_size, action_size, random_seed,hidden_size).to(device) 167 | self.critic2_target.load_state_dict(self.critic2.state_dict()) 168 | 169 | self.critic1_optimizer = optim.Adam(self.critic1.parameters(), lr=LR_CRITIC, weight_decay=0) 170 | self.critic2_optimizer = optim.Adam(self.critic2.parameters(), lr=LR_CRITIC, weight_decay=0) 171 | 172 | # Replay memory 173 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) 174 | 175 | 176 | def step(self, state, action, reward, next_state, done, step): 177 | """Save experience in replay memory, and use random sample from buffer to learn.""" 178 | # Save experience / reward 179 | self.memory.add(state, action, reward, next_state, done) 180 | 181 | # Learn, if enough samples are available in memory 182 | if len(self.memory) > BATCH_SIZE: 183 | experiences = self.memory.sample() 184 | self.learn(step, experiences, GAMMA) 185 | 186 | 187 | def act(self, state, add_noise=True): 188 | """Returns actions for given state as per current policy.""" 189 | state = torch.from_numpy(state).float().to(device) 190 | action = self.actor_local.get_action(state).detach() 191 | return action 192 | 193 | def learn(self, step, experiences, gamma, d=1): 194 | """Updates actor, critics and entropy_alpha parameters using given batch of experience tuples. 
195 | Q_targets = r + γ * (min_critic_target(next_state, actor_target(next_state)) - α *log_pi(next_action|next_state)) 196 | Critic_loss = MSE(Q, Q_target) 197 | Actor_loss = α * log_pi(a|s) - Q(s,a) 198 | where: 199 | actor_target(state) -> action 200 | critic_target(state, action) -> Q-value 201 | Params 202 | ====== 203 | experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 204 | gamma (float): discount factor 205 | """ 206 | states, actions, rewards, next_states, dones = experiences 207 | 208 | 209 | # ---------------------------- update critic ---------------------------- # 210 | # Get predicted next-state actions and Q values from target models 211 | next_action, log_pis_next = self.actor_local.evaluate(next_states) 212 | 213 | Q_target1_next = self.critic1_target(next_states.to(device), next_action.squeeze(0).to(device)) 214 | Q_target2_next = self.critic2_target(next_states.to(device), next_action.squeeze(0).to(device)) 215 | 216 | # take the mean of both critics for updating 217 | Q_target_next = torch.min(Q_target1_next, Q_target2_next) 218 | 219 | if FIXED_ALPHA == None: 220 | # Compute Q targets for current states (y_i) 221 | Q_targets = rewards + (gamma * (1 - dones) * (Q_target_next - self.alpha * log_pis_next.squeeze(0))) 222 | else: 223 | Q_targets = rewards + (gamma * (1 - dones) * (Q_target_next - FIXED_ALPHA * log_pis_next.squeeze(0))) 224 | # Compute critic loss 225 | Q_1 = self.critic1(states, actions) 226 | Q_2 = self.critic2(states, actions) 227 | critic1_loss = 0.5*F.mse_loss(Q_1, Q_targets.detach()) 228 | critic2_loss = 0.5*F.mse_loss(Q_2, Q_targets.detach()) 229 | # Update critics 230 | # critic 1 231 | self.critic1_optimizer.zero_grad() 232 | critic1_loss.backward() 233 | self.critic1_optimizer.step() 234 | # critic 2 235 | self.critic2_optimizer.zero_grad() 236 | critic2_loss.backward() 237 | self.critic2_optimizer.step() 238 | if step % d == 0: 239 | # ---------------------------- update actor ---------------------------- # 240 | if FIXED_ALPHA == None: 241 | alpha = torch.exp(self.log_alpha) 242 | # Compute alpha loss 243 | actions_pred, log_pis = self.actor_local.evaluate(states) 244 | alpha_loss = - (self.log_alpha * (log_pis + self.target_entropy).detach()).mean() 245 | self.alpha_optimizer.zero_grad() 246 | alpha_loss.backward() 247 | self.alpha_optimizer.step() 248 | 249 | self.alpha = alpha 250 | # Compute actor loss 251 | if self._action_prior == "normal": 252 | policy_prior = MultivariateNormal(loc=torch.zeros(self.action_size), scale_tril=torch.ones(self.action_size).unsqueeze(0)) 253 | policy_prior_log_probs = policy_prior.log_prob(actions_pred) 254 | elif self._action_prior == "uniform": 255 | policy_prior_log_probs = 0.0 256 | 257 | actor_loss = (alpha * log_pis.squeeze(0) - self.critic1(states, actions_pred.squeeze(0)) - policy_prior_log_probs ).mean() 258 | else: 259 | if self._action_prior == "normal": 260 | policy_prior = MultivariateNormal(loc=torch.zeros(self.action_size), scale_tril=torch.ones(self.action_size).unsqueeze(0)) 261 | policy_prior_log_probs = policy_prior.log_prob(actions_pred) 262 | elif self._action_prior == "uniform": 263 | policy_prior_log_probs = 0.0 264 | 265 | actor_loss = (FIXED_ALPHA * log_pis.squeeze(0) - self.critic1(states, actions_pred.squeeze(0)) - policy_prior_log_probs ).mean() 266 | # Minimize the loss 267 | self.actor_optimizer.zero_grad() 268 | actor_loss.backward() 269 | self.actor_optimizer.step() 270 | 271 | # ----------------------- update target networks ----------------------- # 272 | 
self.soft_update(self.critic1, self.critic1_target, TAU) 273 | self.soft_update(self.critic2, self.critic2_target, TAU) 274 | 275 | 276 | 277 | def soft_update(self, local_model, target_model, tau): 278 | """Soft update model parameters. 279 | θ_target = τ*θ_local + (1 - τ)*θ_target 280 | Params 281 | ====== 282 | local_model: PyTorch model (weights will be copied from) 283 | target_model: PyTorch model (weights will be copied to) 284 | tau (float): interpolation parameter 285 | """ 286 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): 287 | target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) 288 | 289 | class ReplayBuffer: 290 | """Fixed-size buffer to store experience tuples.""" 291 | 292 | def __init__(self, action_size, buffer_size, batch_size, seed): 293 | """Initialize a ReplayBuffer object. 294 | Params 295 | ====== 296 | buffer_size (int): maximum size of buffer 297 | batch_size (int): size of each training batch 298 | """ 299 | self.action_size = action_size 300 | self.memory = deque(maxlen=buffer_size) # internal memory (deque) 301 | self.batch_size = batch_size 302 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 303 | self.seed = random.seed(seed) 304 | 305 | def add(self, state, action, reward, next_state, done): 306 | """Add a new experience to memory.""" 307 | e = self.experience(state, action, reward, next_state, done) 308 | self.memory.append(e) 309 | 310 | def sample(self): 311 | """Randomly sample a batch of experiences from memory.""" 312 | experiences = random.sample(self.memory, k=self.batch_size) 313 | 314 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device) 315 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(device) 316 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device) 317 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device) 318 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device) 319 | 320 | return (states, actions, rewards, next_states, dones) 321 | 322 | def __len__(self): 323 | """Return the current size of internal memory.""" 324 | return len(self.memory) 325 | 326 | 327 | 328 | def SAC(n_episodes=200, max_t=500, print_every=10): 329 | scores_deque = deque(maxlen=100) 330 | scores = [] 331 | average_100_scores = [] 332 | 333 | for i_episode in range(1, n_episodes+1): 334 | 335 | state = env.reset() 336 | state = state.reshape((1,state_size)) 337 | score = 0 338 | for t in range(max_t): 339 | 340 | 341 | action = agent.act(state) 342 | action_v = action[0].numpy() 343 | action_v = np.clip(action_v*action_high, action_low, action_high) 344 | next_state, reward, done, info = env.step(action_v) 345 | next_state = next_state.reshape((1,state_size)) 346 | agent.step(state, action, reward, next_state, done, t) 347 | state = next_state 348 | score += reward 349 | 350 | if done: 351 | break 352 | 353 | scores_deque.append(score) 354 | writer.add_scalar("max_reward", score, i_episode) 355 | average_100_scores.append(np.mean(scores_deque)) 356 | 357 | print('\rEpisode {} Reward: {:.2f} Average100 Score: {:.2f}'.format(i_episode, score, np.mean(scores_deque)), end="") 358 | if i_episode % print_every == 0: 359 | print('\rEpisode {} Reward: {:.2f} 
Average100 Score: {:.2f}'.format(i_episode, score, np.mean(scores_deque))) 360 | 361 | 362 | torch.save(agent.actor_local.state_dict(), args.info + ".pt") 363 | return scores, average_100_scores 364 | 365 | 366 | 367 | def play(): 368 | agent.actor_local.eval() 369 | for i_episode in range(1): 370 | 371 | state = env.reset() 372 | state = state.reshape((1,state_size)) 373 | 374 | while True: 375 | action = agent.act(state) 376 | action_v = action[0].numpy() 377 | action_v = np.clip(action_v*action_high, action_low, action_high) 378 | next_state, reward, done, info = env.step(action_v) 379 | next_state = next_state.reshape((1,state_size)) 380 | state = next_state 381 | 382 | if done: 383 | break 384 | 385 | 386 | 387 | parser = argparse.ArgumentParser(description="") 388 | parser.add_argument("-env", type=str,default="Pendulum-v0", help="Environment name") 389 | parser.add_argument("-info", type=str, help="Information or name of the run") 390 | parser.add_argument("-ep", type=int, default=200, help="The amount of training episodes, default is 200") 391 | parser.add_argument("-seed", type=int, default=0, help="Seed for the env and torch network weights, default is 0") 392 | parser.add_argument("-lr", type=float, default=5e-4, help="Learning rate of adapting the network weights, default is 5e-4") 393 | parser.add_argument("-a", "--alpha", type=float, help="entropy alpha value, if not choosen the value is leaned by the agent") 394 | parser.add_argument("-layer_size", type=int, default=64, help="Number of nodes per neural network layer, default is 64") 395 | parser.add_argument("-repm", "--replay_memory", type=float, default=1e6, help="Size of the Replay memory, default is 1e6") 396 | parser.add_argument("-bs", "--batch_size", type=int, default=256, help="Batch size, default is 256") 397 | parser.add_argument("-t", "--tau", type=float, default=1e-2, help="Softupdate factor tau, default is 1e-2") 398 | parser.add_argument("-g", "--gamma", type=float, default=0.99, help="discount factor gamma, default is 0.99") 399 | parser.add_argument("--saved_model", type=str, default=None, help="Load a saved model to perform a test run!") 400 | args = parser.parse_args() 401 | 402 | 403 | env_name = args.env 404 | seed = args.seed 405 | n_episodes = args.ep 406 | GAMMA = args.gamma 407 | TAU = args.tau 408 | HIDDEN_SIZE = args.layer_size 409 | BUFFER_SIZE = int(args.replay_memory) 410 | BATCH_SIZE = args.batch_size # minibatch size 411 | LR_ACTOR = args.lr # learning rate of the actor 412 | LR_CRITIC = args.lr # learning rate of the critic 413 | FIXED_ALPHA = args.alpha 414 | saved_model = args.saved_model 415 | 416 | t0 = time.time() 417 | writer = SummaryWriter("runs/"+args.info) 418 | env = gym.make(env_name) 419 | action_high = env.action_space.high[0] 420 | action_low = env.action_space.low[0] 421 | torch.manual_seed(seed) 422 | env.seed(seed) 423 | state_size = env.observation_space.shape[0] 424 | action_size = env.action_space.shape[0] 425 | agent = Agent(state_size=state_size, action_size=action_size, random_seed=seed,hidden_size=HIDDEN_SIZE, action_prior="uniform") #"normal" 426 | 427 | if saved_model != None: 428 | agent.actor_local.load_state_dict(torch.load(saved_model)) 429 | play() 430 | else: 431 | scores, average_100 = SAC(n_episodes=n_episodes, max_t=2300, print_every=10) 432 | t1 = time.time() 433 | env.close() 434 | print("training took {} min!".format((t1-t0)/60)) 435 | 436 | -------------------------------------------------------------------------------- /Cross_entropy/Cross_entropy.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import wrappers 4 | import torch 5 | import torch.nn as nn 6 | from torch import optim 7 | from collections import namedtuple 8 | from tensorboardX import SummaryWriter 9 | # Memory 10 | Episode = namedtuple("Episode", field_names = ["reward","steps"]) 11 | EpisodeStep = namedtuple("EpisodeStep", field_names = ["state", "action"]) 12 | 13 | 14 | class Network(nn.Module): 15 | def __init__(self, input_shape, output_shape): 16 | super(Network, self).__init__() 17 | 18 | 19 | self.net = nn.Sequential( 20 | nn.Linear(input_shape, 128), 21 | nn.ReLU(), 22 | nn.Linear(128, output_shape) 23 | ) 24 | 25 | def forward(self,x): 26 | return self.net(x) 27 | 28 | def filter_batch(batch, percentile = 70): 29 | rewards = list(map(lambda s: s.reward, batch)) 30 | reward_bound = np.percentile(rewards, percentile) 31 | reward_mean = float(np.mean(rewards)) 32 | 33 | train_obs = [] 34 | train_act = [] 35 | for example in batch: 36 | if example.reward < reward_bound: 37 | continue 38 | train_obs.extend(map(lambda step: step.state, example.steps)) 39 | train_act.extend(map(lambda step: step.action, example.steps)) 40 | train_obs_vector = torch.FloatTensor(train_obs) 41 | train_act_vector = torch.LongTensor(train_act) 42 | return train_obs_vector, train_act_vector, reward_bound, reward_mean 43 | 44 | def iterative_batches(env, network, batch_size = 16): 45 | batch = [] 46 | episode_reward = 0.0 47 | episode_steps = [] 48 | state = env.reset() 49 | softmax = nn.Softmax(dim =1) 50 | 51 | while True: 52 | state_vector = torch.Tensor([state]) 53 | action_probs_vector = softmax(network(state_vector)) 54 | 55 | action_probs = action_probs_vector.data.numpy()[0] 56 | action = np.random.choice(len(action_probs), p = action_probs) 57 | 58 | next_state, reward, done, _ = env.step(action) 59 | episode_reward += reward 60 | episode_steps.append(EpisodeStep(state = state, action = action)) 61 | 62 | if done: 63 | batch.append(Episode(reward = episode_reward, steps = episode_steps)) 64 | episode_reward = 0.0 65 | episode_steps = [] 66 | next_state = env.reset() 67 | if len(batch) == batch_size: 68 | yield batch 69 | batch = [] 70 | state = next_state 71 | 72 | if __name__ == "__main__": 73 | env = gym.make("CartPole-v0") 74 | env = gym.wrappers.Monitor(env, directory = "mon", force = True) 75 | output_shape = env.action_space.n 76 | input_shape = env.observation_space.shape[0] 77 | 78 | network = Network(input_shape = input_shape, output_shape = output_shape) 79 | objective = nn.CrossEntropyLoss() 80 | optimizer = optim.Adam(params = network.parameters(), lr = 0.01) 81 | writer = SummaryWriter() 82 | 83 | for iter_no, batch in enumerate(iterative_batches(env, network)): 84 | state_vector, action_vector, reward_bound, reward_mean = filter_batch(batch) 85 | optimizer.zero_grad() 86 | action_values_vector = network(state_vector) 87 | loss_vector = objective(action_values_vector, action_vector) 88 | loss_vector.backward() 89 | optimizer.step() 90 | print("{}: loss = {}, reward_mean = {}, reward_boundary = {}".format(iter_no, loss_vector.item(), reward_mean, reward_bound)) 91 | writer.add_scalar("loss", loss_vector.item(), iter_no) 92 | writer.add_scalar("reward mean", reward_mean, iter_no) 93 | writer.add_scalar("reward boundary", reward_bound, iter_no) 94 | if reward_mean > 199: 95 | print("Solved CartPole Problem!") 96 | break 97 | writer.close() 
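
The core of the cross-entropy method implemented above is the elite-selection step in filter_batch: keep only the episodes whose total reward clears a chosen percentile, then train the policy (via the cross-entropy loss) to imitate the actions taken in those episodes. Below is a minimal, self-contained sketch of just that filtering step on a hand-made toy batch; the Episode/EpisodeStep tuples and the 70th-percentile cutoff mirror the script above, while the reward values and states are invented purely for illustration.

import numpy as np
from collections import namedtuple

Episode = namedtuple("Episode", field_names=["reward", "steps"])
EpisodeStep = namedtuple("EpisodeStep", field_names=["state", "action"])

# Toy batch of four episodes with invented total rewards and states.
batch = [
    Episode(reward=10.0,  steps=[EpisodeStep(state=[0.1, 0.0], action=0)]),
    Episode(reward=50.0,  steps=[EpisodeStep(state=[0.2, 0.1], action=1)]),
    Episode(reward=80.0,  steps=[EpisodeStep(state=[0.3, 0.2], action=1)]),
    Episode(reward=200.0, steps=[EpisodeStep(state=[0.4, 0.3], action=0)]),
]

rewards = [ep.reward for ep in batch]
reward_bound = np.percentile(rewards, 70)   # episodes below this bound are dropped

elite_states, elite_actions = [], []
for ep in batch:
    if ep.reward < reward_bound:
        continue                            # discard non-elite episodes
    elite_states.extend(step.state for step in ep.steps)
    elite_actions.extend(step.action for step in ep.steps)

# The surviving (state, action) pairs form the supervised training set that
# the policy network is then fit to with a cross-entropy loss, as in the
# training loop of the script above.
print(reward_bound, elite_states, elite_actions)

Repeating this filter-and-imitate cycle on fresh batches is what gradually shifts the policy toward higher-reward behaviour; the percentile is the method's main hyperparameter besides the batch size.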
-------------------------------------------------------------------------------- /Cross_entropy/README.md: -------------------------------------------------------------------------------- 1 | # Deep Reinforcement Leanring with Cross entropy 2 | Cross entropy method implemented on the cart pole problem. 3 | based on the example in the book [Deep Reinforcement Learning Hands-on](https://www.amazon.de/Deep-Reinforcement-Learning-Hands-Q-networks-ebook/dp/B076H9VQH6) by Maxim Lapan 4 | 5 | 6 | ![alt text](img/Cross_entropy.png) 7 | -------------------------------------------------------------------------------- /Cross_entropy/img/Cross_entropy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Cross_entropy/img/Cross_entropy.png -------------------------------------------------------------------------------- /Deep Q_Learning/DQN_Experience_Replay.py: -------------------------------------------------------------------------------- 1 | import keras 2 | from keras.models import Sequential 3 | from keras.layers import Dense, Dropout 4 | from keras.optimizers import Adam 5 | import numpy as np 6 | from collections import deque 7 | from keras.models import load_model 8 | import random 9 | import matplotlib.pyplot as plt 10 | import gym 11 | from gym import wrappers 12 | 13 | 14 | 15 | class AI(): 16 | def __init__(self, state_size, action_size, memory_size, learning_rate, gamma): 17 | self.state_size = state_size 18 | self.action_size = action_size 19 | self.memory = deque(maxlen = memory_size) 20 | 21 | 22 | # HYPERPARAMETER 23 | self.learning_rate = learning_rate 24 | self.gamma = gamma 25 | self.epsilon = 0.5 26 | self.epsilon_start = self.epsilon 27 | 28 | self.brain = self.build_brain() 29 | 30 | 31 | def build_brain(self): 32 | model = Sequential() 33 | model.add(Dense(self.state_size, activation='relu')) 34 | model.add(Dense(25, activation='relu')) 35 | #model.add(Dropout(0.3)) 36 | model.add(Dense(25, activation='relu')) 37 | #model.add(Dropout(0.3)) 38 | # model.add(Dense(12, activation='relu')) 39 | model.add(Dense(self.action_size, activation='linear')) 40 | model.compile(loss = "mse", optimizer = Adam(lr=self.learning_rate)) 41 | return model 42 | 43 | 44 | def load_model(self, name): 45 | """ 46 | Loads an existing Model 47 | Input: string of the model name - h5 data 48 | """ 49 | brain = load_model(name) 50 | return None 51 | 52 | def save_learnings(self, model_name):# 53 | """ 54 | Input string of Modelname 55 | """ 56 | self.brain.save(model_name+".h5") 57 | 58 | def adapt_epsilon(self,ep): 59 | # Epsilon starts at 0.5 linear increasing to 0.99 by ep 4000: 60 | # linear: epsilon = 0.0001225*ep+self.epsilon_start 61 | # exponent (4000 eps): epsilon = self.epsilon_start + (ep/5714)**2 62 | if ep == 0: 63 | pass 64 | self.epsilon = self.epsilon_start + (ep/5714)**2 65 | 66 | def act(self, state, status = "train"): 67 | if status == "train": 68 | if np.random.rand() > self.epsilon: 69 | return random.randrange(self.action_size) 70 | return np.argmax(self.brain.predict(state)[0]) 71 | 72 | def remember(self, state, action, reward, next_state, done): 73 | self.memory.append((state, action, reward, next_state, done)) 74 | 75 | def replay(self): 76 | batch_size = 32 77 | if len(self.memory) < batch_size: 78 | return 79 | 80 | samples = random.sample(self.memory, batch_size) 81 | for state, action, reward, next_state, done in 
samples: 82 | target = reward 83 | 84 | if not done: 85 | target = reward + self.gamma * np.amax(self.brain.predict(next_state)[0]) # Predict the future/target value 86 | #print(target) 87 | Q_target_shape = self.brain.predict(state) # normal Q- Value prediction for the training-shape 88 | Q_target_shape[0][action] = target # replacing the best Q-Value with the target 89 | self.brain.fit(state, Q_target_shape, epochs=1, verbose=0) # training with the new Target value (loss = sum(Q_target-Q)exp2) 90 | 91 | 92 | 93 | 94 | def play(Ep, agent, status = "train"): 95 | 96 | learning_graph = [] 97 | env = gym.make("CartPole-v1") 98 | env = wrappers.Monitor(env, "Saved_DQN_ER_Models/", resume=True, video_callable=lambda episode_id: episode_id%250==0) 99 | action_space = env.action_space.n 100 | state_space = env.observation_space.shape[0] 101 | if agent == None: 102 | agent = AI(state_space,action_space,memory_size = 5000,learning_rate = 0.001,gamma = 0.95) #2500 mem 103 | for ep in range(Ep): 104 | state = env.reset() 105 | state = np.reshape(state,[1,state_space]) 106 | done = False 107 | score = 0 108 | agent.adapt_epsilon(ep) # Increasing the epsilon linear - adjustable to non linear, log,... 109 | while not done: 110 | 111 | if status == "play": 112 | env.render() 113 | action = agent.act(state, status) 114 | new_state, reward, done, _ = env.step(action) 115 | new_state = np.reshape(new_state,[1,state_space]) 116 | agent.remember(state, action, reward, new_state, done) 117 | state = new_state 118 | score +=1 119 | if done: 120 | break 121 | print("Episode {}# Score: {}".format(ep, score + 1)) 122 | if ep == 250 or ep % 500 == 0: 123 | # save model eacht 500 ep for videos 124 | agent.save_learnings(str(ep)+","+str(score)) 125 | agent.replay() 126 | learning_graph.append(score) 127 | return learning_graph, agent 128 | 129 | def main(): 130 | Episodes = 4001 #4001 131 | graph,agent = play(Episodes,None) 132 | plt.plot(graph) 133 | plt.xlabel("Episoden") 134 | plt.ylabel("Score") 135 | plt.show() 136 | 137 | print("Do you want to save the model?") 138 | answer = input("Y/N\n") 139 | if answer == "Y": 140 | name = input("give a name for the model: \n") 141 | agent.save_learnings(name) 142 | else: 143 | pass 144 | 145 | 146 | print("Soll der Agent getestet werden?\n") 147 | n = input("Wie viele Episoden sollen gespielt werden?") 148 | x,y = play(int(n),agent,status = "play") 149 | 150 | if __name__ == "__main__": 151 | main() 152 | -------------------------------------------------------------------------------- /Deep Q_Learning/Img/4k Learning_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Deep Q_Learning/Img/4k Learning_curve.png -------------------------------------------------------------------------------- /Deep Q_Learning/Img/Converging.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Deep Q_Learning/Img/Converging.png -------------------------------------------------------------------------------- /Deep Q_Learning/README.md: -------------------------------------------------------------------------------- 1 | # Deep Q_Learning with Experience Replay playing Cart Pole 2 | 3 | [image1]: ./Img/Converging.png "Calculation Equation" 4 | [image2]: 
./Img/Q_table10000.png "Calculation Equation" 5 | 6 | 7 | ### Exponential Epsilon: 8 | 9 | 10 | 11 | Learning Curve after 4000 Epochs and and exponentially epsilon Greedy 12 | 13 | ![alt text][image1] 14 | 15 | 16 | 17 | 18 | 19 | ### Youtube Video: 20 | [Deep Q-Network plays Cart Pole](https://www.youtube.com/watch?v=9g2ZLPs5Rs0) 21 | 22 | 23 | -------------------------------------------------------------------------------- /Double DQN/CNN_Double_DQN.py: -------------------------------------------------------------------------------- 1 | import math, random 2 | from collections import deque 3 | import cv2 4 | 5 | import gym 6 | from gym import wrappers 7 | import wrapper 8 | import numpy as np 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.optim as optim 13 | import torch.autograd as autograd 14 | import torch.nn.functional as F 15 | from IPython.display import clear_output 16 | 17 | import matplotlib.pyplot as plt 18 | 19 | USE_CUDA = torch.cuda.is_available() 20 | Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs) 21 | 22 | class ReplayBuffer(object): 23 | def __init__(self, capacity): 24 | self.buffer = deque(maxlen=capacity) 25 | 26 | def push(self, state, action, reward, next_state, done): 27 | state = np.expand_dims(state, 0) 28 | next_state = np.expand_dims(next_state, 0) 29 | 30 | self.buffer.append((state, action, reward, next_state, done)) 31 | 32 | def sample(self, batch_size): 33 | state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size)) 34 | return np.concatenate(state), action, reward, np.concatenate(next_state), done 35 | 36 | def __len__(self): 37 | return len(self.buffer) 38 | 39 | class CnnDQN(nn.Module): 40 | def __init__(self, input_shape, num_actions): 41 | super(CnnDQN, self).__init__() 42 | 43 | self.input_shape = input_shape 44 | self.num_actions = num_actions 45 | 46 | self.features = nn.Sequential( 47 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 48 | nn.ReLU(), 49 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 50 | nn.ReLU(), 51 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 52 | nn.ReLU() 53 | ) 54 | 55 | self.fc = nn.Sequential( 56 | nn.Linear(self.feature_size(), 512), 57 | nn.ReLU(), 58 | nn.Linear(512, self.num_actions) 59 | ) 60 | 61 | def forward(self, x): 62 | x = self.features(x) 63 | x = x.view(x.size(0), -1) 64 | x = self.fc(x) 65 | return x 66 | 67 | def feature_size(self): 68 | return self.features(autograd.Variable(torch.zeros(1, *self.input_shape))).view(1, -1).size(1) 69 | 70 | def act(self, state, epsilon,action_space): 71 | if random.random() > epsilon: 72 | with torch.no_grad(): 73 | state = Variable(torch.FloatTensor(np.float32(state)).unsqueeze(0)) 74 | q_value = self.forward(state) 75 | action = q_value.max(1)[1].data[0] #.max(1) gives the maxvalues--[0] and idx--[1] 76 | else: 77 | action = random.randrange(action_space) 78 | return action 79 | 80 | def update_target(current_model, target_model): 81 | target_model.load_state_dict(current_model.state_dict()) 82 | 83 | def save_model(model, idx): 84 | torch.save(model, "Saved_models/") 85 | 86 | def epsilon_by_frame(frame_idx): 87 | epsilon_start = 1.0 88 | epsilon_final = 0.01 #0.01 89 | epsilon_decay = 30000 #30000 90 | eps = epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. 
* frame_idx / epsilon_decay) 91 | return eps 92 | 93 | def compute_td_loss(batch_size,current_model,target_model,opti,loss_func,gamma,replay_buffer): 94 | state, action, reward, next_state, done = replay_buffer.sample(batch_size) 95 | # shapes for normal image-- stacked (4,84,84) ... 96 | state = Variable(torch.FloatTensor(np.float32(state))) #shape (1,84,84) 97 | next_state = Variable(torch.FloatTensor(np.float32(next_state))) #shape (1,84,84) 98 | action = Variable(torch.LongTensor(action)) #shape [32] -- has to be long for gather function 99 | reward = Variable(torch.FloatTensor(reward)) #shape [32] 100 | done = Variable(torch.FloatTensor(done)) #shape [32] 101 | 102 | q_values = current_model(state) #shape [32,6] 103 | next_q_values = current_model(next_state) #shape [32,6] 104 | next_q_state_values = target_model(next_state) #shape [32,6] 105 | 106 | q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1) #shape [32] gathers q_values by the index of action 107 | next_q_value = next_q_state_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1) #shape [32] torch.max(nqv,1) gives the maxvalues--[0] and idx--[1] 108 | expected_q_value = reward + gamma * next_q_value * (1 - done) # shape [32] 109 | 110 | 111 | # DeepMind took nn.SmoothL1Loss() 112 | #loss = (q_value - Variable(expected_q_value.data)).pow(2).mean() #standard loss -- .data to get rid of grad_fn= 113 | loss = loss_func(q_value,Variable(expected_q_value.data)) 114 | 115 | opti.zero_grad() 116 | loss.backward() 117 | opti.step() 118 | return loss 119 | 120 | def plot(frame_idx, rewards, losses): 121 | plt.close() 122 | plt.figure(figsize=(20,5)) 123 | plt.subplot(121) 124 | plt.title("frames {}. reward: {}" .format(frame_idx, np.round(np.mean(rewards[-10:]),2))) 125 | plt.plot(rewards) 126 | plt.subplot(122) 127 | plt.title("loss") 128 | plt.plot(losses) 129 | plt.ylim(0,1) 130 | plt.draw() 131 | plt.pause(0.0001) 132 | 133 | def processing(img): 134 | img = np.expand_dims(cv2.resize(cv2.cvtColor(img, cv2.COLOR_RGB2GRAY), (84,84)),axis= 0) 135 | img = img.astype(np.uint8) 136 | #print(img.dtype) 137 | return img 138 | 139 | def main(): 140 | plt.ion() 141 | env = wrapper.make_atari("PongNoFrameskip-v4", monitor=True,epidsode_capture=75) 142 | env = wrapper.wrap_deepmind(env,frame_stack=True, pytorch_img = True) 143 | action_space = env.action_space.n 144 | current_model = CnnDQN(env.observation_space.shape, action_space)#env.observation_space.shape 145 | target_model = CnnDQN(env.observation_space.shape, action_space) 146 | 147 | if USE_CUDA: 148 | current_model = current_model.cuda() 149 | target_model = target_model.cuda() 150 | 151 | #DeepMind took optim.RMSprop(current_model.parameters(), lr=0.000) 152 | #opti = optim.Adam(current_model.parameters(), lr=0.0001) 153 | opti = optim.RMSprop(current_model.parameters(), lr=0.0001) 154 | loss_func = nn.SmoothL1Loss() 155 | 156 | replay_initial = 10000 157 | replay_buffer = ReplayBuffer(100000) 158 | 159 | num_frames = 1000000 160 | batch_size = 32 161 | gamma = 0.99 162 | 163 | losses = [] 164 | all_rewards = [] 165 | episode_reward = 0 166 | 167 | state = env.reset() # shape normal:(1,84,84) -stacked (4,84,84) 168 | # Manuel Stacking 169 | #state = processing(state) 170 | #state = np.stack((state,state,state,state),axis = 1).squeeze(0) 171 | #assert state.shape == (4,84,84) 172 | for frame_idx in range(1, num_frames + 1): 173 | 174 | epsilon = epsilon_by_frame(frame_idx) 175 | print("Training :: Frame {} :: Epsilon {} ".format(frame_idx, 
round(epsilon,2))) 176 | action = current_model.act(state, epsilon,action_space) 177 | next_state, reward, done, _ = env.step(action) 178 | # Manuel Stacking 179 | #next_state = processing(next_state) 180 | #next_state = np.append(next_state, state[1:, :, :],axis= 0) 181 | #assert next_state.shape == (4,84,84) 182 | replay_buffer.push(state, action, reward, next_state, done) 183 | 184 | state = next_state 185 | episode_reward += reward 186 | 187 | if done: 188 | state = env.reset() 189 | # Manuel Stacking 190 | #state = processing(state) 191 | #state = np.stack((state,state,state,state),axis = 1).squeeze(0) 192 | all_rewards.append(episode_reward) 193 | episode_reward = 0 194 | 195 | if len(replay_buffer) > replay_initial: 196 | loss = compute_td_loss(batch_size,current_model, target_model,opti,loss_func,gamma,replay_buffer) 197 | losses.append(loss.item()) 198 | 199 | if frame_idx % 10000 == 0: 200 | plot(frame_idx, all_rewards, losses) 201 | 202 | if frame_idx % 1000 == 0: 203 | update_target(current_model, target_model) 204 | 205 | #if frame_idx % 100000 ==0: 206 | # save_model(current_model, frame_idx) 207 | 208 | if __name__ == "__main__": 209 | main() -------------------------------------------------------------------------------- /Double DQN/Double_DQN.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import wrappers 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | from torch.autograd import Variable 8 | 9 | import numpy as np 10 | from collections import deque 11 | import random 12 | import matplotlib.pyplot as plt 13 | import matplotlib.animation as animation 14 | from matplotlib import style 15 | import time 16 | 17 | class Network(nn.Module): 18 | def __init__(self, input_dim, output_dim): 19 | super(Network,self).__init__() 20 | self.linear1 = nn.Linear(input_dim, 40) 21 | self.linear2 = nn.Linear(40, 40) 22 | self.linear3 = nn.Linear(40, output_dim) 23 | 24 | def forward(self,x): 25 | x = self.linear1(x) 26 | x = F.relu(x) 27 | x = self.linear2(x) 28 | x = F.relu(x) 29 | out = self.linear3(x) 30 | return out 31 | 32 | class Agent: 33 | def __init__(self, state_size, action_size): 34 | 35 | self.state_size = state_size 36 | self.action_size = action_size 37 | self.memory = deque(maxlen=5000) 38 | self.gamma = 0.95 # discount rate 39 | self.epsilon = 0.4 # exploration rate 40 | self.epsilon_start = self.epsilon 41 | self.learning_rate = 0.001 42 | self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") #activate device 43 | 44 | # Our DQN and the Target Network 45 | self.model = Network(state_size, action_size).to(self.device) 46 | self.target_model = Network(state_size, action_size).to(self.device) 47 | 48 | self.criteria = nn.MSELoss() 49 | self.opt = optim.Adam(self.model.parameters(), lr=self.learning_rate) 50 | 51 | def remember(self, state, action, reward, next_state, done): 52 | self.memory.append((state, action, reward, next_state, done)) 53 | 54 | def update_target(self): 55 | self.target_model.load_state_dict(self.model.state_dict()) 56 | 57 | def adapt_epsilon(self,ep): 58 | # Epsilon starts at 0.5 linear increasing to 0.99 by ep 4000: 59 | # linear: epsilon = 0.0001225*ep+self.epsilon_start 60 | # exponent (4000 eps): epsilon = self.epsilon_start + (ep/5714)**2 61 | if ep == 0: 62 | pass 63 | if self.epsilon < 0.98: 64 | self.epsilon = self.epsilon_start + (ep/3800)**2 #4500 65 | 66 | def act(self, state, status = 
"Train"): 67 | if status == "Play": 68 | self.epsilon = 0.95 69 | if np.random.rand() > self.epsilon: 70 | return random.randrange(self.action_size) 71 | 72 | act_values = self.model(Variable(torch.Tensor(state)).to(self.device)).cpu().data.numpy() 73 | return np.argmax(act_values[0]) 74 | 75 | def give_epsilon(self): 76 | return self.epsilon 77 | 78 | def replay(self, batch_size): 79 | if len(self.memory) < batch_size: 80 | return 81 | minibatch = random.sample(self.memory, batch_size) 82 | 83 | for state, action, reward, next_state, done in minibatch: 84 | target = reward 85 | self.model.train() 86 | if not done: 87 | next_state_v = Variable(torch.Tensor(next_state)) 88 | target = self.target_model(next_state_v.to(self.device)).cpu() # target has to be on cpu for numpy 89 | target = target.data.numpy()[0] 90 | target_actual = self.target_model(Variable(torch.Tensor(state)).to(self.device)).cpu().data.numpy() 91 | target_actual[0][action] = reward + self.gamma *np.amax(target) 92 | 93 | self.opt.zero_grad() 94 | out = self.model(Variable(torch.Tensor(state)).to(self.device)) 95 | loss = self.criteria(out, Variable(torch.Tensor(target_actual)).to(self.device)) 96 | loss.backward() 97 | self.opt.step() 98 | 99 | 100 | 101 | 102 | def play(Ep,agent, status = "train"): 103 | # for active plotting: 104 | learning_graph = [] 105 | epsilons = [] 106 | learning_graph_live = deque(maxlen = 180) 107 | epochs_live = deque(maxlen = 180) 108 | epsilons_live = deque(maxlen = 180) 109 | 110 | batch_size = 64 111 | env = gym.make("CartPole-v1") 112 | env = wrappers.Monitor(env, "Saved_Videos/", resume=True, video_callable=lambda episode_id: episode_id%250==0) 113 | action_space = env.action_space.n 114 | state_space = env.observation_space.shape[0] 115 | if agent == None: 116 | agent = Agent(state_space,action_space) 117 | for ep in range(Ep): 118 | state = env.reset() 119 | state = np.reshape(state,[1,state_space]) 120 | done = False 121 | score = 0 122 | agent.adapt_epsilon(ep) # Increasing the epsilon linear - adjustable to non linear, log,... 
123 | while not done: 124 | 125 | if status == "play": 126 | env.render() 127 | action = agent.act(state, status) 128 | new_state, reward, done, _ = env.step(action) 129 | new_state = np.reshape(new_state,[1,state_space]) 130 | agent.remember(state, action, reward, new_state, done) 131 | state = new_state 132 | score +=1 133 | 134 | if done: 135 | break 136 | 137 | 138 | 139 | print("Episode {}# Score: {}# Epsilon {}".format(ep, score + 1,agent.give_epsilon())) 140 | # Update Target Network 141 | if ep % 200 == 0: 142 | agent.update_target() 143 | print("Updated Target Network!") 144 | agent.replay(batch_size) 145 | # Live plot 146 | learning_graph.append(score) 147 | epsilons.append(agent.give_epsilon()*100) 148 | learning_graph_live.append(score) 149 | epochs_live.append(ep) 150 | epsilons_live.append(agent.give_epsilon()*100) 151 | 152 | plt.plot(epochs_live, learning_graph_live,"b") 153 | plt.plot(epochs_live, epsilons_live,"r") 154 | plt.xlabel("Epoch") 155 | plt.ylabel("Score / Epsilon") 156 | plt.title("Score Live Plot") 157 | plt.show() 158 | plt.pause(0.00000001) 159 | plt.clf() 160 | 161 | return learning_graph, epsilons, agent 162 | 163 | def main(): 164 | Episodes = 4000 #4001 165 | graph,epsilons,agent = play(Episodes,None, "train") 166 | plt.plot(graph, "b") 167 | plt.plot(epsilons, "r") 168 | plt.xlabel("Episoden") 169 | plt.ylabel("Score / Epsilon") 170 | plt.show() 171 | 172 | print("Do you want to save the model?") 173 | answer = input("Y/N\n") 174 | if answer == "Y": 175 | name = input("give a name for the model: \n") 176 | agent.save_learnings(name) 177 | else: 178 | pass 179 | 180 | 181 | print("Soll der Agent getestet werden?\n") 182 | n = input("Wie viele Episoden sollen gespielt werden?") 183 | x,y, ag = play(int(n),agent,status = "play") 184 | 185 | if __name__ == "__main__": 186 | fig = plt.figure() 187 | plt.ion() 188 | main() 189 | -------------------------------------------------------------------------------- /Double DQN/Imgs/4000_40-40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Double DQN/Imgs/4000_40-40.png -------------------------------------------------------------------------------- /Double DQN/Imgs/CNN_pong_converge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Double DQN/Imgs/CNN_pong_converge.png -------------------------------------------------------------------------------- /Double DQN/Imgs/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Double DQN/Imgs/test.png -------------------------------------------------------------------------------- /Double DQN/README.md: -------------------------------------------------------------------------------- 1 | # Double Deep Q_Learning with Experience Replay playing Cart Pole 2 | 3 | [image1]: ./Imgs/test.png "Calculation Equation" 4 | [image2]: ./Imgs/Q_table10000.png "Calculation Equation" 5 | [image3]: ./Imgs/CNN_pong_converge.png 6 | 7 | The difference between DQN and Double DQN is, that in Double DQN the target values get generated by a seperate neural network and not by the same that predicts the the 
Q_value as in DQN. 8 | [Paper](https://arxiv.org/abs/1509.06461) 9 | 10 | ### Learning Curve: 11 | 12 | Learning curve after 4000 episodes with an exponential epsilon-greedy schedule 13 | 14 | ![alt text][image1] 15 | 16 | 17 | 18 | 19 | 20 | ### Youtube Video: 21 | [Deep Q-Network plays Cart Pole](https://www.youtube.com/watch?v=9g2ZLPs5Rs0) 22 | 23 | ## Training to play Pong with a Double Deep Q CNN 24 | I trained a Double Deep Q-Network to play the Atari game Pong. After around 150000 frames it converged and consistently beat its opponent. The convolutional neural network learned purely from visual input: the frames were converted to grayscale and 4 consecutive frames were stacked together so the network can infer the velocity of the ball, which would be much harder to estimate from a single frame. The network was trained off-policy from a replay buffer (experience replay), and every 1000 frames the target network was updated with the weights of the optimized model. 25 | 26 | ![alt text][image3] 27 | 28 | ### Youtube Video: 29 | [Double Deep Q Network learns to play Pong](https://www.youtube.com/watch?v=I3dTyg_5rFc) 30 | -------------------------------------------------------------------------------- /Double DQN/wrapper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import deque 3 | import gym 4 | 5 | from gym import spaces,wrappers 6 | import cv2 7 | cv2.ocl.setUseOpenCL(False) 8 | 9 | class NoopResetEnv(gym.Wrapper): 10 | def __init__(self, env, noop_max=30): 11 | """Sample initial states by taking random number of no-ops on reset. 12 | No-op is assumed to be action 0. 13 | """ 14 | gym.Wrapper.__init__(self, env) 15 | self.noop_max = noop_max 16 | self.override_num_noops = None 17 | self.noop_action = 0 18 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 19 | 20 | def reset(self, **kwargs): 21 | """ Do no-op action for a number of steps in [1, noop_max].""" 22 | self.env.reset(**kwargs) 23 | if self.override_num_noops is not None: 24 | noops = self.override_num_noops 25 | else: 26 | noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 27 | assert noops > 0 28 | obs = None 29 | for _ in range(noops): 30 | obs, _, done, _ = self.env.step(self.noop_action) 31 | if done: 32 | obs = self.env.reset(**kwargs) 33 | return obs 34 | 35 | def step(self, ac): 36 | return self.env.step(ac) 37 | 38 | class FireResetEnv(gym.Wrapper): 39 | def __init__(self, env): 40 | """Take action on reset for environments that are fixed until firing.""" 41 | gym.Wrapper.__init__(self, env) 42 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 43 | assert len(env.unwrapped.get_action_meanings()) >= 3 44 | 45 | def reset(self, **kwargs): 46 | self.env.reset(**kwargs) 47 | obs, _, done, _ = self.env.step(1) 48 | if done: 49 | self.env.reset(**kwargs) 50 | obs, _, done, _ = self.env.step(2) 51 | if done: 52 | self.env.reset(**kwargs) 53 | return obs 54 | 55 | def step(self, ac): 56 | return self.env.step(ac) 57 | 58 | class EpisodicLifeEnv(gym.Wrapper): 59 | def __init__(self, env): 60 | """Make end-of-life == end-of-episode, but only reset on true game over. 61 | Done by DeepMind for the DQN and co. since it helps value estimation. 
62 | """ 63 | gym.Wrapper.__init__(self, env) 64 | self.lives = 0 65 | self.was_real_done = True 66 | 67 | def step(self, action): 68 | obs, reward, done, info = self.env.step(action) 69 | self.was_real_done = done 70 | # check current lives, make loss of life terminal, 71 | # then update lives to handle bonus lives 72 | lives = self.env.unwrapped.ale.lives() 73 | if lives < self.lives and lives > 0: 74 | # for Qbert sometimes we stay in lives == 0 condtion for a few frames 75 | # so its important to keep lives > 0, so that we only reset once 76 | # the environment advertises done. 77 | done = True 78 | self.lives = lives 79 | return obs, reward, done, info 80 | 81 | def reset(self, **kwargs): 82 | """Reset only when lives are exhausted. 83 | This way all states are still reachable even though lives are episodic, 84 | and the learner need not know about any of this behind-the-scenes. 85 | """ 86 | if self.was_real_done: 87 | obs = self.env.reset(**kwargs) 88 | else: 89 | # no-op step to advance from terminal/lost life state 90 | obs, _, _, _ = self.env.step(0) 91 | self.lives = self.env.unwrapped.ale.lives() 92 | return obs 93 | 94 | class MaxAndSkipEnv(gym.Wrapper): 95 | def __init__(self, env, skip=4): 96 | """Return only every `skip`-th frame""" 97 | gym.Wrapper.__init__(self, env) 98 | # most recent raw observations (for max pooling across time steps) 99 | self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) 100 | self._skip = skip 101 | 102 | def reset(self): 103 | return self.env.reset() 104 | 105 | def step(self, action): 106 | """Repeat action, sum reward, and max over last observations.""" 107 | total_reward = 0.0 108 | done = None 109 | for i in range(self._skip): 110 | obs, reward, done, info = self.env.step(action) 111 | if i == self._skip - 2: self._obs_buffer[0] = obs 112 | if i == self._skip - 1: self._obs_buffer[1] = obs 113 | total_reward += reward 114 | if done: 115 | break 116 | # Note that the observation on the done=True frame 117 | # doesn't matter 118 | max_frame = self._obs_buffer.max(axis=0) 119 | 120 | return max_frame, total_reward, done, info 121 | 122 | def reset(self, **kwargs): 123 | return self.env.reset(**kwargs) 124 | 125 | class ClipRewardEnv(gym.RewardWrapper): 126 | def __init__(self, env): 127 | gym.RewardWrapper.__init__(self, env) 128 | 129 | def reward(self, reward): 130 | """Bin reward to {+1, 0, -1} by its sign.""" 131 | return np.sign(reward) 132 | 133 | class WarpFrame(gym.ObservationWrapper): 134 | def __init__(self, env): 135 | """Warp frames to 84x84 as done in the Nature paper and later work.""" 136 | gym.ObservationWrapper.__init__(self, env) 137 | self.width = 84 138 | self.height = 84 139 | self.observation_space = spaces.Box(low=0, high=255, 140 | shape=(self.height, self.width, 1), dtype=np.uint8) 141 | 142 | def observation(self, frame): 143 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) 144 | frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA) 145 | return frame[:, :, None] 146 | 147 | class FrameStack(gym.Wrapper): 148 | def __init__(self, env, k): 149 | """Stack k last frames. 150 | 151 | Returns lazy array, which is much more memory efficient. 
152 | 153 | See Also 154 | -------- 155 | baselines.common.atari_wrappers.LazyFrames 156 | """ 157 | gym.Wrapper.__init__(self, env) 158 | self.k = k 159 | self.frames = deque([], maxlen=k) 160 | shp = env.observation_space.shape 161 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8) 162 | 163 | def reset(self): 164 | ob = self.env.reset() 165 | for _ in range(self.k): 166 | self.frames.append(ob) 167 | return self._get_ob() 168 | 169 | def step(self, action): 170 | ob, reward, done, info = self.env.step(action) 171 | self.frames.append(ob) 172 | return self._get_ob(), reward, done, info 173 | 174 | def _get_ob(self): 175 | assert len(self.frames) == self.k 176 | return LazyFrames(list(self.frames)) 177 | 178 | class ScaledFloatFrame(gym.ObservationWrapper): 179 | def __init__(self, env): 180 | gym.ObservationWrapper.__init__(self, env) 181 | 182 | def observation(self, observation): 183 | # careful! This undoes the memory optimization, use 184 | # with smaller replay buffers only. 185 | return np.array(observation).astype(np.float32) / 255.0 186 | 187 | class LazyFrames(object): 188 | def __init__(self, frames): 189 | """This object ensures that common frames between the observations are only stored once. 190 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay 191 | buffers. 192 | 193 | This object should only be converted to numpy array before being passed to the model. 194 | 195 | You'd not believe how complex the previous solution was.""" 196 | self._frames = frames 197 | self._out = None 198 | 199 | def _force(self): 200 | if self._out is None: 201 | self._out = np.concatenate(self._frames, axis=2) 202 | self._frames = None 203 | return self._out 204 | 205 | def __array__(self, dtype=None): 206 | out = self._force() 207 | if dtype is not None: 208 | out = out.astype(dtype) 209 | return out 210 | 211 | def __len__(self): 212 | return len(self._force()) 213 | 214 | def __getitem__(self, i): 215 | return self._force()[i] 216 | 217 | # EDIT BY ATAMAI 218 | # Preparing image received from environment and adjust it to expected format of Pytorch 219 | # HWC (height x width x channel) becomes CHW 220 | class PytorchImage(gym.ObservationWrapper): 221 | def __init__(self, env): 222 | super(PytorchImage, self).__init__(env) 223 | # we check current shape of observations in environment 224 | current_shape = self.observation_space.shape 225 | # we change order of dimensions - so last one (-1) becomes first 226 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(current_shape[-1], current_shape[0], current_shape[1])) 227 | 228 | def observation(self, observation): 229 | # and finally we change order of dimensions for every single observation 230 | # here transpose method could be also used 231 | return np.swapaxes(observation, 2, 0) 232 | 233 | def make_atari(env_id, monitor = False, epidsode_capture = 75): 234 | env = gym.make(env_id) 235 | if monitor == True: 236 | env = wrappers.Monitor(env, "Videos/", resume=True, force =True, video_callable=lambda episode_id: episode_id%epidsode_capture==0) 237 | assert 'NoFrameskip' in env.spec.id 238 | env = NoopResetEnv(env, noop_max=30) 239 | env = MaxAndSkipEnv(env, skip=4) 240 | return env 241 | 242 | def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False, pytorch_img=False): 243 | """Configure environment for DeepMind-style Atari. 
244 | """ 245 | if episode_life: 246 | env = EpisodicLifeEnv(env) 247 | if 'FIRE' in env.unwrapped.get_action_meanings(): 248 | env = FireResetEnv(env) 249 | env = WarpFrame(env) 250 | if scale: 251 | env = ScaledFloatFrame(env) 252 | if clip_rewards: 253 | env = ClipRewardEnv(env) 254 | if frame_stack: 255 | env = FrameStack(env, 4) 256 | if pytorch_img: 257 | env = PytorchImage(env) 258 | return env 259 | 260 | -------------------------------------------------------------------------------- /Dueling Deep Q-Network/CNN_Dueling_DDQN.py: -------------------------------------------------------------------------------- 1 | import math, random 2 | from collections import deque 3 | import cv2 4 | 5 | import gym 6 | from gym import wrappers 7 | import wrapper 8 | import numpy as np 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.optim as optim 13 | import torch.autograd as autograd 14 | import torch.nn.functional as F 15 | from IPython.display import clear_output 16 | 17 | import matplotlib.pyplot as plt 18 | 19 | USE_CUDA = torch.cuda.is_available() 20 | Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs) 21 | 22 | class ReplayBuffer(object): 23 | def __init__(self, capacity): 24 | self.buffer = deque(maxlen=capacity) 25 | 26 | def push(self, state, action, reward, next_state, done): 27 | state = np.expand_dims(state, 0) 28 | next_state = np.expand_dims(next_state, 0) 29 | 30 | self.buffer.append((state, action, reward, next_state, done)) 31 | 32 | def sample(self, batch_size): 33 | state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size)) 34 | return np.concatenate(state), action, reward, np.concatenate(next_state), done 35 | 36 | def __len__(self): 37 | return len(self.buffer) 38 | 39 | class CnnDQN(nn.Module): 40 | def __init__(self, input_shape, num_actions): 41 | super(CnnDQN, self).__init__() 42 | 43 | self.input_shape = input_shape 44 | self.num_actions = num_actions 45 | 46 | self.convolutional_layers = nn.Sequential( 47 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 48 | nn.ReLU(), 49 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 50 | nn.ReLU(), 51 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 52 | nn.ReLU() 53 | ) 54 | 55 | self.value_layer = nn.Sequential( 56 | nn.Linear(self.feature_size(), 512), 57 | nn.ReLU(), 58 | nn.Linear(512, 1) 59 | ) 60 | self.advantage_layer = nn.Sequential( 61 | nn.Linear(self.feature_size(), 512), 62 | nn.ReLU(), 63 | nn.Linear(512, self.num_actions) 64 | ) 65 | 66 | def forward(self, x): 67 | x = self.convolutional_layers(x) 68 | x = x.view(x.size(0), -1) 69 | value = self.value_layer(x) # shape [1,1] 70 | value = value.expand(x.size(0), self.num_actions) # shape [1,6] 71 | advantage = self.advantage_layer(x) #shape [1,6] 72 | advantage_mean = advantage.mean(1)#shape [1] 73 | advantage_mean = advantage_mean.unsqueeze(1) #shape[1,1] 74 | advantage_mean = advantage_mean.expand(x.size(0), self.num_actions) #shape [1,6] 75 | Q = value + advantage - advantage_mean 76 | #print("Q-Values: ",Q) 77 | return Q 78 | 79 | def feature_size(self): 80 | #Calculate the output size of the CNN 81 | return self.convolutional_layers(autograd.Variable(torch.zeros(1, *self.input_shape))).view(1, -1).size(1) 82 | 83 | def act(self, state, epsilon,action_space): 84 | if random.random() > epsilon: 85 | with torch.no_grad(): 86 | state = Variable(torch.FloatTensor(np.float32(state)).unsqueeze(0)) 87 | q_value = self.forward(state) 88 | action = 
q_value.max(1)[1].data[0] #.max(1) maxdata: values--[0] and idx--[1] 89 | else: 90 | action = random.randrange(action_space) 91 | return action 92 | 93 | def update_target(current_model, target_model): 94 | target_model.load_state_dict(current_model.state_dict()) 95 | 96 | def save_model(model, idx): 97 | torch.save(model, "Saved_models/") 98 | 99 | def epsilon_by_frame(frame_idx): 100 | epsilon_start = 1.0 101 | epsilon_final = 0.01 #0.01 102 | epsilon_decay = 30000 #30000 103 | eps = epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay) 104 | return eps 105 | 106 | def compute_td_loss(batch_size,current_model,target_model,opti,loss_func,gamma,replay_buffer): 107 | state, action, reward, next_state, done = replay_buffer.sample(batch_size) 108 | # shapes for normal image-- stacked (4,84,84) ... 109 | state = Variable(torch.FloatTensor(np.float32(state))) #shape (1,84,84) 110 | next_state = Variable(torch.FloatTensor(np.float32(next_state))) #shape (1,84,84) 111 | action = Variable(torch.LongTensor(action)) #shape [32] -- has to be long for gather function 112 | reward = Variable(torch.FloatTensor(reward)) #shape [32] 113 | done = Variable(torch.FloatTensor(done)) #shape [32] 114 | 115 | q_values = current_model(state) #shape [32,6] 116 | next_q_values = current_model(next_state) #shape [32,6] 117 | next_q_state_values = target_model(next_state) #shape [32,6] 118 | 119 | q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1) #shape [32] gathers q_values by the index of action 120 | next_q_value = next_q_state_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1) #shape [32] torch.max(nqv,1) gives the maxvalues--[0] and idx--[1] 121 | expected_q_value = reward + gamma * next_q_value * (1 - done) # shape [32] 122 | 123 | 124 | # DeepMind took nn.SmoothL1Loss() 125 | #loss = (q_value - Variable(expected_q_value.data)).pow(2).mean() #standard loss -- .data to get rid of grad_fn= 126 | loss = loss_func(q_value,Variable(expected_q_value.data)) 127 | 128 | opti.zero_grad() 129 | loss.backward() 130 | opti.step() 131 | return loss 132 | 133 | def plot(frame_idx, rewards, losses): 134 | plt.close() 135 | plt.figure(figsize=(20,5)) 136 | plt.subplot(121) 137 | plt.title("frames {}. 
reward: {}" .format(frame_idx, np.round(np.mean(rewards[-10:]),2))) 138 | plt.plot(rewards) 139 | plt.subplot(122) 140 | plt.title("loss") 141 | plt.plot(losses) 142 | plt.ylim(0,1) 143 | plt.draw() 144 | plt.pause(0.0001) 145 | 146 | def processing(img): 147 | img = np.expand_dims(cv2.resize(cv2.cvtColor(img, cv2.COLOR_RGB2GRAY), (84,84)),axis= 0) 148 | img = img.astype(np.uint8) 149 | #print(img.dtype) 150 | return img 151 | 152 | def main(): 153 | plt.ion() 154 | env = wrapper.make_atari("RiverraidNoFrameskip-v4", monitor=True,epidsode_capture=50) 155 | env = wrapper.wrap_deepmind(env,frame_stack=True, pytorch_img = True) 156 | action_space = env.action_space.n 157 | current_model = CnnDQN(env.observation_space.shape, action_space)#env.observation_space.shape 158 | target_model = CnnDQN(env.observation_space.shape, action_space) 159 | 160 | if USE_CUDA: 161 | current_model = current_model.cuda() 162 | target_model = target_model.cuda() 163 | 164 | #DeepMind took optim.RMSprop(current_model.parameters(), lr=0.000) 165 | #opti = optim.Adam(current_model.parameters(), lr=0.0001) 166 | opti = optim.RMSprop(current_model.parameters(), lr=0.0001) 167 | loss_func = nn.SmoothL1Loss() 168 | 169 | replay_initial = 10000 170 | replay_buffer = ReplayBuffer(100000) 171 | 172 | num_frames = 1000000 173 | batch_size = 32 174 | gamma = 0.99 175 | 176 | losses = [] 177 | all_rewards = [] 178 | episode_reward = 0 179 | 180 | state = env.reset() # shape normal:(1,84,84) -stacked (4,84,84) 181 | # Manuel Stacking 182 | #state = processing(state) 183 | #state = np.stack((state,state,state,state),axis = 1).squeeze(0) 184 | #assert state.shape == (4,84,84) 185 | for frame_idx in range(1, num_frames + 1): 186 | 187 | epsilon = epsilon_by_frame(frame_idx) 188 | print("Training :: Frame {} :: Epsilon {} ".format(frame_idx, round(epsilon,2))) 189 | action = current_model.act(state, epsilon,action_space) 190 | next_state, reward, done, _ = env.step(action) 191 | # Manuel Stacking 192 | #next_state = processing(next_state) 193 | #next_state = np.append(next_state, state[1:, :, :],axis= 0) 194 | #assert next_state.shape == (4,84,84) 195 | replay_buffer.push(state, action, reward, next_state, done) 196 | 197 | state = next_state 198 | episode_reward += reward 199 | 200 | if done: 201 | state = env.reset() 202 | # Manuel Stacking 203 | #state = processing(state) 204 | #state = np.stack((state,state,state,state),axis = 1).squeeze(0) 205 | all_rewards.append(episode_reward) 206 | episode_reward = 0 207 | 208 | if len(replay_buffer) > replay_initial: 209 | loss = compute_td_loss(batch_size,current_model, target_model,opti,loss_func,gamma,replay_buffer) 210 | losses.append(loss.item()) 211 | 212 | if frame_idx % 10000 == 0: 213 | plot(frame_idx, all_rewards, losses) 214 | 215 | if frame_idx % 1000 == 0: 216 | update_target(current_model, target_model) 217 | 218 | #if frame_idx % 100000 ==0: 219 | # save_model(current_model, frame_idx) 220 | 221 | if __name__ == "__main__": 222 | main() -------------------------------------------------------------------------------- /Dueling Deep Q-Network/CNN_Dueling_DDQN_PER.py: -------------------------------------------------------------------------------- 1 | import math, random 2 | from collections import deque 3 | import cv2 4 | 5 | import gym 6 | from gym import wrappers 7 | import wrapper 8 | import numpy as np 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.optim as optim 13 | import torch.autograd as autograd 14 | import torch.nn.functional as F 15 | from 
IPython.display import clear_output 16 | 17 | import matplotlib.pyplot as plt 18 | from PrioritizedExperienceReplay import PrioritizedReplay 19 | 20 | USE_CUDA = torch.cuda.is_available() 21 | Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs) 22 | 23 | class CnnDQN(nn.Module): 24 | def __init__(self, input_shape, num_actions): 25 | super(CnnDQN, self).__init__() 26 | 27 | self.input_shape = input_shape 28 | self.num_actions = num_actions 29 | 30 | self.convolutional_layers = nn.Sequential( 31 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 32 | nn.ReLU(), 33 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 34 | nn.ReLU(), 35 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 36 | nn.ReLU() 37 | ) 38 | 39 | self.value_layer = nn.Sequential( 40 | nn.Linear(self.feature_size(), 512), 41 | nn.ReLU(), 42 | nn.Linear(512, 1) 43 | ) 44 | self.advantage_layer = nn.Sequential( 45 | nn.Linear(self.feature_size(), 512), 46 | nn.ReLU(), 47 | nn.Linear(512, self.num_actions) 48 | ) 49 | 50 | def forward(self, x): 51 | x = self.convolutional_layers(x) 52 | x = x.view(x.size(0), -1) 53 | value = self.value_layer(x) # shape [1,1] 54 | value = value.expand(x.size(0), self.num_actions) # shape [1,6] 55 | advantage = self.advantage_layer(x) #shape [1,6] 56 | advantage_mean = advantage.mean(1)#shape [1] 57 | advantage_mean = advantage_mean.unsqueeze(1) #shape[1,1] 58 | advantage_mean = advantage_mean.expand(x.size(0), self.num_actions) #shape [1,6] 59 | Q = value + advantage - advantage_mean 60 | #print("Q-Values: ",Q) 61 | return Q 62 | 63 | def feature_size(self): 64 | #Calculate the output size of the CNN 65 | return self.convolutional_layers(autograd.Variable(torch.zeros(1, *self.input_shape))).view(1, -1).size(1) 66 | 67 | def act(self, state, epsilon,action_space): 68 | if random.random() > epsilon: 69 | with torch.no_grad(): 70 | state = Variable(torch.FloatTensor(np.float32(state)).unsqueeze(0)) 71 | q_value = self.forward(state) 72 | action = q_value.max(1)[1].data[0] #.max(1) maxdata: values--[0] and idx--[1] 73 | else: 74 | action = random.randrange(action_space) 75 | return action 76 | 77 | def update_target(current_model, target_model): 78 | target_model.load_state_dict(current_model.state_dict()) 79 | 80 | def save_model(model, idx): 81 | torch.save(model, "Saved_models/") 82 | 83 | def epsilon_by_frame(frame_idx): 84 | epsilon_start = 1.0 85 | epsilon_final = 0.01 #0.01 86 | epsilon_decay = 30000 #30000 87 | eps = epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay) 88 | return eps 89 | 90 | def compute_td_loss(batch_size,current_model,target_model,opti,loss_func,gamma,PER): 91 | state, action, reward, next_state, done,idx,weights = PER.sample(batch_size) 92 | # shapes for normal image-- stacked (4,84,84) ... 
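# Note: the block below combines Double DQN with prioritized replay -- the
# online network selects the next action (argmax over next_q_values), the
# target network evaluates it, and the loss is scaled by the importance-
# sampling weights returned from PER.sample(). In equation form the target is
#
#     y = r + gamma * Q_target(s', argmax_a Q_online(s', a)) * (1 - done)
#
# The priorities written back via PER.update_priorities() are loss + 1e-5; the
# small constant keeps every transition at a non-zero sampling probability.
# Caveat: nn.SmoothL1Loss() defaults to mean reduction, so "loss * weights"
# spreads the batch-mean loss over the weights; strictly per-sample priorities
# would need the loss built with reduction='none'.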
93 | state = Variable(torch.FloatTensor(np.float32(state))) #shape (1,84,84) 94 | next_state = Variable(torch.FloatTensor(np.float32(next_state))) #shape (1,84,84) 95 | action = Variable(torch.LongTensor(action)) #shape [32] -- has to be long for gather function 96 | reward = Variable(torch.FloatTensor(reward)) #shape [32] 97 | done = Variable(torch.FloatTensor(done)) #shape [32] 98 | weights = Variable(torch.FloatTensor(weights)) #shape [32] 99 | 100 | q_values = current_model(state) #shape [32,6] 101 | next_q_values = current_model(next_state) #shape [32,6] 102 | next_q_state_values = target_model(next_state) #shape [32,6] 103 | 104 | q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1) #shape [32] gathers q_values by the index of action 105 | next_q_value = next_q_state_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1) #shape [32] torch.max(nqv,1) gives the maxvalues--[0] and idx--[1] 106 | expected_q_value = reward + gamma * next_q_value * (1 - done) # shape [32] 107 | 108 | 109 | # DeepMind took nn.SmoothL1Loss() 110 | #loss = (q_value - Variable(expected_q_value.data)).pow(2).mean() #standard loss -- .data to get rid of grad_fn= 111 | loss = loss_func(q_value,Variable(expected_q_value.data))*weights 112 | prios = loss + 1e-5 113 | loss = loss.mean() 114 | 115 | 116 | opti.zero_grad() 117 | loss.backward() 118 | PER.update_priorities(idx, prios.data.cpu().numpy()) 119 | opti.step() 120 | return loss 121 | 122 | def plot(frame_idx, rewards, losses): 123 | plt.close() 124 | plt.figure(figsize=(20,5)) 125 | plt.subplot(121) 126 | plt.title("frames {}. reward: {}" .format(frame_idx, np.round(np.mean(rewards[-10:]),2))) 127 | plt.plot(rewards) 128 | plt.subplot(122) 129 | plt.title("loss") 130 | plt.plot(losses) 131 | plt.ylim(0,1) 132 | plt.draw() 133 | plt.pause(0.0001) 134 | 135 | 136 | def main(): 137 | plt.ion() 138 | env = wrapper.make_atari("BreakoutNoFrameskip-v4", monitor=True,epidsode_capture=50)#Riverraid Frostbite Enduro 139 | env = wrapper.wrap_deepmind(env,frame_stack=True, pytorch_img = True) 140 | action_space = env.action_space.n 141 | current_model = CnnDQN(env.observation_space.shape, action_space)#env.observation_space.shape 142 | target_model = CnnDQN(env.observation_space.shape, action_space) 143 | 144 | if USE_CUDA: 145 | current_model = current_model.cuda() 146 | target_model = target_model.cuda() 147 | 148 | #DeepMind took optim.RMSprop(current_model.parameters(), lr=0.000) 149 | #opti = optim.Adam(current_model.parameters(), lr=0.0001) 150 | opti = optim.RMSprop(current_model.parameters(), lr=0.0001) 151 | loss_func = nn.SmoothL1Loss() 152 | 153 | replay_initial = 10000 154 | PER = PrioritizedReplay(100000,alpha = 0.6,beta_start =0.4,beta_frames=1000000) 155 | 156 | num_frames = 1000000 157 | batch_size = 32 158 | gamma = 0.99 159 | 160 | losses = [] 161 | all_rewards = [] 162 | episode_reward = 0 163 | 164 | state = env.reset() # shape normal:(1,84,84) -stacked (4,84,84) 165 | # Manuel Stacking 166 | #state = processing(state) 167 | #state = np.stack((state,state,state,state),axis = 1).squeeze(0) 168 | #assert state.shape == (4,84,84) 169 | for frame_idx in range(1, num_frames + 1): 170 | 171 | epsilon = epsilon_by_frame(frame_idx) 172 | action = current_model.act(state, epsilon,action_space) 173 | next_state, reward, done, _ = env.step(action) 174 | print("Training :: Frame {} :: Epsilon {} :: Reward {} ".format(frame_idx, round(epsilon,2),reward)) 175 | # Manuel Stacking 176 | #next_state = processing(next_state) 177 | #next_state 
= np.append(next_state, state[1:, :, :],axis= 0) 178 | #assert next_state.shape == (4,84,84) 179 | PER.push(state, action, reward, next_state, done) 180 | 181 | state = next_state 182 | episode_reward += reward 183 | 184 | if done: 185 | state = env.reset() 186 | 187 | all_rewards.append(episode_reward) 188 | episode_reward = 0 189 | 190 | if PER.__len__() > replay_initial: 191 | loss = compute_td_loss(batch_size,current_model, target_model,opti,loss_func,gamma,PER) 192 | losses.append(loss.item()) 193 | 194 | if frame_idx % 10000 == 0: 195 | plot(frame_idx, all_rewards, losses) 196 | 197 | if frame_idx % 1000 == 0: 198 | update_target(current_model, target_model) 199 | 200 | 201 | if __name__ == "__main__": 202 | main() 203 | -------------------------------------------------------------------------------- /Dueling Deep Q-Network/Img/Duel_per.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Dueling Deep Q-Network/Img/Duel_per.png -------------------------------------------------------------------------------- /Dueling Deep Q-Network/Img/Dueling_DQN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Dueling Deep Q-Network/Img/Dueling_DQN.png -------------------------------------------------------------------------------- /Dueling Deep Q-Network/PrioritizedExperienceReplay.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class PrioritizedReplay(object): 4 | def __init__(self, capacity, alpha=0.6,beta_start = 0.4,beta_frames=100000): 5 | self.alpha = alpha 6 | self.beta_start = beta_start 7 | self.beta_frames = beta_frames 8 | self.frame = 1 #for beta calculation 9 | self.capacity = capacity 10 | self.buffer = [] 11 | self.pos = 0 12 | self.priorities = np.zeros((capacity,), dtype=np.float32) 13 | 14 | def beta_by_frame(self, frame_idx): 15 | return min(1.0, self.beta_start + frame_idx * (1.0 - self.beta_start) / self.beta_frames) 16 | 17 | def push(self, state, action, reward, next_state, done): 18 | assert state.ndim == next_state.ndim 19 | state = np.expand_dims(state, 0) 20 | next_state = np.expand_dims(next_state, 0) 21 | 22 | max_prio = self.priorities.max() if self.buffer else 1.0 # gives max priority if buffer is not empty else 1 23 | 24 | if len(self.buffer) < self.capacity: 25 | self.buffer.append((state, action, reward, next_state, done)) 26 | else: 27 | # puts the new data on the position of the oldes since it circles via pos variable 28 | # since if len(buffer) == capacity -> pos == 0 -> oldest memory (at least for the first round?) 
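# For reference, the sampling math implemented in sample() below: a transition
# with priority p_i is drawn with probability
#
#     P(i) = p_i**alpha / sum_k p_k**alpha
#
# and the resulting bias is corrected with importance-sampling weights
#
#     w_i = (N * P(i))**(-beta) / max_j w_j,   with beta annealed towards 1.0
#
# by beta_by_frame(). New transitions enter with the current max priority
# (max_prio above) so they are replayed at least once before a TD-error-based
# priority takes over via update_priorities().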
29 | self.buffer[self.pos] = (state, action, reward, next_state, done) 30 | 31 | self.priorities[self.pos] = max_prio 32 | self.pos = (self.pos + 1) % self.capacity # lets the pos circle in the ranges of capacity if pos+1 > cap --> new posi = 0 33 | 34 | def sample(self, batch_size): 35 | N = len(self.buffer) 36 | if N == self.capacity: 37 | prios = self.priorities 38 | else: 39 | prios = self.priorities[:self.pos] 40 | # calc P = p^a/sum(p^a) 41 | probs = prios ** self.alpha 42 | P = probs/probs.sum() 43 | 44 | indices = np.random.choice(N, batch_size, p=P) # gets the indices depending on the probability p 45 | samples = [self.buffer[idx] for idx in indices] 46 | 47 | beta = self.beta_by_frame(self.frame) 48 | self.frame+=1 49 | 50 | #min of ALL probs, not just sampled probs 51 | P_min = P.min() 52 | max_weight = (P_min*N)**(-beta) 53 | 54 | #Compute importance-sampling weight step:10 pseudo code 55 | weights = (N * P[indices]) ** (-beta) 56 | weights /= weights.max() # max_weights 57 | weights = np.array(weights, dtype=np.float32) #torch.tensor(weights, device=device, dtype=torch.float) 58 | 59 | #print("Sample-shape befor zipping: ", samples) 60 | states, actions, rewards, next_states, dones = zip(*samples) # example: p = [[1,2,3],[4,5,6]] ,d=zip(*p) -> d = [(1, 4), (2, 5), (3, 6)] 61 | return np.concatenate(states), actions, rewards, np.concatenate(next_states), dones, indices, weights 62 | 63 | def update_priorities(self, batch_indices, batch_priorities): 64 | for idx, prio in zip(batch_indices, batch_priorities): 65 | self.priorities[idx] = prio 66 | 67 | def __len__(self): 68 | return len(self.buffer) 69 | -------------------------------------------------------------------------------- /Dueling Deep Q-Network/Video/Breakout.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Dueling Deep Q-Network/Video/Breakout.mp4 -------------------------------------------------------------------------------- /Dueling Deep Q-Network/Video/Pong.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Dueling Deep Q-Network/Video/Pong.mp4 -------------------------------------------------------------------------------- /Dueling Deep Q-Network/wrapper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import deque 3 | import gym 4 | 5 | from gym import spaces,wrappers 6 | import cv2 7 | cv2.ocl.setUseOpenCL(False) 8 | 9 | class NoopResetEnv(gym.Wrapper): 10 | def __init__(self, env, noop_max=30): 11 | """Sample initial states by taking random number of no-ops on reset. 12 | No-op is assumed to be action 0. 
13 | """ 14 | gym.Wrapper.__init__(self, env) 15 | self.noop_max = noop_max 16 | self.override_num_noops = None 17 | self.noop_action = 0 18 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 19 | 20 | def reset(self, **kwargs): 21 | """ Do no-op action for a number of steps in [1, noop_max].""" 22 | self.env.reset(**kwargs) 23 | if self.override_num_noops is not None: 24 | noops = self.override_num_noops 25 | else: 26 | noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 27 | assert noops > 0 28 | obs = None 29 | for _ in range(noops): 30 | obs, _, done, _ = self.env.step(self.noop_action) 31 | if done: 32 | obs = self.env.reset(**kwargs) 33 | return obs 34 | 35 | def step(self, ac): 36 | return self.env.step(ac) 37 | 38 | class FireResetEnv(gym.Wrapper): 39 | def __init__(self, env): 40 | """Take action on reset for environments that are fixed until firing.""" 41 | gym.Wrapper.__init__(self, env) 42 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 43 | assert len(env.unwrapped.get_action_meanings()) >= 3 44 | 45 | def reset(self, **kwargs): 46 | self.env.reset(**kwargs) 47 | obs, _, done, _ = self.env.step(1) 48 | if done: 49 | self.env.reset(**kwargs) 50 | obs, _, done, _ = self.env.step(2) 51 | if done: 52 | self.env.reset(**kwargs) 53 | return obs 54 | 55 | def step(self, ac): 56 | return self.env.step(ac) 57 | 58 | class EpisodicLifeEnv(gym.Wrapper): 59 | def __init__(self, env): 60 | """Make end-of-life == end-of-episode, but only reset on true game over. 61 | Done by DeepMind for the DQN and co. since it helps value estimation. 62 | """ 63 | gym.Wrapper.__init__(self, env) 64 | self.lives = 0 65 | self.was_real_done = True 66 | 67 | def step(self, action): 68 | obs, reward, done, info = self.env.step(action) 69 | self.was_real_done = done 70 | # check current lives, make loss of life terminal, 71 | # then update lives to handle bonus lives 72 | lives = self.env.unwrapped.ale.lives() 73 | if lives < self.lives and lives > 0: 74 | # for Qbert sometimes we stay in lives == 0 condtion for a few frames 75 | # so its important to keep lives > 0, so that we only reset once 76 | # the environment advertises done. 77 | done = True 78 | self.lives = lives 79 | return obs, reward, done, info 80 | 81 | def reset(self, **kwargs): 82 | """Reset only when lives are exhausted. 83 | This way all states are still reachable even though lives are episodic, 84 | and the learner need not know about any of this behind-the-scenes. 
85 | """ 86 | if self.was_real_done: 87 | obs = self.env.reset(**kwargs) 88 | else: 89 | # no-op step to advance from terminal/lost life state 90 | obs, _, _, _ = self.env.step(0) 91 | self.lives = self.env.unwrapped.ale.lives() 92 | return obs 93 | 94 | class MaxAndSkipEnv(gym.Wrapper): 95 | def __init__(self, env, skip=4): 96 | """Return only every `skip`-th frame""" 97 | gym.Wrapper.__init__(self, env) 98 | # most recent raw observations (for max pooling across time steps) 99 | self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) 100 | self._skip = skip 101 | 102 | def reset(self): 103 | return self.env.reset() 104 | 105 | def step(self, action): 106 | """Repeat action, sum reward, and max over last observations.""" 107 | total_reward = 0.0 108 | done = None 109 | for i in range(self._skip): 110 | obs, reward, done, info = self.env.step(action) 111 | if i == self._skip - 2: self._obs_buffer[0] = obs 112 | if i == self._skip - 1: self._obs_buffer[1] = obs 113 | total_reward += reward 114 | if done: 115 | break 116 | # Note that the observation on the done=True frame 117 | # doesn't matter 118 | max_frame = self._obs_buffer.max(axis=0) 119 | 120 | return max_frame, total_reward, done, info 121 | 122 | def reset(self, **kwargs): 123 | return self.env.reset(**kwargs) 124 | 125 | class ClipRewardEnv(gym.RewardWrapper): 126 | def __init__(self, env): 127 | gym.RewardWrapper.__init__(self, env) 128 | 129 | def reward(self, reward): 130 | """Bin reward to {+1, 0, -1} by its sign.""" 131 | return np.sign(reward) 132 | 133 | class WarpFrame(gym.ObservationWrapper): 134 | def __init__(self, env): 135 | """Warp frames to 84x84 as done in the Nature paper and later work.""" 136 | gym.ObservationWrapper.__init__(self, env) 137 | self.width = 84 138 | self.height = 84 139 | self.observation_space = spaces.Box(low=0, high=255, 140 | shape=(self.height, self.width, 1), dtype=np.uint8) 141 | 142 | def observation(self, frame): 143 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) 144 | frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA) 145 | return frame[:, :, None] 146 | 147 | class FrameStack(gym.Wrapper): 148 | def __init__(self, env, k): 149 | """Stack k last frames. 150 | 151 | Returns lazy array, which is much more memory efficient. 152 | 153 | See Also 154 | -------- 155 | baselines.common.atari_wrappers.LazyFrames 156 | """ 157 | gym.Wrapper.__init__(self, env) 158 | self.k = k 159 | self.frames = deque([], maxlen=k) 160 | shp = env.observation_space.shape 161 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8) 162 | 163 | def reset(self): 164 | ob = self.env.reset() 165 | for _ in range(self.k): 166 | self.frames.append(ob) 167 | return self._get_ob() 168 | 169 | def step(self, action): 170 | ob, reward, done, info = self.env.step(action) 171 | self.frames.append(ob) 172 | return self._get_ob(), reward, done, info 173 | 174 | def _get_ob(self): 175 | assert len(self.frames) == self.k 176 | return LazyFrames(list(self.frames)) 177 | 178 | class ScaledFloatFrame(gym.ObservationWrapper): 179 | def __init__(self, env): 180 | gym.ObservationWrapper.__init__(self, env) 181 | 182 | def observation(self, observation): 183 | # careful! This undoes the memory optimization, use 184 | # with smaller replay buffers only. 
185 | return np.array(observation).astype(np.float32) / 255.0 186 | 187 | class LazyFrames(object): 188 | def __init__(self, frames): 189 | """This object ensures that common frames between the observations are only stored once. 190 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay 191 | buffers. 192 | 193 | This object should only be converted to numpy array before being passed to the model. 194 | 195 | You'd not believe how complex the previous solution was.""" 196 | self._frames = frames 197 | self._out = None 198 | 199 | def _force(self): 200 | if self._out is None: 201 | self._out = np.concatenate(self._frames, axis=2) 202 | self._frames = None 203 | return self._out 204 | 205 | def __array__(self, dtype=None): 206 | out = self._force() 207 | if dtype is not None: 208 | out = out.astype(dtype) 209 | return out 210 | 211 | def __len__(self): 212 | return len(self._force()) 213 | 214 | def __getitem__(self, i): 215 | return self._force()[i] 216 | 217 | # EDIT BY ATAMAI 218 | # Preparing image received from environment and adjust it to expected format of Pytorch 219 | # HWC (height x width x channel) becomes CHW 220 | class PytorchImage(gym.ObservationWrapper): 221 | def __init__(self, env): 222 | super(PytorchImage, self).__init__(env) 223 | # we check current shape of observations in environment 224 | current_shape = self.observation_space.shape 225 | # we change order of dimensions - so last one (-1) becomes first 226 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(current_shape[-1], current_shape[0], current_shape[1])) 227 | 228 | def observation(self, observation): 229 | # and finally we change order of dimensions for every single observation 230 | # here transpose method could be also used 231 | return np.swapaxes(observation, 2, 0) 232 | 233 | def make_atari(env_id, monitor = False, epidsode_capture = 75): 234 | env = gym.make(env_id) 235 | if monitor == True: 236 | env = wrappers.Monitor(env, "Videos/", resume=True, force =True, video_callable=lambda episode_id: episode_id%epidsode_capture==0) 237 | assert 'NoFrameskip' in env.spec.id 238 | env = NoopResetEnv(env, noop_max=30) 239 | env = MaxAndSkipEnv(env, skip=4) 240 | return env 241 | 242 | def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False, pytorch_img=False): 243 | """Configure environment for DeepMind-style Atari. 
244 | """ 245 | if episode_life: 246 | env = EpisodicLifeEnv(env) 247 | if 'FIRE' in env.unwrapped.get_action_meanings(): 248 | env = FireResetEnv(env) 249 | env = WarpFrame(env) 250 | if scale: 251 | env = ScaledFloatFrame(env) 252 | if clip_rewards: 253 | env = ClipRewardEnv(env) 254 | if frame_stack: 255 | env = FrameStack(env, 4) 256 | if pytorch_img: 257 | env = PytorchImage(env) 258 | return env 259 | 260 | -------------------------------------------------------------------------------- /Paper/A3C.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/A3C.pdf -------------------------------------------------------------------------------- /Paper/DDPG.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/DDPG.pdf -------------------------------------------------------------------------------- /Paper/DQN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/DQN.pdf -------------------------------------------------------------------------------- /Paper/Distributional DQN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/Distributional DQN.pdf -------------------------------------------------------------------------------- /Paper/Double_DQN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/Double_DQN.pdf -------------------------------------------------------------------------------- /Paper/Dueling.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/Dueling.pdf -------------------------------------------------------------------------------- /Paper/GAE.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/GAE.pdf -------------------------------------------------------------------------------- /Paper/Noisy_networks.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/Noisy_networks.pdf -------------------------------------------------------------------------------- /Paper/PPO.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/PPO.pdf -------------------------------------------------------------------------------- /Paper/SAC_2019.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/SAC_2019.pdf -------------------------------------------------------------------------------- /Paper/TD3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/TD3.pdf -------------------------------------------------------------------------------- /Policy Gradient Algorithms/Parallel_processing.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process, Pipe 2 | import numpy as np 3 | 4 | def worker(remote, parent_remote, env_fn_wrapper): 5 | parent_remote.close() 6 | env = env_fn_wrapper.x() 7 | while True: 8 | cmd, data = remote.recv() 9 | if cmd == 'step': 10 | ob, reward, done, info = env.step(data) 11 | if done: 12 | ob = env.reset() 13 | remote.send((ob, reward, done, info)) 14 | elif cmd == 'reset': 15 | ob = env.reset() 16 | remote.send(ob) 17 | elif cmd == 'reset_task': 18 | ob = env.reset_task() 19 | remote.send(ob) 20 | elif cmd == 'close': 21 | remote.close() 22 | break 23 | elif cmd == 'get_spaces': 24 | remote.send((env.observation_space, env.action_space)) 25 | else: 26 | raise NotImplementedError 27 | 28 | class CloudpickleWrapper(object): 29 | """ 30 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) 31 | """ 32 | def __init__(self, x): 33 | self.x = x 34 | def __getstate__(self): 35 | import cloudpickle 36 | return cloudpickle.dumps(self.x) 37 | def __setstate__(self, ob): 38 | import pickle 39 | self.x = pickle.loads(ob) 40 | 41 | 42 | class VecEnv(object): 43 | """ 44 | An abstract asynchronous, vectorized environment. 45 | """ 46 | def __init__(self, num_envs, observation_space, action_space): 47 | self.num_envs = num_envs 48 | self.observation_space = observation_space 49 | self.action_space = action_space 50 | 51 | def reset(self): 52 | """ 53 | Reset all the environments and return an array of 54 | observations, or a tuple of observation arrays. 55 | If step_async is still doing work, that work will 56 | be cancelled and step_wait() should not be called 57 | until step_async() is invoked again. 58 | """ 59 | pass 60 | 61 | def step_async(self, actions): 62 | """ 63 | Tell all the environments to start taking a step 64 | with the given actions. 65 | Call step_wait() to get the results of the step. 66 | You should not call this if a step_async run is 67 | already pending. 68 | """ 69 | pass 70 | 71 | def step_wait(self): 72 | """ 73 | Wait for the step taken with step_async(). 74 | Returns (obs, rews, dones, infos): 75 | - obs: an array of observations, or a tuple of 76 | arrays of observations. 77 | - rews: an array of rewards 78 | - dones: an array of "episode done" booleans 79 | - infos: a sequence of info objects 80 | """ 81 | pass 82 | 83 | def close(self): 84 | """ 85 | Clean up the environments' resources. 
86 | """ 87 | pass 88 | 89 | def step(self, actions): 90 | self.step_async(actions) 91 | return self.step_wait() 92 | 93 | class SubprocVecEnv(VecEnv): 94 | def __init__(self, env_fns, spaces=None): 95 | """ 96 | envs: list of gym environments to run in subprocesses 97 | """ 98 | self.waiting = False 99 | self.closed = False 100 | nenvs = len(env_fns) 101 | self.nenvs = nenvs 102 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) 103 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 104 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 105 | for p in self.ps: 106 | p.daemon = True # if the main process crashes, we should not cause things to hang 107 | p.start() 108 | for remote in self.work_remotes: 109 | remote.close() 110 | 111 | self.remotes[0].send(('get_spaces', None)) 112 | observation_space, action_space = self.remotes[0].recv() 113 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 114 | 115 | def step_async(self, actions): 116 | for remote, action in zip(self.remotes, actions): 117 | remote.send(('step', action)) 118 | self.waiting = True 119 | 120 | def step_wait(self): 121 | results = [remote.recv() for remote in self.remotes] 122 | self.waiting = False 123 | obs, rews, dones, infos = zip(*results) 124 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 125 | 126 | def reset(self): 127 | for remote in self.remotes: 128 | remote.send(('reset', None)) 129 | return np.stack([remote.recv() for remote in self.remotes]) 130 | 131 | def reset_task(self): 132 | for remote in self.remotes: 133 | remote.send(('reset_task', None)) 134 | return np.stack([remote.recv() for remote in self.remotes]) 135 | 136 | def close(self): 137 | if self.closed: 138 | return 139 | if self.waiting: 140 | for remote in self.remotes: 141 | remote.recv() 142 | for remote in self.remotes: 143 | remote.send(('close', None)) 144 | for p in self.ps: 145 | p.join() 146 | self.closed = True 147 | 148 | def __len__(self): 149 | return self.nenvs -------------------------------------------------------------------------------- /Policy Gradient Algorithms/REINFORCE/Img/Steps_needed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Policy Gradient Algorithms/REINFORCE/Img/Steps_needed.png -------------------------------------------------------------------------------- /Policy Gradient Algorithms/REINFORCE/REINFORCE.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import torch.optim as optim 4 | from torch.autograd import Variable 5 | import gym 6 | from gym import wrappers 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | from torch.distributions import Categorical 10 | 11 | 12 | class Policy(nn.Module): 13 | def __init__(self,input_shape,action_shape): 14 | super().__init__() 15 | 16 | self.model = nn.Sequential( 17 | nn.Linear(input_shape[0],64), 18 | nn.ReLU(), 19 | nn.Linear(64,32), 20 | nn.ReLU(), 21 | nn.Linear(32,action_shape), 22 | nn.Softmax(dim = 1) 23 | ) 24 | def forward(self,x): 25 | return self.model(x) 26 | 27 | def action(model, s): 28 | # simple pytorch aproach for action-selection and log-prob calc 29 | #https://pytorch.org/docs/stable/distributions.html 30 | prob = model(s) 31 | m = Categorical(prob) 32 | a = 
m.sample() 33 | # log p(a∣π(s)) 34 | log_p = m.log_prob(a) 35 | #print(a.item(), log_p) 36 | return a.item(), log_p 37 | 38 | # naive own numpy aproach attenion! grad gets lost by transforming prob to numpy: 39 | #possible_actions = [i for i in range(len(prob.data.detach().numpy()[0]))] 40 | # choose accordingly to probability: 41 | #action = np.random.choice(possible_actions, p = prob.data.detach().numpy()[0]) 42 | #calculate the log-prob for the chosen action: 43 | #grad = prob[0][action].grad_fn 44 | #log_prob = np.log(prob.data.detach().numpy()[0][action]) 45 | # transform to torch Tensor: 46 | #log_prob = torch.Tensor([log_prob]).unsqueeze(0) 47 | #log_prob = Variable(log_prob,requires_grad=True) 48 | #log_prob.backward() 49 | #print(log_prob) 50 | #print(action,log_prob) 51 | #return action, log_prob 52 | 53 | def policy_optimization(ep, model, optimizer,batch_rewards,log_probs): 54 | R = 0 55 | gamma = 0.99 56 | policy_loss = [] 57 | rewards = [] 58 | #calc discounted Rewards 59 | for r in batch_rewards[::-1]: # reverses the list of rewards 60 | R = r + gamma * R 61 | rewards.insert(0, R) # inserts the current rewart to first position 62 | 63 | rewards = torch.tensor(rewards) 64 | # standardization to get data of zero mean and varianz 1, stabilizes learning 65 | #-- attention scaling rewards looses information of special events with higher rewards - addapting on different environments 66 | rewards = (rewards - rewards.mean()) / (rewards.std() + ep) 67 | for log_prob, reward in zip(log_probs, rewards): 68 | policy_loss.append(-log_prob * reward) #baseline+ 69 | 70 | optimizer.zero_grad() 71 | policy_loss = torch.cat(policy_loss).sum() 72 | policy_loss.backward() 73 | optimizer.step() 74 | 75 | def run(episodes,model,env): 76 | optimizer = optim.Adam(model.parameters(), lr = 1e-2) 77 | rewards = [] 78 | steps_taken = [] 79 | 80 | for i in range(episodes): 81 | done = False 82 | ep_rewards = 0 83 | batch_rewards = [] 84 | log_probs = [] 85 | state = env.reset() 86 | steps = 0 87 | while not done: 88 | a, log_p = action(model, torch.Tensor(state).unsqueeze(0)) 89 | log_probs.append(log_p) 90 | new_state, reward, done, info = env.step(a) 91 | batch_rewards.append(reward) 92 | ep_rewards += reward 93 | steps +=1 94 | 95 | 96 | 97 | state = new_state 98 | 99 | 100 | rewards.append(ep_rewards) 101 | steps_taken.append(steps) 102 | print("Episode: {} --- Rewards: {} --- Steps: {}".format(i, ep_rewards, steps)) 103 | policy_optimization(i, model, optimizer, batch_rewards,log_probs) 104 | 105 | return steps_taken 106 | 107 | def main(): 108 | USE_CUDA = torch.cuda.is_available() 109 | Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs) 110 | 111 | env = gym.make("Acrobot-v1") 112 | env = wrappers.Monitor(env, "Saved_Videos/", resume=True, force = True, video_callable=lambda episode_id: episode_id%40==0) 113 | obs_shape = env.observation_space.shape 114 | action_shape = env.action_space.n 115 | episodes = 240 116 | model = Policy(obs_shape, action_shape) 117 | steps = run(episodes, model, env) 118 | 119 | plt.plot(steps) 120 | plt.xlabel("Episodes") 121 | plt.ylabel("Steps needed to reach goal") 122 | plt.show() 123 | 124 | if __name__ == "__main__": 125 | #Argparse: 126 | main() 127 | -------------------------------------------------------------------------------- /Q_Learning/FrozenLake_q-table.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy 
as np 3 | import gym 4 | import time 5 | 6 | EPISODES = 5000 7 | TRYS = 100 8 | EPSILON = 0.9 # epsilon greedy 9 | ALPHA = 0.1 # learning rate 10 | GAMMA = 0.9 #discount factor 11 | 12 | 13 | 14 | 15 | def make_Q_table(actions,n_states): 16 | table = pd.DataFrame( 17 | np.zeros((n_states, actions)), columns = list(range(actions))) # q_table initial values 18 | # print(table) # show table 19 | return table 20 | 21 | def choose_action(state, q_table): 22 | state_actions = q_table.iloc[state, :] 23 | if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()): # act non-greedy or state-action have no value 24 | action_name = np.random.choice(ACTIONS) 25 | else: # act greedy 26 | action_name = state_actions.idxmax() # replace argmax to idxmax as argmax means a different function in newer version of pandas 27 | #print("Action_choosen: "+str(action_name)) 28 | return action_name 29 | 30 | 31 | def RL(ACTIONS,N_SPACE): 32 | q_table = make_Q_table(ACTIONS,N_SPACE) 33 | for episode in range(EPISODES): 34 | S = env.reset() 35 | for one_try in range(TRYS): #how long one epidsode lasts 36 | 37 | env.render() 38 | A = choose_action(S, q_table) 39 | 40 | S_,R,done,info = env.step(A) 41 | #print(S_) 42 | #time.sleep(1) 43 | q_old = q_table.loc[S, A] #Current Q-Value of the state 44 | q_learned = R + GAMMA * q_table.iloc[S_, :].max() 45 | q_table.loc[S, A] += ALPHA * (q_learned - q_old) # update 46 | S = S_ # move to next state 47 | if done: 48 | print("Episode finished after {} timesteps".format(one_try+1)) 49 | break 50 | 51 | 52 | return q_table 53 | 54 | 55 | if __name__ =="__main__": 56 | env = gym.make("FrozenLake-v0") 57 | print(gym.__version__) 58 | 59 | env.reset() 60 | # getting space and action 61 | ACTIONS = env.action_space.n #env.unwrapped.get_action_meanings() to get a list of the action names 62 | N_SPACE = env.observation_space.n 63 | #print(ACTIONS) 64 | #print(N_SPACE) 65 | q_table = RL(ACTIONS,N_SPACE) 66 | print("Q-Table: \n") 67 | print(q_table) 68 | 69 | 70 | -------------------------------------------------------------------------------- /Q_Learning/Img/Q_table10000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Q_Learning/Img/Q_table10000.png -------------------------------------------------------------------------------- /Q_Learning/Img/Q_value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Q_Learning/Img/Q_value.png -------------------------------------------------------------------------------- /Q_Learning/Img/Receivedrewards.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Q_Learning/Img/Receivedrewards.png -------------------------------------------------------------------------------- /Q_Learning/Img/steps_taken.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Q_Learning/Img/steps_taken.png -------------------------------------------------------------------------------- 
/Q_Learning/Q_Table_E10000_a0.09_g0.9_eps0.9.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Q_Learning/Q_Table_E10000_a0.09_g0.9_eps0.9.pkl -------------------------------------------------------------------------------- /Q_Learning/Q_Table_E3000_a0.09_g0.9_eps0.9.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Q_Learning/Q_Table_E3000_a0.09_g0.9_eps0.9.pkl -------------------------------------------------------------------------------- /Q_Learning/Readme.md: -------------------------------------------------------------------------------- 1 | [image1]: ./Img/Q_value.png "Q-Value update equation" 2 | [image2]: ./Img/Q_table10000.png "Q-Table after 10000 episodes" 3 | [image3]: ./Img/Receivedrewards.png "Received rewards" 4 | [image4]: ./Img/steps_taken.png "Steps taken" 5 | 6 | 7 | 8 | 9 | 10 | # Q-Learning and Q-Table 11 | 12 | ## Creating the Q-Table 13 | The Q-Table is built from the number of states (n_states) and the number of actions (n_actions) and forms a matrix of shape n_states x n_actions. 14 | 15 | This already shows the limitations of tabular Q-learning: the number of states has to be finite and not too large, and the state space is not allowed to change during the game. 16 | 17 | ## Calculating the Q-Values 18 | 19 | The Q-Values are updated at every step with this formula: 20 | 21 | ![alt text][image1] 22 | 23 | There are limitations here as well. Since the Q-Values depend on the received rewards, and most of the time the only reward is given when reaching the goal state, it must be possible to reach the goal state through random actions. Otherwise the Q-Table stays a table of zeros. 24 | 25 | ## Testing on OpenAI Gym's FrozenLake environment 26 | After training for 10000 episodes, the following Q-Table was obtained: 27 | 28 | ![alt text][image2] 29 | 30 | Looking at the received rewards over the episodes, one can see that after roughly episode 1500 almost every subsequent try received a reward of 1, i.e. won the game. 31 | 32 | ![alt text][image3] 33 | 34 | The same holds for the steps taken: the number of steps per episode increases, which happens because the episodes are no longer ended early by failing.
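For reference, the update rule described above can be written in a few lines of NumPy. This is only a minimal sketch with made-up transition values; the scripts in this folder (`FrozenLake_q-table.py`, `train_FrozenLake_Qtable.py`) use a pandas DataFrame as the Q-Table, but apply the same rule.

```python
import numpy as np

ALPHA, GAMMA = 0.1, 0.9   # learning rate and discount factor, as in FrozenLake_q-table.py
Q = np.zeros((16, 4))     # FrozenLake-v0 Q-Table: n_states x n_actions

# one hypothetical transition that reaches the goal state (values made up for illustration)
S, A, R, S_ = 14, 2, 1.0, 15
q_learned = R + GAMMA * Q[S_].max()        # bootstrapped target
Q[S, A] += ALPHA * (q_learned - Q[S, A])   # Q-learning update
print(Q[S, A])                             # 0.1 after this first successful step
```

The plot below shows the steps taken per episode during training: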
35 | 36 | ![alt text][image4] 37 | 38 | -------------------------------------------------------------------------------- /Q_Learning/play_FrozenLake_Q_table.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import numpy as np 3 | import pandas as pd 4 | import time 5 | import gym 6 | import argparse 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("-e", "--Episoden",type = int,help ="Die Anzahl der zu trainierenden Episoden") 10 | parser.add_argument("-v", "--Video",type = bool,help ="Sollen die Versuche in einem Video aufgezeichnet werden?") 11 | parser.add_argument("-q", "--Q_Table",type = str,help ="Name der Q_table mit der gespielt werden soll") 12 | 13 | 14 | args = parser.parse_args() 15 | 16 | EPISODES = args.Episoden 17 | TRYS = 100 18 | AUFZEICHNUNG = args.Video 19 | Q_Table_name = args.Q_Table 20 | 21 | 22 | 23 | 24 | def load_Qtable(Q_table): 25 | Q = pd.read_pickle(Q_table) 26 | return Q 27 | 28 | 29 | def choose_action(state,Q_table): 30 | state_actions = Q_table.iloc[state, :] 31 | action_name1 = state_actions.idxmax() 32 | state_actions.pop(action_name1) 33 | action_name2 = state_actions.idxmax() 34 | if (np.random.uniform() > 0.4): 35 | print("Best action choosen!") 36 | return action_name1 37 | else: 38 | print("Second-best-action choosen!") 39 | return action_name2 40 | 41 | def play(): 42 | Q_Table = load_Qtable(Q_Table_name) 43 | for episode in range(EPISODES): 44 | S = env.reset() 45 | for one_try in range(TRYS): #how long one epidsode lasts 46 | 47 | env.render() 48 | A = choose_action(S, Q_Table) 49 | print("Action choosen: {}".format(A)) 50 | S_,R,done,info = env.step(A) 51 | #print(S_) 52 | time.sleep(2) 53 | 54 | # Addapting for further learning 55 | #print() 56 | #q_old = q_table.loc[S, A] #Current Q-Value of the state 57 | #q_learned = R + GAMMA * q_table.iloc[S_, :].max() 58 | #q_table.loc[S, A] += ALPHA * (q_learned - q_old) # update 59 | #S = S_ # move to next state 60 | 61 | if done: 62 | print("Episode finished after {} timesteps".format(one_try+1)) 63 | break 64 | 65 | 66 | if __name__ =="__main__": 67 | 68 | 69 | env = gym.make("FrozenLake-v0") 70 | print(gym.__version__) 71 | env.reset() 72 | 73 | play() 74 | 75 | # 0 - Down 76 | # 1 - -------------------------------------------------------------------------------- /Q_Learning/train_FrozenLake_Qtable.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import gym 4 | import time 5 | import matplotlib.pyplot as plt 6 | import argparse 7 | 8 | 9 | 10 | def make_Q_table(actions,n_states): 11 | table = pd.DataFrame( 12 | np.zeros((n_states, actions)), columns = list(range(actions))) # q_table initial values 13 | # print(table) # show table 14 | return table 15 | 16 | def choose_action(state, q_table): 17 | state_actions = q_table.iloc[state, :] 18 | if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()): # act non-greedy or state-action have no value 19 | action_name = np.random.choice(ACTIONS) 20 | else: # act greedy 21 | action_name = state_actions.idxmax() # replace argmax to idxmax as argmax means a different function in newer version of pandas 22 | #print("Action_choosen: "+str(action_name)) 23 | return action_name 24 | 25 | 26 | def RL(ACTIONS,N_SPACE): 27 | q_table = make_Q_table(ACTIONS,N_SPACE) 28 | reward_list = [] 29 | try_list = [] 30 | 31 | for episode in range(EPISODES): 32 | S = env.reset() 33 | rewards = 0 34 | steps = 
0 35 | for one_try in range(TRYS): #how long one epidsode lasts 36 | 37 | env.render() 38 | A = choose_action(S, q_table) 39 | 40 | S_,R,done,info = env.step(A) 41 | #print(S_) 42 | #time.sleep(1) 43 | print() 44 | q_old = q_table.loc[S, A] #Current Q-Value of the state 45 | q_learned = R + GAMMA * q_table.iloc[S_, :].max() 46 | q_table.loc[S, A] += ALPHA * (q_learned - q_old) # update 47 | S = S_ # move to next state 48 | rewards += R 49 | steps = one_try 50 | if done: 51 | print("Episode finished after {} timesteps".format(one_try+1)) 52 | steps = one_try+1 53 | break 54 | reward_list.append(rewards) 55 | try_list.append(steps+1) 56 | 57 | 58 | return q_table,reward_list,try_list 59 | 60 | 61 | 62 | 63 | 64 | 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("-e", "--Episoden",type = int,help ="Die Anzahl der zu trainierenden Episoden") 67 | parser.add_argument("-a", "--Alpha",type = float,help ="Learning Rate ~0.1") 68 | parser.add_argument("-g", "--Gamma",type = float,help ="Discount Factor ~0.9") 69 | parser.add_argument("-eps", "--Epsilon",type = float,help ="Epsilon- for the Epsilon-Greedy decision process ~0.9") 70 | 71 | args = parser.parse_args() 72 | 73 | EPISODES = args.Episoden 74 | TRYS = 100 75 | EPSILON = args.Epsilon # epsilon greedy 76 | ALPHA = args.Alpha # learning rate 77 | GAMMA = args.Gamma #discount factor 78 | 79 | if __name__ =="__main__": 80 | 81 | 82 | env = gym.make("FrozenLake-v0") 83 | print(gym.__version__) 84 | env.reset() 85 | # getting space and action 86 | ACTIONS = env.action_space.n #env.unwrapped.get_action_meanings() to get a list of the action names 87 | N_SPACE = env.observation_space.n 88 | #print(ACTIONS) 89 | #print(N_SPACE) 90 | q_table,rlist,steps = RL(ACTIONS,N_SPACE) 91 | 92 | plt.plot(rlist) 93 | plt.title("Received Rewards") 94 | plt.xlabel("Epochs") 95 | plt.ylabel("Rewards") 96 | plt.show() 97 | 98 | plt.plot(steps) 99 | plt.title("Needed steps to finish one episode") 100 | plt.xlabel("Epochs") 101 | plt.ylabel("Steps") 102 | plt.show() 103 | 104 | 105 | 106 | 107 | 108 | print("Q-Table: \n") 109 | print(q_table) 110 | 111 | print("\nDo you want to save the Q-Table? 
\n") 112 | answer = input("[y/n]") 113 | 114 | if answer == "y": 115 | q_table.to_pickle("./Q_Table_E{}_a{}_g{}_eps{}.pkl".format(EPISODES,ALPHA,GAMMA,EPSILON)) 116 | else: 117 | pass 118 | 119 | -------------------------------------------------------------------------------- /Q_Learning/treasure_q.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import time 4 | 5 | np.random.seed(2) # reproducible 6 | 7 | 8 | N_STATES = 6 # the length of the 1 dimensional world 9 | ACTIONS = ['left', 'right'] # available actions 10 | EPSILON = 0.9 # greedy police 11 | ALPHA = 0.1 # learning rate 12 | GAMMA = 0.9 # discount factor 13 | MAX_EPISODES = 18 # maximum episodes 14 | FRESH_TIME = 0.3 # fresh time for one move 15 | 16 | 17 | def build_q_table(n_states, actions): 18 | table = pd.DataFrame( 19 | np.zeros((n_states, len(actions))), # q_table initial values 20 | columns=actions, # actions's name 21 | ) 22 | # print(table) # show table 23 | return table 24 | 25 | 26 | def choose_action(state, q_table): 27 | # This is how to choose an action 28 | state_actions = q_table.iloc[state, :] 29 | if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()): # act non-greedy or state-action have no value 30 | action_name = np.random.choice(ACTIONS) 31 | else: # act greedy 32 | action_name = state_actions.idxmax() # replace argmax to idxmax as argmax means a different function in newer version of pandas 33 | return action_name 34 | 35 | 36 | def get_env_feedback(S, A): 37 | # This is how agent will interact with the environment 38 | if A == 'right': # move right 39 | if S == N_STATES - 2: # terminate 40 | S_ = 'terminal' 41 | R = 1 42 | else: 43 | S_ = S + 1 44 | R = 0 45 | else: # move left 46 | R = 0 47 | if S == 0: 48 | S_ = S # reach the wall 49 | else: 50 | S_ = S - 1 51 | return S_, R 52 | 53 | 54 | def update_env(S, episode, step_counter): 55 | # This is how environment be updated 56 | env_list = ['-']*(N_STATES-1) + ['T'] # '---------T' our environment 57 | if S == 'terminal': 58 | interaction = 'Episode %s: total_steps = %s' % (episode+1, step_counter) 59 | print('\r{}'.format(interaction), end='') 60 | time.sleep(2) 61 | print('\r ', end='') 62 | else: 63 | env_list[S] = 'o' 64 | interaction = ''.join(env_list) 65 | print('\r{}'.format(interaction), end='') 66 | time.sleep(FRESH_TIME) 67 | 68 | 69 | def rl(): 70 | # main part of RL loop 71 | q_table = build_q_table(N_STATES, ACTIONS) 72 | for episode in range(MAX_EPISODES): 73 | step_counter = 0 74 | S = 0 75 | is_terminated = False 76 | update_env(S, episode, step_counter) 77 | while not is_terminated: 78 | 79 | A = choose_action(S, q_table) 80 | S_, R = get_env_feedback(S, A) # take action & get next state and reward 81 | q_predict = q_table.loc[S, A] 82 | if S_ != 'terminal': 83 | q_target = R + GAMMA * q_table.iloc[S_, :].max() # next state is not terminal 84 | else: 85 | q_target = R # next state is terminal 86 | is_terminated = True # terminate this episode 87 | 88 | q_table.loc[S, A] += ALPHA * (q_target - q_predict) # update 89 | S = S_ # move to next state 90 | 91 | update_env(S, episode, step_counter+1) 92 | step_counter += 1 93 | return q_table 94 | 95 | 96 | if __name__ == "__main__": 97 | q_table = rl() 98 | print('\r\nQ-table:\n') 99 | print(q_table) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 
Deep-Reinforcement-Learning 2 | 3 | 4 | ![Logo](/imgs/web-3706562_640.jpg) 5 | 6 | Collection of Deep Reinforcement Learning Algorithms in PyTorch. 7 | 8 | 9 | Below a list of Jupyter Notebooks with implementations 10 | 11 | # Value Based / Offline Methods 12 | ## Discrete Action Space 13 | 14 | - [Q-Learning](Q_Learning)     [Source/Paper](/Paper/DQN.pdf) 15 | 16 | - [DQN](https://github.com/BY571/Reinforcement-Learning/tree/master/Deep%20Q_Learning)      [Paper](/Paper/DQN.pdf) 17 | 18 | - [Double DQN](https://github.com/BY571/Reinforcement-Learning/tree/master/Double%20DQN)      [Paper](/Paper/Double_DQN.pdf) 19 | 20 | - [Dueling DQN](https://github.com/BY571/DQN-Atari-Agents)      [Paper](/Paper/Dueling.pdf) 21 | 22 | - [N-Step DQN](https://github.com/BY571/DQN-Atari-Agents) 23 | 24 | - [Noisy DQN](https://github.com/BY571/DQN-Atari-Agents) 25 |      [Paper](/Paper/Noisy_networks.pdf) 26 | 27 | - [Rainbow](https://github.com/BY571/DQN-Atari-Agents) 28 |     [Paper](https://arxiv.org/pdf/1710.02298.pdf) 29 | 30 | ## Distributional RL 31 | 32 | - [Categorical DQN - C51](https://github.com/BY571/DQN-Atari-Agents)     [Paper](https://github.com/BY571/Reinforcement-Learning/blob/master/Paper/Distributional%20DQN.pdf) 33 | 34 | - [QR-DQN](https://github.com/BY571/QR-DQN) 35 | 36 | - [IQN](https://github.com/BY571/IQN-and-Extensions) 37 | 38 | - [FQF](https://github.com/BY571/FQF-and-Extensions) 39 | 40 | 41 | ## Continuous Action Space 42 | 43 | - [NAF - Normalized Advantage Function](https://github.com/BY571/Normalized-Advantage-Function-NAF-) 44 | 45 | -[Soft-DQN] TODO 46 | _________________________________________________ 47 | # Policy Based / Online Methods 48 | ## Discrete Action Space 49 | 50 | 51 | - [Sarsa](https://github.com/BY571/Reinforcement-Learning/blob/master/Temporal%20Difference%20(Sarsa%2C%20Sarsamax%2C%20Expeted%20Sarsa)/Temporal_Difference.ipynb) 52 | [Source/Paper] 53 | 54 | 55 | - [Vanilla Policy Gradient](https://github.com/BY571/Reinforcement-Learning/blob/master/Policy%20Gradient%20Algorithms/Policy_Gradien_%2B_Baseline_mean.ipynb) [+LSTM](https://github.com/BY571/Reinforcement-Learning/blob/master/Policy%20Gradient%20Algorithms/PolicyGradient_LSTM.ipynb) 56 | [Source/Paper] 57 | 58 | 59 | - A2C 60 | [Paper](/Paper/A3C.pdf) 61 | 62 | - A2C with gae* [TODO] 63 | 64 | - A2C multi environment 65 | 66 | 67 | - PPO 68 | [Paper](/Paper/PPO.pdf) 69 | 70 | - PPO with gae* 71 | 72 | - [PPO with gae and curiosity driven exploration (single, digit inputs)](https://github.com/BY571/Reinforcement-Learning/blob/master/PPO_gae_curios.ipynb) [Paper](/Paper/) 73 | 74 | - PPO multi environment 75 | 76 | 77 | ## Continuous Action Space 78 | 79 | - [A2C](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/A2C_conti_seperate_networks.ipynb) 80 | 81 | - A2C with gae* [TODO] 82 | 83 | - [A2C multi environment](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/A2C_continuous_multienv.ipynb) 84 | 85 | 86 | - [PPO](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/PPO_unity_Crawler.ipynb) 87 | 88 | - [PPO with gae*](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/ROBOSCHOOL_PPO_GAE.ipynb)[PPO with gae multi](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/PPO_conti_gae_multi.ipynb) 89 | 90 | - 
[PPO+curiosity&single](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/PPO_conti_gae_curios.ipynb)[+curiosity&multi](https://github.com/BY571/Reinforcement-Learning/blob/master/PPO_conti_gae_curio_multi.ipynb) 91 | 92 | - [PPO multi environment](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/PPO_unity_Crawler.ipynb) 93 | 94 | 95 | 96 | 97 | gae* = Generalized Advantage Estimation [Source](/Paper/GAE.pdf) 98 | 99 | ______________________________________________ 100 | 101 | # Actor-Critic Algorithms 102 | 103 | - [DDPG](https://github.com/BY571/Udacity-DRL-Nanodegree-P2) 104 | [Source/Paper] 105 | 106 | - [D4PG](https://github.com/BY571/D4PG) 107 | [Source/Paper](https://arxiv.org/pdf/1804.08617.pdf) 108 | 109 | - [Twin Delayed DDPG (TD3)](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/TD3_conti.ipynb) 110 |     [Paper](https://github.com/BY571/Reinforcement-Learning/blob/master/Paper/TD3.pdf) 111 | 112 | - [Soft Actor Critic (SAC-newest 2019 version)](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/SAC.ipynb) 113 |     [Paper](https://github.com/BY571/Reinforcement-Learning/blob/master/Paper/SAC_2019.pdf) 114 | 115 | ________________________________________________ 116 | 117 | # Upside-Down-Reinforcement-Learning 118 | Discrete and continuous action space implementation of [⅂ꓤ](https://github.com/BY571/Upside-Down-Reinforcement-Learning) 119 | 120 | ________________________________________________ 121 | # Munchausen Reinforcement Learning 122 | 123 | Implementations of Munchausen RL 124 | 125 | - [M-DQN](https://github.com/BY571/Munchausen-RL) 126 | 127 | - [M-IQN](https://github.com/BY571/IQN-and-Extensions) 128 | 129 | - [M-FQF](https://github.com/BY571/FQF-and-Extensions) 130 | 131 | - [M-SAC](https://github.com/BY571/Soft-Actor-Critic-and-Extensions) 132 | 133 | 134 | ________________________________________________ 135 | 136 | # Model-Based RL 137 | 138 | __________________________________________________ 139 | 140 | # Black-Box Optimization 141 | 142 | - [Evolution Strategies]() [with multiprocessing](https://github.com/BY571/Reinforcement-Learning/blob/master/Black-Box%20Optimization/Evolution_Strategies_parallel+novelty/README.md) [and novelty search](https://github.com/BY571/Reinforcement-Learning/blob/master/Black-Box%20Optimization/Evolution_Strategies_parallel+novelty/README.md) 143 | 144 | - [Genetic Algorithm - GARNE](https://github.com/BY571/GARNE-Genetic-Algorithm-with-Recurrent-Network-and-Novelty-Exploration/blob/master/README.md) 145 | - Genetic Algorithm implementation with LSTM, Multiprocessing over several CPUs and Novelty Search for Exploration 146 | __________________________________________ 147 | # Multi-Agent Deep Reinforcement Learning 148 | 149 | - [Multi-Agent-DDPG](https://github.com/BY571/Udacity-DRL-Nanodegree-P3-Multiagent-RL-) 150 | 151 | # Hyperparameter Tuning 152 | 153 | Gridsearch 154 | 155 | Random Forest [TODO] 156 | 157 | Genetic Algorithm [TODO] 158 | 159 | ==================================== 160 | 161 | 162 | -------------------------------------------------------------------------------- /Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/README.md: -------------------------------------------------------------------------------- 1 | # Taxi Problem 2 | 3 | ### Getting Started 4 | 5 | Read the description of the environment in subsection 3.1 of [this paper](https://arxiv.org/pdf/cs/9905014.pdf).
You can verify that the description in the paper matches the OpenAI Gym environment by peeking at the code [here](https://github.com/openai/gym/blob/master/gym/envs/toy_text/taxi.py). 6 | 7 | 8 | ### Instructions 9 | 10 | The repository contains three files: 11 | - `agent.py`: Develop your reinforcement learning agent here. This is the only file that you should modify. 12 | - `monitor.py`: The `interact` function tests how well your agent learns from interaction with the environment. 13 | - `main.py`: Run this file in the terminal to check the performance of your agent. 14 | 15 | Begin by running the following command in the terminal: 16 | ``` 17 | python main.py 18 | ``` 19 | 20 | When you run `main.py`, the agent that you specify in `agent.py` interacts with the environment for 20,000 episodes. The details of the interaction are specified in `monitor.py`, which returns two variables: `avg_rewards` and `best_avg_reward`. 21 | - `avg_rewards` is a deque where `avg_rewards[i]` is the average (undiscounted) return collected by the agent from episodes `i+1` to episode `i+100`, inclusive. So, for instance, `avg_rewards[0]` is the average return collected by the agent over the first 100 episodes. 22 | - `best_avg_reward` is the largest entry in `avg_rewards`. This is the final score that you should use when determining how well your agent performed in the task. 23 | 24 | Your assignment is to modify the `agents.py` file to improve the agent's performance. 25 | - Use the `__init__()` method to define any needed instance variables. Currently, we define the number of actions available to the agent (`nA`) and initialize the action values (`Q`) to an empty dictionary of arrays. Feel free to add more instance variables; for example, you may find it useful to define the value of epsilon if the agent uses an epsilon-greedy policy for selecting actions. 26 | - The `select_action()` method accepts the environment state as input and returns the agent's choice of action. The default code that we have provided randomly selects an action. 27 | - The `step()` method accepts a (`state`, `action`, `reward`, `next_state`) tuple as input, along with the `done` variable, which is `True` if the episode has ended. The default code (which you should certainly change!) increments the action value of the previous state-action pair by 1. You should change this method to use the sampled tuple of experience to update the agent's knowledge of the problem. 28 | 29 | Once you have modified the function, you need only run `python main.py` to test your new agent. 30 | 31 | OpenAI Gym [defines "solving"](https://gym.openai.com/envs/Taxi-v1/) this task as getting average return of 9.7 over 100 consecutive trials. 
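For orientation, here is one possible shape such an agent can take: an epsilon-greedy `select_action()` and a Sarsamax (Q-learning) update in `step()`. This is only a sketch; the class name, default hyperparameters, and the fixed epsilon are placeholder choices. The `agent.py` in this folder implements the same idea and receives a decaying epsilon from `monitor.py`.

```python
import numpy as np
from collections import defaultdict


class SketchAgent:
    """Minimal epsilon-greedy Sarsamax (Q-learning) agent skeleton."""

    def __init__(self, nA=6, alpha=0.02, gamma=0.9):
        self.nA = nA
        self.alpha = alpha
        self.gamma = gamma
        self.Q = defaultdict(lambda: np.zeros(nA))

    def select_action(self, state, epsilon=0.1):
        # explore with probability epsilon, otherwise act greedily on Q[state]
        if np.random.random() < epsilon:
            return np.random.randint(self.nA)
        return int(np.argmax(self.Q[state]))

    def step(self, state, action, reward, next_state, done):
        # Sarsamax target: bootstrap from the best next action, zero at episode end
        target = 0.0 if done else np.max(self.Q[next_state])
        self.Q[state][action] += self.alpha * (reward + self.gamma * target
                                               - self.Q[state][action])
```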
32 | -------------------------------------------------------------------------------- /Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/__pycache__/agent.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/__pycache__/agent.cpython-37.pyc -------------------------------------------------------------------------------- /Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/__pycache__/monitor.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/__pycache__/monitor.cpython-37.pyc -------------------------------------------------------------------------------- /Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import defaultdict 3 | 4 | 5 | class Agent: 6 | 7 | def __init__(self, nA=6, learning_rate = 0.02, gamma = 0.9): 8 | """ Initialize agent. 9 | 10 | Params 11 | ====== 12 | - nA: number of actions available to the agent 13 | """ 14 | self.nA = nA 15 | self.Q = defaultdict(lambda: np.zeros(self.nA)) 16 | self.lr = learning_rate 17 | self.gamma = gamma 18 | 19 | 20 | def probabilities(self,q, epsilon): 21 | probs = np.ones(self.nA) * epsilon/self.nA 22 | best_action = np.argmax(q) 23 | probs[best_action] = (1 - epsilon) + epsilon/self.nA 24 | return probs 25 | 26 | def select_action(self, state, epsilon): 27 | """ Given the state, select an action. 28 | 29 | Params 30 | ====== 31 | - state: the current state of the environment 32 | 33 | Returns 34 | ======= 35 | - action: an integer, compatible with the task's action space 36 | """ 37 | action = np.random.choice(np.arange(self.nA), p = self.probabilities(self.Q[state], epsilon)) \ 38 | if state in self.Q else np.random.choice(np.arange(self.nA)) 39 | return action 40 | 41 | 42 | 43 | def step(self, state, action, reward, next_state, done): 44 | """ Update the agent's knowledge, using the most recently sampled tuple. 
45 | 46 | Params 47 | ====== 48 | - state: the previous state of the environment 49 | - action: the agent's previous choice of action 50 | - reward: last reward received 51 | - next_state: the current state of the environment 52 | - done: whether the episode is complete (True or False) 53 | """ 54 | Q_target = np.max(self.Q[next_state]) 55 | self.Q[state][action] = self.Q[state][action] + self.lr * (reward + self.gamma*(1-done)*Q_target - self.Q[state][action] ) -------------------------------------------------------------------------------- /Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/main.py: -------------------------------------------------------------------------------- 1 | from agent import Agent 2 | from monitor import interact 3 | import gym 4 | import numpy as np 5 | 6 | env = gym.make('Taxi-v2') 7 | agent = Agent() 8 | avg_rewards, best_avg_reward = interact(env, agent) -------------------------------------------------------------------------------- /Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/monitor.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import sys 3 | import math 4 | import numpy as np 5 | 6 | def interact(env, agent, num_episodes=20000, window=100,epsilon_start = 1, epsilon_decay = 0.9999, epsilon_min = 0.01): 7 | """ Monitor agent's performance. 8 | 9 | Params 10 | ====== 11 | - env: instance of OpenAI Gym's Taxi-v1 environment 12 | - agent: instance of class Agent (see Agent.py for details) 13 | - num_episodes: number of episodes of agent-environment interaction 14 | - window: number of episodes to consider when calculating average rewards 15 | 16 | Returns 17 | ======= 18 | - avg_rewards: deque containing average rewards 19 | - best_avg_reward: largest value in the avg_rewards deque 20 | """ 21 | # initialize average rewards 22 | avg_rewards = deque(maxlen=num_episodes) 23 | # initialize best average reward 24 | best_avg_reward = -math.inf 25 | # initialize monitor for most recent rewards 26 | samp_rewards = deque(maxlen=window) 27 | # for each episode 28 | epsilon = epsilon_start 29 | for i_episode in range(1, num_episodes+1): 30 | # begin the episode 31 | state = env.reset() 32 | # initialize the sampled reward 33 | samp_reward = 0 34 | while True: 35 | epsilon = max(epsilon*epsilon_decay,epsilon_min) 36 | # agent selects an action 37 | action = agent.select_action(state, epsilon) # 38 | # agent performs the selected action 39 | next_state, reward, done, _ = env.step(action) 40 | # agent performs internal updates based on sampled experience 41 | agent.step(state, action, reward, next_state, done) 42 | # update the sampled reward 43 | samp_reward += reward 44 | # update the state (s <- s') to next time step 45 | state = next_state 46 | if done: 47 | # save final sampled reward 48 | samp_rewards.append(samp_reward) 49 | break 50 | if (i_episode >= 100): 51 | # get average reward from last 100 episodes 52 | avg_reward = np.mean(samp_rewards) 53 | # append to deque 54 | avg_rewards.append(avg_reward) 55 | # update best average reward 56 | if avg_reward > best_avg_reward: 57 | best_avg_reward = avg_reward 58 | # monitor progress 59 | print("\rEpisode {}/{} || Best average reward {} || Epsilon {}".format(i_episode, num_episodes, best_avg_reward, epsilon), end="") 60 | sys.stdout.flush() 61 | # check if task is solved (according to OpenAI Gym) 62 | if best_avg_reward >= 9.7: 63 | print('\nEnvironment solved in {} episodes.'.format(i_episode), end="") 64 | break 
65 | if i_episode == num_episodes: print('\n') 66 | return avg_rewards, best_avg_reward -------------------------------------------------------------------------------- /imgs/web-3706562_640.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/imgs/web-3706562_640.jpg --------------------------------------------------------------------------------