├── Black-Box Optimization ├── Evolution_Strategies_parallel+novelty │ ├── ES_baseline_parallel.py │ ├── ES_disc_parallel_novelty.py │ ├── ES_own_conti_parallel.py │ ├── README.md │ └── imgs │ │ └── pendulum.png ├── Evolutionary_Strategies_Cartpole.ipynb ├── README.md ├── genetic_algorithm_base.py └── imgs │ └── ga_cartpole.png ├── Categorical_DQN.ipynb ├── ContinousControl ├── A2C_conti_seperate_networks.ipynb ├── A2C_continuous_multienv.ipynb ├── DDPG.py ├── MultiPro.py ├── PPO_conti_gae_curios.ipynb ├── PPO_conti_gae_multi.ipynb ├── PPO_gae_multi.py ├── PPO_test_crawler.ipynb ├── PPO_unity_Crawler.ipynb ├── Parallel_processing.py ├── ROBOSCHOOL_PPO_GAE.ipynb ├── SAC.ipynb ├── SAC_script.py └── TD3_conti.ipynb ├── Cross_entropy ├── Cross_entropy.py ├── README.md └── img │ └── Cross_entropy.png ├── Deep Q_Learning ├── DQN_Experience_Replay.py ├── Img │ ├── 4k Learning_curve.png │ └── Converging.png └── README.md ├── Double DQN ├── CNN_Double_DQN.py ├── Double_DQN.py ├── Imgs │ ├── 4000_40-40.png │ ├── CNN_pong_converge.png │ └── test.png ├── README.md └── wrapper.py ├── Dueling Deep Q-Network ├── CNN_Dueling_DDQN.py ├── CNN_Dueling_DDQN_PER.py ├── Dueling_DQN.ipynb ├── Img │ ├── Duel_per.png │ └── Dueling_DQN.png ├── PrioritizedExperienceReplay.py ├── Video │ ├── Breakout.mp4 │ └── Pong.mp4 └── wrapper.py ├── Grid_search_for_Reinforcement_learning.ipynb ├── Noisy_DQN.ipynb ├── Nstep_DQN.ipynb ├── PPO_conti_gae_curio_multi.ipynb ├── PPO_gae_curios.ipynb ├── Paper ├── A3C.pdf ├── DDPG.pdf ├── DQN.pdf ├── Distributional DQN.pdf ├── Double_DQN.pdf ├── Dueling.pdf ├── GAE.pdf ├── Noisy_networks.pdf ├── PPO.pdf ├── SAC_2019.pdf └── TD3.pdf ├── Policy Gradient Algorithms ├── A2C.ipynb ├── A2C_conti_seperate_networks.ipynb ├── A2C_continous_action_space.ipynb ├── A2C_continuous_multienv.ipynb ├── PPO.ipynb ├── Parallel_processing.py ├── PolicyGradient_LSTM.ipynb ├── Policy_Gradien_+_Baseline_mean.ipynb └── REINFORCE │ ├── Img │ └── Steps_needed.png │ └── REINFORCE.py ├── Q_Learning ├── FrozenLake_q-table.py ├── Img │ ├── Q_table10000.png │ ├── Q_value.png │ ├── Receivedrewards.png │ └── steps_taken.png ├── Q_Table_E10000_a0.09_g0.9_eps0.9.pkl ├── Q_Table_E3000_a0.09_g0.9_eps0.9.pkl ├── Q_Table_own_example.ipynb ├── Readme.md ├── play_FrozenLake_Q_table.py ├── train_FrozenLake_Qtable.py └── treasure_q.py ├── README.md ├── Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa) ├── Temporal_Difference.ipynb └── lab-taxi │ ├── README.md │ ├── __pycache__ │ ├── agent.cpython-37.pyc │ └── monitor.cpython-37.pyc │ ├── agent.py │ ├── main.py │ └── monitor.py └── imgs └── web-3706562_640.jpg /Black-Box Optimization/Evolution_Strategies_parallel+novelty/ES_baseline_parallel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Oct 9 10:24:39 2019 4 | 5 | @author: Z0014354 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import gym 11 | import multiprocessing as mp 12 | import collections 13 | import copy 14 | 15 | ITERS_PER_UPDATE = 10 16 | NOISE_STD = 0.01 17 | LR = 1e-3 18 | PROCESSES_COUNT = 8 # amount of worker 19 | HIDDEN_SIZE = 4 20 | ENV_NAME = "CartPole-v0" 21 | RewardsItem = collections.namedtuple('RewardsItem', field_names=['seed', 'pos_reward', 'neg_reward', 'steps']) 22 | 23 | 24 | 25 | class Model(object): 26 | 27 | def __init__(self, stateCnt, actionCnt, hidden_size = HIDDEN_SIZE): 28 | # inits zero weights 29 | self.weights = [np.zeros(shape=(stateCnt, hidden_size)), 
np.zeros(shape=(hidden_size, hidden_size)), np.zeros(shape=(hidden_size,actionCnt))] 30 | 31 | def predict(self, inp): 32 | out = np.expand_dims(inp.flatten(), 0) 33 | out = out / np.linalg.norm(out) 34 | weight_len = len(self.weights) 35 | for idx, layer in enumerate(self.weights): 36 | # hidden activation 37 | if idx < weight_len - 1: 38 | out = self.activation(np.dot(out, layer)) 39 | # outout activation 40 | else: 41 | out = self.activation(np.dot(out, layer), type_="output_layer") 42 | return out[0] 43 | 44 | def activation(self,x, type_="hidden"): 45 | if type_ == "hidden": 46 | # relu 47 | return np.maximum(x,0) 48 | 49 | # softmax 50 | #return (np.exp(x))/sum(np.exp(x)) 51 | 52 | #softplus 53 | #return np.log(1 + np.exp(x)) 54 | 55 | #sigmoid 56 | #return 1/(1+np.exp(-x)) 57 | 58 | # tanh 59 | #return np.tanh(x) 60 | else: 61 | # softnmax 62 | #return (np.exp(x))/sum(np.exp(x)) 63 | 64 | # relu 65 | return np.maximum(x,0) 66 | 67 | def get_weights(self): 68 | return self.weights 69 | 70 | def set_weights(self, weights): 71 | self.weights = weights 72 | 73 | 74 | def evaluate(env, brain): 75 | """ 76 | Runs an evaluation on the given brain. 77 | """ 78 | state = env.reset() 79 | rewards = 0 80 | steps = 0 81 | while True: 82 | state = np.expand_dims(state, axis=0) 83 | action_prob = brain.predict(state) 84 | action = action_prob.argmax() # for discrete action space 85 | 86 | next_state, reward, done, _ = env.step(action) 87 | rewards += reward 88 | steps += 1 89 | state = next_state 90 | if done: 91 | break 92 | 93 | return rewards, steps 94 | 95 | 96 | def sample_noise(brain): 97 | """ 98 | Sampling noise to a positive and negative noise buffer. 99 | """ 100 | pos = [] 101 | neg = [] 102 | for param in brain.get_weights(): 103 | noise_t = np.random.normal(size = param.shape) 104 | pos.append(noise_t) 105 | neg.append(-noise_t) 106 | return pos, neg 107 | 108 | 109 | def eval_with_noise(env, brain, noise, noise_std): 110 | """ 111 | Evaluates the current brain with added parameter noise 112 | 113 | """ 114 | 115 | old_params = copy.deepcopy(brain.get_weights()) 116 | new_params = [] 117 | for p, p_n in zip(brain.get_weights(), noise): 118 | p += noise_std*p_n 119 | new_params.append(p) 120 | brain.set_weights(new_params) 121 | r, s = evaluate(env, brain) 122 | brain.set_weights(old_params) 123 | return r, s 124 | 125 | 126 | def worker_func(worker_id, params_queue, rewards_queue, noise_std): 127 | #print("worker: {} has started".format(worker_id)) 128 | env = gym.make(ENV_NAME) 129 | net = Model(env.observation_space.shape[0], env.action_space.n) 130 | 131 | while True: 132 | params = params_queue.get() 133 | if params is None: 134 | break 135 | 136 | # set parameters of the queue - equal to: net.load_state_dict(params) 137 | net.set_weights([param for param in params]) 138 | 139 | for _ in range(ITERS_PER_UPDATE): 140 | seed = np.random.randint(low=0, high=65535) 141 | np.random.seed(seed) 142 | noise, neg_noise = sample_noise(net) 143 | pos_reward, pos_steps = eval_with_noise(env, net, noise, noise_std) 144 | neg_reward, neg_steps = eval_with_noise(env, net, neg_noise, noise_std) 145 | #print(_, "\n",noise, pos_reward, neg_reward) 146 | 147 | rewards_queue.put(RewardsItem(seed=seed, pos_reward=pos_reward, neg_reward=neg_reward, steps=pos_steps+neg_steps)) 148 | 149 | pass 150 | 151 | 152 | def train_step(brain, batch_noise, batch_rewards, step_idx): 153 | """ 154 | Optimizes the weights of the NN based on the rewards and noise gathered 155 | """ 156 | # normalize rewards to have 
zero mean and unit variance 157 | norm_reward = np.array(batch_reward) 158 | norm_reward -= np.mean(norm_reward) 159 | s = np.std(norm_reward) 160 | if abs(s) > 1e-6: 161 | norm_reward /= s 162 | 163 | weighted_noise = None 164 | for noise, reward in zip(batch_noise, norm_reward): 165 | if weighted_noise is None: 166 | weighted_noise = [reward * p_n for p_n in noise] 167 | else: 168 | for w_n, p_n in zip(weighted_noise, noise): 169 | w_n += reward * p_n 170 | 171 | 172 | for p, p_update in zip(brain.get_weights(), weighted_noise): 173 | update = p_update / (len(batch_reward)*NOISE_STD) 174 | p += LR * update 175 | 176 | 177 | 178 | if __name__ == "__main__": 179 | 180 | env = gym.make(ENV_NAME) 181 | #env.seed(2) 182 | brain = Model(env.observation_space.shape[0], env.action_space.n) 183 | 184 | iterations = 100 # max iterations to run 185 | 186 | params_queues = [mp.Queue(maxsize=1) for _ in range(PROCESSES_COUNT)] 187 | rewards_queue = mp.Queue(maxsize=ITERS_PER_UPDATE) 188 | 189 | 190 | workers = [] 191 | 192 | 193 | for idx, params_queue in enumerate(params_queues): 194 | proc = mp.Process(target=worker_func, args=(idx, params_queue, rewards_queue, NOISE_STD)) 195 | proc.start() 196 | workers.append(proc) 197 | 198 | print("All started!") 199 | step_idx = 0 200 | reward_history = [] 201 | reward_max =[] 202 | reward_std = [] 203 | 204 | 205 | for step_idx in range(iterations): 206 | # broadcasting network params 207 | params = brain.get_weights() 208 | for q in params_queues: 209 | q.put(params) 210 | 211 | batch_noise = [] 212 | batch_reward = [] 213 | batch_steps_data = [] 214 | batch_steps = 0 215 | results = 0 216 | while True: 217 | while not rewards_queue.empty(): 218 | reward = rewards_queue.get_nowait() 219 | np.random.seed(reward.seed) # sets the seed of the current worker rewards 220 | noise, neg_noise = sample_noise(brain) 221 | batch_noise.append(noise) 222 | batch_reward.append(reward.pos_reward) 223 | batch_noise.append(neg_noise) 224 | batch_reward.append(reward.neg_reward) 225 | results += 1 226 | batch_steps += reward.steps 227 | 228 | if results == PROCESSES_COUNT * ITERS_PER_UPDATE: 229 | break 230 | 231 | step_idx += 1 232 | m_reward = np.mean(batch_reward) 233 | reward_history.append(m_reward) 234 | reward_max.append(np.max(batch_reward)) 235 | reward_std.append(np.std(batch_reward)) 236 | if m_reward > 199: 237 | print("\nSolved the environment in {} steps".format(step_idx)) 238 | break 239 | train_step(brain, batch_noise, batch_reward, step_idx) 240 | 241 | print("\rStep: {}, Mean_Reward: {:.2f}".format(step_idx, m_reward), end = "", flush = True) 242 | 243 | 244 | for worker, p_queue in zip(workers, params_queues): 245 | p_queue.put(None) 246 | worker.join() 247 | 248 | plt.figure(figsize = (11,7)) 249 | plt.plot(reward_history, label = "Mean Reward", color = "orange") 250 | plt.plot(reward_max, label = "Max Reward", color = "blue") 251 | plt.plot(reward_std, label = "Reward std", color = "green") 252 | plt.xlabel("Steps") 253 | plt.ylabel("Rewards") 254 | plt.legend() 255 | plt.show() -------------------------------------------------------------------------------- /Black-Box Optimization/Evolution_Strategies_parallel+novelty/ES_disc_parallel_novelty.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Oct 9 10:24:39 2019 4 | 5 | @author: Z0014354 6 | """ 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from torch.distributions import Normal 
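# ---------------------------------------------------------------------------
# Hedged sketch (added for illustration, not part of the original scripts):
# the train_step() functions in this folder implement mirrored-sampling ES.
# Each noise sample eps is evaluated at theta + sigma*eps and theta - sigma*eps,
# the collected rewards are normalized, and the parameters move roughly along
#     theta <- theta + lr / (N * sigma) * sum_i R_i * eps_i
# A minimal, self-contained numpy version of that update, assuming `weights`
# is a list of numpy arrays and each batch_noise[i] is a matching list:
import numpy as np

def es_update(weights, batch_noise, batch_rewards, lr=1e-3, noise_std=0.01):
    """Apply one mirrored-sampling ES update in place (illustrative only)."""
    rewards = np.asarray(batch_rewards, dtype=np.float64)
    rewards = rewards - rewards.mean()
    std = rewards.std()
    if std > 1e-6:
        rewards = rewards / std
    for layer_idx, param in enumerate(weights):
        # reward-weighted sum of the noise that was added to this layer
        grad = sum(r * noise[layer_idx] for r, noise in zip(rewards, batch_noise))
        param += lr * grad / (len(batch_rewards) * noise_std)
# ---------------------------------------------------------------------------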
11 | 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | import gym 15 | import torch.multiprocessing as mp 16 | import collections 17 | from collections import deque 18 | import copy 19 | from tensorboardX import SummaryWriter 20 | from sklearn.neighbors import NearestNeighbors 21 | 22 | ITERS_PER_UPDATE = 10 23 | NOISE_STD = 0.1 #0.04 higher std leeds to better exploration - more stable learning 24 | LR = 2e-2 25 | PROCESSES_COUNT = 6 # amount of worker default 6 26 | HIDDEN_SIZE = 5 # 6 27 | K_NEIGHBORS = 10 28 | ENV_NAME = "CartPole-v0" #"Alien-ram-v0" 29 | RewardsItem = collections.namedtuple('RewardsItem', field_names=['seed', 'pos_reward', 'neg_reward', 'steps']) 30 | 31 | 32 | 33 | class Model(nn.Module): 34 | def __init__(self, state_size, action_size, idx, hidden_size=HIDDEN_SIZE): 35 | super(Model, self).__init__() 36 | self.idx = idx 37 | self.fc1 = nn.Linear(state_size, hidden_size) 38 | self.fc2 = nn.Linear(hidden_size, hidden_size) 39 | self.fc3 = nn.Linear(hidden_size, action_size) 40 | 41 | def forward(self, x): 42 | x = torch.relu(self.fc1(x)) 43 | x = torch.relu(self.fc2(x)) 44 | probs = torch.softmax(self.fc3(x), dim=1) 45 | return probs 46 | 47 | 48 | def evaluate(env, brain): 49 | """ 50 | Runs an evaluation on the given brain. 51 | """ 52 | state = env.reset() 53 | rewards = 0 54 | steps = 0 55 | while True: 56 | state = torch.from_numpy(state).unsqueeze(0).float() 57 | probs = brain(state) 58 | action = probs.max(dim = 1)[1] 59 | next_state, reward, done, _ = env.step(action.data.numpy()[0]) 60 | rewards += reward 61 | steps += 1 62 | state = next_state 63 | if done: 64 | break 65 | 66 | return rewards, steps 67 | 68 | 69 | def sample_noise(brain): 70 | """ 71 | Samples noise from a normal distribution in the shape of the brain parameters. Output are two noisy parameters: + noise and - noise (for better and more stable learning!) 
72 | """ 73 | pos = [] 74 | neg = [] 75 | for param in brain.parameters(): 76 | noise_t = torch.tensor(np.random.normal(size = param.data.size()).astype(np.float32)) 77 | pos.append(noise_t) 78 | neg.append(-noise_t) 79 | return pos, neg 80 | 81 | 82 | def eval_with_noise(env, brain, noise, noise_std): 83 | """ 84 | Evaluates the current brain with added parameter noise 85 | 86 | """ 87 | for p, p_n in zip(brain.parameters(), noise): 88 | p.data += noise_std * p_n 89 | r, s = evaluate(env, brain) 90 | for p, p_n in zip(brain.parameters(), noise): 91 | p.data -= noise_std * p_n 92 | return r, s 93 | 94 | 95 | def worker_func(worker_id, params_queue, rewards_queue, noise_std): 96 | """ 97 | Worker function that gathers pos and negative rewards for the optimization process and puts them in the rewards_queue with the network parameter seed: 98 | >> rewards_queue.put(RewardsItem(seed=seed, pos_reward=pos_reward, neg_reward=neg_reward, steps=pos_steps+neg_steps)) << 99 | """ 100 | #print("worker: {} has started".format(worker_id)) 101 | env = gym.make(ENV_NAME) 102 | net = Model(env.observation_space.shape[0], env.action_space.n, "worker") 103 | net.eval() 104 | while True: 105 | params = params_queue.get() 106 | if params is None: 107 | break 108 | 109 | # set parameters of the queue 110 | net.load_state_dict(params) 111 | 112 | for _ in range(ITERS_PER_UPDATE): 113 | seed = np.random.randint(low=0, high=65535) 114 | np.random.seed(seed) 115 | noise, neg_noise = sample_noise(net) 116 | pos_reward, pos_steps = eval_with_noise(env, net, noise, noise_std) 117 | neg_reward, neg_steps = eval_with_noise(env, net, neg_noise, noise_std) 118 | #print(_, "\n",noise, pos_reward, neg_reward) 119 | rewards_queue.put(RewardsItem(seed=seed, pos_reward=pos_reward, neg_reward=neg_reward, steps=pos_steps+neg_steps)) 120 | 121 | pass 122 | 123 | 124 | def train_step(brain, novelty, batch_noise, batch_rewards, step_idx): 125 | """ 126 | Optimizes the weights of the NN based on the rewards and noise gathered 127 | """ 128 | # normalize rewards to have zero mean and unit variance 129 | norm_reward = np.array(batch_reward) 130 | norm_reward -= np.mean(norm_reward) 131 | s = np.std(norm_reward) 132 | if abs(s) > 1e-6: 133 | norm_reward /= s 134 | 135 | weighted_noise = None 136 | for noise, reward in zip(batch_noise, norm_reward): 137 | if weighted_noise is None: 138 | weighted_noise = [(W*reward* p_n) + ((1-W)*novelty*p_n) for p_n in noise] # combining reward and novelty 139 | else: 140 | for w_n, p_n in zip(weighted_noise, noise): 141 | w_n += (W*reward* p_n) + ((1-W)*novelty*p_n) 142 | 143 | 144 | for p, p_update in zip(brain.parameters(), weighted_noise): 145 | update = p_update / (len(batch_reward)*NOISE_STD) 146 | p.data += LR * update 147 | 148 | 149 | def test_current_params(env, net): 150 | """ 151 | Runs the current network parameters on the env to visually monitor the progress. 152 | """ 153 | state = env.reset() 154 | 155 | while True: 156 | env.render() 157 | state = torch.from_numpy(state).unsqueeze(0).float() 158 | probs = brain(state) 159 | action = probs.max(dim = 1)[1] 160 | state, reward, done, _ = env.step(action.data.numpy()[0]) 161 | 162 | if done: 163 | break 164 | 165 | def get_behavior_char(env, net): 166 | """ 167 | Returns the initial behavior characterization value b_pi0 for a network. 168 | The value is defined in this case as the final state of agent in the environment. 169 | 170 | >>> Important to find a good behavior characterization. Depents on the environment! 
<<< -> final state, step count ... 171 | 172 | """ 173 | state = env.reset() 174 | step_count = 0 175 | while True: 176 | state = torch.from_numpy(state).unsqueeze(0).float() 177 | probs = brain(state) 178 | action = probs.max(dim = 1)[1] 179 | state, reward, done, _ = env.step(action.data.numpy()[0]) 180 | step_count += 1 181 | if done: 182 | break 183 | #print(step_count) 184 | return np.array([step_count]) #state 185 | 186 | 187 | def get_kNN(archive, bc, n_neighbors): 188 | """ 189 | Searches and samples the K-nearest-neighbors from the archive and a new behavior characterization 190 | returns the summed distance between input behavior characterization and the bc in the archive 191 | 192 | """ 193 | 194 | archive = np.concatenate(archive) 195 | neigh = NearestNeighbors(n_neighbors=n_neighbors) 196 | neigh.fit(archive) 197 | distances, idx = neigh.kneighbors(X = bc, n_neighbors=n_neighbors) 198 | #k_nearest_neighbors = archive[idx].squeeze(0) 199 | 200 | return sum(distances.squeeze(0)) 201 | 202 | 203 | 204 | # ============================================================================= 205 | # def calc_novelty(b_pi_theta, archive): 206 | # """ 207 | # calculates the novelty of a given arcive of behavior characterizations. 208 | # returns the mean distance between the initial behavior characterizations and all new gathered behavior characterizations. 209 | # """ 210 | # # distance loss function: 211 | # distance = nn.MSELoss() #nn.PairwiseDistance() 212 | # # creates arcive vector for distance calc 213 | # archive_v = torch.cat(archive) 214 | # # create a vector of initial behavior characterizations in the shape of the arcive length 215 | # b_pi_theta_v = torch.cat([b_pi_theta for i in range(len(archive))]) 216 | # 217 | # return torch.sqrt(distance(b_pi_theta_v, archive_v)).mean() 218 | # ============================================================================= 219 | 220 | def calc_noveltiy_distribution(novelties): 221 | """ 222 | Calculates the probabilities of each model parameters of being selected as its 223 | novelty normalized by the sum of novelty across all policies: 224 | 225 | P(theta_m) for each element in the meta_population M - m element M 226 | 227 | """ 228 | probabilities = [round((novel/(sum(novelties))),4) for novel in novelties] 229 | return probabilities 230 | 231 | 232 | if __name__ == "__main__": 233 | 234 | env = gym.make(ENV_NAME) 235 | #env.seed(2) 236 | MPS = 2 # meta population size 237 | meta_population = [Model(env.observation_space.shape[0],env.action_space.n, idx=i) for i in range(MPS)] 238 | 239 | # create arcive for models 240 | archive = [] 241 | writer = SummaryWriter() 242 | iterations = 300 #1500 # max iterations to run 243 | 244 | delta_reward_buffer = deque(maxlen=10) # buffer to store the reward gradients to see if rewards stay constant over a defined time horizont ~> local min 245 | W = 1 246 | 247 | params_queues = [mp.Queue(maxsize=1) for _ in range(PROCESSES_COUNT)] 248 | rewards_queue = mp.Queue(maxsize=ITERS_PER_UPDATE) 249 | workers = [] 250 | 251 | for idx, params_queue in enumerate(params_queues): 252 | proc = mp.Process(target=worker_func, args=(idx, params_queue, rewards_queue, NOISE_STD)) 253 | proc.start() 254 | workers.append(proc) 255 | 256 | print("All started!") 257 | step_idx = 0 258 | reward_history = [] 259 | reward_max =[] 260 | reward_min = [] 261 | reward_std = [] 262 | 263 | old_m_reward = 0 264 | 265 | for step_idx in range(iterations): 266 | 267 | ########################## NOVELTY BRAIN SELECTION 
############################# 268 | # select new network from the meta population based on its probability: 269 | if len(archive) > 0: 270 | novelties = [] 271 | S = np.minimum(K_NEIGHBORS, len(archive)) 272 | for model in meta_population: 273 | b_pi_theta = torch.from_numpy(get_behavior_char(env, model)).unsqueeze(0).float() 274 | distance = get_kNN(archive, b_pi_theta.numpy(), S) 275 | novelty = distance / S 276 | if novelty <= 1e-3: 277 | novelty = 5e-3 278 | novelties.append(novelty) 279 | 280 | #print("novelties:", novelties) 281 | 282 | probs = calc_noveltiy_distribution(novelties) 283 | #print("probs: ", probs ) 284 | probs = np.array(probs) 285 | probs /= probs.sum() # norm so that sum up to one - does without as well but np gives error because of rounding 286 | brain_idx = np.random.choice(list(range(MPS)),p=probs) # select new brain based on novelty probabilities 287 | brain = meta_population[brain_idx] 288 | novelty = novelties[brain_idx] 289 | else: 290 | brain_idx = np.random.randint(0, MPS) 291 | brain = meta_population[brain_idx] 292 | novelty = 1 293 | ################################################################################### 294 | 295 | # broadcasting network params 296 | params = brain.state_dict() 297 | for q in params_queues: 298 | q.put(params) 299 | 300 | batch_noise = [] 301 | batch_reward = [] 302 | batch_steps_data = [] 303 | batch_steps = 0 304 | results = 0 305 | 306 | while True: 307 | #print(rewards_queue.qsize()) 308 | while not rewards_queue.empty(): 309 | reward = rewards_queue.get_nowait() 310 | np.random.seed(reward.seed) # sets the seed of the current worker rewards 311 | noise, neg_noise = sample_noise(brain) 312 | batch_noise.append(noise) 313 | batch_reward.append(reward.pos_reward) 314 | batch_noise.append(neg_noise) 315 | batch_reward.append(reward.neg_reward) 316 | results += 1 317 | batch_steps += reward.steps 318 | 319 | if results == PROCESSES_COUNT * ITERS_PER_UPDATE: 320 | break 321 | 322 | step_idx += 1 323 | m_reward = np.mean(batch_reward) 324 | 325 | reward_gradient_mean = np.mean(delta_reward_buffer) 326 | r_koeff = abs(m_reward - reward_gradient_mean) 327 | # if last few rewards are almost konstant -> stuck in loc minima -> decrease W for exploration: higher novelty weight 328 | if r_koeff < 1.5: 329 | W = np.maximum(0, W-0.05) 330 | else: 331 | W = np.minimum(1, W+0.05) 332 | delta_reward_buffer.append(m_reward) 333 | old_m_reward = m_reward 334 | 335 | writer.add_scalar("mean_reward", np.mean(batch_reward), step_idx) 336 | writer.add_scalar("max_reward", np.max(batch_reward), step_idx) 337 | writer.add_scalar("min_reward", np.min(batch_reward), step_idx) 338 | writer.add_scalar("std", np.std(batch_reward), step_idx) 339 | writer.add_scalar("novelty", novelty, step_idx) 340 | writer.add_scalar("novelty_w", W, step_idx) 341 | # ============================================================================= 342 | # if m_reward > -250: 343 | # print("\nSolved the environment in {} steps".format(step_idx)) 344 | # break 345 | # ============================================================================= 346 | train_step(brain, novelty, batch_noise, batch_reward, step_idx) 347 | # select new behavior: 348 | b_pix = torch.from_numpy(get_behavior_char(env, brain)).unsqueeze(0).float() 349 | # append new behavior to specific brain archive 350 | archive.append(b_pix.numpy()) 351 | 352 | print("\rStep: {}, Mean_Reward: {:.2f}, Novelty: {:.2f}, W: {:.2f} r_koeff: {:.2f}".format(step_idx, m_reward, novelty, W, r_koeff), end = "", flush = 
True) 353 | 354 | # if step_idx % 10 == 0: 355 | # test_current_params(env, brain) 356 | 357 | for worker, p_queue in zip(workers, params_queues): 358 | p_queue.put(None) 359 | worker.join() 360 | -------------------------------------------------------------------------------- /Black-Box Optimization/Evolution_Strategies_parallel+novelty/ES_own_conti_parallel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Oct 9 10:24:39 2019 4 | 5 | @author: Z0014354 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import gym 11 | import multiprocessing as mp 12 | import collections 13 | import copy 14 | 15 | ITERS_PER_UPDATE = 10 16 | NOISE_STD = 0.1 #0.04 higher std leeds to better exploration - more stable learning 17 | LR = 2e-2 18 | PROCESSES_COUNT = 6 # amount of worker default 6 19 | HIDDEN_SIZE = 12 # 6 20 | ENV_NAME = "Damper-v0" 21 | RewardsItem = collections.namedtuple('RewardsItem', field_names=['seed', 'pos_reward', 'neg_reward', 'steps']) 22 | 23 | 24 | 25 | class Model(object): 26 | 27 | def __init__(self, stateCnt, actionCnt, hidden_size = HIDDEN_SIZE): 28 | # inits zero weights 29 | self.weights = [np.random.uniform(-1,1,size=(stateCnt, hidden_size)), np.random.uniform(-1,1, size=(hidden_size, hidden_size)), np.random.uniform(-1,1,size=(hidden_size,actionCnt))] 30 | 31 | def predict(self, inp): 32 | out = np.expand_dims(inp.flatten(), 0) 33 | #out = out / np.linalg.norm(out) 34 | weight_len = len(self.weights) 35 | for idx, layer in enumerate(self.weights): 36 | # hidden activation 37 | if idx < weight_len - 1: 38 | out = self.activation(np.dot(out, layer)) 39 | # outout activation 40 | else: 41 | out = self.activation(np.dot(out, layer), type_="output_layer") 42 | return out[0] 43 | 44 | def activation(self,x, type_="hidden"): 45 | if type_ == "hidden": 46 | # relu 47 | return np.maximum(x,0) 48 | 49 | # softmax 50 | #return (np.exp(x))/sum(np.exp(x)) 51 | 52 | #softplus 53 | #return np.log(1 + np.exp(x)) 54 | 55 | #sigmoid 56 | #return 1/(1+np.exp(-x)) 57 | 58 | # tanh 59 | #return np.tanh(x) 60 | else: 61 | # tanh 62 | return np.tanh(x) 63 | 64 | # relu 65 | #return np.maximum(x,0) 66 | 67 | def get_weights(self): 68 | return self.weights 69 | 70 | def set_weights(self, weights): 71 | self.weights = weights 72 | 73 | 74 | def evaluate(env, brain): 75 | """ 76 | Runs an evaluation on the given brain. 77 | """ 78 | state = env.reset() 79 | rewards = 0 80 | steps = 0 81 | while True: 82 | state = np.expand_dims(state, axis=0) 83 | #print("State:", state) 84 | action_mean = brain.predict(state) 85 | action = np.random.normal(action_mean, scale=0.01) 86 | action = np.clip(action, -1, 1) # pendulums action range is between -2,2 87 | next_state, reward, done, _ = env.step(action) 88 | rewards += reward 89 | steps += 1 90 | state = next_state 91 | if done: 92 | break 93 | 94 | return rewards, steps 95 | 96 | 97 | def sample_noise(brain): 98 | """ 99 | Samples noise from a normal distribution in the shape of the brain parameters. Output are two noisy parameters: + noise and - noise (for better and more stable learning!) 
100 | """ 101 | pos = [] 102 | neg = [] 103 | for param in brain.get_weights(): 104 | noise_t = np.random.normal(size = param.shape) 105 | pos.append(noise_t) 106 | neg.append(-noise_t) 107 | return pos, neg 108 | 109 | 110 | def eval_with_noise(env, brain, noise, noise_std): 111 | """ 112 | Evaluates the current brain with added parameter noise 113 | 114 | """ 115 | old_params = copy.deepcopy(brain.get_weights()) 116 | new_params = [] 117 | for p, p_n in zip(brain.get_weights(), noise): 118 | p += noise_std*p_n 119 | new_params.append(p) 120 | brain.set_weights(new_params) 121 | r, s = evaluate(env, brain) 122 | brain.set_weights(old_params) 123 | return r, s 124 | 125 | 126 | def worker_func(worker_id, params_queue, rewards_queue, noise_std): 127 | """ 128 | Worker function that gathers pos and negative rewards for the optimization process and puts them in the rewards_queue with the network parameter seed: 129 | >> rewards_queue.put(RewardsItem(seed=seed, pos_reward=pos_reward, neg_reward=neg_reward, steps=pos_steps+neg_steps)) << 130 | """ 131 | #print("worker: {} has started".format(worker_id)) 132 | env = gym.make(ENV_NAME) 133 | net = Model(env.observation_space.shape[0], env.action_space.shape[0]) 134 | 135 | while True: 136 | params = params_queue.get() 137 | if params is None: 138 | break 139 | 140 | # set parameters of the queue - equal to: net.load_state_dict(params) 141 | net.set_weights([param for param in params]) 142 | 143 | for _ in range(ITERS_PER_UPDATE): 144 | seed = np.random.randint(low=0, high=65535) 145 | np.random.seed(seed) 146 | noise, neg_noise = sample_noise(net) 147 | pos_reward, pos_steps = eval_with_noise(env, net, noise, noise_std) 148 | neg_reward, neg_steps = eval_with_noise(env, net, neg_noise, noise_std) 149 | #print(_, "\n",noise, pos_reward, neg_reward) 150 | rewards_queue.put(RewardsItem(seed=seed, pos_reward=pos_reward, neg_reward=neg_reward, steps=pos_steps+neg_steps)) 151 | 152 | pass 153 | 154 | 155 | def train_step(brain, batch_noise, batch_rewards, step_idx): 156 | """ 157 | Optimizes the weights of the NN based on the rewards and noise gathered 158 | """ 159 | # normalize rewards to have zero mean and unit variance 160 | norm_reward = np.array(batch_reward) 161 | norm_reward -= np.mean(norm_reward) 162 | s = np.std(norm_reward) 163 | if abs(s) > 1e-6: 164 | norm_reward /= s 165 | 166 | weighted_noise = None 167 | for noise, reward in zip(batch_noise, norm_reward): 168 | if weighted_noise is None: 169 | weighted_noise = [reward * p_n for p_n in noise] 170 | else: 171 | for w_n, p_n in zip(weighted_noise, noise): 172 | w_n += reward * p_n 173 | 174 | 175 | for p, p_update in zip(brain.get_weights(), weighted_noise): 176 | update = p_update / (len(batch_reward)*NOISE_STD) 177 | p += LR * update 178 | 179 | 180 | def test_current_params(env, brain): 181 | """ 182 | Runs the current network parameters on the env to visually monitor the progress. 
183 | """ 184 | state = env.reset() 185 | 186 | while True: 187 | env.render() 188 | state = np.expand_dims(state, axis=0) 189 | action_mean = brain.predict(state) 190 | action = np.random.normal(action_mean, scale=0.01) 191 | action = np.clip(action, -1, 1) # pendulums action range is between -2,2 192 | state, reward, done, _ = env.step(action) 193 | 194 | if done: 195 | break 196 | 197 | 198 | if __name__ == "__main__": 199 | 200 | env = gym.make(ENV_NAME) 201 | #env.seed(2) 202 | brain = Model(env.observation_space.shape[0], env.action_space.shape[0]) 203 | 204 | iterations = 100 #1500 # max iterations to run 205 | 206 | params_queues = [mp.Queue(maxsize=1) for _ in range(PROCESSES_COUNT)] 207 | rewards_queue = mp.Queue(maxsize=ITERS_PER_UPDATE) 208 | workers = [] 209 | 210 | for idx, params_queue in enumerate(params_queues): 211 | proc = mp.Process(target=worker_func, args=(idx, params_queue, rewards_queue, NOISE_STD)) 212 | proc.start() 213 | workers.append(proc) 214 | 215 | print("All started!") 216 | step_idx = 0 217 | reward_history = [] 218 | reward_max =[] 219 | reward_min = [] 220 | reward_std = [] 221 | 222 | for step_idx in range(iterations): 223 | # broadcasting network params 224 | params = brain.get_weights() 225 | for q in params_queues: 226 | q.put(params) 227 | 228 | batch_noise = [] 229 | batch_reward = [] 230 | batch_steps_data = [] 231 | batch_steps = 0 232 | results = 0 233 | 234 | while True: 235 | while not rewards_queue.empty(): 236 | reward = rewards_queue.get_nowait() 237 | np.random.seed(reward.seed) # sets the seed of the current worker rewards 238 | noise, neg_noise = sample_noise(brain) 239 | batch_noise.append(noise) 240 | batch_reward.append(reward.pos_reward) 241 | batch_noise.append(neg_noise) 242 | batch_reward.append(reward.neg_reward) 243 | results += 1 244 | batch_steps += reward.steps 245 | 246 | if results == PROCESSES_COUNT * ITERS_PER_UPDATE: 247 | break 248 | 249 | step_idx += 1 250 | m_reward = np.mean(batch_reward) 251 | reward_history.append(m_reward) 252 | reward_max.append(np.max(batch_reward)) 253 | reward_min.append(np.min(batch_reward)) 254 | reward_std.append(np.std(batch_reward)) 255 | # ============================================================================= 256 | # if m_reward > -250: 257 | # print("\nSolved the environment in {} steps".format(step_idx)) 258 | # break 259 | # ============================================================================= 260 | train_step(brain, batch_noise, batch_reward, step_idx) 261 | 262 | print("\rStep: {}, Mean_Reward: {:.2f}".format(step_idx, m_reward), end = "", flush = True) 263 | 264 | if step_idx % 10 == 0: 265 | test_current_params(env, brain) 266 | 267 | for worker, p_queue in zip(workers, params_queues): 268 | p_queue.put(None) 269 | worker.join() 270 | 271 | plt.figure(figsize = (11,7)) 272 | plt.plot(reward_history, label = "Mean Reward", color = "green") 273 | plt.plot(reward_max, label = "Max Reward", color = "blue") 274 | plt.plot(reward_min, label = "Min Reward", color = "red") 275 | plt.plot(reward_std, label = "Reward std", color = "orange") 276 | plt.xlabel("Steps") 277 | plt.ylabel("Rewards") 278 | plt.legend() 279 | plt.show() -------------------------------------------------------------------------------- /Black-Box Optimization/Evolution_Strategies_parallel+novelty/README.md: -------------------------------------------------------------------------------- 1 | # Evolution Strategy algorithms 2 | 3 | 4 | 5 | This folder contains 3 different Evolutionary Strategy algorithms: 
6 | 7 | - [ES_baseline_parallel.py](ES_baseline_parallel.py) 8 | 9 | A baseline Evolution Strategy algorithm for discrete action spaces that solves the CartPole environment.
10 | The code is based on this [paper](https://arxiv.org/abs/1703.03864) and on this [book chapter](https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/tree/master/Chapter16). 11 | Written solely in NumPy! 12 | 13 | 14 | - [ES_disc_parallel_novelty.py](ES_disc_parallel_novelty.py) 15 | 16 | An Evolution Strategy algorithm for discrete action spaces with novelty search for extra exploration.
17 | The code is based on the [EvoStrategy paper](https://arxiv.org/abs/1703.03864), the [novelty-seeking agents paper](http://papers.nips.cc/paper/7750-improving-exploration-in-evolution-strategies-for-deep-reinforcement-learning-via-a-population-of-novelty-seeking-agents) and [book chapter 16](https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/tree/master/Chapter16). 18 | Written in PyTorch. 19 | 20 | - [ES_conti_parallel_novelty.py](ES_conti_parallel_novelty.py) 21 | 22 | An Evolution Strategy algorithm for continuous action spaces with novelty search for extra exploration.
23 | Code is based on [Paper EvoStrategy](https://arxiv.org/abs/1703.03864), [Paper novelty seeking agents](http://papers.nips.cc/paper/7750-improving-exploration-in-evolution-strategies-for-deep-reinforcement-learning-via-a-population-of-novelty-seeking-agents) and the [book chapter 16](https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/tree/master/Chapter16). 24 | Written with pytorch. 25 | 26 | 27 | 28 | Evolution Strategies solving Pendulum: 29 | 30 | 31 | ![alt_text](imgs/pendulum.png) 32 | 33 | -------------------------------------------------------------------------------- /Black-Box Optimization/Evolution_Strategies_parallel+novelty/imgs/pendulum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Black-Box Optimization/Evolution_Strategies_parallel+novelty/imgs/pendulum.png -------------------------------------------------------------------------------- /Black-Box Optimization/README.md: -------------------------------------------------------------------------------- 1 | # Black-Box Optimization 2 | 3 | ### Evolution Strategy 4 | #### Baseline implementation [ES_cartpole](https://github.com/BY571/Reinforcement-Learning/blob/master/Black-Box%20Optimization/Evolutionary_Strategies_Cartpole.ipynb) 5 | 6 | 7 | 8 | ### Genetic Algorithms 9 | 10 | #### Baseline implementation [GA_cartpole](https://github.com/BY571/Reinforcement-Learning/blob/master/Black-Box%20Optimization/genetic_algorithm_base.py) 11 | - run 'python genetic_algorithm_base.py' with the flags: `--noise`, `--ps`, `--pc` as: 12 | - `--noise`(std) that is added as the mutation of the neural network weights, default = 0.05 13 | - `--ps` as the population size, default = 50 14 | - `--pc` as the parents count or amount of top performer that build the new population, default = 10 15 | 16 | Example performance with noise_std = 0.05, ps=30, pc=10 17 | 18 | ![graph](imgs/ga_cartpole.png) 19 | -------------------------------------------------------------------------------- /Black-Box Optimization/genetic_algorithm_base.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | import torch.nn as nn 4 | from torch.utils.tensorboard import SummaryWriter 5 | import numpy as np 6 | import argparse 7 | import gym 8 | 9 | 10 | class Network(nn.Module): 11 | def __init__(self,state_size,action_size,hidden_layer,seed): 12 | super(Network, self).__init__() 13 | self.state_size = state_size 14 | self.action_size = action_size 15 | self.hidden_layer = hidden_layer 16 | 17 | self.net = nn.Sequential( 18 | nn.Linear(self.state_size, self.hidden_layer), 19 | nn.ReLU(), 20 | nn.Linear(self.hidden_layer, self.hidden_layer), 21 | nn.ReLU(), 22 | nn.Linear(self.hidden_layer, self.action_size), 23 | nn.Softmax(dim = 1)) 24 | 25 | def forward(self, x): 26 | return self.net(x) 27 | 28 | def evaluate(env, net): 29 | """ 30 | Plays a round of the game and returns the obtained reward 31 | """ 32 | state = env.reset() 33 | rewards = 0 34 | while True: 35 | state = torch.from_numpy(state).unsqueeze(0).float() 36 | action_prob = net(state) 37 | action = action_prob.max(dim=1)[1] #argmax 38 | next_state, reward, done, info = env.step(action.data.numpy()[0]) 39 | rewards += reward 40 | state = next_state 41 | if done: 42 | break 43 | return rewards 44 | 45 | def mutate_parent(net): 46 | """ 47 | Mutates the parent 
neural nets by adding noise sampled by a normal distribution. 48 | 49 | """ 50 | new_net = copy.deepcopy(net) 51 | for parameter in new_net.parameters(): 52 | noise = torch.tensor(np.random.normal(size=parameter.data.size()).astype(np.float32)) 53 | parameter.data += NOISE_STD * noise 54 | return new_net 55 | 56 | 57 | if __name__ == "__main__": 58 | # parse input values like 59 | # - Noise standard deviation [NOISE_STD] 60 | # - Population size [POPULATION_SIZE] 61 | # - Parents count [PARENTS_COUNT] 62 | 63 | parser = argparse.ArgumentParser(description = "Noise, Population size, Parents count") 64 | parser.add_argument("--noise",type = float,default=1e-2) 65 | parser.add_argument( "--ps",type=int,default=50) 66 | parser.add_argument( "--pc",type=int,default=10) 67 | 68 | args = parser.parse_args() 69 | NOISE_STD = args.noise 70 | POPULATION_SIZE = args.ps 71 | PARENTS_COUNT = args.pc 72 | 73 | #print(f"Noise: {NOISE_STD}, PopS: {POPULATION_SIZE}, PARENTS_COUNT: {PARENTS_COUNT}") 74 | np.random.seed(seed=42) 75 | torch.manual_seed(42) 76 | writer = SummaryWriter(comment="-CartPole") 77 | env = gym.make("CartPole-v0") 78 | gen_idx = 0 79 | state_size = env.observation_space.shape[0] 80 | action_size = env.action_space.n 81 | 82 | nets = [Network(state_size, action_size, hidden_layer=32, seed=3) for _ in range(POPULATION_SIZE)] 83 | population = [(net, evaluate(env, net)) for net in nets] 84 | 85 | while True: 86 | population.sort(key=lambda p: p[1], reverse=True) # sorts the fitness from highest to lowest 87 | rewards = [p[1] for p in population[:PARENTS_COUNT]] # takes the fitness of the top x-parents 88 | reward_mean = np.mean(rewards) 89 | reward_max = np.max(rewards) 90 | reward_std = np.std(rewards) 91 | 92 | writer.add_scalar("reward_mean", reward_mean, gen_idx) 93 | writer.add_scalar("reward_max", reward_max, gen_idx) 94 | writer.add_scalar("reward_std", reward_std, gen_idx) 95 | print(f"Generation: {gen_idx} | Reward_mean: {reward_mean} | Reward_max: {reward_max} | Reward_std: {reward_std}") 96 | 97 | if reward_mean > 199: 98 | print("Solved the environment in {} generations".format(gen_idx)) 99 | break 100 | writer.close() 101 | 102 | prev_population = population 103 | population = [population[0]] # list of the nets 104 | 105 | for _ in range(POPULATION_SIZE-1): 106 | parent_idx = np.random.randint(0, PARENTS_COUNT) #sample the new population from the top x-parents 107 | parent = prev_population[parent_idx][0] 108 | net = mutate_parent(parent) 109 | population.append((net, evaluate(env, net))) 110 | 111 | gen_idx += 1 112 | 113 | 114 | -------------------------------------------------------------------------------- /Black-Box Optimization/imgs/ga_cartpole.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Black-Box Optimization/imgs/ga_cartpole.png -------------------------------------------------------------------------------- /ContinousControl/DDPG.py: -------------------------------------------------------------------------------- 1 | import random 2 | from tensorboardX import SummaryWriter 3 | import torch 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | import numpy as np 7 | import roboschool 8 | import gym 9 | from gym import wrappers 10 | import pybullet_envs 11 | import time 12 | 13 | class NormalizedActions(gym.ActionWrapper): 14 | 15 | def _action(self, action): 16 | low_bound = 
self.action_space.low 17 | upper_bound = self.action_space.high 18 | 19 | action = low_bound + (action + 1.0) * 0.5 * (upper_bound - low_bound) 20 | action = np.clip(action, low_bound, upper_bound) 21 | 22 | return action 23 | 24 | def _reverse_action(self, action): 25 | low_bound = self.action_space.low 26 | upper_bound = self.action_space.high 27 | 28 | action = 2 * (action - low_bound) / (upper_bound - low_bound) - 1 29 | action = np.clip(action, low_bound, upper_bound) 30 | 31 | return action 32 | 33 | class ReplayBuffer: 34 | def __init__(self, capacity): 35 | self.capacity = capacity 36 | self.buffer = [] 37 | self.position = 0 38 | 39 | def push(self, state, action, reward, next_state, done): 40 | if len(self.buffer) < self.capacity: 41 | self.buffer.append(None) 42 | self.buffer[self.position] = (state, action, reward, next_state, done) 43 | self.position = (self.position + 1) % self.capacity 44 | 45 | def sample(self, batch_size): 46 | batch = random.sample(self.buffer, batch_size) 47 | state, action, reward, next_state, done = map(np.stack, zip(*batch)) 48 | return state, action, reward, next_state, done 49 | 50 | def __len__(self): 51 | return len(self.buffer) 52 | 53 | class OUNoise(object): 54 | def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000): 55 | self.mu = mu # mean value -> as "goal state" 0.0 in the sense of no noise 56 | self.theta = theta 57 | self.sigma = max_sigma # variance of the noise 58 | self.max_sigma = max_sigma 59 | self.min_sigma = min_sigma 60 | self.decay_period = decay_period 61 | self.action_dim = action_space.shape[0] 62 | self.low = action_space.low 63 | self.high = action_space.high 64 | self.reset() 65 | 66 | def reset(self): 67 | self.state = np.ones(self.action_dim) * self.mu 68 | 69 | def evolve_state(self): 70 | x = self.state 71 | dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim) 72 | self.state = x + dx 73 | return self.state 74 | 75 | def get_action(self, action, t=0): 76 | ou_state = self.evolve_state() 77 | self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period) 78 | return np.clip(action + ou_state, self.low, self.high), ou_state 79 | 80 | #https://github.com/vitchyr/rlkit/blob/master/rlkit/exploration_strategies/ou_strategy.py 81 | 82 | 83 | class Actor(nn.Module): 84 | def __init__(self, input_shape, action_shape): 85 | super(Actor, self).__init__() 86 | self.actor = nn.Sequential(nn.Linear(input_shape[0],400), 87 | nn.LayerNorm(400), 88 | nn.ReLU(), 89 | nn.Linear(400,300), 90 | nn.LayerNorm(300), 91 | nn.ReLU(), 92 | nn.Linear(300,action_shape[0]), 93 | nn.Tanh()) 94 | def forward(self, x): 95 | state = torch.FloatTensor(x).to(device) 96 | return self.actor(state) 97 | 98 | class Critic(nn.Module): 99 | def __init__(self, input_shape, action_shape): 100 | super(Critic, self).__init__() 101 | 102 | self.critic1 = nn.Sequential(nn.Linear(input_shape[0],400), 103 | #nn.LayerNorm(256), 104 | nn.ReLU()) 105 | self.critic2 = nn.Sequential(nn.Linear(400+ action_shape[0], 300), 106 | #nn.LayerNorm(256), 107 | nn.ReLU(), 108 | nn.Linear(300,1)) 109 | def forward(self,state, action): 110 | x = self.critic1(state) 111 | comb = torch.cat([x,action], dim = 1) 112 | return self.critic2(comb) 113 | 114 | def update_and_optimize(batch_size): 115 | state, action, reward, next_state, done = replay_buffer.sample(batch_size) 116 | state_v = torch.FloatTensor(state).to(device) # shape[batch_size,3] 117 | action_v = 
torch.FloatTensor(action).to(device) # shape[batch_size,1] 118 | reward_v = torch.FloatTensor(reward).unsqueeze(1).to(device) # shape [batch_size,1] 119 | next_state_v = torch.FloatTensor(next_state).to(device) # shape [batch_size,3] 120 | done_v = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(device) # shape [batch_size,1] 121 | 122 | # update critic: 123 | critic_optim.zero_grad() 124 | Q_v = critic(state_v, action_v) 125 | next_action = target_actor(next_state).to(device) 126 | target_Q = target_critic(next_state_v, next_action.detach()) 127 | discounted_target_Q = (reward_v + 0.99 * target_Q * (1.0 - done_v)).to(device) 128 | loss = critic_loss(Q_v, discounted_target_Q.detach()) 129 | writer.add_scalar("Critic loss", loss, frame_idx) 130 | writer.add_scalar("Target_Q", target_Q.mean(), frame_idx) 131 | loss.backward() 132 | critic_optim.step() 133 | 134 | # update actor: 135 | actor_optim.zero_grad() 136 | current_action = actor(state_v.cpu()) 137 | actor_loss = -critic(state_v, current_action.to(device)).mean() 138 | writer.add_scalar("Actor loss", actor_loss, frame_idx) 139 | actor_loss.backward() 140 | actor_optim.step() 141 | 142 | # Softupdate 143 | soft_tau = 0.01 144 | for target_param, param in zip(target_critic.parameters(), critic.parameters()): 145 | target_param.data.copy_( 146 | target_param.data * (1.0 - soft_tau) + param.data * soft_tau 147 | ) 148 | 149 | for target_param, param in zip(target_actor.parameters(), actor.parameters()): 150 | target_param.data.copy_( 151 | target_param.data * (1.0 - soft_tau) + param.data * soft_tau 152 | ) 153 | 154 | 155 | if __name__ == "__main__": 156 | start = time.time() 157 | # use cuda 158 | use_cuda = torch.cuda.is_available() 159 | device = torch.device("cuda" if use_cuda else "cpu") 160 | 161 | print("Device: ",device) 162 | ENV_ID = "HalfCheetahBulletEnv-v0" #HalfCheetahBulletEnv-v0 #MinitaurBulletEnv-v0 163 | env = gym.make(ENV_ID) 164 | #env = gym.make("RoboschoolHalfCheetah-v1") #RoboschoolHalfCheetah-v1 165 | env = wrappers.Monitor(env, "Saved_Videos/", resume=True, force = True, video_callable=lambda episode_id: episode_id% 5 ==0) 166 | #, video_callable=lambda x: True, force=True 167 | env = NormalizedActions(env) 168 | 169 | action_space = env.action_space.shape 170 | observation_space = env.observation_space.shape 171 | 172 | critic = Critic(observation_space, action_space).to(device) 173 | actor = Actor(observation_space, action_space).to(device) 174 | target_actor = actor 175 | target_critic = critic 176 | target_actor.load_state_dict(actor.state_dict()) 177 | target_critic.load_state_dict(critic.state_dict()) 178 | critic_optim = optim.Adam(critic.parameters(), lr = 0.001, weight_decay=1e-2) 179 | actor_optim = optim.Adam(actor.parameters(), lr = 0.0001) 180 | 181 | critic_loss = nn.MSELoss() 182 | 183 | replay_buffer_size = 1000000 184 | replay_buffer = ReplayBuffer(replay_buffer_size) 185 | 186 | writer = SummaryWriter() 187 | 188 | noise = OUNoise(env.action_space) 189 | batch_size = 128 190 | max_frames = 80000 #100000~32 min --300000 ~47 min 191 | frame_idx = 0 192 | rewards = [] 193 | 194 | while frame_idx < max_frames: 195 | state = env.reset() 196 | noise.reset() 197 | ou_states = [] 198 | episode_reward = 0 199 | done = False 200 | step = 0 201 | print("Training Progress: {:.2f}".format(frame_idx/max_frames *100)) 202 | while not done: 203 | action = actor(state) 204 | action, ou_state = noise.get_action(action.cpu().detach().numpy(), frame_idx) #step 205 | ou_states.append(ou_state) 206 | 207 | 
next_state, reward, done, _ = env.step(action) 208 | 209 | 210 | 211 | replay_buffer.push(state, action, reward, next_state, done) 212 | if len(replay_buffer) > batch_size:# and frame_idx % 10 == 0: 213 | update_and_optimize(batch_size) 214 | 215 | state = next_state 216 | episode_reward += reward 217 | frame_idx += 1 218 | step += 1 219 | 220 | 221 | if done: 222 | writer.add_scalar("Rewards", episode_reward, frame_idx) 223 | writer.add_scalar("Steps", step, frame_idx) 224 | writer.add_scalar("OU_state", np.array(ou_states).mean(), frame_idx) 225 | 226 | end = time.time() 227 | writer.close() 228 | print("------------------------------\nTraining for {:.2f} minutes".format((end-start)/60)) 229 | -------------------------------------------------------------------------------- /ContinousControl/MultiPro.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Dec 4 10:31:28 2019 4 | 5 | @author: Z0014354 6 | 7 | """ 8 | 9 | from multiprocessing import Process, Pipe 10 | import numpy as np 11 | 12 | def worker(remote, parent_remote, env_fn_wrapper): 13 | parent_remote.close() 14 | env = env_fn_wrapper.x() 15 | while True: 16 | cmd, data = remote.recv() 17 | if cmd == 'step': 18 | ob, reward, done, info = env.step(data) 19 | if done: 20 | ob = env.reset() 21 | remote.send((ob, reward, done, info)) 22 | elif cmd == 'reset': 23 | ob = env.reset() 24 | remote.send(ob) 25 | elif cmd == 'reset_task': 26 | ob = env.reset_task() 27 | remote.send(ob) 28 | elif cmd == 'close': 29 | remote.close() 30 | break 31 | elif cmd == 'get_spaces': 32 | remote.send((env.observation_space, env.action_space)) 33 | else: 34 | raise NotImplementedError 35 | 36 | class CloudpickleWrapper(object): 37 | """ 38 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) 39 | """ 40 | def __init__(self, x): 41 | self.x = x 42 | def __getstate__(self): 43 | import cloudpickle 44 | return cloudpickle.dumps(self.x) 45 | def __setstate__(self, ob): 46 | import pickle 47 | self.x = pickle.loads(ob) 48 | 49 | 50 | class VecEnv(object): 51 | """ 52 | An abstract asynchronous, vectorized environment. 53 | """ 54 | def __init__(self, num_envs, observation_space, action_space): 55 | self.num_envs = num_envs 56 | self.observation_space = observation_space 57 | self.action_space = action_space 58 | 59 | def reset(self): 60 | """ 61 | Reset all the environments and return an array of 62 | observations, or a tuple of observation arrays. 63 | If step_async is still doing work, that work will 64 | be cancelled and step_wait() should not be called 65 | until step_async() is invoked again. 66 | """ 67 | pass 68 | 69 | def step_async(self, actions): 70 | """ 71 | Tell all the environments to start taking a step 72 | with the given actions. 73 | Call step_wait() to get the results of the step. 74 | You should not call this if a step_async run is 75 | already pending. 76 | """ 77 | pass 78 | 79 | def step_wait(self): 80 | """ 81 | Wait for the step taken with step_async(). 82 | Returns (obs, rews, dones, infos): 83 | - obs: an array of observations, or a tuple of 84 | arrays of observations. 85 | - rews: an array of rewards 86 | - dones: an array of "episode done" booleans 87 | - infos: a sequence of info objects 88 | """ 89 | pass 90 | 91 | def close(self): 92 | """ 93 | Clean up the environments' resources. 
94 | """ 95 | pass 96 | 97 | def step(self, actions): 98 | self.step_async(actions) 99 | return self.step_wait() 100 | 101 | class SubprocVecEnv(VecEnv): 102 | def __init__(self, env_fns, spaces=None): 103 | """ 104 | envs: list of gym environments to run in subprocesses 105 | """ 106 | self.waiting = False 107 | self.closed = False 108 | nenvs = len(env_fns) 109 | self.nenvs = nenvs 110 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) 111 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 112 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 113 | for p in self.ps: 114 | p.daemon = True # if the main process crashes, we should not cause things to hang 115 | p.start() 116 | for remote in self.work_remotes: 117 | remote.close() 118 | 119 | self.remotes[0].send(('get_spaces', None)) 120 | observation_space, action_space = self.remotes[0].recv() 121 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 122 | 123 | def step_async(self, actions): 124 | for remote, action in zip(self.remotes, actions): 125 | remote.send(('step', action)) 126 | self.waiting = True 127 | 128 | def step_wait(self): 129 | results = [remote.recv() for remote in self.remotes] 130 | self.waiting = False 131 | obs, rews, dones, infos = zip(*results) 132 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 133 | 134 | def reset(self): 135 | for remote in self.remotes: 136 | remote.send(('reset', None)) 137 | return np.stack([remote.recv() for remote in self.remotes]) 138 | 139 | def reset_task(self): 140 | for remote in self.remotes: 141 | remote.send(('reset_task', None)) 142 | return np.stack([remote.recv() for remote in self.remotes]) 143 | 144 | def close(self): 145 | if self.closed: 146 | return 147 | if self.waiting: 148 | for remote in self.remotes: 149 | remote.recv() 150 | for remote in self.remotes: 151 | remote.send(('close', None)) 152 | for p in self.ps: 153 | p.join() 154 | self.closed = True 155 | 156 | def __len__(self): 157 | return self.nenvs -------------------------------------------------------------------------------- /ContinousControl/PPO_gae_multi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Dec 4 10:33:09 2019 4 | 5 | @author: Z0014354 6 | 7 | PPO with GAE implementation of Sebastian Dittert 8 | """ 9 | 10 | import gym 11 | import math 12 | import torch 13 | import torch.nn as nn 14 | import torch.optim as optim 15 | from torch.distributions import Normal 16 | import torch.nn.functional as F 17 | import numpy as np 18 | import matplotlib.pyplot as plt 19 | from torch.nn.utils import clip_grad_norm_ 20 | from collections import deque 21 | from tensorboardX import SummaryWriter 22 | import MultiPro 23 | import argparse 24 | import time 25 | 26 | 27 | def hidden_init(layer): 28 | fan_in = layer.weight.data.size()[0] 29 | lim = 1. 
/ np.sqrt(fan_in) 30 | return (-lim, lim) 31 | 32 | class Critic(nn.Module): 33 | def __init__(self, input_shape, hidden_size): 34 | super(Critic, self).__init__() 35 | self.layer1 = nn.Linear(input_shape, hidden_size) 36 | self.layer2 = nn.Linear(hidden_size, hidden_size) 37 | self.layer3 = nn.Linear(hidden_size, 1) 38 | self.reset_parameters() 39 | 40 | def forward(self,x): 41 | x = torch.tanh(self.layer1(x)) 42 | x = torch.tanh(self.layer2(x)) 43 | x = self.layer3(x) 44 | 45 | return x 46 | 47 | def reset_parameters(self): 48 | nn.init.xavier_uniform_(self.layer1.weight) 49 | nn.init.xavier_uniform_(self.layer2.weight) 50 | #nn.init.xavier_uniform_(self.layer3.weight) 51 | 52 | class Actor(nn.Module): 53 | def __init__(self, input_shape, output_shape, action_high_low, hidden_size): 54 | super(Actor, self).__init__() 55 | self.layer1 = nn.Linear(input_shape, hidden_size) 56 | self.layer2 = nn.Linear(hidden_size,hidden_size) 57 | 58 | self.mean = nn.Linear(hidden_size, output_shape) 59 | self.variance = nn.Linear(hidden_size, output_shape) 60 | self.action_high_low = action_high_low 61 | #self.reset_parameters() 62 | 63 | def forward(self, x): 64 | 65 | x = torch.tanh(self.layer1(x)) 66 | head = torch.tanh(self.layer2(x)) 67 | 68 | mean = torch.tanh(self.mean(head)) # tanh squashed output to the range of -1..1 69 | variance = F.softplus(self.variance(head)) # log(1 + e^x) has the shape of a smoothed ReLU 70 | sigma = torch.sqrt(variance.cpu()) 71 | m = Normal(mean.cpu(), sigma) 72 | actions = m.sample() 73 | logprobs = m.log_prob(actions) #for the optimization step we create a new distribution based on the new mean and variance - still taking the logprobs based on the old actions! 74 | 75 | return actions, logprobs, m 76 | 77 | 78 | def reset_parameters(self): 79 | nn.init.xavier_uniform_(self.layer1.weight) 80 | nn.init.xavier_uniform_(self.layer2.weight) 81 | nn.init.xavier_uniform_(self.mean.weight) 82 | #nn.init.xavier_uniform_(self.variance.weight) 83 | 84 | 85 | 86 | class Agent(): 87 | def __init__(self, 88 | state_size, 89 | action_size, 90 | action_high_low, 91 | hidden_size, 92 | LR_A=3e-4, 93 | LR_C=3e-4, 94 | gamma=0.99, 95 | lambda_=0.95, 96 | mini_batch_size=512, 97 | ppo_epochs=5): 98 | 99 | self.state_size = state_size 100 | self.actor = Actor(state_size, action_size, action_high_low, hidden_size).to(device) 101 | self.action_high = action_high_low[0] 102 | self.action_low = action_high_low[1] 103 | self.critic = Critic(state_size, hidden_size).to(device) 104 | 105 | self.gamma = gamma 106 | self.lambda_ = lambda_ 107 | self.mini_batch_size = mini_batch_size 108 | self.ppo_epochs = ppo_epochs 109 | 110 | 111 | self.optimizer_a = optim.Adam(params=self.actor.parameters(), lr=LR_A) #RMSprop 112 | self.optimizer_c = optim.Adam(params=self.critic.parameters(), lr=LR_C) 113 | 114 | 115 | def test_net(self, env, count = 10): 116 | """ 117 | Tests the agents performance with current weights. 
118 | """ 119 | rewards = 0.0 120 | steps = 0 121 | entropys = 0.0 122 | 123 | for _ in range(count): 124 | obs = env.reset() 125 | 126 | while True: 127 | obs_v = torch.from_numpy(obs).float() 128 | action, _, dist = self.actor(obs_v.to(device)) 129 | entropy = dist.entropy().detach().cpu().numpy() 130 | action = action.cpu().numpy() 131 | action = np.clip(action*self.action_high, self.action_low, self.action_high) 132 | obs, reward, done, info = env.step(action) 133 | 134 | rewards += reward 135 | entropys += entropy.mean() 136 | steps += 1 137 | if done: 138 | break 139 | 140 | return rewards/count, entropys/count, steps/count 141 | 142 | 143 | 144 | 145 | def compute_gae(self, next_value, rewards, masks, values): 146 | """ 147 | lambda => 1: high variance, low bias 148 | lambda => 0: low variance, high bias 149 | """ 150 | 151 | rewards_batch = list(zip(*rewards)) 152 | masks_batch = list(zip(*masks)) 153 | values_batch = torch.cat((torch.stack(values, dim=1).squeeze(2), next_value.squeeze(0)),dim=1) 154 | 155 | out_discounted_rewards = [] 156 | out_advantage = [] 157 | for rewards, masks, values in zip(rewards_batch, masks_batch, values_batch): 158 | 159 | gae = 0 160 | disc_returns = [] 161 | advantage = [] 162 | for step in reversed(range(len(rewards))): 163 | # d = r_t +gamma*V(s_t+1) - V(s) 164 | delta = rewards[step] + self.gamma * values[step + 1] * masks[step] - values[step] 165 | # sum(lambda*gamma)^t* delta_t+1 166 | gae = delta + self.gamma * self.lambda_ * masks[step] * gae 167 | 168 | disc_returns.insert(0, gae + values[step]) # adding values since we want the returns and not the advantage yet! A(a,s) = Q"returns" - V(s) 169 | advantage.insert(0, gae) 170 | 171 | out_discounted_rewards.append(disc_returns) 172 | out_advantage.append(advantage) 173 | 174 | return torch.FloatTensor(out_discounted_rewards).flatten().unsqueeze(1), torch.FloatTensor(out_advantage).flatten().unsqueeze(1) 175 | 176 | 177 | def ppo_iter(self, states, actions, log_probs, advantage, discounted_rewards): 178 | batch_size = len(states) 179 | 180 | for i in range(batch_size // self.mini_batch_size): 181 | rand_ids = np.random.randint(0, batch_size, self.mini_batch_size) 182 | 183 | yield states[rand_ids], actions[rand_ids], log_probs[rand_ids], advantage[rand_ids], discounted_rewards[rand_ids] 184 | 185 | 186 | 187 | def ppo_update(self, states, actions, log_probs, advantage, discounted_rewards, eps_clip=0.2): 188 | """ 189 | 190 | """ 191 | 192 | a_loss_batch = [] 193 | c_loss_batch = [] 194 | 195 | 196 | for _ in range(self.ppo_epochs): 197 | for states_i, old_actions, old_logprobs, advantage_i, discounted_reward_i in self.ppo_iter(states, actions, log_probs, advantage, discounted_rewards): 198 | 199 | self.optimizer_c.zero_grad() 200 | #train critic 201 | new_value = self.critic(states_i.to(device)) 202 | 203 | c_loss = .5 * (discounted_reward_i - new_value).pow(2).mean() 204 | c_loss.backward() 205 | #print("C: ", c_loss) 206 | clip_grad_norm_(self.critic.parameters(),CLIP_GRAD) 207 | self.optimizer_c.step() 208 | 209 | #train actor 210 | self.optimizer_a.zero_grad() 211 | _, _, dist = self.actor(states_i.to(device)) 212 | new_logprobs = dist.log_prob(old_actions) 213 | entropy = dist.entropy() 214 | 215 | ratio = torch.exp(new_logprobs - old_logprobs.detach()) 216 | surr = ratio * advantage_i 217 | clip = torch.clamp(ratio, 1.0 - eps_clip, 1.0 + eps_clip) 218 | a_loss = torch.min(surr, clip*advantage_i ) 219 | a_loss = (- a_loss - ENTROPY_BONUS * entropy).mean() 220 | 
clip_grad_norm_(self.actor.parameters(),CLIP_GRAD) 221 | a_loss.backward(retain_graph=True) 222 | 223 | self.optimizer_a.step() 224 | 225 | c_loss_batch.append(c_loss.detach().numpy()) 226 | a_loss_batch.append(a_loss.detach().numpy()) 227 | 228 | return np.array(c_loss_batch).mean(), np.array(a_loss_batch).mean() 229 | 230 | 231 | 232 | def main(args): 233 | torch.multiprocessing.freeze_support() 234 | t0 = time.time() 235 | ENV = args.env #"MountainCarContinuous-v0" #Pendulum-v0 LunarLanderContinuous-v0 236 | 237 | env = gym.make(ENV)#Creating the Environment 238 | writer = SummaryWriter("runs/"+args.info) 239 | n_cpu = args.worker 240 | 241 | envs = MultiPro.SubprocVecEnv([lambda: gym.make(ENV) for i in range(n_cpu)]) 242 | seed = args.seed 243 | 244 | torch.manual_seed(seed) 245 | torch.cuda.manual_seed(seed) 246 | np.random.seed(seed) 247 | env.seed(seed) 248 | 249 | 250 | state_size = env.observation_space.shape[0] 251 | action_size = env.action_space.shape[0] 252 | action_high_low = (env.action_space.high[0], env.action_space.low[0]) 253 | 254 | agent = Agent(state_size, action_size, action_high_low= action_high_low, hidden_size=args.layer_size, LR_A=args.lr, LR_C=args.lr, gamma=args.gamma, lambda_=args.lambda_, mini_batch_size=args.mini_batch_size, ppo_epochs=args.ppo_updates) 255 | 256 | max_episodes = args.ep 257 | plot_rewards = [] 258 | max_steps = int(args.max_steps/n_cpu) 259 | 260 | # calc reshape stacking size 261 | shape = (max_steps*n_cpu, state_size) 262 | 263 | for ep in range(max_episodes+1): 264 | states = envs.reset() 265 | 266 | done = False 267 | 268 | state_batch = [] 269 | value_batch = [] 270 | action_batch = [] 271 | logprob_batch = [] 272 | rewards_batch = [] 273 | masks = [] 274 | for step in range(max_steps): 275 | 276 | states = torch.from_numpy(states).float() 277 | 278 | action, logprob, _ = agent.actor(states.to(device)) 279 | value = agent.critic(states.to(device)) 280 | action_v = action.cpu().numpy() 281 | 282 | action_v = np.clip(action_v*env.action_space.high[0], env.action_space.low[0], env.action_space.high[0]) 283 | next_states, reward, done, _ = envs.step(action_v) 284 | 285 | state_batch.append(states) 286 | value_batch.append(value) 287 | logprob_batch.append(logprob) 288 | action_batch.append(action) 289 | rewards_batch.append(torch.from_numpy(reward).float()) 290 | masks.append(torch.from_numpy(1 - done).float()) 291 | 292 | states = next_states 293 | 294 | 295 | if np.any(done): 296 | states = envs.reset() 297 | 298 | # stack all gathered data 299 | 300 | state_batch = torch.stack(state_batch, dim=1).reshape(shape) 301 | actions_batch = torch.stack(action_batch, dim=1).reshape(max_steps*n_cpu,action_size) 302 | logprob_batch = torch.stack(logprob_batch, dim=1).reshape(max_steps*n_cpu,action_size).detach() 303 | 304 | 305 | # calculate advantage: 306 | next_value = agent.critic(torch.from_numpy(next_states).float()) 307 | discounted_rewards, advantage = agent.compute_gae(next_value, rewards_batch, masks, value_batch) 308 | 309 | # normalize advantage: 310 | advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-5) 311 | 312 | c_loss, a_loss = agent.ppo_update(states=state_batch, actions=actions_batch, log_probs=logprob_batch, advantage=advantage.detach() , discounted_rewards=discounted_rewards.detach()) 313 | writer.add_scalar("critic_loss", c_loss, ep) 314 | writer.add_scalar("actor_loss", a_loss, ep) 315 | 316 | 317 | if ep != 0 and ep % 5 == 0: 318 | test_rewards, test_entropy, test_steps = agent.test_net(env) 319 | 
writer.add_scalar("entropy",test_entropy, ep) 320 | writer.add_scalar("max_reward",test_rewards, ep) 321 | plot_rewards.append(test_rewards) 322 | 323 | print("\rEpisode: {} | Ep_Reward: {:.2f} | Average_100: {:.2f}".format(ep, test_rewards, np.mean(plot_rewards[-100:])), end = "", flush = True) 324 | 325 | envs.close() 326 | t1 = time.time() 327 | plt.pause(60) 328 | env.close() 329 | print("training took {} min!".format((t1-t0)/60)) 330 | 331 | if __name__ == "__main__": 332 | parser = argparse.ArgumentParser(description="") 333 | parser.add_argument("-env", type=str,default="Pendulum-v0", help="Environment name") 334 | parser.add_argument("-info", type=str, help="Information or name of the run") 335 | parser.add_argument("-ep", type=int, default=200, help="The amount of training episodes, default is 200") 336 | parser.add_argument("-seed", type=int, default=0, help="Seed for the env and torch network weights, default is 0") 337 | parser.add_argument("-lr", type=float, default=5e-4, help="Learning rate of adapting the network weights, default is 5e-4") 338 | parser.add_argument("-entropy_bonus", type=float, default=1e-3, help="Entropy bonus for exploration - default is 1e-2") 339 | parser.add_argument("-layer_size", type=int, default=64, help="Number of nodes per neural network layer, default is 64") 340 | parser.add_argument("-worker", type=int, default=8, help="Number of parallel worker -default is 8") 341 | parser.add_argument("-lambda_", type=float, default=0.95, help="GAE lambda") 342 | parser.add_argument("-g", "--gamma", type=float, default=0.99, help="discount factor gamma, default is 0.99") 343 | parser.add_argument("-CG", "--clip_grad", type=float, default=0.25, help="Clip the gradients for updating the network parameters, default is 0.25") 344 | parser.add_argument("-ms", "--max_steps", type=int, default=2048, help="Maximum steps that are taken by the agent in the environment before updating") 345 | parser.add_argument("-mbs", "--mini_batch_size", type=int, default=256, help="Mini Batch size for the ppo updates, default is 256") 346 | parser.add_argument("-updates", "--ppo_updates", type=int, default=7, help="Number of PPO updates, default is 7") 347 | args = parser.parse_args() 348 | 349 | device = "cuda" if torch.cuda.is_available() else "cpu" 350 | print("Using: ", device) 351 | 352 | ENTROPY_BONUS = args.entropy_bonus 353 | CLIP_GRAD = args.clip_grad 354 | main(args) 355 | -------------------------------------------------------------------------------- /ContinousControl/PPO_test_crawler.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from unityagents import UnityEnvironment\n", 10 | "import gym\n", 11 | "import math\n", 12 | "import torch \n", 13 | "import torch.nn as nn\n", 14 | "import torch.optim as optim\n", 15 | "from torch.distributions import Normal\n", 16 | "import torch.nn.functional as F\n", 17 | "import numpy as np\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "from torch.nn.utils import clip_grad_norm_\n", 20 | "from collections import deque" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stderr", 30 | "output_type": "stream", 31 | "text": [ 32 | "INFO:unityagents:\n", 33 | "'Academy' started successfully!\n", 34 | "Unity Academy name: Academy\n", 35 | " Number of Brains: 1\n", 36 | " Number of 
External Brains : 1\n", 37 | " Lesson number : 0\n", 38 | " Reset Parameters :\n", 39 | "\t\t\n", 40 | "Unity brain name: CrawlerBrain\n", 41 | " Number of Visual Observations (per agent): 0\n", 42 | " Vector Observation space type: continuous\n", 43 | " Vector Observation space size (per agent): 129\n", 44 | " Number of stacked Vector Observation: 1\n", 45 | " Vector Action space type: continuous\n", 46 | " Vector Action space size (per agent): 20\n", 47 | " Vector Action descriptions: , , , , , , , , , , , , , , , , , , , \n" 48 | ] 49 | }, 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "Number of agents: 12\n", 55 | "Size of each action: 20\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "env = UnityEnvironment(file_name='Crawler_Linux/Crawler.x86_64')\n", 61 | "# get the default brain\n", 62 | "brain_name = env.brain_names[0]\n", 63 | "brain = env.brains[brain_name]\n", 64 | "# reset the environment\n", 65 | "env_info = env.reset(train_mode=False)[brain_name]\n", 66 | "\n", 67 | "# number of agents\n", 68 | "num_agents = len(env_info.agents)\n", 69 | "print('Number of agents:', num_agents)\n", 70 | "\n", 71 | "# size of each action\n", 72 | "action_size = brain.vector_action_space_size\n", 73 | "print('Size of each action:', action_size)\n", 74 | "\n", 75 | "# examine the state space \n", 76 | "states_ = env_info.vector_observations\n", 77 | "state_size = states_.shape[1]\n" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 3, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "class Critic(nn.Module):\n", 87 | " def __init__(self, input_shape, layer_size):\n", 88 | " super(Critic, self).__init__()\n", 89 | " self.net = nn.Sequential(nn.Linear(input_shape, layer_size),\n", 90 | " nn.ReLU(),\n", 91 | " nn.Linear(layer_size,layer_size),\n", 92 | " nn.ReLU(),\n", 93 | " nn.Linear(layer_size, 1))\n", 94 | " \n", 95 | " def forward(self,x):\n", 96 | " x = self.net(x)\n", 97 | " return x\n", 98 | " \n", 99 | "class Actor(nn.Module):\n", 100 | " def __init__(self, input_shape, output_shape,layer_size):\n", 101 | " super(Actor, self).__init__()\n", 102 | " self.net = nn.Sequential(nn.Linear(input_shape, layer_size),\n", 103 | " nn.ReLU(),\n", 104 | " nn.Linear(layer_size,layer_size),\n", 105 | " nn.ReLU(),\n", 106 | " )\n", 107 | " self.mean = nn.Sequential(nn.Linear(layer_size, output_shape),\n", 108 | " nn.Tanh()) # tanh squashed output to the range of -1..1\n", 109 | " self.variance =nn.Sequential(nn.Linear(layer_size, output_shape),\n", 110 | " nn.Softplus()) # log(1 + e^x) has the shape of a smoothed ReLU\n", 111 | " \n", 112 | " def forward(self, x):\n", 113 | " x = self.net(x) \n", 114 | " sigma = torch.sqrt(self.variance(x).cpu())\n", 115 | " m = Normal(self.mean(x).cpu(), sigma)\n", 116 | " actions = m.sample()\n", 117 | " actions = torch.clamp(actions, -1, 1) # usually clipping between -1,1 but pendulum env has action range of -2,2\n", 118 | "\n", 119 | " logprobs = m.log_prob(actions) #for the optimization step we create a new distribution based on the new mean and variance - still taking the logprobs based on the old actions!\n", 120 | "\n", 121 | " \n", 122 | " return actions, logprobs, m\n", 123 | " \n", 124 | "class Agent():\n", 125 | " def __init__(self, state_size, action_size, ppo_epochs, mini_batch_size,\\\n", 126 | " layer_size,lr_a, lr_c, gamma, entropy_beta, clip_grad):\n", 127 | " self.state_size = state_size\n", 128 | " self.action_size = action_size\n", 129 | " \n", 130 | " self.layer_size = 
layer_size\n", 131 | " self.gamma = gamma\n", 132 | " self.entropy_beta = entropy_beta\n", 133 | " self.clip_grad = clip_grad\n", 134 | " \n", 135 | " self.ppo_epochs = ppo_epochs\n", 136 | " self.mini_batch_size = mini_batch_size\n", 137 | " \n", 138 | " self.actor = Actor(state_size, action_size,layer_size).to(device)\n", 139 | " self.critic = Critic(state_size,layer_size).to(device)\n", 140 | " self.a_optimizer = optim.RMSprop(params = self.actor.parameters(),lr = lr_a)\n", 141 | " self.c_optimizer = optim.RMSprop(params = self.critic.parameters(),lr = lr_c)\n", 142 | " \n", 143 | " def act(self, states):\n", 144 | " self.actor.eval()\n", 145 | " with torch.no_grad():\n", 146 | " actions, logprobs ,_ = self.actor(torch.from_numpy(states).float().to(device))\n", 147 | " self.actor.train()\n", 148 | " return actions.cpu().numpy(), logprobs\n", 149 | " \n", 150 | "\n", 151 | " def compute_returns(self,rewards_tensor, masks_tensor):\n", 152 | " output = []\n", 153 | " for rewards, masks in zip(rewards_tensor, masks_tensor):\n", 154 | " R = 0 \n", 155 | " returns = []\n", 156 | " for step in reversed(range(len(rewards))):\n", 157 | " R = rewards[step] + self.gamma * R * masks[step]\n", 158 | " returns.insert(0, R)\n", 159 | " output.append(returns)\n", 160 | " output = list(zip(*output))\n", 161 | " discounted_rewards = [torch.FloatTensor(i).unsqueeze(1) for i in output]\n", 162 | " return torch.cat(discounted_rewards)\n", 163 | "\n", 164 | "\n", 165 | "\n", 166 | " def ppo_iter(self, states, actions, log_probs, advantage, discounted_rewards):\n", 167 | " batch_size = len(states)#.shape[]\n", 168 | "\n", 169 | " for i in range(batch_size // self.mini_batch_size):\n", 170 | " rand_ids = np.random.randint(0, batch_size, self.mini_batch_size)\n", 171 | "\n", 172 | " yield torch.cat(states)[rand_ids], torch.cat(actions)[rand_ids], torch.cat(log_probs)[rand_ids], advantage[rand_ids], discounted_rewards[rand_ids]\n", 173 | "\n", 174 | "\n", 175 | "\n", 176 | " def ppo_update(self, states, actions, log_probs, advantage, discounted_rewards, eps_clip=0.2):\n", 177 | " \"\"\"\n", 178 | "\n", 179 | " \"\"\"\n", 180 | "\n", 181 | " a_loss_batch = []\n", 182 | " c_loss_batch = []\n", 183 | " entropy_batch = []\n", 184 | "\n", 185 | " for _ in range(self.ppo_epochs):\n", 186 | " for states_i, old_actions, old_logprobs, advantage_i, discounted_reward_i in self.ppo_iter(states, actions, log_probs, advantage, discounted_rewards):\n", 187 | "\n", 188 | " self.c_optimizer.zero_grad()\n", 189 | " #tran critic\n", 190 | " new_value = self.critic(states_i.to(device))\n", 191 | " c_loss = F.mse_loss(new_value, discounted_reward_i).cpu()\n", 192 | " c_loss.backward(retain_graph=True)\n", 193 | " clip_grad_norm_(self.critic.parameters(),self.clip_grad)\n", 194 | " self.c_optimizer.step()\n", 195 | "\n", 196 | " c_loss_batch.append(c_loss.detach().cpu().numpy())\n", 197 | "\n", 198 | "\n", 199 | " #train actor\n", 200 | " self.a_optimizer.zero_grad()\n", 201 | " _, _, dist = self.actor(states_i.to(device))\n", 202 | " new_logprobs = dist.log_prob(old_actions)\n", 203 | " entropy = dist.entropy().mean()\n", 204 | " entropy_batch.append(entropy.detach().cpu().numpy())\n", 205 | "\n", 206 | "\n", 207 | " ratio = torch.exp(new_logprobs - old_logprobs.detach()).cpu()\n", 208 | " surr = ratio * advantage_i.cpu()\n", 209 | " clip = torch.clamp(ratio, 1.0 - eps_clip, 1.0 + eps_clip)\n", 210 | "\n", 211 | " \n", 212 | " a_loss = - (torch.min(surr, clip * advantage_i.cpu() ).mean()) + self.entropy_beta * entropy.cpu()\n", 213 
| " a_loss.backward(retain_graph=True)\n", 214 | " clip_grad_norm_(self.actor.parameters(),self.clip_grad)\n", 215 | " self.a_optimizer.step()\n", 216 | "\n", 217 | " a_loss_batch.append(a_loss.detach().cpu().numpy())\n", 218 | "\n", 219 | "\n", 220 | " return np.array(c_loss_batch).mean(), np.array(a_loss_batch).mean(), np.array(entropy_batch).mean()\n", 221 | "\n", 222 | "def list_to_tensor(list_):\n", 223 | " return np.array(list(zip(*list_)))" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "torch.manual_seed(42)\n", 233 | "torch.cuda.manual_seed(42)\n", 234 | "np.random.seed(42)\n", 235 | "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", 236 | "agent = Agent(state_size = state_size, action_size = action_size ,ppo_epochs = 5, mini_batch_size = 512,\\\n", 237 | " layer_size = 512 ,lr_a = 1e-4, lr_c = 1e-4, gamma = 0.99 , entropy_beta = 1e-4, clip_grad = 1)\n", 238 | "\n", 239 | "agent.actor.load_state_dict(torch.load(\"Crawler_weights/actor100.pth\"))\n", 240 | "agent.actor.eval()\n", 241 | "\n", 242 | "max_episodes = 0\n", 243 | "\n", 244 | "c_loss_list = []\n", 245 | "a_loss_list = []\n", 246 | "entropy_list = []\n", 247 | "\n", 248 | "\n", 249 | "average_100 = deque(maxlen = 100)\n", 250 | "\n", 251 | "mean_rewards = []\n", 252 | "max_rewards = []\n", 253 | "min_rewards = []\n", 254 | "average_100_rewards = []\n", 255 | "\n", 256 | "max_steps = 2024\n", 257 | "\n", 258 | "for ep in range(max_episodes+1):\n", 259 | " env_info = env.reset(train_mode=False)[brain_name] # reset the environment \n", 260 | " states = env_info.vector_observations # get the current state (for each agent)\n", 261 | " done = False\n", 262 | " \n", 263 | " states_batch = []\n", 264 | " values_batch = []\n", 265 | " actions_batch = []\n", 266 | " logprobs_batch = []\n", 267 | " rewards_batch = []\n", 268 | " masks = []\n", 269 | " scores = np.zeros(num_agents)\n", 270 | " while True:\n", 271 | "\n", 272 | " actions, logprobs = agent.act(states) \n", 273 | " env_info = env.step(actions)[brain_name] # send all actions to tne environment\n", 274 | " next_states = env_info.vector_observations # get next state (for each agent)\n", 275 | " rewards = env_info.rewards # get reward (for each agent)\n", 276 | " dones = env_info.local_done # see if episode finished\n", 277 | " scores += env_info.rewards\n", 278 | " \n", 279 | " states = next_states\n", 280 | "\n", 281 | " if np.any(dones):\n", 282 | " break\n", 283 | "\n", 284 | " \n", 285 | " mean_rewards.append(np.mean(scores))\n", 286 | " min_rewards.append(np.min(scores))\n", 287 | " max_rewards.append(np.max(scores))\n", 288 | " average_100.append(np.mean(scores))\n", 289 | " average_100_rewards.append(np.array(average_100).mean())\n", 290 | " \n", 291 | " print(\"\\rEpisode: {} | mean_reward: {:.2f} | min_reward: {:.2f} | max_reward: {:.2f} | Average_100: {:.2f}\".format(ep, np.mean(scores), np.min(scores), np.max(scores), np.mean(average_100)), end = \"\", flush = True)\n", 292 | " if ep != 0 and ep % 100 == 0:\n", 293 | " print(\"\\rEpisode: {} | mean_reward: {:.2f} | min_reward: {:.2f} | max_reward: {:.2f} | Average_100: {:.2f}\".format(ep, np.mean(scores), np.min(scores), np.max(scores), np.mean(average_100)))\n", 294 | "\n", 295 | " \n", 296 | "\n", 297 | " \n", 298 | "env.close()\n", 299 | "# PLOTTING RESULTS\n", 300 | "\n", 301 | "plt.figure(figsize = (20,7))\n", 302 | "plt.subplot(1,4,1)\n", 303 | "plt.title(\"actor loss\")\n", 304 | 
"plt.plot(a_loss_list)\n", 305 | "plt.subplot(1,4,2)\n", 306 | "plt.title(\"critic loss\")\n", 307 | "plt.plot(c_loss_list)\n", 308 | "plt.subplot(1,4,3)\n", 309 | "plt.title(\"entropy\")\n", 310 | "plt.plot(entropy_list)\n", 311 | "plt.subplot(1,4,4)\n", 312 | "plt.title(\"rewards\")\n", 313 | "plt.plot(mean_rewards, c = \"b\")\n", 314 | "plt.plot(min_rewards, c = \"y\")\n", 315 | "plt.plot(max_rewards, c = \"r\")\n", 316 | "plt.plot(average_100_rewards, c = \"g\")\n", 317 | "plt.show()" 318 | ] 319 | } 320 | ], 321 | "metadata": { 322 | "kernelspec": { 323 | "display_name": "Python 3", 324 | "language": "python", 325 | "name": "python3" 326 | }, 327 | "language_info": { 328 | "codemirror_mode": { 329 | "name": "ipython", 330 | "version": 3 331 | }, 332 | "file_extension": ".py", 333 | "mimetype": "text/x-python", 334 | "name": "python", 335 | "nbconvert_exporter": "python", 336 | "pygments_lexer": "ipython3", 337 | "version": "3.6.5" 338 | } 339 | }, 340 | "nbformat": 4, 341 | "nbformat_minor": 2 342 | } 343 | -------------------------------------------------------------------------------- /ContinousControl/Parallel_processing.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process, Pipe 2 | import numpy as np 3 | 4 | def worker(remote, parent_remote, env_fn_wrapper): 5 | parent_remote.close() 6 | env = env_fn_wrapper.x() 7 | while True: 8 | cmd, data = remote.recv() 9 | if cmd == 'step': 10 | ob, reward, done, info = env.step(data) 11 | if done: 12 | ob = env.reset() 13 | remote.send((ob, reward, done, info)) 14 | elif cmd == 'reset': 15 | ob = env.reset() 16 | remote.send(ob) 17 | elif cmd == 'reset_task': 18 | ob = env.reset_task() 19 | remote.send(ob) 20 | elif cmd == 'close': 21 | remote.close() 22 | break 23 | elif cmd == 'get_spaces': 24 | remote.send((env.observation_space, env.action_space)) 25 | else: 26 | raise NotImplementedError 27 | 28 | class CloudpickleWrapper(object): 29 | """ 30 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) 31 | """ 32 | def __init__(self, x): 33 | self.x = x 34 | def __getstate__(self): 35 | import cloudpickle 36 | return cloudpickle.dumps(self.x) 37 | def __setstate__(self, ob): 38 | import pickle 39 | self.x = pickle.loads(ob) 40 | 41 | 42 | class VecEnv(object): 43 | """ 44 | An abstract asynchronous, vectorized environment. 45 | """ 46 | def __init__(self, num_envs, observation_space, action_space): 47 | self.num_envs = num_envs 48 | self.observation_space = observation_space 49 | self.action_space = action_space 50 | 51 | def reset(self): 52 | """ 53 | Reset all the environments and return an array of 54 | observations, or a tuple of observation arrays. 55 | If step_async is still doing work, that work will 56 | be cancelled and step_wait() should not be called 57 | until step_async() is invoked again. 58 | """ 59 | pass 60 | 61 | def step_async(self, actions): 62 | """ 63 | Tell all the environments to start taking a step 64 | with the given actions. 65 | Call step_wait() to get the results of the step. 66 | You should not call this if a step_async run is 67 | already pending. 68 | """ 69 | pass 70 | 71 | def step_wait(self): 72 | """ 73 | Wait for the step taken with step_async(). 74 | Returns (obs, rews, dones, infos): 75 | - obs: an array of observations, or a tuple of 76 | arrays of observations. 
77 | - rews: an array of rewards 78 | - dones: an array of "episode done" booleans 79 | - infos: a sequence of info objects 80 | """ 81 | pass 82 | 83 | def close(self): 84 | """ 85 | Clean up the environments' resources. 86 | """ 87 | pass 88 | 89 | def step(self, actions): 90 | self.step_async(actions) 91 | return self.step_wait() 92 | 93 | class SubprocVecEnv(VecEnv): 94 | def __init__(self, env_fns, spaces=None): 95 | """ 96 | envs: list of gym environments to run in subprocesses 97 | """ 98 | self.waiting = False 99 | self.closed = False 100 | nenvs = len(env_fns) 101 | self.nenvs = nenvs 102 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) 103 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 104 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 105 | for p in self.ps: 106 | p.daemon = True # if the main process crashes, we should not cause things to hang 107 | p.start() 108 | for remote in self.work_remotes: 109 | remote.close() 110 | 111 | self.remotes[0].send(('get_spaces', None)) 112 | observation_space, action_space = self.remotes[0].recv() 113 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 114 | 115 | def step_async(self, actions): 116 | for remote, action in zip(self.remotes, actions): 117 | remote.send(('step', action)) 118 | self.waiting = True 119 | 120 | def step_wait(self): 121 | results = [remote.recv() for remote in self.remotes] 122 | self.waiting = False 123 | obs, rews, dones, infos = zip(*results) 124 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 125 | 126 | def reset(self): 127 | for remote in self.remotes: 128 | remote.send(('reset', None)) 129 | return np.stack([remote.recv() for remote in self.remotes]) 130 | 131 | def reset_task(self): 132 | for remote in self.remotes: 133 | remote.send(('reset_task', None)) 134 | return np.stack([remote.recv() for remote in self.remotes]) 135 | 136 | def close(self): 137 | if self.closed: 138 | return 139 | if self.waiting: 140 | for remote in self.remotes: 141 | remote.recv() 142 | for remote in self.remotes: 143 | remote.send(('close', None)) 144 | for p in self.ps: 145 | p.join() 146 | self.closed = True 147 | 148 | def __len__(self): 149 | return self.nenvs -------------------------------------------------------------------------------- /ContinousControl/SAC_script.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Nov 6 12:24:34 2019 4 | 5 | @author: Z0014354 6 | """ 7 | 8 | 9 | import numpy as np 10 | import random 11 | 12 | import gym 13 | import gym_cartpole_swingup 14 | from collections import namedtuple, deque 15 | import torch 16 | import torch.nn as nn 17 | import torch.nn.functional as F 18 | from torch.distributions import Normal, MultivariateNormal 19 | 20 | import torch.optim as optim 21 | import time 22 | from tensorboardX import SummaryWriter 23 | import argparse 24 | 25 | 26 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 27 | 28 | def hidden_init(layer): 29 | fan_in = layer.weight.data.size()[0] 30 | lim = 1. / np.sqrt(fan_in) 31 | return (-lim, lim) 32 | 33 | class Actor(nn.Module): 34 | """Actor (Policy) Model.""" 35 | 36 | def __init__(self, state_size, action_size, seed, hidden_size=32, init_w=3e-3, log_std_min=-20, log_std_max=2): 37 | """Initialize parameters and build model. 
38 | Params 39 | ====== 40 | state_size (int): Dimension of each state 41 | action_size (int): Dimension of each action 42 | seed (int): Random seed 43 | fc1_units (int): Number of nodes in first hidden layer 44 | fc2_units (int): Number of nodes in second hidden layer 45 | """ 46 | super(Actor, self).__init__() 47 | self.seed = torch.manual_seed(seed) 48 | self.log_std_min = log_std_min 49 | self.log_std_max = log_std_max 50 | 51 | self.fc1 = nn.Linear(state_size, hidden_size) 52 | self.fc2 = nn.Linear(hidden_size, hidden_size) 53 | 54 | self.mu = nn.Linear(hidden_size, action_size) 55 | self.log_std_linear = nn.Linear(hidden_size, action_size) 56 | 57 | def reset_parameters(self): 58 | self.fc1.weight.data.uniform_(*hidden_init(self.fc1)) 59 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2)) 60 | self.mu.weight.data.uniform_(-init_w, init_w) 61 | self.log_std_linear.weight.data.uniform_(-init_w, init_w) 62 | 63 | def forward(self, state): 64 | 65 | x = F.relu(self.fc1(state), inplace=True) 66 | x = F.relu(self.fc2(x), inplace=True) 67 | mu = self.mu(x) 68 | 69 | log_std = self.log_std_linear(x) 70 | log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) 71 | return mu, log_std 72 | 73 | def evaluate(self, state, epsilon=1e-6): 74 | mu, log_std = self.forward(state) 75 | std = log_std.exp() 76 | dist = Normal(0, 1) 77 | e = dist.sample().to(device) 78 | action = torch.tanh(mu + e * std) 79 | log_prob = Normal(mu, std).log_prob(mu + e * std) - torch.log(1 - action.pow(2) + epsilon) 80 | #action = torch.clamp(action*action_high, action_low, action_high) 81 | return action, log_prob 82 | 83 | 84 | def get_action(self, state): 85 | """ 86 | returns the action based on a squashed gaussian policy. That means the samples are obtained according to: 87 | a(s,e)= tanh(mu(s)+sigma(s)+e) 88 | """ 89 | state = torch.FloatTensor(state).to(device) 90 | mu, log_std = self.forward(state) 91 | std = log_std.exp() 92 | dist = Normal(0, 1) 93 | e = dist.sample().to(device) 94 | action = torch.tanh(mu + e * std).cpu() 95 | 96 | return action[0] 97 | 98 | 99 | class Critic(nn.Module): 100 | """Critic (Value) Model.""" 101 | 102 | def __init__(self, state_size, action_size, seed, hidden_size=32): 103 | """Initialize parameters and build model. 104 | Params 105 | ====== 106 | state_size (int): Dimension of each state 107 | action_size (int): Dimension of each action 108 | seed (int): Random seed 109 | hidden_size (int): Number of nodes in the network layers 110 | 111 | """ 112 | super(Critic, self).__init__() 113 | self.seed = torch.manual_seed(seed) 114 | self.fc1 = nn.Linear(state_size+action_size, hidden_size) 115 | self.fc2 = nn.Linear(hidden_size, hidden_size) 116 | self.fc3 = nn.Linear(hidden_size, 1) 117 | self.reset_parameters() 118 | 119 | def reset_parameters(self): 120 | self.fc1.weight.data.uniform_(*hidden_init(self.fc1)) 121 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2)) 122 | self.fc3.weight.data.uniform_(-3e-3, 3e-3) 123 | 124 | def forward(self, state, action): 125 | """Build a critic (value) network that maps (state, action) pairs -> Q-values.""" 126 | x = torch.cat((state, action), dim=1) 127 | x = F.relu(self.fc1(x)) 128 | x = F.relu(self.fc2(x)) 129 | return self.fc3(x) 130 | 131 | class Agent(): 132 | """Interacts with and learns from the environment.""" 133 | 134 | def __init__(self, state_size, action_size, random_seed, hidden_size, action_prior="uniform"): 135 | """Initialize an Agent object. 
136 | 137 | Params 138 | ====== 139 | state_size (int): dimension of each state 140 | action_size (int): dimension of each action 141 | random_seed (int): random seed 142 | """ 143 | self.state_size = state_size 144 | self.action_size = action_size 145 | self.seed = random.seed(random_seed) 146 | 147 | self.target_entropy = -action_size # -dim(A) 148 | self.alpha = 1 149 | self.log_alpha = torch.tensor([0.0], requires_grad=True) 150 | self.alpha_optimizer = optim.Adam(params=[self.log_alpha], lr=LR_ACTOR) 151 | self._action_prior = action_prior 152 | 153 | print("Using: ", device) 154 | 155 | # Actor Network 156 | self.actor_local = Actor(state_size, action_size, random_seed, hidden_size).to(device) 157 | self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) 158 | 159 | # Critic Network (w/ Target Network) 160 | self.critic1 = Critic(state_size, action_size, random_seed, hidden_size).to(device) 161 | self.critic2 = Critic(state_size, action_size, random_seed, hidden_size).to(device) 162 | 163 | self.critic1_target = Critic(state_size, action_size, random_seed,hidden_size).to(device) 164 | self.critic1_target.load_state_dict(self.critic1.state_dict()) 165 | 166 | self.critic2_target = Critic(state_size, action_size, random_seed,hidden_size).to(device) 167 | self.critic2_target.load_state_dict(self.critic2.state_dict()) 168 | 169 | self.critic1_optimizer = optim.Adam(self.critic1.parameters(), lr=LR_CRITIC, weight_decay=0) 170 | self.critic2_optimizer = optim.Adam(self.critic2.parameters(), lr=LR_CRITIC, weight_decay=0) 171 | 172 | # Replay memory 173 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) 174 | 175 | 176 | def step(self, state, action, reward, next_state, done, step): 177 | """Save experience in replay memory, and use random sample from buffer to learn.""" 178 | # Save experience / reward 179 | self.memory.add(state, action, reward, next_state, done) 180 | 181 | # Learn, if enough samples are available in memory 182 | if len(self.memory) > BATCH_SIZE: 183 | experiences = self.memory.sample() 184 | self.learn(step, experiences, GAMMA) 185 | 186 | 187 | def act(self, state, add_noise=True): 188 | """Returns actions for given state as per current policy.""" 189 | state = torch.from_numpy(state).float().to(device) 190 | action = self.actor_local.get_action(state).detach() 191 | return action 192 | 193 | def learn(self, step, experiences, gamma, d=1): 194 | """Updates actor, critics and entropy_alpha parameters using given batch of experience tuples. 
195 | Q_targets = r + γ * (min_critic_target(next_state, actor_target(next_state)) - α *log_pi(next_action|next_state)) 196 | Critic_loss = MSE(Q, Q_target) 197 | Actor_loss = α * log_pi(a|s) - Q(s,a) 198 | where: 199 | actor_target(state) -> action 200 | critic_target(state, action) -> Q-value 201 | Params 202 | ====== 203 | experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 204 | gamma (float): discount factor 205 | """ 206 | states, actions, rewards, next_states, dones = experiences 207 | 208 | 209 | # ---------------------------- update critic ---------------------------- # 210 | # Get predicted next-state actions and Q values from target models 211 | next_action, log_pis_next = self.actor_local.evaluate(next_states) 212 | 213 | Q_target1_next = self.critic1_target(next_states.to(device), next_action.squeeze(0).to(device)) 214 | Q_target2_next = self.critic2_target(next_states.to(device), next_action.squeeze(0).to(device)) 215 | 216 | # take the mean of both critics for updating 217 | Q_target_next = torch.min(Q_target1_next, Q_target2_next) 218 | 219 | if FIXED_ALPHA == None: 220 | # Compute Q targets for current states (y_i) 221 | Q_targets = rewards + (gamma * (1 - dones) * (Q_target_next - self.alpha * log_pis_next.squeeze(0))) 222 | else: 223 | Q_targets = rewards + (gamma * (1 - dones) * (Q_target_next - FIXED_ALPHA * log_pis_next.squeeze(0))) 224 | # Compute critic loss 225 | Q_1 = self.critic1(states, actions) 226 | Q_2 = self.critic2(states, actions) 227 | critic1_loss = 0.5*F.mse_loss(Q_1, Q_targets.detach()) 228 | critic2_loss = 0.5*F.mse_loss(Q_2, Q_targets.detach()) 229 | # Update critics 230 | # critic 1 231 | self.critic1_optimizer.zero_grad() 232 | critic1_loss.backward() 233 | self.critic1_optimizer.step() 234 | # critic 2 235 | self.critic2_optimizer.zero_grad() 236 | critic2_loss.backward() 237 | self.critic2_optimizer.step() 238 | if step % d == 0: 239 | # ---------------------------- update actor ---------------------------- # 240 | if FIXED_ALPHA == None: 241 | alpha = torch.exp(self.log_alpha) 242 | # Compute alpha loss 243 | actions_pred, log_pis = self.actor_local.evaluate(states) 244 | alpha_loss = - (self.log_alpha * (log_pis + self.target_entropy).detach()).mean() 245 | self.alpha_optimizer.zero_grad() 246 | alpha_loss.backward() 247 | self.alpha_optimizer.step() 248 | 249 | self.alpha = alpha 250 | # Compute actor loss 251 | if self._action_prior == "normal": 252 | policy_prior = MultivariateNormal(loc=torch.zeros(self.action_size), scale_tril=torch.ones(self.action_size).unsqueeze(0)) 253 | policy_prior_log_probs = policy_prior.log_prob(actions_pred) 254 | elif self._action_prior == "uniform": 255 | policy_prior_log_probs = 0.0 256 | 257 | actor_loss = (alpha * log_pis.squeeze(0) - self.critic1(states, actions_pred.squeeze(0)) - policy_prior_log_probs ).mean() 258 | else: 259 | if self._action_prior == "normal": 260 | policy_prior = MultivariateNormal(loc=torch.zeros(self.action_size), scale_tril=torch.ones(self.action_size).unsqueeze(0)) 261 | policy_prior_log_probs = policy_prior.log_prob(actions_pred) 262 | elif self._action_prior == "uniform": 263 | policy_prior_log_probs = 0.0 264 | 265 | actor_loss = (FIXED_ALPHA * log_pis.squeeze(0) - self.critic1(states, actions_pred.squeeze(0)) - policy_prior_log_probs ).mean() 266 | # Minimize the loss 267 | self.actor_optimizer.zero_grad() 268 | actor_loss.backward() 269 | self.actor_optimizer.step() 270 | 271 | # ----------------------- update target networks ----------------------- # 272 | 
self.soft_update(self.critic1, self.critic1_target, TAU) 273 | self.soft_update(self.critic2, self.critic2_target, TAU) 274 | 275 | 276 | 277 | def soft_update(self, local_model, target_model, tau): 278 | """Soft update model parameters. 279 | θ_target = τ*θ_local + (1 - τ)*θ_target 280 | Params 281 | ====== 282 | local_model: PyTorch model (weights will be copied from) 283 | target_model: PyTorch model (weights will be copied to) 284 | tau (float): interpolation parameter 285 | """ 286 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): 287 | target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) 288 | 289 | class ReplayBuffer: 290 | """Fixed-size buffer to store experience tuples.""" 291 | 292 | def __init__(self, action_size, buffer_size, batch_size, seed): 293 | """Initialize a ReplayBuffer object. 294 | Params 295 | ====== 296 | buffer_size (int): maximum size of buffer 297 | batch_size (int): size of each training batch 298 | """ 299 | self.action_size = action_size 300 | self.memory = deque(maxlen=buffer_size) # internal memory (deque) 301 | self.batch_size = batch_size 302 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 303 | self.seed = random.seed(seed) 304 | 305 | def add(self, state, action, reward, next_state, done): 306 | """Add a new experience to memory.""" 307 | e = self.experience(state, action, reward, next_state, done) 308 | self.memory.append(e) 309 | 310 | def sample(self): 311 | """Randomly sample a batch of experiences from memory.""" 312 | experiences = random.sample(self.memory, k=self.batch_size) 313 | 314 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device) 315 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(device) 316 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device) 317 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device) 318 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device) 319 | 320 | return (states, actions, rewards, next_states, dones) 321 | 322 | def __len__(self): 323 | """Return the current size of internal memory.""" 324 | return len(self.memory) 325 | 326 | 327 | 328 | def SAC(n_episodes=200, max_t=500, print_every=10): 329 | scores_deque = deque(maxlen=100) 330 | scores = [] 331 | average_100_scores = [] 332 | 333 | for i_episode in range(1, n_episodes+1): 334 | 335 | state = env.reset() 336 | state = state.reshape((1,state_size)) 337 | score = 0 338 | for t in range(max_t): 339 | 340 | 341 | action = agent.act(state) 342 | action_v = action[0].numpy() 343 | action_v = np.clip(action_v*action_high, action_low, action_high) 344 | next_state, reward, done, info = env.step(action_v) 345 | next_state = next_state.reshape((1,state_size)) 346 | agent.step(state, action, reward, next_state, done, t) 347 | state = next_state 348 | score += reward 349 | 350 | if done: 351 | break 352 | 353 | scores_deque.append(score) 354 | writer.add_scalar("max_reward", score, i_episode) 355 | average_100_scores.append(np.mean(scores_deque)) 356 | 357 | print('\rEpisode {} Reward: {:.2f} Average100 Score: {:.2f}'.format(i_episode, score, np.mean(scores_deque)), end="") 358 | if i_episode % print_every == 0: 359 | print('\rEpisode {} Reward: {:.2f} 
Average100 Score: {:.2f}'.format(i_episode, score, np.mean(scores_deque))) 360 | 361 | 362 | torch.save(agent.actor_local.state_dict(), args.info + ".pt") 363 | return scores, average_100_scores 364 | 365 | 366 | 367 | def play(): 368 | agent.actor_local.eval() 369 | for i_episode in range(1): 370 | 371 | state = env.reset() 372 | state = state.reshape((1,state_size)) 373 | 374 | while True: 375 | action = agent.act(state) 376 | action_v = action[0].numpy() 377 | action_v = np.clip(action_v*action_high, action_low, action_high) 378 | next_state, reward, done, info = env.step(action_v) 379 | next_state = next_state.reshape((1,state_size)) 380 | state = next_state 381 | 382 | if done: 383 | break 384 | 385 | 386 | 387 | parser = argparse.ArgumentParser(description="") 388 | parser.add_argument("-env", type=str,default="Pendulum-v0", help="Environment name") 389 | parser.add_argument("-info", type=str, help="Information or name of the run") 390 | parser.add_argument("-ep", type=int, default=200, help="The amount of training episodes, default is 200") 391 | parser.add_argument("-seed", type=int, default=0, help="Seed for the env and torch network weights, default is 0") 392 | parser.add_argument("-lr", type=float, default=5e-4, help="Learning rate of adapting the network weights, default is 5e-4") 393 | parser.add_argument("-a", "--alpha", type=float, help="entropy alpha value, if not choosen the value is leaned by the agent") 394 | parser.add_argument("-layer_size", type=int, default=64, help="Number of nodes per neural network layer, default is 64") 395 | parser.add_argument("-repm", "--replay_memory", type=float, default=1e6, help="Size of the Replay memory, default is 1e6") 396 | parser.add_argument("-bs", "--batch_size", type=int, default=256, help="Batch size, default is 256") 397 | parser.add_argument("-t", "--tau", type=float, default=1e-2, help="Softupdate factor tau, default is 1e-2") 398 | parser.add_argument("-g", "--gamma", type=float, default=0.99, help="discount factor gamma, default is 0.99") 399 | parser.add_argument("--saved_model", type=str, default=None, help="Load a saved model to perform a test run!") 400 | args = parser.parse_args() 401 | 402 | 403 | env_name = args.env 404 | seed = args.seed 405 | n_episodes = args.ep 406 | GAMMA = args.gamma 407 | TAU = args.tau 408 | HIDDEN_SIZE = args.layer_size 409 | BUFFER_SIZE = int(args.replay_memory) 410 | BATCH_SIZE = args.batch_size # minibatch size 411 | LR_ACTOR = args.lr # learning rate of the actor 412 | LR_CRITIC = args.lr # learning rate of the critic 413 | FIXED_ALPHA = args.alpha 414 | saved_model = args.saved_model 415 | 416 | t0 = time.time() 417 | writer = SummaryWriter("runs/"+args.info) 418 | env = gym.make(env_name) 419 | action_high = env.action_space.high[0] 420 | action_low = env.action_space.low[0] 421 | torch.manual_seed(seed) 422 | env.seed(seed) 423 | state_size = env.observation_space.shape[0] 424 | action_size = env.action_space.shape[0] 425 | agent = Agent(state_size=state_size, action_size=action_size, random_seed=seed,hidden_size=HIDDEN_SIZE, action_prior="uniform") #"normal" 426 | 427 | if saved_model != None: 428 | agent.actor_local.load_state_dict(torch.load(saved_model)) 429 | play() 430 | else: 431 | scores, average_100 = SAC(n_episodes=n_episodes, max_t=2300, print_every=10) 432 | t1 = time.time() 433 | env.close() 434 | print("training took {} min!".format((t1-t0)/60)) 435 | 436 | -------------------------------------------------------------------------------- /Cross_entropy/Cross_entropy.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import wrappers 4 | import torch 5 | import torch.nn as nn 6 | from torch import optim 7 | from collections import namedtuple 8 | from tensorboardX import SummaryWriter 9 | # Memory 10 | Episode = namedtuple("Episode", field_names = ["reward","steps"]) 11 | EpisodeStep = namedtuple("EpisodeStep", field_names = ["state", "action"]) 12 | 13 | 14 | class Network(nn.Module): 15 | def __init__(self, input_shape, output_shape): 16 | super(Network, self).__init__() 17 | 18 | 19 | self.net = nn.Sequential( 20 | nn.Linear(input_shape, 128), 21 | nn.ReLU(), 22 | nn.Linear(128, output_shape) 23 | ) 24 | 25 | def forward(self,x): 26 | return self.net(x) 27 | 28 | def filter_batch(batch, percentile = 70): 29 | rewards = list(map(lambda s: s.reward, batch)) 30 | reward_bound = np.percentile(rewards, percentile) 31 | reward_mean = float(np.mean(rewards)) 32 | 33 | train_obs = [] 34 | train_act = [] 35 | for example in batch: 36 | if example.reward < reward_bound: 37 | continue 38 | train_obs.extend(map(lambda step: step.state, example.steps)) 39 | train_act.extend(map(lambda step: step.action, example.steps)) 40 | train_obs_vector = torch.FloatTensor(train_obs) 41 | train_act_vector = torch.LongTensor(train_act) 42 | return train_obs_vector, train_act_vector, reward_bound, reward_mean 43 | 44 | def iterative_batches(env, network, batch_size = 16): 45 | batch = [] 46 | episode_reward = 0.0 47 | episode_steps = [] 48 | state = env.reset() 49 | softmax = nn.Softmax(dim =1) 50 | 51 | while True: 52 | state_vector = torch.Tensor([state]) 53 | action_probs_vector = softmax(network(state_vector)) 54 | 55 | action_probs = action_probs_vector.data.numpy()[0] 56 | action = np.random.choice(len(action_probs), p = action_probs) 57 | 58 | next_state, reward, done, _ = env.step(action) 59 | episode_reward += reward 60 | episode_steps.append(EpisodeStep(state = state, action = action)) 61 | 62 | if done: 63 | batch.append(Episode(reward = episode_reward, steps = episode_steps)) 64 | episode_reward = 0.0 65 | episode_steps = [] 66 | next_state = env.reset() 67 | if len(batch) == batch_size: 68 | yield batch 69 | batch = [] 70 | state = next_state 71 | 72 | if __name__ == "__main__": 73 | env = gym.make("CartPole-v0") 74 | env = gym.wrappers.Monitor(env, directory = "mon", force = True) 75 | output_shape = env.action_space.n 76 | input_shape = env.observation_space.shape[0] 77 | 78 | network = Network(input_shape = input_shape, output_shape = output_shape) 79 | objective = nn.CrossEntropyLoss() 80 | optimizer = optim.Adam(params = network.parameters(), lr = 0.01) 81 | writer = SummaryWriter() 82 | 83 | for iter_no, batch in enumerate(iterative_batches(env, network)): 84 | state_vector, action_vector, reward_bound, reward_mean = filter_batch(batch) 85 | optimizer.zero_grad() 86 | action_values_vector = network(state_vector) 87 | loss_vector = objective(action_values_vector, action_vector) 88 | loss_vector.backward() 89 | optimizer.step() 90 | print("{}: loss = {}, reward_mean = {}, reward_boundary = {}".format(iter_no, loss_vector.item(), reward_mean, reward_bound)) 91 | writer.add_scalar("loss", loss_vector.item(), iter_no) 92 | writer.add_scalar("reward mean", reward_mean, iter_no) 93 | writer.add_scalar("reward boundary", reward_bound, iter_no) 94 | if reward_mean > 199: 95 | print("Solved CartPole Problem!") 96 | break 97 | writer.close() 
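
The core of the cross-entropy method implemented above is the elite-selection step in filter_batch: keep only the episodes whose total reward clears a chosen percentile, then train the policy (via the cross-entropy loss) to imitate the actions taken in those episodes. Below is a minimal, self-contained sketch of just that filtering step on a hand-made toy batch; the Episode/EpisodeStep tuples and the 70th-percentile cutoff mirror the script above, while the reward values and states are invented purely for illustration.

import numpy as np
from collections import namedtuple

Episode = namedtuple("Episode", field_names=["reward", "steps"])
EpisodeStep = namedtuple("EpisodeStep", field_names=["state", "action"])

# Toy batch of four episodes with invented total rewards and states.
batch = [
    Episode(reward=10.0,  steps=[EpisodeStep(state=[0.1, 0.0], action=0)]),
    Episode(reward=50.0,  steps=[EpisodeStep(state=[0.2, 0.1], action=1)]),
    Episode(reward=80.0,  steps=[EpisodeStep(state=[0.3, 0.2], action=1)]),
    Episode(reward=200.0, steps=[EpisodeStep(state=[0.4, 0.3], action=0)]),
]

rewards = [ep.reward for ep in batch]
reward_bound = np.percentile(rewards, 70)   # episodes below this bound are dropped

elite_states, elite_actions = [], []
for ep in batch:
    if ep.reward < reward_bound:
        continue                            # discard non-elite episodes
    elite_states.extend(step.state for step in ep.steps)
    elite_actions.extend(step.action for step in ep.steps)

# The surviving (state, action) pairs form the supervised training set that
# the policy network is then fit to with a cross-entropy loss, as in the
# training loop of the script above.
print(reward_bound, elite_states, elite_actions)

Repeating this filter-and-imitate cycle on fresh batches is what gradually shifts the policy toward higher-reward behaviour; the percentile is the method's main hyperparameter besides the batch size.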
-------------------------------------------------------------------------------- /Cross_entropy/README.md: -------------------------------------------------------------------------------- 1 | # Deep Reinforcement Leanring with Cross entropy 2 | Cross entropy method implemented on the cart pole problem. 3 | based on the example in the book [Deep Reinforcement Learning Hands-on](https://www.amazon.de/Deep-Reinforcement-Learning-Hands-Q-networks-ebook/dp/B076H9VQH6) by Maxim Lapan 4 | 5 | 6 | ![alt text](img/Cross_entropy.png) 7 | -------------------------------------------------------------------------------- /Cross_entropy/img/Cross_entropy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Cross_entropy/img/Cross_entropy.png -------------------------------------------------------------------------------- /Deep Q_Learning/DQN_Experience_Replay.py: -------------------------------------------------------------------------------- 1 | import keras 2 | from keras.models import Sequential 3 | from keras.layers import Dense, Dropout 4 | from keras.optimizers import Adam 5 | import numpy as np 6 | from collections import deque 7 | from keras.models import load_model 8 | import random 9 | import matplotlib.pyplot as plt 10 | import gym 11 | from gym import wrappers 12 | 13 | 14 | 15 | class AI(): 16 | def __init__(self, state_size, action_size, memory_size, learning_rate, gamma): 17 | self.state_size = state_size 18 | self.action_size = action_size 19 | self.memory = deque(maxlen = memory_size) 20 | 21 | 22 | # HYPERPARAMETER 23 | self.learning_rate = learning_rate 24 | self.gamma = gamma 25 | self.epsilon = 0.5 26 | self.epsilon_start = self.epsilon 27 | 28 | self.brain = self.build_brain() 29 | 30 | 31 | def build_brain(self): 32 | model = Sequential() 33 | model.add(Dense(self.state_size, activation='relu')) 34 | model.add(Dense(25, activation='relu')) 35 | #model.add(Dropout(0.3)) 36 | model.add(Dense(25, activation='relu')) 37 | #model.add(Dropout(0.3)) 38 | # model.add(Dense(12, activation='relu')) 39 | model.add(Dense(self.action_size, activation='linear')) 40 | model.compile(loss = "mse", optimizer = Adam(lr=self.learning_rate)) 41 | return model 42 | 43 | 44 | def load_model(self, name): 45 | """ 46 | Loads an existing Model 47 | Input: string of the model name - h5 data 48 | """ 49 | brain = load_model(name) 50 | return None 51 | 52 | def save_learnings(self, model_name):# 53 | """ 54 | Input string of Modelname 55 | """ 56 | self.brain.save(model_name+".h5") 57 | 58 | def adapt_epsilon(self,ep): 59 | # Epsilon starts at 0.5 linear increasing to 0.99 by ep 4000: 60 | # linear: epsilon = 0.0001225*ep+self.epsilon_start 61 | # exponent (4000 eps): epsilon = self.epsilon_start + (ep/5714)**2 62 | if ep == 0: 63 | pass 64 | self.epsilon = self.epsilon_start + (ep/5714)**2 65 | 66 | def act(self, state, status = "train"): 67 | if status == "train": 68 | if np.random.rand() > self.epsilon: 69 | return random.randrange(self.action_size) 70 | return np.argmax(self.brain.predict(state)[0]) 71 | 72 | def remember(self, state, action, reward, next_state, done): 73 | self.memory.append((state, action, reward, next_state, done)) 74 | 75 | def replay(self): 76 | batch_size = 32 77 | if len(self.memory) < batch_size: 78 | return 79 | 80 | samples = random.sample(self.memory, batch_size) 81 | for state, action, reward, next_state, done in 
samples: 82 | target = reward 83 | 84 | if not done: 85 | target = reward + self.gamma * np.amax(self.brain.predict(next_state)[0]) # Predict the future/target value 86 | #print(target) 87 | Q_target_shape = self.brain.predict(state) # normal Q- Value prediction for the training-shape 88 | Q_target_shape[0][action] = target # replacing the best Q-Value with the target 89 | self.brain.fit(state, Q_target_shape, epochs=1, verbose=0) # training with the new Target value (loss = sum(Q_target-Q)exp2) 90 | 91 | 92 | 93 | 94 | def play(Ep, agent, status = "train"): 95 | 96 | learning_graph = [] 97 | env = gym.make("CartPole-v1") 98 | env = wrappers.Monitor(env, "Saved_DQN_ER_Models/", resume=True, video_callable=lambda episode_id: episode_id%250==0) 99 | action_space = env.action_space.n 100 | state_space = env.observation_space.shape[0] 101 | if agent == None: 102 | agent = AI(state_space,action_space,memory_size = 5000,learning_rate = 0.001,gamma = 0.95) #2500 mem 103 | for ep in range(Ep): 104 | state = env.reset() 105 | state = np.reshape(state,[1,state_space]) 106 | done = False 107 | score = 0 108 | agent.adapt_epsilon(ep) # Increasing the epsilon linear - adjustable to non linear, log,... 109 | while not done: 110 | 111 | if status == "play": 112 | env.render() 113 | action = agent.act(state, status) 114 | new_state, reward, done, _ = env.step(action) 115 | new_state = np.reshape(new_state,[1,state_space]) 116 | agent.remember(state, action, reward, new_state, done) 117 | state = new_state 118 | score +=1 119 | if done: 120 | break 121 | print("Episode {}# Score: {}".format(ep, score + 1)) 122 | if ep == 250 or ep % 500 == 0: 123 | # save model eacht 500 ep for videos 124 | agent.save_learnings(str(ep)+","+str(score)) 125 | agent.replay() 126 | learning_graph.append(score) 127 | return learning_graph, agent 128 | 129 | def main(): 130 | Episodes = 4001 #4001 131 | graph,agent = play(Episodes,None) 132 | plt.plot(graph) 133 | plt.xlabel("Episoden") 134 | plt.ylabel("Score") 135 | plt.show() 136 | 137 | print("Do you want to save the model?") 138 | answer = input("Y/N\n") 139 | if answer == "Y": 140 | name = input("give a name for the model: \n") 141 | agent.save_learnings(name) 142 | else: 143 | pass 144 | 145 | 146 | print("Soll der Agent getestet werden?\n") 147 | n = input("Wie viele Episoden sollen gespielt werden?") 148 | x,y = play(int(n),agent,status = "play") 149 | 150 | if __name__ == "__main__": 151 | main() 152 | -------------------------------------------------------------------------------- /Deep Q_Learning/Img/4k Learning_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Deep Q_Learning/Img/4k Learning_curve.png -------------------------------------------------------------------------------- /Deep Q_Learning/Img/Converging.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Deep Q_Learning/Img/Converging.png -------------------------------------------------------------------------------- /Deep Q_Learning/README.md: -------------------------------------------------------------------------------- 1 | # Deep Q_Learning with Experience Replay playing Cart Pole 2 | 3 | [image1]: ./Img/Converging.png "Calculation Equation" 4 | [image2]: 
./Img/Q_table10000.png "Calculation Equation" 5 | 6 | 7 | ### Exponential Epsilon: 8 | 9 | 10 | 11 | Learning Curve after 4000 Epochs and and exponentially epsilon Greedy 12 | 13 | ![alt text][image1] 14 | 15 | 16 | 17 | 18 | 19 | ### Youtube Video: 20 | [Deep Q-Network plays Cart Pole](https://www.youtube.com/watch?v=9g2ZLPs5Rs0) 21 | 22 | 23 | -------------------------------------------------------------------------------- /Double DQN/CNN_Double_DQN.py: -------------------------------------------------------------------------------- 1 | import math, random 2 | from collections import deque 3 | import cv2 4 | 5 | import gym 6 | from gym import wrappers 7 | import wrapper 8 | import numpy as np 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.optim as optim 13 | import torch.autograd as autograd 14 | import torch.nn.functional as F 15 | from IPython.display import clear_output 16 | 17 | import matplotlib.pyplot as plt 18 | 19 | USE_CUDA = torch.cuda.is_available() 20 | Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs) 21 | 22 | class ReplayBuffer(object): 23 | def __init__(self, capacity): 24 | self.buffer = deque(maxlen=capacity) 25 | 26 | def push(self, state, action, reward, next_state, done): 27 | state = np.expand_dims(state, 0) 28 | next_state = np.expand_dims(next_state, 0) 29 | 30 | self.buffer.append((state, action, reward, next_state, done)) 31 | 32 | def sample(self, batch_size): 33 | state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size)) 34 | return np.concatenate(state), action, reward, np.concatenate(next_state), done 35 | 36 | def __len__(self): 37 | return len(self.buffer) 38 | 39 | class CnnDQN(nn.Module): 40 | def __init__(self, input_shape, num_actions): 41 | super(CnnDQN, self).__init__() 42 | 43 | self.input_shape = input_shape 44 | self.num_actions = num_actions 45 | 46 | self.features = nn.Sequential( 47 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 48 | nn.ReLU(), 49 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 50 | nn.ReLU(), 51 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 52 | nn.ReLU() 53 | ) 54 | 55 | self.fc = nn.Sequential( 56 | nn.Linear(self.feature_size(), 512), 57 | nn.ReLU(), 58 | nn.Linear(512, self.num_actions) 59 | ) 60 | 61 | def forward(self, x): 62 | x = self.features(x) 63 | x = x.view(x.size(0), -1) 64 | x = self.fc(x) 65 | return x 66 | 67 | def feature_size(self): 68 | return self.features(autograd.Variable(torch.zeros(1, *self.input_shape))).view(1, -1).size(1) 69 | 70 | def act(self, state, epsilon,action_space): 71 | if random.random() > epsilon: 72 | with torch.no_grad(): 73 | state = Variable(torch.FloatTensor(np.float32(state)).unsqueeze(0)) 74 | q_value = self.forward(state) 75 | action = q_value.max(1)[1].data[0] #.max(1) gives the maxvalues--[0] and idx--[1] 76 | else: 77 | action = random.randrange(action_space) 78 | return action 79 | 80 | def update_target(current_model, target_model): 81 | target_model.load_state_dict(current_model.state_dict()) 82 | 83 | def save_model(model, idx): 84 | torch.save(model, "Saved_models/") 85 | 86 | def epsilon_by_frame(frame_idx): 87 | epsilon_start = 1.0 88 | epsilon_final = 0.01 #0.01 89 | epsilon_decay = 30000 #30000 90 | eps = epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. 
* frame_idx / epsilon_decay) 91 | return eps 92 | 93 | def compute_td_loss(batch_size,current_model,target_model,opti,loss_func,gamma,replay_buffer): 94 | state, action, reward, next_state, done = replay_buffer.sample(batch_size) 95 | # shapes for normal image-- stacked (4,84,84) ... 96 | state = Variable(torch.FloatTensor(np.float32(state))) #shape (1,84,84) 97 | next_state = Variable(torch.FloatTensor(np.float32(next_state))) #shape (1,84,84) 98 | action = Variable(torch.LongTensor(action)) #shape [32] -- has to be long for gather function 99 | reward = Variable(torch.FloatTensor(reward)) #shape [32] 100 | done = Variable(torch.FloatTensor(done)) #shape [32] 101 | 102 | q_values = current_model(state) #shape [32,6] 103 | next_q_values = current_model(next_state) #shape [32,6] 104 | next_q_state_values = target_model(next_state) #shape [32,6] 105 | 106 | q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1) #shape [32] gathers q_values by the index of action 107 | next_q_value = next_q_state_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1) #shape [32] torch.max(nqv,1) gives the maxvalues--[0] and idx--[1] 108 | expected_q_value = reward + gamma * next_q_value * (1 - done) # shape [32] 109 | 110 | 111 | # DeepMind took nn.SmoothL1Loss() 112 | #loss = (q_value - Variable(expected_q_value.data)).pow(2).mean() #standard loss -- .data to get rid of grad_fn= 113 | loss = loss_func(q_value,Variable(expected_q_value.data)) 114 | 115 | opti.zero_grad() 116 | loss.backward() 117 | opti.step() 118 | return loss 119 | 120 | def plot(frame_idx, rewards, losses): 121 | plt.close() 122 | plt.figure(figsize=(20,5)) 123 | plt.subplot(121) 124 | plt.title("frames {}. reward: {}" .format(frame_idx, np.round(np.mean(rewards[-10:]),2))) 125 | plt.plot(rewards) 126 | plt.subplot(122) 127 | plt.title("loss") 128 | plt.plot(losses) 129 | plt.ylim(0,1) 130 | plt.draw() 131 | plt.pause(0.0001) 132 | 133 | def processing(img): 134 | img = np.expand_dims(cv2.resize(cv2.cvtColor(img, cv2.COLOR_RGB2GRAY), (84,84)),axis= 0) 135 | img = img.astype(np.uint8) 136 | #print(img.dtype) 137 | return img 138 | 139 | def main(): 140 | plt.ion() 141 | env = wrapper.make_atari("PongNoFrameskip-v4", monitor=True,epidsode_capture=75) 142 | env = wrapper.wrap_deepmind(env,frame_stack=True, pytorch_img = True) 143 | action_space = env.action_space.n 144 | current_model = CnnDQN(env.observation_space.shape, action_space)#env.observation_space.shape 145 | target_model = CnnDQN(env.observation_space.shape, action_space) 146 | 147 | if USE_CUDA: 148 | current_model = current_model.cuda() 149 | target_model = target_model.cuda() 150 | 151 | #DeepMind took optim.RMSprop(current_model.parameters(), lr=0.000) 152 | #opti = optim.Adam(current_model.parameters(), lr=0.0001) 153 | opti = optim.RMSprop(current_model.parameters(), lr=0.0001) 154 | loss_func = nn.SmoothL1Loss() 155 | 156 | replay_initial = 10000 157 | replay_buffer = ReplayBuffer(100000) 158 | 159 | num_frames = 1000000 160 | batch_size = 32 161 | gamma = 0.99 162 | 163 | losses = [] 164 | all_rewards = [] 165 | episode_reward = 0 166 | 167 | state = env.reset() # shape normal:(1,84,84) -stacked (4,84,84) 168 | # Manuel Stacking 169 | #state = processing(state) 170 | #state = np.stack((state,state,state,state),axis = 1).squeeze(0) 171 | #assert state.shape == (4,84,84) 172 | for frame_idx in range(1, num_frames + 1): 173 | 174 | epsilon = epsilon_by_frame(frame_idx) 175 | print("Training :: Frame {} :: Epsilon {} ".format(frame_idx, 
round(epsilon,2))) 176 | action = current_model.act(state, epsilon,action_space) 177 | next_state, reward, done, _ = env.step(action) 178 | # Manuel Stacking 179 | #next_state = processing(next_state) 180 | #next_state = np.append(next_state, state[1:, :, :],axis= 0) 181 | #assert next_state.shape == (4,84,84) 182 | replay_buffer.push(state, action, reward, next_state, done) 183 | 184 | state = next_state 185 | episode_reward += reward 186 | 187 | if done: 188 | state = env.reset() 189 | # Manuel Stacking 190 | #state = processing(state) 191 | #state = np.stack((state,state,state,state),axis = 1).squeeze(0) 192 | all_rewards.append(episode_reward) 193 | episode_reward = 0 194 | 195 | if len(replay_buffer) > replay_initial: 196 | loss = compute_td_loss(batch_size,current_model, target_model,opti,loss_func,gamma,replay_buffer) 197 | losses.append(loss.item()) 198 | 199 | if frame_idx % 10000 == 0: 200 | plot(frame_idx, all_rewards, losses) 201 | 202 | if frame_idx % 1000 == 0: 203 | update_target(current_model, target_model) 204 | 205 | #if frame_idx % 100000 ==0: 206 | # save_model(current_model, frame_idx) 207 | 208 | if __name__ == "__main__": 209 | main() -------------------------------------------------------------------------------- /Double DQN/Double_DQN.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import wrappers 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | from torch.autograd import Variable 8 | 9 | import numpy as np 10 | from collections import deque 11 | import random 12 | import matplotlib.pyplot as plt 13 | import matplotlib.animation as animation 14 | from matplotlib import style 15 | import time 16 | 17 | class Network(nn.Module): 18 | def __init__(self, input_dim, output_dim): 19 | super(Network,self).__init__() 20 | self.linear1 = nn.Linear(input_dim, 40) 21 | self.linear2 = nn.Linear(40, 40) 22 | self.linear3 = nn.Linear(40, output_dim) 23 | 24 | def forward(self,x): 25 | x = self.linear1(x) 26 | x = F.relu(x) 27 | x = self.linear2(x) 28 | x = F.relu(x) 29 | out = self.linear3(x) 30 | return out 31 | 32 | class Agent: 33 | def __init__(self, state_size, action_size): 34 | 35 | self.state_size = state_size 36 | self.action_size = action_size 37 | self.memory = deque(maxlen=5000) 38 | self.gamma = 0.95 # discount rate 39 | self.epsilon = 0.4 # exploration rate 40 | self.epsilon_start = self.epsilon 41 | self.learning_rate = 0.001 42 | self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") #activate device 43 | 44 | # Our DQN and the Target Network 45 | self.model = Network(state_size, action_size).to(self.device) 46 | self.target_model = Network(state_size, action_size).to(self.device) 47 | 48 | self.criteria = nn.MSELoss() 49 | self.opt = optim.Adam(self.model.parameters(), lr=self.learning_rate) 50 | 51 | def remember(self, state, action, reward, next_state, done): 52 | self.memory.append((state, action, reward, next_state, done)) 53 | 54 | def update_target(self): 55 | self.target_model.load_state_dict(self.model.state_dict()) 56 | 57 | def adapt_epsilon(self,ep): 58 | # Epsilon starts at 0.5 linear increasing to 0.99 by ep 4000: 59 | # linear: epsilon = 0.0001225*ep+self.epsilon_start 60 | # exponent (4000 eps): epsilon = self.epsilon_start + (ep/5714)**2 61 | if ep == 0: 62 | pass 63 | if self.epsilon < 0.98: 64 | self.epsilon = self.epsilon_start + (ep/3800)**2 #4500 65 | 66 | def act(self, state, status = 
"Train"): 67 | if status == "Play": 68 | self.epsilon = 0.95 69 | if np.random.rand() > self.epsilon: 70 | return random.randrange(self.action_size) 71 | 72 | act_values = self.model(Variable(torch.Tensor(state)).to(self.device)).cpu().data.numpy() 73 | return np.argmax(act_values[0]) 74 | 75 | def give_epsilon(self): 76 | return self.epsilon 77 | 78 | def replay(self, batch_size): 79 | if len(self.memory) < batch_size: 80 | return 81 | minibatch = random.sample(self.memory, batch_size) 82 | 83 | for state, action, reward, next_state, done in minibatch: 84 | target = reward 85 | self.model.train() 86 | if not done: 87 | next_state_v = Variable(torch.Tensor(next_state)) 88 | target = self.target_model(next_state_v.to(self.device)).cpu() # target has to be on cpu for numpy 89 | target = target.data.numpy()[0] 90 | target_actual = self.target_model(Variable(torch.Tensor(state)).to(self.device)).cpu().data.numpy() 91 | target_actual[0][action] = reward + self.gamma *np.amax(target) 92 | 93 | self.opt.zero_grad() 94 | out = self.model(Variable(torch.Tensor(state)).to(self.device)) 95 | loss = self.criteria(out, Variable(torch.Tensor(target_actual)).to(self.device)) 96 | loss.backward() 97 | self.opt.step() 98 | 99 | 100 | 101 | 102 | def play(Ep,agent, status = "train"): 103 | # for active plotting: 104 | learning_graph = [] 105 | epsilons = [] 106 | learning_graph_live = deque(maxlen = 180) 107 | epochs_live = deque(maxlen = 180) 108 | epsilons_live = deque(maxlen = 180) 109 | 110 | batch_size = 64 111 | env = gym.make("CartPole-v1") 112 | env = wrappers.Monitor(env, "Saved_Videos/", resume=True, video_callable=lambda episode_id: episode_id%250==0) 113 | action_space = env.action_space.n 114 | state_space = env.observation_space.shape[0] 115 | if agent == None: 116 | agent = Agent(state_space,action_space) 117 | for ep in range(Ep): 118 | state = env.reset() 119 | state = np.reshape(state,[1,state_space]) 120 | done = False 121 | score = 0 122 | agent.adapt_epsilon(ep) # Increasing the epsilon linear - adjustable to non linear, log,... 
123 | while not done: 124 | 125 | if status == "play": 126 | env.render() 127 | action = agent.act(state, status) 128 | new_state, reward, done, _ = env.step(action) 129 | new_state = np.reshape(new_state,[1,state_space]) 130 | agent.remember(state, action, reward, new_state, done) 131 | state = new_state 132 | score +=1 133 | 134 | if done: 135 | break 136 | 137 | 138 | 139 | print("Episode {}# Score: {}# Epsilon {}".format(ep, score + 1,agent.give_epsilon())) 140 | # Update Target Network 141 | if ep % 200 == 0: 142 | agent.update_target() 143 | print("Updated Target Network!") 144 | agent.replay(batch_size) 145 | # Live plot 146 | learning_graph.append(score) 147 | epsilons.append(agent.give_epsilon()*100) 148 | learning_graph_live.append(score) 149 | epochs_live.append(ep) 150 | epsilons_live.append(agent.give_epsilon()*100) 151 | 152 | plt.plot(epochs_live, learning_graph_live,"b") 153 | plt.plot(epochs_live, epsilons_live,"r") 154 | plt.xlabel("Epoch") 155 | plt.ylabel("Score / Epsilon") 156 | plt.title("Score Live Plot") 157 | plt.show() 158 | plt.pause(0.00000001) 159 | plt.clf() 160 | 161 | return learning_graph, epsilons, agent 162 | 163 | def main(): 164 | Episodes = 4000 #4001 165 | graph,epsilons,agent = play(Episodes,None, "train") 166 | plt.plot(graph, "b") 167 | plt.plot(epsilons, "r") 168 | plt.xlabel("Episoden") 169 | plt.ylabel("Score / Epsilon") 170 | plt.show() 171 | 172 | print("Do you want to save the model?") 173 | answer = input("Y/N\n") 174 | if answer == "Y": 175 | name = input("give a name for the model: \n") 176 | agent.save_learnings(name) 177 | else: 178 | pass 179 | 180 | 181 | print("Soll der Agent getestet werden?\n") 182 | n = input("Wie viele Episoden sollen gespielt werden?") 183 | x,y, ag = play(int(n),agent,status = "play") 184 | 185 | if __name__ == "__main__": 186 | fig = plt.figure() 187 | plt.ion() 188 | main() 189 | -------------------------------------------------------------------------------- /Double DQN/Imgs/4000_40-40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Double DQN/Imgs/4000_40-40.png -------------------------------------------------------------------------------- /Double DQN/Imgs/CNN_pong_converge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Double DQN/Imgs/CNN_pong_converge.png -------------------------------------------------------------------------------- /Double DQN/Imgs/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Double DQN/Imgs/test.png -------------------------------------------------------------------------------- /Double DQN/README.md: -------------------------------------------------------------------------------- 1 | # Double Deep Q_Learning with Experience Replay playing Cart Pole 2 | 3 | [image1]: ./Imgs/test.png "Calculation Equation" 4 | [image2]: ./Imgs/Q_table10000.png "Calculation Equation" 5 | [image3]: ./Imgs/CNN_pong_converge.png 6 | 7 | The difference between DQN and Double DQN is, that in Double DQN the target values get generated by a seperate neural network and not by the same that predicts the the 
Q_value as in DQN. 8 | [Paper](https://arxiv.org/abs/1509.06461) 9 | 10 | ### Learning Curve: 11 | 12 | Learning curve after 4000 episodes with an exponential epsilon-greedy schedule 13 | 14 | ![alt text][image1] 15 | 16 | 17 | 18 | 19 | 20 | ### Youtube Video: 21 | [Deep Q-Network plays Cart Pole](https://www.youtube.com/watch?v=9g2ZLPs5Rs0) 22 | 23 | ## Training to play Pong with a Double Deep Q CNN 24 | I trained a Double Deep Q-Network to play the Atari game Pong. After around 150000 frames it converged and consistently beat its opponent. The convolutional neural network learned purely from visual input: the frames were converted to grayscale and 4 consecutive frames were stacked together so the network can infer the velocity of the ball, which would be much harder to estimate from a single frame. The network was trained off-policy from a replay buffer (experience replay), and every 1000 frames the target network was updated with the weights of the optimized model. 25 | 26 | ![alt text][image3] 27 | 28 | ### Youtube Video: 29 | [Double Deep Q Network learns to play Pong](https://www.youtube.com/watch?v=I3dTyg_5rFc) 30 | -------------------------------------------------------------------------------- /Double DQN/wrapper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import deque 3 | import gym 4 | 5 | from gym import spaces,wrappers 6 | import cv2 7 | cv2.ocl.setUseOpenCL(False) 8 | 9 | class NoopResetEnv(gym.Wrapper): 10 | def __init__(self, env, noop_max=30): 11 | """Sample initial states by taking random number of no-ops on reset. 12 | No-op is assumed to be action 0. 13 | """ 14 | gym.Wrapper.__init__(self, env) 15 | self.noop_max = noop_max 16 | self.override_num_noops = None 17 | self.noop_action = 0 18 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 19 | 20 | def reset(self, **kwargs): 21 | """ Do no-op action for a number of steps in [1, noop_max].""" 22 | self.env.reset(**kwargs) 23 | if self.override_num_noops is not None: 24 | noops = self.override_num_noops 25 | else: 26 | noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 27 | assert noops > 0 28 | obs = None 29 | for _ in range(noops): 30 | obs, _, done, _ = self.env.step(self.noop_action) 31 | if done: 32 | obs = self.env.reset(**kwargs) 33 | return obs 34 | 35 | def step(self, ac): 36 | return self.env.step(ac) 37 | 38 | class FireResetEnv(gym.Wrapper): 39 | def __init__(self, env): 40 | """Take action on reset for environments that are fixed until firing.""" 41 | gym.Wrapper.__init__(self, env) 42 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 43 | assert len(env.unwrapped.get_action_meanings()) >= 3 44 | 45 | def reset(self, **kwargs): 46 | self.env.reset(**kwargs) 47 | obs, _, done, _ = self.env.step(1) 48 | if done: 49 | self.env.reset(**kwargs) 50 | obs, _, done, _ = self.env.step(2) 51 | if done: 52 | self.env.reset(**kwargs) 53 | return obs 54 | 55 | def step(self, ac): 56 | return self.env.step(ac) 57 | 58 | class EpisodicLifeEnv(gym.Wrapper): 59 | def __init__(self, env): 60 | """Make end-of-life == end-of-episode, but only reset on true game over. 61 | Done by DeepMind for the DQN and co. since it helps value estimation. 
62 | """ 63 | gym.Wrapper.__init__(self, env) 64 | self.lives = 0 65 | self.was_real_done = True 66 | 67 | def step(self, action): 68 | obs, reward, done, info = self.env.step(action) 69 | self.was_real_done = done 70 | # check current lives, make loss of life terminal, 71 | # then update lives to handle bonus lives 72 | lives = self.env.unwrapped.ale.lives() 73 | if lives < self.lives and lives > 0: 74 | # for Qbert sometimes we stay in lives == 0 condtion for a few frames 75 | # so its important to keep lives > 0, so that we only reset once 76 | # the environment advertises done. 77 | done = True 78 | self.lives = lives 79 | return obs, reward, done, info 80 | 81 | def reset(self, **kwargs): 82 | """Reset only when lives are exhausted. 83 | This way all states are still reachable even though lives are episodic, 84 | and the learner need not know about any of this behind-the-scenes. 85 | """ 86 | if self.was_real_done: 87 | obs = self.env.reset(**kwargs) 88 | else: 89 | # no-op step to advance from terminal/lost life state 90 | obs, _, _, _ = self.env.step(0) 91 | self.lives = self.env.unwrapped.ale.lives() 92 | return obs 93 | 94 | class MaxAndSkipEnv(gym.Wrapper): 95 | def __init__(self, env, skip=4): 96 | """Return only every `skip`-th frame""" 97 | gym.Wrapper.__init__(self, env) 98 | # most recent raw observations (for max pooling across time steps) 99 | self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) 100 | self._skip = skip 101 | 102 | def reset(self): 103 | return self.env.reset() 104 | 105 | def step(self, action): 106 | """Repeat action, sum reward, and max over last observations.""" 107 | total_reward = 0.0 108 | done = None 109 | for i in range(self._skip): 110 | obs, reward, done, info = self.env.step(action) 111 | if i == self._skip - 2: self._obs_buffer[0] = obs 112 | if i == self._skip - 1: self._obs_buffer[1] = obs 113 | total_reward += reward 114 | if done: 115 | break 116 | # Note that the observation on the done=True frame 117 | # doesn't matter 118 | max_frame = self._obs_buffer.max(axis=0) 119 | 120 | return max_frame, total_reward, done, info 121 | 122 | def reset(self, **kwargs): 123 | return self.env.reset(**kwargs) 124 | 125 | class ClipRewardEnv(gym.RewardWrapper): 126 | def __init__(self, env): 127 | gym.RewardWrapper.__init__(self, env) 128 | 129 | def reward(self, reward): 130 | """Bin reward to {+1, 0, -1} by its sign.""" 131 | return np.sign(reward) 132 | 133 | class WarpFrame(gym.ObservationWrapper): 134 | def __init__(self, env): 135 | """Warp frames to 84x84 as done in the Nature paper and later work.""" 136 | gym.ObservationWrapper.__init__(self, env) 137 | self.width = 84 138 | self.height = 84 139 | self.observation_space = spaces.Box(low=0, high=255, 140 | shape=(self.height, self.width, 1), dtype=np.uint8) 141 | 142 | def observation(self, frame): 143 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) 144 | frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA) 145 | return frame[:, :, None] 146 | 147 | class FrameStack(gym.Wrapper): 148 | def __init__(self, env, k): 149 | """Stack k last frames. 150 | 151 | Returns lazy array, which is much more memory efficient. 
152 | 153 | See Also 154 | -------- 155 | baselines.common.atari_wrappers.LazyFrames 156 | """ 157 | gym.Wrapper.__init__(self, env) 158 | self.k = k 159 | self.frames = deque([], maxlen=k) 160 | shp = env.observation_space.shape 161 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8) 162 | 163 | def reset(self): 164 | ob = self.env.reset() 165 | for _ in range(self.k): 166 | self.frames.append(ob) 167 | return self._get_ob() 168 | 169 | def step(self, action): 170 | ob, reward, done, info = self.env.step(action) 171 | self.frames.append(ob) 172 | return self._get_ob(), reward, done, info 173 | 174 | def _get_ob(self): 175 | assert len(self.frames) == self.k 176 | return LazyFrames(list(self.frames)) 177 | 178 | class ScaledFloatFrame(gym.ObservationWrapper): 179 | def __init__(self, env): 180 | gym.ObservationWrapper.__init__(self, env) 181 | 182 | def observation(self, observation): 183 | # careful! This undoes the memory optimization, use 184 | # with smaller replay buffers only. 185 | return np.array(observation).astype(np.float32) / 255.0 186 | 187 | class LazyFrames(object): 188 | def __init__(self, frames): 189 | """This object ensures that common frames between the observations are only stored once. 190 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay 191 | buffers. 192 | 193 | This object should only be converted to numpy array before being passed to the model. 194 | 195 | You'd not believe how complex the previous solution was.""" 196 | self._frames = frames 197 | self._out = None 198 | 199 | def _force(self): 200 | if self._out is None: 201 | self._out = np.concatenate(self._frames, axis=2) 202 | self._frames = None 203 | return self._out 204 | 205 | def __array__(self, dtype=None): 206 | out = self._force() 207 | if dtype is not None: 208 | out = out.astype(dtype) 209 | return out 210 | 211 | def __len__(self): 212 | return len(self._force()) 213 | 214 | def __getitem__(self, i): 215 | return self._force()[i] 216 | 217 | # EDIT BY ATAMAI 218 | # Preparing image received from environment and adjust it to expected format of Pytorch 219 | # HWC (height x width x channel) becomes CHW 220 | class PytorchImage(gym.ObservationWrapper): 221 | def __init__(self, env): 222 | super(PytorchImage, self).__init__(env) 223 | # we check current shape of observations in environment 224 | current_shape = self.observation_space.shape 225 | # we change order of dimensions - so last one (-1) becomes first 226 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(current_shape[-1], current_shape[0], current_shape[1])) 227 | 228 | def observation(self, observation): 229 | # and finally we change order of dimensions for every single observation 230 | # here transpose method could be also used 231 | return np.swapaxes(observation, 2, 0) 232 | 233 | def make_atari(env_id, monitor = False, epidsode_capture = 75): 234 | env = gym.make(env_id) 235 | if monitor == True: 236 | env = wrappers.Monitor(env, "Videos/", resume=True, force =True, video_callable=lambda episode_id: episode_id%epidsode_capture==0) 237 | assert 'NoFrameskip' in env.spec.id 238 | env = NoopResetEnv(env, noop_max=30) 239 | env = MaxAndSkipEnv(env, skip=4) 240 | return env 241 | 242 | def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False, pytorch_img=False): 243 | """Configure environment for DeepMind-style Atari. 
244 | """ 245 | if episode_life: 246 | env = EpisodicLifeEnv(env) 247 | if 'FIRE' in env.unwrapped.get_action_meanings(): 248 | env = FireResetEnv(env) 249 | env = WarpFrame(env) 250 | if scale: 251 | env = ScaledFloatFrame(env) 252 | if clip_rewards: 253 | env = ClipRewardEnv(env) 254 | if frame_stack: 255 | env = FrameStack(env, 4) 256 | if pytorch_img: 257 | env = PytorchImage(env) 258 | return env 259 | 260 | -------------------------------------------------------------------------------- /Dueling Deep Q-Network/CNN_Dueling_DDQN.py: -------------------------------------------------------------------------------- 1 | import math, random 2 | from collections import deque 3 | import cv2 4 | 5 | import gym 6 | from gym import wrappers 7 | import wrapper 8 | import numpy as np 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.optim as optim 13 | import torch.autograd as autograd 14 | import torch.nn.functional as F 15 | from IPython.display import clear_output 16 | 17 | import matplotlib.pyplot as plt 18 | 19 | USE_CUDA = torch.cuda.is_available() 20 | Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs) 21 | 22 | class ReplayBuffer(object): 23 | def __init__(self, capacity): 24 | self.buffer = deque(maxlen=capacity) 25 | 26 | def push(self, state, action, reward, next_state, done): 27 | state = np.expand_dims(state, 0) 28 | next_state = np.expand_dims(next_state, 0) 29 | 30 | self.buffer.append((state, action, reward, next_state, done)) 31 | 32 | def sample(self, batch_size): 33 | state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size)) 34 | return np.concatenate(state), action, reward, np.concatenate(next_state), done 35 | 36 | def __len__(self): 37 | return len(self.buffer) 38 | 39 | class CnnDQN(nn.Module): 40 | def __init__(self, input_shape, num_actions): 41 | super(CnnDQN, self).__init__() 42 | 43 | self.input_shape = input_shape 44 | self.num_actions = num_actions 45 | 46 | self.convolutional_layers = nn.Sequential( 47 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 48 | nn.ReLU(), 49 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 50 | nn.ReLU(), 51 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 52 | nn.ReLU() 53 | ) 54 | 55 | self.value_layer = nn.Sequential( 56 | nn.Linear(self.feature_size(), 512), 57 | nn.ReLU(), 58 | nn.Linear(512, 1) 59 | ) 60 | self.advantage_layer = nn.Sequential( 61 | nn.Linear(self.feature_size(), 512), 62 | nn.ReLU(), 63 | nn.Linear(512, self.num_actions) 64 | ) 65 | 66 | def forward(self, x): 67 | x = self.convolutional_layers(x) 68 | x = x.view(x.size(0), -1) 69 | value = self.value_layer(x) # shape [1,1] 70 | value = value.expand(x.size(0), self.num_actions) # shape [1,6] 71 | advantage = self.advantage_layer(x) #shape [1,6] 72 | advantage_mean = advantage.mean(1)#shape [1] 73 | advantage_mean = advantage_mean.unsqueeze(1) #shape[1,1] 74 | advantage_mean = advantage_mean.expand(x.size(0), self.num_actions) #shape [1,6] 75 | Q = value + advantage - advantage_mean 76 | #print("Q-Values: ",Q) 77 | return Q 78 | 79 | def feature_size(self): 80 | #Calculate the output size of the CNN 81 | return self.convolutional_layers(autograd.Variable(torch.zeros(1, *self.input_shape))).view(1, -1).size(1) 82 | 83 | def act(self, state, epsilon,action_space): 84 | if random.random() > epsilon: 85 | with torch.no_grad(): 86 | state = Variable(torch.FloatTensor(np.float32(state)).unsqueeze(0)) 87 | q_value = self.forward(state) 88 | action = 
q_value.max(1)[1].data[0] #.max(1) maxdata: values--[0] and idx--[1] 89 | else: 90 | action = random.randrange(action_space) 91 | return action 92 | 93 | def update_target(current_model, target_model): 94 | target_model.load_state_dict(current_model.state_dict()) 95 | 96 | def save_model(model, idx): 97 | torch.save(model, "Saved_models/") 98 | 99 | def epsilon_by_frame(frame_idx): 100 | epsilon_start = 1.0 101 | epsilon_final = 0.01 #0.01 102 | epsilon_decay = 30000 #30000 103 | eps = epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay) 104 | return eps 105 | 106 | def compute_td_loss(batch_size,current_model,target_model,opti,loss_func,gamma,replay_buffer): 107 | state, action, reward, next_state, done = replay_buffer.sample(batch_size) 108 | # shapes for normal image-- stacked (4,84,84) ... 109 | state = Variable(torch.FloatTensor(np.float32(state))) #shape (1,84,84) 110 | next_state = Variable(torch.FloatTensor(np.float32(next_state))) #shape (1,84,84) 111 | action = Variable(torch.LongTensor(action)) #shape [32] -- has to be long for gather function 112 | reward = Variable(torch.FloatTensor(reward)) #shape [32] 113 | done = Variable(torch.FloatTensor(done)) #shape [32] 114 | 115 | q_values = current_model(state) #shape [32,6] 116 | next_q_values = current_model(next_state) #shape [32,6] 117 | next_q_state_values = target_model(next_state) #shape [32,6] 118 | 119 | q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1) #shape [32] gathers q_values by the index of action 120 | next_q_value = next_q_state_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1) #shape [32] torch.max(nqv,1) gives the maxvalues--[0] and idx--[1] 121 | expected_q_value = reward + gamma * next_q_value * (1 - done) # shape [32] 122 | 123 | 124 | # DeepMind took nn.SmoothL1Loss() 125 | #loss = (q_value - Variable(expected_q_value.data)).pow(2).mean() #standard loss -- .data to get rid of grad_fn= 126 | loss = loss_func(q_value,Variable(expected_q_value.data)) 127 | 128 | opti.zero_grad() 129 | loss.backward() 130 | opti.step() 131 | return loss 132 | 133 | def plot(frame_idx, rewards, losses): 134 | plt.close() 135 | plt.figure(figsize=(20,5)) 136 | plt.subplot(121) 137 | plt.title("frames {}. 
reward: {}" .format(frame_idx, np.round(np.mean(rewards[-10:]),2))) 138 | plt.plot(rewards) 139 | plt.subplot(122) 140 | plt.title("loss") 141 | plt.plot(losses) 142 | plt.ylim(0,1) 143 | plt.draw() 144 | plt.pause(0.0001) 145 | 146 | def processing(img): 147 | img = np.expand_dims(cv2.resize(cv2.cvtColor(img, cv2.COLOR_RGB2GRAY), (84,84)),axis= 0) 148 | img = img.astype(np.uint8) 149 | #print(img.dtype) 150 | return img 151 | 152 | def main(): 153 | plt.ion() 154 | env = wrapper.make_atari("RiverraidNoFrameskip-v4", monitor=True,epidsode_capture=50) 155 | env = wrapper.wrap_deepmind(env,frame_stack=True, pytorch_img = True) 156 | action_space = env.action_space.n 157 | current_model = CnnDQN(env.observation_space.shape, action_space)#env.observation_space.shape 158 | target_model = CnnDQN(env.observation_space.shape, action_space) 159 | 160 | if USE_CUDA: 161 | current_model = current_model.cuda() 162 | target_model = target_model.cuda() 163 | 164 | #DeepMind took optim.RMSprop(current_model.parameters(), lr=0.000) 165 | #opti = optim.Adam(current_model.parameters(), lr=0.0001) 166 | opti = optim.RMSprop(current_model.parameters(), lr=0.0001) 167 | loss_func = nn.SmoothL1Loss() 168 | 169 | replay_initial = 10000 170 | replay_buffer = ReplayBuffer(100000) 171 | 172 | num_frames = 1000000 173 | batch_size = 32 174 | gamma = 0.99 175 | 176 | losses = [] 177 | all_rewards = [] 178 | episode_reward = 0 179 | 180 | state = env.reset() # shape normal:(1,84,84) -stacked (4,84,84) 181 | # Manuel Stacking 182 | #state = processing(state) 183 | #state = np.stack((state,state,state,state),axis = 1).squeeze(0) 184 | #assert state.shape == (4,84,84) 185 | for frame_idx in range(1, num_frames + 1): 186 | 187 | epsilon = epsilon_by_frame(frame_idx) 188 | print("Training :: Frame {} :: Epsilon {} ".format(frame_idx, round(epsilon,2))) 189 | action = current_model.act(state, epsilon,action_space) 190 | next_state, reward, done, _ = env.step(action) 191 | # Manuel Stacking 192 | #next_state = processing(next_state) 193 | #next_state = np.append(next_state, state[1:, :, :],axis= 0) 194 | #assert next_state.shape == (4,84,84) 195 | replay_buffer.push(state, action, reward, next_state, done) 196 | 197 | state = next_state 198 | episode_reward += reward 199 | 200 | if done: 201 | state = env.reset() 202 | # Manuel Stacking 203 | #state = processing(state) 204 | #state = np.stack((state,state,state,state),axis = 1).squeeze(0) 205 | all_rewards.append(episode_reward) 206 | episode_reward = 0 207 | 208 | if len(replay_buffer) > replay_initial: 209 | loss = compute_td_loss(batch_size,current_model, target_model,opti,loss_func,gamma,replay_buffer) 210 | losses.append(loss.item()) 211 | 212 | if frame_idx % 10000 == 0: 213 | plot(frame_idx, all_rewards, losses) 214 | 215 | if frame_idx % 1000 == 0: 216 | update_target(current_model, target_model) 217 | 218 | #if frame_idx % 100000 ==0: 219 | # save_model(current_model, frame_idx) 220 | 221 | if __name__ == "__main__": 222 | main() -------------------------------------------------------------------------------- /Dueling Deep Q-Network/CNN_Dueling_DDQN_PER.py: -------------------------------------------------------------------------------- 1 | import math, random 2 | from collections import deque 3 | import cv2 4 | 5 | import gym 6 | from gym import wrappers 7 | import wrapper 8 | import numpy as np 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.optim as optim 13 | import torch.autograd as autograd 14 | import torch.nn.functional as F 15 | from 
IPython.display import clear_output 16 | 17 | import matplotlib.pyplot as plt 18 | from PrioritizedExperienceReplay import PrioritizedReplay 19 | 20 | USE_CUDA = torch.cuda.is_available() 21 | Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs) 22 | 23 | class CnnDQN(nn.Module): 24 | def __init__(self, input_shape, num_actions): 25 | super(CnnDQN, self).__init__() 26 | 27 | self.input_shape = input_shape 28 | self.num_actions = num_actions 29 | 30 | self.convolutional_layers = nn.Sequential( 31 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 32 | nn.ReLU(), 33 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 34 | nn.ReLU(), 35 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 36 | nn.ReLU() 37 | ) 38 | 39 | self.value_layer = nn.Sequential( 40 | nn.Linear(self.feature_size(), 512), 41 | nn.ReLU(), 42 | nn.Linear(512, 1) 43 | ) 44 | self.advantage_layer = nn.Sequential( 45 | nn.Linear(self.feature_size(), 512), 46 | nn.ReLU(), 47 | nn.Linear(512, self.num_actions) 48 | ) 49 | 50 | def forward(self, x): 51 | x = self.convolutional_layers(x) 52 | x = x.view(x.size(0), -1) 53 | value = self.value_layer(x) # shape [1,1] 54 | value = value.expand(x.size(0), self.num_actions) # shape [1,6] 55 | advantage = self.advantage_layer(x) #shape [1,6] 56 | advantage_mean = advantage.mean(1)#shape [1] 57 | advantage_mean = advantage_mean.unsqueeze(1) #shape[1,1] 58 | advantage_mean = advantage_mean.expand(x.size(0), self.num_actions) #shape [1,6] 59 | Q = value + advantage - advantage_mean 60 | #print("Q-Values: ",Q) 61 | return Q 62 | 63 | def feature_size(self): 64 | #Calculate the output size of the CNN 65 | return self.convolutional_layers(autograd.Variable(torch.zeros(1, *self.input_shape))).view(1, -1).size(1) 66 | 67 | def act(self, state, epsilon,action_space): 68 | if random.random() > epsilon: 69 | with torch.no_grad(): 70 | state = Variable(torch.FloatTensor(np.float32(state)).unsqueeze(0)) 71 | q_value = self.forward(state) 72 | action = q_value.max(1)[1].data[0] #.max(1) maxdata: values--[0] and idx--[1] 73 | else: 74 | action = random.randrange(action_space) 75 | return action 76 | 77 | def update_target(current_model, target_model): 78 | target_model.load_state_dict(current_model.state_dict()) 79 | 80 | def save_model(model, idx): 81 | torch.save(model, "Saved_models/") 82 | 83 | def epsilon_by_frame(frame_idx): 84 | epsilon_start = 1.0 85 | epsilon_final = 0.01 #0.01 86 | epsilon_decay = 30000 #30000 87 | eps = epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay) 88 | return eps 89 | 90 | def compute_td_loss(batch_size,current_model,target_model,opti,loss_func,gamma,PER): 91 | state, action, reward, next_state, done,idx,weights = PER.sample(batch_size) 92 | # shapes for normal image-- stacked (4,84,84) ... 
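# Note: the block below combines Double DQN with prioritized replay -- the
# online network selects the next action (argmax over next_q_values), the
# target network evaluates it, and the loss is scaled by the importance-
# sampling weights returned from PER.sample(). In equation form the target is
#
#     y = r + gamma * Q_target(s', argmax_a Q_online(s', a)) * (1 - done)
#
# The priorities written back via PER.update_priorities() are loss + 1e-5; the
# small constant keeps every transition at a non-zero sampling probability.
# Caveat: nn.SmoothL1Loss() defaults to mean reduction, so "loss * weights"
# spreads the batch-mean loss over the weights; strictly per-sample priorities
# would need the loss built with reduction='none'.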
93 | state = Variable(torch.FloatTensor(np.float32(state))) #shape (1,84,84) 94 | next_state = Variable(torch.FloatTensor(np.float32(next_state))) #shape (1,84,84) 95 | action = Variable(torch.LongTensor(action)) #shape [32] -- has to be long for gather function 96 | reward = Variable(torch.FloatTensor(reward)) #shape [32] 97 | done = Variable(torch.FloatTensor(done)) #shape [32] 98 | weights = Variable(torch.FloatTensor(weights)) #shape [32] 99 | 100 | q_values = current_model(state) #shape [32,6] 101 | next_q_values = current_model(next_state) #shape [32,6] 102 | next_q_state_values = target_model(next_state) #shape [32,6] 103 | 104 | q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1) #shape [32] gathers q_values by the index of action 105 | next_q_value = next_q_state_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1) #shape [32] torch.max(nqv,1) gives the maxvalues--[0] and idx--[1] 106 | expected_q_value = reward + gamma * next_q_value * (1 - done) # shape [32] 107 | 108 | 109 | # DeepMind took nn.SmoothL1Loss() 110 | #loss = (q_value - Variable(expected_q_value.data)).pow(2).mean() #standard loss -- .data to get rid of grad_fn= 111 | loss = loss_func(q_value,Variable(expected_q_value.data))*weights 112 | prios = loss + 1e-5 113 | loss = loss.mean() 114 | 115 | 116 | opti.zero_grad() 117 | loss.backward() 118 | PER.update_priorities(idx, prios.data.cpu().numpy()) 119 | opti.step() 120 | return loss 121 | 122 | def plot(frame_idx, rewards, losses): 123 | plt.close() 124 | plt.figure(figsize=(20,5)) 125 | plt.subplot(121) 126 | plt.title("frames {}. reward: {}" .format(frame_idx, np.round(np.mean(rewards[-10:]),2))) 127 | plt.plot(rewards) 128 | plt.subplot(122) 129 | plt.title("loss") 130 | plt.plot(losses) 131 | plt.ylim(0,1) 132 | plt.draw() 133 | plt.pause(0.0001) 134 | 135 | 136 | def main(): 137 | plt.ion() 138 | env = wrapper.make_atari("BreakoutNoFrameskip-v4", monitor=True,epidsode_capture=50)#Riverraid Frostbite Enduro 139 | env = wrapper.wrap_deepmind(env,frame_stack=True, pytorch_img = True) 140 | action_space = env.action_space.n 141 | current_model = CnnDQN(env.observation_space.shape, action_space)#env.observation_space.shape 142 | target_model = CnnDQN(env.observation_space.shape, action_space) 143 | 144 | if USE_CUDA: 145 | current_model = current_model.cuda() 146 | target_model = target_model.cuda() 147 | 148 | #DeepMind took optim.RMSprop(current_model.parameters(), lr=0.000) 149 | #opti = optim.Adam(current_model.parameters(), lr=0.0001) 150 | opti = optim.RMSprop(current_model.parameters(), lr=0.0001) 151 | loss_func = nn.SmoothL1Loss() 152 | 153 | replay_initial = 10000 154 | PER = PrioritizedReplay(100000,alpha = 0.6,beta_start =0.4,beta_frames=1000000) 155 | 156 | num_frames = 1000000 157 | batch_size = 32 158 | gamma = 0.99 159 | 160 | losses = [] 161 | all_rewards = [] 162 | episode_reward = 0 163 | 164 | state = env.reset() # shape normal:(1,84,84) -stacked (4,84,84) 165 | # Manuel Stacking 166 | #state = processing(state) 167 | #state = np.stack((state,state,state,state),axis = 1).squeeze(0) 168 | #assert state.shape == (4,84,84) 169 | for frame_idx in range(1, num_frames + 1): 170 | 171 | epsilon = epsilon_by_frame(frame_idx) 172 | action = current_model.act(state, epsilon,action_space) 173 | next_state, reward, done, _ = env.step(action) 174 | print("Training :: Frame {} :: Epsilon {} :: Reward {} ".format(frame_idx, round(epsilon,2),reward)) 175 | # Manuel Stacking 176 | #next_state = processing(next_state) 177 | #next_state 
= np.append(next_state, state[1:, :, :],axis= 0) 178 | #assert next_state.shape == (4,84,84) 179 | PER.push(state, action, reward, next_state, done) 180 | 181 | state = next_state 182 | episode_reward += reward 183 | 184 | if done: 185 | state = env.reset() 186 | 187 | all_rewards.append(episode_reward) 188 | episode_reward = 0 189 | 190 | if PER.__len__() > replay_initial: 191 | loss = compute_td_loss(batch_size,current_model, target_model,opti,loss_func,gamma,PER) 192 | losses.append(loss.item()) 193 | 194 | if frame_idx % 10000 == 0: 195 | plot(frame_idx, all_rewards, losses) 196 | 197 | if frame_idx % 1000 == 0: 198 | update_target(current_model, target_model) 199 | 200 | 201 | if __name__ == "__main__": 202 | main() 203 | -------------------------------------------------------------------------------- /Dueling Deep Q-Network/Img/Duel_per.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Dueling Deep Q-Network/Img/Duel_per.png -------------------------------------------------------------------------------- /Dueling Deep Q-Network/Img/Dueling_DQN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Dueling Deep Q-Network/Img/Dueling_DQN.png -------------------------------------------------------------------------------- /Dueling Deep Q-Network/PrioritizedExperienceReplay.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class PrioritizedReplay(object): 4 | def __init__(self, capacity, alpha=0.6,beta_start = 0.4,beta_frames=100000): 5 | self.alpha = alpha 6 | self.beta_start = beta_start 7 | self.beta_frames = beta_frames 8 | self.frame = 1 #for beta calculation 9 | self.capacity = capacity 10 | self.buffer = [] 11 | self.pos = 0 12 | self.priorities = np.zeros((capacity,), dtype=np.float32) 13 | 14 | def beta_by_frame(self, frame_idx): 15 | return min(1.0, self.beta_start + frame_idx * (1.0 - self.beta_start) / self.beta_frames) 16 | 17 | def push(self, state, action, reward, next_state, done): 18 | assert state.ndim == next_state.ndim 19 | state = np.expand_dims(state, 0) 20 | next_state = np.expand_dims(next_state, 0) 21 | 22 | max_prio = self.priorities.max() if self.buffer else 1.0 # gives max priority if buffer is not empty else 1 23 | 24 | if len(self.buffer) < self.capacity: 25 | self.buffer.append((state, action, reward, next_state, done)) 26 | else: 27 | # puts the new data on the position of the oldes since it circles via pos variable 28 | # since if len(buffer) == capacity -> pos == 0 -> oldest memory (at least for the first round?) 
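# For reference, the sampling math implemented in sample() below: a transition
# with priority p_i is drawn with probability
#
#     P(i) = p_i**alpha / sum_k p_k**alpha
#
# and the resulting bias is corrected with importance-sampling weights
#
#     w_i = (N * P(i))**(-beta) / max_j w_j,   with beta annealed towards 1.0
#
# by beta_by_frame(). New transitions enter with the current max priority
# (max_prio above) so they are replayed at least once before a TD-error-based
# priority takes over via update_priorities().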
29 | self.buffer[self.pos] = (state, action, reward, next_state, done) 30 | 31 | self.priorities[self.pos] = max_prio 32 | self.pos = (self.pos + 1) % self.capacity # lets the pos circle in the ranges of capacity if pos+1 > cap --> new posi = 0 33 | 34 | def sample(self, batch_size): 35 | N = len(self.buffer) 36 | if N == self.capacity: 37 | prios = self.priorities 38 | else: 39 | prios = self.priorities[:self.pos] 40 | # calc P = p^a/sum(p^a) 41 | probs = prios ** self.alpha 42 | P = probs/probs.sum() 43 | 44 | indices = np.random.choice(N, batch_size, p=P) # gets the indices depending on the probability p 45 | samples = [self.buffer[idx] for idx in indices] 46 | 47 | beta = self.beta_by_frame(self.frame) 48 | self.frame+=1 49 | 50 | #min of ALL probs, not just sampled probs 51 | P_min = P.min() 52 | max_weight = (P_min*N)**(-beta) 53 | 54 | #Compute importance-sampling weight step:10 pseudo code 55 | weights = (N * P[indices]) ** (-beta) 56 | weights /= weights.max() # max_weights 57 | weights = np.array(weights, dtype=np.float32) #torch.tensor(weights, device=device, dtype=torch.float) 58 | 59 | #print("Sample-shape befor zipping: ", samples) 60 | states, actions, rewards, next_states, dones = zip(*samples) # example: p = [[1,2,3],[4,5,6]] ,d=zip(*p) -> d = [(1, 4), (2, 5), (3, 6)] 61 | return np.concatenate(states), actions, rewards, np.concatenate(next_states), dones, indices, weights 62 | 63 | def update_priorities(self, batch_indices, batch_priorities): 64 | for idx, prio in zip(batch_indices, batch_priorities): 65 | self.priorities[idx] = prio 66 | 67 | def __len__(self): 68 | return len(self.buffer) 69 | -------------------------------------------------------------------------------- /Dueling Deep Q-Network/Video/Breakout.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Dueling Deep Q-Network/Video/Breakout.mp4 -------------------------------------------------------------------------------- /Dueling Deep Q-Network/Video/Pong.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Dueling Deep Q-Network/Video/Pong.mp4 -------------------------------------------------------------------------------- /Dueling Deep Q-Network/wrapper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import deque 3 | import gym 4 | 5 | from gym import spaces,wrappers 6 | import cv2 7 | cv2.ocl.setUseOpenCL(False) 8 | 9 | class NoopResetEnv(gym.Wrapper): 10 | def __init__(self, env, noop_max=30): 11 | """Sample initial states by taking random number of no-ops on reset. 12 | No-op is assumed to be action 0. 
13 | """ 14 | gym.Wrapper.__init__(self, env) 15 | self.noop_max = noop_max 16 | self.override_num_noops = None 17 | self.noop_action = 0 18 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 19 | 20 | def reset(self, **kwargs): 21 | """ Do no-op action for a number of steps in [1, noop_max].""" 22 | self.env.reset(**kwargs) 23 | if self.override_num_noops is not None: 24 | noops = self.override_num_noops 25 | else: 26 | noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 27 | assert noops > 0 28 | obs = None 29 | for _ in range(noops): 30 | obs, _, done, _ = self.env.step(self.noop_action) 31 | if done: 32 | obs = self.env.reset(**kwargs) 33 | return obs 34 | 35 | def step(self, ac): 36 | return self.env.step(ac) 37 | 38 | class FireResetEnv(gym.Wrapper): 39 | def __init__(self, env): 40 | """Take action on reset for environments that are fixed until firing.""" 41 | gym.Wrapper.__init__(self, env) 42 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 43 | assert len(env.unwrapped.get_action_meanings()) >= 3 44 | 45 | def reset(self, **kwargs): 46 | self.env.reset(**kwargs) 47 | obs, _, done, _ = self.env.step(1) 48 | if done: 49 | self.env.reset(**kwargs) 50 | obs, _, done, _ = self.env.step(2) 51 | if done: 52 | self.env.reset(**kwargs) 53 | return obs 54 | 55 | def step(self, ac): 56 | return self.env.step(ac) 57 | 58 | class EpisodicLifeEnv(gym.Wrapper): 59 | def __init__(self, env): 60 | """Make end-of-life == end-of-episode, but only reset on true game over. 61 | Done by DeepMind for the DQN and co. since it helps value estimation. 62 | """ 63 | gym.Wrapper.__init__(self, env) 64 | self.lives = 0 65 | self.was_real_done = True 66 | 67 | def step(self, action): 68 | obs, reward, done, info = self.env.step(action) 69 | self.was_real_done = done 70 | # check current lives, make loss of life terminal, 71 | # then update lives to handle bonus lives 72 | lives = self.env.unwrapped.ale.lives() 73 | if lives < self.lives and lives > 0: 74 | # for Qbert sometimes we stay in lives == 0 condtion for a few frames 75 | # so its important to keep lives > 0, so that we only reset once 76 | # the environment advertises done. 77 | done = True 78 | self.lives = lives 79 | return obs, reward, done, info 80 | 81 | def reset(self, **kwargs): 82 | """Reset only when lives are exhausted. 83 | This way all states are still reachable even though lives are episodic, 84 | and the learner need not know about any of this behind-the-scenes. 
85 | """ 86 | if self.was_real_done: 87 | obs = self.env.reset(**kwargs) 88 | else: 89 | # no-op step to advance from terminal/lost life state 90 | obs, _, _, _ = self.env.step(0) 91 | self.lives = self.env.unwrapped.ale.lives() 92 | return obs 93 | 94 | class MaxAndSkipEnv(gym.Wrapper): 95 | def __init__(self, env, skip=4): 96 | """Return only every `skip`-th frame""" 97 | gym.Wrapper.__init__(self, env) 98 | # most recent raw observations (for max pooling across time steps) 99 | self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) 100 | self._skip = skip 101 | 102 | def reset(self): 103 | return self.env.reset() 104 | 105 | def step(self, action): 106 | """Repeat action, sum reward, and max over last observations.""" 107 | total_reward = 0.0 108 | done = None 109 | for i in range(self._skip): 110 | obs, reward, done, info = self.env.step(action) 111 | if i == self._skip - 2: self._obs_buffer[0] = obs 112 | if i == self._skip - 1: self._obs_buffer[1] = obs 113 | total_reward += reward 114 | if done: 115 | break 116 | # Note that the observation on the done=True frame 117 | # doesn't matter 118 | max_frame = self._obs_buffer.max(axis=0) 119 | 120 | return max_frame, total_reward, done, info 121 | 122 | def reset(self, **kwargs): 123 | return self.env.reset(**kwargs) 124 | 125 | class ClipRewardEnv(gym.RewardWrapper): 126 | def __init__(self, env): 127 | gym.RewardWrapper.__init__(self, env) 128 | 129 | def reward(self, reward): 130 | """Bin reward to {+1, 0, -1} by its sign.""" 131 | return np.sign(reward) 132 | 133 | class WarpFrame(gym.ObservationWrapper): 134 | def __init__(self, env): 135 | """Warp frames to 84x84 as done in the Nature paper and later work.""" 136 | gym.ObservationWrapper.__init__(self, env) 137 | self.width = 84 138 | self.height = 84 139 | self.observation_space = spaces.Box(low=0, high=255, 140 | shape=(self.height, self.width, 1), dtype=np.uint8) 141 | 142 | def observation(self, frame): 143 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) 144 | frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA) 145 | return frame[:, :, None] 146 | 147 | class FrameStack(gym.Wrapper): 148 | def __init__(self, env, k): 149 | """Stack k last frames. 150 | 151 | Returns lazy array, which is much more memory efficient. 152 | 153 | See Also 154 | -------- 155 | baselines.common.atari_wrappers.LazyFrames 156 | """ 157 | gym.Wrapper.__init__(self, env) 158 | self.k = k 159 | self.frames = deque([], maxlen=k) 160 | shp = env.observation_space.shape 161 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8) 162 | 163 | def reset(self): 164 | ob = self.env.reset() 165 | for _ in range(self.k): 166 | self.frames.append(ob) 167 | return self._get_ob() 168 | 169 | def step(self, action): 170 | ob, reward, done, info = self.env.step(action) 171 | self.frames.append(ob) 172 | return self._get_ob(), reward, done, info 173 | 174 | def _get_ob(self): 175 | assert len(self.frames) == self.k 176 | return LazyFrames(list(self.frames)) 177 | 178 | class ScaledFloatFrame(gym.ObservationWrapper): 179 | def __init__(self, env): 180 | gym.ObservationWrapper.__init__(self, env) 181 | 182 | def observation(self, observation): 183 | # careful! This undoes the memory optimization, use 184 | # with smaller replay buffers only. 
185 | return np.array(observation).astype(np.float32) / 255.0 186 | 187 | class LazyFrames(object): 188 | def __init__(self, frames): 189 | """This object ensures that common frames between the observations are only stored once. 190 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay 191 | buffers. 192 | 193 | This object should only be converted to numpy array before being passed to the model. 194 | 195 | You'd not believe how complex the previous solution was.""" 196 | self._frames = frames 197 | self._out = None 198 | 199 | def _force(self): 200 | if self._out is None: 201 | self._out = np.concatenate(self._frames, axis=2) 202 | self._frames = None 203 | return self._out 204 | 205 | def __array__(self, dtype=None): 206 | out = self._force() 207 | if dtype is not None: 208 | out = out.astype(dtype) 209 | return out 210 | 211 | def __len__(self): 212 | return len(self._force()) 213 | 214 | def __getitem__(self, i): 215 | return self._force()[i] 216 | 217 | # EDIT BY ATAMAI 218 | # Preparing image received from environment and adjust it to expected format of Pytorch 219 | # HWC (height x width x channel) becomes CHW 220 | class PytorchImage(gym.ObservationWrapper): 221 | def __init__(self, env): 222 | super(PytorchImage, self).__init__(env) 223 | # we check current shape of observations in environment 224 | current_shape = self.observation_space.shape 225 | # we change order of dimensions - so last one (-1) becomes first 226 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(current_shape[-1], current_shape[0], current_shape[1])) 227 | 228 | def observation(self, observation): 229 | # and finally we change order of dimensions for every single observation 230 | # here transpose method could be also used 231 | return np.swapaxes(observation, 2, 0) 232 | 233 | def make_atari(env_id, monitor = False, epidsode_capture = 75): 234 | env = gym.make(env_id) 235 | if monitor == True: 236 | env = wrappers.Monitor(env, "Videos/", resume=True, force =True, video_callable=lambda episode_id: episode_id%epidsode_capture==0) 237 | assert 'NoFrameskip' in env.spec.id 238 | env = NoopResetEnv(env, noop_max=30) 239 | env = MaxAndSkipEnv(env, skip=4) 240 | return env 241 | 242 | def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False, pytorch_img=False): 243 | """Configure environment for DeepMind-style Atari. 
244 | """ 245 | if episode_life: 246 | env = EpisodicLifeEnv(env) 247 | if 'FIRE' in env.unwrapped.get_action_meanings(): 248 | env = FireResetEnv(env) 249 | env = WarpFrame(env) 250 | if scale: 251 | env = ScaledFloatFrame(env) 252 | if clip_rewards: 253 | env = ClipRewardEnv(env) 254 | if frame_stack: 255 | env = FrameStack(env, 4) 256 | if pytorch_img: 257 | env = PytorchImage(env) 258 | return env 259 | 260 | -------------------------------------------------------------------------------- /Paper/A3C.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/A3C.pdf -------------------------------------------------------------------------------- /Paper/DDPG.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/DDPG.pdf -------------------------------------------------------------------------------- /Paper/DQN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/DQN.pdf -------------------------------------------------------------------------------- /Paper/Distributional DQN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/Distributional DQN.pdf -------------------------------------------------------------------------------- /Paper/Double_DQN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/Double_DQN.pdf -------------------------------------------------------------------------------- /Paper/Dueling.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/Dueling.pdf -------------------------------------------------------------------------------- /Paper/GAE.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/GAE.pdf -------------------------------------------------------------------------------- /Paper/Noisy_networks.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/Noisy_networks.pdf -------------------------------------------------------------------------------- /Paper/PPO.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/PPO.pdf -------------------------------------------------------------------------------- /Paper/SAC_2019.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/SAC_2019.pdf -------------------------------------------------------------------------------- /Paper/TD3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Paper/TD3.pdf -------------------------------------------------------------------------------- /Policy Gradient Algorithms/Parallel_processing.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process, Pipe 2 | import numpy as np 3 | 4 | def worker(remote, parent_remote, env_fn_wrapper): 5 | parent_remote.close() 6 | env = env_fn_wrapper.x() 7 | while True: 8 | cmd, data = remote.recv() 9 | if cmd == 'step': 10 | ob, reward, done, info = env.step(data) 11 | if done: 12 | ob = env.reset() 13 | remote.send((ob, reward, done, info)) 14 | elif cmd == 'reset': 15 | ob = env.reset() 16 | remote.send(ob) 17 | elif cmd == 'reset_task': 18 | ob = env.reset_task() 19 | remote.send(ob) 20 | elif cmd == 'close': 21 | remote.close() 22 | break 23 | elif cmd == 'get_spaces': 24 | remote.send((env.observation_space, env.action_space)) 25 | else: 26 | raise NotImplementedError 27 | 28 | class CloudpickleWrapper(object): 29 | """ 30 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) 31 | """ 32 | def __init__(self, x): 33 | self.x = x 34 | def __getstate__(self): 35 | import cloudpickle 36 | return cloudpickle.dumps(self.x) 37 | def __setstate__(self, ob): 38 | import pickle 39 | self.x = pickle.loads(ob) 40 | 41 | 42 | class VecEnv(object): 43 | """ 44 | An abstract asynchronous, vectorized environment. 45 | """ 46 | def __init__(self, num_envs, observation_space, action_space): 47 | self.num_envs = num_envs 48 | self.observation_space = observation_space 49 | self.action_space = action_space 50 | 51 | def reset(self): 52 | """ 53 | Reset all the environments and return an array of 54 | observations, or a tuple of observation arrays. 55 | If step_async is still doing work, that work will 56 | be cancelled and step_wait() should not be called 57 | until step_async() is invoked again. 58 | """ 59 | pass 60 | 61 | def step_async(self, actions): 62 | """ 63 | Tell all the environments to start taking a step 64 | with the given actions. 65 | Call step_wait() to get the results of the step. 66 | You should not call this if a step_async run is 67 | already pending. 68 | """ 69 | pass 70 | 71 | def step_wait(self): 72 | """ 73 | Wait for the step taken with step_async(). 74 | Returns (obs, rews, dones, infos): 75 | - obs: an array of observations, or a tuple of 76 | arrays of observations. 77 | - rews: an array of rewards 78 | - dones: an array of "episode done" booleans 79 | - infos: a sequence of info objects 80 | """ 81 | pass 82 | 83 | def close(self): 84 | """ 85 | Clean up the environments' resources. 
86 | """ 87 | pass 88 | 89 | def step(self, actions): 90 | self.step_async(actions) 91 | return self.step_wait() 92 | 93 | class SubprocVecEnv(VecEnv): 94 | def __init__(self, env_fns, spaces=None): 95 | """ 96 | envs: list of gym environments to run in subprocesses 97 | """ 98 | self.waiting = False 99 | self.closed = False 100 | nenvs = len(env_fns) 101 | self.nenvs = nenvs 102 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) 103 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 104 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 105 | for p in self.ps: 106 | p.daemon = True # if the main process crashes, we should not cause things to hang 107 | p.start() 108 | for remote in self.work_remotes: 109 | remote.close() 110 | 111 | self.remotes[0].send(('get_spaces', None)) 112 | observation_space, action_space = self.remotes[0].recv() 113 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 114 | 115 | def step_async(self, actions): 116 | for remote, action in zip(self.remotes, actions): 117 | remote.send(('step', action)) 118 | self.waiting = True 119 | 120 | def step_wait(self): 121 | results = [remote.recv() for remote in self.remotes] 122 | self.waiting = False 123 | obs, rews, dones, infos = zip(*results) 124 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 125 | 126 | def reset(self): 127 | for remote in self.remotes: 128 | remote.send(('reset', None)) 129 | return np.stack([remote.recv() for remote in self.remotes]) 130 | 131 | def reset_task(self): 132 | for remote in self.remotes: 133 | remote.send(('reset_task', None)) 134 | return np.stack([remote.recv() for remote in self.remotes]) 135 | 136 | def close(self): 137 | if self.closed: 138 | return 139 | if self.waiting: 140 | for remote in self.remotes: 141 | remote.recv() 142 | for remote in self.remotes: 143 | remote.send(('close', None)) 144 | for p in self.ps: 145 | p.join() 146 | self.closed = True 147 | 148 | def __len__(self): 149 | return self.nenvs -------------------------------------------------------------------------------- /Policy Gradient Algorithms/REINFORCE/Img/Steps_needed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Policy Gradient Algorithms/REINFORCE/Img/Steps_needed.png -------------------------------------------------------------------------------- /Policy Gradient Algorithms/REINFORCE/REINFORCE.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import torch.optim as optim 4 | from torch.autograd import Variable 5 | import gym 6 | from gym import wrappers 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | from torch.distributions import Categorical 10 | 11 | 12 | class Policy(nn.Module): 13 | def __init__(self,input_shape,action_shape): 14 | super().__init__() 15 | 16 | self.model = nn.Sequential( 17 | nn.Linear(input_shape[0],64), 18 | nn.ReLU(), 19 | nn.Linear(64,32), 20 | nn.ReLU(), 21 | nn.Linear(32,action_shape), 22 | nn.Softmax(dim = 1) 23 | ) 24 | def forward(self,x): 25 | return self.model(x) 26 | 27 | def action(model, s): 28 | # simple pytorch aproach for action-selection and log-prob calc 29 | #https://pytorch.org/docs/stable/distributions.html 30 | prob = model(s) 31 | m = Categorical(prob) 32 | a = 
m.sample() 33 | # log p(a∣π(s)) 34 | log_p = m.log_prob(a) 35 | #print(a.item(), log_p) 36 | return a.item(), log_p 37 | 38 | # naive own numpy aproach attenion! grad gets lost by transforming prob to numpy: 39 | #possible_actions = [i for i in range(len(prob.data.detach().numpy()[0]))] 40 | # choose accordingly to probability: 41 | #action = np.random.choice(possible_actions, p = prob.data.detach().numpy()[0]) 42 | #calculate the log-prob for the chosen action: 43 | #grad = prob[0][action].grad_fn 44 | #log_prob = np.log(prob.data.detach().numpy()[0][action]) 45 | # transform to torch Tensor: 46 | #log_prob = torch.Tensor([log_prob]).unsqueeze(0) 47 | #log_prob = Variable(log_prob,requires_grad=True) 48 | #log_prob.backward() 49 | #print(log_prob) 50 | #print(action,log_prob) 51 | #return action, log_prob 52 | 53 | def policy_optimization(ep, model, optimizer,batch_rewards,log_probs): 54 | R = 0 55 | gamma = 0.99 56 | policy_loss = [] 57 | rewards = [] 58 | #calc discounted Rewards 59 | for r in batch_rewards[::-1]: # reverses the list of rewards 60 | R = r + gamma * R 61 | rewards.insert(0, R) # inserts the current rewart to first position 62 | 63 | rewards = torch.tensor(rewards) 64 | # standardization to get data of zero mean and varianz 1, stabilizes learning 65 | #-- attention scaling rewards looses information of special events with higher rewards - addapting on different environments 66 | rewards = (rewards - rewards.mean()) / (rewards.std() + ep) 67 | for log_prob, reward in zip(log_probs, rewards): 68 | policy_loss.append(-log_prob * reward) #baseline+ 69 | 70 | optimizer.zero_grad() 71 | policy_loss = torch.cat(policy_loss).sum() 72 | policy_loss.backward() 73 | optimizer.step() 74 | 75 | def run(episodes,model,env): 76 | optimizer = optim.Adam(model.parameters(), lr = 1e-2) 77 | rewards = [] 78 | steps_taken = [] 79 | 80 | for i in range(episodes): 81 | done = False 82 | ep_rewards = 0 83 | batch_rewards = [] 84 | log_probs = [] 85 | state = env.reset() 86 | steps = 0 87 | while not done: 88 | a, log_p = action(model, torch.Tensor(state).unsqueeze(0)) 89 | log_probs.append(log_p) 90 | new_state, reward, done, info = env.step(a) 91 | batch_rewards.append(reward) 92 | ep_rewards += reward 93 | steps +=1 94 | 95 | 96 | 97 | state = new_state 98 | 99 | 100 | rewards.append(ep_rewards) 101 | steps_taken.append(steps) 102 | print("Episode: {} --- Rewards: {} --- Steps: {}".format(i, ep_rewards, steps)) 103 | policy_optimization(i, model, optimizer, batch_rewards,log_probs) 104 | 105 | return steps_taken 106 | 107 | def main(): 108 | USE_CUDA = torch.cuda.is_available() 109 | Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs) 110 | 111 | env = gym.make("Acrobot-v1") 112 | env = wrappers.Monitor(env, "Saved_Videos/", resume=True, force = True, video_callable=lambda episode_id: episode_id%40==0) 113 | obs_shape = env.observation_space.shape 114 | action_shape = env.action_space.n 115 | episodes = 240 116 | model = Policy(obs_shape, action_shape) 117 | steps = run(episodes, model, env) 118 | 119 | plt.plot(steps) 120 | plt.xlabel("Episodes") 121 | plt.ylabel("Steps needed to reach goal") 122 | plt.show() 123 | 124 | if __name__ == "__main__": 125 | #Argparse: 126 | main() 127 | -------------------------------------------------------------------------------- /Q_Learning/FrozenLake_q-table.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy 
as np 3 | import gym 4 | import time 5 | 6 | EPISODES = 5000 7 | TRYS = 100 8 | EPSILON = 0.9 # epsilon greedy 9 | ALPHA = 0.1 # learning rate 10 | GAMMA = 0.9 #discount factor 11 | 12 | 13 | 14 | 15 | def make_Q_table(actions,n_states): 16 | table = pd.DataFrame( 17 | np.zeros((n_states, actions)), columns = list(range(actions))) # q_table initial values 18 | # print(table) # show table 19 | return table 20 | 21 | def choose_action(state, q_table): 22 | state_actions = q_table.iloc[state, :] 23 | if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()): # act non-greedy or state-action have no value 24 | action_name = np.random.choice(ACTIONS) 25 | else: # act greedy 26 | action_name = state_actions.idxmax() # replace argmax to idxmax as argmax means a different function in newer version of pandas 27 | #print("Action_choosen: "+str(action_name)) 28 | return action_name 29 | 30 | 31 | def RL(ACTIONS,N_SPACE): 32 | q_table = make_Q_table(ACTIONS,N_SPACE) 33 | for episode in range(EPISODES): 34 | S = env.reset() 35 | for one_try in range(TRYS): #how long one epidsode lasts 36 | 37 | env.render() 38 | A = choose_action(S, q_table) 39 | 40 | S_,R,done,info = env.step(A) 41 | #print(S_) 42 | #time.sleep(1) 43 | q_old = q_table.loc[S, A] #Current Q-Value of the state 44 | q_learned = R + GAMMA * q_table.iloc[S_, :].max() 45 | q_table.loc[S, A] += ALPHA * (q_learned - q_old) # update 46 | S = S_ # move to next state 47 | if done: 48 | print("Episode finished after {} timesteps".format(one_try+1)) 49 | break 50 | 51 | 52 | return q_table 53 | 54 | 55 | if __name__ =="__main__": 56 | env = gym.make("FrozenLake-v0") 57 | print(gym.__version__) 58 | 59 | env.reset() 60 | # getting space and action 61 | ACTIONS = env.action_space.n #env.unwrapped.get_action_meanings() to get a list of the action names 62 | N_SPACE = env.observation_space.n 63 | #print(ACTIONS) 64 | #print(N_SPACE) 65 | q_table = RL(ACTIONS,N_SPACE) 66 | print("Q-Table: \n") 67 | print(q_table) 68 | 69 | 70 | -------------------------------------------------------------------------------- /Q_Learning/Img/Q_table10000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Q_Learning/Img/Q_table10000.png -------------------------------------------------------------------------------- /Q_Learning/Img/Q_value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Q_Learning/Img/Q_value.png -------------------------------------------------------------------------------- /Q_Learning/Img/Receivedrewards.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Q_Learning/Img/Receivedrewards.png -------------------------------------------------------------------------------- /Q_Learning/Img/steps_taken.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Q_Learning/Img/steps_taken.png -------------------------------------------------------------------------------- 
/Q_Learning/Q_Table_E10000_a0.09_g0.9_eps0.9.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Q_Learning/Q_Table_E10000_a0.09_g0.9_eps0.9.pkl -------------------------------------------------------------------------------- /Q_Learning/Q_Table_E3000_a0.09_g0.9_eps0.9.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Q_Learning/Q_Table_E3000_a0.09_g0.9_eps0.9.pkl -------------------------------------------------------------------------------- /Q_Learning/Readme.md: -------------------------------------------------------------------------------- 1 | [image1]: ./Img/Q_value.png "Q-Value update equation" 2 | [image2]: ./Img/Q_table10000.png "Q-Table after 10000 episodes" 3 | [image3]: ./Img/Receivedrewards.png "Received rewards" 4 | [image4]: ./Img/steps_taken.png "Steps taken" 5 | 6 | 7 | 8 | 9 | 10 | # Q-Learning and Q-Table 11 | 12 | ## Creating the Q-Table 13 | The Q-Table is built from the number of states (n_states) and the number of actions (n_actions) and forms a matrix of shape n_states x n_actions. 14 | 15 | This already shows the limitations of tabular Q-learning: the number of states has to be finite and not too large, and the state space is not allowed to change during the game. 16 | 17 | ## Calculating the Q-Values 18 | 19 | The Q-Values are updated at every step with this formula: 20 | 21 | ![alt text][image1] 22 | 23 | There are limitations here as well. Since the Q-Values depend on the received rewards, and most of the time the only reward is given when reaching the goal state, it must be possible to reach the goal state through random actions. Otherwise the Q-Table stays a table of zeros. 24 | 25 | ## Testing on OpenAI Gym's FrozenLake environment 26 | After training for 10000 episodes, the following Q-Table was obtained: 27 | 28 | ![alt text][image2] 29 | 30 | Looking at the received rewards over the episodes, one can see that after roughly episode 1500 almost every subsequent try received a reward of 1, i.e. won the game. 31 | 32 | ![alt text][image3] 33 | 34 | The same holds for the steps taken: the number of steps per episode increases, which happens because the episodes are no longer ended early by failing.
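For reference, the update rule described above can be written in a few lines of NumPy. This is only a minimal sketch with made-up transition values; the scripts in this folder (`FrozenLake_q-table.py`, `train_FrozenLake_Qtable.py`) use a pandas DataFrame as the Q-Table, but apply the same rule.

```python
import numpy as np

ALPHA, GAMMA = 0.1, 0.9   # learning rate and discount factor, as in FrozenLake_q-table.py
Q = np.zeros((16, 4))     # FrozenLake-v0 Q-Table: n_states x n_actions

# one hypothetical transition that reaches the goal state (values made up for illustration)
S, A, R, S_ = 14, 2, 1.0, 15
q_learned = R + GAMMA * Q[S_].max()        # bootstrapped target
Q[S, A] += ALPHA * (q_learned - Q[S, A])   # Q-learning update
print(Q[S, A])                             # 0.1 after this first successful step
```

The plot below shows the steps taken per episode during training: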
35 | 36 | ![alt text][image4] 37 | 38 | -------------------------------------------------------------------------------- /Q_Learning/play_FrozenLake_Q_table.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import numpy as np 3 | import pandas as pd 4 | import time 5 | import gym 6 | import argparse 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("-e", "--Episoden",type = int,help ="Die Anzahl der zu trainierenden Episoden") 10 | parser.add_argument("-v", "--Video",type = bool,help ="Sollen die Versuche in einem Video aufgezeichnet werden?") 11 | parser.add_argument("-q", "--Q_Table",type = str,help ="Name der Q_table mit der gespielt werden soll") 12 | 13 | 14 | args = parser.parse_args() 15 | 16 | EPISODES = args.Episoden 17 | TRYS = 100 18 | AUFZEICHNUNG = args.Video 19 | Q_Table_name = args.Q_Table 20 | 21 | 22 | 23 | 24 | def load_Qtable(Q_table): 25 | Q = pd.read_pickle(Q_table) 26 | return Q 27 | 28 | 29 | def choose_action(state,Q_table): 30 | state_actions = Q_table.iloc[state, :] 31 | action_name1 = state_actions.idxmax() 32 | state_actions.pop(action_name1) 33 | action_name2 = state_actions.idxmax() 34 | if (np.random.uniform() > 0.4): 35 | print("Best action choosen!") 36 | return action_name1 37 | else: 38 | print("Second-best-action choosen!") 39 | return action_name2 40 | 41 | def play(): 42 | Q_Table = load_Qtable(Q_Table_name) 43 | for episode in range(EPISODES): 44 | S = env.reset() 45 | for one_try in range(TRYS): #how long one epidsode lasts 46 | 47 | env.render() 48 | A = choose_action(S, Q_Table) 49 | print("Action choosen: {}".format(A)) 50 | S_,R,done,info = env.step(A) 51 | #print(S_) 52 | time.sleep(2) 53 | 54 | # Addapting for further learning 55 | #print() 56 | #q_old = q_table.loc[S, A] #Current Q-Value of the state 57 | #q_learned = R + GAMMA * q_table.iloc[S_, :].max() 58 | #q_table.loc[S, A] += ALPHA * (q_learned - q_old) # update 59 | #S = S_ # move to next state 60 | 61 | if done: 62 | print("Episode finished after {} timesteps".format(one_try+1)) 63 | break 64 | 65 | 66 | if __name__ =="__main__": 67 | 68 | 69 | env = gym.make("FrozenLake-v0") 70 | print(gym.__version__) 71 | env.reset() 72 | 73 | play() 74 | 75 | # 0 - Down 76 | # 1 - -------------------------------------------------------------------------------- /Q_Learning/train_FrozenLake_Qtable.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import gym 4 | import time 5 | import matplotlib.pyplot as plt 6 | import argparse 7 | 8 | 9 | 10 | def make_Q_table(actions,n_states): 11 | table = pd.DataFrame( 12 | np.zeros((n_states, actions)), columns = list(range(actions))) # q_table initial values 13 | # print(table) # show table 14 | return table 15 | 16 | def choose_action(state, q_table): 17 | state_actions = q_table.iloc[state, :] 18 | if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()): # act non-greedy or state-action have no value 19 | action_name = np.random.choice(ACTIONS) 20 | else: # act greedy 21 | action_name = state_actions.idxmax() # replace argmax to idxmax as argmax means a different function in newer version of pandas 22 | #print("Action_choosen: "+str(action_name)) 23 | return action_name 24 | 25 | 26 | def RL(ACTIONS,N_SPACE): 27 | q_table = make_Q_table(ACTIONS,N_SPACE) 28 | reward_list = [] 29 | try_list = [] 30 | 31 | for episode in range(EPISODES): 32 | S = env.reset() 33 | rewards = 0 34 | steps = 
0 35 | for one_try in range(TRYS): #how long one epidsode lasts 36 | 37 | env.render() 38 | A = choose_action(S, q_table) 39 | 40 | S_,R,done,info = env.step(A) 41 | #print(S_) 42 | #time.sleep(1) 43 | print() 44 | q_old = q_table.loc[S, A] #Current Q-Value of the state 45 | q_learned = R + GAMMA * q_table.iloc[S_, :].max() 46 | q_table.loc[S, A] += ALPHA * (q_learned - q_old) # update 47 | S = S_ # move to next state 48 | rewards += R 49 | steps = one_try 50 | if done: 51 | print("Episode finished after {} timesteps".format(one_try+1)) 52 | steps = one_try+1 53 | break 54 | reward_list.append(rewards) 55 | try_list.append(steps+1) 56 | 57 | 58 | return q_table,reward_list,try_list 59 | 60 | 61 | 62 | 63 | 64 | 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("-e", "--Episoden",type = int,help ="Die Anzahl der zu trainierenden Episoden") 67 | parser.add_argument("-a", "--Alpha",type = float,help ="Learning Rate ~0.1") 68 | parser.add_argument("-g", "--Gamma",type = float,help ="Discount Factor ~0.9") 69 | parser.add_argument("-eps", "--Epsilon",type = float,help ="Epsilon- for the Epsilon-Greedy decision process ~0.9") 70 | 71 | args = parser.parse_args() 72 | 73 | EPISODES = args.Episoden 74 | TRYS = 100 75 | EPSILON = args.Epsilon # epsilon greedy 76 | ALPHA = args.Alpha # learning rate 77 | GAMMA = args.Gamma #discount factor 78 | 79 | if __name__ =="__main__": 80 | 81 | 82 | env = gym.make("FrozenLake-v0") 83 | print(gym.__version__) 84 | env.reset() 85 | # getting space and action 86 | ACTIONS = env.action_space.n #env.unwrapped.get_action_meanings() to get a list of the action names 87 | N_SPACE = env.observation_space.n 88 | #print(ACTIONS) 89 | #print(N_SPACE) 90 | q_table,rlist,steps = RL(ACTIONS,N_SPACE) 91 | 92 | plt.plot(rlist) 93 | plt.title("Received Rewards") 94 | plt.xlabel("Epochs") 95 | plt.ylabel("Rewards") 96 | plt.show() 97 | 98 | plt.plot(steps) 99 | plt.title("Needed steps to finish one episode") 100 | plt.xlabel("Epochs") 101 | plt.ylabel("Steps") 102 | plt.show() 103 | 104 | 105 | 106 | 107 | 108 | print("Q-Table: \n") 109 | print(q_table) 110 | 111 | print("\nDo you want to save the Q-Table? 
\n") 112 | answer = input("[y/n]") 113 | 114 | if answer == "y": 115 | q_table.to_pickle("./Q_Table_E{}_a{}_g{}_eps{}.pkl".format(EPISODES,ALPHA,GAMMA,EPSILON)) 116 | else: 117 | pass 118 | 119 | -------------------------------------------------------------------------------- /Q_Learning/treasure_q.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import time 4 | 5 | np.random.seed(2) # reproducible 6 | 7 | 8 | N_STATES = 6 # the length of the 1 dimensional world 9 | ACTIONS = ['left', 'right'] # available actions 10 | EPSILON = 0.9 # greedy police 11 | ALPHA = 0.1 # learning rate 12 | GAMMA = 0.9 # discount factor 13 | MAX_EPISODES = 18 # maximum episodes 14 | FRESH_TIME = 0.3 # fresh time for one move 15 | 16 | 17 | def build_q_table(n_states, actions): 18 | table = pd.DataFrame( 19 | np.zeros((n_states, len(actions))), # q_table initial values 20 | columns=actions, # actions's name 21 | ) 22 | # print(table) # show table 23 | return table 24 | 25 | 26 | def choose_action(state, q_table): 27 | # This is how to choose an action 28 | state_actions = q_table.iloc[state, :] 29 | if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()): # act non-greedy or state-action have no value 30 | action_name = np.random.choice(ACTIONS) 31 | else: # act greedy 32 | action_name = state_actions.idxmax() # replace argmax to idxmax as argmax means a different function in newer version of pandas 33 | return action_name 34 | 35 | 36 | def get_env_feedback(S, A): 37 | # This is how agent will interact with the environment 38 | if A == 'right': # move right 39 | if S == N_STATES - 2: # terminate 40 | S_ = 'terminal' 41 | R = 1 42 | else: 43 | S_ = S + 1 44 | R = 0 45 | else: # move left 46 | R = 0 47 | if S == 0: 48 | S_ = S # reach the wall 49 | else: 50 | S_ = S - 1 51 | return S_, R 52 | 53 | 54 | def update_env(S, episode, step_counter): 55 | # This is how environment be updated 56 | env_list = ['-']*(N_STATES-1) + ['T'] # '---------T' our environment 57 | if S == 'terminal': 58 | interaction = 'Episode %s: total_steps = %s' % (episode+1, step_counter) 59 | print('\r{}'.format(interaction), end='') 60 | time.sleep(2) 61 | print('\r ', end='') 62 | else: 63 | env_list[S] = 'o' 64 | interaction = ''.join(env_list) 65 | print('\r{}'.format(interaction), end='') 66 | time.sleep(FRESH_TIME) 67 | 68 | 69 | def rl(): 70 | # main part of RL loop 71 | q_table = build_q_table(N_STATES, ACTIONS) 72 | for episode in range(MAX_EPISODES): 73 | step_counter = 0 74 | S = 0 75 | is_terminated = False 76 | update_env(S, episode, step_counter) 77 | while not is_terminated: 78 | 79 | A = choose_action(S, q_table) 80 | S_, R = get_env_feedback(S, A) # take action & get next state and reward 81 | q_predict = q_table.loc[S, A] 82 | if S_ != 'terminal': 83 | q_target = R + GAMMA * q_table.iloc[S_, :].max() # next state is not terminal 84 | else: 85 | q_target = R # next state is terminal 86 | is_terminated = True # terminate this episode 87 | 88 | q_table.loc[S, A] += ALPHA * (q_target - q_predict) # update 89 | S = S_ # move to next state 90 | 91 | update_env(S, episode, step_counter+1) 92 | step_counter += 1 93 | return q_table 94 | 95 | 96 | if __name__ == "__main__": 97 | q_table = rl() 98 | print('\r\nQ-table:\n') 99 | print(q_table) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 
Deep-Reinforcement-Learning 2 | 3 | 4 | ![Logo](/imgs/web-3706562_640.jpg) 5 | 6 | Collection of Deep Reinforcement Learning Algorithms in PyTorch. 7 | 8 | 9 | Below a list of Jupyter Notebooks with implementations 10 | 11 | # Value Based / Offline Methods 12 | ## Discrete Action Space 13 | 14 | - [Q-Learning](Q_Learning)     [Source/Paper](/Paper/DQN.pdf) 15 | 16 | - [DQN](https://github.com/BY571/Reinforcement-Learning/tree/master/Deep%20Q_Learning)      [Paper](/Paper/DQN.pdf) 17 | 18 | - [Double DQN](https://github.com/BY571/Reinforcement-Learning/tree/master/Double%20DQN)      [Paper](/Paper/Double_DQN.pdf) 19 | 20 | - [Dueling DQN](https://github.com/BY571/DQN-Atari-Agents)      [Paper](/Paper/Dueling.pdf) 21 | 22 | - [N-Step DQN](https://github.com/BY571/DQN-Atari-Agents) 23 | 24 | - [Noisy DQN](https://github.com/BY571/DQN-Atari-Agents) 25 |      [Paper](/Paper/Noisy_networks.pdf) 26 | 27 | - [Rainbow](https://github.com/BY571/DQN-Atari-Agents) 28 |     [Paper](https://arxiv.org/pdf/1710.02298.pdf) 29 | 30 | ## Distributional RL 31 | 32 | - [Categorical DQN - C51](https://github.com/BY571/DQN-Atari-Agents)     [Paper](https://github.com/BY571/Reinforcement-Learning/blob/master/Paper/Distributional%20DQN.pdf) 33 | 34 | - [QR-DQN](https://github.com/BY571/QR-DQN) 35 | 36 | - [IQN](https://github.com/BY571/IQN-and-Extensions) 37 | 38 | - [FQF](https://github.com/BY571/FQF-and-Extensions) 39 | 40 | 41 | ## Continuous Action Space 42 | 43 | - [NAF - Normalized Advantage Function](https://github.com/BY571/Normalized-Advantage-Function-NAF-) 44 | 45 | -[Soft-DQN] TODO 46 | _________________________________________________ 47 | # Policy Based / Online Methods 48 | ## Discrete Action Space 49 | 50 | 51 | - [Sarsa](https://github.com/BY571/Reinforcement-Learning/blob/master/Temporal%20Difference%20(Sarsa%2C%20Sarsamax%2C%20Expeted%20Sarsa)/Temporal_Difference.ipynb) 52 | [Source/Paper] 53 | 54 | 55 | - [Vanilla Policy Gradient](https://github.com/BY571/Reinforcement-Learning/blob/master/Policy%20Gradient%20Algorithms/Policy_Gradien_%2B_Baseline_mean.ipynb) [+LSTM](https://github.com/BY571/Reinforcement-Learning/blob/master/Policy%20Gradient%20Algorithms/PolicyGradient_LSTM.ipynb) 56 | [Source/Paper] 57 | 58 | 59 | - A2C 60 | [Paper](/Paper/A3C.pdf) 61 | 62 | - A2C with gae* [TODO] 63 | 64 | - A2C multi environment 65 | 66 | 67 | - PPO 68 | [Paper](/Paper/PPO.pdf) 69 | 70 | - PPO with gae* 71 | 72 | - [PPO with gae and curiosity driven exploration (single, digit inputs)](https://github.com/BY571/Reinforcement-Learning/blob/master/PPO_gae_curios.ipynb) [Paper](/Paper/) 73 | 74 | - PPO multi environment 75 | 76 | 77 | ## Continuous Action Space 78 | 79 | - [A2C](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/A2C_conti_seperate_networks.ipynb) 80 | 81 | - A2C with gae* [TODO] 82 | 83 | - [A2C multi environment](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/A2C_continuous_multienv.ipynb) 84 | 85 | 86 | - [PPO](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/PPO_unity_Crawler.ipynb) 87 | 88 | - [PPO with gae*](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/ROBOSCHOOL_PPO_GAE.ipynb)[PPO with gae multi](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/PPO_conti_gae_multi.ipynb) 89 | 90 | - 
[PPO+curiosity&single](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/PPO_conti_gae_curios.ipynb)[+curiosity&multi](https://github.com/BY571/Reinforcement-Learning/blob/master/PPO_conti_gae_curio_multi.ipynb) 91 | 92 | - [PPO multi environment](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/PPO_unity_Crawler.ipynb) 93 | 94 | 95 | 96 | 97 | gae* = Generalized Advantage Estimation [Source](/Paper/GAE.pdf) 98 | 99 | ______________________________________________ 100 | 101 | # Actor-Critic Algorithms 102 | 103 | - [DDPG](https://github.com/BY571/Udacity-DRL-Nanodegree-P2) 104 | [Source/Paper] 105 | 106 | - [D4PG](https://github.com/BY571/D4PG) 107 | [Source/Paper](https://arxiv.org/pdf/1804.08617.pdf) 108 | 109 | - [Twin Delayed DDPG (TD3)](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/TD3_conti.ipynb) 110 |     [Paper](https://github.com/BY571/Reinforcement-Learning/blob/master/Paper/TD3.pdf) 111 | 112 | - [Soft Actor Critic (SAC-newest 2019 version)](https://github.com/BY571/Reinforcement-Learning/blob/master/ContinousControl/SAC.ipynb) 113 |     [Paper](https://github.com/BY571/Reinforcement-Learning/blob/master/Paper/SAC_2019.pdf) 114 | 115 | ________________________________________________ 116 | 117 | # Upside-Down-Reinforcement-Learning 118 | Discrete and continuous action space implementation of [⅂ꓤ](https://github.com/BY571/Upside-Down-Reinforcement-Learning) 119 | 120 | ________________________________________________ 121 | # Munchausen Reinforcement Learning 122 | 123 | Implementations of Munchausen RL 124 | 125 | - [M-DQN](https://github.com/BY571/Munchausen-RL) 126 | 127 | - [M-IQN](https://github.com/BY571/IQN-and-Extensions) 128 | 129 | - [M-FQF](https://github.com/BY571/FQF-and-Extensions) 130 | 131 | - [M-SAC](https://github.com/BY571/Soft-Actor-Critic-and-Extensions) 132 | 133 | 134 | ________________________________________________ 135 | 136 | # Model-Based RL 137 | 138 | __________________________________________________ 139 | 140 | # Black-Box Optimization 141 | 142 | - [Evolution Strategies]() [with multiprocessing](https://github.com/BY571/Reinforcement-Learning/blob/master/Black-Box%20Optimization/Evolution_Strategies_parallel+novelty/README.md) [and novelty search](https://github.com/BY571/Reinforcement-Learning/blob/master/Black-Box%20Optimization/Evolution_Strategies_parallel+novelty/README.md) 143 | 144 | - [Genetic Algorithm - GARNE](https://github.com/BY571/GARNE-Genetic-Algorithm-with-Recurrent-Network-and-Novelty-Exploration/blob/master/README.md) 145 | - Genetic Algorithm implementation with LSTM, Multiprocessing over several CPUs and Novelty Search for Exploration 146 | __________________________________________ 147 | # Multi-Agent Deep Reinforcement Learning 148 | 149 | - [Multi-Agent-DDPG](https://github.com/BY571/Udacity-DRL-Nanodegree-P3-Multiagent-RL-) 150 | 151 | # Hyperparameter Tuning 152 | 153 | Gridsearch 154 | 155 | Random Forest [TODO] 156 | 157 | Genetic Algorithm [TODO] 158 | 159 | ==================================== 160 | 161 | 162 | -------------------------------------------------------------------------------- /Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/README.md: -------------------------------------------------------------------------------- 1 | # Taxi Problem 2 | 3 | ### Getting Started 4 | 5 | Read the description of the environment in subsection 3.1 of [this paper](https://arxiv.org/pdf/cs/9905014.pdf).
You can verify that the description in the paper matches the OpenAI Gym environment by peeking at the code [here](https://github.com/openai/gym/blob/master/gym/envs/toy_text/taxi.py). 6 | 7 | 8 | ### Instructions 9 | 10 | The repository contains three files: 11 | - `agent.py`: Develop your reinforcement learning agent here. This is the only file that you should modify. 12 | - `monitor.py`: The `interact` function tests how well your agent learns from interaction with the environment. 13 | - `main.py`: Run this file in the terminal to check the performance of your agent. 14 | 15 | Begin by running the following command in the terminal: 16 | ``` 17 | python main.py 18 | ``` 19 | 20 | When you run `main.py`, the agent that you specify in `agent.py` interacts with the environment for 20,000 episodes. The details of the interaction are specified in `monitor.py`, which returns two variables: `avg_rewards` and `best_avg_reward`. 21 | - `avg_rewards` is a deque where `avg_rewards[i]` is the average (undiscounted) return collected by the agent from episodes `i+1` to episode `i+100`, inclusive. So, for instance, `avg_rewards[0]` is the average return collected by the agent over the first 100 episodes. 22 | - `best_avg_reward` is the largest entry in `avg_rewards`. This is the final score that you should use when determining how well your agent performed in the task. 23 | 24 | Your assignment is to modify the `agents.py` file to improve the agent's performance. 25 | - Use the `__init__()` method to define any needed instance variables. Currently, we define the number of actions available to the agent (`nA`) and initialize the action values (`Q`) to an empty dictionary of arrays. Feel free to add more instance variables; for example, you may find it useful to define the value of epsilon if the agent uses an epsilon-greedy policy for selecting actions. 26 | - The `select_action()` method accepts the environment state as input and returns the agent's choice of action. The default code that we have provided randomly selects an action. 27 | - The `step()` method accepts a (`state`, `action`, `reward`, `next_state`) tuple as input, along with the `done` variable, which is `True` if the episode has ended. The default code (which you should certainly change!) increments the action value of the previous state-action pair by 1. You should change this method to use the sampled tuple of experience to update the agent's knowledge of the problem. 28 | 29 | Once you have modified the function, you need only run `python main.py` to test your new agent. 30 | 31 | OpenAI Gym [defines "solving"](https://gym.openai.com/envs/Taxi-v1/) this task as getting average return of 9.7 over 100 consecutive trials. 
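For orientation, here is one possible shape such an agent can take: an epsilon-greedy `select_action()` and a Sarsamax (Q-learning) update in `step()`. This is only a sketch; the class name, default hyperparameters, and the fixed epsilon are placeholder choices. The `agent.py` in this folder implements the same idea and receives a decaying epsilon from `monitor.py`.

```python
import numpy as np
from collections import defaultdict


class SketchAgent:
    """Minimal epsilon-greedy Sarsamax (Q-learning) agent skeleton."""

    def __init__(self, nA=6, alpha=0.02, gamma=0.9):
        self.nA = nA
        self.alpha = alpha
        self.gamma = gamma
        self.Q = defaultdict(lambda: np.zeros(nA))

    def select_action(self, state, epsilon=0.1):
        # explore with probability epsilon, otherwise act greedily on Q[state]
        if np.random.random() < epsilon:
            return np.random.randint(self.nA)
        return int(np.argmax(self.Q[state]))

    def step(self, state, action, reward, next_state, done):
        # Sarsamax target: bootstrap from the best next action, zero at episode end
        target = 0.0 if done else np.max(self.Q[next_state])
        self.Q[state][action] += self.alpha * (reward + self.gamma * target
                                               - self.Q[state][action])
```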
32 | -------------------------------------------------------------------------------- /Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/__pycache__/agent.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/__pycache__/agent.cpython-37.pyc -------------------------------------------------------------------------------- /Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/__pycache__/monitor.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/__pycache__/monitor.cpython-37.pyc -------------------------------------------------------------------------------- /Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import defaultdict 3 | 4 | 5 | class Agent: 6 | 7 | def __init__(self, nA=6, learning_rate = 0.02, gamma = 0.9): 8 | """ Initialize agent. 9 | 10 | Params 11 | ====== 12 | - nA: number of actions available to the agent 13 | """ 14 | self.nA = nA 15 | self.Q = defaultdict(lambda: np.zeros(self.nA)) 16 | self.lr = learning_rate 17 | self.gamma = gamma 18 | 19 | 20 | def probabilities(self,q, epsilon): 21 | probs = np.ones(self.nA) * epsilon/self.nA 22 | best_action = np.argmax(q) 23 | probs[best_action] = (1 - epsilon) + epsilon/self.nA 24 | return probs 25 | 26 | def select_action(self, state, epsilon): 27 | """ Given the state, select an action. 28 | 29 | Params 30 | ====== 31 | - state: the current state of the environment 32 | 33 | Returns 34 | ======= 35 | - action: an integer, compatible with the task's action space 36 | """ 37 | action = np.random.choice(np.arange(self.nA), p = self.probabilities(self.Q[state], epsilon)) \ 38 | if state in self.Q else np.random.choice(np.arange(self.nA)) 39 | return action 40 | 41 | 42 | 43 | def step(self, state, action, reward, next_state, done): 44 | """ Update the agent's knowledge, using the most recently sampled tuple. 
45 | 46 | Params 47 | ====== 48 | - state: the previous state of the environment 49 | - action: the agent's previous choice of action 50 | - reward: last reward received 51 | - next_state: the current state of the environment 52 | - done: whether the episode is complete (True or False) 53 | """ 54 | Q_target = np.max(self.Q[next_state]) 55 | self.Q[state][action] = self.Q[state][action] + self.lr * (reward + self.gamma*(1-done)*Q_target - self.Q[state][action] ) -------------------------------------------------------------------------------- /Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/main.py: -------------------------------------------------------------------------------- 1 | from agent import Agent 2 | from monitor import interact 3 | import gym 4 | import numpy as np 5 | 6 | env = gym.make('Taxi-v2') 7 | agent = Agent() 8 | avg_rewards, best_avg_reward = interact(env, agent) -------------------------------------------------------------------------------- /Temporal Difference (Sarsa, Sarsamax, Expeted Sarsa)/lab-taxi/monitor.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import sys 3 | import math 4 | import numpy as np 5 | 6 | def interact(env, agent, num_episodes=20000, window=100,epsilon_start = 1, epsilon_decay = 0.9999, epsilon_min = 0.01): 7 | """ Monitor agent's performance. 8 | 9 | Params 10 | ====== 11 | - env: instance of OpenAI Gym's Taxi-v1 environment 12 | - agent: instance of class Agent (see Agent.py for details) 13 | - num_episodes: number of episodes of agent-environment interaction 14 | - window: number of episodes to consider when calculating average rewards 15 | 16 | Returns 17 | ======= 18 | - avg_rewards: deque containing average rewards 19 | - best_avg_reward: largest value in the avg_rewards deque 20 | """ 21 | # initialize average rewards 22 | avg_rewards = deque(maxlen=num_episodes) 23 | # initialize best average reward 24 | best_avg_reward = -math.inf 25 | # initialize monitor for most recent rewards 26 | samp_rewards = deque(maxlen=window) 27 | # for each episode 28 | epsilon = epsilon_start 29 | for i_episode in range(1, num_episodes+1): 30 | # begin the episode 31 | state = env.reset() 32 | # initialize the sampled reward 33 | samp_reward = 0 34 | while True: 35 | epsilon = max(epsilon*epsilon_decay,epsilon_min) 36 | # agent selects an action 37 | action = agent.select_action(state, epsilon) # 38 | # agent performs the selected action 39 | next_state, reward, done, _ = env.step(action) 40 | # agent performs internal updates based on sampled experience 41 | agent.step(state, action, reward, next_state, done) 42 | # update the sampled reward 43 | samp_reward += reward 44 | # update the state (s <- s') to next time step 45 | state = next_state 46 | if done: 47 | # save final sampled reward 48 | samp_rewards.append(samp_reward) 49 | break 50 | if (i_episode >= 100): 51 | # get average reward from last 100 episodes 52 | avg_reward = np.mean(samp_rewards) 53 | # append to deque 54 | avg_rewards.append(avg_reward) 55 | # update best average reward 56 | if avg_reward > best_avg_reward: 57 | best_avg_reward = avg_reward 58 | # monitor progress 59 | print("\rEpisode {}/{} || Best average reward {} || Epsilon {}".format(i_episode, num_episodes, best_avg_reward, epsilon), end="") 60 | sys.stdout.flush() 61 | # check if task is solved (according to OpenAI Gym) 62 | if best_avg_reward >= 9.7: 63 | print('\nEnvironment solved in {} episodes.'.format(i_episode), end="") 64 | break 
65 | if i_episode == num_episodes: print('\n') 66 | return avg_rewards, best_avg_reward -------------------------------------------------------------------------------- /imgs/web-3706562_640.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BY571/Deep-Reinforcement-Learning-Algorithm-Collection/2a5f37ce65da561a8e11cd7bbd03b62038e6ccba/imgs/web-3706562_640.jpg --------------------------------------------------------------------------------