├── .idea
│   ├── .gitignore
│   ├── RL.iml
│   ├── codeStyles
│   │   └── codeStyleConfig.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── other.xml
│   └── vcs.xml
├── A2C
│   ├── A2C Episodic Sync
│   │   ├── SharedAdam.py
│   │   ├── evaluate.py
│   │   ├── pyTorch_CartPole_A2C_ES.py
│   │   └── workers.py
│   ├── A2C Time Interval Sync
│   │   ├── SharedAdam.py
│   │   ├── evaluate.py
│   │   ├── pyTorch_CartPole_A2C_IS.py
│   │   └── workers.py
│   └── README About Vairations.md
├── A3C
│   ├── A3C Episodic Async
│   │   ├── SharedAdam.py
│   │   ├── evaluate.py
│   │   ├── pyTorch_CartPole_A3C_EA.py
│   │   └── workers_PlayGround.py
│   ├── A3C Time Interval Async
│   │   ├── SharedAdam.py
│   │   ├── evaluate.py
│   │   ├── pyTorch_CartPole_A3C_IA.py
│   │   └── workers.py
│   └── README About Vairations.md
├── Ape-X
│   └── PyTorch_Ape-X.py
├── D4PG
│   └── PyTorch_D4PG.py
├── DDPG
│   └── PyTorch_DDPG.py
├── Deuling Double DQN with PER
│   └── PyTorch_Deuling_DDQN_with_PER.py
├── Deuling Double DQN
│   └── PyTorch_Deuling_DDQN.py
├── Experiments
│   ├── Online TD and true Online TD.ipynb
│   └── Seijen2014_True_Online_TD.ipynb
├── MASM
│   ├── Differential_semi_gradient_Sarsa.ipynb
│   └── prototype.ipynb
├── Off-Policy Policy Gradient
│   ├── Experiment Log of failure of Off_policy_Actor_Critic
│   ├── pyTorch_CartPole_Off_Policy_Actor_Critic[not work].ipynb
│   └── pyTorch_CartPole_Off_Policy_REINFORCE.ipynb
├── Plain-Actor-Critic
│   ├── pyTorch_CartPole_Advantage_Actor_Critic_Entropy_Regularized.ipynb
│   ├── pyTorch_CartPole_Advantage_Episode_Wise_Actor_Critic_Huber.ipynb
│   └── pyTorch_CartPole_Step_Wise_Bootstrap_Actor_Critic.ipynb
├── RAINBOW
│   └── PyTorch_RAINBOW.py
├── README.md
└── REINFORCE
    └── pyTorch_CartPole_REINFORCE.ipynb
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /workspace.xml
--------------------------------------------------------------------------------
/A2C/A2C Episodic Sync/SharedAdam.py:
--------------------------------------------------------------------------------
1 | # Copied from https://github.com/ikostrikov/pytorch-a3c/blob/master/my_optim.py
2 | # A very nice
optimization of Adam method to make it receive shared states. 3 | import math 4 | import torch 5 | import torch.optim as optim 6 | 7 | 8 | class SharedAdam(optim.Adam): 9 | """Implements Adam algorithm with shared states. 10 | """ 11 | 12 | def __init__(self, 13 | params, 14 | lr=1e-3, 15 | betas=(0.9, 0.999), 16 | eps=1e-8, 17 | weight_decay=0): 18 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay) 19 | 20 | for group in self.param_groups: 21 | for p in group['params']: 22 | state = self.state[p] 23 | state['step'] = torch.zeros(1) 24 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_() 25 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_() 26 | 27 | def share_memory(self): 28 | for group in self.param_groups: 29 | for p in group['params']: 30 | state = self.state[p] 31 | state['step'].share_memory_() 32 | state['exp_avg'].share_memory_() 33 | state['exp_avg_sq'].share_memory_() 34 | 35 | def step(self, closure=None): 36 | """Performs a single optimization step. 37 | Arguments: 38 | closure (callable, optional): A closure that reevaluates the model 39 | and returns the loss. 40 | """ 41 | loss = None 42 | if closure is not None: 43 | loss = closure() 44 | 45 | for group in self.param_groups: 46 | for p in group['params']: 47 | if p.grad is None: 48 | continue 49 | grad = p.grad.data 50 | state = self.state[p] 51 | 52 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 53 | beta1, beta2 = group['betas'] 54 | 55 | state['step'] += 1 56 | 57 | if group['weight_decay'] != 0: 58 | grad = grad.add(group['weight_decay'], p.data) 59 | 60 | # Decay the first and second moment running average coefficient 61 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 62 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 63 | 64 | denom = exp_avg_sq.sqrt().add_(group['eps']) 65 | 66 | bias_correction1 = 1 - beta1 ** state['step'].item() 67 | bias_correction2 = 1 - beta2 ** state['step'].item() 68 | step_size = group['lr'] * math.sqrt( 69 | bias_correction2) / bias_correction1 70 | 71 | p.data.addcdiv_(-step_size, exp_avg, denom) 72 | 73 | return loss -------------------------------------------------------------------------------- /A2C/A2C Episodic Sync/evaluate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import gym 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.nn import Linear, ReLU 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | def evaluate(shared_model, q): 11 | class Actor(torch.nn.Module): 12 | def __init__(self): 13 | super(Actor, self).__init__() 14 | self.fc1 = Linear(4, 128) 15 | self.fc2 = Linear(128, 128) 16 | self.fc3 = Linear(128, 2) 17 | self.fc4 = Linear(128, 1) 18 | self.steps = [] 19 | 20 | def forward(self, x): 21 | x = F.relu(self.fc1(x)) 22 | x = F.relu(self.fc2(x)) 23 | action = F.log_softmax(self.fc3(x), dim=-1) 24 | V = F.relu(self.fc4(x)) 25 | return action, V 26 | 27 | device = 'cpu' 28 | # I do not recommend using GPU for this method. CPU is much faster. 
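# (Likely because the network here is a tiny 4-input MLP, so per-step host-to-GPU transfers would cost more than the forward pass itself.)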
29 | # Change this to cuda only if you have a poor CPU or on a cloud 30 | env = gym.make('CartPole-v0') 31 | obs = env.reset() 32 | actor = Actor() 33 | actor.to(device) 34 | for episode in range(1): 35 | action_log_history = [] 36 | for step in range(200): 37 | actor.load_state_dict(shared_model.state_dict()) 38 | # -----lines below are line-corresponding to the original algorithm---- 39 | obs = np.reshape(obs, [1, -1]) 40 | input_actor = Variable(torch.from_numpy(obs).float()).to(device) 41 | action_log_probability, V = actor(input_actor) 42 | p = np.exp(action_log_probability[0].detach().cpu()) 43 | action = np.random.choice(2, p=p.numpy()) 44 | action_log_history.append(action_log_probability[0][action]) 45 | obs, reward, done, info = env.step(action) 46 | if done: 47 | q.put(step) 48 | return -------------------------------------------------------------------------------- /A2C/A2C Episodic Sync/pyTorch_CartPole_A2C_ES.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.nn import Linear, ReLU 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import torch.multiprocessing as mp 9 | from workers import worker 10 | from evaluate import evaluate 11 | from SharedAdam import SharedAdam 12 | from time import perf_counter 13 | 14 | class Actor(torch.nn.Module): 15 | def __init__(self): 16 | super(Actor,self).__init__() 17 | self.fc1 = Linear(4, 128) 18 | self.fc2 = Linear(128, 128) 19 | self.fc3 = Linear(128, 2) 20 | self.fc4 = Linear(128, 1) 21 | self.steps = [] 22 | 23 | def forward(self, x): 24 | x = F.relu(self.fc1(x)) 25 | x = F.relu(self.fc2(x)) 26 | action = F.log_softmax(self.fc3(x), dim=-1) 27 | V = F.relu(self.fc4(x)) 28 | return action, V 29 | 30 | def draw(self, eval = False): 31 | plt.style.use('dark_background') 32 | plt.figure(figsize=(10, 10)) 33 | if eval: 34 | plt.title('Evaluation of trained A3C with Shared Adam Optimizer on CartPole_V0', fontsize='xx-large') 35 | plt.xlabel('Rewards', fontsize='xx-large') 36 | plt.ylabel('Frequency', fontsize='xx-large') 37 | plt.hist(self.steps, range=(0, 200)) 38 | plt.show() 39 | else: 40 | mid = [] 41 | interval = 3 42 | for i in range(len(self.steps) - interval): 43 | mid.append(np.mean(self.steps[i:i + interval + 1])) 44 | plt.title('Performance of A3C with Shared Adam Optimizer on CartPole_V0', fontsize='xx-large') 45 | plt.xlabel('Episodes', fontsize='xx-large') 46 | plt.ylabel('Rewards', fontsize='xx-large') 47 | x_fit = list(range(len(self.steps) - interval)) 48 | plt.plot(x_fit, self.steps[interval:], '-', c='gray', label='Episode-Wise data') 49 | plt.plot(mid, '-', c='green', linewidth=5, label='Moving Average') 50 | plt.legend(loc="best", prop={'size': 12}) 51 | plt.show() 52 | 53 | 54 | if __name__ == '__main__': 55 | device = 'cpu' 56 | mp.set_start_method('spawn') 57 | # Do not change this unless you have multiple GPU. 
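# Note: the 'spawn' start method above is required; the repository README reports that 'fork' does not work for this multiprocessing setup.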
58 | # update test 59 | q = mp.Queue() 60 | num_workers = 7 61 | processes = [] 62 | shared_model = Actor() 63 | shared_model.to(device) 64 | shared_model.share_memory() 65 | optimizer = SharedAdam(shared_model.parameters(), lr=0.003) 66 | for episode in range(10000): 67 | t1_start = perf_counter() 68 | p = mp.Process(target=evaluate, args=(shared_model, q)) 69 | processes.append(p) 70 | p.start() 71 | for worker_id in range(num_workers): 72 | p = mp.Process(target = worker, args = (shared_model, optimizer)) 73 | processes.append(p) 74 | p.start() 75 | for p in processes: 76 | p.join() 77 | shared_model.steps.append(q.get()) 78 | print(f'Training on episode {episode} Step {shared_model.steps[-1]}') 79 | if np.mean(shared_model.steps[-25:]) == 199: 80 | break 81 | t1_stop = perf_counter() 82 | print("Elapsed time during the whole program in seconds:", 83 | t1_stop - t1_start) 84 | shared_model.draw() 85 | shared_model.step = [] 86 | for episode in range(15): 87 | for worker_id in range(6): 88 | p = mp.Process(target=evaluate, args=(shared_model, q)) 89 | for p in processes: 90 | p.join() 91 | while not q.empty(): 92 | shared_model.steps.append(q.get()) 93 | shared_model.steps.sort() 94 | shared_model.draw(eval = True) 95 | 96 | 97 | -------------------------------------------------------------------------------- /A2C/A2C Episodic Sync/workers.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.nn import Linear, ReLU 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | def worker(shared_model, optimizer): 11 | class Actor(torch.nn.Module): 12 | def __init__(self): 13 | super(Actor, self).__init__() 14 | self.fc1 = Linear(4, 128) 15 | self.fc2 = Linear(128, 128) 16 | self.fc3 = Linear(128, 2) 17 | self.fc4 = Linear(128, 1) 18 | self.steps = [] 19 | 20 | def forward(self, x): 21 | x = F.relu(self.fc1(x)) 22 | x = F.relu(self.fc2(x)) 23 | action = F.log_softmax(self.fc3(x), dim=-1) 24 | V = F.relu(self.fc4(x)) 25 | return action, V 26 | 27 | device = 'cpu' 28 | # I do not recommend using GPU for this method. CPU is much faster. 
29 | # Change this to cuda only if you have a poor CPU or on a cloud 30 | env = gym.make('CartPole-v0') 31 | obs = env.reset() 32 | actor = Actor() 33 | actor.to(device) 34 | gamma = 0.99 35 | steps = [] 36 | eps = np.finfo(np.float32).eps.item() 37 | for episode in range(1): 38 | action_log_history = [] 39 | V_history = [] 40 | for step in range(200): 41 | actor.load_state_dict(shared_model.state_dict()) 42 | # -----lines below are line-corresponding to the original algorithm---- 43 | obs = np.reshape(obs, [1, -1]) 44 | input_actor = Variable(torch.from_numpy(obs).float()).to(device) 45 | action_log_probability, V = actor(input_actor) 46 | p = np.exp(action_log_probability[0].detach().cpu()) 47 | action = np.random.choice(2, p=p.numpy()) 48 | action_log_history.append(action_log_probability[0][action]) 49 | V_history.append(V) 50 | obs, reward, done, info = env.step(action) 51 | if done: 52 | if step == 199: 53 | break 54 | actor.zero_grad() 55 | steps.append(step) 56 | print(f'episode {episode}, step {step}', end='\r') 57 | obs = env.reset() 58 | reward_list = np.ones((step + 1,)) 59 | for i in range(len(reward_list) - 2, -1, -1): 60 | reward_list[i] += reward_list[i + 1] * gamma 61 | reward_list -= np.mean(reward_list) 62 | reward_list /= (np.std(reward_list) + eps) 63 | Critic_Loss = [] 64 | Delta = [] 65 | for monte_carlo_return, V in zip(reward_list, V_history): 66 | Critic_Loss.append(F.smooth_l1_loss(V, torch.tensor([[monte_carlo_return]]).to(device))) 67 | Delta.append(monte_carlo_return - V.detach()) 68 | Actor_Loss = [] 69 | entropy = 0 70 | for log_p in action_log_history: 71 | entropy -= log_p * torch.exp(log_p) 72 | for delta, log_prob in zip(Delta, action_log_history): 73 | Actor_Loss.append(-log_prob * delta.detach()) 74 | loss = torch.stack(Critic_Loss).sum() + torch.stack(Actor_Loss).sum() + entropy*0.01 75 | loss.backward() 76 | ensure_shared_grads(actor, shared_model) 77 | optimizer.step() 78 | break 79 | 80 | 81 | def ensure_shared_grads(model, shared_model): 82 | for param, shared_param in zip(model.parameters(), 83 | shared_model.parameters()): 84 | if shared_param.grad is not None: 85 | return 86 | shared_param._grad = param.grad 87 | -------------------------------------------------------------------------------- /A2C/A2C Time Interval Sync/SharedAdam.py: -------------------------------------------------------------------------------- 1 | # Copied from https://github.com/ikostrikov/pytorch-a3c/blob/master/my_optim.py 2 | # A very nice optimization of Adam method to make it receive shared states. 3 | import math 4 | import torch 5 | import torch.optim as optim 6 | 7 | 8 | class SharedAdam(optim.Adam): 9 | """Implements Adam algorithm with shared states. 10 | """ 11 | 12 | def __init__(self, 13 | params, 14 | lr=1e-3, 15 | betas=(0.9, 0.999), 16 | eps=1e-8, 17 | weight_decay=0): 18 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay) 19 | 20 | for group in self.param_groups: 21 | for p in group['params']: 22 | state = self.state[p] 23 | state['step'] = torch.zeros(1) 24 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_() 25 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_() 26 | 27 | def share_memory(self): 28 | for group in self.param_groups: 29 | for p in group['params']: 30 | state = self.state[p] 31 | state['step'].share_memory_() 32 | state['exp_avg'].share_memory_() 33 | state['exp_avg_sq'].share_memory_() 34 | 35 | def step(self, closure=None): 36 | """Performs a single optimization step. 
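All optimizer state (the step counter and both moment estimates) lives in shared memory once share_memory() is called, so every worker process reads and updates the same state.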
37 | Arguments: 38 | closure (callable, optional): A closure that reevaluates the model 39 | and returns the loss. 40 | """ 41 | loss = None 42 | if closure is not None: 43 | loss = closure() 44 | 45 | for group in self.param_groups: 46 | for p in group['params']: 47 | if p.grad is None: 48 | continue 49 | grad = p.grad.data 50 | state = self.state[p] 51 | 52 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 53 | beta1, beta2 = group['betas'] 54 | 55 | state['step'] += 1 56 | 57 | if group['weight_decay'] != 0: 58 | grad = grad.add(group['weight_decay'], p.data) 59 | 60 | # Decay the first and second moment running average coefficient 61 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 62 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 63 | 64 | denom = exp_avg_sq.sqrt().add_(group['eps']) 65 | 66 | bias_correction1 = 1 - beta1 ** state['step'].item() 67 | bias_correction2 = 1 - beta2 ** state['step'].item() 68 | step_size = group['lr'] * math.sqrt( 69 | bias_correction2) / bias_correction1 70 | 71 | p.data.addcdiv_(-step_size, exp_avg, denom) 72 | 73 | return loss 74 | -------------------------------------------------------------------------------- /A2C/A2C Time Interval Sync/evaluate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import gym 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.nn import Linear, ReLU 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | def evaluate(shared_model, q): 11 | class Actor(torch.nn.Module): 12 | def __init__(self): 13 | super(Actor, self).__init__() 14 | self.fc1 = Linear(4, 128) 15 | self.fc2 = Linear(128, 128) 16 | self.fc3 = Linear(128, 2) 17 | self.fc4 = Linear(128, 1) 18 | self.steps = [] 19 | 20 | def forward(self, x): 21 | x = F.relu(self.fc1(x)) 22 | x = F.relu(self.fc2(x)) 23 | action = F.log_softmax(self.fc3(x), dim=-1) 24 | V = F.relu(self.fc4(x)) 25 | return action, V 26 | 27 | device = 'cpu' 28 | # I do not recommend using GPU for this method. CPU is much faster. 
29 | # Change this to cuda only if you have a poor CPU or on a cloud 30 | env = gym.make('CartPole-v0') 31 | obs = env.reset() 32 | actor = Actor() 33 | actor.to(device) 34 | for episode in range(1): 35 | action_log_history = [] 36 | for step in range(200): 37 | actor.load_state_dict(shared_model.state_dict()) 38 | # -----lines below are line-corresponding to the original algorithm---- 39 | obs = np.reshape(obs, [1, -1]) 40 | input_actor = Variable(torch.from_numpy(obs).float()).to(device) 41 | action_log_probability, V = actor(input_actor) 42 | p = np.exp(action_log_probability[0].detach().cpu()) 43 | action = np.random.choice(2, p=p.numpy()) 44 | action_log_history.append(action_log_probability[0][action]) 45 | obs, reward, done, info = env.step(action) 46 | if done: 47 | q.put(step) 48 | return 49 | -------------------------------------------------------------------------------- /A2C/A2C Time Interval Sync/pyTorch_CartPole_A2C_IS.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.nn import Linear, ReLU 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import torch.multiprocessing as mp 9 | from workers import worker 10 | from evaluate import evaluate 11 | from SharedAdam import SharedAdam 12 | from time import perf_counter 13 | 14 | 15 | class Actor(torch.nn.Module): 16 | def __init__(self): 17 | super(Actor, self).__init__() 18 | self.fc1 = Linear(4, 128) 19 | self.fc2 = Linear(128, 128) 20 | self.fc3 = Linear(128, 2) 21 | self.fc4 = Linear(128, 1) 22 | self.steps = [] 23 | 24 | def forward(self, x): 25 | x = F.relu(self.fc1(x)) 26 | x = F.relu(self.fc2(x)) 27 | action = F.log_softmax(self.fc3(x), dim=-1) 28 | V = F.relu(self.fc4(x)) 29 | return action, V 30 | 31 | def draw(self, eval=False): 32 | plt.style.use('dark_background') 33 | plt.figure(figsize=(10, 10)) 34 | if eval: 35 | plt.title('Evaluation of trained A3C with Shared Adam Optimizer on CartPole_V0', fontsize='xx-large') 36 | plt.xlabel('Rewards', fontsize='xx-large') 37 | plt.ylabel('Frequency', fontsize='xx-large') 38 | plt.hist(self.steps, range=(0, 200)) 39 | plt.show() 40 | else: 41 | mid = [] 42 | interval = 3 43 | for i in range(len(self.steps) - interval): 44 | mid.append(np.mean(self.steps[i:i + interval + 1])) 45 | plt.title('Performance of A3C with Shared Adam Optimizer on CartPole_V0', fontsize='xx-large') 46 | plt.xlabel('Episodes', fontsize='xx-large') 47 | plt.ylabel('Rewards', fontsize='xx-large') 48 | x_fit = list(range(len(self.steps) - interval)) 49 | plt.plot(x_fit, self.steps[interval:], '-', c='gray', label='Episode-Wise data') 50 | plt.plot(mid, '-', c='green', linewidth=5, label='Moving Average') 51 | plt.legend(loc="best", prop={'size': 12}) 52 | plt.show() 53 | 54 | 55 | if __name__ == '__main__': 56 | device = 'cpu' 57 | mp.set_start_method('spawn') 58 | # Do not change this unless you have multiple GPU. 
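# T (set below) is the time-interval horizon: each worker pushes its gradients after at most T environment steps, or earlier at episode end.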
59 | # update test 60 | q = mp.Queue() 61 | num_workers = 7 62 | T = 150 63 | processes = [] 64 | shared_model = Actor() 65 | shared_model.to(device) 66 | shared_model.share_memory() 67 | optimizer = SharedAdam(shared_model.parameters(), lr=0.003) 68 | for episode in range(10000): 69 | t1_start = perf_counter() 70 | p = mp.Process(target=evaluate, args=(shared_model, q)) 71 | processes.append(p) 72 | p.start() 73 | for worker_id in range(num_workers): 74 | p = mp.Process(target=worker, args=(shared_model, optimizer, T)) 75 | processes.append(p) 76 | p.start() 77 | for p in processes: 78 | p.join() 79 | shared_model.steps.append(q.get()) 80 | print(f'Training on episode {episode} Step {shared_model.steps[-1]}') 81 | if np.mean(shared_model.steps[-25:]) == 199: 82 | break 83 | t1_stop = perf_counter() 84 | print("Elapsed time during the whole program in seconds:", 85 | t1_stop - t1_start) 86 | shared_model.draw() 87 | shared_model.step = [] 88 | for episode in range(15): 89 | for worker_id in range(6): 90 | p = mp.Process(target=evaluate, args=(shared_model, q)) 91 | for p in processes: 92 | p.join() 93 | while not q.empty(): 94 | shared_model.steps.append(q.get()) 95 | shared_model.steps.sort() 96 | shared_model.draw(eval=True) 97 | -------------------------------------------------------------------------------- /A2C/A2C Time Interval Sync/workers.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.nn import Linear, ReLU 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | def worker(shared_model, optimizer, T): 11 | class Actor(torch.nn.Module): 12 | def __init__(self): 13 | super(Actor, self).__init__() 14 | self.fc1 = Linear(4, 128) 15 | self.fc2 = Linear(128, 128) 16 | self.fc3 = Linear(128, 2) 17 | self.fc4 = Linear(128, 1) 18 | self.steps = [] 19 | 20 | def forward(self, x): 21 | x = F.relu(self.fc1(x)) 22 | x = F.relu(self.fc2(x)) 23 | action = F.log_softmax(self.fc3(x), dim=-1) 24 | V = F.relu(self.fc4(x)) 25 | return action, V 26 | 27 | device = 'cpu' 28 | # I do not recommend using GPU for this method. CPU is much faster. 
29 | # Change this to cuda only if you have a poor CPU or on a cloud 30 | env = gym.make('CartPole-v0') 31 | obs = env.reset() 32 | actor = Actor() 33 | actor.to(device) 34 | gamma = 0.99 35 | steps = [] 36 | eps = np.finfo(np.float32).eps.item() 37 | actor.load_state_dict(shared_model.state_dict()) 38 | t = 0 39 | for episode in range(1): 40 | action_log_history = [] 41 | V_history = [] 42 | for step in range(200): 43 | # -----lines below are line-corresponding to the original algorithm---- 44 | obs = np.reshape(obs, [1, -1]) 45 | input_actor = Variable(torch.from_numpy(obs).float()).to(device) 46 | action_log_probability, V = actor(input_actor) 47 | p = np.exp(action_log_probability[0].detach().cpu()) 48 | action = np.random.choice(2, p=p.numpy()) 49 | action_log_history.append(action_log_probability[0][action]) 50 | V_history.append(V) 51 | obs, reward, done, info = env.step(action) 52 | t += 1 53 | if done or t >= T: 54 | if step == 199: 55 | break 56 | actor.zero_grad() 57 | steps.append(step) 58 | if done: 59 | print(f'episode {episode}, step {step}', end='\r') 60 | obs = env.reset() 61 | reward_list = np.ones((step + 1,)) 62 | for i in range(len(reward_list) - 2, -1, -1): 63 | reward_list[i] += reward_list[i + 1] * gamma 64 | reward_list -= np.mean(reward_list) 65 | reward_list /= (np.std(reward_list) + eps) 66 | Critic_Loss = [] 67 | Delta = [] 68 | for monte_carlo_return, V in zip(reward_list, V_history): 69 | Critic_Loss.append(F.smooth_l1_loss(V, torch.tensor([[monte_carlo_return]]).to(device))) 70 | Delta.append(monte_carlo_return - V.detach()) 71 | Actor_Loss = [] 72 | entropy = 0 73 | for log_p in action_log_history: 74 | entropy -= log_p * torch.exp(log_p) 75 | for delta, log_prob in zip(Delta, action_log_history): 76 | Actor_Loss.append(-log_prob * delta.detach()) 77 | loss = torch.stack(Critic_Loss).sum() + torch.stack(Actor_Loss).sum() + entropy * 0.01 78 | loss.backward() 79 | ensure_shared_grads(actor, shared_model) 80 | optimizer.step() 81 | break 82 | 83 | 84 | def ensure_shared_grads(model, shared_model): 85 | for param, shared_param in zip(model.parameters(), 86 | shared_model.parameters()): 87 | if shared_param.grad is not None: 88 | return 89 | shared_param._grad = param.grad 90 | -------------------------------------------------------------------------------- /A2C/README About Vairations.md: -------------------------------------------------------------------------------- 1 | ## Review of A2C and A3C 2 | We all know the A3C is using a episode count to control the async 3 | process. In this procedure, child process will return the grad 4 | after given episodes or finished early. They will be sync in the next 5 | run. 6 | 7 | A2C however, waits for each child process to finish its segments. 8 | 9 | ## My Variation 10 | In this project, I change the sync or async method from A2C and A3C 11 | into episode wise. That is, the child process will only return 12 | the grad AFTER it completes the episode. 13 | 14 | In variation of A3C, child process will return the grad as soon 15 | as it finished the current episode, return the latest grad, do the backward 16 | then sync with the latest model parameters to go next. It will be put 17 | into an infinite loop. This method will never call join() method to process, 18 | instead, it will monitor a queue that is filled by each child process's game 19 | record. If the last 100 of them are maximum rewards, it will send terminate 20 | message to all child processes. 
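A minimal sketch of that monitoring loop, simplified from `pyTorch_CartPole_A3C_EA.py` (it assumes `q` is the shared `mp.Queue` the workers fill, `processes` holds the worker handles, and `numpy` is imported as `np`):

```python
steps = []
while True:                       # the parent never join()s the workers
    if not q.empty():
        steps.append(q.get())     # one entry per finished worker episode
    # stop once the recent episodes all record the CartPole-v0 cap (step 199)
    if len(steps) >= 100 and np.mean(steps[-100:]) == 199:
        for p in processes:
            p.terminate()
        break
```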
21 | 22 | In variation of A2C, child process will return the grad as soon 23 | as it finished the current episode but all processes will wait every one 24 | finished and sync with the updated model together. In this case we need 25 | a loop and call joint in each loop. If the queue fulfills the convergence requirement, 26 | loop will be ended. 27 | 28 | Additionally, I have set all mode do not learn at all if it reaches the maximum 29 | reward to facilitate converging. [need TESTED] 30 | 31 | ## More Variations: 32 | There isn't too much difference in each child processes. It could 33 | be a exploratory directions. And, how to measure the differences of each 34 | method need a large amount of time. 35 | 36 | ## Additional Warnings: 37 | when initialize multi-processing, fork does not work. If you are in Linux or Mac, change to spawn. 38 | Reasons unclear to me at this point. 39 | -------------------------------------------------------------------------------- /A3C/A3C Episodic Async/SharedAdam.py: -------------------------------------------------------------------------------- 1 | # Copied from https://github.com/ikostrikov/pytorch-a3c/blob/master/my_optim.py 2 | # A very nice optimization of Adam method to make it receive shared states. 3 | import math 4 | import torch 5 | import torch.optim as optim 6 | 7 | 8 | class SharedAdam(optim.Adam): 9 | """Implements Adam algorithm with shared states. 10 | """ 11 | 12 | def __init__(self, 13 | params, 14 | lr=1e-3, 15 | betas=(0.9, 0.999), 16 | eps=1e-8, 17 | weight_decay=0): 18 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay) 19 | 20 | for group in self.param_groups: 21 | for p in group['params']: 22 | state = self.state[p] 23 | state['step'] = torch.zeros(1) 24 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_() 25 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_() 26 | 27 | def share_memory(self): 28 | for group in self.param_groups: 29 | for p in group['params']: 30 | state = self.state[p] 31 | state['step'].share_memory_() 32 | state['exp_avg'].share_memory_() 33 | state['exp_avg_sq'].share_memory_() 34 | 35 | def step(self, closure=None): 36 | """Performs a single optimization step. 37 | Arguments: 38 | closure (callable, optional): A closure that reevaluates the model 39 | and returns the loss. 
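Note that state['step'] is a shared tensor, so the bias-correction terms are computed from the step count accumulated across all worker processes.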
40 | """ 41 | loss = None 42 | if closure is not None: 43 | loss = closure() 44 | 45 | for group in self.param_groups: 46 | for p in group['params']: 47 | if p.grad is None: 48 | continue 49 | grad = p.grad.data 50 | state = self.state[p] 51 | 52 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 53 | beta1, beta2 = group['betas'] 54 | 55 | state['step'] += 1 56 | 57 | if group['weight_decay'] != 0: 58 | grad = grad.add(group['weight_decay'], p.data) 59 | 60 | # Decay the first and second moment running average coefficient 61 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 62 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 63 | 64 | denom = exp_avg_sq.sqrt().add_(group['eps']) 65 | 66 | bias_correction1 = 1 - beta1 ** state['step'].item() 67 | bias_correction2 = 1 - beta2 ** state['step'].item() 68 | step_size = group['lr'] * math.sqrt( 69 | bias_correction2) / bias_correction1 70 | 71 | p.data.addcdiv_(-step_size, exp_avg, denom) 72 | 73 | return loss 74 | -------------------------------------------------------------------------------- /A3C/A3C Episodic Async/evaluate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import gym 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.nn import Linear, ReLU 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | def evaluate(shared_model, q): 11 | class Actor(torch.nn.Module): 12 | def __init__(self): 13 | super(Actor, self).__init__() 14 | self.fc1 = Linear(4, 128) 15 | self.fc2 = Linear(128, 128) 16 | self.fc3 = Linear(128, 2) 17 | self.fc4 = Linear(128, 1) 18 | self.steps = [] 19 | 20 | def forward(self, x): 21 | x = F.relu(self.fc1(x)) 22 | x = F.relu(self.fc2(x)) 23 | action = F.log_softmax(self.fc3(x), dim=-1) 24 | V = F.relu(self.fc4(x)) 25 | return action, V 26 | 27 | device = 'cpu' 28 | # I do not recommend using GPU for this method. CPU is much faster. 
29 | # Change this to cuda only if you have a poor CPU or on a cloud 30 | env = gym.make('CartPole-v0') 31 | obs = env.reset() 32 | actor = Actor() 33 | actor.to(device) 34 | for episode in range(1): 35 | action_log_history = [] 36 | for step in range(200): 37 | actor.load_state_dict(shared_model.state_dict()) 38 | # -----lines below are line-corresponding to the original algorithm---- 39 | obs = np.reshape(obs, [1, -1]) 40 | input_actor = Variable(torch.from_numpy(obs).float()).to(device) 41 | action_log_probability, V = actor(input_actor) 42 | p = np.exp(action_log_probability[0].detach().cpu()) 43 | action = np.random.choice(2, p=p.numpy()) 44 | action_log_history.append(action_log_probability[0][action]) 45 | obs, reward, done, info = env.step(action) 46 | if done: 47 | q.put(step) 48 | return 49 | -------------------------------------------------------------------------------- /A3C/A3C Episodic Async/pyTorch_CartPole_A3C_EA.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from torch.nn import Linear 5 | import torch.nn.functional as F 6 | import torch.multiprocessing as mp 7 | from workers_PlayGround import worker 8 | from evaluate import evaluate 9 | from SharedAdam import SharedAdam 10 | 11 | 12 | class Actor(torch.nn.Module): 13 | def __init__(self): 14 | super(Actor,self).__init__() 15 | self.fc1 = Linear(4, 128) 16 | self.fc2 = Linear(128, 128) 17 | self.fc3 = Linear(128, 2) 18 | self.fc4 = Linear(128, 1) 19 | self.steps = [] 20 | 21 | def forward(self, x): 22 | x = F.relu(self.fc1(x)) 23 | x = F.relu(self.fc2(x)) 24 | action = F.log_softmax(self.fc3(x), dim=-1) 25 | V = F.relu(self.fc4(x)) 26 | return action, V 27 | 28 | def draw(self, eval = False): 29 | plt.style.use('dark_background') 30 | plt.figure(figsize=(10, 10)) 31 | if eval: 32 | plt.title('Evaluation of trained A3C with Shared Adam Optimizer on CartPole_V0', fontsize='xx-large') 33 | plt.xlabel('Rewards', fontsize='xx-large') 34 | plt.ylabel('Frequency', fontsize='xx-large') 35 | plt.hist(self.steps, range=(0, 200)) 36 | plt.show() 37 | else: 38 | mid = [] 39 | interval = 3 40 | for i in range(len(self.steps) - interval): 41 | mid.append(np.mean(self.steps[i:i + interval + 1])) 42 | plt.title('Performance of True Episode-Wise A3C on CartPole_V0', fontsize='xx-large') 43 | plt.xlabel('Episodes', fontsize='xx-large') 44 | plt.ylabel('Rewards', fontsize='xx-large') 45 | x_fit = list(range(len(self.steps) - interval)) 46 | plt.plot(x_fit, self.steps[interval:], '-', c='gray', label='Episode-Wise data') 47 | plt.plot(mid, '-', c='green', linewidth=5, label='Moving Average') 48 | plt.legend(loc="best", prop={'size': 12}) 49 | plt.show() 50 | 51 | 52 | if __name__ == '__main__': 53 | device = 'cpu' 54 | mp.set_start_method('spawn') 55 | # Do not change this unless you have multiple GPU. 
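# Unlike the A2C scripts, this parent never join()s the workers: it keeps polling the result queue below and terminate()s all processes once recent episodes hit the reward cap.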
56 | # update test 57 | q = mp.Queue() 58 | num_workers = 7 59 | processes = [] 60 | shared_model = Actor() 61 | shared_model.to(device) 62 | shared_model.share_memory() 63 | optimizer = SharedAdam(shared_model.parameters(), lr=0.001) 64 | p = mp.Process(target=evaluate, args=(shared_model, q)) 65 | processes.append(p) 66 | p.start() 67 | for worker_id in range(num_workers): 68 | p = mp.Process(target = worker, args = (shared_model, optimizer, q)) 69 | processes.append(p) 70 | p.start() 71 | # for p in processes: 72 | # p.join() 73 | episode = 0 74 | 75 | while True: 76 | if not q.empty(): 77 | shared_model.steps.append(q.get()) 78 | print(f'Training on episode {episode} Step {shared_model.steps[-1]}') 79 | episode += 1 80 | if len(shared_model.steps) > 25: 81 | if np.mean(shared_model.steps[-100:]) == 199: 82 | for p in processes: 83 | p.terminate() 84 | while not q.empty(): 85 | shared_model.steps.append(q.get()) 86 | print(f'Training on episode {episode} Step {shared_model.steps[-1]}') 87 | episode += 1 88 | break 89 | shared_model.draw() 90 | # ----evaluation---- 91 | shared_model.step = [] 92 | for episode in range(15): 93 | for worker_id in range(6): 94 | p = mp.Process(target=evaluate, args=(shared_model, q)) 95 | for p in processes: 96 | p.join() 97 | while not q.empty(): 98 | shared_model.steps.append(q.get()) 99 | shared_model.steps.sort() 100 | shared_model.draw(eval = True) 101 | 102 | 103 | -------------------------------------------------------------------------------- /A3C/A3C Episodic Async/workers_PlayGround.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.nn import Linear, ReLU 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | def worker(shared_model, optimizer,q): 11 | class Actor(torch.nn.Module): 12 | def __init__(self): 13 | super(Actor, self).__init__() 14 | self.fc1 = Linear(4, 128) 15 | self.fc2 = Linear(128, 128) 16 | self.fc3 = Linear(128, 2) 17 | self.fc4 = Linear(128, 1) 18 | self.steps = [] 19 | 20 | def forward(self, x): 21 | x = F.relu(self.fc1(x)) 22 | x = F.relu(self.fc2(x)) 23 | action = F.log_softmax(self.fc3(x), dim=-1) 24 | V = F.relu(self.fc4(x)) 25 | return action, V 26 | 27 | device = 'cpu' 28 | # I do not recommend using GPU for this method. CPU is much faster. 
29 | # Change this to cuda only if you have a poor CPU or on a cloud 30 | env = gym.make('CartPole-v0') 31 | obs = env.reset() 32 | actor = Actor() 33 | actor.to(device) 34 | gamma = 0.99 35 | eps = np.finfo(np.float32).eps.item() 36 | while True: 37 | action_log_history = [] 38 | V_history = [] 39 | actor.load_state_dict(shared_model.state_dict()) 40 | for step in range(200): 41 | # -----lines below are line-corresponding to the original algorithm---- 42 | obs = np.reshape(obs, [1, -1]) 43 | input_actor = Variable(torch.from_numpy(obs).float()).to(device) 44 | action_log_probability, V = actor(input_actor) 45 | p = np.exp(action_log_probability[0].detach().cpu()) 46 | action = np.random.choice(2, p=p.numpy()) 47 | action_log_history.append(action_log_probability[0][action]) 48 | V_history.append(V) 49 | obs, reward, done, info = env.step(action) 50 | if done: 51 | q.put(step) 52 | actor.zero_grad() 53 | obs = env.reset() 54 | if step == 199: 55 | break 56 | reward_list = np.ones((step + 1,)) 57 | for i in range(len(reward_list) - 2, -1, -1): 58 | reward_list[i] += reward_list[i + 1] * gamma 59 | reward_list -= np.mean(reward_list) 60 | reward_list /= (np.std(reward_list) + eps) 61 | Critic_Loss = [] 62 | Delta = [] 63 | for monte_carlo_return, V in zip(reward_list, V_history): 64 | Critic_Loss.append(F.smooth_l1_loss(V, torch.tensor([[monte_carlo_return]]).to(device))) 65 | Delta.append(monte_carlo_return - V.detach()) 66 | Actor_Loss = [] 67 | entropy = 0 68 | for log_p in action_log_history: 69 | entropy -= log_p * torch.exp(log_p) 70 | for delta, log_prob in zip(Delta, action_log_history): 71 | Actor_Loss.append(-log_prob * delta.detach()) 72 | loss = torch.stack(Critic_Loss).sum() + torch.stack(Actor_Loss).sum() + entropy*0.01 73 | loss.backward() 74 | ensure_shared_grads(actor, shared_model) 75 | optimizer.step() 76 | break 77 | 78 | 79 | def ensure_shared_grads(model, shared_model): 80 | for param, shared_param in zip(model.parameters(), 81 | shared_model.parameters()): 82 | if shared_param.grad is not None: 83 | return 84 | shared_param._grad = param.grad 85 | -------------------------------------------------------------------------------- /A3C/A3C Time Interval Async/SharedAdam.py: -------------------------------------------------------------------------------- 1 | # Copied from https://github.com/ikostrikov/pytorch-a3c/blob/master/my_optim.py 2 | # A very nice optimization of Adam method to make it receive shared states. 3 | import math 4 | import torch 5 | import torch.optim as optim 6 | 7 | 8 | class SharedAdam(optim.Adam): 9 | """Implements Adam algorithm with shared states. 10 | """ 11 | 12 | def __init__(self, 13 | params, 14 | lr=1e-3, 15 | betas=(0.9, 0.999), 16 | eps=1e-8, 17 | weight_decay=0): 18 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay) 19 | 20 | for group in self.param_groups: 21 | for p in group['params']: 22 | state = self.state[p] 23 | state['step'] = torch.zeros(1) 24 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_() 25 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_() 26 | 27 | def share_memory(self): 28 | for group in self.param_groups: 29 | for p in group['params']: 30 | state = self.state[p] 31 | state['step'].share_memory_() 32 | state['exp_avg'].share_memory_() 33 | state['exp_avg_sq'].share_memory_() 34 | 35 | def step(self, closure=None): 36 | """Performs a single optimization step. 37 | Arguments: 38 | closure (callable, optional): A closure that reevaluates the model 39 | and returns the loss. 
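Note: the in-place add_/addcmul_/addcdiv_ calls below use the older positional (value, tensor) signature, which newer PyTorch releases deprecate in favor of the alpha=/value= keyword form.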
40 | """ 41 | loss = None 42 | if closure is not None: 43 | loss = closure() 44 | 45 | for group in self.param_groups: 46 | for p in group['params']: 47 | if p.grad is None: 48 | continue 49 | grad = p.grad.data 50 | state = self.state[p] 51 | 52 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 53 | beta1, beta2 = group['betas'] 54 | 55 | state['step'] += 1 56 | 57 | if group['weight_decay'] != 0: 58 | grad = grad.add(group['weight_decay'], p.data) 59 | 60 | # Decay the first and second moment running average coefficient 61 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 62 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 63 | 64 | denom = exp_avg_sq.sqrt().add_(group['eps']) 65 | 66 | bias_correction1 = 1 - beta1 ** state['step'].item() 67 | bias_correction2 = 1 - beta2 ** state['step'].item() 68 | step_size = group['lr'] * math.sqrt( 69 | bias_correction2) / bias_correction1 70 | 71 | p.data.addcdiv_(-step_size, exp_avg, denom) 72 | 73 | return loss 74 | -------------------------------------------------------------------------------- /A3C/A3C Time Interval Async/evaluate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import gym 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.nn import Linear, ReLU 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | def evaluate(shared_model, q): 11 | class Actor(torch.nn.Module): 12 | def __init__(self): 13 | super(Actor, self).__init__() 14 | self.fc1 = Linear(4, 128) 15 | self.fc2 = Linear(128, 128) 16 | self.fc3 = Linear(128, 2) 17 | self.fc4 = Linear(128, 1) 18 | self.steps = [] 19 | 20 | def forward(self, x): 21 | x = F.relu(self.fc1(x)) 22 | x = F.relu(self.fc2(x)) 23 | action = F.log_softmax(self.fc3(x), dim=-1) 24 | V = F.relu(self.fc4(x)) 25 | return action, V 26 | 27 | device = 'cpu' 28 | # I do not recommend using GPU for this method. CPU is much faster. 
29 | # Change this to cuda only if you have a poor CPU or on a cloud 30 | env = gym.make('CartPole-v0') 31 | obs = env.reset() 32 | actor = Actor() 33 | actor.to(device) 34 | for episode in range(1): 35 | action_log_history = [] 36 | for step in range(200): 37 | actor.load_state_dict(shared_model.state_dict()) 38 | # -----lines below are line-corresponding to the original algorithm---- 39 | obs = np.reshape(obs, [1, -1]) 40 | input_actor = Variable(torch.from_numpy(obs).float()).to(device) 41 | action_log_probability, V = actor(input_actor) 42 | p = np.exp(action_log_probability[0].detach().cpu()) 43 | action = np.random.choice(2, p=p.numpy()) 44 | action_log_history.append(action_log_probability[0][action]) 45 | obs, reward, done, info = env.step(action) 46 | if done: 47 | q.put(step) 48 | return 49 | -------------------------------------------------------------------------------- /A3C/A3C Time Interval Async/pyTorch_CartPole_A3C_IA.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from torch.nn import Linear 5 | import torch.nn.functional as F 6 | import torch.multiprocessing as mp 7 | from workers import worker 8 | from evaluate import evaluate 9 | from SharedAdam import SharedAdam 10 | 11 | 12 | class Actor(torch.nn.Module): 13 | def __init__(self): 14 | super(Actor, self).__init__() 15 | self.fc1 = Linear(4, 128) 16 | self.fc2 = Linear(128, 128) 17 | self.fc3 = Linear(128, 2) 18 | self.fc4 = Linear(128, 1) 19 | self.steps = [] 20 | 21 | def forward(self, x): 22 | x = F.relu(self.fc1(x)) 23 | x = F.relu(self.fc2(x)) 24 | action = F.log_softmax(self.fc3(x), dim=-1) 25 | V = F.relu(self.fc4(x)) 26 | return action, V 27 | 28 | def draw(self, eval=False): 29 | plt.style.use('dark_background') 30 | plt.figure(figsize=(10, 10)) 31 | if eval: 32 | plt.title('Evaluation of trained A3C with Shared Adam Optimizer on CartPole_V0', fontsize='xx-large') 33 | plt.xlabel('Rewards', fontsize='xx-large') 34 | plt.ylabel('Frequency', fontsize='xx-large') 35 | plt.hist(self.steps, range=(0, 200)) 36 | plt.show() 37 | else: 38 | mid = [] 39 | interval = 3 40 | for i in range(len(self.steps) - interval): 41 | mid.append(np.mean(self.steps[i:i + interval + 1])) 42 | plt.title('Performance of True Episode-Wise A3C on CartPole_V0', fontsize='xx-large') 43 | plt.xlabel('Episodes', fontsize='xx-large') 44 | plt.ylabel('Rewards', fontsize='xx-large') 45 | x_fit = list(range(len(self.steps) - interval)) 46 | plt.plot(x_fit, self.steps[interval:], '-', c='gray', label='Episode-Wise data') 47 | plt.plot(mid, '-', c='green', linewidth=5, label='Moving Average') 48 | plt.legend(loc="best", prop={'size': 12}) 49 | plt.show() 50 | 51 | 52 | if __name__ == '__main__': 53 | device = 'cpu' 54 | mp.set_start_method('spawn') 55 | # Do not change this unless you have multiple GPU. 
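# This async time-interval variant runs num_workers = 15 workers with horizon T = 300 (both set below): each worker pushes gradients at episode end or after at most T environment steps.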
56 | # update test 57 | q = mp.Queue() 58 | num_workers = 15 59 | processes = [] 60 | shared_model = Actor() 61 | shared_model.to(device) 62 | shared_model.share_memory() 63 | optimizer = SharedAdam(shared_model.parameters(), lr=0.001) 64 | p = mp.Process(target=evaluate, args=(shared_model, q)) 65 | processes.append(p) 66 | p.start() 67 | T = 300 68 | for worker_id in range(num_workers): 69 | p = mp.Process(target=worker, args=(shared_model, optimizer, q, T)) 70 | processes.append(p) 71 | p.start() 72 | # for p in processes: 73 | # p.join() 74 | episode = 0 75 | 76 | while True: 77 | if not q.empty(): 78 | shared_model.steps.append(q.get()) 79 | print(f'Training on episode {episode} Step {shared_model.steps[-1]}') 80 | episode += 1 81 | if len(shared_model.steps) > 25: 82 | if np.mean(shared_model.steps[-50:]) == 199: 83 | for p in processes: 84 | p.terminate() 85 | while not q.empty(): 86 | shared_model.steps.append(q.get()) 87 | print(f'Training on episode {episode} Step {shared_model.steps[-1]}') 88 | episode += 1 89 | break 90 | shared_model.draw() 91 | # ----evaluation---- 92 | shared_model.step = [] 93 | for episode in range(15): 94 | for worker_id in range(6): 95 | p = mp.Process(target=evaluate, args=(shared_model,q)) 96 | for p in processes: 97 | p.join() 98 | while not q.empty(): 99 | shared_model.steps.append(q.get()) 100 | shared_model.steps.sort() 101 | shared_model.draw(eval=True) 102 | -------------------------------------------------------------------------------- /A3C/A3C Time Interval Async/workers.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.nn import Linear, ReLU 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | def worker(shared_model, optimizer, q, T): 11 | class Actor(torch.nn.Module): 12 | def __init__(self): 13 | super(Actor, self).__init__() 14 | self.fc1 = Linear(4, 128) 15 | self.fc2 = Linear(128, 128) 16 | self.fc3 = Linear(128, 2) 17 | self.fc4 = Linear(128, 1) 18 | self.steps = [] 19 | 20 | def forward(self, x): 21 | x = F.relu(self.fc1(x)) 22 | x = F.relu(self.fc2(x)) 23 | action = F.log_softmax(self.fc3(x), dim=-1) 24 | V = F.relu(self.fc4(x)) 25 | return action, V 26 | 27 | device = 'cpu' 28 | # I do not recommend using GPU for this method. CPU is much faster. 
29 | # Change this to cuda only if you have a poor CPU or on a cloud 30 | env = gym.make('CartPole-v0') 31 | obs = env.reset() 32 | actor = Actor() 33 | actor.to(device) 34 | gamma = 0.99 35 | eps = np.finfo(np.float32).eps.item() 36 | t = 0 37 | while True: 38 | action_log_history = [] 39 | V_history = [] 40 | for step in range(200): 41 | # -----lines below are line-corresponding to the original algorithm---- 42 | actor.load_state_dict(shared_model.state_dict()) 43 | obs = np.reshape(obs, [1, -1]) 44 | input_actor = Variable(torch.from_numpy(obs).float()).to(device) 45 | action_log_probability, V = actor(input_actor) 46 | p = np.exp(action_log_probability[0].detach().cpu()) 47 | action = np.random.choice(2, p=p.numpy()) 48 | action_log_history.append(action_log_probability[0][action]) 49 | V_history.append(V) 50 | obs, reward, done, info = env.step(action) 51 | t += 1 52 | if done or t >= T: 53 | if done: 54 | q.put(step) 55 | actor.zero_grad() 56 | if done: 57 | obs = env.reset() 58 | reward_list = np.ones((step + 1,)) 59 | for i in range(len(reward_list) - 2, -1, -1): 60 | reward_list[i] += reward_list[i + 1] * gamma 61 | reward_list -= np.mean(reward_list) 62 | reward_list /= (np.std(reward_list) + eps) 63 | Critic_Loss = [] 64 | Delta = [] 65 | for monte_carlo_return, V in zip(reward_list, V_history): 66 | Critic_Loss.append(F.smooth_l1_loss(V, torch.tensor([[monte_carlo_return]]).to(device))) 67 | Delta.append(monte_carlo_return - V.detach()) 68 | Actor_Loss = [] 69 | entropy = 0 70 | for log_p in action_log_history: 71 | entropy -= log_p * torch.exp(log_p) 72 | Delta = Delta[len(Delta) - len(action_log_history):] 73 | for delta, log_prob in zip(Delta, action_log_history): 74 | Actor_Loss.append(-log_prob * delta.detach()) 75 | loss = torch.stack(Critic_Loss).sum() + torch.stack(Actor_Loss).sum() + entropy * 0.01 76 | loss.backward() 77 | ensure_shared_grads(actor, shared_model) 78 | optimizer.step() 79 | action_log_history = [] 80 | V_history = [] 81 | actor.load_state_dict(shared_model.state_dict()) 82 | if done: 83 | t = 0 84 | break 85 | else: 86 | t = 0 87 | 88 | def ensure_shared_grads(model, shared_model): 89 | for param, shared_param in zip(model.parameters(), 90 | shared_model.parameters()): 91 | if shared_param.grad is not None: 92 | return 93 | shared_param._grad = param.grad 94 | -------------------------------------------------------------------------------- /A3C/README About Vairations.md: -------------------------------------------------------------------------------- 1 | ## Review of A2C and A3C 2 | We all know the A3C is using a episode count to control the async 3 | process. In this procedure, child process will return the grad 4 | after given episodes or finished early. They will be sync in the next 5 | run. 6 | 7 | A2C however, waits for each child process to finish its segments. 8 | 9 | ## My Variation 10 | In this project, I change the sync or async method from A2C and A3C 11 | into episode wise. That is, the child process will only return 12 | the grad AFTER it completes the episode. 13 | 14 | In variation of A3C, child process will return the grad as soon 15 | as it finished the current episode, return the latest grad, do the backward 16 | then sync with the latest model parameters to go next. It will be put 17 | into an infinite loop. This method will never call join() method to process, 18 | instead, it will monitor a queue that is filled by each child process's game 19 | record. 
If the last 100 of them are maximum rewards, it will send terminate 20 | message to all child processes. 21 | 22 | In variation of A2C, child process will return the grad as soon 23 | as it finished the current episode but all processes will wait every one 24 | finished and sync with the updated model together. In this case we need 25 | a loop and call joint in each loop. If the queue fulfills the convergence requirement, 26 | loop will be ended. 27 | 28 | Additionally, I have set all mode do not learn at all if it reaches the maximum 29 | reward to facilitate converging. [need TESTED] 30 | 31 | ## More Variations: 32 | There isn't too much difference in each child processes. It could 33 | be a exploratory directions. And, how to measure the differences of each 34 | method need a large amount of time. 35 | 36 | ## Additional Warnings: 37 | when initialize multi-processing, fork does not work. If you are in Linux or Mac, change to spawn. 38 | Reasons unclear to me at this point. 39 | -------------------------------------------------------------------------------- /Ape-X/PyTorch_Ape-X.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LyWangPX/Reinforcement_Learning_Coding_Examples/2f40f67f5709c9dc4ea3d9dd15b441b627b595a6/Ape-X/PyTorch_Ape-X.py -------------------------------------------------------------------------------- /D4PG/PyTorch_D4PG.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | -------------------------------------------------------------------------------- /DDPG/PyTorch_DDPG.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import torch.nn.functional as F 4 | from collections import deque 5 | import gym 6 | import matplotlib.pyplot as plt 7 | from torch.nn import Linear, ReLU 8 | 9 | """ 10 | This is a vanilla implementation of DDPG. 
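A deterministic actor maps states to actions and a critic estimates Q(s, a); exploration adds Ornstein-Uhlenbeck noise, target networks are updated softly with rate tau, and the replay buffer is sampled uniformly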
(without PER) 11 | """ 12 | 13 | 14 | class Actor(torch.nn.Module): 15 | def __init__(self, maxlen=100000): 16 | super(Actor, self).__init__() 17 | self.fc1 = Linear(3, 256) 18 | self.fc2 = Linear(256, 256) 19 | self.fc3 = Linear(256, 256) 20 | self.fc4 = Linear(256, 1) 21 | self.s_buffer = deque(maxlen=maxlen) 22 | self.a_buffer = deque(maxlen=maxlen) 23 | self.r_buffer = deque(maxlen=maxlen) 24 | self.next_s_buffer = deque(maxlen=maxlen) 25 | 26 | def forward(self, x): 27 | x = F.relu(self.fc1(x)) 28 | x = F.relu(self.fc2(x)) 29 | x = F.relu(self.fc3(x)) 30 | action = 2*torch.tanh(self.fc4(x)) 31 | return action 32 | 33 | def bufferin(self, s, a, r, next_s): 34 | self.s_buffer.append(s) 35 | self.a_buffer.append(a) 36 | self.r_buffer.append(r) 37 | self.next_s_buffer.append(next_s) 38 | 39 | def sample(self, batch_size=64): 40 | indices = np.random.choice(range(len(self.a_buffer)), size=min(len(self.a_buffer), batch_size), replace=False) 41 | s_buffer = [self.s_buffer[i] for i in indices] 42 | a_buffer = [self.a_buffer[i] for i in indices] 43 | r_buffer = [self.r_buffer[i] for i in indices] 44 | next_s_buffer = [self.next_s_buffer[i] for i in indices] 45 | return a_buffer, s_buffer, r_buffer, next_s_buffer 46 | 47 | 48 | class Critic(torch.nn.Module): 49 | def __init__(self): 50 | super(Critic, self).__init__() 51 | self.fc1 = Linear(4, 256) 52 | self.fc2 = Linear(256, 512) 53 | self.fc3 = Linear(512, 1) 54 | self.action = Linear(1, 256) 55 | 56 | def forward(self, x, a): 57 | x = torch.cat([x,a],1) 58 | x = F.relu(self.fc1(x)) 59 | x = F.relu(self.fc2(x)) 60 | Q = self.fc3(x) 61 | return Q 62 | 63 | 64 | def evaluate(target_policy, device, final=False): 65 | target_policy.eval() 66 | env = NormalizedEnv(gym.make('Pendulum-v0')) 67 | s = env.reset() 68 | if final: 69 | result = [] 70 | for episode in range(100): 71 | rewards = 0 72 | for step in range(200): 73 | action = target_policy.forward(torch.FloatTensor(s)) 74 | s, reward, done, _ = env.step([action.detach()]) 75 | rewards += reward 76 | if done: 77 | result.append(rewards) 78 | s = env.reset() 79 | return result 80 | else: 81 | result = [] 82 | for episode in range(1): 83 | rewards = 0 84 | for step in range(200): 85 | action = target_policy.forward(torch.FloatTensor(s)) 86 | s, reward, done, _ = env.step([float(action)]) 87 | rewards += reward 88 | if done: 89 | result.append(rewards) 90 | s = env.reset() 91 | return result 92 | 93 | 94 | def draw(steps, name): 95 | plt.style.use('dark_background') 96 | plt.figure(figsize=(10, 10)) 97 | mid = [] 98 | interval = 3 99 | for i in range(len(steps) - interval): 100 | mid.append(np.mean(steps[i:i + interval + 1])) 101 | plt.title(f'{name} DDPG on Pendulum_V0 ', fontsize='xx-large') 102 | plt.xlabel('Episodes', fontsize='xx-large') 103 | plt.ylabel(f'{name}', fontsize='xx-large') 104 | x_fit = list(range(len(steps) - interval)) 105 | plt.plot(x_fit, steps[interval:], '-', c='gray', label='Episode-Wise data') 106 | plt.plot(mid, '-', c='green', linewidth=5, label='Moving Average') 107 | plt.legend(loc="best", prop={'size': 12}) 108 | plt.show() 109 | 110 | 111 | # https://github.com/openai/gym/blob/master/gym/core.py 112 | class NormalizedEnv(gym.ActionWrapper): 113 | """ Wrap action """ 114 | 115 | def action(self, action): 116 | act_k = (self.action_space.high - self.action_space.low) / 2. 117 | act_b = (self.action_space.high + self.action_space.low) / 2. 118 | return act_k * action + act_b 119 | 120 | def reverse_action(self, action): 121 | act_k_inv = 2. 
/ (self.action_space.high - self.action_space.low) 122 | act_b = (self.action_space.high + self.action_space.low) / 2. 123 | return act_k_inv * (action - act_b) 124 | 125 | 126 | class Ornstein_Uhlenbeck_Process: 127 | def __init__(self, dt=0.3): 128 | self.theta = 0.15 129 | self.sigma = 0.2 130 | self.dt = dt 131 | self.x = 0 132 | 133 | def step(self): 134 | dW = self.dt ** 2 * np.random.normal() 135 | dx = -self.theta * self.x * self.dt + self.sigma * dW 136 | self.x += dx 137 | return self.x 138 | 139 | 140 | def main(): 141 | # create two identical model 142 | # ---hyper parameter--- 143 | gamma = 0.99 144 | tau = 0.01 145 | # ---hyper parameter--- 146 | steps = [] 147 | device = 'cpu' 148 | actor = Actor().to(device) 149 | actor_optimizer = torch.optim.Adam(actor.parameters(), lr=1e-4) 150 | target_actor = Actor().to(device) 151 | critic = Critic().to(device) 152 | critic_optimizer = torch.optim.Adam(critic.parameters(), lr=1e-3) 153 | target_critic = Critic().to(device) 154 | for target_param, param in zip(target_actor.parameters(), actor.parameters()): 155 | target_param.data.copy_(param.data) 156 | for target_param, param in zip(target_critic.parameters(), critic.parameters()): 157 | target_param.data.copy_(param.data) 158 | 159 | env = gym.make('Pendulum-v0') 160 | s = env.reset() 161 | A_loss = [] 162 | C_loss = [] 163 | actor.train() 164 | critic.train() 165 | for episode in range(100): 166 | rewards = 0 167 | random_process = Ornstein_Uhlenbeck_Process(dt=0.1) 168 | for step in range(250): 169 | 170 | # LINE 1 Select Action 171 | action = (actor.forward(torch.FloatTensor(s)) + random_process.step()) 172 | 173 | # LINE 2 Execute and Observe 174 | next_s, reward, done, _ = env.step(action.detach()) 175 | # LINE 3 Store 176 | actor.bufferin(s, action, reward, next_s) 177 | 178 | s = next_s 179 | rewards += reward 180 | if len(actor.a_buffer) > 180: 181 | # LINE 4 SAMPLE a minibatch 182 | a_buffer, s_buffer, r_buffer, next_s_buffer = actor.sample() 183 | a_buffer = torch.FloatTensor(a_buffer).view(-1,1) 184 | s_buffer = torch.FloatTensor(s_buffer).view(-1,3) 185 | r_buffer = torch.FloatTensor(r_buffer).view(-1,1) 186 | next_s_buffer = torch.FloatTensor(next_s_buffer).view(-1,3) 187 | 188 | # LINE 5 Set y = r + gamma next Q from target critic 189 | next_a = target_actor(next_s_buffer.to(device)) 190 | next_Q = target_critic(next_s_buffer.to(device), next_a.to(device)) 191 | y = r_buffer.to(device) + gamma * next_Q 192 | 193 | 194 | # LINE 7 Update the actor policy using sampled policy gradient 195 | true_a = actor(s_buffer.to(device)) 196 | actor_loss_total = critic.forward(s_buffer.to(device), true_a.to(device)) 197 | actor_loss = -actor_loss_total.mean() 198 | actor.zero_grad() 199 | actor_loss.backward() 200 | actor_optimizer.step() 201 | 202 | # LINE 6 Update critic by minimizing the mse. 
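# i.e. minimize mean((Q(s, a) - y)^2), where y = r + gamma * Q_target(s', mu_target(s')) was formed at LINE 5 above.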
203 | Q = critic(s_buffer.to(device), 204 | a_buffer.float().to(device)) 205 | critic_loss = torch.nn.functional.mse_loss(Q, y.detach()) 206 | critic_optimizer.zero_grad() 207 | critic_loss.backward() 208 | critic_optimizer.step() 209 | 210 | A_loss.append(actor_loss.item()) 211 | C_loss.append(critic_loss.item()) 212 | 213 | # LINE 8 Update the target network 214 | for target_param, param in zip(target_actor.parameters(), actor.parameters()): 215 | target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data) 216 | 217 | for target_param, param in zip(target_critic.parameters(), critic.parameters()): 218 | target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data) 219 | if done: 220 | s = env.reset() 221 | steps.append(rewards) 222 | print(f'episode {episode}, total rewards {steps[-1]}') 223 | break 224 | draw(steps, 'rewards') 225 | draw(A_loss, 'A_loss') 226 | draw(C_loss, 'C_loss') 227 | hist = evaluate(target_actor, device, final=True) 228 | draw(hist, 'eval') 229 | 230 | 231 | if __name__ == '__main__': 232 | main() 233 | -------------------------------------------------------------------------------- /Deuling Double DQN with PER/PyTorch_Deuling_DDQN_with_PER.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import gym 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import torch.nn.functional as F 6 | from torch.nn import Linear, ReLU 7 | from collections import deque 8 | 9 | """ 10 | This is a vanilla Deuling Double DQN with PER (softmax absolute delta) 11 | """ 12 | 13 | 14 | class Q_network(torch.nn.Module): 15 | def __init__(self, n_action=2): 16 | super(Q_network, self).__init__() 17 | self.input = Linear(4, 256) 18 | self.input_to_V = Linear(256, 64) 19 | self.input_to_A = Linear(256, 64) 20 | self.input_to_V2 = Linear(64, 1) 21 | self.input_to_A2 = Linear(64, n_action) 22 | self.a_buffer = deque(maxlen=8192) 23 | self.r_buffer = deque(maxlen=8192) 24 | self.s_buffer = deque(maxlen=8192) 25 | self.done_buffer = deque(maxlen=8192) 26 | self.next_s_buffer = deque(maxlen=8192) 27 | self.priority_buffer = deque(maxlen=8192) 28 | 29 | def forward(self, x): 30 | x = F.relu(self.input(x)) 31 | V_stream = F.relu(self.input_to_V(x)) 32 | V_stream = self.input_to_V2(V_stream) 33 | A_stream = F.relu(self.input_to_A(x)) 34 | A_stream = self.input_to_A2(A_stream) 35 | A_mean = torch.mean(A_stream, dim=1, keepdim=True) 36 | result = V_stream + A_stream - A_mean 37 | return result 38 | 39 | def bufferin(self, tuple_info): 40 | # expect tuple_info with content S, A, R, S' 41 | # ALL in TENSOR FORMAT 42 | state, action, reward, next_S, done, priority = tuple_info 43 | self.a_buffer.append(action) 44 | self.s_buffer.append(state) 45 | self.r_buffer.append(reward) 46 | self.next_s_buffer.append(next_S) 47 | self.done_buffer.append(done) 48 | self.priority_buffer.append(priority) 49 | 50 | def sample(self, size=64): 51 | with torch.no_grad(): 52 | prob = np.array(F.softmax(torch.stack(list(self.priority_buffer))).view(-1)) 53 | prob /= prob.sum() 54 | sample_indices = np.random.choice(range(len(self.a_buffer)), size=64, p=prob, replace=False) 55 | a_sample = [self.a_buffer[i] for i in sample_indices] 56 | r_sample = [self.r_buffer[i] for i in sample_indices] 57 | s_sample = [self.s_buffer[i] for i in sample_indices] 58 | next_s_sample = [self.next_s_buffer[i] for i in sample_indices] 59 | done_sample = [self.done_buffer[i] for i in sample_indices] 60 | 61 | a_sample = 
torch.Tensor(a_sample).view(-1, 1) 62 | r_sample = torch.Tensor(r_sample).view(-1, 1) 63 | s_sample = torch.stack(s_sample).view(-1, 4) 64 | next_s_sample = torch.stack(next_s_sample).view(-1, 4) 65 | done_sample = torch.Tensor(done_sample).view(-1, 1) 66 | 67 | return s_sample, a_sample, r_sample, next_s_sample, done_sample, sample_indices 68 | 69 | 70 | def main(): 71 | gamma = 0.99 72 | beta = 0.25 73 | env = gym.make('CartPole-v0') 74 | state = env.reset() 75 | state = torch.FloatTensor(state).view(-1, 4) 76 | Q_target = Q_network() 77 | optimizer = torch.optim.Adam(Q_target.parameters(), lr=0.0003) 78 | Q_copy = Q_network() 79 | for param_target, param_copy in zip(Q_target.parameters(), Q_copy.parameters()): 80 | param_copy.data.copy_(param_target.data) 81 | steps = [] 82 | for episode in range(10000): 83 | Q_mean = 0 84 | for step in range(200): 85 | with torch.no_grad(): 86 | Q_list = Q_target.forward(state) 87 | if np.random.random() > beta: 88 | action = np.argmax(Q_list.detach()) 89 | next_state, reward, done, _ = env.step(action.item()) 90 | else: 91 | action = np.random.randint(2) 92 | next_state, reward, done, _ = env.step(action) 93 | next_state = torch.FloatTensor(next_state).view(-1, 4) 94 | # PER: Calculate delta of this tuple 95 | Q = Q_list[0][action] 96 | Q_prime = Q_copy.forward(next_state) 97 | next_action = np.argmax(Q_prime.detach()) 98 | delta = abs(reward + gamma * Q_prime[0][next_action] - Q)  # |TD error| = |target - current estimate| 99 | delta = delta.view(1) 100 | tuple_info = (state, action, torch.Tensor([reward]), next_state, not done, delta) 101 | Q_target.bufferin(tuple_info) 102 | # Learning Part 103 | if len(Q_target.a_buffer) > 64: 104 | s_sample, a_sample, r_sample, next_s_sample, done_sample, ids = Q_target.sample() 105 | # Q values from recorded S and A 106 | Q = Q_target.forward(s_sample) 107 | Q_mean = Q.mean() 108 | Q = Q.gather(1, a_sample.long().view(-1, 1)) 109 | # Q' values from recorded S and A recalculated from Q 110 | next_Q = Q_target.forward(next_s_sample) 111 | Q_values, Q_actions = torch.max(next_Q.detach(), 1) 112 | Q_actions = Q_actions.view(-1, 1) 113 | Q_prime = Q_copy.forward(next_s_sample) 114 | Q_prime = Q_prime.gather(1, Q_actions.long().view(-1, 1)) 115 | y = r_sample + gamma * Q_prime * done_sample 116 | deltas = abs(Q - y.detach()) 117 | for delta, id in zip(deltas, ids): 118 | Q_target.priority_buffer[id] = delta.view(-1) 119 | loss = F.mse_loss(Q, y.detach()) 120 | Q_target.zero_grad() 121 | loss.backward() 122 | optimizer.step() 123 | 124 | # Loop reset Part 125 | if not done: 126 | state = next_state 127 | else: 128 | state = torch.FloatTensor(env.reset()).view(-1, 4) 129 | print(f'episode {episode}, step {step}, Q_average {Q_mean}') 130 | steps.append(step) 131 | break 132 | if episode % 3 == 0: 133 | for param_target, param_copy in zip(Q_target.parameters(), Q_copy.parameters()): 134 | param_copy.data.copy_(param_target.data) 135 | if episode > 40: 136 | beta = 5 / episode 137 | 138 | if np.mean(steps[-20:]) > 190: 139 | break 140 | 141 | plt.style.use('dark_background') 142 | plt.figure(figsize=(10, 10)) 143 | mid = [] 144 | interval = 3 145 | for i in range(len(steps) - interval): 146 | mid.append(np.mean(steps[i:i + interval + 1])) 147 | plt.title(f'Dueling DDQN on CartPole-v0 with PER', fontsize='xx-large') 148 | plt.xlabel('Episodes', fontsize='xx-large') 149 | plt.ylabel(f'Rewards', fontsize='xx-large') 150 | x_fit = list(range(len(steps) - interval)) 151 | plt.plot(x_fit, steps[interval:], '-', c='gray', label='Episode-Wise data') 152 | plt.plot(mid, '-',
c='green', linewidth=5, label='Moving Average') 153 | plt.legend(loc="best", prop={'size': 12}) 154 | plt.show() 155 | 156 | 157 | if __name__ == '__main__': 158 | main() 159 | -------------------------------------------------------------------------------- /Deuling Double DQN/PyTorch_Deuling_DDQN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import gym 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import torch.nn.functional as F 6 | from torch.nn import Linear 7 | from collections import deque 8 | 9 | """ 10 | This is a vanilla Dueling Double DQN without PER. 11 | """ 12 | 13 | class Q_network(torch.nn.Module): 14 | def __init__(self, n_action=2): 15 | super(Q_network, self).__init__() 16 | self.input = Linear(4, 256) 17 | self.input_to_V = Linear(256, 64) 18 | self.input_to_A = Linear(256, 64) 19 | self.input_to_V2 = Linear(64, 1) 20 | self.input_to_A2 = Linear(64, n_action) 21 | self.a_buffer = deque(maxlen=8192) 22 | self.r_buffer = deque(maxlen=8192) 23 | self.s_buffer = deque(maxlen=8192) 24 | self.done_buffer = deque(maxlen=8192) 25 | self.next_s_buffer = deque(maxlen=8192) 26 | 27 | def forward(self, x): 28 | x = F.relu(self.input(x)) 29 | V_stream = F.relu(self.input_to_V(x)) 30 | V_stream = self.input_to_V2(V_stream) 31 | A_stream = F.relu(self.input_to_A(x)) 32 | A_stream = self.input_to_A2(A_stream) 33 | A_mean = torch.mean(A_stream, dim=1, keepdim=True) 34 | result = V_stream + A_stream - A_mean 35 | return result 36 | 37 | def bufferin(self, tuple_info): 38 | # expect tuple_info with content S, A, R, S' 39 | # ALL in TENSOR FORMAT 40 | state, action, reward, next_S, done = tuple_info 41 | self.a_buffer.append(action) 42 | self.s_buffer.append(state) 43 | self.r_buffer.append(reward) 44 | self.next_s_buffer.append(next_S) 45 | self.done_buffer.append(done) 46 | 47 | def sample(self, size=64): 48 | sample_indices = np.random.choice(range(len(self.a_buffer)), 64, replace=False) 49 | a_sample = [self.a_buffer[i] for i in sample_indices] 50 | r_sample = [self.r_buffer[i] for i in sample_indices] 51 | s_sample = [self.s_buffer[i] for i in sample_indices] 52 | next_s_sample = [self.next_s_buffer[i] for i in sample_indices] 53 | done_sample = [self.done_buffer[i] for i in sample_indices] 54 | 55 | a_sample = torch.Tensor(a_sample).view(-1, 1) 56 | r_sample = torch.Tensor(r_sample).view(-1, 1) 57 | s_sample = torch.stack(s_sample).view(-1, 4) 58 | next_s_sample = torch.stack(next_s_sample).view(-1, 4) 59 | done_sample = torch.Tensor(done_sample).view(-1, 1) 60 | 61 | return s_sample, a_sample, r_sample, next_s_sample, done_sample 62 | 63 | 64 | def main(): 65 | gamma = 0.99 66 | beta = 0.25 67 | env = gym.make('CartPole-v0') 68 | state = env.reset() 69 | state = torch.FloatTensor(state).view(-1, 4) 70 | Q_target = Q_network() 71 | optimizer = torch.optim.Adam(Q_target.parameters(), lr=0.001) 72 | Q_copy = Q_network() 73 | for param_target, param_copy in zip(Q_target.parameters(), Q_copy.parameters()): 74 | param_copy.data.copy_(param_target.data) 75 | steps = [] 76 | for episode in range(10000): 77 | Q_mean = 0 78 | for step in range(200): 79 | Q_list = Q_target.forward(state) 80 | if np.random.random() > beta: 81 | action = np.argmax(Q_list.detach()) 82 | next_state, reward, done, _ = env.step(action.item()) 83 | else: 84 | action = np.random.randint(2) 85 | next_state, reward, done, _ = env.step(action) 86 | next_state = torch.FloatTensor(next_state).view(-1, 4) 87 | tuple_info = (state, action, 
torch.Tensor([reward]), next_state, not done) 88 | Q_target.bufferin(tuple_info) 89 | # Learning Part 90 | if len(Q_target.a_buffer) > 64: 91 | s_sample, a_sample, r_sample, next_s_sample, done_sample = Q_target.sample() 92 | # Q values from recorded S and A 93 | Q = Q_target.forward(s_sample) 94 | Q_mean = Q.mean() 95 | Q = Q.gather(1, a_sample.long().view(-1, 1)) 96 | # Q' values from recorded S and A recalculated from Q 97 | next_Q = Q_target.forward(next_s_sample) 98 | Q_values, Q_actions = torch.max(next_Q.detach(), 1) 99 | Q_actions = Q_actions.view(-1,1) 100 | Q_prime = Q_copy.forward(next_s_sample) 101 | Q_prime = Q_prime.gather(1, Q_actions.long().view(-1, 1)) 102 | y = r_sample + gamma * Q_prime * done_sample 103 | loss = F.mse_loss(Q, y.detach()) 104 | Q_target.zero_grad() 105 | loss.backward() 106 | optimizer.step() 107 | 108 | # Loop reset Part 109 | if not done: 110 | state = next_state 111 | else: 112 | state = torch.FloatTensor(env.reset()).view(-1, 4) 113 | print(f'episode {episode}, step {step}, Q_average {Q_mean}') 114 | steps.append(step) 115 | break 116 | if episode % 3 == 0: 117 | for param_target, param_copy in zip(Q_target.parameters(), Q_copy.parameters()): 118 | param_copy.data.copy_(param_target.data) 119 | if episode > 40: 120 | beta = 5/episode 121 | 122 | if np.mean(steps[-20:]) > 190: 123 | break 124 | 125 | 126 | plt.style.use('dark_background') 127 | plt.figure(figsize=(10, 10)) 128 | mid = [] 129 | interval = 3 130 | for i in range(len(steps) - interval): 131 | mid.append(np.mean(steps[i:i + interval + 1])) 132 | plt.title(f'Deuling DDQN on CartPole-v0', fontsize='xx-large') 133 | plt.xlabel('Episodes', fontsize='xx-large') 134 | plt.ylabel(f'Rewards', fontsize='xx-large') 135 | x_fit = list(range(len(steps) - interval)) 136 | plt.plot(x_fit, steps[interval:], '-', c='gray', label='Episode-Wise data') 137 | plt.plot(mid, '-', c='green', linewidth=5, label='Moving Average') 138 | plt.legend(loc="best", prop={'size': 12}) 139 | plt.show() 140 | 141 | 142 | if __name__ == '__main__': 143 | main() 144 | -------------------------------------------------------------------------------- /Experiments/Online TD and true Online TD.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "\n", 13 | "# Define state 1, 2, 3 ,4 ,5... 
19 as normal state with one-hot encoding\n", 14 | "# state 0 and state 20 share the same zero feature vectors.\n", 15 | "\n", 16 | "def feature_map(state):\n", 17 | " zero_model = [0]*19\n", 18 | " zero_model[state-1] = 1\n", 19 | " zero_model = np.array(zero_model)\n", 20 | " zero_model.resize((19,1))\n", 21 | " return np.array(zero_model)\n", 22 | "\n", 23 | "# create a hash table to quickly draw features\n", 24 | "feature_hash = {0: np.zeros((19,1)),\n", 25 | " 20: np.zeros((19,1))}\n", 26 | "for state in range(1,20):\n", 27 | " feature_hash[state] = feature_map(state)\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 65, 33 | "outputs": [], 34 | "source": [ 35 | "history = []\n", 36 | "for episode in range(2):\n", 37 | " local = [10]\n", 38 | " state = 10\n", 39 | " while True:\n", 40 | " if np.random.random() > 0.5:\n", 41 | " state += 1\n", 42 | " else:\n", 43 | " state -= 1\n", 44 | " local.append(state)\n", 45 | " if state == 0 or state == 20:\n", 46 | " history.append(local)\n", 47 | " break" 48 | ], 49 | "metadata": { 50 | "collapsed": false, 51 | "pycharm": { 52 | "name": "#%%\n" 53 | } 54 | } 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 66, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "processing episode 1 horizon 245\r" 65 | ] 66 | } 67 | ], 68 | "source": [ 69 | "# hand pick the hyper parameters\n", 70 | "alpha = 0.4\n", 71 | "gamma = 0.8\n", 72 | "_lambda = 0.9\n", 73 | "\n", 74 | "# set all ones as initialization\n", 75 | "w_last_episode = np.ones((19,1))\n", 76 | "w_last_round = np.ones((19,1))\n", 77 | "w_forward = {}\n", 78 | "def n_step_G(t, h, w, hist):\n", 79 | " if h == len(hist):\n", 80 | " # v(T) == 0; reward == 1\n", 81 | " if hist[-1] == 20:\n", 82 | " return gamma**(h-t-1)\n", 83 | " else:\n", 84 | " return 0\n", 85 | " else:\n", 86 | " # reward == 0; \n", 87 | " return gamma**(h-t)*(w.T@feature_hash[hist[h-1]])\n", 88 | " \n", 89 | "def lambda_G(t,h,hist):\n", 90 | " first_term = np.sum([_lambda**(n-1)*n_step_G(t,t+n,w_dict[n-1],hist) for n in range(1, h-t)])\n", 91 | " return (1-_lambda)*first_term + _lambda**(h-t-1)*n_step_G(t,h,w_dict[h-1],hist)\n", 92 | " \n", 93 | "for i,hist in enumerate(history):\n", 94 | " w_dict = {0:w_last_episode}\n", 95 | " for h in range(1, len(hist)+1):\n", 96 | " print(f'processing episode {i} horizon {h}', end = '\\r')\n", 97 | " w_old = w_last_episode\n", 98 | " for t in range(1,h+1):\n", 99 | " w = w_old + alpha*(lambda_G(t-1,h,hist) - w_old.T@feature_hash[hist[t-1]])*feature_hash[hist[t-1]]\n", 100 | " w_old = w\n", 101 | " else:\n", 102 | " w_dict[h] = w_old\n", 103 | " else:\n", 104 | " w_forward[i] = w_old\n", 105 | " w_last_episode = w_old\n" 106 | ], 107 | "metadata": { 108 | "collapsed": false, 109 | "pycharm": { 110 | "name": "#%%\n" 111 | } 112 | } 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 67, 117 | "outputs": [], 118 | "source": [ 119 | "w_online = {}\n", 120 | "w_2 = np.ones((19,1))\n", 121 | "for episode, hist in enumerate(history):\n", 122 | " z = np.zeros((19,1))\n", 123 | " V_old = 0\n", 124 | " for i, state in enumerate(hist):\n", 125 | " if i == len(hist)-2:\n", 126 | " if hist[i+1] == 20:\n", 127 | " R = 1\n", 128 | " else:\n", 129 | " R = 0\n", 130 | " done = True\n", 131 | " else:\n", 132 | " R = 0\n", 133 | " done = False\n", 134 | " V = w_2.T@feature_hash[state]\n", 135 | " V_prime = w_2.T@feature_hash[hist[i+1]]\n", 136 | " delta = R + gamma*V_prime - V\n", 137 | " z = 
_lambda*gamma*z + (1-alpha*gamma*_lambda*z.T@feature_hash[state])*feature_hash[state]\n", 138 | " w_2 = w_2 + alpha*(delta + V - V_old)*z - alpha*(V-V_old)*feature_hash[state]\n", 139 | " V_old = V_prime\n", 140 | " if done:\n", 141 | " w_online[episode] = w_2\n", 142 | " break" 143 | ], 144 | "metadata": { 145 | "collapsed": false, 146 | "pycharm": { 147 | "name": "#%%\n" 148 | } 149 | } 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 70, 154 | "outputs": [ 155 | { 156 | "data": { 157 | "text/plain": "array([[1. ],\n [1. ],\n [1. ],\n [1. ],\n [1. ],\n [1. ],\n [0.53231318],\n [0.32852723],\n [0.27689236],\n [0.28178901],\n [0.2804106 ],\n [0.27976137],\n [0.2788142 ],\n [0.27958496],\n [0.28473191],\n [0.29893919],\n [0.36824546],\n [0.55494604],\n [0.8125568 ]])" 158 | }, 159 | "execution_count": 70, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "w_forward[0]" 166 | ], 167 | "metadata": { 168 | "collapsed": false, 169 | "pycharm": { 170 | "name": "#%%\n" 171 | } 172 | } 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 71, 177 | "outputs": [ 178 | { 179 | "data": { 180 | "text/plain": "array([[1. ],\n [1. ],\n [1. ],\n [1. ],\n [1. ],\n [1. ],\n [0.4232541 ],\n [0.15123449],\n [0.02727636],\n [0.01142531],\n [0.00900136],\n [0.01016122],\n [0.01274722],\n [0.01743689],\n [0.04539897],\n [0.12147425],\n [0.28585104],\n [0.58749665],\n [0.90724135]])" 181 | }, 182 | "execution_count": 71, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "w_online[0]" 189 | ], 190 | "metadata": { 191 | "collapsed": false, 192 | "pycharm": { 193 | "name": "#%%\n" 194 | } 195 | } 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "outputs": [], 201 | "source": [ 202 | "\n" 203 | ], 204 | "metadata": { 205 | "collapsed": false, 206 | "pycharm": { 207 | "name": "#%%\n" 208 | } 209 | } 210 | } 211 | ], 212 | "metadata": { 213 | "kernelspec": { 214 | "display_name": "Python 3", 215 | "language": "python", 216 | "name": "python3" 217 | }, 218 | "language_info": { 219 | "codemirror_mode": { 220 | "name": "ipython", 221 | "version": 2 222 | }, 223 | "file_extension": ".py", 224 | "mimetype": "text/x-python", 225 | "name": "python", 226 | "nbconvert_exporter": "python", 227 | "pygments_lexer": "ipython2", 228 | "version": "2.7.6" 229 | } 230 | }, 231 | "nbformat": 4, 232 | "nbformat_minor": 0 233 | } -------------------------------------------------------------------------------- /Experiments/Seijen2014_True_Online_TD.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 12, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "\n", 13 | "state_map = {\n", 14 | " 'A':[('B', -1), ('C', 1), ('D',-1)],\n", 15 | " 'B':[('E', 1), ('T', 1)],\n", 16 | " 'C':[('A', -1), ('B', 1), ('D', 1), ('T', 1)],\n", 17 | " 'D':[('F', 1), ('T', 1)],\n", 18 | " 'E':[('T', 1)],\n", 19 | " 'F':[('T', 1)]\n", 20 | "}\n", 21 | "\n", 22 | "feature_map = {\n", 23 | " 'A':np.array([[1],[0],[0],[0],[0],[0]]),\n", 24 | " 'B':np.array([[0],[1],[0],[0],[0],[0]]),\n", 25 | " 'C':np.array([[0],[0],[1],[0],[0],[0]]),\n", 26 | " 'D':np.array([[0],[0],[0],[1],[0],[0]]),\n", 27 | " 'E':np.array([[0],[0],[0],[0],[1],[0]]),\n", 28 | " 'F':np.array([[0],[0],[0],[0],[0],[1]]),\n", 29 | " 
'T':np.array([[0],[0],[0],[0],[0],[0]]),\n", 30 | "}\n", 31 | "theta_2014 = np.array([[0],[0],[0],[0],[0],[0]])\n", 32 | "theta_2016 = np.array([[0],[0],[0],[0],[0],[0]])" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 14, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "-----episode 0-----\n", 44 | "theta_2014: [[-0.01930828]\n", 45 | " [ 0.04260991]\n", 46 | " [ 0.01048235]\n", 47 | " [ 0.06055846]\n", 48 | " [ 0.029701 ]\n", 49 | " [ 0.0199 ]]\n", 50 | "theta_2016: [[-0.01930828]\n", 51 | " [ 0.04260991]\n", 52 | " [ 0.01048235]\n", 53 | " [ 0.06055846]\n", 54 | " [ 0.029701 ]\n", 55 | " [ 0.0199 ]]\n", 56 | "error 1.0408340855860843e-17\n", 57 | "---------------------------\n", 58 | "-----episode 1-----\n", 59 | "theta_2014: [[-0.00789621]\n", 60 | " [ 0.05218381]\n", 61 | " [ 0.02174718]\n", 62 | " [ 0.06055846]\n", 63 | " [ 0.029701 ]\n", 64 | " [ 0.0199 ]]\n", 65 | "theta_2016: [[-0.00789621]\n", 66 | " [ 0.05218381]\n", 67 | " [ 0.02174718]\n", 68 | " [ 0.06055846]\n", 69 | " [ 0.029701 ]\n", 70 | " [ 0.0199 ]]\n", 71 | "error 8.673617379884035e-18\n", 72 | "---------------------------\n", 73 | "-----episode 2-----\n", 74 | "theta_2014: [[-0.01636229]\n", 75 | " [ 0.06166197]\n", 76 | " [ 0.02174718]\n", 77 | " [ 0.06055846]\n", 78 | " [ 0.029701 ]\n", 79 | " [ 0.0199 ]]\n", 80 | "theta_2016: [[-0.01636229]\n", 81 | " [ 0.06166197]\n", 82 | " [ 0.02174718]\n", 83 | " [ 0.06055846]\n", 84 | " [ 0.029701 ]\n", 85 | " [ 0.0199 ]]\n", 86 | "error 6.938893903907228e-18\n", 87 | "---------------------------\n", 88 | "-----episode 3-----\n", 89 | "theta_2014: [[-0.02466909]\n", 90 | " [ 0.06166197]\n", 91 | " [ 0.02174718]\n", 92 | " [ 0.06995287]\n", 93 | " [ 0.029701 ]\n", 94 | " [ 0.0199 ]]\n", 95 | "theta_2016: [[-0.02466909]\n", 96 | " [ 0.06166197]\n", 97 | " [ 0.02174718]\n", 98 | " [ 0.06995287]\n", 99 | " [ 0.029701 ]\n", 100 | " [ 0.0199 ]]\n", 101 | "error 6.938893903907228e-18\n", 102 | "---------------------------\n", 103 | "-----episode 4-----\n", 104 | "theta_2014: [[-0.03280912]\n", 105 | " [ 0.06166197]\n", 106 | " [ 0.02174718]\n", 107 | " [ 0.07925335]\n", 108 | " [ 0.029701 ]\n", 109 | " [ 0.0199 ]]\n", 110 | "theta_2016: [[-0.03280912]\n", 111 | " [ 0.06166197]\n", 112 | " [ 0.02174718]\n", 113 | " [ 0.07925335]\n", 114 | " [ 0.029701 ]\n", 115 | " [ 0.0199 ]]\n", 116 | "error 6.938893903907228e-18\n", 117 | "---------------------------\n", 118 | "-----episode 5-----\n", 119 | "theta_2014: [[-0.04094162]\n", 120 | " [ 0.07104535]\n", 121 | " [ 0.02174718]\n", 122 | " [ 0.07925335]\n", 123 | " [ 0.029701 ]\n", 124 | " [ 0.0199 ]]\n", 125 | "theta_2016: [[-0.04094162]\n", 126 | " [ 0.07104535]\n", 127 | " [ 0.02174718]\n", 128 | " [ 0.07925335]\n", 129 | " [ 0.029701 ]\n", 130 | " [ 0.0199 ]]\n", 131 | "error 6.938893903907228e-18\n", 132 | "---------------------------\n", 133 | "-----episode 6-----\n", 134 | "theta_2014: [[-0.04890919]\n", 135 | " [ 0.0803349 ]\n", 136 | " [ 0.02174718]\n", 137 | " [ 0.07925335]\n", 138 | " [ 0.029701 ]\n", 139 | " [ 0.0199 ]]\n", 140 | "theta_2016: [[-0.04890919]\n", 141 | " [ 0.0803349 ]\n", 142 | " [ 0.02174718]\n", 143 | " [ 0.07925335]\n", 144 | " [ 0.029701 ]\n", 145 | " [ 0.0199 ]]\n", 146 | "error 6.938893903907228e-18\n", 147 | "---------------------------\n", 148 | "-----episode 7-----\n", 149 | "theta_2014: [[-0.05660839]\n", 150 | " [ 0.0803349 ]\n", 151 | " [ 0.02174718]\n", 152 | " [ 0.08962812]\n", 153 | " [ 0.029701 ]\n", 154 | " [ 0.029701 
]]\n", 155 | "theta_2016: [[-0.05660839]\n", 156 | " [ 0.0803349 ]\n", 157 | " [ 0.02174718]\n", 158 | " [ 0.08962812]\n", 159 | " [ 0.029701 ]\n", 160 | " [ 0.029701 ]]\n", 161 | "error 6.938893903907228e-18\n", 162 | "---------------------------\n", 163 | "-----episode 8-----\n", 164 | "theta_2014: [[-0.06412951]\n", 165 | " [ 0.0803349 ]\n", 166 | " [ 0.02174718]\n", 167 | " [ 0.09998648]\n", 168 | " [ 0.029701 ]\n", 169 | " [ 0.03940399]]\n", 170 | "theta_2016: [[-0.06412951]\n", 171 | " [ 0.0803349 ]\n", 172 | " [ 0.02174718]\n", 173 | " [ 0.09998648]\n", 174 | " [ 0.029701 ]\n", 175 | " [ 0.03940399]]\n", 176 | "error 1.3877787807814457e-17\n", 177 | "---------------------------\n", 178 | "-----episode 9-----\n", 179 | "theta_2014: [[-0.07178243]\n", 180 | " [ 0.08953155]\n", 181 | " [ 0.02174718]\n", 182 | " [ 0.09998648]\n", 183 | " [ 0.029701 ]\n", 184 | " [ 0.03940399]]\n", 185 | "theta_2016: [[-0.07178243]\n", 186 | " [ 0.08953155]\n", 187 | " [ 0.02174718]\n", 188 | " [ 0.09998648]\n", 189 | " [ 0.029701 ]\n", 190 | " [ 0.03940399]]\n", 191 | "error 1.3877787807814457e-17\n", 192 | "---------------------------\n", 193 | "-----episode 10-----\n", 194 | "theta_2014: [[-0.06958732]\n", 195 | " [ 0.09863623]\n", 196 | " [ 0.01017452]\n", 197 | " [ 0.09998648]\n", 198 | " [ 0.029701 ]\n", 199 | " [ 0.03940399]]\n", 200 | "theta_2016: [[-0.06958732]\n", 201 | " [ 0.09863623]\n", 202 | " [ 0.01017452]\n", 203 | " [ 0.09998648]\n", 204 | " [ 0.029701 ]\n", 205 | " [ 0.03940399]]\n", 206 | "error 1.3877787807814457e-17\n", 207 | "---------------------------\n", 208 | "-----episode 11-----\n", 209 | "theta_2014: [[-0.07689839]\n", 210 | " [ 0.10890451]\n", 211 | " [ 0.01017452]\n", 212 | " [ 0.09998648]\n", 213 | " [ 0.03940399]\n", 214 | " [ 0.03940399]]\n", 215 | "theta_2016: [[-0.07689839]\n", 216 | " [ 0.10890451]\n", 217 | " [ 0.01017452]\n", 218 | " [ 0.09998648]\n", 219 | " [ 0.03940399]\n", 220 | " [ 0.03940399]]\n", 221 | "error 1.3877787807814457e-17\n", 222 | "---------------------------\n", 223 | "-----episode 12-----\n", 224 | "theta_2014: [[-0.07461403]\n", 225 | " [ 0.10890451]\n", 226 | " [-0.00131933]\n", 227 | " [ 0.10898661]\n", 228 | " [ 0.03940399]\n", 229 | " [ 0.03940399]]\n", 230 | "theta_2016: [[-0.07461403]\n", 231 | " [ 0.10890451]\n", 232 | " [-0.00131933]\n", 233 | " [ 0.10898661]\n", 234 | " [ 0.03940399]\n", 235 | " [ 0.03940399]]\n", 236 | "error 1.4094628242311558e-17\n", 237 | "---------------------------\n", 238 | "-----episode 13-----\n", 239 | "theta_2014: [[-0.08177405]\n", 240 | " [ 0.10890451]\n", 241 | " [-0.00131933]\n", 242 | " [ 0.11923783]\n", 243 | " [ 0.03940399]\n", 244 | " [ 0.04900995]]\n", 245 | "theta_2016: [[-0.08177405]\n", 246 | " [ 0.10890451]\n", 247 | " [-0.00131933]\n", 248 | " [ 0.11923783]\n", 249 | " [ 0.03940399]\n", 250 | " [ 0.04900995]]\n", 251 | "error 2.168404344971009e-19\n", 252 | "---------------------------\n", 253 | "-----episode 14-----\n", 254 | "theta_2014: [[-0.08876266]\n", 255 | " [ 0.10890451]\n", 256 | " [-0.00131933]\n", 257 | " [ 0.12947213]\n", 258 | " [ 0.03940399]\n", 259 | " [ 0.05851985]]\n", 260 | "theta_2016: [[-0.08876266]\n", 261 | " [ 0.10890451]\n", 262 | " [-0.00131933]\n", 263 | " [ 0.12947213]\n", 264 | " [ 0.03940399]\n", 265 | " [ 0.05851985]]\n", 266 | "error 2.7972416050126014e-17\n", 267 | "---------------------------\n", 268 | "-----episode 15-----\n", 269 | "theta_2014: [[-0.09591469]\n", 270 | " [ 0.11781546]\n", 271 | " [-0.00131933]\n", 272 | " [ 0.12947213]\n", 273 | " [ 
0.03940399]\n", 274 | " [ 0.05851985]]\n", 275 | "theta_2016: [[-0.09591469]\n", 276 | " [ 0.11781546]\n", 277 | " [-0.00131933]\n", 278 | " [ 0.12947213]\n", 279 | " [ 0.03940399]\n", 280 | " [ 0.05851985]]\n", 281 | "error 2.7972416050126014e-17\n", 282 | "---------------------------\n", 283 | "-----episode 16-----\n", 284 | "theta_2014: [[-0.10291581]\n", 285 | " [ 0.12663731]\n", 286 | " [-0.00131933]\n", 287 | " [ 0.12947213]\n", 288 | " [ 0.03940399]\n", 289 | " [ 0.05851985]]\n", 290 | "theta_2016: [[-0.10291581]\n", 291 | " [ 0.12663731]\n", 292 | " [-0.00131933]\n", 293 | " [ 0.12947213]\n", 294 | " [ 0.03940399]\n", 295 | " [ 0.05851985]]\n", 296 | "error 4.185020385794047e-17\n", 297 | "---------------------------\n", 298 | "-----episode 17-----\n", 299 | "theta_2014: [[-0.10959342]\n", 300 | " [ 0.12663731]\n", 301 | " [-0.00131933]\n", 302 | " [ 0.13968883]\n", 303 | " [ 0.03940399]\n", 304 | " [ 0.06793465]]\n", 305 | "theta_2016: [[-0.10959342]\n", 306 | " [ 0.12663731]\n", 307 | " [-0.00131933]\n", 308 | " [ 0.13968883]\n", 309 | " [ 0.03940399]\n", 310 | " [ 0.06793465]]\n", 311 | "error 4.185020385794047e-17\n", 312 | "---------------------------\n", 313 | "-----episode 18-----\n", 314 | "theta_2014: [[-0.11610493]\n", 315 | " [ 0.12663731]\n", 316 | " [-0.00131933]\n", 317 | " [ 0.14988723]\n", 318 | " [ 0.03940399]\n", 319 | " [ 0.07725531]]\n", 320 | "theta_2016: [[-0.11610493]\n", 321 | " [ 0.12663731]\n", 322 | " [-0.00131933]\n", 323 | " [ 0.14988723]\n", 324 | " [ 0.03940399]\n", 325 | " [ 0.07725531]]\n", 326 | "error 4.185020385794047e-17\n", 327 | "---------------------------\n", 328 | "-----episode 19-----\n", 329 | "theta_2014: [[-0.10375592]\n", 330 | " [ 0.13537093]\n", 331 | " [ 0.0108122 ]\n", 332 | " [ 0.14988723]\n", 333 | " [ 0.03940399]\n", 334 | " [ 0.07725531]]\n", 335 | "theta_2016: [[-0.10375592]\n", 336 | " [ 0.13537093]\n", 337 | " [ 0.0108122 ]\n", 338 | " [ 0.14988723]\n", 339 | " [ 0.03940399]\n", 340 | " [ 0.07725531]]\n", 341 | "error 4.336808689942018e-17\n", 342 | "---------------------------\n", 343 | "-----episode 20-----\n", 344 | "theta_2014: [[-0.10033798]\n", 345 | " [ 0.13537093]\n", 346 | " [-0.00086441]\n", 347 | " [ 0.16006671]\n", 348 | " [ 0.03940399]\n", 349 | " [ 0.08648275]]\n", 350 | "theta_2016: [[-0.10033798]\n", 351 | " [ 0.13537093]\n", 352 | " [-0.00086441]\n", 353 | " [ 0.16006671]\n", 354 | " [ 0.03940399]\n", 355 | " [ 0.08648275]]\n", 356 | "error 2.959871930885427e-17\n", 357 | "---------------------------\n", 358 | "-----episode 21-----\n", 359 | "theta_2014: [[-0.0881131 ]\n", 360 | " [ 0.13537093]\n", 361 | " [ 0.01156042]\n", 362 | " [ 0.16846604]\n", 363 | " [ 0.03940399]\n", 364 | " [ 0.08648275]]\n", 365 | "theta_2016: [[-0.0881131 ]\n", 366 | " [ 0.13537093]\n", 367 | " [ 0.01156042]\n", 368 | " [ 0.16846604]\n", 369 | " [ 0.03940399]\n", 370 | " [ 0.08648275]]\n", 371 | "error 2.949029909160572e-17\n", 372 | "---------------------------\n", 373 | "-----episode 22-----\n", 374 | "theta_2014: [[-0.09456664]\n", 375 | " [ 0.13537093]\n", 376 | " [ 0.01156042]\n", 377 | " [ 0.17854194]\n", 378 | " [ 0.03940399]\n", 379 | " [ 0.09561792]]\n", 380 | "theta_2016: [[-0.09456664]\n", 381 | " [ 0.13537093]\n", 382 | " [ 0.01156042]\n", 383 | " [ 0.17854194]\n", 384 | " [ 0.03940399]\n", 385 | " [ 0.09561792]]\n", 386 | "error 4.336808689942018e-17\n", 387 | "---------------------------\n", 388 | "-----episode 23-----\n", 389 | "theta_2014: [[-0.07240082]\n", 390 | " [ 0.14535831]\n", 391 | " [ 0.01412296]\n", 392 | 
" [ 0.17854194]\n", 393 | " [ 0.04900995]\n", 394 | " [ 0.09561792]]\n", 395 | "theta_2016: [[-0.07240082]\n", 396 | " [ 0.14535831]\n", 397 | " [ 0.01412296]\n", 398 | " [ 0.17854194]\n", 399 | " [ 0.04900995]\n", 400 | " [ 0.09561792]]\n", 401 | "error 4.683753385137379e-17\n", 402 | "---------------------------\n", 403 | "-----episode 24-----\n", 404 | "theta_2014: [[-0.06032076]\n", 405 | " [ 0.15533141]\n", 406 | " [ 0.02640811]\n", 407 | " [ 0.17854194]\n", 408 | " [ 0.05851985]\n", 409 | " [ 0.09561792]]\n", 410 | "theta_2016: [[-0.06032076]\n", 411 | " [ 0.15533141]\n", 412 | " [ 0.02640811]\n", 413 | " [ 0.17854194]\n", 414 | " [ 0.05851985]\n", 415 | " [ 0.09561792]]\n", 416 | "error 4.85722573273506e-17\n", 417 | "---------------------------\n", 418 | "-----episode 25-----\n", 419 | "theta_2014: [[-0.04849226]\n", 420 | " [ 0.15533141]\n", 421 | " [ 0.03614403]\n", 422 | " [ 0.17854194]\n", 423 | " [ 0.05851985]\n", 424 | " [ 0.09561792]]\n", 425 | "theta_2016: [[-0.04849226]\n", 426 | " [ 0.15533141]\n", 427 | " [ 0.03614403]\n", 428 | " [ 0.17854194]\n", 429 | " [ 0.05851985]\n", 430 | " [ 0.09561792]]\n", 431 | "error 4.85722573273506e-17\n", 432 | "---------------------------\n", 433 | "-----episode 26-----\n", 434 | "theta_2014: [[-0.0554837 ]\n", 435 | " [ 0.16528951]\n", 436 | " [ 0.03614403]\n", 437 | " [ 0.17854194]\n", 438 | " [ 0.06793465]\n", 439 | " [ 0.09561792]]\n", 440 | "theta_2016: [[-0.0554837 ]\n", 441 | " [ 0.16528951]\n", 442 | " [ 0.03614403]\n", 443 | " [ 0.17854194]\n", 444 | " [ 0.06793465]\n", 445 | " [ 0.09561792]]\n", 446 | "error 4.163336342344337e-17\n", 447 | "---------------------------\n", 448 | "-----episode 27-----\n", 449 | "theta_2014: [[-0.06234805]\n", 450 | " [ 0.16528951]\n", 451 | " [ 0.03614403]\n", 452 | " [ 0.18675652]\n", 453 | " [ 0.06793465]\n", 454 | " [ 0.09561792]]\n", 455 | "theta_2016: [[-0.06234805]\n", 456 | " [ 0.16528951]\n", 457 | " [ 0.03614403]\n", 458 | " [ 0.18675652]\n", 459 | " [ 0.06793465]\n", 460 | " [ 0.09561792]]\n", 461 | "error 4.163336342344337e-17\n", 462 | "---------------------------\n", 463 | "-----episode 28-----\n", 464 | "theta_2014: [[-0.06888822]\n", 465 | " [ 0.16528951]\n", 466 | " [ 0.03614403]\n", 467 | " [ 0.19673091]\n", 468 | " [ 0.06793465]\n", 469 | " [ 0.10466175]]\n", 470 | "theta_2016: [[-0.06888822]\n", 471 | " [ 0.16528951]\n", 472 | " [ 0.03614403]\n", 473 | " [ 0.19673091]\n", 474 | " [ 0.06793465]\n", 475 | " [ 0.10466175]]\n", 476 | "error 4.163336342344337e-17\n", 477 | "---------------------------\n", 478 | "-----episode 29-----\n", 479 | "theta_2014: [[-0.05662785]\n", 480 | " [ 0.17523191]\n", 481 | " [ 0.04840326]\n", 482 | " [ 0.19673091]\n", 483 | " [ 0.07725531]\n", 484 | " [ 0.10466175]]\n", 485 | "theta_2016: [[-0.05662785]\n", 486 | " [ 0.17523191]\n", 487 | " [ 0.04840326]\n", 488 | " [ 0.19673091]\n", 489 | " [ 0.07725531]\n", 490 | " [ 0.10466175]]\n", 491 | "error 4.85722573273506e-17\n", 492 | "---------------------------\n", 493 | "-----episode 30-----\n", 494 | "theta_2014: [[-0.0633441 ]\n", 495 | " [ 0.18515793]\n", 496 | " [ 0.04840326]\n", 497 | " [ 0.19673091]\n", 498 | " [ 0.08648275]\n", 499 | " [ 0.10466175]]\n", 500 | "theta_2016: [[-0.0633441 ]\n", 501 | " [ 0.18515793]\n", 502 | " [ 0.04840326]\n", 503 | " [ 0.19673091]\n", 504 | " [ 0.08648275]\n", 505 | " [ 0.10466175]]\n", 506 | "error 4.163336342344337e-17\n", 507 | "---------------------------\n", 508 | "-----episode 31-----\n", 509 | "theta_2014: [[-0.05128938]\n", 510 | " [ 0.18515793]\n", 511 | 
" [ 0.05791922]\n", 512 | " [ 0.19673091]\n", 513 | " [ 0.08648275]\n", 514 | " [ 0.10466175]]\n", 515 | "theta_2016: [[-0.05128938]\n", 516 | " [ 0.18515793]\n", 517 | " [ 0.05791922]\n", 518 | " [ 0.19673091]\n", 519 | " [ 0.08648275]\n", 520 | " [ 0.10466175]]\n", 521 | "error 4.85722573273506e-17\n", 522 | "---------------------------\n", 523 | "-----episode 32-----\n", 524 | "theta_2014: [[-0.03927043]\n", 525 | " [ 0.18515793]\n", 526 | " [ 0.06734003]\n", 527 | " [ 0.19673091]\n", 528 | " [ 0.08648275]\n", 529 | " [ 0.10466175]]\n", 530 | "theta_2016: [[-0.03927043]\n", 531 | " [ 0.18515793]\n", 532 | " [ 0.06734003]\n", 533 | " [ 0.19673091]\n", 534 | " [ 0.08648275]\n", 535 | " [ 0.10466175]]\n", 536 | "error 5.551115123125783e-17\n", 537 | "---------------------------\n", 538 | "-----episode 33-----\n", 539 | "theta_2014: [[-0.04594452]\n", 540 | " [ 0.18515793]\n", 541 | " [ 0.06734003]\n", 542 | " [ 0.20668614]\n", 543 | " [ 0.08648275]\n", 544 | " [ 0.11361513]]\n", 545 | "theta_2016: [[-0.04594452]\n", 546 | " [ 0.18515793]\n", 547 | " [ 0.06734003]\n", 548 | " [ 0.20668614]\n", 549 | " [ 0.08648275]\n", 550 | " [ 0.11361513]]\n", 551 | "error 5.551115123125783e-17\n", 552 | "---------------------------\n", 553 | "-----episode 34-----\n", 554 | "theta_2014: [[-0.05245527]\n", 555 | " [ 0.18515793]\n", 556 | " [ 0.06734003]\n", 557 | " [ 0.21662159]\n", 558 | " [ 0.08648275]\n", 559 | " [ 0.12247898]]\n", 560 | "theta_2016: [[-0.05245527]\n", 561 | " [ 0.18515793]\n", 562 | " [ 0.06734003]\n", 563 | " [ 0.21662159]\n", 564 | " [ 0.08648275]\n", 565 | " [ 0.12247898]]\n", 566 | "error 6.245004513516506e-17\n", 567 | "---------------------------\n", 568 | "-----episode 35-----\n", 569 | "theta_2014: [[-0.05901062]\n", 570 | " [ 0.18515793]\n", 571 | " [ 0.06734003]\n", 572 | " [ 0.22445537]\n", 573 | " [ 0.08648275]\n", 574 | " [ 0.12247898]]\n", 575 | "theta_2016: [[-0.05901062]\n", 576 | " [ 0.18515793]\n", 577 | " [ 0.06734003]\n", 578 | " [ 0.22445537]\n", 579 | " [ 0.08648275]\n", 580 | " [ 0.12247898]]\n", 581 | "error 6.245004513516506e-17\n", 582 | "---------------------------\n", 583 | "-----episode 36-----\n", 584 | "theta_2014: [[-0.05579577]\n", 585 | " [ 0.19330636]\n", 586 | " [ 0.05551248]\n", 587 | " [ 0.22445537]\n", 588 | " [ 0.08648275]\n", 589 | " [ 0.12247898]]\n", 590 | "theta_2016: [[-0.05579577]\n", 591 | " [ 0.19330636]\n", 592 | " [ 0.05551248]\n", 593 | " [ 0.22445537]\n", 594 | " [ 0.08648275]\n", 595 | " [ 0.12247898]]\n", 596 | "error 6.245004513516506e-17\n", 597 | "---------------------------\n", 598 | "-----episode 37-----\n", 599 | "theta_2014: [[-0.06235116]\n", 600 | " [ 0.20313385]\n", 601 | " [ 0.05551248]\n", 602 | " [ 0.22445537]\n", 603 | " [ 0.09561792]\n", 604 | " [ 0.12247898]]\n", 605 | "theta_2016: [[-0.06235116]\n", 606 | " [ 0.20313385]\n", 607 | " [ 0.05551248]\n", 608 | " [ 0.22445537]\n", 609 | " [ 0.09561792]\n", 610 | " [ 0.12247898]]\n", 611 | "error 6.938893903907228e-17\n", 612 | "---------------------------\n", 613 | "-----episode 38-----\n", 614 | "theta_2014: [[-0.06873775]\n", 615 | " [ 0.20313385]\n", 616 | " [ 0.05551248]\n", 617 | " [ 0.23221082]\n", 618 | " [ 0.09561792]\n", 619 | " [ 0.12247898]]\n", 620 | "theta_2016: [[-0.06873775]\n", 621 | " [ 0.20313385]\n", 622 | " [ 0.05551248]\n", 623 | " [ 0.23221082]\n", 624 | " [ 0.09561792]\n", 625 | " [ 0.12247898]]\n", 626 | "error 6.938893903907228e-17\n", 627 | "---------------------------\n", 628 | "-----episode 39-----\n", 629 | "theta_2014: [[-0.07478533]\n", 630 
| " [ 0.20313385]\n", 631 | " [ 0.05551248]\n", 632 | " [ 0.24197 ]\n", 633 | " [ 0.09561792]\n", 634 | " [ 0.13125419]]\n", 635 | "theta_2016: [[-0.07478533]\n", 636 | " [ 0.20313385]\n", 637 | " [ 0.05551248]\n", 638 | " [ 0.24197 ]\n", 639 | " [ 0.09561792]\n", 640 | " [ 0.13125419]]\n", 641 | "error 6.938893903907228e-17\n", 642 | "---------------------------\n", 643 | "-----episode 40-----\n", 644 | "theta_2014: [[-0.08067773]\n", 645 | " [ 0.20313385]\n", 646 | " [ 0.05551248]\n", 647 | " [ 0.25170977]\n", 648 | " [ 0.09561792]\n", 649 | " [ 0.13994165]]\n", 650 | "theta_2016: [[-0.08067773]\n", 651 | " [ 0.20313385]\n", 652 | " [ 0.05551248]\n", 653 | " [ 0.25170977]\n", 654 | " [ 0.09561792]\n", 655 | " [ 0.13994165]]\n", 656 | "error 4.163336342344337e-17\n", 657 | "---------------------------\n", 658 | "-----episode 41-----\n", 659 | "theta_2014: [[-0.08641677]\n", 660 | " [ 0.20313385]\n", 661 | " [ 0.05551248]\n", 662 | " [ 0.26142956]\n", 663 | " [ 0.09561792]\n", 664 | " [ 0.14854223]]\n", 665 | "theta_2016: [[-0.08641677]\n", 666 | " [ 0.20313385]\n", 667 | " [ 0.05551248]\n", 668 | " [ 0.26142956]\n", 669 | " [ 0.09561792]\n", 670 | " [ 0.14854223]]\n", 671 | "error 2.7755575615628914e-17\n", 672 | "---------------------------\n", 673 | "-----episode 42-----\n", 674 | "theta_2014: [[-0.09223327]\n", 675 | " [ 0.20313385]\n", 676 | " [ 0.05551248]\n", 677 | " [ 0.26881526]\n", 678 | " [ 0.09561792]\n", 679 | " [ 0.14854223]]\n", 680 | "theta_2016: [[-0.09223327]\n", 681 | " [ 0.20313385]\n", 682 | " [ 0.05551248]\n", 683 | " [ 0.26881526]\n", 684 | " [ 0.09561792]\n", 685 | " [ 0.14854223]]\n", 686 | "error 4.163336342344337e-17\n", 687 | "---------------------------\n", 688 | "-----episode 43-----\n", 689 | "theta_2014: [[-0.09832866]\n", 690 | " [ 0.21294447]\n", 691 | " [ 0.05551248]\n", 692 | " [ 0.26881526]\n", 693 | " [ 0.10466175]\n", 694 | " [ 0.14854223]]\n", 695 | "theta_2016: [[-0.09832866]\n", 696 | " [ 0.21294447]\n", 697 | " [ 0.05551248]\n", 698 | " [ 0.26881526]\n", 699 | " [ 0.10466175]\n", 700 | " [ 0.14854223]]\n", 701 | "error 2.7755575615628914e-17\n", 702 | "---------------------------\n", 703 | "-----episode 44-----\n", 704 | "theta_2014: [[-0.08557491]\n", 705 | " [ 0.22081503]\n", 706 | " [ 0.06784469]\n", 707 | " [ 0.26881526]\n", 708 | " [ 0.10466175]\n", 709 | " [ 0.14854223]]\n", 710 | "theta_2016: [[-0.08557491]\n", 711 | " [ 0.22081503]\n", 712 | " [ 0.06784469]\n", 713 | " [ 0.26881526]\n", 714 | " [ 0.10466175]\n", 715 | " [ 0.14854223]]\n", 716 | "error 1.3877787807814457e-17\n", 717 | "---------------------------\n", 718 | "-----episode 45-----\n", 719 | "theta_2014: [[-0.09110498]\n", 720 | " [ 0.22081503]\n", 721 | " [ 0.06784469]\n", 722 | " [ 0.27844062]\n", 723 | " [ 0.10466175]\n", 724 | " [ 0.15705681]]\n", 725 | "theta_2016: [[-0.09110498]\n", 726 | " [ 0.22081503]\n", 727 | " [ 0.06784469]\n", 728 | " [ 0.27844062]\n", 729 | " [ 0.10466175]\n", 730 | " [ 0.15705681]]\n", 731 | "error 2.7755575615628914e-17\n", 732 | "---------------------------\n", 733 | "-----episode 46-----\n", 734 | "theta_2014: [[-0.07859943]\n", 735 | " [ 0.22081503]\n", 736 | " [ 0.07716624]\n", 737 | " [ 0.27844062]\n", 738 | " [ 0.10466175]\n", 739 | " [ 0.15705681]]\n", 740 | "theta_2016: [[-0.07859943]\n", 741 | " [ 0.22081503]\n", 742 | " [ 0.07716624]\n", 743 | " [ 0.27844062]\n", 744 | " [ 0.10466175]\n", 745 | " [ 0.15705681]]\n", 746 | "error 2.7755575615628914e-17\n", 747 | "---------------------------\n", 748 | "-----episode 47-----\n", 749 | 
"theta_2014: [[-0.06579227]\n", 750 | " [ 0.22081503]\n", 751 | " [ 0.08986549]\n", 752 | " [ 0.28565621]\n", 753 | " [ 0.10466175]\n", 754 | " [ 0.15705681]]\n", 755 | "theta_2016: [[-0.06579227]\n", 756 | " [ 0.22081503]\n", 757 | " [ 0.08986549]\n", 758 | " [ 0.28565621]\n", 759 | " [ 0.10466175]\n", 760 | " [ 0.15705681]]\n", 761 | "error 2.7755575615628914e-17\n", 762 | "---------------------------\n", 763 | "-----episode 48-----\n", 764 | "theta_2014: [[-0.07198655]\n", 765 | " [ 0.23052941]\n", 766 | " [ 0.08986549]\n", 767 | " [ 0.28565621]\n", 768 | " [ 0.11361513]\n", 769 | " [ 0.15705681]]\n", 770 | "theta_2016: [[-0.07198655]\n", 771 | " [ 0.23052941]\n", 772 | " [ 0.08986549]\n", 773 | " [ 0.28565621]\n", 774 | " [ 0.11361513]\n", 775 | " [ 0.15705681]]\n", 776 | "error 2.7755575615628914e-17\n", 777 | "---------------------------\n", 778 | "-----episode 49-----\n", 779 | "theta_2014: [[-0.07749494]\n", 780 | " [ 0.23052941]\n", 781 | " [ 0.08986549]\n", 782 | " [ 0.29518903]\n", 783 | " [ 0.11361513]\n", 784 | " [ 0.16548624]]\n", 785 | "theta_2016: [[-0.07749494]\n", 786 | " [ 0.23052941]\n", 787 | " [ 0.08986549]\n", 788 | " [ 0.29518903]\n", 789 | " [ 0.11361513]\n", 790 | " [ 0.16548624]]\n", 791 | "error 2.7755575615628914e-17\n", 792 | "---------------------------\n", 793 | "-----episode 50-----\n", 794 | "theta_2014: [[-0.08367597]\n", 795 | " [ 0.23822412]\n", 796 | " [ 0.08986549]\n", 797 | " [ 0.29518903]\n", 798 | " [ 0.11361513]\n", 799 | " [ 0.16548624]]\n", 800 | "theta_2016: [[-0.08367597]\n", 801 | " [ 0.23822412]\n", 802 | " [ 0.08986549]\n", 803 | " [ 0.29518903]\n", 804 | " [ 0.11361513]\n", 805 | " [ 0.16548624]]\n", 806 | "error 2.7755575615628914e-17\n", 807 | "---------------------------\n", 808 | "-----episode 51-----\n", 809 | "theta_2014: [[-0.08952841]\n", 810 | " [ 0.24784419]\n", 811 | " [ 0.08986549]\n", 812 | " [ 0.29518903]\n", 813 | " [ 0.12247898]\n", 814 | " [ 0.16548624]]\n", 815 | "theta_2016: [[-0.08952841]\n", 816 | " [ 0.24784419]\n", 817 | " [ 0.08986549]\n", 818 | " [ 0.29518903]\n", 819 | " [ 0.12247898]\n", 820 | " [ 0.16548624]]\n", 821 | "error 2.7755575615628914e-17\n", 822 | "---------------------------\n", 823 | "-----episode 52-----\n", 824 | "theta_2014: [[-0.09501299]\n", 825 | " [ 0.24784419]\n", 826 | " [ 0.08986549]\n", 827 | " [ 0.30223714]\n", 828 | " [ 0.12247898]\n", 829 | " [ 0.16548624]]\n", 830 | "theta_2016: [[-0.09501299]\n", 831 | " [ 0.24784419]\n", 832 | " [ 0.08986549]\n", 833 | " [ 0.30223714]\n", 834 | " [ 0.12247898]\n", 835 | " [ 0.16548624]]\n", 836 | "error 2.7755575615628914e-17\n", 837 | "---------------------------\n", 838 | "-----episode 53-----\n", 839 | "theta_2014: [[-0.08227216]\n", 840 | " [ 0.24784419]\n", 841 | " [ 0.09896683]\n", 842 | " [ 0.30223714]\n", 843 | " [ 0.12247898]\n", 844 | " [ 0.16548624]]\n", 845 | "theta_2016: [[-0.08227216]\n", 846 | " [ 0.24784419]\n", 847 | " [ 0.09896683]\n", 848 | " [ 0.30223714]\n", 849 | " [ 0.12247898]\n", 850 | " [ 0.16548624]]\n", 851 | "error 2.7755575615628914e-17\n", 852 | "---------------------------\n", 853 | "-----episode 54-----\n", 854 | "theta_2014: [[-0.0877665 ]\n", 855 | " [ 0.24784419]\n", 856 | " [ 0.09896683]\n", 857 | " [ 0.30921476]\n", 858 | " [ 0.12247898]\n", 859 | " [ 0.16548624]]\n", 860 | "theta_2016: [[-0.0877665 ]\n", 861 | " [ 0.24784419]\n", 862 | " [ 0.09896683]\n", 863 | " [ 0.30921476]\n", 864 | " [ 0.12247898]\n", 865 | " [ 0.16548624]]\n", 866 | "error 1.3877787807814457e-17\n", 867 | "---------------------------\n", 
868 | "-----episode 55-----\n", 869 | "theta_2014: [[-0.09314374]\n", 870 | " [ 0.24784419]\n", 871 | " [ 0.09896683]\n", 872 | " [ 0.31612262]\n", 873 | " [ 0.12247898]\n", 874 | " [ 0.16548624]]\n", 875 | "theta_2016: [[-0.09314374]\n", 876 | " [ 0.24784419]\n", 877 | " [ 0.09896683]\n", 878 | " [ 0.31612262]\n", 879 | " [ 0.12247898]\n", 880 | " [ 0.16548624]]\n", 881 | "error 1.3877787807814457e-17\n", 882 | "---------------------------\n", 883 | "-----episode 56-----\n", 884 | "theta_2014: [[-0.09901401]\n", 885 | " [ 0.25536575]\n", 886 | " [ 0.09896683]\n", 887 | " [ 0.31612262]\n", 888 | " [ 0.12247898]\n", 889 | " [ 0.16548624]]\n", 890 | "theta_2016: [[-0.09901401]\n", 891 | " [ 0.25536575]\n", 892 | " [ 0.09896683]\n", 893 | " [ 0.31612262]\n", 894 | " [ 0.12247898]\n", 895 | " [ 0.16548624]]\n", 896 | "error 1.3877787807814457e-17\n", 897 | "---------------------------\n", 898 | "-----episode 57-----\n", 899 | "theta_2014: [[-0.10421721]\n", 900 | " [ 0.25536575]\n", 901 | " [ 0.09896683]\n", 902 | " [ 0.32296139]\n", 903 | " [ 0.12247898]\n", 904 | " [ 0.16548624]]\n", 905 | "theta_2016: [[-0.10421721]\n", 906 | " [ 0.25536575]\n", 907 | " [ 0.09896683]\n", 908 | " [ 0.32296139]\n", 909 | " [ 0.12247898]\n", 910 | " [ 0.16548624]]\n", 911 | "error 1.3877787807814457e-17\n", 912 | "---------------------------\n", 913 | "-----episode 58-----\n", 914 | "theta_2014: [[-0.09092036]\n", 915 | " [ 0.25536575]\n", 916 | " [ 0.11184475]\n", 917 | " [ 0.32973178]\n", 918 | " [ 0.12247898]\n", 919 | " [ 0.16548624]]\n", 920 | "theta_2016: [[-0.09092036]\n", 921 | " [ 0.25536575]\n", 922 | " [ 0.11184475]\n", 923 | " [ 0.32973178]\n", 924 | " [ 0.12247898]\n", 925 | " [ 0.16548624]]\n", 926 | "error 1.3877787807814457e-17\n", 927 | "---------------------------\n", 928 | "-----episode 59-----\n", 929 | "theta_2014: [[-0.07802462]\n", 930 | " [ 0.25536575]\n", 931 | " [ 0.1207263 ]\n", 932 | " [ 0.32973178]\n", 933 | " [ 0.12247898]\n", 934 | " [ 0.16548624]]\n", 935 | "theta_2016: [[-0.07802462]\n", 936 | " [ 0.25536575]\n", 937 | " [ 0.1207263 ]\n", 938 | " [ 0.32973178]\n", 939 | " [ 0.12247898]\n", 940 | " [ 0.16548624]]\n", 941 | "error 2.7755575615628914e-17\n", 942 | "---------------------------\n", 943 | "-----episode 60-----\n", 944 | "theta_2014: [[-0.08397906]\n", 945 | " [ 0.26281209]\n", 946 | " [ 0.1207263 ]\n", 947 | " [ 0.32973178]\n", 948 | " [ 0.12247898]\n", 949 | " [ 0.16548624]]\n", 950 | "theta_2016: [[-0.08397906]\n", 951 | " [ 0.26281209]\n", 952 | " [ 0.1207263 ]\n", 953 | " [ 0.32973178]\n", 954 | " [ 0.12247898]\n", 955 | " [ 0.16548624]]\n", 956 | "error 1.3877787807814457e-17\n", 957 | "---------------------------\n", 958 | "-----episode 61-----\n", 959 | "theta_2014: [[-0.08960157]\n", 960 | " [ 0.27226525]\n", 961 | " [ 0.1207263 ]\n", 962 | " [ 0.32973178]\n", 963 | " [ 0.13125419]\n", 964 | " [ 0.16548624]]\n", 965 | "theta_2016: [[-0.08960157]\n", 966 | " [ 0.27226525]\n", 967 | " [ 0.1207263 ]\n", 968 | " [ 0.32973178]\n", 969 | " [ 0.13125419]\n", 970 | " [ 0.16548624]]\n", 971 | "error 1.3877787807814457e-17\n", 972 | "---------------------------\n", 973 | "-----episode 62-----\n", 974 | "theta_2014: [[-0.09528967]\n", 975 | " [ 0.2795426 ]\n", 976 | " [ 0.1207263 ]\n", 977 | " [ 0.32973178]\n", 978 | " [ 0.13125419]\n", 979 | " [ 0.16548624]]\n", 980 | "theta_2016: [[-0.09528967]\n", 981 | " [ 0.2795426 ]\n", 982 | " [ 0.1207263 ]\n", 983 | " [ 0.32973178]\n", 984 | " [ 0.13125419]\n", 985 | " [ 0.16548624]]\n", 986 | "error 1.3877787807814457e-17\n", 
987 | "---------------------------\n", 988 | "-----episode 63-----\n", 989 | "theta_2014: [[-0.10040886]\n", 990 | " [ 0.2795426 ]\n", 991 | " [ 0.1207263 ]\n", 992 | " [ 0.33643446]\n", 993 | " [ 0.13125419]\n", 994 | " [ 0.16548624]]\n", 995 | "theta_2016: [[-0.10040886]\n", 996 | " [ 0.2795426 ]\n", 997 | " [ 0.1207263 ]\n", 998 | " [ 0.33643446]\n", 999 | " [ 0.13125419]\n", 1000 | " [ 0.16548624]]\n", 1001 | "error 1.3877787807814457e-17\n", 1002 | "---------------------------\n", 1003 | "-----episode 64-----\n", 1004 | "theta_2014: [[-0.09457365]\n", 1005 | " [ 0.2795426 ]\n", 1006 | " [ 0.10813787]\n", 1007 | " [ 0.34307011]\n", 1008 | " [ 0.13125419]\n", 1009 | " [ 0.16548624]]\n", 1010 | "theta_2016: [[-0.09457365]\n", 1011 | " [ 0.2795426 ]\n", 1012 | " [ 0.10813787]\n", 1013 | " [ 0.34307011]\n", 1014 | " [ 0.13125419]\n", 1015 | " [ 0.16548624]]\n", 1016 | "error 1.3877787807814457e-17\n", 1017 | "---------------------------\n", 1018 | "-----episode 65-----\n", 1019 | "theta_2014: [[-0.10014719]\n", 1020 | " [ 0.28674718]\n", 1021 | " [ 0.10813787]\n", 1022 | " [ 0.34307011]\n", 1023 | " [ 0.13125419]\n", 1024 | " [ 0.16548624]]\n", 1025 | "theta_2016: [[-0.10014719]\n", 1026 | " [ 0.28674718]\n", 1027 | " [ 0.10813787]\n", 1028 | " [ 0.34307011]\n", 1029 | " [ 0.13125419]\n", 1030 | " [ 0.16548624]]\n", 1031 | "error 2.7755575615628914e-17\n", 1032 | "---------------------------\n", 1033 | "-----episode 66-----\n", 1034 | "theta_2014: [[-0.08719221]\n", 1035 | " [ 0.28674718]\n", 1036 | " [ 0.11705649]\n", 1037 | " [ 0.34307011]\n", 1038 | " [ 0.13125419]\n", 1039 | " [ 0.16548624]]\n", 1040 | "theta_2016: [[-0.08719221]\n", 1041 | " [ 0.28674718]\n", 1042 | " [ 0.11705649]\n", 1043 | " [ 0.34307011]\n", 1044 | " [ 0.13125419]\n", 1045 | " [ 0.16548624]]\n", 1046 | "error 2.7755575615628914e-17\n", 1047 | "---------------------------\n", 1048 | "-----episode 67-----\n", 1049 | "theta_2014: [[-0.09256158]\n", 1050 | " [ 0.29603918]\n", 1051 | " [ 0.11705649]\n", 1052 | " [ 0.34307011]\n", 1053 | " [ 0.13994165]\n", 1054 | " [ 0.16548624]]\n", 1055 | "theta_2016: [[-0.09256158]\n", 1056 | " [ 0.29603918]\n", 1057 | " [ 0.11705649]\n", 1058 | " [ 0.34307011]\n", 1059 | " [ 0.13994165]\n", 1060 | " [ 0.16548624]]\n", 1061 | "error 4.163336342344337e-17\n", 1062 | "---------------------------\n", 1063 | "-----episode 68-----\n", 1064 | "theta_2014: [[-0.0977868 ]\n", 1065 | " [ 0.30531567]\n", 1066 | " [ 0.11705649]\n", 1067 | " [ 0.34307011]\n", 1068 | " [ 0.14854223]\n", 1069 | " [ 0.16548624]]\n", 1070 | "theta_2016: [[-0.0977868 ]\n", 1071 | " [ 0.30531567]\n", 1072 | " [ 0.11705649]\n", 1073 | " [ 0.34307011]\n", 1074 | " [ 0.14854223]\n", 1075 | " [ 0.16548624]]\n", 1076 | "error 2.7755575615628914e-17\n", 1077 | "---------------------------\n", 1078 | "-----episode 69-----\n", 1079 | "theta_2014: [[-0.10286954]\n", 1080 | " [ 0.31457602]\n", 1081 | " [ 0.11705649]\n", 1082 | " [ 0.34307011]\n", 1083 | " [ 0.15705681]\n", 1084 | " [ 0.16548624]]\n", 1085 | "theta_2016: [[-0.10286954]\n", 1086 | " [ 0.31457602]\n", 1087 | " [ 0.11705649]\n", 1088 | " [ 0.34307011]\n", 1089 | " [ 0.15705681]\n", 1090 | " [ 0.16548624]]\n", 1091 | "error 2.7755575615628914e-17\n", 1092 | "---------------------------\n", 1093 | "-----episode 70-----\n", 1094 | "theta_2014: [[-0.08980787]\n", 1095 | " [ 0.31457602]\n", 1096 | " [ 0.12588592]\n", 1097 | " [ 0.34307011]\n", 1098 | " [ 0.15705681]\n", 1099 | " [ 0.16548624]]\n", 1100 | "theta_2016: [[-0.08980787]\n", 1101 | " [ 0.31457602]\n", 1102 | 
" [ 0.12588592]\n", 1103 | " [ 0.34307011]\n", 1104 | " [ 0.15705681]\n", 1105 | " [ 0.16548624]]\n", 1106 | "error 2.7755575615628914e-17\n", 1107 | "---------------------------\n", 1108 | "-----episode 71-----\n", 1109 | "theta_2014: [[-0.07639752]\n", 1110 | " [ 0.31457602]\n", 1111 | " [ 0.13867382]\n", 1112 | " [ 0.34963941]\n", 1113 | " [ 0.15705681]\n", 1114 | " [ 0.16548624]]\n", 1115 | "theta_2016: [[-0.07639752]\n", 1116 | " [ 0.31457602]\n", 1117 | " [ 0.13867382]\n", 1118 | " [ 0.34963941]\n", 1119 | " [ 0.15705681]\n", 1120 | " [ 0.16548624]]\n", 1121 | "error 2.7755575615628914e-17\n", 1122 | "---------------------------\n", 1123 | "-----episode 72-----\n", 1124 | "theta_2014: [[-0.08160412]\n", 1125 | " [ 0.32381964]\n", 1126 | " [ 0.13867382]\n", 1127 | " [ 0.34963941]\n", 1128 | " [ 0.16548624]\n", 1129 | " [ 0.16548624]]\n", 1130 | "theta_2016: [[-0.08160412]\n", 1131 | " [ 0.32381964]\n", 1132 | " [ 0.13867382]\n", 1133 | " [ 0.34963941]\n", 1134 | " [ 0.16548624]\n", 1135 | " [ 0.16548624]]\n", 1136 | "error 2.7755575615628914e-17\n", 1137 | "---------------------------\n", 1138 | "-----episode 73-----\n", 1139 | "theta_2014: [[-0.08691285]\n", 1140 | " [ 0.33058144]\n", 1141 | " [ 0.13867382]\n", 1142 | " [ 0.34963941]\n", 1143 | " [ 0.16548624]\n", 1144 | " [ 0.16548624]]\n", 1145 | "theta_2016: [[-0.08691285]\n", 1146 | " [ 0.33058144]\n", 1147 | " [ 0.13867382]\n", 1148 | " [ 0.34963941]\n", 1149 | " [ 0.16548624]\n", 1150 | " [ 0.16548624]]\n", 1151 | "error 4.163336342344337e-17\n", 1152 | "---------------------------\n", 1153 | "-----episode 74-----\n", 1154 | "theta_2014: [[-0.09169445]\n", 1155 | " [ 0.33058144]\n", 1156 | " [ 0.13867382]\n", 1157 | " [ 0.3586075 ]\n", 1158 | " [ 0.16548624]\n", 1159 | " [ 0.17383138]]\n", 1160 | "theta_2016: [[-0.09169445]\n", 1161 | " [ 0.33058144]\n", 1162 | " [ 0.13867382]\n", 1163 | " [ 0.3586075 ]\n", 1164 | " [ 0.16548624]\n", 1165 | " [ 0.17383138]]\n", 1166 | "error 4.163336342344337e-17\n", 1167 | "---------------------------\n", 1168 | "-----episode 75-----\n", 1169 | "theta_2014: [[-0.09684202]\n", 1170 | " [ 0.33727563]\n", 1171 | " [ 0.13867382]\n", 1172 | " [ 0.3586075 ]\n", 1173 | " [ 0.16548624]\n", 1174 | " [ 0.17383138]]\n", 1175 | "theta_2016: [[-0.09684202]\n", 1176 | " [ 0.33727563]\n", 1177 | " [ 0.13867382]\n", 1178 | " [ 0.3586075 ]\n", 1179 | " [ 0.16548624]\n", 1180 | " [ 0.17383138]]\n", 1181 | "error 4.163336342344337e-17\n", 1182 | "---------------------------\n", 1183 | "-----episode 76-----\n", 1184 | "theta_2014: [[-0.10187848]\n", 1185 | " [ 0.34390287]\n", 1186 | " [ 0.13867382]\n", 1187 | " [ 0.3586075 ]\n", 1188 | " [ 0.16548624]\n", 1189 | " [ 0.17383138]]\n", 1190 | "theta_2016: [[-0.10187848]\n", 1191 | " [ 0.34390287]\n", 1192 | " [ 0.13867382]\n", 1193 | " [ 0.3586075 ]\n", 1194 | " [ 0.16548624]\n", 1195 | " [ 0.17383138]]\n", 1196 | "error 4.163336342344337e-17\n", 1197 | "---------------------------\n", 1198 | "-----episode 77-----\n", 1199 | "theta_2014: [[-0.10642315]\n", 1200 | " [ 0.34390287]\n", 1201 | " [ 0.13867382]\n", 1202 | " [ 0.36756026]\n", 1203 | " [ 0.16548624]\n", 1204 | " [ 0.18209306]]\n", 1205 | "theta_2016: [[-0.10642315]\n", 1206 | " [ 0.34390287]\n", 1207 | " [ 0.13867382]\n", 1208 | " [ 0.36756026]\n", 1209 | " [ 0.16548624]\n", 1210 | " [ 0.18209306]]\n", 1211 | "error 4.163336342344337e-17\n", 1212 | "---------------------------\n", 1213 | "-----episode 78-----\n", 1214 | "theta_2014: [[-0.11130475]\n", 1215 | " [ 0.35046384]\n", 1216 | " [ 0.13867382]\n", 
1217 | " [ 0.36756026]\n", 1218 | " [ 0.16548624]\n", 1219 | " [ 0.18209306]]\n", 1220 | "theta_2016: [[-0.11130475]\n", 1221 | " [ 0.35046384]\n", 1222 | " [ 0.13867382]\n", 1223 | " [ 0.36756026]\n", 1224 | " [ 0.16548624]\n", 1225 | " [ 0.18209306]]\n", 1226 | "error 4.163336342344337e-17\n", 1227 | "---------------------------\n", 1228 | "-----episode 79-----\n", 1229 | "theta_2014: [[-0.1156681 ]\n", 1230 | " [ 0.35046384]\n", 1231 | " [ 0.13867382]\n", 1232 | " [ 0.37649711]\n", 1233 | " [ 0.16548624]\n", 1234 | " [ 0.19027213]]\n", 1235 | "theta_2016: [[-0.1156681 ]\n", 1236 | " [ 0.35046384]\n", 1237 | " [ 0.13867382]\n", 1238 | " [ 0.37649711]\n", 1239 | " [ 0.16548624]\n", 1240 | " [ 0.19027213]]\n", 1241 | "error 4.163336342344337e-17\n", 1242 | "---------------------------\n", 1243 | "-----episode 80-----\n", 1244 | "theta_2014: [[-0.12039879]\n", 1245 | " [ 0.3569592 ]\n", 1246 | " [ 0.13867382]\n", 1247 | " [ 0.37649711]\n", 1248 | " [ 0.16548624]\n", 1249 | " [ 0.19027213]]\n", 1250 | "theta_2016: [[-0.12039879]\n", 1251 | " [ 0.3569592 ]\n", 1252 | " [ 0.13867382]\n", 1253 | " [ 0.37649711]\n", 1254 | " [ 0.16548624]\n", 1255 | " [ 0.19027213]]\n", 1256 | "error 2.7755575615628914e-17\n", 1257 | "---------------------------\n", 1258 | "-----episode 81-----\n", 1259 | "theta_2014: [[-0.12458437]\n", 1260 | " [ 0.3569592 ]\n", 1261 | " [ 0.13867382]\n", 1262 | " [ 0.38541746]\n", 1263 | " [ 0.16548624]\n", 1264 | " [ 0.19836941]]\n", 1265 | "theta_2016: [[-0.12458437]\n", 1266 | " [ 0.3569592 ]\n", 1267 | " [ 0.13867382]\n", 1268 | " [ 0.38541746]\n", 1269 | " [ 0.16548624]\n", 1270 | " [ 0.19836941]]\n", 1271 | "error 5.551115123125783e-17\n", 1272 | "---------------------------\n", 1273 | "-----episode 82-----\n", 1274 | "theta_2014: [[-0.12864146]\n", 1275 | " [ 0.3569592 ]\n", 1276 | " [ 0.13867382]\n", 1277 | " [ 0.39432076]\n", 1278 | " [ 0.16548624]\n", 1279 | " [ 0.20638572]]\n", 1280 | "theta_2016: [[-0.12864146]\n", 1281 | " [ 0.3569592 ]\n", 1282 | " [ 0.13867382]\n", 1283 | " [ 0.39432076]\n", 1284 | " [ 0.16548624]\n", 1285 | " [ 0.20638572]]\n", 1286 | "error 5.551115123125783e-17\n", 1287 | "---------------------------\n", 1288 | "-----episode 83-----\n", 1289 | "theta_2014: [[-0.13257159]\n", 1290 | " [ 0.3569592 ]\n", 1291 | " [ 0.13867382]\n", 1292 | " [ 0.40320645]\n", 1293 | " [ 0.16548624]\n", 1294 | " [ 0.21432186]]\n", 1295 | "theta_2016: [[-0.13257159]\n", 1296 | " [ 0.3569592 ]\n", 1297 | " [ 0.13867382]\n", 1298 | " [ 0.40320645]\n", 1299 | " [ 0.16548624]\n", 1300 | " [ 0.21432186]]\n", 1301 | "error 5.551115123125783e-17\n", 1302 | "---------------------------\n", 1303 | "-----episode 84-----\n", 1304 | "theta_2014: [[-0.13683138]\n", 1305 | " [ 0.36585409]\n", 1306 | " [ 0.13867382]\n", 1307 | " [ 0.40320645]\n", 1308 | " [ 0.17383138]\n", 1309 | " [ 0.21432186]]\n", 1310 | "theta_2016: [[-0.13683138]\n", 1311 | " [ 0.36585409]\n", 1312 | " [ 0.13867382]\n", 1313 | " [ 0.40320645]\n", 1314 | " [ 0.17383138]\n", 1315 | " [ 0.21432186]]\n", 1316 | "error 5.551115123125783e-17\n", 1317 | "---------------------------\n", 1318 | "-----episode 85-----\n", 1319 | "theta_2014: [[-0.12954367]\n", 1320 | " [ 0.36585409]\n", 1321 | " [ 0.12564511]\n", 1322 | " [ 0.40917439]\n", 1323 | " [ 0.17383138]\n", 1324 | " [ 0.21432186]]\n", 1325 | "theta_2016: [[-0.12954367]\n", 1326 | " [ 0.36585409]\n", 1327 | " [ 0.12564511]\n", 1328 | " [ 0.40917439]\n", 1329 | " [ 0.17383138]\n", 1330 | " [ 0.21432186]]\n", 1331 | "error 5.551115123125783e-17\n", 1332 | 
"---------------------------\n", 1333 | "-----episode 86-----\n", 1334 | "theta_2014: [[-0.13361249]\n", 1335 | " [ 0.36585409]\n", 1336 | " [ 0.12564511]\n", 1337 | " [ 0.41508264]\n", 1338 | " [ 0.17383138]\n", 1339 | " [ 0.21432186]]\n", 1340 | "theta_2016: [[-0.13361249]\n", 1341 | " [ 0.36585409]\n", 1342 | " [ 0.12564511]\n", 1343 | " [ 0.41508264]\n", 1344 | " [ 0.17383138]\n", 1345 | " [ 0.21432186]]\n", 1346 | "error 5.551115123125783e-17\n", 1347 | "---------------------------\n", 1348 | "-----episode 87-----\n", 1349 | "theta_2014: [[-0.11970272]\n", 1350 | " [ 0.36585409]\n", 1351 | " [ 0.13907705]\n", 1352 | " [ 0.42093182]\n", 1353 | " [ 0.17383138]\n", 1354 | " [ 0.21432186]]\n", 1355 | "theta_2016: [[-0.11970272]\n", 1356 | " [ 0.36585409]\n", 1357 | " [ 0.13907705]\n", 1358 | " [ 0.42093182]\n", 1359 | " [ 0.17383138]\n", 1360 | " [ 0.21432186]]\n", 1361 | "error 5.551115123125783e-17\n", 1362 | "---------------------------\n", 1363 | "-----episode 88-----\n", 1364 | "theta_2014: [[-0.1058072 ]\n", 1365 | " [ 0.36585409]\n", 1366 | " [ 0.15242678]\n", 1367 | " [ 0.4267225 ]\n", 1368 | " [ 0.17383138]\n", 1369 | " [ 0.21432186]]\n", 1370 | "theta_2016: [[-0.1058072 ]\n", 1371 | " [ 0.36585409]\n", 1372 | " [ 0.15242678]\n", 1373 | " [ 0.4267225 ]\n", 1374 | " [ 0.17383138]\n", 1375 | " [ 0.21432186]]\n", 1376 | "error 5.551115123125783e-17\n", 1377 | "---------------------------\n", 1378 | "-----episode 89-----\n", 1379 | "theta_2014: [[-0.09192659]\n", 1380 | " [ 0.36585409]\n", 1381 | " [ 0.16569461]\n", 1382 | " [ 0.43245527]\n", 1383 | " [ 0.17383138]\n", 1384 | " [ 0.21432186]]\n", 1385 | "theta_2016: [[-0.09192659]\n", 1386 | " [ 0.36585409]\n", 1387 | " [ 0.16569461]\n", 1388 | " [ 0.43245527]\n", 1389 | " [ 0.17383138]\n", 1390 | " [ 0.21432186]]\n", 1391 | "error 4.163336342344337e-17\n", 1392 | "---------------------------\n", 1393 | "-----episode 90-----\n", 1394 | "theta_2014: [[-0.09675757]\n", 1395 | " [ 0.37219555]\n", 1396 | " [ 0.16569461]\n", 1397 | " [ 0.43245527]\n", 1398 | " [ 0.17383138]\n", 1399 | " [ 0.21432186]]\n", 1400 | "theta_2016: [[-0.09675757]\n", 1401 | " [ 0.37219555]\n", 1402 | " [ 0.16569461]\n", 1403 | " [ 0.43245527]\n", 1404 | " [ 0.17383138]\n", 1405 | " [ 0.21432186]]\n", 1406 | "error 4.163336342344337e-17\n", 1407 | "---------------------------\n", 1408 | "-----episode 91-----\n", 1409 | "theta_2014: [[-0.10065975]\n", 1410 | " [ 0.37219555]\n", 1411 | " [ 0.16569461]\n", 1412 | " [ 0.44103033]\n", 1413 | " [ 0.17383138]\n", 1414 | " [ 0.22217864]]\n", 1415 | "theta_2016: [[-0.10065975]\n", 1416 | " [ 0.37219555]\n", 1417 | " [ 0.16569461]\n", 1418 | " [ 0.44103033]\n", 1419 | " [ 0.17383138]\n", 1420 | " [ 0.22217864]]\n", 1421 | "error 4.163336342344337e-17\n", 1422 | "---------------------------\n", 1423 | "-----episode 92-----\n", 1424 | "theta_2014: [[-0.10509555]\n", 1425 | " [ 0.38101244]\n", 1426 | " [ 0.16569461]\n", 1427 | " [ 0.44103033]\n", 1428 | " [ 0.18209306]\n", 1429 | " [ 0.22217864]]\n", 1430 | "theta_2016: [[-0.10509555]\n", 1431 | " [ 0.38101244]\n", 1432 | " [ 0.16569461]\n", 1433 | " [ 0.44103033]\n", 1434 | " [ 0.18209306]\n", 1435 | " [ 0.22217864]]\n", 1436 | "error 4.163336342344337e-17\n", 1437 | "---------------------------\n", 1438 | "-----episode 93-----\n", 1439 | "theta_2014: [[-0.10965977]\n", 1440 | " [ 0.38720231]\n", 1441 | " [ 0.16569461]\n", 1442 | " [ 0.44103033]\n", 1443 | " [ 0.18209306]\n", 1444 | " [ 0.22217864]]\n", 1445 | "theta_2016: [[-0.10965977]\n", 1446 | " [ 0.38720231]\n", 1447 | 
" [ 0.16569461]\n", 1448 | " [ 0.44103033]\n", 1449 | " [ 0.18209306]\n", 1450 | " [ 0.22217864]]\n", 1451 | "error 5.551115123125783e-17\n", 1452 | "---------------------------\n", 1453 | "-----episode 94-----\n", 1454 | "theta_2014: [[-0.1141232 ]\n", 1455 | " [ 0.39333029]\n", 1456 | " [ 0.16569461]\n", 1457 | " [ 0.44103033]\n", 1458 | " [ 0.18209306]\n", 1459 | " [ 0.22217864]]\n", 1460 | "theta_2016: [[-0.1141232 ]\n", 1461 | " [ 0.39333029]\n", 1462 | " [ 0.16569461]\n", 1463 | " [ 0.44103033]\n", 1464 | " [ 0.18209306]\n", 1465 | " [ 0.22217864]]\n", 1466 | "error 5.551115123125783e-17\n", 1467 | "---------------------------\n", 1468 | "-----episode 95-----\n", 1469 | "theta_2014: [[-0.11822876]\n", 1470 | " [ 0.40200943]\n", 1471 | " [ 0.16569461]\n", 1472 | " [ 0.44103033]\n", 1473 | " [ 0.19027213]\n", 1474 | " [ 0.22217864]]\n", 1475 | "theta_2016: [[-0.11822876]\n", 1476 | " [ 0.40200943]\n", 1477 | " [ 0.16569461]\n", 1478 | " [ 0.44103033]\n", 1479 | " [ 0.19027213]\n", 1480 | " [ 0.22217864]]\n", 1481 | "error 5.551115123125783e-17\n", 1482 | "---------------------------\n", 1483 | "-----episode 96-----\n", 1484 | "theta_2014: [[-0.12247457]\n", 1485 | " [ 0.40798934]\n", 1486 | " [ 0.16569461]\n", 1487 | " [ 0.44103033]\n", 1488 | " [ 0.19027213]\n", 1489 | " [ 0.22217864]]\n", 1490 | "theta_2016: [[-0.12247457]\n", 1491 | " [ 0.40798934]\n", 1492 | " [ 0.16569461]\n", 1493 | " [ 0.44103033]\n", 1494 | " [ 0.19027213]\n", 1495 | " [ 0.22217864]]\n", 1496 | "error 5.551115123125783e-17\n", 1497 | "---------------------------\n", 1498 | "-----episode 97-----\n", 1499 | "theta_2014: [[-0.10878349]\n", 1500 | " [ 0.40798934]\n", 1501 | " [ 0.17403766]\n", 1502 | " [ 0.44103033]\n", 1503 | " [ 0.19027213]\n", 1504 | " [ 0.22217864]]\n", 1505 | "theta_2016: [[-0.10878349]\n", 1506 | " [ 0.40798934]\n", 1507 | " [ 0.17403766]\n", 1508 | " [ 0.44103033]\n", 1509 | " [ 0.19027213]\n", 1510 | " [ 0.22217864]]\n", 1511 | "error 5.551115123125783e-17\n", 1512 | "---------------------------\n", 1513 | "-----episode 98-----\n", 1514 | "theta_2014: [[-0.11277607]\n", 1515 | " [ 0.40798934]\n", 1516 | " [ 0.17403766]\n", 1517 | " [ 0.44662002]\n", 1518 | " [ 0.19027213]\n", 1519 | " [ 0.22217864]]\n", 1520 | "theta_2016: [[-0.11277607]\n", 1521 | " [ 0.40798934]\n", 1522 | " [ 0.17403766]\n", 1523 | " [ 0.44662002]\n", 1524 | " [ 0.19027213]\n", 1525 | " [ 0.22217864]]\n", 1526 | "error 6.938893903907228e-17\n", 1527 | "---------------------------\n", 1528 | "-----episode 99-----\n", 1529 | "theta_2014: [[-0.09858656]\n", 1530 | " [ 0.40798934]\n", 1531 | " [ 0.18756066]\n", 1532 | " [ 0.45512344]\n", 1533 | " [ 0.19027213]\n", 1534 | " [ 0.22995685]]\n", 1535 | "theta_2016: [[-0.09858656]\n", 1536 | " [ 0.40798934]\n", 1537 | " [ 0.18756066]\n", 1538 | " [ 0.45512344]\n", 1539 | " [ 0.19027213]\n", 1540 | " [ 0.22995685]]\n", 1541 | "error 6.938893903907228e-17\n", 1542 | "---------------------------\n" 1543 | ] 1544 | } 1545 | ], 1546 | "source": [ 1547 | "EPISODES = 100\n", 1548 | "gamma = 0.99\n", 1549 | "alpha = 0.01\n", 1550 | "_lambda = 0.1\n", 1551 | "for episode in range(EPISODES):\n", 1552 | " e_2014 = np.array([[0],[0],[0],[0],[0],[0]])\n", 1553 | " e_2016 = np.array([[0],[0],[0],[0],[0],[0]])\n", 1554 | "\n", 1555 | " S = 'A'\n", 1556 | " # 2014\n", 1557 | " v_s = theta_2014.T@feature_map['A']\n", 1558 | " # 2016\n", 1559 | " V_old = 0\n", 1560 | " while True:\n", 1561 | " if S == 'T':\n", 1562 | " print(f'-----episode {episode}-----')\n", 1563 | " print(f'theta_2014: 
{theta_2014}')\n", 1564 | " print(f'theta_2016: {theta_2016}')\n", 1565 | " print('error ',np.sum(abs(theta_2014 - theta_2016)))\n", 1566 | " print(f'---------------------------')\n", 1567 | " break\n", 1568 | "\n", 1569 | " random_choice = np.random.choice(len(state_map[S]))\n", 1570 | " next_S, R = state_map[S][random_choice]\n", 1571 | " # 2014\n", 1572 | " v_next_s = theta_2014.T@feature_map[next_S]\n", 1573 | " delta_2014 = R + gamma*v_next_s - v_s\n", 1574 | " e_2014 = gamma*_lambda*e_2014 + alpha*(1-gamma*_lambda*e_2014.T@feature_map[S])*feature_map[S]\n", 1575 | " theta_2014 = theta_2014 + delta_2014*e_2014 + alpha*(v_s - theta_2014.T@feature_map[S])*feature_map[S]\n", 1576 | "\n", 1577 | " # 2016\n", 1578 | " V = theta_2016.T@feature_map[S]\n", 1579 | " V_prime = theta_2016.T@feature_map[next_S]\n", 1580 | " delta_2016 = R + gamma*V_prime - V\n", 1581 | " e_2016 = gamma*_lambda*e_2016 + feature_map[S] - alpha*gamma*_lambda * (e_2016.T@feature_map[S]) * feature_map[S]\n", 1582 | " theta_2016 = theta_2016 + alpha*(delta_2016 + V - V_old)*e_2016 - alpha*(V-V_old)*feature_map[S]\n", 1583 | " V_old = V_prime\n", 1584 | "\n", 1585 | " # 2014\n", 1586 | " v_s = v_next_s\n", 1587 | "\n", 1588 | " # 2014 & 2016\n", 1589 | " S = next_S\n" 1590 | ], 1591 | "metadata": { 1592 | "collapsed": false, 1593 | "pycharm": { 1594 | "name": "#%%\n" 1595 | } 1596 | } 1597 | }, 1598 | { 1599 | "cell_type": "code", 1600 | "execution_count": null, 1601 | "outputs": [], 1602 | "source": [ 1603 | "\n" 1604 | ], 1605 | "metadata": { 1606 | "collapsed": false, 1607 | "pycharm": { 1608 | "name": "#%%\n" 1609 | } 1610 | } 1611 | } 1612 | ], 1613 | "metadata": { 1614 | "kernelspec": { 1615 | "display_name": "Python 3", 1616 | "language": "python", 1617 | "name": "python3" 1618 | }, 1619 | "language_info": { 1620 | "codemirror_mode": { 1621 | "name": "ipython", 1622 | "version": 2 1623 | }, 1624 | "file_extension": ".py", 1625 | "mimetype": "text/x-python", 1626 | "name": "python", 1627 | "nbconvert_exporter": "python", 1628 | "pygments_lexer": "ipython2", 1629 | "version": "2.7.6" 1630 | } 1631 | }, 1632 | "nbformat": 4, 1633 | "nbformat_minor": 0 1634 | } -------------------------------------------------------------------------------- /MASM/Differential_semi_gradient_Sarsa.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 83, 6 | "metadata": { 7 | "collapsed": true, 8 | "pycharm": { 9 | "is_executing": false 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "import numpy as np\n", 15 | "class State:\n", 16 | " def __init__(self, name, value):\n", 17 | " self.name = name\n", 18 | " self.value = value\n", 19 | "A = State('A', np.array([[1,0,0]]))\n", 20 | "B = State('B',np.array([[0,1,0]]))\n", 21 | "C = State('C',np.array([[0,0,1]]))\n", 22 | "\n", 23 | "w = np.random.random((1,3))\n", 24 | "R = {'A':1, 'B':0, 'C':0}\n", 25 | "policy = {'A':B, 'B':C, 'C':A}" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 84, 31 | "outputs": [ 32 | { 33 | "name": "stdout", 34 | "text": [ 35 | "[[4.15223411e-14]]\n", 36 | "[[1.38777878e-14]]\n", 37 | "[[-5.54001289e-14]]\n", 38 | "[[4.15223411e-14]]\n", 39 | "[[1.38777878e-14]]\n", 40 | "[[-5.54001289e-14]]\n", 41 | "[[4.15223411e-14]]\n", 42 | "[[1.38777878e-14]]\n", 43 | "[[-5.54001289e-14]]\n", 44 | "[[4.15223411e-14]]\n", 45 | "[[1.38777878e-14]]\n", 46 | "[[-5.54001289e-14]]\n", 47 | "[[4.15223411e-14]]\n", 48 | 
"[[1.38777878e-14]]\n", 49 | "[[-5.54001289e-14]]\n", 50 | "[[4.15223411e-14]]\n", 51 | "[[1.38777878e-14]]\n", 52 | "[[-5.54001289e-14]]\n", 53 | "[[4.15223411e-14]]\n", 54 | "[[1.38777878e-14]]\n", 55 | "[[-5.54001289e-14]]\n", 56 | "[[4.15223411e-14]]\n", 57 | "[[1.38777878e-14]]\n", 58 | "[[-5.54001289e-14]]\n", 59 | "[[4.15223411e-14]]\n", 60 | "[[1.38777878e-14]]\n", 61 | "[[-5.54001289e-14]]\n", 62 | "[[4.15223411e-14]]\n", 63 | "[[1.38777878e-14]]\n" 64 | ], 65 | "output_type": "stream" 66 | } 67 | ], 68 | "source": [ 69 | "S = A\n", 70 | "R_bar = 0\n", 71 | "limit = 100000\n", 72 | "for step in range(limit):\n", 73 | " S_prime = policy[S.name]\n", 74 | " delta = R[S_prime.name] - R_bar + S_prime.value@w.T - S.value@w.T\n", 75 | " R_bar += 0.001*delta\n", 76 | " w += 0.001*delta*S.value\n", 77 | " S = S_prime\n", 78 | " if step > limit -30:\n", 79 | " print(delta)" 80 | ], 81 | "metadata": { 82 | "collapsed": false, 83 | "pycharm": { 84 | "name": "#%%\n", 85 | "is_executing": false 86 | } 87 | } 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 85, 92 | "outputs": [ 93 | { 94 | "name": "stdout", 95 | "text": [ 96 | "[array([[0.17260156]]), array([[0.50593489]]), array([[0.83926822]])]\n" 97 | ], 98 | "output_type": "stream" 99 | } 100 | ], 101 | "source": [ 102 | "result = [S.value@w.T for S in [A,B,C]]\n", 103 | "print(result)" 104 | ], 105 | "metadata": { 106 | "collapsed": false, 107 | "pycharm": { 108 | "name": "#%%\n", 109 | "is_executing": false 110 | } 111 | } 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "outputs": [], 117 | "source": [ 118 | "\n" 119 | ], 120 | "metadata": { 121 | "collapsed": false, 122 | "pycharm": { 123 | "name": "#%%\n" 124 | } 125 | } 126 | } 127 | ], 128 | "metadata": { 129 | "kernelspec": { 130 | "display_name": "Python 3", 131 | "language": "python", 132 | "name": "python3" 133 | }, 134 | "language_info": { 135 | "codemirror_mode": { 136 | "name": "ipython", 137 | "version": 2 138 | }, 139 | "file_extension": ".py", 140 | "mimetype": "text/x-python", 141 | "name": "python", 142 | "nbconvert_exporter": "python", 143 | "pygments_lexer": "ipython2", 144 | "version": "2.7.6" 145 | }, 146 | "pycharm": { 147 | "stem_cell": { 148 | "cell_type": "raw", 149 | "source": [], 150 | "metadata": { 151 | "collapsed": false 152 | } 153 | } 154 | } 155 | }, 156 | "nbformat": 4, 157 | "nbformat_minor": 0 158 | } -------------------------------------------------------------------------------- /MASM/prototype.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true, 8 | "pycharm": { 9 | "is_executing": false 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "import torch\n", 15 | "import numpy as np\n", 16 | "import matplotlib.pyplot as plt\n", 17 | "from torch.nn import Linear\n", 18 | "import torch.nn.functional as F\n", 19 | "from collections import deque\n", 20 | "import random\n", 21 | "import gym\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "from torch.nn import Linear, ReLU\n", 24 | "from torch.autograd import Variable" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "outputs": [], 31 | "source": [ 32 | "class Seller_Env:\n", 33 | " def __init__(self, size_of_list = 10, initial_price = 1000):\n", 34 | " self.mu = np.random.randint(initial_price*0.7, initial_price*1.1)\n", 35 | " self.days = 0\n", 36 | " 
self.size_of_list = size_of_list\n", 37 | " self.item_list = [int(np.random.normal(self.mu, scale = self.mu/10)) for _ in range(self.size_of_list)]\n", 38 | " self.baseline = np.mean(self.item_list)\n", 39 | " \n", 40 | " def step(self, current_price):\n", 41 | " self.days += 1\n", 42 | " offer = [int(np.random.normal(self.mu*0.7, scale = self.mu/10)) \n", 43 | " for _ in range(np.random.randint(0, self.size_of_list//2))]\n", 44 | " for _ in range(self.size_of_list//2 - len(offer)):\n", 45 | " offer.append(0)\n", 46 | " if max(offer) > current_price:\n", 47 | " return [],True\n", 48 | " else:\n", 49 | " return offer,False\n", 50 | " \n", 51 | " def reset(self):\n", 52 | " self.days = 0\n", 53 | " return self.item_list\n", 54 | " \n", 55 | "class Seller:\n", 56 | " def __init__(self, min_price):\n", 57 | " self.min_price = min_price\n", 58 | " def model(self):\n", 59 | " pass" 60 | ], 61 | "metadata": { 62 | "collapsed": false, 63 | "pycharm": { 64 | "name": "#%%\n" 65 | } 66 | } 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "Python 3", 72 | "language": "python", 73 | "name": "python3" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 2 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython2", 85 | "version": "2.7.6" 86 | }, 87 | "pycharm": { 88 | "stem_cell": { 89 | "cell_type": "raw", 90 | "source": [], 91 | "metadata": { 92 | "collapsed": false 93 | } 94 | } 95 | } 96 | }, 97 | "nbformat": 4, 98 | "nbformat_minor": 0 99 | } -------------------------------------------------------------------------------- /Off-Policy Policy Gradient/Experiment Log of failure of Off_policy_Actor_Critic: -------------------------------------------------------------------------------- 1 | ## Design 2 | The experiment was not successful, but I will record the details of my thinking. 3 | 4 | Inspired by off-policy policy gradient methods, I wanted to try training an actor-critic 5 | with an off-policy method. 6 | 7 | I understand that combining bootstrapping, function approximation, and off-policy learning is called the deadly triad in 8 | Sutton's RL book. I still wanted to give it a try. 9 | 10 | In this experiment there is only one model, the Actor. 11 | It has two outputs: the action probabilities and a V estimate. 12 | 13 | The behavior policy is epsilon-Actor, the same formulation as in epsilon-greedy methods. 14 | While acting, with probability 90% the policy chooses its action according to the probability output, as 15 | in a normal actor-critic method. With probability 10%, however, it chooses uniformly at random from the action 16 | space, i.e. {0,1}. 17 | 18 | A buffer inside the class records the observed states, 19 | the reward (since it is always 1 in this case, I treat it as a constant), and the probability of the action taken. 20 | 21 | The target policy is the same actor without the epsilon. When the buffer reaches the end of an episode, it 22 | makes the model learn: it recomputes each p from the stored observations, uses the stored p and simple algebra 23 | to form rho, and updates the model. The buffer also keeps only the latest 100 records. 24 | 25 | After training, the model automatically evaluates itself. 26 | 27 | ## Result: 28 | 29 | Learning never takes place, although the loss still converges. It is not the divergence usually met in REINFORCE, 30 | but a natural behavior of converging to a solution that is never the best policy.
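For reference, here is a minimal sketch of the rho computation described in the Design section above. This is not the notebook's actual code: the `actor` module, its two-headed output, and the batching are assumptions for illustration, and it assumes the stored probability is the behavior policy's probability of the chosen action.

```python
import torch

EPS = 0.1        # exploration weight of the epsilon-Actor behavior policy
N_ACTIONS = 2    # CartPole action space {0, 1}

def behavior_prob(pi_a):
    # epsilon-Actor: follow the actor with probability 1 - EPS,
    # otherwise pick uniformly among the N_ACTIONS actions
    return (1.0 - EPS) * pi_a + EPS / N_ACTIONS

def importance_ratios(actor, states, actions, stored_behavior_probs):
    # Recompute pi(a|s) under the current target policy (the actor without epsilon),
    # then rho = pi / b, where b was recorded by the buffer at acting time.
    probs, _v = actor(states)                                # (batch, N_ACTIONS), (batch, 1)
    pi_a = probs.gather(1, actions.view(-1, 1)).squeeze(1)   # pi(a_t | s_t)
    return pi_a / stored_behavior_probs
```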
31 | 32 | ## Improvement: 33 | 34 | Delete the V part of the actor model and build an off-policy REINFORCE instead. -------------------------------------------------------------------------------- /RAINBOW/PyTorch_RAINBOW.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LyWangPX/Reinforcement_Learning_Coding_Examples/2f40f67f5709c9dc4ea3d9dd15b441b627b595a6/RAINBOW/PyTorch_RAINBOW.py -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Welcome 2 | 3 | This is my personal practice of implementing various RL algorithms from scratch. 4 | 5 | Most of them are in Jupyter notebooks; the ones that involve multiprocessing 6 | are in plain Python files. 7 | 8 | The framework is always PyTorch, also as personal practice. 9 | 10 | Normally I use CartPole for the easier algorithms in this project and skip the 11 | visual-input part (which is fairly trivial once you add a few conv layers). 12 | 13 | For the 14 | harder, vision-based algorithms I pick various Atari games as the environment. 15 | 16 | Due to time limits, I will not provide a systematic analysis of any particular algorithm. 17 | Also be aware that this is for personal use, so bugs do appear frequently. 18 | 19 | Once the project is mature, I will accept open issues. 20 | For now, however, let me dive in. (I guess no one even reads this repo, though.) 21 | 22 | The project file structure will keep changing to match my needs. 23 | 24 | # PLAN: 25 | ## Model-Free RL 26 | ### Policy Gradient 27 | - [x] REINFORCE 28 | - [x] Off-Policy REINFORCE 29 | - [x] Basic Actor Critic 30 | - [x] Advantage Actor Critic using Huber loss and Entropy 31 | - [x] A3C 32 | - [x] A2C 33 | - [x] DDPG 34 | - [ ] D4PG 35 | - [ ] MADDPG 36 | - [ ] TRPO 37 | - [ ] PPO 38 | - [ ] ACER 39 | - [ ] ACKTR 40 | - [ ] SAC 41 | - [ ] SAC with AAT (Automatically Adjusted Temperature) 42 | - [ ] TD3 43 | - [ ] SVPG 44 | - [ ] IMPALA 45 | ### Deep Q Learning 46 | - [X] Dueling DDQN 47 | - [x] Dueling DDQN + PER 48 | - [ ] Rainbow DQN 49 | - [ ] Ape-X 50 | ### Distributional RL 51 | - [ ] C51 52 | - [ ] QR-DQN 53 | - [ ] IQN 54 | - [ ] Dopamine (DQN + C51 + IQN + Rainbow) 55 | ### Policy Gradient with Action-Dependent Baselines: 56 | - [ ] Q-Prop 57 | - [ ] Stein Control Variates 58 | ### Path-Consistency Learning 59 | - [ ] PCL 60 | - [ ] Trust-PCL 61 | ### Q-learning + Policy Gradient: 62 | - [ ] PGQL 63 | - [ ] Reactor 64 | - [ ] IPG 65 | ### Evolutionary Algorithms 66 | ### Monte Carlo Tree Search (AlphaZero) 67 | ## Exploration RL 68 | ### Intrinsic Motivation 69 | - [ ] VIME 70 | - [ ] CTS-based Pseudocounts 71 | - [ ] PixelCNN-based Pseudocounts 72 | - [ ] Hash-based Counts 73 | - [ ] EX2 74 | - [ ] ICM 75 | - [ ] RND 76 | ### Unsupervised RL 77 | - [ ] VIC 78 | - [ ] DIAYN 79 | - [ ] VALOR 80 | ## Hierarchical RL 81 | ## Memory RL 82 | ## Model-Based RL 83 | ## Meta-RL 84 | ## Scaling-RL --------------------------------------------------------------------------------
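As a footnote to the plan above and to the improvement suggested in the off-policy experiment log, here is a minimal, illustrative sketch of an importance-weighted REINFORCE update of the kind the "Off-Policy REINFORCE" entry refers to. It is not the repository's implementation; it uses one common simplified form that weights each log-probability by the product of importance ratios up to that step, and all tensor names are assumptions.

```python
import torch

def off_policy_reinforce_loss(log_pi, behavior_probs, returns):
    # log_pi:          log pi(a_t | s_t) under the target policy, with grad, shape (T,)
    # behavior_probs:  pi_b(a_t | s_t) recorded while acting, shape (T,)
    # returns:         discounted returns G_t, shape (T,)
    step_ratio = torch.exp(log_pi.detach()) / behavior_probs   # rho_t = pi / b
    rho = torch.cumprod(step_ratio, dim=0)                     # product of ratios up to step t
    return -(rho * returns * log_pi).mean()                    # importance-weighted policy gradient
```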