├── .idea
│   ├── .gitignore
│   ├── RL.iml
│   ├── codeStyles
│   │   └── codeStyleConfig.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── other.xml
│   └── vcs.xml
├── A2C
│   ├── A2C Episodic Sync
│   │   ├── SharedAdam.py
│   │   ├── evaluate.py
│   │   ├── pyTorch_CartPole_A2C_ES.py
│   │   └── workers.py
│   ├── A2C Time Interval Sync
│   │   ├── SharedAdam.py
│   │   ├── evaluate.py
│   │   ├── pyTorch_CartPole_A2C_IS.py
│   │   └── workers.py
│   └── README About Vairations.md
├── A3C
│   ├── A3C Episodic Async
│   │   ├── SharedAdam.py
│   │   ├── evaluate.py
│   │   ├── pyTorch_CartPole_A3C_EA.py
│   │   └── workers_PlayGround.py
│   ├── A3C Time Interval Async
│   │   ├── SharedAdam.py
│   │   ├── evaluate.py
│   │   ├── pyTorch_CartPole_A3C_IA.py
│   │   └── workers.py
│   └── README About Vairations.md
├── Ape-X
│   └── PyTorch_Ape-X.py
├── D4PG
│   └── PyTorch_D4PG.py
├── DDPG
│   └── PyTorch_DDPG.py
├── Deuling Double DQN with PER
│   └── PyTorch_Deuling_DDQN_with_PER.py
├── Deuling Double DQN
│   └── PyTorch_Deuling_DDQN.py
├── Experiments
│   ├── Online TD and true Online TD.ipynb
│   └── Seijen2014_True_Online_TD.ipynb
├── MASM
│   ├── Differential_semi_gradient_Sarsa.ipynb
│   └── prototype.ipynb
├── Off-Policy Policy Gradient
│   ├── Experiment Log of failure of Off_policy_Actor_Critic
│   ├── pyTorch_CartPole_Off_Policy_Actor_Critic[not work].ipynb
│   └── pyTorch_CartPole_Off_Policy_REINFORCE.ipynb
├── Plain-Actor-Critic
│   ├── pyTorch_CartPole_Advantage_Actor_Critic_Entropy_Regularized.ipynb
│   ├── pyTorch_CartPole_Advantage_Episode_Wise_Actor_Critic_Huber.ipynb
│   └── pyTorch_CartPole_Step_Wise_Bootstrap_Actor_Critic.ipynb
├── RAINBOW
│   └── PyTorch_RAINBOW.py
├── README.md
└── REINFORCE
    └── pyTorch_CartPole_REINFORCE.ipynb
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /workspace.xml
--------------------------------------------------------------------------------
/A2C/A2C Episodic Sync/SharedAdam.py:
--------------------------------------------------------------------------------
1 | # Copied from https://github.com/ikostrikov/pytorch-a3c/blob/master/my_optim.py
2 | # A very nice optimization of Adam method to make it receive shared states.
3 | import math
4 | import torch
5 | import torch.optim as optim
6 |
7 |
8 | class SharedAdam(optim.Adam):
9 | """Implements Adam algorithm with shared states.
10 | """
11 |
12 | def __init__(self,
13 | params,
14 | lr=1e-3,
15 | betas=(0.9, 0.999),
16 | eps=1e-8,
17 | weight_decay=0):
18 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay)
19 |
20 | for group in self.param_groups:
21 | for p in group['params']:
22 | state = self.state[p]
23 | state['step'] = torch.zeros(1)
24 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_()
25 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_()
26 |
27 | def share_memory(self):
28 | for group in self.param_groups:
29 | for p in group['params']:
30 | state = self.state[p]
31 | state['step'].share_memory_()
32 | state['exp_avg'].share_memory_()
33 | state['exp_avg_sq'].share_memory_()
34 |
35 | def step(self, closure=None):
36 | """Performs a single optimization step.
37 | Arguments:
38 | closure (callable, optional): A closure that reevaluates the model
39 | and returns the loss.
40 | """
41 | loss = None
42 | if closure is not None:
43 | loss = closure()
44 |
45 | for group in self.param_groups:
46 | for p in group['params']:
47 | if p.grad is None:
48 | continue
49 | grad = p.grad.data
50 | state = self.state[p]
51 |
52 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
53 | beta1, beta2 = group['betas']
54 |
55 | state['step'] += 1
56 |
57 | if group['weight_decay'] != 0:
58 | grad = grad.add(group['weight_decay'], p.data)
59 |
60 | # Decay the first and second moment running average coefficient
61 | exp_avg.mul_(beta1).add_(1 - beta1, grad)
62 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
63 |
64 | denom = exp_avg_sq.sqrt().add_(group['eps'])
65 |
66 | bias_correction1 = 1 - beta1 ** state['step'].item()
67 | bias_correction2 = 1 - beta2 ** state['step'].item()
68 | step_size = group['lr'] * math.sqrt(
69 | bias_correction2) / bias_correction1
70 |
71 | p.data.addcdiv_(-step_size, exp_avg, denom)
72 |
73 | return loss
--------------------------------------------------------------------------------
/A2C/A2C Episodic Sync/evaluate.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import gym
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | from torch.nn import Linear, ReLU
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 |
9 |
10 | def evaluate(shared_model, q):
11 | class Actor(torch.nn.Module):
12 | def __init__(self):
13 | super(Actor, self).__init__()
14 | self.fc1 = Linear(4, 128)
15 | self.fc2 = Linear(128, 128)
16 | self.fc3 = Linear(128, 2)
17 | self.fc4 = Linear(128, 1)
18 | self.steps = []
19 |
20 | def forward(self, x):
21 | x = F.relu(self.fc1(x))
22 | x = F.relu(self.fc2(x))
23 | action = F.log_softmax(self.fc3(x), dim=-1)
24 | V = F.relu(self.fc4(x))
25 | return action, V
26 |
27 | device = 'cpu'
28 | # I do not recommend using GPU for this method. CPU is much faster.
29 | # Change this to cuda only if you have a poor CPU or on a cloud
30 | env = gym.make('CartPole-v0')
31 | obs = env.reset()
32 | actor = Actor()
33 | actor.to(device)
34 | for episode in range(1):
35 | action_log_history = []
36 | for step in range(200):
37 | actor.load_state_dict(shared_model.state_dict())
38 | # -----lines below are line-corresponding to the original algorithm----
39 | obs = np.reshape(obs, [1, -1])
40 | input_actor = Variable(torch.from_numpy(obs).float()).to(device)
41 | action_log_probability, V = actor(input_actor)
42 | p = np.exp(action_log_probability[0].detach().cpu())
43 | action = np.random.choice(2, p=p.numpy())
44 | action_log_history.append(action_log_probability[0][action])
45 | obs, reward, done, info = env.step(action)
46 | if done:
47 | q.put(step)
48 | return
--------------------------------------------------------------------------------
/A2C/A2C Episodic Sync/pyTorch_CartPole_A2C_ES.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import torch
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | from torch.nn import Linear, ReLU
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 | import torch.multiprocessing as mp
9 | from workers import worker
10 | from evaluate import evaluate
11 | from SharedAdam import SharedAdam
12 | from time import perf_counter
13 |
14 | class Actor(torch.nn.Module):
15 | def __init__(self):
16 | super(Actor,self).__init__()
17 | self.fc1 = Linear(4, 128)
18 | self.fc2 = Linear(128, 128)
19 | self.fc3 = Linear(128, 2)
20 | self.fc4 = Linear(128, 1)
21 | self.steps = []
22 |
23 | def forward(self, x):
24 | x = F.relu(self.fc1(x))
25 | x = F.relu(self.fc2(x))
26 | action = F.log_softmax(self.fc3(x), dim=-1)
27 | V = F.relu(self.fc4(x))
28 | return action, V
29 |
30 | def draw(self, eval = False):
31 | plt.style.use('dark_background')
32 | plt.figure(figsize=(10, 10))
33 | if eval:
34 | plt.title('Evaluation of trained A2C with Shared Adam Optimizer on CartPole_V0', fontsize='xx-large')
35 | plt.xlabel('Rewards', fontsize='xx-large')
36 | plt.ylabel('Frequency', fontsize='xx-large')
37 | plt.hist(self.steps, range=(0, 200))
38 | plt.show()
39 | else:
40 | mid = []
41 | interval = 3
42 | for i in range(len(self.steps) - interval):
43 | mid.append(np.mean(self.steps[i:i + interval + 1]))
44 | plt.title('Performance of A2C with Shared Adam Optimizer on CartPole_V0', fontsize='xx-large')
45 | plt.xlabel('Episodes', fontsize='xx-large')
46 | plt.ylabel('Rewards', fontsize='xx-large')
47 | x_fit = list(range(len(self.steps) - interval))
48 | plt.plot(x_fit, self.steps[interval:], '-', c='gray', label='Episode-Wise data')
49 | plt.plot(mid, '-', c='green', linewidth=5, label='Moving Average')
50 | plt.legend(loc="best", prop={'size': 12})
51 | plt.show()
52 |
53 |
54 | if __name__ == '__main__':
55 | device = 'cpu'
56 | mp.set_start_method('spawn')
57 | # Do not change this unless you have multiple GPU.
58 | # update test
59 | q = mp.Queue()
60 | num_workers = 7
61 | processes = []
62 | shared_model = Actor()
63 | shared_model.to(device)
64 | shared_model.share_memory()
65 | optimizer = SharedAdam(shared_model.parameters(), lr=0.003)
66 | t1_start = perf_counter()
67 | for episode in range(10000):
68 | p = mp.Process(target=evaluate, args=(shared_model, q))
69 | processes.append(p)
70 | p.start()
71 | for worker_id in range(num_workers):
72 | p = mp.Process(target = worker, args = (shared_model, optimizer))
73 | processes.append(p)
74 | p.start()
75 | for p in processes:
76 | p.join()
77 | shared_model.steps.append(q.get())
78 | print(f'Training on episode {episode} Step {shared_model.steps[-1]}')
79 | if np.mean(shared_model.steps[-25:]) == 199:
80 | break
81 | t1_stop = perf_counter()
82 | print("Elapsed time during the whole program in seconds:",
83 | t1_stop - t1_start)
84 | shared_model.draw()
85 |     shared_model.steps = []
86 |     for episode in range(15):
87 |         eval_processes = []
88 |         for worker_id in range(6):
89 |             p = mp.Process(target=evaluate, args=(shared_model, q))
90 |             eval_processes.append(p)
91 |             p.start()
92 |         for p in eval_processes:
93 |             p.join()
94 |     while not q.empty():
95 |         shared_model.steps.append(q.get())
96 |     shared_model.steps.sort()
97 |     shared_model.draw(eval=True)
--------------------------------------------------------------------------------
/A2C/A2C Episodic Sync/workers.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import torch
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | from torch.nn import Linear, ReLU
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 |
9 |
10 | def worker(shared_model, optimizer):
11 | class Actor(torch.nn.Module):
12 | def __init__(self):
13 | super(Actor, self).__init__()
14 | self.fc1 = Linear(4, 128)
15 | self.fc2 = Linear(128, 128)
16 | self.fc3 = Linear(128, 2)
17 | self.fc4 = Linear(128, 1)
18 | self.steps = []
19 |
20 | def forward(self, x):
21 | x = F.relu(self.fc1(x))
22 | x = F.relu(self.fc2(x))
23 | action = F.log_softmax(self.fc3(x), dim=-1)
24 | V = F.relu(self.fc4(x))
25 | return action, V
26 |
27 | device = 'cpu'
28 | # I do not recommend using GPU for this method. CPU is much faster.
29 | # Change this to cuda only if you have a poor CPU or on a cloud
30 | env = gym.make('CartPole-v0')
31 | obs = env.reset()
32 | actor = Actor()
33 | actor.to(device)
34 | gamma = 0.99
35 | steps = []
36 | eps = np.finfo(np.float32).eps.item()
37 | for episode in range(1):
38 | action_log_history = []
39 | V_history = []
40 | for step in range(200):
41 | actor.load_state_dict(shared_model.state_dict())
42 | # -----lines below are line-corresponding to the original algorithm----
43 | obs = np.reshape(obs, [1, -1])
44 | input_actor = Variable(torch.from_numpy(obs).float()).to(device)
45 | action_log_probability, V = actor(input_actor)
46 | p = np.exp(action_log_probability[0].detach().cpu())
47 | action = np.random.choice(2, p=p.numpy())
48 | action_log_history.append(action_log_probability[0][action])
49 | V_history.append(V)
50 | obs, reward, done, info = env.step(action)
51 | if done:
52 | if step == 199:
53 | break
54 | actor.zero_grad()
55 | steps.append(step)
56 | print(f'episode {episode}, step {step}', end='\r')
57 | obs = env.reset()
58 | reward_list = np.ones((step + 1,))
59 | for i in range(len(reward_list) - 2, -1, -1):
60 | reward_list[i] += reward_list[i + 1] * gamma
61 | reward_list -= np.mean(reward_list)
62 | reward_list /= (np.std(reward_list) + eps)
63 | Critic_Loss = []
64 | Delta = []
65 | for monte_carlo_return, V in zip(reward_list, V_history):
66 | Critic_Loss.append(F.smooth_l1_loss(V, torch.tensor([[monte_carlo_return]]).to(device)))
67 | Delta.append(monte_carlo_return - V.detach())
68 | Actor_Loss = []
69 | entropy = 0
70 | for log_p in action_log_history:
71 | entropy -= log_p * torch.exp(log_p)
72 | for delta, log_prob in zip(Delta, action_log_history):
73 | Actor_Loss.append(-log_prob * delta.detach())
74 | loss = torch.stack(Critic_Loss).sum() + torch.stack(Actor_Loss).sum() + entropy*0.01
75 | loss.backward()
76 | ensure_shared_grads(actor, shared_model)
77 | optimizer.step()
78 | break
79 |
80 |
81 | def ensure_shared_grads(model, shared_model):
82 | for param, shared_param in zip(model.parameters(),
83 | shared_model.parameters()):
84 | if shared_param.grad is not None:
85 | return
86 | shared_param._grad = param.grad
87 |
--------------------------------------------------------------------------------
/A2C/A2C Time Interval Sync/SharedAdam.py:
--------------------------------------------------------------------------------
1 | # Copied from https://github.com/ikostrikov/pytorch-a3c/blob/master/my_optim.py
2 | # A very nice optimization of Adam method to make it receive shared states.
3 | import math
4 | import torch
5 | import torch.optim as optim
6 |
7 |
8 | class SharedAdam(optim.Adam):
9 | """Implements Adam algorithm with shared states.
10 | """
11 |
12 | def __init__(self,
13 | params,
14 | lr=1e-3,
15 | betas=(0.9, 0.999),
16 | eps=1e-8,
17 | weight_decay=0):
18 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay)
19 |
20 | for group in self.param_groups:
21 | for p in group['params']:
22 | state = self.state[p]
23 | state['step'] = torch.zeros(1)
24 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_()
25 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_()
26 |
27 | def share_memory(self):
28 | for group in self.param_groups:
29 | for p in group['params']:
30 | state = self.state[p]
31 | state['step'].share_memory_()
32 | state['exp_avg'].share_memory_()
33 | state['exp_avg_sq'].share_memory_()
34 |
35 | def step(self, closure=None):
36 | """Performs a single optimization step.
37 | Arguments:
38 | closure (callable, optional): A closure that reevaluates the model
39 | and returns the loss.
40 | """
41 | loss = None
42 | if closure is not None:
43 | loss = closure()
44 |
45 | for group in self.param_groups:
46 | for p in group['params']:
47 | if p.grad is None:
48 | continue
49 | grad = p.grad.data
50 | state = self.state[p]
51 |
52 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
53 | beta1, beta2 = group['betas']
54 |
55 | state['step'] += 1
56 |
57 | if group['weight_decay'] != 0:
58 | grad = grad.add(group['weight_decay'], p.data)
59 |
60 | # Decay the first and second moment running average coefficient
61 | exp_avg.mul_(beta1).add_(1 - beta1, grad)
62 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
63 |
64 | denom = exp_avg_sq.sqrt().add_(group['eps'])
65 |
66 | bias_correction1 = 1 - beta1 ** state['step'].item()
67 | bias_correction2 = 1 - beta2 ** state['step'].item()
68 | step_size = group['lr'] * math.sqrt(
69 | bias_correction2) / bias_correction1
70 |
71 | p.data.addcdiv_(-step_size, exp_avg, denom)
72 |
73 | return loss
74 |
--------------------------------------------------------------------------------
/A2C/A2C Time Interval Sync/evaluate.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import gym
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | from torch.nn import Linear, ReLU
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 |
9 |
10 | def evaluate(shared_model, q):
11 | class Actor(torch.nn.Module):
12 | def __init__(self):
13 | super(Actor, self).__init__()
14 | self.fc1 = Linear(4, 128)
15 | self.fc2 = Linear(128, 128)
16 | self.fc3 = Linear(128, 2)
17 | self.fc4 = Linear(128, 1)
18 | self.steps = []
19 |
20 | def forward(self, x):
21 | x = F.relu(self.fc1(x))
22 | x = F.relu(self.fc2(x))
23 | action = F.log_softmax(self.fc3(x), dim=-1)
24 | V = F.relu(self.fc4(x))
25 | return action, V
26 |
27 | device = 'cpu'
28 | # I do not recommend using GPU for this method. CPU is much faster.
29 | # Change this to cuda only if you have a poor CPU or on a cloud
30 | env = gym.make('CartPole-v0')
31 | obs = env.reset()
32 | actor = Actor()
33 | actor.to(device)
34 | for episode in range(1):
35 | action_log_history = []
36 | for step in range(200):
37 | actor.load_state_dict(shared_model.state_dict())
38 | # -----lines below are line-corresponding to the original algorithm----
39 | obs = np.reshape(obs, [1, -1])
40 | input_actor = Variable(torch.from_numpy(obs).float()).to(device)
41 | action_log_probability, V = actor(input_actor)
42 | p = np.exp(action_log_probability[0].detach().cpu())
43 | action = np.random.choice(2, p=p.numpy())
44 | action_log_history.append(action_log_probability[0][action])
45 | obs, reward, done, info = env.step(action)
46 | if done:
47 | q.put(step)
48 | return
49 |
--------------------------------------------------------------------------------
/A2C/A2C Time Interval Sync/pyTorch_CartPole_A2C_IS.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import torch
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | from torch.nn import Linear, ReLU
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 | import torch.multiprocessing as mp
9 | from workers import worker
10 | from evaluate import evaluate
11 | from SharedAdam import SharedAdam
12 | from time import perf_counter
13 |
14 |
15 | class Actor(torch.nn.Module):
16 | def __init__(self):
17 | super(Actor, self).__init__()
18 | self.fc1 = Linear(4, 128)
19 | self.fc2 = Linear(128, 128)
20 | self.fc3 = Linear(128, 2)
21 | self.fc4 = Linear(128, 1)
22 | self.steps = []
23 |
24 | def forward(self, x):
25 | x = F.relu(self.fc1(x))
26 | x = F.relu(self.fc2(x))
27 | action = F.log_softmax(self.fc3(x), dim=-1)
28 | V = F.relu(self.fc4(x))
29 | return action, V
30 |
31 | def draw(self, eval=False):
32 | plt.style.use('dark_background')
33 | plt.figure(figsize=(10, 10))
34 | if eval:
35 | plt.title('Evaluation of trained A2C with Shared Adam Optimizer on CartPole_V0', fontsize='xx-large')
36 | plt.xlabel('Rewards', fontsize='xx-large')
37 | plt.ylabel('Frequency', fontsize='xx-large')
38 | plt.hist(self.steps, range=(0, 200))
39 | plt.show()
40 | else:
41 | mid = []
42 | interval = 3
43 | for i in range(len(self.steps) - interval):
44 | mid.append(np.mean(self.steps[i:i + interval + 1]))
45 | plt.title('Performance of A2C with Shared Adam Optimizer on CartPole_V0', fontsize='xx-large')
46 | plt.xlabel('Episodes', fontsize='xx-large')
47 | plt.ylabel('Rewards', fontsize='xx-large')
48 | x_fit = list(range(len(self.steps) - interval))
49 | plt.plot(x_fit, self.steps[interval:], '-', c='gray', label='Episode-Wise data')
50 | plt.plot(mid, '-', c='green', linewidth=5, label='Moving Average')
51 | plt.legend(loc="best", prop={'size': 12})
52 | plt.show()
53 |
54 |
55 | if __name__ == '__main__':
56 | device = 'cpu'
57 | mp.set_start_method('spawn')
58 | # Do not change this unless you have multiple GPU.
59 | # update test
60 | q = mp.Queue()
61 | num_workers = 7
62 | T = 150
63 | processes = []
64 | shared_model = Actor()
65 | shared_model.to(device)
66 | shared_model.share_memory()
67 | optimizer = SharedAdam(shared_model.parameters(), lr=0.003)
68 | t1_start = perf_counter()
69 | for episode in range(10000):
70 | p = mp.Process(target=evaluate, args=(shared_model, q))
71 | processes.append(p)
72 | p.start()
73 | for worker_id in range(num_workers):
74 | p = mp.Process(target=worker, args=(shared_model, optimizer, T))
75 | processes.append(p)
76 | p.start()
77 | for p in processes:
78 | p.join()
79 | shared_model.steps.append(q.get())
80 | print(f'Training on episode {episode} Step {shared_model.steps[-1]}')
81 | if np.mean(shared_model.steps[-25:]) == 199:
82 | break
83 | t1_stop = perf_counter()
84 | print("Elapsed time during the whole program in seconds:",
85 | t1_stop - t1_start)
86 | shared_model.draw()
87 |     shared_model.steps = []
88 |     for episode in range(15):
89 |         eval_processes = []
90 |         for worker_id in range(6):
91 |             p = mp.Process(target=evaluate, args=(shared_model, q))
92 |             eval_processes.append(p)
93 |             p.start()
94 |         for p in eval_processes:
95 |             p.join()
96 |     while not q.empty():
97 |         shared_model.steps.append(q.get())
98 |     shared_model.steps.sort()
99 |     shared_model.draw(eval=True)
--------------------------------------------------------------------------------
/A2C/A2C Time Interval Sync/workers.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import torch
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | from torch.nn import Linear, ReLU
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 |
9 |
10 | def worker(shared_model, optimizer, T):
11 | class Actor(torch.nn.Module):
12 | def __init__(self):
13 | super(Actor, self).__init__()
14 | self.fc1 = Linear(4, 128)
15 | self.fc2 = Linear(128, 128)
16 | self.fc3 = Linear(128, 2)
17 | self.fc4 = Linear(128, 1)
18 | self.steps = []
19 |
20 | def forward(self, x):
21 | x = F.relu(self.fc1(x))
22 | x = F.relu(self.fc2(x))
23 | action = F.log_softmax(self.fc3(x), dim=-1)
24 | V = F.relu(self.fc4(x))
25 | return action, V
26 |
27 | device = 'cpu'
28 | # I do not recommend using GPU for this method. CPU is much faster.
29 | # Change this to cuda only if you have a poor CPU or on a cloud
30 | env = gym.make('CartPole-v0')
31 | obs = env.reset()
32 | actor = Actor()
33 | actor.to(device)
34 | gamma = 0.99
35 | steps = []
36 | eps = np.finfo(np.float32).eps.item()
37 | actor.load_state_dict(shared_model.state_dict())
38 | t = 0
39 | for episode in range(1):
40 | action_log_history = []
41 | V_history = []
42 | for step in range(200):
43 | # -----lines below are line-corresponding to the original algorithm----
44 | obs = np.reshape(obs, [1, -1])
45 | input_actor = Variable(torch.from_numpy(obs).float()).to(device)
46 | action_log_probability, V = actor(input_actor)
47 | p = np.exp(action_log_probability[0].detach().cpu())
48 | action = np.random.choice(2, p=p.numpy())
49 | action_log_history.append(action_log_probability[0][action])
50 | V_history.append(V)
51 | obs, reward, done, info = env.step(action)
52 | t += 1
53 | if done or t >= T:
54 | if step == 199:
55 | break
56 | actor.zero_grad()
57 | steps.append(step)
58 | if done:
59 | print(f'episode {episode}, step {step}', end='\r')
60 | obs = env.reset()
61 | reward_list = np.ones((step + 1,))
62 | for i in range(len(reward_list) - 2, -1, -1):
63 | reward_list[i] += reward_list[i + 1] * gamma
64 | reward_list -= np.mean(reward_list)
65 | reward_list /= (np.std(reward_list) + eps)
66 | Critic_Loss = []
67 | Delta = []
68 | for monte_carlo_return, V in zip(reward_list, V_history):
69 | Critic_Loss.append(F.smooth_l1_loss(V, torch.tensor([[monte_carlo_return]]).to(device)))
70 | Delta.append(monte_carlo_return - V.detach())
71 | Actor_Loss = []
72 | entropy = 0
73 | for log_p in action_log_history:
74 | entropy -= log_p * torch.exp(log_p)
75 | for delta, log_prob in zip(Delta, action_log_history):
76 | Actor_Loss.append(-log_prob * delta.detach())
77 | loss = torch.stack(Critic_Loss).sum() + torch.stack(Actor_Loss).sum() + entropy * 0.01
78 | loss.backward()
79 | ensure_shared_grads(actor, shared_model)
80 | optimizer.step()
81 | break
82 |
83 |
84 | def ensure_shared_grads(model, shared_model):
85 | for param, shared_param in zip(model.parameters(),
86 | shared_model.parameters()):
87 | if shared_param.grad is not None:
88 | return
89 | shared_param._grad = param.grad
90 |
--------------------------------------------------------------------------------
/A2C/README About Vairations.md:
--------------------------------------------------------------------------------
1 | ## Review of A2C and A3C
2 | A3C uses a fixed time interval (a step count) to control its asynchronous
3 | updates. In that procedure, each child process returns its gradient after a
4 | given number of steps, or earlier if the episode finishes, and syncs with
5 | the shared model before its next segment.
6 | 
7 | A2C, however, waits for every child process to finish its segment before
8 | updating.
9 | 
10 | ## My Variation
11 | In this project, I change the sync/async scheme of A2C and A3C to be
12 | episode-wise. That is, a child process only returns its gradient AFTER it
13 | completes the episode.
14 | 
15 | In the A3C variation, a child process returns its gradient as soon as it
16 | finishes the current episode, performs the backward pass, and then syncs
17 | with the latest model parameters before continuing, all inside an infinite
18 | loop. This method never calls join() on the processes; instead, it monitors
19 | a queue filled with each child process's game record. If the last 100
20 | entries are all at the maximum reward, it sends a terminate message to all
21 | child processes.
22 | 
23 | In the A2C variation, a child process also returns its gradient as soon as
24 | it finishes the current episode, but all processes wait for every worker to
25 | finish and then sync with the updated model together. In this case we need
26 | a loop that calls join() on every iteration; the loop ends once the queue
27 | fulfills the convergence requirement. (A minimal sketch of this loop is
28 | included at the end of this file.)
29 | 
30 | Additionally, I have set every mode to stop learning entirely once it
31 | reaches the maximum reward, to help convergence. [needs TESTING]
32 | 
33 | ## More Variations:
34 | The child processes differ very little from one another, so varying them
35 | could be an exploratory direction. However, measuring the differences
36 | between the methods would take a large amount of time.
37 | 
38 | ## Additional Warnings:
39 | When initializing multiprocessing, fork does not work; if you are on Linux
40 | or Mac, change the start method to spawn. The reason is unclear to me at
41 | this point.
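42 | 
43 | ## Sketch of the Episodic-Sync Loop
44 | The snippet below is only an illustrative sketch (not the exact code in
45 | pyTorch_CartPole_A2C_ES.py) of the synchronous, episode-wise loop described
46 | above: spawn the evaluator and the workers, join them every round, read the
47 | evaluation queue, and stop once the convergence requirement is met. The
48 | Actor re-declares the network used by the workers in this folder, and the
49 | hyperparameters simply mirror the scripts here.
50 | 
51 | ```python
52 | import numpy as np
53 | import torch
54 | import torch.nn.functional as F
55 | import torch.multiprocessing as mp
56 | from torch.nn import Linear
57 | from SharedAdam import SharedAdam
58 | from workers import worker
59 | from evaluate import evaluate
60 | 
61 | 
62 | class Actor(torch.nn.Module):
63 |     """Same architecture as the Actor used by the workers in this folder."""
64 |     def __init__(self):
65 |         super().__init__()
66 |         self.fc1, self.fc2 = Linear(4, 128), Linear(128, 128)
67 |         self.fc3, self.fc4 = Linear(128, 2), Linear(128, 1)
68 | 
69 |     def forward(self, x):
70 |         x = F.relu(self.fc2(F.relu(self.fc1(x))))
71 |         return F.log_softmax(self.fc3(x), dim=-1), F.relu(self.fc4(x))
72 | 
73 | 
74 | if __name__ == '__main__':
75 |     mp.set_start_method('spawn')   # fork is unreliable here; use spawn
76 |     q = mp.Queue()                 # the evaluator reports episode length here
77 |     shared_model = Actor()
78 |     shared_model.share_memory()    # workers update these parameters in place
79 |     optimizer = SharedAdam(shared_model.parameters(), lr=0.003)
80 |     scores = []
81 | 
82 |     for episode in range(10000):
83 |         procs = [mp.Process(target=evaluate, args=(shared_model, q))]
84 |         procs += [mp.Process(target=worker, args=(shared_model, optimizer))
85 |                   for _ in range(7)]
86 |         for p in procs:
87 |             p.start()
88 |         for p in procs:            # synchronous step: wait for every child
89 |             p.join()
90 |         scores.append(q.get())     # one evaluation episode per round
91 |         if np.mean(scores[-25:]) == 199:   # convergence requirement
92 |             break
93 | ```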
--------------------------------------------------------------------------------
/A3C/A3C Episodic Async/SharedAdam.py:
--------------------------------------------------------------------------------
1 | # Copied from https://github.com/ikostrikov/pytorch-a3c/blob/master/my_optim.py
2 | # A very nice optimization of Adam method to make it receive shared states.
3 | import math
4 | import torch
5 | import torch.optim as optim
6 |
7 |
8 | class SharedAdam(optim.Adam):
9 | """Implements Adam algorithm with shared states.
10 | """
11 |
12 | def __init__(self,
13 | params,
14 | lr=1e-3,
15 | betas=(0.9, 0.999),
16 | eps=1e-8,
17 | weight_decay=0):
18 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay)
19 |
20 | for group in self.param_groups:
21 | for p in group['params']:
22 | state = self.state[p]
23 | state['step'] = torch.zeros(1)
24 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_()
25 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_()
26 |
27 | def share_memory(self):
28 | for group in self.param_groups:
29 | for p in group['params']:
30 | state = self.state[p]
31 | state['step'].share_memory_()
32 | state['exp_avg'].share_memory_()
33 | state['exp_avg_sq'].share_memory_()
34 |
35 | def step(self, closure=None):
36 | """Performs a single optimization step.
37 | Arguments:
38 | closure (callable, optional): A closure that reevaluates the model
39 | and returns the loss.
40 | """
41 | loss = None
42 | if closure is not None:
43 | loss = closure()
44 |
45 | for group in self.param_groups:
46 | for p in group['params']:
47 | if p.grad is None:
48 | continue
49 | grad = p.grad.data
50 | state = self.state[p]
51 |
52 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
53 | beta1, beta2 = group['betas']
54 |
55 | state['step'] += 1
56 |
57 | if group['weight_decay'] != 0:
58 | grad = grad.add(group['weight_decay'], p.data)
59 |
60 | # Decay the first and second moment running average coefficient
61 | exp_avg.mul_(beta1).add_(1 - beta1, grad)
62 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
63 |
64 | denom = exp_avg_sq.sqrt().add_(group['eps'])
65 |
66 | bias_correction1 = 1 - beta1 ** state['step'].item()
67 | bias_correction2 = 1 - beta2 ** state['step'].item()
68 | step_size = group['lr'] * math.sqrt(
69 | bias_correction2) / bias_correction1
70 |
71 | p.data.addcdiv_(-step_size, exp_avg, denom)
72 |
73 | return loss
74 |
--------------------------------------------------------------------------------
/A3C/A3C Episodic Async/evaluate.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import gym
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | from torch.nn import Linear, ReLU
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 |
9 |
10 | def evaluate(shared_model, q):
11 | class Actor(torch.nn.Module):
12 | def __init__(self):
13 | super(Actor, self).__init__()
14 | self.fc1 = Linear(4, 128)
15 | self.fc2 = Linear(128, 128)
16 | self.fc3 = Linear(128, 2)
17 | self.fc4 = Linear(128, 1)
18 | self.steps = []
19 |
20 | def forward(self, x):
21 | x = F.relu(self.fc1(x))
22 | x = F.relu(self.fc2(x))
23 | action = F.log_softmax(self.fc3(x), dim=-1)
24 | V = F.relu(self.fc4(x))
25 | return action, V
26 |
27 | device = 'cpu'
28 | # I do not recommend using GPU for this method. CPU is much faster.
29 | # Change this to cuda only if you have a poor CPU or on a cloud
30 | env = gym.make('CartPole-v0')
31 | obs = env.reset()
32 | actor = Actor()
33 | actor.to(device)
34 | for episode in range(1):
35 | action_log_history = []
36 | for step in range(200):
37 | actor.load_state_dict(shared_model.state_dict())
38 | # -----lines below are line-corresponding to the original algorithm----
39 | obs = np.reshape(obs, [1, -1])
40 | input_actor = Variable(torch.from_numpy(obs).float()).to(device)
41 | action_log_probability, V = actor(input_actor)
42 | p = np.exp(action_log_probability[0].detach().cpu())
43 | action = np.random.choice(2, p=p.numpy())
44 | action_log_history.append(action_log_probability[0][action])
45 | obs, reward, done, info = env.step(action)
46 | if done:
47 | q.put(step)
48 | return
49 |
--------------------------------------------------------------------------------
/A3C/A3C Episodic Async/pyTorch_CartPole_A3C_EA.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from torch.nn import Linear
5 | import torch.nn.functional as F
6 | import torch.multiprocessing as mp
7 | from workers_PlayGround import worker
8 | from evaluate import evaluate
9 | from SharedAdam import SharedAdam
10 |
11 |
12 | class Actor(torch.nn.Module):
13 | def __init__(self):
14 | super(Actor,self).__init__()
15 | self.fc1 = Linear(4, 128)
16 | self.fc2 = Linear(128, 128)
17 | self.fc3 = Linear(128, 2)
18 | self.fc4 = Linear(128, 1)
19 | self.steps = []
20 |
21 | def forward(self, x):
22 | x = F.relu(self.fc1(x))
23 | x = F.relu(self.fc2(x))
24 | action = F.log_softmax(self.fc3(x), dim=-1)
25 | V = F.relu(self.fc4(x))
26 | return action, V
27 |
28 | def draw(self, eval = False):
29 | plt.style.use('dark_background')
30 | plt.figure(figsize=(10, 10))
31 | if eval:
32 | plt.title('Evaluation of trained A3C with Shared Adam Optimizer on CartPole_V0', fontsize='xx-large')
33 | plt.xlabel('Rewards', fontsize='xx-large')
34 | plt.ylabel('Frequency', fontsize='xx-large')
35 | plt.hist(self.steps, range=(0, 200))
36 | plt.show()
37 | else:
38 | mid = []
39 | interval = 3
40 | for i in range(len(self.steps) - interval):
41 | mid.append(np.mean(self.steps[i:i + interval + 1]))
42 | plt.title('Performance of True Episode-Wise A3C on CartPole_V0', fontsize='xx-large')
43 | plt.xlabel('Episodes', fontsize='xx-large')
44 | plt.ylabel('Rewards', fontsize='xx-large')
45 | x_fit = list(range(len(self.steps) - interval))
46 | plt.plot(x_fit, self.steps[interval:], '-', c='gray', label='Episode-Wise data')
47 | plt.plot(mid, '-', c='green', linewidth=5, label='Moving Average')
48 | plt.legend(loc="best", prop={'size': 12})
49 | plt.show()
50 |
51 |
52 | if __name__ == '__main__':
53 | device = 'cpu'
54 | mp.set_start_method('spawn')
55 | # Do not change this unless you have multiple GPU.
56 | # update test
57 | q = mp.Queue()
58 | num_workers = 7
59 | processes = []
60 | shared_model = Actor()
61 | shared_model.to(device)
62 | shared_model.share_memory()
63 | optimizer = SharedAdam(shared_model.parameters(), lr=0.001)
64 | p = mp.Process(target=evaluate, args=(shared_model, q))
65 | processes.append(p)
66 | p.start()
67 | for worker_id in range(num_workers):
68 | p = mp.Process(target = worker, args = (shared_model, optimizer, q))
69 | processes.append(p)
70 | p.start()
71 | # for p in processes:
72 | # p.join()
73 | episode = 0
74 |
75 | while True:
76 | if not q.empty():
77 | shared_model.steps.append(q.get())
78 | print(f'Training on episode {episode} Step {shared_model.steps[-1]}')
79 | episode += 1
80 | if len(shared_model.steps) > 25:
81 | if np.mean(shared_model.steps[-100:]) == 199:
82 | for p in processes:
83 | p.terminate()
84 | while not q.empty():
85 | shared_model.steps.append(q.get())
86 | print(f'Training on episode {episode} Step {shared_model.steps[-1]}')
87 | episode += 1
88 | break
89 | shared_model.draw()
90 | # ----evaluation----
91 |     shared_model.steps = []
92 |     for episode in range(15):
93 |         eval_processes = []
94 |         for worker_id in range(6):
95 |             p = mp.Process(target=evaluate, args=(shared_model, q))
96 |             eval_processes.append(p)
97 |             p.start()
98 |         for p in eval_processes:
99 |             p.join()
100 |     while not q.empty():
101 |         shared_model.steps.append(q.get())
102 |     shared_model.steps.sort()
103 |     shared_model.draw(eval=True)
--------------------------------------------------------------------------------
/A3C/A3C Episodic Async/workers_PlayGround.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import torch
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | from torch.nn import Linear, ReLU
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 |
9 |
10 | def worker(shared_model, optimizer,q):
11 | class Actor(torch.nn.Module):
12 | def __init__(self):
13 | super(Actor, self).__init__()
14 | self.fc1 = Linear(4, 128)
15 | self.fc2 = Linear(128, 128)
16 | self.fc3 = Linear(128, 2)
17 | self.fc4 = Linear(128, 1)
18 | self.steps = []
19 |
20 | def forward(self, x):
21 | x = F.relu(self.fc1(x))
22 | x = F.relu(self.fc2(x))
23 | action = F.log_softmax(self.fc3(x), dim=-1)
24 | V = F.relu(self.fc4(x))
25 | return action, V
26 |
27 | device = 'cpu'
28 | # I do not recommend using GPU for this method. CPU is much faster.
29 | # Change this to cuda only if you have a poor CPU or on a cloud
30 | env = gym.make('CartPole-v0')
31 | obs = env.reset()
32 | actor = Actor()
33 | actor.to(device)
34 | gamma = 0.99
35 | eps = np.finfo(np.float32).eps.item()
36 | while True:
37 | action_log_history = []
38 | V_history = []
39 | actor.load_state_dict(shared_model.state_dict())
40 | for step in range(200):
41 | # -----lines below are line-corresponding to the original algorithm----
42 | obs = np.reshape(obs, [1, -1])
43 | input_actor = Variable(torch.from_numpy(obs).float()).to(device)
44 | action_log_probability, V = actor(input_actor)
45 | p = np.exp(action_log_probability[0].detach().cpu())
46 | action = np.random.choice(2, p=p.numpy())
47 | action_log_history.append(action_log_probability[0][action])
48 | V_history.append(V)
49 | obs, reward, done, info = env.step(action)
50 | if done:
51 | q.put(step)
52 | actor.zero_grad()
53 | obs = env.reset()
54 | if step == 199:
55 | break
56 | reward_list = np.ones((step + 1,))
57 | for i in range(len(reward_list) - 2, -1, -1):
58 | reward_list[i] += reward_list[i + 1] * gamma
59 | reward_list -= np.mean(reward_list)
60 | reward_list /= (np.std(reward_list) + eps)
61 | Critic_Loss = []
62 | Delta = []
63 | for monte_carlo_return, V in zip(reward_list, V_history):
64 | Critic_Loss.append(F.smooth_l1_loss(V, torch.tensor([[monte_carlo_return]]).to(device)))
65 | Delta.append(monte_carlo_return - V.detach())
66 | Actor_Loss = []
67 | entropy = 0
68 | for log_p in action_log_history:
69 | entropy -= log_p * torch.exp(log_p)
70 | for delta, log_prob in zip(Delta, action_log_history):
71 | Actor_Loss.append(-log_prob * delta.detach())
72 | loss = torch.stack(Critic_Loss).sum() + torch.stack(Actor_Loss).sum() + entropy*0.01
73 | loss.backward()
74 | ensure_shared_grads(actor, shared_model)
75 | optimizer.step()
76 | break
77 |
78 |
79 | def ensure_shared_grads(model, shared_model):
80 | for param, shared_param in zip(model.parameters(),
81 | shared_model.parameters()):
82 | if shared_param.grad is not None:
83 | return
84 | shared_param._grad = param.grad
85 |
--------------------------------------------------------------------------------
/A3C/A3C Time Interval Async/SharedAdam.py:
--------------------------------------------------------------------------------
1 | # Copied from https://github.com/ikostrikov/pytorch-a3c/blob/master/my_optim.py
2 | # A very nice optimization of Adam method to make it receive shared states.
3 | import math
4 | import torch
5 | import torch.optim as optim
6 |
7 |
8 | class SharedAdam(optim.Adam):
9 | """Implements Adam algorithm with shared states.
10 | """
11 |
12 | def __init__(self,
13 | params,
14 | lr=1e-3,
15 | betas=(0.9, 0.999),
16 | eps=1e-8,
17 | weight_decay=0):
18 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay)
19 |
20 | for group in self.param_groups:
21 | for p in group['params']:
22 | state = self.state[p]
23 | state['step'] = torch.zeros(1)
24 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_()
25 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_()
26 |
27 | def share_memory(self):
28 | for group in self.param_groups:
29 | for p in group['params']:
30 | state = self.state[p]
31 | state['step'].share_memory_()
32 | state['exp_avg'].share_memory_()
33 | state['exp_avg_sq'].share_memory_()
34 |
35 | def step(self, closure=None):
36 | """Performs a single optimization step.
37 | Arguments:
38 | closure (callable, optional): A closure that reevaluates the model
39 | and returns the loss.
40 | """
41 | loss = None
42 | if closure is not None:
43 | loss = closure()
44 |
45 | for group in self.param_groups:
46 | for p in group['params']:
47 | if p.grad is None:
48 | continue
49 | grad = p.grad.data
50 | state = self.state[p]
51 |
52 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
53 | beta1, beta2 = group['betas']
54 |
55 | state['step'] += 1
56 |
57 | if group['weight_decay'] != 0:
58 | grad = grad.add(group['weight_decay'], p.data)
59 |
60 | # Decay the first and second moment running average coefficient
61 | exp_avg.mul_(beta1).add_(1 - beta1, grad)
62 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
63 |
64 | denom = exp_avg_sq.sqrt().add_(group['eps'])
65 |
66 | bias_correction1 = 1 - beta1 ** state['step'].item()
67 | bias_correction2 = 1 - beta2 ** state['step'].item()
68 | step_size = group['lr'] * math.sqrt(
69 | bias_correction2) / bias_correction1
70 |
71 | p.data.addcdiv_(-step_size, exp_avg, denom)
72 |
73 | return loss
74 |
--------------------------------------------------------------------------------
/A3C/A3C Time Interval Async/evaluate.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import gym
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | from torch.nn import Linear, ReLU
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 |
9 |
10 | def evaluate(shared_model, q):
11 | class Actor(torch.nn.Module):
12 | def __init__(self):
13 | super(Actor, self).__init__()
14 | self.fc1 = Linear(4, 128)
15 | self.fc2 = Linear(128, 128)
16 | self.fc3 = Linear(128, 2)
17 | self.fc4 = Linear(128, 1)
18 | self.steps = []
19 |
20 | def forward(self, x):
21 | x = F.relu(self.fc1(x))
22 | x = F.relu(self.fc2(x))
23 | action = F.log_softmax(self.fc3(x), dim=-1)
24 | V = F.relu(self.fc4(x))
25 | return action, V
26 |
27 | device = 'cpu'
28 | # I do not recommend using GPU for this method. CPU is much faster.
29 | # Change this to cuda only if you have a poor CPU or on a cloud
30 | env = gym.make('CartPole-v0')
31 | obs = env.reset()
32 | actor = Actor()
33 | actor.to(device)
34 | for episode in range(1):
35 | action_log_history = []
36 | for step in range(200):
37 | actor.load_state_dict(shared_model.state_dict())
38 | # -----lines below are line-corresponding to the original algorithm----
39 | obs = np.reshape(obs, [1, -1])
40 | input_actor = Variable(torch.from_numpy(obs).float()).to(device)
41 | action_log_probability, V = actor(input_actor)
42 | p = np.exp(action_log_probability[0].detach().cpu())
43 | action = np.random.choice(2, p=p.numpy())
44 | action_log_history.append(action_log_probability[0][action])
45 | obs, reward, done, info = env.step(action)
46 | if done:
47 | q.put(step)
48 | return
49 |
--------------------------------------------------------------------------------
/A3C/A3C Time Interval Async/pyTorch_CartPole_A3C_IA.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from torch.nn import Linear
5 | import torch.nn.functional as F
6 | import torch.multiprocessing as mp
7 | from workers import worker
8 | from evaluate import evaluate
9 | from SharedAdam import SharedAdam
10 |
11 |
12 | class Actor(torch.nn.Module):
13 | def __init__(self):
14 | super(Actor, self).__init__()
15 | self.fc1 = Linear(4, 128)
16 | self.fc2 = Linear(128, 128)
17 | self.fc3 = Linear(128, 2)
18 | self.fc4 = Linear(128, 1)
19 | self.steps = []
20 |
21 | def forward(self, x):
22 | x = F.relu(self.fc1(x))
23 | x = F.relu(self.fc2(x))
24 | action = F.log_softmax(self.fc3(x), dim=-1)
25 | V = F.relu(self.fc4(x))
26 | return action, V
27 |
28 | def draw(self, eval=False):
29 | plt.style.use('dark_background')
30 | plt.figure(figsize=(10, 10))
31 | if eval:
32 | plt.title('Evaluation of trained A3C with Shared Adam Optimizer on CartPole_V0', fontsize='xx-large')
33 | plt.xlabel('Rewards', fontsize='xx-large')
34 | plt.ylabel('Frequency', fontsize='xx-large')
35 | plt.hist(self.steps, range=(0, 200))
36 | plt.show()
37 | else:
38 | mid = []
39 | interval = 3
40 | for i in range(len(self.steps) - interval):
41 | mid.append(np.mean(self.steps[i:i + interval + 1]))
42 | plt.title('Performance of True Episode-Wise A3C on CartPole_V0', fontsize='xx-large')
43 | plt.xlabel('Episodes', fontsize='xx-large')
44 | plt.ylabel('Rewards', fontsize='xx-large')
45 | x_fit = list(range(len(self.steps) - interval))
46 | plt.plot(x_fit, self.steps[interval:], '-', c='gray', label='Episode-Wise data')
47 | plt.plot(mid, '-', c='green', linewidth=5, label='Moving Average')
48 | plt.legend(loc="best", prop={'size': 12})
49 | plt.show()
50 |
51 |
52 | if __name__ == '__main__':
53 | device = 'cpu'
54 | mp.set_start_method('spawn')
55 | # Do not change this unless you have multiple GPU.
56 | # update test
57 | q = mp.Queue()
58 | num_workers = 15
59 | processes = []
60 | shared_model = Actor()
61 | shared_model.to(device)
62 | shared_model.share_memory()
63 | optimizer = SharedAdam(shared_model.parameters(), lr=0.001)
64 | p = mp.Process(target=evaluate, args=(shared_model, q))
65 | processes.append(p)
66 | p.start()
67 | T = 300
68 | for worker_id in range(num_workers):
69 | p = mp.Process(target=worker, args=(shared_model, optimizer, q, T))
70 | processes.append(p)
71 | p.start()
72 | # for p in processes:
73 | # p.join()
74 | episode = 0
75 |
76 | while True:
77 | if not q.empty():
78 | shared_model.steps.append(q.get())
79 | print(f'Training on episode {episode} Step {shared_model.steps[-1]}')
80 | episode += 1
81 | if len(shared_model.steps) > 25:
82 | if np.mean(shared_model.steps[-50:]) == 199:
83 | for p in processes:
84 | p.terminate()
85 | while not q.empty():
86 | shared_model.steps.append(q.get())
87 | print(f'Training on episode {episode} Step {shared_model.steps[-1]}')
88 | episode += 1
89 | break
90 | shared_model.draw()
91 | # ----evaluation----
92 |     shared_model.steps = []
93 |     for episode in range(15):
94 |         eval_processes = []
95 |         for worker_id in range(6):
96 |             p = mp.Process(target=evaluate, args=(shared_model, q))
97 |             eval_processes.append(p)
98 |             p.start()
99 |         for p in eval_processes:
100 |             p.join()
101 |     while not q.empty():
102 |         shared_model.steps.append(q.get())
103 |     shared_model.steps.sort()
104 |     shared_model.draw(eval=True)
--------------------------------------------------------------------------------
/A3C/A3C Time Interval Async/workers.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import torch
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | from torch.nn import Linear, ReLU
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 |
9 |
10 | def worker(shared_model, optimizer, q, T):
11 | class Actor(torch.nn.Module):
12 | def __init__(self):
13 | super(Actor, self).__init__()
14 | self.fc1 = Linear(4, 128)
15 | self.fc2 = Linear(128, 128)
16 | self.fc3 = Linear(128, 2)
17 | self.fc4 = Linear(128, 1)
18 | self.steps = []
19 |
20 | def forward(self, x):
21 | x = F.relu(self.fc1(x))
22 | x = F.relu(self.fc2(x))
23 | action = F.log_softmax(self.fc3(x), dim=-1)
24 | V = F.relu(self.fc4(x))
25 | return action, V
26 |
27 | device = 'cpu'
28 | # I do not recommend using GPU for this method. CPU is much faster.
29 | # Change this to cuda only if you have a poor CPU or on a cloud
30 | env = gym.make('CartPole-v0')
31 | obs = env.reset()
32 | actor = Actor()
33 | actor.to(device)
34 | gamma = 0.99
35 | eps = np.finfo(np.float32).eps.item()
36 | t = 0
37 | while True:
38 | action_log_history = []
39 | V_history = []
40 | for step in range(200):
41 | # -----lines below are line-corresponding to the original algorithm----
42 | actor.load_state_dict(shared_model.state_dict())
43 | obs = np.reshape(obs, [1, -1])
44 | input_actor = Variable(torch.from_numpy(obs).float()).to(device)
45 | action_log_probability, V = actor(input_actor)
46 | p = np.exp(action_log_probability[0].detach().cpu())
47 | action = np.random.choice(2, p=p.numpy())
48 | action_log_history.append(action_log_probability[0][action])
49 | V_history.append(V)
50 | obs, reward, done, info = env.step(action)
51 | t += 1
52 | if done or t >= T:
53 | if done:
54 | q.put(step)
55 | actor.zero_grad()
56 | if done:
57 | obs = env.reset()
58 | reward_list = np.ones((step + 1,))
59 | for i in range(len(reward_list) - 2, -1, -1):
60 | reward_list[i] += reward_list[i + 1] * gamma
61 | reward_list -= np.mean(reward_list)
62 | reward_list /= (np.std(reward_list) + eps)
63 | Critic_Loss = []
64 | Delta = []
65 | for monte_carlo_return, V in zip(reward_list, V_history):
66 | Critic_Loss.append(F.smooth_l1_loss(V, torch.tensor([[monte_carlo_return]]).to(device)))
67 | Delta.append(monte_carlo_return - V.detach())
68 | Actor_Loss = []
69 | entropy = 0
70 | for log_p in action_log_history:
71 | entropy -= log_p * torch.exp(log_p)
72 | Delta = Delta[len(Delta) - len(action_log_history):]
73 | for delta, log_prob in zip(Delta, action_log_history):
74 | Actor_Loss.append(-log_prob * delta.detach())
75 | loss = torch.stack(Critic_Loss).sum() + torch.stack(Actor_Loss).sum() + entropy * 0.01
76 | loss.backward()
77 | ensure_shared_grads(actor, shared_model)
78 | optimizer.step()
79 | action_log_history = []
80 | V_history = []
81 | actor.load_state_dict(shared_model.state_dict())
82 | if done:
83 | t = 0
84 | break
85 | else:
86 | t = 0
87 |
88 | def ensure_shared_grads(model, shared_model):
89 | for param, shared_param in zip(model.parameters(),
90 | shared_model.parameters()):
91 | if shared_param.grad is not None:
92 | return
93 | shared_param._grad = param.grad
94 |
--------------------------------------------------------------------------------
/A3C/README About Vairations.md:
--------------------------------------------------------------------------------
1 | ## Review of A2C and A3C
2 | A3C uses a fixed time interval (a step count) to control its asynchronous
3 | updates. In that procedure, each child process returns its gradient after a
4 | given number of steps, or earlier if the episode finishes, and syncs with
5 | the shared model before its next segment.
6 | 
7 | A2C, however, waits for every child process to finish its segment before
8 | updating.
9 | 
10 | ## My Variation
11 | In this project, I change the sync/async scheme of A2C and A3C to be
12 | episode-wise. That is, a child process only returns its gradient AFTER it
13 | completes the episode.
14 | 
15 | In the A3C variation, a child process returns its gradient as soon as it
16 | finishes the current episode, performs the backward pass, and then syncs
17 | with the latest model parameters before continuing, all inside an infinite
18 | loop. This method never calls join() on the processes; instead, it monitors
19 | a queue filled with each child process's game record. If the last 100
20 | entries are all at the maximum reward, it sends a terminate message to all
21 | child processes. (A minimal sketch of this monitoring loop is included at
22 | the end of this file.)
23 | 
24 | In the A2C variation, a child process also returns its gradient as soon as
25 | it finishes the current episode, but all processes wait for every worker to
26 | finish and then sync with the updated model together. In this case we need
27 | a loop that calls join() on every iteration; the loop ends once the queue
28 | fulfills the convergence requirement.
29 | 
30 | Additionally, I have set every mode to stop learning entirely once it
31 | reaches the maximum reward, to help convergence. [needs TESTING]
32 | 
33 | ## More Variations:
34 | The child processes differ very little from one another, so varying them
35 | could be an exploratory direction. However, measuring the differences
36 | between the methods would take a large amount of time.
37 | 
38 | ## Additional Warnings:
39 | When initializing multiprocessing, fork does not work; if you are on Linux
40 | or Mac, change the start method to spawn. The reason is unclear to me at
41 | this point.
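42 | 
43 | ## Sketch of the Episodic-Async Monitoring Loop
44 | The snippet below is only an illustrative sketch (not the exact code in
45 | pyTorch_CartPole_A3C_EA.py) of the asynchronous, episode-wise scheme
46 | described above: the workers run forever and push each finished episode's
47 | score into a queue, the main process never calls join(), and once the last
48 | 100 scores are all at the maximum reward it terminates every child process.
49 | The Actor re-declares the network used by the workers in this folder, and
50 | the hyperparameters simply mirror the scripts here.
51 | 
52 | ```python
53 | import numpy as np
54 | import torch
55 | import torch.nn.functional as F
56 | import torch.multiprocessing as mp
57 | from torch.nn import Linear
58 | from SharedAdam import SharedAdam
59 | from workers_PlayGround import worker
60 | from evaluate import evaluate
61 | 
62 | 
63 | class Actor(torch.nn.Module):
64 |     """Same architecture as the Actor used by the workers in this folder."""
65 |     def __init__(self):
66 |         super().__init__()
67 |         self.fc1, self.fc2 = Linear(4, 128), Linear(128, 128)
68 |         self.fc3, self.fc4 = Linear(128, 2), Linear(128, 1)
69 | 
70 |     def forward(self, x):
71 |         x = F.relu(self.fc2(F.relu(self.fc1(x))))
72 |         return F.log_softmax(self.fc3(x), dim=-1), F.relu(self.fc4(x))
73 | 
74 | 
75 | if __name__ == '__main__':
76 |     mp.set_start_method('spawn')   # fork is unreliable here; use spawn
77 |     q = mp.Queue()                 # every worker reports episode length here
78 |     shared_model = Actor()
79 |     shared_model.share_memory()    # workers update these parameters in place
80 |     optimizer = SharedAdam(shared_model.parameters(), lr=0.001)
81 | 
82 |     procs = [mp.Process(target=evaluate, args=(shared_model, q))]
83 |     procs += [mp.Process(target=worker, args=(shared_model, optimizer, q))
84 |               for _ in range(7)]
85 |     for p in procs:
86 |         p.start()                  # workers loop forever; join() is never called
87 | 
88 |     scores = []
89 |     while True:                    # main process only watches the queue
90 |         if not q.empty():
91 |             scores.append(q.get())
92 |         if len(scores) >= 100 and np.mean(scores[-100:]) == 199:
93 |             for p in procs:        # convergence: stop every child process
94 |                 p.terminate()
95 |             break
96 | ```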
--------------------------------------------------------------------------------
/Ape-X/PyTorch_Ape-X.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LyWangPX/Reinforcement_Learning_Coding_Examples/2f40f67f5709c9dc4ea3d9dd15b441b627b595a6/Ape-X/PyTorch_Ape-X.py
--------------------------------------------------------------------------------
/D4PG/PyTorch_D4PG.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 |
5 |
--------------------------------------------------------------------------------
/DDPG/PyTorch_DDPG.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import torch.nn.functional as F
4 | from collections import deque
5 | import gym
6 | import matplotlib.pyplot as plt
7 | from torch.nn import Linear, ReLU
8 |
9 | """
10 | This is a vanilla implementation of DDPG (without PER).
11 | """
12 |
13 |
14 | class Actor(torch.nn.Module):
15 | def __init__(self, maxlen=100000):
16 | super(Actor, self).__init__()
17 | self.fc1 = Linear(3, 256)
18 | self.fc2 = Linear(256, 256)
19 | self.fc3 = Linear(256, 256)
20 | self.fc4 = Linear(256, 1)
21 | self.s_buffer = deque(maxlen=maxlen)
22 | self.a_buffer = deque(maxlen=maxlen)
23 | self.r_buffer = deque(maxlen=maxlen)
24 | self.next_s_buffer = deque(maxlen=maxlen)
25 |
26 | def forward(self, x):
27 | x = F.relu(self.fc1(x))
28 | x = F.relu(self.fc2(x))
29 | x = F.relu(self.fc3(x))
30 | action = 2*torch.tanh(self.fc4(x))
31 | return action
32 |
33 | def bufferin(self, s, a, r, next_s):
34 | self.s_buffer.append(s)
35 | self.a_buffer.append(a)
36 | self.r_buffer.append(r)
37 | self.next_s_buffer.append(next_s)
38 |
39 | def sample(self, batch_size=64):
40 | indices = np.random.choice(range(len(self.a_buffer)), size=min(len(self.a_buffer), batch_size), replace=False)
41 | s_buffer = [self.s_buffer[i] for i in indices]
42 | a_buffer = [self.a_buffer[i] for i in indices]
43 | r_buffer = [self.r_buffer[i] for i in indices]
44 | next_s_buffer = [self.next_s_buffer[i] for i in indices]
45 | return a_buffer, s_buffer, r_buffer, next_s_buffer
46 |
47 |
48 | class Critic(torch.nn.Module):
49 | def __init__(self):
50 | super(Critic, self).__init__()
51 | self.fc1 = Linear(4, 256)
52 | self.fc2 = Linear(256, 512)
53 | self.fc3 = Linear(512, 1)
54 | self.action = Linear(1, 256)
55 |
56 | def forward(self, x, a):
57 | x = torch.cat([x,a],1)
58 | x = F.relu(self.fc1(x))
59 | x = F.relu(self.fc2(x))
60 | Q = self.fc3(x)
61 | return Q
62 |
63 |
64 | def evaluate(target_policy, device, final=False):
65 | target_policy.eval()
66 | env = NormalizedEnv(gym.make('Pendulum-v0'))
67 | s = env.reset()
68 | if final:
69 | result = []
70 | for episode in range(100):
71 | rewards = 0
72 | for step in range(200):
73 | action = target_policy.forward(torch.FloatTensor(s))
74 | s, reward, done, _ = env.step([action.detach()])
75 | rewards += reward
76 | if done:
77 | result.append(rewards)
78 | s = env.reset()
79 | return result
80 | else:
81 | result = []
82 | for episode in range(1):
83 | rewards = 0
84 | for step in range(200):
85 | action = target_policy.forward(torch.FloatTensor(s))
86 | s, reward, done, _ = env.step([float(action)])
87 | rewards += reward
88 | if done:
89 | result.append(rewards)
90 | s = env.reset()
91 | return result
92 |
93 |
94 | def draw(steps, name):
95 | plt.style.use('dark_background')
96 | plt.figure(figsize=(10, 10))
97 | mid = []
98 | interval = 3
99 | for i in range(len(steps) - interval):
100 | mid.append(np.mean(steps[i:i + interval + 1]))
101 | plt.title(f'{name} DDPG on Pendulum_V0 ', fontsize='xx-large')
102 | plt.xlabel('Episodes', fontsize='xx-large')
103 | plt.ylabel(f'{name}', fontsize='xx-large')
104 | x_fit = list(range(len(steps) - interval))
105 | plt.plot(x_fit, steps[interval:], '-', c='gray', label='Episode-Wise data')
106 | plt.plot(mid, '-', c='green', linewidth=5, label='Moving Average')
107 | plt.legend(loc="best", prop={'size': 12})
108 | plt.show()
109 |
110 |
111 | # https://github.com/openai/gym/blob/master/gym/core.py
112 | class NormalizedEnv(gym.ActionWrapper):
113 | """ Wrap action """
114 |
115 | def action(self, action):
116 | act_k = (self.action_space.high - self.action_space.low) / 2.
117 | act_b = (self.action_space.high + self.action_space.low) / 2.
118 | return act_k * action + act_b
119 |
120 | def reverse_action(self, action):
121 | act_k_inv = 2. / (self.action_space.high - self.action_space.low)
122 | act_b = (self.action_space.high + self.action_space.low) / 2.
123 | return act_k_inv * (action - act_b)
124 |
125 |
126 | class Ornstein_Uhlenbeck_Process:
127 | def __init__(self, dt=0.3):
128 | self.theta = 0.15
129 | self.sigma = 0.2
130 | self.dt = dt
131 | self.x = 0
132 |
133 | def step(self):
134 | dW = self.dt ** 2 * np.random.normal()
135 | dx = -self.theta * self.x * self.dt + self.sigma * dW
136 | self.x += dx
137 | return self.x
138 |
139 |
140 | def main():
141 | # create two identical model
142 | # ---hyper parameter---
143 | gamma = 0.99
144 | tau = 0.01
145 | # ---hyper parameter---
146 | steps = []
147 | device = 'cpu'
148 | actor = Actor().to(device)
149 | actor_optimizer = torch.optim.Adam(actor.parameters(), lr=1e-4)
150 | target_actor = Actor().to(device)
151 | critic = Critic().to(device)
152 | critic_optimizer = torch.optim.Adam(critic.parameters(), lr=1e-3)
153 | target_critic = Critic().to(device)
154 | for target_param, param in zip(target_actor.parameters(), actor.parameters()):
155 | target_param.data.copy_(param.data)
156 | for target_param, param in zip(target_critic.parameters(), critic.parameters()):
157 | target_param.data.copy_(param.data)
158 |
159 | env = gym.make('Pendulum-v0')
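    |     # note: unlike evaluate(), the training env is not wrapped in NormalizedEnv, so the actor's raw output
    |     # (plus OU noise) is applied directly in Pendulum-v0's [-2, 2] torque range; a possible variant (untested
    |     # assumption) would be env = NormalizedEnv(gym.make('Pendulum-v0')) here as well, mirroring evaluate()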
160 | s = env.reset()
161 | A_loss = []
162 | C_loss = []
163 | actor.train()
164 | critic.train()
165 | for episode in range(100):
166 | rewards = 0
167 | random_process = Ornstein_Uhlenbeck_Process(dt=0.1)
168 | for step in range(250):
169 |
170 | # LINE 1 Select Action
171 | action = (actor.forward(torch.FloatTensor(s)) + random_process.step())
172 |
173 | # LINE 2 Execute and Observe
174 | next_s, reward, done, _ = env.step(action.detach())
175 | # LINE 3 Store
176 | actor.bufferin(s, action, reward, next_s)
177 |
178 | s = next_s
179 | rewards += reward
180 | if len(actor.a_buffer) > 180:
181 | # LINE 4 SAMPLE a minibatch
182 | a_buffer, s_buffer, r_buffer, next_s_buffer = actor.sample()
183 | a_buffer = torch.FloatTensor(a_buffer).view(-1,1)
184 | s_buffer = torch.FloatTensor(s_buffer).view(-1,3)
185 | r_buffer = torch.FloatTensor(r_buffer).view(-1,1)
186 | next_s_buffer = torch.FloatTensor(next_s_buffer).view(-1,3)
187 |
188 | # LINE 5 Set y = r + gamma next Q from target critic
189 | next_a = target_actor(next_s_buffer.to(device))
190 | next_Q = target_critic(next_s_buffer.to(device), next_a.to(device))
191 | y = r_buffer.to(device) + gamma * next_Q
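    |                 # y above is the TD target from the target networks: y = r + gamma * Q'(s', mu'(s')); no terminal
    |                 # mask is applied, which is harmless for Pendulum-v0 since episodes end only via the 200-step time limit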
192 |
193 |
194 | # LINE 7 Update the actor policy using sampled policy gradient
195 | true_a = actor(s_buffer.to(device))
196 | actor_loss_total = critic.forward(s_buffer.to(device), true_a.to(device))
197 | actor_loss = -actor_loss_total.mean()
198 | actor.zero_grad()
199 | actor_loss.backward()
200 | actor_optimizer.step()
201 |
202 | # LINE 6 Update critic by minimizing the mse.
203 | Q = critic(s_buffer.to(device),
204 | a_buffer.float().to(device))
205 | critic_loss = torch.nn.functional.mse_loss(Q, y.detach())
206 | critic_optimizer.zero_grad()
207 | critic_loss.backward()
208 | critic_optimizer.step()
209 |
210 | A_loss.append(actor_loss.item())
211 | C_loss.append(critic_loss.item())
212 |
213 | # LINE 8 Update the target network
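    |                 # soft (Polyak) target update: theta_target <- tau * theta_online + (1 - tau) * theta_target, with tau = 0.01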
214 | for target_param, param in zip(target_actor.parameters(), actor.parameters()):
215 | target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)
216 |
217 | for target_param, param in zip(target_critic.parameters(), critic.parameters()):
218 | target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)
219 | if done:
220 | s = env.reset()
221 | steps.append(rewards)
222 | print(f'episode {episode}, total rewards {steps[-1]}')
223 | break
224 | draw(steps, 'rewards')
225 | draw(A_loss, 'A_loss')
226 | draw(C_loss, 'C_loss')
227 | hist = evaluate(target_actor, device, final=True)
228 | draw(hist, 'eval')
229 |
230 |
231 | if __name__ == '__main__':
232 | main()
233 |
--------------------------------------------------------------------------------
/Deuling Double DQN with PER/PyTorch_Deuling_DDQN_with_PER.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import gym
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | import torch.nn.functional as F
6 | from torch.nn import Linear, ReLU
7 | from collections import deque
8 |
9 | """
10 | This is a vanilla Dueling Double DQN with PER (softmax of the absolute TD error as the sampling priority)
11 | """
12 |
13 |
14 | class Q_network(torch.nn.Module):
15 | def __init__(self, n_action=2):
16 | super(Q_network, self).__init__()
17 | self.input = Linear(4, 256)
18 | self.input_to_V = Linear(256, 64)
19 | self.input_to_A = Linear(256, 64)
20 | self.input_to_V2 = Linear(64, 1)
21 | self.input_to_A2 = Linear(64, n_action)
22 | self.a_buffer = deque(maxlen=8192)
23 | self.r_buffer = deque(maxlen=8192)
24 | self.s_buffer = deque(maxlen=8192)
25 | self.done_buffer = deque(maxlen=8192)
26 | self.next_s_buffer = deque(maxlen=8192)
27 | self.priority_buffer = deque(maxlen=8192)
28 |
29 | def forward(self, x):
30 | x = F.relu(self.input(x))
31 | V_stream = F.relu(self.input_to_V(x))
32 | V_stream = self.input_to_V2(V_stream)
33 | A_stream = F.relu(self.input_to_A(x))
34 | A_stream = self.input_to_A2(A_stream)
35 | A_mean = torch.mean(A_stream, dim=1, keepdim=True)
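    |         # dueling aggregation: Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a'); subtracting the mean keeps V and A identifiable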
36 | result = V_stream + A_stream - A_mean
37 | return result
38 |
39 | def bufferin(self, tuple_info):
40 |         # expect tuple_info with content (S, A, R, S', not_done, priority)
41 | # ALL in TENSOR FORMAT
42 | state, action, reward, next_S, done, priority = tuple_info
43 | self.a_buffer.append(action)
44 | self.s_buffer.append(state)
45 | self.r_buffer.append(reward)
46 | self.next_s_buffer.append(next_S)
47 | self.done_buffer.append(done)
48 | self.priority_buffer.append(priority)
49 |
50 | def sample(self, size=64):
51 | with torch.no_grad():
52 |             prob = np.array(F.softmax(torch.stack(list(self.priority_buffer)).view(-1), dim=0))  # softmax over the flattened priorities
53 | prob /= prob.sum()
54 |             sample_indices = np.random.choice(range(len(self.a_buffer)), size=size, p=prob, replace=False)
55 | a_sample = [self.a_buffer[i] for i in sample_indices]
56 | r_sample = [self.r_buffer[i] for i in sample_indices]
57 | s_sample = [self.s_buffer[i] for i in sample_indices]
58 | next_s_sample = [self.next_s_buffer[i] for i in sample_indices]
59 | done_sample = [self.done_buffer[i] for i in sample_indices]
60 |
61 | a_sample = torch.Tensor(a_sample).view(-1, 1)
62 | r_sample = torch.Tensor(r_sample).view(-1, 1)
63 | s_sample = torch.stack(s_sample).view(-1, 4)
64 | next_s_sample = torch.stack(next_s_sample).view(-1, 4)
65 | done_sample = torch.Tensor(done_sample).view(-1, 1)
66 |
67 | return s_sample, a_sample, r_sample, next_s_sample, done_sample, sample_indices
68 |
69 |
70 | def main():
71 | gamma = 0.99
72 | beta = 0.25
73 | env = gym.make('CartPole-v0')
74 | state = env.reset()
75 | state = torch.FloatTensor(state).view(-1, 4)
76 | Q_target = Q_network()
77 | optimizer = torch.optim.Adam(Q_target.parameters(), lr=0.0003)
78 | Q_copy = Q_network()
79 | for param_target, param_copy in zip(Q_target.parameters(), Q_copy.parameters()):
80 | param_copy.data.copy_(param_target.data)
81 | steps = []
82 | for episode in range(10000):
83 | Q_mean = 0
84 | for step in range(200):
85 | with torch.no_grad():
86 | Q_list = Q_target.forward(state)
87 | if np.random.random() > beta:
88 | action = np.argmax(Q_list.detach())
89 | next_state, reward, done, _ = env.step(action.item())
90 | else:
91 | action = np.random.randint(2)
92 | next_state, reward, done, _ = env.step(action)
93 | next_state = torch.FloatTensor(next_state).view(-1, 4)
94 | # PER: Calculate delta of this tuple
95 | Q = Q_list[0][action]
96 | Q_prime = Q_copy.forward(next_state)
97 | next_action = np.argmax(Q_prime.detach())
98 |                 delta = abs(Q - (reward + gamma * Q_prime[0][next_action])).detach()  # |TD error| as the initial priority
99 | delta = delta.view(1)
100 | tuple_info = (state, action, torch.Tensor([reward]), next_state, not done, delta)
101 | Q_target.bufferin(tuple_info)
102 | # Learning Part
103 | if len(Q_target.a_buffer) > 64:
104 | s_sample, a_sample, r_sample, next_s_sample, done_sample, ids = Q_target.sample()
105 | # Q values from recorded S and A
106 | Q = Q_target.forward(s_sample)
107 | Q_mean = Q.mean()
108 | Q = Q.gather(1, a_sample.long().view(-1, 1))
109 | # Q' values from recorded S and A recalculated from Q
110 | next_Q = Q_target.forward(next_s_sample)
111 | Q_values, Q_actions = torch.max(next_Q.detach(), 1)
112 | Q_actions = Q_actions.view(-1, 1)
113 | Q_prime = Q_copy.forward(next_s_sample)
114 | Q_prime = Q_prime.gather(1, Q_actions.long().view(-1, 1))
115 | y = r_sample + gamma * Q_prime * done_sample
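    |                     # the target above follows Double DQN: the greedy action comes from the network being trained
    |                     # ("Q_target", despite its name), its value from the periodically-synced copy ("Q_copy");
    |                     # done_sample stores (not done), so terminal transitions reduce to y = r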
116 |                     deltas = abs(Q.detach() - y.detach())  # detach so the stored priorities do not keep the autograd graph alive
117 | for delta, id in zip(deltas, ids):
118 | Q_target.priority_buffer[id] = delta.view(-1)
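    |                     # the loop above refreshes the priorities of the sampled transitions with their latest |TD error|,
    |                     # so transitions the network currently fits poorly are replayed more often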
119 | loss = F.mse_loss(Q, y.detach())
120 | Q_target.zero_grad()
121 | loss.backward()
122 | optimizer.step()
123 |
124 | # Loop reset Part
125 | if not done:
126 | state = next_state
127 | else:
128 | state = torch.FloatTensor(env.reset()).view(-1, 4)
129 | print(f'episode {episode}, step {step}, Q_average {Q_mean}')
130 | steps.append(step)
131 | break
132 | if episode % 3 == 0:
133 | for param_target, param_copy in zip(Q_target.parameters(), Q_copy.parameters()):
134 | param_copy.data.copy_(param_target.data)
135 | if episode > 40:
136 | beta = 5 / episode
137 |
138 | if np.mean(steps[-20:]) > 190:
139 | break
140 |
141 | plt.style.use('dark_background')
142 | plt.figure(figsize=(10, 10))
143 | mid = []
144 | interval = 3
145 | for i in range(len(steps) - interval):
146 | mid.append(np.mean(steps[i:i + interval + 1]))
147 |     plt.title(f'Dueling DDQN on CartPole-v0 with PER', fontsize='xx-large')
148 | plt.xlabel('Episodes', fontsize='xx-large')
149 | plt.ylabel(f'Rewards', fontsize='xx-large')
150 | x_fit = list(range(len(steps) - interval))
151 | plt.plot(x_fit, steps[interval:], '-', c='gray', label='Episode-Wise data')
152 | plt.plot(mid, '-', c='green', linewidth=5, label='Moving Average')
153 | plt.legend(loc="best", prop={'size': 12})
154 | plt.show()
155 |
156 |
157 | if __name__ == '__main__':
158 | main()
159 |
--------------------------------------------------------------------------------
/Deuling Double DQN/PyTorch_Deuling_DDQN.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import gym
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | import torch.nn.functional as F
6 | from torch.nn import Linear
7 | from collections import deque
8 |
9 | """
10 | This is a vanilla Dueling Double DQN without PER.
11 | """
12 |
13 | class Q_network(torch.nn.Module):
14 | def __init__(self, n_action=2):
15 | super(Q_network, self).__init__()
16 | self.input = Linear(4, 256)
17 | self.input_to_V = Linear(256, 64)
18 | self.input_to_A = Linear(256, 64)
19 | self.input_to_V2 = Linear(64, 1)
20 | self.input_to_A2 = Linear(64, n_action)
21 | self.a_buffer = deque(maxlen=8192)
22 | self.r_buffer = deque(maxlen=8192)
23 | self.s_buffer = deque(maxlen=8192)
24 | self.done_buffer = deque(maxlen=8192)
25 | self.next_s_buffer = deque(maxlen=8192)
26 |
27 | def forward(self, x):
28 | x = F.relu(self.input(x))
29 | V_stream = F.relu(self.input_to_V(x))
30 | V_stream = self.input_to_V2(V_stream)
31 | A_stream = F.relu(self.input_to_A(x))
32 | A_stream = self.input_to_A2(A_stream)
33 | A_mean = torch.mean(A_stream, dim=1, keepdim=True)
34 | result = V_stream + A_stream - A_mean
35 | return result
36 |
37 | def bufferin(self, tuple_info):
38 |         # expect tuple_info with content (S, A, R, S', not_done)
39 | # ALL in TENSOR FORMAT
40 | state, action, reward, next_S, done = tuple_info
41 | self.a_buffer.append(action)
42 | self.s_buffer.append(state)
43 | self.r_buffer.append(reward)
44 | self.next_s_buffer.append(next_S)
45 | self.done_buffer.append(done)
46 |
47 | def sample(self, size=64):
48 |         sample_indices = np.random.choice(range(len(self.a_buffer)), size, replace=False)  # use the size parameter
49 | a_sample = [self.a_buffer[i] for i in sample_indices]
50 | r_sample = [self.r_buffer[i] for i in sample_indices]
51 | s_sample = [self.s_buffer[i] for i in sample_indices]
52 | next_s_sample = [self.next_s_buffer[i] for i in sample_indices]
53 | done_sample = [self.done_buffer[i] for i in sample_indices]
54 |
55 | a_sample = torch.Tensor(a_sample).view(-1, 1)
56 | r_sample = torch.Tensor(r_sample).view(-1, 1)
57 | s_sample = torch.stack(s_sample).view(-1, 4)
58 | next_s_sample = torch.stack(next_s_sample).view(-1, 4)
59 | done_sample = torch.Tensor(done_sample).view(-1, 1)
60 |
61 | return s_sample, a_sample, r_sample, next_s_sample, done_sample
62 |
63 |
64 | def main():
65 | gamma = 0.99
66 | beta = 0.25
67 | env = gym.make('CartPole-v0')
68 | state = env.reset()
69 | state = torch.FloatTensor(state).view(-1, 4)
70 | Q_target = Q_network()
71 | optimizer = torch.optim.Adam(Q_target.parameters(), lr=0.001)
72 | Q_copy = Q_network()
73 | for param_target, param_copy in zip(Q_target.parameters(), Q_copy.parameters()):
74 | param_copy.data.copy_(param_target.data)
75 | steps = []
76 | for episode in range(10000):
77 | Q_mean = 0
78 | for step in range(200):
79 | Q_list = Q_target.forward(state)
80 | if np.random.random() > beta:
81 | action = np.argmax(Q_list.detach())
82 | next_state, reward, done, _ = env.step(action.item())
83 | else:
84 | action = np.random.randint(2)
85 | next_state, reward, done, _ = env.step(action)
86 | next_state = torch.FloatTensor(next_state).view(-1, 4)
87 | tuple_info = (state, action, torch.Tensor([reward]), next_state, not done)
88 | Q_target.bufferin(tuple_info)
89 | # Learning Part
90 | if len(Q_target.a_buffer) > 64:
91 | s_sample, a_sample, r_sample, next_s_sample, done_sample = Q_target.sample()
92 | # Q values from recorded S and A
93 | Q = Q_target.forward(s_sample)
94 | Q_mean = Q.mean()
95 | Q = Q.gather(1, a_sample.long().view(-1, 1))
96 | # Q' values from recorded S and A recalculated from Q
97 | next_Q = Q_target.forward(next_s_sample)
98 | Q_values, Q_actions = torch.max(next_Q.detach(), 1)
99 | Q_actions = Q_actions.view(-1,1)
100 | Q_prime = Q_copy.forward(next_s_sample)
101 | Q_prime = Q_prime.gather(1, Q_actions.long().view(-1, 1))
102 | y = r_sample + gamma * Q_prime * done_sample
103 | loss = F.mse_loss(Q, y.detach())
104 | Q_target.zero_grad()
105 | loss.backward()
106 | optimizer.step()
107 |
108 | # Loop reset Part
109 | if not done:
110 | state = next_state
111 | else:
112 | state = torch.FloatTensor(env.reset()).view(-1, 4)
113 | print(f'episode {episode}, step {step}, Q_average {Q_mean}')
114 | steps.append(step)
115 | break
116 | if episode % 3 == 0:
117 | for param_target, param_copy in zip(Q_target.parameters(), Q_copy.parameters()):
118 | param_copy.data.copy_(param_target.data)
119 | if episode > 40:
120 | beta = 5/episode
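    |             # beta plays the role of epsilon in the epsilon-greedy policy: fixed at 0.25 for the first
    |             # 40 episodes, then annealed as 5 / episode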
121 |
122 | if np.mean(steps[-20:]) > 190:
123 | break
124 |
125 |
126 | plt.style.use('dark_background')
127 | plt.figure(figsize=(10, 10))
128 | mid = []
129 | interval = 3
130 | for i in range(len(steps) - interval):
131 | mid.append(np.mean(steps[i:i + interval + 1]))
132 |     plt.title(f'Dueling DDQN on CartPole-v0', fontsize='xx-large')
133 | plt.xlabel('Episodes', fontsize='xx-large')
134 | plt.ylabel(f'Rewards', fontsize='xx-large')
135 | x_fit = list(range(len(steps) - interval))
136 | plt.plot(x_fit, steps[interval:], '-', c='gray', label='Episode-Wise data')
137 | plt.plot(mid, '-', c='green', linewidth=5, label='Moving Average')
138 | plt.legend(loc="best", prop={'size': 12})
139 | plt.show()
140 |
141 |
142 | if __name__ == '__main__':
143 | main()
144 |
--------------------------------------------------------------------------------
/Experiments/Online TD and true Online TD.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "\n",
13 | "# Define state 1, 2, 3 ,4 ,5... 19 as normal state with one-hot encoding\n",
14 | "# state 0 and state 20 share the same zero feature vectors.\n",
15 | "\n",
16 | "def feature_map(state):\n",
17 | " zero_model = [0]*19\n",
18 | " zero_model[state-1] = 1\n",
19 | " zero_model = np.array(zero_model)\n",
20 | " zero_model.resize((19,1))\n",
21 | " return np.array(zero_model)\n",
22 | "\n",
23 | "# create a hash table to quickly draw features\n",
24 | "feature_hash = {0: np.zeros((19,1)),\n",
25 | " 20: np.zeros((19,1))}\n",
26 | "for state in range(1,20):\n",
27 | " feature_hash[state] = feature_map(state)\n"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 65,
33 | "outputs": [],
34 | "source": [
35 | "history = []\n",
36 | "for episode in range(2):\n",
37 | " local = [10]\n",
38 | " state = 10\n",
39 | " while True:\n",
40 | " if np.random.random() > 0.5:\n",
41 | " state += 1\n",
42 | " else:\n",
43 | " state -= 1\n",
44 | " local.append(state)\n",
45 | " if state == 0 or state == 20:\n",
46 | " history.append(local)\n",
47 | " break"
48 | ],
49 | "metadata": {
50 | "collapsed": false,
51 | "pycharm": {
52 | "name": "#%%\n"
53 | }
54 | }
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 66,
59 | "outputs": [
60 | {
61 | "name": "stdout",
62 | "output_type": "stream",
63 | "text": [
64 | "processing episode 1 horizon 245\r"
65 | ]
66 | }
67 | ],
68 | "source": [
69 | "# hand pick the hyper parameters\n",
70 | "alpha = 0.4\n",
71 | "gamma = 0.8\n",
72 | "_lambda = 0.9\n",
73 | "\n",
74 | "# set all ones as initialization\n",
75 | "w_last_episode = np.ones((19,1))\n",
76 | "w_last_round = np.ones((19,1))\n",
77 | "w_forward = {}\n",
78 | "def n_step_G(t, h, w, hist):\n",
79 | " if h == len(hist):\n",
80 | " # v(T) == 0; reward == 1\n",
81 | " if hist[-1] == 20:\n",
82 | " return gamma**(h-t-1)\n",
83 | " else:\n",
84 | " return 0\n",
85 | " else:\n",
86 | " # reward == 0; \n",
87 | " return gamma**(h-t)*(w.T@feature_hash[hist[h-1]])\n",
88 | " \n",
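    | "# truncated lambda-return computed below: G_t^lambda = (1 - lambda) * sum_{n=1}^{h-t-1} lambda**(n-1) * G_{t:t+n} + lambda**(h-t-1) * G_{t:h}\n",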
89 | "def lambda_G(t,h,hist):\n",
90 | " first_term = np.sum([_lambda**(n-1)*n_step_G(t,t+n,w_dict[n-1],hist) for n in range(1, h-t)])\n",
91 | " return (1-_lambda)*first_term + _lambda**(h-t-1)*n_step_G(t,h,w_dict[h-1],hist)\n",
92 | " \n",
93 | "for i,hist in enumerate(history):\n",
94 | " w_dict = {0:w_last_episode}\n",
95 | " for h in range(1, len(hist)+1):\n",
96 | " print(f'processing episode {i} horizon {h}', end = '\\r')\n",
97 | " w_old = w_last_episode\n",
98 | " for t in range(1,h+1):\n",
99 | " w = w_old + alpha*(lambda_G(t-1,h,hist) - w_old.T@feature_hash[hist[t-1]])*feature_hash[hist[t-1]]\n",
100 | " w_old = w\n",
101 | " else:\n",
102 | " w_dict[h] = w_old\n",
103 | " else:\n",
104 | " w_forward[i] = w_old\n",
105 | " w_last_episode = w_old\n"
106 | ],
107 | "metadata": {
108 | "collapsed": false,
109 | "pycharm": {
110 | "name": "#%%\n"
111 | }
112 | }
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 67,
117 | "outputs": [],
118 | "source": [
119 | "w_online = {}\n",
120 | "w_2 = np.ones((19,1))\n",
121 | "for episode, hist in enumerate(history):\n",
122 | " z = np.zeros((19,1))\n",
123 | " V_old = 0\n",
124 | " for i, state in enumerate(hist):\n",
125 | " if i == len(hist)-2:\n",
126 | " if hist[i+1] == 20:\n",
127 | " R = 1\n",
128 | " else:\n",
129 | " R = 0\n",
130 | " done = True\n",
131 | " else:\n",
132 | " R = 0\n",
133 | " done = False\n",
134 | " V = w_2.T@feature_hash[state]\n",
135 | " V_prime = w_2.T@feature_hash[hist[i+1]]\n",
136 | " delta = R + gamma*V_prime - V\n",
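    | "        # true online TD(lambda) update with dutch traces (van Seijen et al. 2014):\n",
    | "        #   z <- gamma*lambda*z + (1 - alpha*gamma*lambda * z.T @ x) * x\n",
    | "        #   w <- w + alpha*(delta + V - V_old)*z - alpha*(V - V_old)*x\n",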
137 | " z = _lambda*gamma*z + (1-alpha*gamma*_lambda*z.T@feature_hash[state])*feature_hash[state]\n",
138 | " w_2 = w_2 + alpha*(delta + V - V_old)*z - alpha*(V-V_old)*feature_hash[state]\n",
139 | " V_old = V_prime\n",
140 | " if done:\n",
141 | " w_online[episode] = w_2\n",
142 | " break"
143 | ],
144 | "metadata": {
145 | "collapsed": false,
146 | "pycharm": {
147 | "name": "#%%\n"
148 | }
149 | }
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 70,
154 | "outputs": [
155 | {
156 | "data": {
157 | "text/plain": "array([[1. ],\n [1. ],\n [1. ],\n [1. ],\n [1. ],\n [1. ],\n [0.53231318],\n [0.32852723],\n [0.27689236],\n [0.28178901],\n [0.2804106 ],\n [0.27976137],\n [0.2788142 ],\n [0.27958496],\n [0.28473191],\n [0.29893919],\n [0.36824546],\n [0.55494604],\n [0.8125568 ]])"
158 | },
159 | "execution_count": 70,
160 | "metadata": {},
161 | "output_type": "execute_result"
162 | }
163 | ],
164 | "source": [
165 | "w_forward[0]"
166 | ],
167 | "metadata": {
168 | "collapsed": false,
169 | "pycharm": {
170 | "name": "#%%\n"
171 | }
172 | }
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": 71,
177 | "outputs": [
178 | {
179 | "data": {
180 | "text/plain": "array([[1. ],\n [1. ],\n [1. ],\n [1. ],\n [1. ],\n [1. ],\n [0.4232541 ],\n [0.15123449],\n [0.02727636],\n [0.01142531],\n [0.00900136],\n [0.01016122],\n [0.01274722],\n [0.01743689],\n [0.04539897],\n [0.12147425],\n [0.28585104],\n [0.58749665],\n [0.90724135]])"
181 | },
182 | "execution_count": 71,
183 | "metadata": {},
184 | "output_type": "execute_result"
185 | }
186 | ],
187 | "source": [
188 | "w_online[0]"
189 | ],
190 | "metadata": {
191 | "collapsed": false,
192 | "pycharm": {
193 | "name": "#%%\n"
194 | }
195 | }
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "outputs": [],
201 | "source": [
202 | "\n"
203 | ],
204 | "metadata": {
205 | "collapsed": false,
206 | "pycharm": {
207 | "name": "#%%\n"
208 | }
209 | }
210 | }
211 | ],
212 | "metadata": {
213 | "kernelspec": {
214 | "display_name": "Python 3",
215 | "language": "python",
216 | "name": "python3"
217 | },
218 | "language_info": {
219 | "codemirror_mode": {
220 | "name": "ipython",
221 | "version": 2
222 | },
223 | "file_extension": ".py",
224 | "mimetype": "text/x-python",
225 | "name": "python",
226 | "nbconvert_exporter": "python",
227 | "pygments_lexer": "ipython2",
228 | "version": "2.7.6"
229 | }
230 | },
231 | "nbformat": 4,
232 | "nbformat_minor": 0
233 | }
--------------------------------------------------------------------------------
/Experiments/Seijen2014_True_Online_TD.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 12,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "\n",
13 | "state_map = {\n",
14 | " 'A':[('B', -1), ('C', 1), ('D',-1)],\n",
15 | " 'B':[('E', 1), ('T', 1)],\n",
16 | " 'C':[('A', -1), ('B', 1), ('D', 1), ('T', 1)],\n",
17 | " 'D':[('F', 1), ('T', 1)],\n",
18 | " 'E':[('T', 1)],\n",
19 | " 'F':[('T', 1)]\n",
20 | "}\n",
21 | "\n",
22 | "feature_map = {\n",
23 | " 'A':np.array([[1],[0],[0],[0],[0],[0]]),\n",
24 | " 'B':np.array([[0],[1],[0],[0],[0],[0]]),\n",
25 | " 'C':np.array([[0],[0],[1],[0],[0],[0]]),\n",
26 | " 'D':np.array([[0],[0],[0],[1],[0],[0]]),\n",
27 | " 'E':np.array([[0],[0],[0],[0],[1],[0]]),\n",
28 | " 'F':np.array([[0],[0],[0],[0],[0],[1]]),\n",
29 | " 'T':np.array([[0],[0],[0],[0],[0],[0]]),\n",
30 | "}\n",
31 | "theta_2014 = np.array([[0],[0],[0],[0],[0],[0]])\n",
32 | "theta_2016 = np.array([[0],[0],[0],[0],[0],[0]])"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 14,
38 | "outputs": [
39 | {
40 | "name": "stdout",
41 | "output_type": "stream",
42 | "text": [
43 | "-----episode 0-----\n",
44 | "theta_2014: [[-0.01930828]\n",
45 | " [ 0.04260991]\n",
46 | " [ 0.01048235]\n",
47 | " [ 0.06055846]\n",
48 | " [ 0.029701 ]\n",
49 | " [ 0.0199 ]]\n",
50 | "theta_2016: [[-0.01930828]\n",
51 | " [ 0.04260991]\n",
52 | " [ 0.01048235]\n",
53 | " [ 0.06055846]\n",
54 | " [ 0.029701 ]\n",
55 | " [ 0.0199 ]]\n",
56 | "error 1.0408340855860843e-17\n",
57 | "---------------------------\n",
58 | "-----episode 1-----\n",
59 | "theta_2014: [[-0.00789621]\n",
60 | " [ 0.05218381]\n",
61 | " [ 0.02174718]\n",
62 | " [ 0.06055846]\n",
63 | " [ 0.029701 ]\n",
64 | " [ 0.0199 ]]\n",
65 | "theta_2016: [[-0.00789621]\n",
66 | " [ 0.05218381]\n",
67 | " [ 0.02174718]\n",
68 | " [ 0.06055846]\n",
69 | " [ 0.029701 ]\n",
70 | " [ 0.0199 ]]\n",
71 | "error 8.673617379884035e-18\n",
72 | "---------------------------\n",
73 | "-----episode 2-----\n",
74 | "theta_2014: [[-0.01636229]\n",
75 | " [ 0.06166197]\n",
76 | " [ 0.02174718]\n",
77 | " [ 0.06055846]\n",
78 | " [ 0.029701 ]\n",
79 | " [ 0.0199 ]]\n",
80 | "theta_2016: [[-0.01636229]\n",
81 | " [ 0.06166197]\n",
82 | " [ 0.02174718]\n",
83 | " [ 0.06055846]\n",
84 | " [ 0.029701 ]\n",
85 | " [ 0.0199 ]]\n",
86 | "error 6.938893903907228e-18\n",
87 | "---------------------------\n",
88 | "-----episode 3-----\n",
89 | "theta_2014: [[-0.02466909]\n",
90 | " [ 0.06166197]\n",
91 | " [ 0.02174718]\n",
92 | " [ 0.06995287]\n",
93 | " [ 0.029701 ]\n",
94 | " [ 0.0199 ]]\n",
95 | "theta_2016: [[-0.02466909]\n",
96 | " [ 0.06166197]\n",
97 | " [ 0.02174718]\n",
98 | " [ 0.06995287]\n",
99 | " [ 0.029701 ]\n",
100 | " [ 0.0199 ]]\n",
101 | "error 6.938893903907228e-18\n",
102 | "---------------------------\n",
103 | "-----episode 4-----\n",
104 | "theta_2014: [[-0.03280912]\n",
105 | " [ 0.06166197]\n",
106 | " [ 0.02174718]\n",
107 | " [ 0.07925335]\n",
108 | " [ 0.029701 ]\n",
109 | " [ 0.0199 ]]\n",
110 | "theta_2016: [[-0.03280912]\n",
111 | " [ 0.06166197]\n",
112 | " [ 0.02174718]\n",
113 | " [ 0.07925335]\n",
114 | " [ 0.029701 ]\n",
115 | " [ 0.0199 ]]\n",
116 | "error 6.938893903907228e-18\n",
117 | "---------------------------\n",
118 | "-----episode 5-----\n",
119 | "theta_2014: [[-0.04094162]\n",
120 | " [ 0.07104535]\n",
121 | " [ 0.02174718]\n",
122 | " [ 0.07925335]\n",
123 | " [ 0.029701 ]\n",
124 | " [ 0.0199 ]]\n",
125 | "theta_2016: [[-0.04094162]\n",
126 | " [ 0.07104535]\n",
127 | " [ 0.02174718]\n",
128 | " [ 0.07925335]\n",
129 | " [ 0.029701 ]\n",
130 | " [ 0.0199 ]]\n",
131 | "error 6.938893903907228e-18\n",
132 | "---------------------------\n",
133 | "-----episode 6-----\n",
134 | "theta_2014: [[-0.04890919]\n",
135 | " [ 0.0803349 ]\n",
136 | " [ 0.02174718]\n",
137 | " [ 0.07925335]\n",
138 | " [ 0.029701 ]\n",
139 | " [ 0.0199 ]]\n",
140 | "theta_2016: [[-0.04890919]\n",
141 | " [ 0.0803349 ]\n",
142 | " [ 0.02174718]\n",
143 | " [ 0.07925335]\n",
144 | " [ 0.029701 ]\n",
145 | " [ 0.0199 ]]\n",
146 | "error 6.938893903907228e-18\n",
147 | "---------------------------\n",
148 | "-----episode 7-----\n",
149 | "theta_2014: [[-0.05660839]\n",
150 | " [ 0.0803349 ]\n",
151 | " [ 0.02174718]\n",
152 | " [ 0.08962812]\n",
153 | " [ 0.029701 ]\n",
154 | " [ 0.029701 ]]\n",
155 | "theta_2016: [[-0.05660839]\n",
156 | " [ 0.0803349 ]\n",
157 | " [ 0.02174718]\n",
158 | " [ 0.08962812]\n",
159 | " [ 0.029701 ]\n",
160 | " [ 0.029701 ]]\n",
161 | "error 6.938893903907228e-18\n",
162 | "---------------------------\n",
163 | "-----episode 8-----\n",
164 | "theta_2014: [[-0.06412951]\n",
165 | " [ 0.0803349 ]\n",
166 | " [ 0.02174718]\n",
167 | " [ 0.09998648]\n",
168 | " [ 0.029701 ]\n",
169 | " [ 0.03940399]]\n",
170 | "theta_2016: [[-0.06412951]\n",
171 | " [ 0.0803349 ]\n",
172 | " [ 0.02174718]\n",
173 | " [ 0.09998648]\n",
174 | " [ 0.029701 ]\n",
175 | " [ 0.03940399]]\n",
176 | "error 1.3877787807814457e-17\n",
177 | "---------------------------\n",
178 | "-----episode 9-----\n",
179 | "theta_2014: [[-0.07178243]\n",
180 | " [ 0.08953155]\n",
181 | " [ 0.02174718]\n",
182 | " [ 0.09998648]\n",
183 | " [ 0.029701 ]\n",
184 | " [ 0.03940399]]\n",
185 | "theta_2016: [[-0.07178243]\n",
186 | " [ 0.08953155]\n",
187 | " [ 0.02174718]\n",
188 | " [ 0.09998648]\n",
189 | " [ 0.029701 ]\n",
190 | " [ 0.03940399]]\n",
191 | "error 1.3877787807814457e-17\n",
192 | "---------------------------\n",
193 | "-----episode 10-----\n",
194 | "theta_2014: [[-0.06958732]\n",
195 | " [ 0.09863623]\n",
196 | " [ 0.01017452]\n",
197 | " [ 0.09998648]\n",
198 | " [ 0.029701 ]\n",
199 | " [ 0.03940399]]\n",
200 | "theta_2016: [[-0.06958732]\n",
201 | " [ 0.09863623]\n",
202 | " [ 0.01017452]\n",
203 | " [ 0.09998648]\n",
204 | " [ 0.029701 ]\n",
205 | " [ 0.03940399]]\n",
206 | "error 1.3877787807814457e-17\n",
207 | "---------------------------\n",
208 | "-----episode 11-----\n",
209 | "theta_2014: [[-0.07689839]\n",
210 | " [ 0.10890451]\n",
211 | " [ 0.01017452]\n",
212 | " [ 0.09998648]\n",
213 | " [ 0.03940399]\n",
214 | " [ 0.03940399]]\n",
215 | "theta_2016: [[-0.07689839]\n",
216 | " [ 0.10890451]\n",
217 | " [ 0.01017452]\n",
218 | " [ 0.09998648]\n",
219 | " [ 0.03940399]\n",
220 | " [ 0.03940399]]\n",
221 | "error 1.3877787807814457e-17\n",
222 | "---------------------------\n",
223 | "-----episode 12-----\n",
224 | "theta_2014: [[-0.07461403]\n",
225 | " [ 0.10890451]\n",
226 | " [-0.00131933]\n",
227 | " [ 0.10898661]\n",
228 | " [ 0.03940399]\n",
229 | " [ 0.03940399]]\n",
230 | "theta_2016: [[-0.07461403]\n",
231 | " [ 0.10890451]\n",
232 | " [-0.00131933]\n",
233 | " [ 0.10898661]\n",
234 | " [ 0.03940399]\n",
235 | " [ 0.03940399]]\n",
236 | "error 1.4094628242311558e-17\n",
237 | "---------------------------\n",
238 | "-----episode 13-----\n",
239 | "theta_2014: [[-0.08177405]\n",
240 | " [ 0.10890451]\n",
241 | " [-0.00131933]\n",
242 | " [ 0.11923783]\n",
243 | " [ 0.03940399]\n",
244 | " [ 0.04900995]]\n",
245 | "theta_2016: [[-0.08177405]\n",
246 | " [ 0.10890451]\n",
247 | " [-0.00131933]\n",
248 | " [ 0.11923783]\n",
249 | " [ 0.03940399]\n",
250 | " [ 0.04900995]]\n",
251 | "error 2.168404344971009e-19\n",
252 | "---------------------------\n",
253 | "-----episode 14-----\n",
254 | "theta_2014: [[-0.08876266]\n",
255 | " [ 0.10890451]\n",
256 | " [-0.00131933]\n",
257 | " [ 0.12947213]\n",
258 | " [ 0.03940399]\n",
259 | " [ 0.05851985]]\n",
260 | "theta_2016: [[-0.08876266]\n",
261 | " [ 0.10890451]\n",
262 | " [-0.00131933]\n",
263 | " [ 0.12947213]\n",
264 | " [ 0.03940399]\n",
265 | " [ 0.05851985]]\n",
266 | "error 2.7972416050126014e-17\n",
267 | "---------------------------\n",
268 | "-----episode 15-----\n",
269 | "theta_2014: [[-0.09591469]\n",
270 | " [ 0.11781546]\n",
271 | " [-0.00131933]\n",
272 | " [ 0.12947213]\n",
273 | " [ 0.03940399]\n",
274 | " [ 0.05851985]]\n",
275 | "theta_2016: [[-0.09591469]\n",
276 | " [ 0.11781546]\n",
277 | " [-0.00131933]\n",
278 | " [ 0.12947213]\n",
279 | " [ 0.03940399]\n",
280 | " [ 0.05851985]]\n",
281 | "error 2.7972416050126014e-17\n",
282 | "---------------------------\n",
283 | "-----episode 16-----\n",
284 | "theta_2014: [[-0.10291581]\n",
285 | " [ 0.12663731]\n",
286 | " [-0.00131933]\n",
287 | " [ 0.12947213]\n",
288 | " [ 0.03940399]\n",
289 | " [ 0.05851985]]\n",
290 | "theta_2016: [[-0.10291581]\n",
291 | " [ 0.12663731]\n",
292 | " [-0.00131933]\n",
293 | " [ 0.12947213]\n",
294 | " [ 0.03940399]\n",
295 | " [ 0.05851985]]\n",
296 | "error 4.185020385794047e-17\n",
297 | "---------------------------\n",
298 | "-----episode 17-----\n",
299 | "theta_2014: [[-0.10959342]\n",
300 | " [ 0.12663731]\n",
301 | " [-0.00131933]\n",
302 | " [ 0.13968883]\n",
303 | " [ 0.03940399]\n",
304 | " [ 0.06793465]]\n",
305 | "theta_2016: [[-0.10959342]\n",
306 | " [ 0.12663731]\n",
307 | " [-0.00131933]\n",
308 | " [ 0.13968883]\n",
309 | " [ 0.03940399]\n",
310 | " [ 0.06793465]]\n",
311 | "error 4.185020385794047e-17\n",
312 | "---------------------------\n",
313 | "-----episode 18-----\n",
314 | "theta_2014: [[-0.11610493]\n",
315 | " [ 0.12663731]\n",
316 | " [-0.00131933]\n",
317 | " [ 0.14988723]\n",
318 | " [ 0.03940399]\n",
319 | " [ 0.07725531]]\n",
320 | "theta_2016: [[-0.11610493]\n",
321 | " [ 0.12663731]\n",
322 | " [-0.00131933]\n",
323 | " [ 0.14988723]\n",
324 | " [ 0.03940399]\n",
325 | " [ 0.07725531]]\n",
326 | "error 4.185020385794047e-17\n",
327 | "---------------------------\n",
328 | "-----episode 19-----\n",
329 | "theta_2014: [[-0.10375592]\n",
330 | " [ 0.13537093]\n",
331 | " [ 0.0108122 ]\n",
332 | " [ 0.14988723]\n",
333 | " [ 0.03940399]\n",
334 | " [ 0.07725531]]\n",
335 | "theta_2016: [[-0.10375592]\n",
336 | " [ 0.13537093]\n",
337 | " [ 0.0108122 ]\n",
338 | " [ 0.14988723]\n",
339 | " [ 0.03940399]\n",
340 | " [ 0.07725531]]\n",
341 | "error 4.336808689942018e-17\n",
342 | "---------------------------\n",
343 | "-----episode 20-----\n",
344 | "theta_2014: [[-0.10033798]\n",
345 | " [ 0.13537093]\n",
346 | " [-0.00086441]\n",
347 | " [ 0.16006671]\n",
348 | " [ 0.03940399]\n",
349 | " [ 0.08648275]]\n",
350 | "theta_2016: [[-0.10033798]\n",
351 | " [ 0.13537093]\n",
352 | " [-0.00086441]\n",
353 | " [ 0.16006671]\n",
354 | " [ 0.03940399]\n",
355 | " [ 0.08648275]]\n",
356 | "error 2.959871930885427e-17\n",
357 | "---------------------------\n",
358 | "-----episode 21-----\n",
359 | "theta_2014: [[-0.0881131 ]\n",
360 | " [ 0.13537093]\n",
361 | " [ 0.01156042]\n",
362 | " [ 0.16846604]\n",
363 | " [ 0.03940399]\n",
364 | " [ 0.08648275]]\n",
365 | "theta_2016: [[-0.0881131 ]\n",
366 | " [ 0.13537093]\n",
367 | " [ 0.01156042]\n",
368 | " [ 0.16846604]\n",
369 | " [ 0.03940399]\n",
370 | " [ 0.08648275]]\n",
371 | "error 2.949029909160572e-17\n",
372 | "---------------------------\n",
373 | "-----episode 22-----\n",
374 | "theta_2014: [[-0.09456664]\n",
375 | " [ 0.13537093]\n",
376 | " [ 0.01156042]\n",
377 | " [ 0.17854194]\n",
378 | " [ 0.03940399]\n",
379 | " [ 0.09561792]]\n",
380 | "theta_2016: [[-0.09456664]\n",
381 | " [ 0.13537093]\n",
382 | " [ 0.01156042]\n",
383 | " [ 0.17854194]\n",
384 | " [ 0.03940399]\n",
385 | " [ 0.09561792]]\n",
386 | "error 4.336808689942018e-17\n",
387 | "---------------------------\n",
388 | "-----episode 23-----\n",
389 | "theta_2014: [[-0.07240082]\n",
390 | " [ 0.14535831]\n",
391 | " [ 0.01412296]\n",
392 | " [ 0.17854194]\n",
393 | " [ 0.04900995]\n",
394 | " [ 0.09561792]]\n",
395 | "theta_2016: [[-0.07240082]\n",
396 | " [ 0.14535831]\n",
397 | " [ 0.01412296]\n",
398 | " [ 0.17854194]\n",
399 | " [ 0.04900995]\n",
400 | " [ 0.09561792]]\n",
401 | "error 4.683753385137379e-17\n",
402 | "---------------------------\n",
403 | "-----episode 24-----\n",
404 | "theta_2014: [[-0.06032076]\n",
405 | " [ 0.15533141]\n",
406 | " [ 0.02640811]\n",
407 | " [ 0.17854194]\n",
408 | " [ 0.05851985]\n",
409 | " [ 0.09561792]]\n",
410 | "theta_2016: [[-0.06032076]\n",
411 | " [ 0.15533141]\n",
412 | " [ 0.02640811]\n",
413 | " [ 0.17854194]\n",
414 | " [ 0.05851985]\n",
415 | " [ 0.09561792]]\n",
416 | "error 4.85722573273506e-17\n",
417 | "---------------------------\n",
418 | "-----episode 25-----\n",
419 | "theta_2014: [[-0.04849226]\n",
420 | " [ 0.15533141]\n",
421 | " [ 0.03614403]\n",
422 | " [ 0.17854194]\n",
423 | " [ 0.05851985]\n",
424 | " [ 0.09561792]]\n",
425 | "theta_2016: [[-0.04849226]\n",
426 | " [ 0.15533141]\n",
427 | " [ 0.03614403]\n",
428 | " [ 0.17854194]\n",
429 | " [ 0.05851985]\n",
430 | " [ 0.09561792]]\n",
431 | "error 4.85722573273506e-17\n",
432 | "---------------------------\n",
433 | "-----episode 26-----\n",
434 | "theta_2014: [[-0.0554837 ]\n",
435 | " [ 0.16528951]\n",
436 | " [ 0.03614403]\n",
437 | " [ 0.17854194]\n",
438 | " [ 0.06793465]\n",
439 | " [ 0.09561792]]\n",
440 | "theta_2016: [[-0.0554837 ]\n",
441 | " [ 0.16528951]\n",
442 | " [ 0.03614403]\n",
443 | " [ 0.17854194]\n",
444 | " [ 0.06793465]\n",
445 | " [ 0.09561792]]\n",
446 | "error 4.163336342344337e-17\n",
447 | "---------------------------\n",
448 | "-----episode 27-----\n",
449 | "theta_2014: [[-0.06234805]\n",
450 | " [ 0.16528951]\n",
451 | " [ 0.03614403]\n",
452 | " [ 0.18675652]\n",
453 | " [ 0.06793465]\n",
454 | " [ 0.09561792]]\n",
455 | "theta_2016: [[-0.06234805]\n",
456 | " [ 0.16528951]\n",
457 | " [ 0.03614403]\n",
458 | " [ 0.18675652]\n",
459 | " [ 0.06793465]\n",
460 | " [ 0.09561792]]\n",
461 | "error 4.163336342344337e-17\n",
462 | "---------------------------\n",
463 | "-----episode 28-----\n",
464 | "theta_2014: [[-0.06888822]\n",
465 | " [ 0.16528951]\n",
466 | " [ 0.03614403]\n",
467 | " [ 0.19673091]\n",
468 | " [ 0.06793465]\n",
469 | " [ 0.10466175]]\n",
470 | "theta_2016: [[-0.06888822]\n",
471 | " [ 0.16528951]\n",
472 | " [ 0.03614403]\n",
473 | " [ 0.19673091]\n",
474 | " [ 0.06793465]\n",
475 | " [ 0.10466175]]\n",
476 | "error 4.163336342344337e-17\n",
477 | "---------------------------\n",
478 | "-----episode 29-----\n",
479 | "theta_2014: [[-0.05662785]\n",
480 | " [ 0.17523191]\n",
481 | " [ 0.04840326]\n",
482 | " [ 0.19673091]\n",
483 | " [ 0.07725531]\n",
484 | " [ 0.10466175]]\n",
485 | "theta_2016: [[-0.05662785]\n",
486 | " [ 0.17523191]\n",
487 | " [ 0.04840326]\n",
488 | " [ 0.19673091]\n",
489 | " [ 0.07725531]\n",
490 | " [ 0.10466175]]\n",
491 | "error 4.85722573273506e-17\n",
492 | "---------------------------\n",
493 | "-----episode 30-----\n",
494 | "theta_2014: [[-0.0633441 ]\n",
495 | " [ 0.18515793]\n",
496 | " [ 0.04840326]\n",
497 | " [ 0.19673091]\n",
498 | " [ 0.08648275]\n",
499 | " [ 0.10466175]]\n",
500 | "theta_2016: [[-0.0633441 ]\n",
501 | " [ 0.18515793]\n",
502 | " [ 0.04840326]\n",
503 | " [ 0.19673091]\n",
504 | " [ 0.08648275]\n",
505 | " [ 0.10466175]]\n",
506 | "error 4.163336342344337e-17\n",
507 | "---------------------------\n",
508 | "-----episode 31-----\n",
509 | "theta_2014: [[-0.05128938]\n",
510 | " [ 0.18515793]\n",
511 | " [ 0.05791922]\n",
512 | " [ 0.19673091]\n",
513 | " [ 0.08648275]\n",
514 | " [ 0.10466175]]\n",
515 | "theta_2016: [[-0.05128938]\n",
516 | " [ 0.18515793]\n",
517 | " [ 0.05791922]\n",
518 | " [ 0.19673091]\n",
519 | " [ 0.08648275]\n",
520 | " [ 0.10466175]]\n",
521 | "error 4.85722573273506e-17\n",
522 | "---------------------------\n",
523 | "-----episode 32-----\n",
524 | "theta_2014: [[-0.03927043]\n",
525 | " [ 0.18515793]\n",
526 | " [ 0.06734003]\n",
527 | " [ 0.19673091]\n",
528 | " [ 0.08648275]\n",
529 | " [ 0.10466175]]\n",
530 | "theta_2016: [[-0.03927043]\n",
531 | " [ 0.18515793]\n",
532 | " [ 0.06734003]\n",
533 | " [ 0.19673091]\n",
534 | " [ 0.08648275]\n",
535 | " [ 0.10466175]]\n",
536 | "error 5.551115123125783e-17\n",
537 | "---------------------------\n",
538 | "-----episode 33-----\n",
539 | "theta_2014: [[-0.04594452]\n",
540 | " [ 0.18515793]\n",
541 | " [ 0.06734003]\n",
542 | " [ 0.20668614]\n",
543 | " [ 0.08648275]\n",
544 | " [ 0.11361513]]\n",
545 | "theta_2016: [[-0.04594452]\n",
546 | " [ 0.18515793]\n",
547 | " [ 0.06734003]\n",
548 | " [ 0.20668614]\n",
549 | " [ 0.08648275]\n",
550 | " [ 0.11361513]]\n",
551 | "error 5.551115123125783e-17\n",
552 | "---------------------------\n",
553 | "-----episode 34-----\n",
554 | "theta_2014: [[-0.05245527]\n",
555 | " [ 0.18515793]\n",
556 | " [ 0.06734003]\n",
557 | " [ 0.21662159]\n",
558 | " [ 0.08648275]\n",
559 | " [ 0.12247898]]\n",
560 | "theta_2016: [[-0.05245527]\n",
561 | " [ 0.18515793]\n",
562 | " [ 0.06734003]\n",
563 | " [ 0.21662159]\n",
564 | " [ 0.08648275]\n",
565 | " [ 0.12247898]]\n",
566 | "error 6.245004513516506e-17\n",
567 | "---------------------------\n",
568 | "-----episode 35-----\n",
569 | "theta_2014: [[-0.05901062]\n",
570 | " [ 0.18515793]\n",
571 | " [ 0.06734003]\n",
572 | " [ 0.22445537]\n",
573 | " [ 0.08648275]\n",
574 | " [ 0.12247898]]\n",
575 | "theta_2016: [[-0.05901062]\n",
576 | " [ 0.18515793]\n",
577 | " [ 0.06734003]\n",
578 | " [ 0.22445537]\n",
579 | " [ 0.08648275]\n",
580 | " [ 0.12247898]]\n",
581 | "error 6.245004513516506e-17\n",
582 | "---------------------------\n",
583 | "-----episode 36-----\n",
584 | "theta_2014: [[-0.05579577]\n",
585 | " [ 0.19330636]\n",
586 | " [ 0.05551248]\n",
587 | " [ 0.22445537]\n",
588 | " [ 0.08648275]\n",
589 | " [ 0.12247898]]\n",
590 | "theta_2016: [[-0.05579577]\n",
591 | " [ 0.19330636]\n",
592 | " [ 0.05551248]\n",
593 | " [ 0.22445537]\n",
594 | " [ 0.08648275]\n",
595 | " [ 0.12247898]]\n",
596 | "error 6.245004513516506e-17\n",
597 | "---------------------------\n",
598 | "-----episode 37-----\n",
599 | "theta_2014: [[-0.06235116]\n",
600 | " [ 0.20313385]\n",
601 | " [ 0.05551248]\n",
602 | " [ 0.22445537]\n",
603 | " [ 0.09561792]\n",
604 | " [ 0.12247898]]\n",
605 | "theta_2016: [[-0.06235116]\n",
606 | " [ 0.20313385]\n",
607 | " [ 0.05551248]\n",
608 | " [ 0.22445537]\n",
609 | " [ 0.09561792]\n",
610 | " [ 0.12247898]]\n",
611 | "error 6.938893903907228e-17\n",
612 | "---------------------------\n",
613 | "-----episode 38-----\n",
614 | "theta_2014: [[-0.06873775]\n",
615 | " [ 0.20313385]\n",
616 | " [ 0.05551248]\n",
617 | " [ 0.23221082]\n",
618 | " [ 0.09561792]\n",
619 | " [ 0.12247898]]\n",
620 | "theta_2016: [[-0.06873775]\n",
621 | " [ 0.20313385]\n",
622 | " [ 0.05551248]\n",
623 | " [ 0.23221082]\n",
624 | " [ 0.09561792]\n",
625 | " [ 0.12247898]]\n",
626 | "error 6.938893903907228e-17\n",
627 | "---------------------------\n",
628 | "-----episode 39-----\n",
629 | "theta_2014: [[-0.07478533]\n",
630 | " [ 0.20313385]\n",
631 | " [ 0.05551248]\n",
632 | " [ 0.24197 ]\n",
633 | " [ 0.09561792]\n",
634 | " [ 0.13125419]]\n",
635 | "theta_2016: [[-0.07478533]\n",
636 | " [ 0.20313385]\n",
637 | " [ 0.05551248]\n",
638 | " [ 0.24197 ]\n",
639 | " [ 0.09561792]\n",
640 | " [ 0.13125419]]\n",
641 | "error 6.938893903907228e-17\n",
642 | "---------------------------\n",
643 | "-----episode 40-----\n",
644 | "theta_2014: [[-0.08067773]\n",
645 | " [ 0.20313385]\n",
646 | " [ 0.05551248]\n",
647 | " [ 0.25170977]\n",
648 | " [ 0.09561792]\n",
649 | " [ 0.13994165]]\n",
650 | "theta_2016: [[-0.08067773]\n",
651 | " [ 0.20313385]\n",
652 | " [ 0.05551248]\n",
653 | " [ 0.25170977]\n",
654 | " [ 0.09561792]\n",
655 | " [ 0.13994165]]\n",
656 | "error 4.163336342344337e-17\n",
657 | "---------------------------\n",
658 | "-----episode 41-----\n",
659 | "theta_2014: [[-0.08641677]\n",
660 | " [ 0.20313385]\n",
661 | " [ 0.05551248]\n",
662 | " [ 0.26142956]\n",
663 | " [ 0.09561792]\n",
664 | " [ 0.14854223]]\n",
665 | "theta_2016: [[-0.08641677]\n",
666 | " [ 0.20313385]\n",
667 | " [ 0.05551248]\n",
668 | " [ 0.26142956]\n",
669 | " [ 0.09561792]\n",
670 | " [ 0.14854223]]\n",
671 | "error 2.7755575615628914e-17\n",
672 | "---------------------------\n",
673 | "-----episode 42-----\n",
674 | "theta_2014: [[-0.09223327]\n",
675 | " [ 0.20313385]\n",
676 | " [ 0.05551248]\n",
677 | " [ 0.26881526]\n",
678 | " [ 0.09561792]\n",
679 | " [ 0.14854223]]\n",
680 | "theta_2016: [[-0.09223327]\n",
681 | " [ 0.20313385]\n",
682 | " [ 0.05551248]\n",
683 | " [ 0.26881526]\n",
684 | " [ 0.09561792]\n",
685 | " [ 0.14854223]]\n",
686 | "error 4.163336342344337e-17\n",
687 | "---------------------------\n",
688 | "-----episode 43-----\n",
689 | "theta_2014: [[-0.09832866]\n",
690 | " [ 0.21294447]\n",
691 | " [ 0.05551248]\n",
692 | " [ 0.26881526]\n",
693 | " [ 0.10466175]\n",
694 | " [ 0.14854223]]\n",
695 | "theta_2016: [[-0.09832866]\n",
696 | " [ 0.21294447]\n",
697 | " [ 0.05551248]\n",
698 | " [ 0.26881526]\n",
699 | " [ 0.10466175]\n",
700 | " [ 0.14854223]]\n",
701 | "error 2.7755575615628914e-17\n",
702 | "---------------------------\n",
703 | "-----episode 44-----\n",
704 | "theta_2014: [[-0.08557491]\n",
705 | " [ 0.22081503]\n",
706 | " [ 0.06784469]\n",
707 | " [ 0.26881526]\n",
708 | " [ 0.10466175]\n",
709 | " [ 0.14854223]]\n",
710 | "theta_2016: [[-0.08557491]\n",
711 | " [ 0.22081503]\n",
712 | " [ 0.06784469]\n",
713 | " [ 0.26881526]\n",
714 | " [ 0.10466175]\n",
715 | " [ 0.14854223]]\n",
716 | "error 1.3877787807814457e-17\n",
717 | "---------------------------\n",
718 | "-----episode 45-----\n",
719 | "theta_2014: [[-0.09110498]\n",
720 | " [ 0.22081503]\n",
721 | " [ 0.06784469]\n",
722 | " [ 0.27844062]\n",
723 | " [ 0.10466175]\n",
724 | " [ 0.15705681]]\n",
725 | "theta_2016: [[-0.09110498]\n",
726 | " [ 0.22081503]\n",
727 | " [ 0.06784469]\n",
728 | " [ 0.27844062]\n",
729 | " [ 0.10466175]\n",
730 | " [ 0.15705681]]\n",
731 | "error 2.7755575615628914e-17\n",
732 | "---------------------------\n",
733 | "-----episode 46-----\n",
734 | "theta_2014: [[-0.07859943]\n",
735 | " [ 0.22081503]\n",
736 | " [ 0.07716624]\n",
737 | " [ 0.27844062]\n",
738 | " [ 0.10466175]\n",
739 | " [ 0.15705681]]\n",
740 | "theta_2016: [[-0.07859943]\n",
741 | " [ 0.22081503]\n",
742 | " [ 0.07716624]\n",
743 | " [ 0.27844062]\n",
744 | " [ 0.10466175]\n",
745 | " [ 0.15705681]]\n",
746 | "error 2.7755575615628914e-17\n",
747 | "---------------------------\n",
748 | "-----episode 47-----\n",
749 | "theta_2014: [[-0.06579227]\n",
750 | " [ 0.22081503]\n",
751 | " [ 0.08986549]\n",
752 | " [ 0.28565621]\n",
753 | " [ 0.10466175]\n",
754 | " [ 0.15705681]]\n",
755 | "theta_2016: [[-0.06579227]\n",
756 | " [ 0.22081503]\n",
757 | " [ 0.08986549]\n",
758 | " [ 0.28565621]\n",
759 | " [ 0.10466175]\n",
760 | " [ 0.15705681]]\n",
761 | "error 2.7755575615628914e-17\n",
762 | "---------------------------\n",
763 | "-----episode 48-----\n",
764 | "theta_2014: [[-0.07198655]\n",
765 | " [ 0.23052941]\n",
766 | " [ 0.08986549]\n",
767 | " [ 0.28565621]\n",
768 | " [ 0.11361513]\n",
769 | " [ 0.15705681]]\n",
770 | "theta_2016: [[-0.07198655]\n",
771 | " [ 0.23052941]\n",
772 | " [ 0.08986549]\n",
773 | " [ 0.28565621]\n",
774 | " [ 0.11361513]\n",
775 | " [ 0.15705681]]\n",
776 | "error 2.7755575615628914e-17\n",
777 | "---------------------------\n",
778 | "-----episode 49-----\n",
779 | "theta_2014: [[-0.07749494]\n",
780 | " [ 0.23052941]\n",
781 | " [ 0.08986549]\n",
782 | " [ 0.29518903]\n",
783 | " [ 0.11361513]\n",
784 | " [ 0.16548624]]\n",
785 | "theta_2016: [[-0.07749494]\n",
786 | " [ 0.23052941]\n",
787 | " [ 0.08986549]\n",
788 | " [ 0.29518903]\n",
789 | " [ 0.11361513]\n",
790 | " [ 0.16548624]]\n",
791 | "error 2.7755575615628914e-17\n",
792 | "---------------------------\n",
793 | "-----episode 50-----\n",
794 | "theta_2014: [[-0.08367597]\n",
795 | " [ 0.23822412]\n",
796 | " [ 0.08986549]\n",
797 | " [ 0.29518903]\n",
798 | " [ 0.11361513]\n",
799 | " [ 0.16548624]]\n",
800 | "theta_2016: [[-0.08367597]\n",
801 | " [ 0.23822412]\n",
802 | " [ 0.08986549]\n",
803 | " [ 0.29518903]\n",
804 | " [ 0.11361513]\n",
805 | " [ 0.16548624]]\n",
806 | "error 2.7755575615628914e-17\n",
807 | "---------------------------\n",
808 | "-----episode 51-----\n",
809 | "theta_2014: [[-0.08952841]\n",
810 | " [ 0.24784419]\n",
811 | " [ 0.08986549]\n",
812 | " [ 0.29518903]\n",
813 | " [ 0.12247898]\n",
814 | " [ 0.16548624]]\n",
815 | "theta_2016: [[-0.08952841]\n",
816 | " [ 0.24784419]\n",
817 | " [ 0.08986549]\n",
818 | " [ 0.29518903]\n",
819 | " [ 0.12247898]\n",
820 | " [ 0.16548624]]\n",
821 | "error 2.7755575615628914e-17\n",
822 | "---------------------------\n",
823 | "-----episode 52-----\n",
824 | "theta_2014: [[-0.09501299]\n",
825 | " [ 0.24784419]\n",
826 | " [ 0.08986549]\n",
827 | " [ 0.30223714]\n",
828 | " [ 0.12247898]\n",
829 | " [ 0.16548624]]\n",
830 | "theta_2016: [[-0.09501299]\n",
831 | " [ 0.24784419]\n",
832 | " [ 0.08986549]\n",
833 | " [ 0.30223714]\n",
834 | " [ 0.12247898]\n",
835 | " [ 0.16548624]]\n",
836 | "error 2.7755575615628914e-17\n",
837 | "---------------------------\n",
838 | "-----episode 53-----\n",
839 | "theta_2014: [[-0.08227216]\n",
840 | " [ 0.24784419]\n",
841 | " [ 0.09896683]\n",
842 | " [ 0.30223714]\n",
843 | " [ 0.12247898]\n",
844 | " [ 0.16548624]]\n",
845 | "theta_2016: [[-0.08227216]\n",
846 | " [ 0.24784419]\n",
847 | " [ 0.09896683]\n",
848 | " [ 0.30223714]\n",
849 | " [ 0.12247898]\n",
850 | " [ 0.16548624]]\n",
851 | "error 2.7755575615628914e-17\n",
852 | "---------------------------\n",
853 | "-----episode 54-----\n",
854 | "theta_2014: [[-0.0877665 ]\n",
855 | " [ 0.24784419]\n",
856 | " [ 0.09896683]\n",
857 | " [ 0.30921476]\n",
858 | " [ 0.12247898]\n",
859 | " [ 0.16548624]]\n",
860 | "theta_2016: [[-0.0877665 ]\n",
861 | " [ 0.24784419]\n",
862 | " [ 0.09896683]\n",
863 | " [ 0.30921476]\n",
864 | " [ 0.12247898]\n",
865 | " [ 0.16548624]]\n",
866 | "error 1.3877787807814457e-17\n",
867 | "---------------------------\n",
868 | "-----episode 55-----\n",
869 | "theta_2014: [[-0.09314374]\n",
870 | " [ 0.24784419]\n",
871 | " [ 0.09896683]\n",
872 | " [ 0.31612262]\n",
873 | " [ 0.12247898]\n",
874 | " [ 0.16548624]]\n",
875 | "theta_2016: [[-0.09314374]\n",
876 | " [ 0.24784419]\n",
877 | " [ 0.09896683]\n",
878 | " [ 0.31612262]\n",
879 | " [ 0.12247898]\n",
880 | " [ 0.16548624]]\n",
881 | "error 1.3877787807814457e-17\n",
882 | "---------------------------\n",
883 | "-----episode 56-----\n",
884 | "theta_2014: [[-0.09901401]\n",
885 | " [ 0.25536575]\n",
886 | " [ 0.09896683]\n",
887 | " [ 0.31612262]\n",
888 | " [ 0.12247898]\n",
889 | " [ 0.16548624]]\n",
890 | "theta_2016: [[-0.09901401]\n",
891 | " [ 0.25536575]\n",
892 | " [ 0.09896683]\n",
893 | " [ 0.31612262]\n",
894 | " [ 0.12247898]\n",
895 | " [ 0.16548624]]\n",
896 | "error 1.3877787807814457e-17\n",
897 | "---------------------------\n",
898 | "-----episode 57-----\n",
899 | "theta_2014: [[-0.10421721]\n",
900 | " [ 0.25536575]\n",
901 | " [ 0.09896683]\n",
902 | " [ 0.32296139]\n",
903 | " [ 0.12247898]\n",
904 | " [ 0.16548624]]\n",
905 | "theta_2016: [[-0.10421721]\n",
906 | " [ 0.25536575]\n",
907 | " [ 0.09896683]\n",
908 | " [ 0.32296139]\n",
909 | " [ 0.12247898]\n",
910 | " [ 0.16548624]]\n",
911 | "error 1.3877787807814457e-17\n",
912 | "---------------------------\n",
913 | "-----episode 58-----\n",
914 | "theta_2014: [[-0.09092036]\n",
915 | " [ 0.25536575]\n",
916 | " [ 0.11184475]\n",
917 | " [ 0.32973178]\n",
918 | " [ 0.12247898]\n",
919 | " [ 0.16548624]]\n",
920 | "theta_2016: [[-0.09092036]\n",
921 | " [ 0.25536575]\n",
922 | " [ 0.11184475]\n",
923 | " [ 0.32973178]\n",
924 | " [ 0.12247898]\n",
925 | " [ 0.16548624]]\n",
926 | "error 1.3877787807814457e-17\n",
927 | "---------------------------\n",
928 | "-----episode 59-----\n",
929 | "theta_2014: [[-0.07802462]\n",
930 | " [ 0.25536575]\n",
931 | " [ 0.1207263 ]\n",
932 | " [ 0.32973178]\n",
933 | " [ 0.12247898]\n",
934 | " [ 0.16548624]]\n",
935 | "theta_2016: [[-0.07802462]\n",
936 | " [ 0.25536575]\n",
937 | " [ 0.1207263 ]\n",
938 | " [ 0.32973178]\n",
939 | " [ 0.12247898]\n",
940 | " [ 0.16548624]]\n",
941 | "error 2.7755575615628914e-17\n",
942 | "---------------------------\n",
943 | "-----episode 60-----\n",
944 | "theta_2014: [[-0.08397906]\n",
945 | " [ 0.26281209]\n",
946 | " [ 0.1207263 ]\n",
947 | " [ 0.32973178]\n",
948 | " [ 0.12247898]\n",
949 | " [ 0.16548624]]\n",
950 | "theta_2016: [[-0.08397906]\n",
951 | " [ 0.26281209]\n",
952 | " [ 0.1207263 ]\n",
953 | " [ 0.32973178]\n",
954 | " [ 0.12247898]\n",
955 | " [ 0.16548624]]\n",
956 | "error 1.3877787807814457e-17\n",
957 | "---------------------------\n",
958 | "-----episode 61-----\n",
959 | "theta_2014: [[-0.08960157]\n",
960 | " [ 0.27226525]\n",
961 | " [ 0.1207263 ]\n",
962 | " [ 0.32973178]\n",
963 | " [ 0.13125419]\n",
964 | " [ 0.16548624]]\n",
965 | "theta_2016: [[-0.08960157]\n",
966 | " [ 0.27226525]\n",
967 | " [ 0.1207263 ]\n",
968 | " [ 0.32973178]\n",
969 | " [ 0.13125419]\n",
970 | " [ 0.16548624]]\n",
971 | "error 1.3877787807814457e-17\n",
972 | "---------------------------\n",
973 | "-----episode 62-----\n",
974 | "theta_2014: [[-0.09528967]\n",
975 | " [ 0.2795426 ]\n",
976 | " [ 0.1207263 ]\n",
977 | " [ 0.32973178]\n",
978 | " [ 0.13125419]\n",
979 | " [ 0.16548624]]\n",
980 | "theta_2016: [[-0.09528967]\n",
981 | " [ 0.2795426 ]\n",
982 | " [ 0.1207263 ]\n",
983 | " [ 0.32973178]\n",
984 | " [ 0.13125419]\n",
985 | " [ 0.16548624]]\n",
986 | "error 1.3877787807814457e-17\n",
987 | "---------------------------\n",
988 | "-----episode 63-----\n",
989 | "theta_2014: [[-0.10040886]\n",
990 | " [ 0.2795426 ]\n",
991 | " [ 0.1207263 ]\n",
992 | " [ 0.33643446]\n",
993 | " [ 0.13125419]\n",
994 | " [ 0.16548624]]\n",
995 | "theta_2016: [[-0.10040886]\n",
996 | " [ 0.2795426 ]\n",
997 | " [ 0.1207263 ]\n",
998 | " [ 0.33643446]\n",
999 | " [ 0.13125419]\n",
1000 | " [ 0.16548624]]\n",
1001 | "error 1.3877787807814457e-17\n",
1002 | "---------------------------\n",
1003 | "-----episode 64-----\n",
1004 | "theta_2014: [[-0.09457365]\n",
1005 | " [ 0.2795426 ]\n",
1006 | " [ 0.10813787]\n",
1007 | " [ 0.34307011]\n",
1008 | " [ 0.13125419]\n",
1009 | " [ 0.16548624]]\n",
1010 | "theta_2016: [[-0.09457365]\n",
1011 | " [ 0.2795426 ]\n",
1012 | " [ 0.10813787]\n",
1013 | " [ 0.34307011]\n",
1014 | " [ 0.13125419]\n",
1015 | " [ 0.16548624]]\n",
1016 | "error 1.3877787807814457e-17\n",
1017 | "---------------------------\n",
1018 | "-----episode 65-----\n",
1019 | "theta_2014: [[-0.10014719]\n",
1020 | " [ 0.28674718]\n",
1021 | " [ 0.10813787]\n",
1022 | " [ 0.34307011]\n",
1023 | " [ 0.13125419]\n",
1024 | " [ 0.16548624]]\n",
1025 | "theta_2016: [[-0.10014719]\n",
1026 | " [ 0.28674718]\n",
1027 | " [ 0.10813787]\n",
1028 | " [ 0.34307011]\n",
1029 | " [ 0.13125419]\n",
1030 | " [ 0.16548624]]\n",
1031 | "error 2.7755575615628914e-17\n",
1032 | "---------------------------\n",
1033 | "-----episode 66-----\n",
1034 | "theta_2014: [[-0.08719221]\n",
1035 | " [ 0.28674718]\n",
1036 | " [ 0.11705649]\n",
1037 | " [ 0.34307011]\n",
1038 | " [ 0.13125419]\n",
1039 | " [ 0.16548624]]\n",
1040 | "theta_2016: [[-0.08719221]\n",
1041 | " [ 0.28674718]\n",
1042 | " [ 0.11705649]\n",
1043 | " [ 0.34307011]\n",
1044 | " [ 0.13125419]\n",
1045 | " [ 0.16548624]]\n",
1046 | "error 2.7755575615628914e-17\n",
1047 | "---------------------------\n",
1048 | "-----episode 67-----\n",
1049 | "theta_2014: [[-0.09256158]\n",
1050 | " [ 0.29603918]\n",
1051 | " [ 0.11705649]\n",
1052 | " [ 0.34307011]\n",
1053 | " [ 0.13994165]\n",
1054 | " [ 0.16548624]]\n",
1055 | "theta_2016: [[-0.09256158]\n",
1056 | " [ 0.29603918]\n",
1057 | " [ 0.11705649]\n",
1058 | " [ 0.34307011]\n",
1059 | " [ 0.13994165]\n",
1060 | " [ 0.16548624]]\n",
1061 | "error 4.163336342344337e-17\n",
1062 | "---------------------------\n",
1063 | "-----episode 68-----\n",
1064 | "theta_2014: [[-0.0977868 ]\n",
1065 | " [ 0.30531567]\n",
1066 | " [ 0.11705649]\n",
1067 | " [ 0.34307011]\n",
1068 | " [ 0.14854223]\n",
1069 | " [ 0.16548624]]\n",
1070 | "theta_2016: [[-0.0977868 ]\n",
1071 | " [ 0.30531567]\n",
1072 | " [ 0.11705649]\n",
1073 | " [ 0.34307011]\n",
1074 | " [ 0.14854223]\n",
1075 | " [ 0.16548624]]\n",
1076 | "error 2.7755575615628914e-17\n",
1077 | "---------------------------\n",
1078 | "-----episode 69-----\n",
1079 | "theta_2014: [[-0.10286954]\n",
1080 | " [ 0.31457602]\n",
1081 | " [ 0.11705649]\n",
1082 | " [ 0.34307011]\n",
1083 | " [ 0.15705681]\n",
1084 | " [ 0.16548624]]\n",
1085 | "theta_2016: [[-0.10286954]\n",
1086 | " [ 0.31457602]\n",
1087 | " [ 0.11705649]\n",
1088 | " [ 0.34307011]\n",
1089 | " [ 0.15705681]\n",
1090 | " [ 0.16548624]]\n",
1091 | "error 2.7755575615628914e-17\n",
1092 | "---------------------------\n",
1093 | "-----episode 70-----\n",
1094 | "theta_2014: [[-0.08980787]\n",
1095 | " [ 0.31457602]\n",
1096 | " [ 0.12588592]\n",
1097 | " [ 0.34307011]\n",
1098 | " [ 0.15705681]\n",
1099 | " [ 0.16548624]]\n",
1100 | "theta_2016: [[-0.08980787]\n",
1101 | " [ 0.31457602]\n",
1102 | " [ 0.12588592]\n",
1103 | " [ 0.34307011]\n",
1104 | " [ 0.15705681]\n",
1105 | " [ 0.16548624]]\n",
1106 | "error 2.7755575615628914e-17\n",
1107 | "---------------------------\n",
1108 | "-----episode 71-----\n",
1109 | "theta_2014: [[-0.07639752]\n",
1110 | " [ 0.31457602]\n",
1111 | " [ 0.13867382]\n",
1112 | " [ 0.34963941]\n",
1113 | " [ 0.15705681]\n",
1114 | " [ 0.16548624]]\n",
1115 | "theta_2016: [[-0.07639752]\n",
1116 | " [ 0.31457602]\n",
1117 | " [ 0.13867382]\n",
1118 | " [ 0.34963941]\n",
1119 | " [ 0.15705681]\n",
1120 | " [ 0.16548624]]\n",
1121 | "error 2.7755575615628914e-17\n",
1122 | "---------------------------\n",
1123 | "-----episode 72-----\n",
1124 | "theta_2014: [[-0.08160412]\n",
1125 | " [ 0.32381964]\n",
1126 | " [ 0.13867382]\n",
1127 | " [ 0.34963941]\n",
1128 | " [ 0.16548624]\n",
1129 | " [ 0.16548624]]\n",
1130 | "theta_2016: [[-0.08160412]\n",
1131 | " [ 0.32381964]\n",
1132 | " [ 0.13867382]\n",
1133 | " [ 0.34963941]\n",
1134 | " [ 0.16548624]\n",
1135 | " [ 0.16548624]]\n",
1136 | "error 2.7755575615628914e-17\n",
1137 | "---------------------------\n",
1138 | "-----episode 73-----\n",
1139 | "theta_2014: [[-0.08691285]\n",
1140 | " [ 0.33058144]\n",
1141 | " [ 0.13867382]\n",
1142 | " [ 0.34963941]\n",
1143 | " [ 0.16548624]\n",
1144 | " [ 0.16548624]]\n",
1145 | "theta_2016: [[-0.08691285]\n",
1146 | " [ 0.33058144]\n",
1147 | " [ 0.13867382]\n",
1148 | " [ 0.34963941]\n",
1149 | " [ 0.16548624]\n",
1150 | " [ 0.16548624]]\n",
1151 | "error 4.163336342344337e-17\n",
1152 | "---------------------------\n",
1153 | "-----episode 74-----\n",
1154 | "theta_2014: [[-0.09169445]\n",
1155 | " [ 0.33058144]\n",
1156 | " [ 0.13867382]\n",
1157 | " [ 0.3586075 ]\n",
1158 | " [ 0.16548624]\n",
1159 | " [ 0.17383138]]\n",
1160 | "theta_2016: [[-0.09169445]\n",
1161 | " [ 0.33058144]\n",
1162 | " [ 0.13867382]\n",
1163 | " [ 0.3586075 ]\n",
1164 | " [ 0.16548624]\n",
1165 | " [ 0.17383138]]\n",
1166 | "error 4.163336342344337e-17\n",
1167 | "---------------------------\n",
1168 | "-----episode 75-----\n",
1169 | "theta_2014: [[-0.09684202]\n",
1170 | " [ 0.33727563]\n",
1171 | " [ 0.13867382]\n",
1172 | " [ 0.3586075 ]\n",
1173 | " [ 0.16548624]\n",
1174 | " [ 0.17383138]]\n",
1175 | "theta_2016: [[-0.09684202]\n",
1176 | " [ 0.33727563]\n",
1177 | " [ 0.13867382]\n",
1178 | " [ 0.3586075 ]\n",
1179 | " [ 0.16548624]\n",
1180 | " [ 0.17383138]]\n",
1181 | "error 4.163336342344337e-17\n",
1182 | "---------------------------\n",
1183 | "-----episode 76-----\n",
1184 | "theta_2014: [[-0.10187848]\n",
1185 | " [ 0.34390287]\n",
1186 | " [ 0.13867382]\n",
1187 | " [ 0.3586075 ]\n",
1188 | " [ 0.16548624]\n",
1189 | " [ 0.17383138]]\n",
1190 | "theta_2016: [[-0.10187848]\n",
1191 | " [ 0.34390287]\n",
1192 | " [ 0.13867382]\n",
1193 | " [ 0.3586075 ]\n",
1194 | " [ 0.16548624]\n",
1195 | " [ 0.17383138]]\n",
1196 | "error 4.163336342344337e-17\n",
1197 | "---------------------------\n",
1198 | "-----episode 77-----\n",
1199 | "theta_2014: [[-0.10642315]\n",
1200 | " [ 0.34390287]\n",
1201 | " [ 0.13867382]\n",
1202 | " [ 0.36756026]\n",
1203 | " [ 0.16548624]\n",
1204 | " [ 0.18209306]]\n",
1205 | "theta_2016: [[-0.10642315]\n",
1206 | " [ 0.34390287]\n",
1207 | " [ 0.13867382]\n",
1208 | " [ 0.36756026]\n",
1209 | " [ 0.16548624]\n",
1210 | " [ 0.18209306]]\n",
1211 | "error 4.163336342344337e-17\n",
1212 | "---------------------------\n",
1213 | "-----episode 78-----\n",
1214 | "theta_2014: [[-0.11130475]\n",
1215 | " [ 0.35046384]\n",
1216 | " [ 0.13867382]\n",
1217 | " [ 0.36756026]\n",
1218 | " [ 0.16548624]\n",
1219 | " [ 0.18209306]]\n",
1220 | "theta_2016: [[-0.11130475]\n",
1221 | " [ 0.35046384]\n",
1222 | " [ 0.13867382]\n",
1223 | " [ 0.36756026]\n",
1224 | " [ 0.16548624]\n",
1225 | " [ 0.18209306]]\n",
1226 | "error 4.163336342344337e-17\n",
1227 | "---------------------------\n",
1228 | "-----episode 79-----\n",
1229 | "theta_2014: [[-0.1156681 ]\n",
1230 | " [ 0.35046384]\n",
1231 | " [ 0.13867382]\n",
1232 | " [ 0.37649711]\n",
1233 | " [ 0.16548624]\n",
1234 | " [ 0.19027213]]\n",
1235 | "theta_2016: [[-0.1156681 ]\n",
1236 | " [ 0.35046384]\n",
1237 | " [ 0.13867382]\n",
1238 | " [ 0.37649711]\n",
1239 | " [ 0.16548624]\n",
1240 | " [ 0.19027213]]\n",
1241 | "error 4.163336342344337e-17\n",
1242 | "---------------------------\n",
1243 | "-----episode 80-----\n",
1244 | "theta_2014: [[-0.12039879]\n",
1245 | " [ 0.3569592 ]\n",
1246 | " [ 0.13867382]\n",
1247 | " [ 0.37649711]\n",
1248 | " [ 0.16548624]\n",
1249 | " [ 0.19027213]]\n",
1250 | "theta_2016: [[-0.12039879]\n",
1251 | " [ 0.3569592 ]\n",
1252 | " [ 0.13867382]\n",
1253 | " [ 0.37649711]\n",
1254 | " [ 0.16548624]\n",
1255 | " [ 0.19027213]]\n",
1256 | "error 2.7755575615628914e-17\n",
1257 | "---------------------------\n",
1258 | "-----episode 81-----\n",
1259 | "theta_2014: [[-0.12458437]\n",
1260 | " [ 0.3569592 ]\n",
1261 | " [ 0.13867382]\n",
1262 | " [ 0.38541746]\n",
1263 | " [ 0.16548624]\n",
1264 | " [ 0.19836941]]\n",
1265 | "theta_2016: [[-0.12458437]\n",
1266 | " [ 0.3569592 ]\n",
1267 | " [ 0.13867382]\n",
1268 | " [ 0.38541746]\n",
1269 | " [ 0.16548624]\n",
1270 | " [ 0.19836941]]\n",
1271 | "error 5.551115123125783e-17\n",
1272 | "---------------------------\n",
1273 | "-----episode 82-----\n",
1274 | "theta_2014: [[-0.12864146]\n",
1275 | " [ 0.3569592 ]\n",
1276 | " [ 0.13867382]\n",
1277 | " [ 0.39432076]\n",
1278 | " [ 0.16548624]\n",
1279 | " [ 0.20638572]]\n",
1280 | "theta_2016: [[-0.12864146]\n",
1281 | " [ 0.3569592 ]\n",
1282 | " [ 0.13867382]\n",
1283 | " [ 0.39432076]\n",
1284 | " [ 0.16548624]\n",
1285 | " [ 0.20638572]]\n",
1286 | "error 5.551115123125783e-17\n",
1287 | "---------------------------\n",
1288 | "-----episode 83-----\n",
1289 | "theta_2014: [[-0.13257159]\n",
1290 | " [ 0.3569592 ]\n",
1291 | " [ 0.13867382]\n",
1292 | " [ 0.40320645]\n",
1293 | " [ 0.16548624]\n",
1294 | " [ 0.21432186]]\n",
1295 | "theta_2016: [[-0.13257159]\n",
1296 | " [ 0.3569592 ]\n",
1297 | " [ 0.13867382]\n",
1298 | " [ 0.40320645]\n",
1299 | " [ 0.16548624]\n",
1300 | " [ 0.21432186]]\n",
1301 | "error 5.551115123125783e-17\n",
1302 | "---------------------------\n",
1303 | "-----episode 84-----\n",
1304 | "theta_2014: [[-0.13683138]\n",
1305 | " [ 0.36585409]\n",
1306 | " [ 0.13867382]\n",
1307 | " [ 0.40320645]\n",
1308 | " [ 0.17383138]\n",
1309 | " [ 0.21432186]]\n",
1310 | "theta_2016: [[-0.13683138]\n",
1311 | " [ 0.36585409]\n",
1312 | " [ 0.13867382]\n",
1313 | " [ 0.40320645]\n",
1314 | " [ 0.17383138]\n",
1315 | " [ 0.21432186]]\n",
1316 | "error 5.551115123125783e-17\n",
1317 | "---------------------------\n",
1318 | "-----episode 85-----\n",
1319 | "theta_2014: [[-0.12954367]\n",
1320 | " [ 0.36585409]\n",
1321 | " [ 0.12564511]\n",
1322 | " [ 0.40917439]\n",
1323 | " [ 0.17383138]\n",
1324 | " [ 0.21432186]]\n",
1325 | "theta_2016: [[-0.12954367]\n",
1326 | " [ 0.36585409]\n",
1327 | " [ 0.12564511]\n",
1328 | " [ 0.40917439]\n",
1329 | " [ 0.17383138]\n",
1330 | " [ 0.21432186]]\n",
1331 | "error 5.551115123125783e-17\n",
1332 | "---------------------------\n",
1333 | "-----episode 86-----\n",
1334 | "theta_2014: [[-0.13361249]\n",
1335 | " [ 0.36585409]\n",
1336 | " [ 0.12564511]\n",
1337 | " [ 0.41508264]\n",
1338 | " [ 0.17383138]\n",
1339 | " [ 0.21432186]]\n",
1340 | "theta_2016: [[-0.13361249]\n",
1341 | " [ 0.36585409]\n",
1342 | " [ 0.12564511]\n",
1343 | " [ 0.41508264]\n",
1344 | " [ 0.17383138]\n",
1345 | " [ 0.21432186]]\n",
1346 | "error 5.551115123125783e-17\n",
1347 | "---------------------------\n",
1348 | "-----episode 87-----\n",
1349 | "theta_2014: [[-0.11970272]\n",
1350 | " [ 0.36585409]\n",
1351 | " [ 0.13907705]\n",
1352 | " [ 0.42093182]\n",
1353 | " [ 0.17383138]\n",
1354 | " [ 0.21432186]]\n",
1355 | "theta_2016: [[-0.11970272]\n",
1356 | " [ 0.36585409]\n",
1357 | " [ 0.13907705]\n",
1358 | " [ 0.42093182]\n",
1359 | " [ 0.17383138]\n",
1360 | " [ 0.21432186]]\n",
1361 | "error 5.551115123125783e-17\n",
1362 | "---------------------------\n",
1363 | "-----episode 88-----\n",
1364 | "theta_2014: [[-0.1058072 ]\n",
1365 | " [ 0.36585409]\n",
1366 | " [ 0.15242678]\n",
1367 | " [ 0.4267225 ]\n",
1368 | " [ 0.17383138]\n",
1369 | " [ 0.21432186]]\n",
1370 | "theta_2016: [[-0.1058072 ]\n",
1371 | " [ 0.36585409]\n",
1372 | " [ 0.15242678]\n",
1373 | " [ 0.4267225 ]\n",
1374 | " [ 0.17383138]\n",
1375 | " [ 0.21432186]]\n",
1376 | "error 5.551115123125783e-17\n",
1377 | "---------------------------\n",
1378 | "-----episode 89-----\n",
1379 | "theta_2014: [[-0.09192659]\n",
1380 | " [ 0.36585409]\n",
1381 | " [ 0.16569461]\n",
1382 | " [ 0.43245527]\n",
1383 | " [ 0.17383138]\n",
1384 | " [ 0.21432186]]\n",
1385 | "theta_2016: [[-0.09192659]\n",
1386 | " [ 0.36585409]\n",
1387 | " [ 0.16569461]\n",
1388 | " [ 0.43245527]\n",
1389 | " [ 0.17383138]\n",
1390 | " [ 0.21432186]]\n",
1391 | "error 4.163336342344337e-17\n",
1392 | "---------------------------\n",
1393 | "-----episode 90-----\n",
1394 | "theta_2014: [[-0.09675757]\n",
1395 | " [ 0.37219555]\n",
1396 | " [ 0.16569461]\n",
1397 | " [ 0.43245527]\n",
1398 | " [ 0.17383138]\n",
1399 | " [ 0.21432186]]\n",
1400 | "theta_2016: [[-0.09675757]\n",
1401 | " [ 0.37219555]\n",
1402 | " [ 0.16569461]\n",
1403 | " [ 0.43245527]\n",
1404 | " [ 0.17383138]\n",
1405 | " [ 0.21432186]]\n",
1406 | "error 4.163336342344337e-17\n",
1407 | "---------------------------\n",
1408 | "-----episode 91-----\n",
1409 | "theta_2014: [[-0.10065975]\n",
1410 | " [ 0.37219555]\n",
1411 | " [ 0.16569461]\n",
1412 | " [ 0.44103033]\n",
1413 | " [ 0.17383138]\n",
1414 | " [ 0.22217864]]\n",
1415 | "theta_2016: [[-0.10065975]\n",
1416 | " [ 0.37219555]\n",
1417 | " [ 0.16569461]\n",
1418 | " [ 0.44103033]\n",
1419 | " [ 0.17383138]\n",
1420 | " [ 0.22217864]]\n",
1421 | "error 4.163336342344337e-17\n",
1422 | "---------------------------\n",
1423 | "-----episode 92-----\n",
1424 | "theta_2014: [[-0.10509555]\n",
1425 | " [ 0.38101244]\n",
1426 | " [ 0.16569461]\n",
1427 | " [ 0.44103033]\n",
1428 | " [ 0.18209306]\n",
1429 | " [ 0.22217864]]\n",
1430 | "theta_2016: [[-0.10509555]\n",
1431 | " [ 0.38101244]\n",
1432 | " [ 0.16569461]\n",
1433 | " [ 0.44103033]\n",
1434 | " [ 0.18209306]\n",
1435 | " [ 0.22217864]]\n",
1436 | "error 4.163336342344337e-17\n",
1437 | "---------------------------\n",
1438 | "-----episode 93-----\n",
1439 | "theta_2014: [[-0.10965977]\n",
1440 | " [ 0.38720231]\n",
1441 | " [ 0.16569461]\n",
1442 | " [ 0.44103033]\n",
1443 | " [ 0.18209306]\n",
1444 | " [ 0.22217864]]\n",
1445 | "theta_2016: [[-0.10965977]\n",
1446 | " [ 0.38720231]\n",
1447 | " [ 0.16569461]\n",
1448 | " [ 0.44103033]\n",
1449 | " [ 0.18209306]\n",
1450 | " [ 0.22217864]]\n",
1451 | "error 5.551115123125783e-17\n",
1452 | "---------------------------\n",
1453 | "-----episode 94-----\n",
1454 | "theta_2014: [[-0.1141232 ]\n",
1455 | " [ 0.39333029]\n",
1456 | " [ 0.16569461]\n",
1457 | " [ 0.44103033]\n",
1458 | " [ 0.18209306]\n",
1459 | " [ 0.22217864]]\n",
1460 | "theta_2016: [[-0.1141232 ]\n",
1461 | " [ 0.39333029]\n",
1462 | " [ 0.16569461]\n",
1463 | " [ 0.44103033]\n",
1464 | " [ 0.18209306]\n",
1465 | " [ 0.22217864]]\n",
1466 | "error 5.551115123125783e-17\n",
1467 | "---------------------------\n",
1468 | "-----episode 95-----\n",
1469 | "theta_2014: [[-0.11822876]\n",
1470 | " [ 0.40200943]\n",
1471 | " [ 0.16569461]\n",
1472 | " [ 0.44103033]\n",
1473 | " [ 0.19027213]\n",
1474 | " [ 0.22217864]]\n",
1475 | "theta_2016: [[-0.11822876]\n",
1476 | " [ 0.40200943]\n",
1477 | " [ 0.16569461]\n",
1478 | " [ 0.44103033]\n",
1479 | " [ 0.19027213]\n",
1480 | " [ 0.22217864]]\n",
1481 | "error 5.551115123125783e-17\n",
1482 | "---------------------------\n",
1483 | "-----episode 96-----\n",
1484 | "theta_2014: [[-0.12247457]\n",
1485 | " [ 0.40798934]\n",
1486 | " [ 0.16569461]\n",
1487 | " [ 0.44103033]\n",
1488 | " [ 0.19027213]\n",
1489 | " [ 0.22217864]]\n",
1490 | "theta_2016: [[-0.12247457]\n",
1491 | " [ 0.40798934]\n",
1492 | " [ 0.16569461]\n",
1493 | " [ 0.44103033]\n",
1494 | " [ 0.19027213]\n",
1495 | " [ 0.22217864]]\n",
1496 | "error 5.551115123125783e-17\n",
1497 | "---------------------------\n",
1498 | "-----episode 97-----\n",
1499 | "theta_2014: [[-0.10878349]\n",
1500 | " [ 0.40798934]\n",
1501 | " [ 0.17403766]\n",
1502 | " [ 0.44103033]\n",
1503 | " [ 0.19027213]\n",
1504 | " [ 0.22217864]]\n",
1505 | "theta_2016: [[-0.10878349]\n",
1506 | " [ 0.40798934]\n",
1507 | " [ 0.17403766]\n",
1508 | " [ 0.44103033]\n",
1509 | " [ 0.19027213]\n",
1510 | " [ 0.22217864]]\n",
1511 | "error 5.551115123125783e-17\n",
1512 | "---------------------------\n",
1513 | "-----episode 98-----\n",
1514 | "theta_2014: [[-0.11277607]\n",
1515 | " [ 0.40798934]\n",
1516 | " [ 0.17403766]\n",
1517 | " [ 0.44662002]\n",
1518 | " [ 0.19027213]\n",
1519 | " [ 0.22217864]]\n",
1520 | "theta_2016: [[-0.11277607]\n",
1521 | " [ 0.40798934]\n",
1522 | " [ 0.17403766]\n",
1523 | " [ 0.44662002]\n",
1524 | " [ 0.19027213]\n",
1525 | " [ 0.22217864]]\n",
1526 | "error 6.938893903907228e-17\n",
1527 | "---------------------------\n",
1528 | "-----episode 99-----\n",
1529 | "theta_2014: [[-0.09858656]\n",
1530 | " [ 0.40798934]\n",
1531 | " [ 0.18756066]\n",
1532 | " [ 0.45512344]\n",
1533 | " [ 0.19027213]\n",
1534 | " [ 0.22995685]]\n",
1535 | "theta_2016: [[-0.09858656]\n",
1536 | " [ 0.40798934]\n",
1537 | " [ 0.18756066]\n",
1538 | " [ 0.45512344]\n",
1539 | " [ 0.19027213]\n",
1540 | " [ 0.22995685]]\n",
1541 | "error 6.938893903907228e-17\n",
1542 | "---------------------------\n"
1543 | ]
1544 | }
1545 | ],
1546 | "source": [
1547 | "EPISODES = 100\n",
1548 | "gamma = 0.99\n",
1549 | "alpha = 0.01\n",
1550 | "_lambda = 0.1\n",
1551 | "for episode in range(EPISODES):\n",
1552 | " e_2014 = np.array([[0],[0],[0],[0],[0],[0]])\n",
1553 | " e_2016 = np.array([[0],[0],[0],[0],[0],[0]])\n",
1554 | "\n",
1555 | " S = 'A'\n",
1556 | " # 2014\n",
1557 | " v_s = theta_2014.T@feature_map['A']\n",
1558 | " # 2016\n",
1559 | " V_old = 0\n",
1560 | " while True:\n",
1561 | " if S == 'T':\n",
1562 | " print(f'-----episode {episode}-----')\n",
1563 | " print(f'theta_2014: {theta_2014}')\n",
1564 | " print(f'theta_2016: {theta_2016}')\n",
1565 | " print('error ',np.sum(abs(theta_2014 - theta_2016)))\n",
1566 | " print(f'---------------------------')\n",
1567 | " break\n",
1568 | "\n",
1569 | " random_choice = np.random.choice(len(state_map[S]))\n",
1570 | " next_S, R = state_map[S][random_choice]\n",
1571 | " # 2014\n",
1572 | " v_next_s = theta_2014.T@feature_map[next_S]\n",
1573 | " delta_2014 = R + gamma*v_next_s - v_s\n",
1574 | " e_2014 = gamma*_lambda*e_2014 + alpha*(1-gamma*_lambda*e_2014.T@feature_map[S])*feature_map[S]\n",
1575 | " theta_2014 = theta_2014 + delta_2014*e_2014 + alpha*(v_s - theta_2014.T@feature_map[S])*feature_map[S]\n",
1576 | "\n",
1577 | " # 2016\n",
1578 | " V = theta_2016.T@feature_map[S]\n",
1579 | " V_prime = theta_2016.T@feature_map[next_S]\n",
1580 | " delta_2016 = R + gamma*V_prime - V\n",
1581 | " e_2016 = gamma*_lambda*e_2016 + feature_map[S] - alpha*gamma*_lambda * (e_2016.T@feature_map[S]) * feature_map[S]\n",
1582 | " theta_2016 = theta_2016 + alpha*(delta_2016 + V - V_old)*e_2016 - alpha*(V-V_old)*feature_map[S]\n",
1583 | " V_old = V_prime\n",
1584 | "\n",
1585 | " # 2014\n",
1586 | " v_s = v_next_s\n",
1587 | "\n",
1588 | " # 2014 & 2016\n",
1589 | " S = next_S\n"
1590 | ],
1591 | "metadata": {
1592 | "collapsed": false,
1593 | "pycharm": {
1594 | "name": "#%%\n"
1595 | }
1596 | }
1597 | },
1598 | {
1599 | "cell_type": "code",
1600 | "execution_count": null,
1601 | "outputs": [],
1602 | "source": [
1603 | "\n"
1604 | ],
1605 | "metadata": {
1606 | "collapsed": false,
1607 | "pycharm": {
1608 | "name": "#%%\n"
1609 | }
1610 | }
1611 | }
1612 | ],
1613 | "metadata": {
1614 | "kernelspec": {
1615 | "display_name": "Python 3",
1616 | "language": "python",
1617 | "name": "python3"
1618 | },
1619 | "language_info": {
1620 | "codemirror_mode": {
1621 | "name": "ipython",
1622 | "version": 2
1623 | },
1624 | "file_extension": ".py",
1625 | "mimetype": "text/x-python",
1626 | "name": "python",
1627 | "nbconvert_exporter": "python",
1628 | "pygments_lexer": "ipython2",
1629 | "version": "2.7.6"
1630 | }
1631 | },
1632 | "nbformat": 4,
1633 | "nbformat_minor": 0
1634 | }
--------------------------------------------------------------------------------
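For reference, here is a compact standalone restatement of the true online TD(λ) update that the notebook above implements in its 2016 form (the 2014 form it compares against is algebraically equivalent, which is what the printed errors on the order of 1e-17 confirm). This is a sketch, not code from the notebook; the function name and the `phi`/`phi_next` arguments are illustrative.

```python
def true_online_td_step(theta, e, V_old, phi, phi_next, R, alpha, gamma, lam):
    """One transition of true online TD(lambda) with dutch traces (2016 formulation).

    theta, e, phi, phi_next are NumPy column vectors, as in the notebook above.
    """
    V = (theta.T @ phi).item()            # v_hat(S) under the current weights
    V_next = (theta.T @ phi_next).item()  # v_hat(S')
    delta = R + gamma * V_next - V        # TD error
    # dutch eligibility trace
    e = gamma * lam * e + phi - alpha * gamma * lam * (e.T @ phi).item() * phi
    # weight update with the (V - V_old) correction that makes the method "true online"
    theta = theta + alpha * (delta + V - V_old) * e - alpha * (V - V_old) * phi
    return theta, e, V_next               # V_next becomes V_old at the next step
```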
/MASM/Differential_semi_gradient_Sarsa.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 83,
6 | "metadata": {
7 | "collapsed": true,
8 | "pycharm": {
9 | "is_executing": false
10 | }
11 | },
12 | "outputs": [],
13 | "source": [
14 | "import numpy as np\n",
15 | "class State:\n",
16 | " def __init__(self, name, value):\n",
17 | " self.name = name\n",
18 | " self.value = value\n",
19 | "A = State('A', np.array([[1,0,0]]))\n",
20 | "B = State('B',np.array([[0,1,0]]))\n",
21 | "C = State('C',np.array([[0,0,1]]))\n",
22 | "\n",
23 | "w = np.random.random((1,3))\n",
24 | "R = {'A':1, 'B':0, 'C':0}\n",
25 | "policy = {'A':B, 'B':C, 'C':A}"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 84,
31 | "outputs": [
32 | {
33 | "name": "stdout",
34 | "text": [
35 | "[[4.15223411e-14]]\n",
36 | "[[1.38777878e-14]]\n",
37 | "[[-5.54001289e-14]]\n",
38 | "[[4.15223411e-14]]\n",
39 | "[[1.38777878e-14]]\n",
40 | "[[-5.54001289e-14]]\n",
41 | "[[4.15223411e-14]]\n",
42 | "[[1.38777878e-14]]\n",
43 | "[[-5.54001289e-14]]\n",
44 | "[[4.15223411e-14]]\n",
45 | "[[1.38777878e-14]]\n",
46 | "[[-5.54001289e-14]]\n",
47 | "[[4.15223411e-14]]\n",
48 | "[[1.38777878e-14]]\n",
49 | "[[-5.54001289e-14]]\n",
50 | "[[4.15223411e-14]]\n",
51 | "[[1.38777878e-14]]\n",
52 | "[[-5.54001289e-14]]\n",
53 | "[[4.15223411e-14]]\n",
54 | "[[1.38777878e-14]]\n",
55 | "[[-5.54001289e-14]]\n",
56 | "[[4.15223411e-14]]\n",
57 | "[[1.38777878e-14]]\n",
58 | "[[-5.54001289e-14]]\n",
59 | "[[4.15223411e-14]]\n",
60 | "[[1.38777878e-14]]\n",
61 | "[[-5.54001289e-14]]\n",
62 | "[[4.15223411e-14]]\n",
63 | "[[1.38777878e-14]]\n"
64 | ],
65 | "output_type": "stream"
66 | }
67 | ],
68 | "source": [
69 | "S = A\n",
70 | "R_bar = 0\n",
71 | "limit = 100000\n",
72 | "for step in range(limit):\n",
73 | " S_prime = policy[S.name]\n",
74 | "    delta = R[S_prime.name] - R_bar + S_prime.value@w.T - S.value@w.T  # differential TD error\n",
75 | "    R_bar += 0.001*delta  # update the average-reward estimate R_bar\n",
76 | "    w += 0.001*delta*S.value  # semi-gradient update of the linear weights\n",
77 | " S = S_prime\n",
78 | " if step > limit -30:\n",
79 | " print(delta)"
80 | ],
81 | "metadata": {
82 | "collapsed": false,
83 | "pycharm": {
84 | "name": "#%%\n",
85 | "is_executing": false
86 | }
87 | }
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 85,
92 | "outputs": [
93 | {
94 | "name": "stdout",
95 | "text": [
96 | "[array([[0.17260156]]), array([[0.50593489]]), array([[0.83926822]])]\n"
97 | ],
98 | "output_type": "stream"
99 | }
100 | ],
101 | "source": [
102 | "result = [S.value@w.T for S in [A,B,C]]\n",
103 | "print(result)"
104 | ],
105 | "metadata": {
106 | "collapsed": false,
107 | "pycharm": {
108 | "name": "#%%\n",
109 | "is_executing": false
110 | }
111 | }
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "outputs": [],
117 | "source": [
118 | "\n"
119 | ],
120 | "metadata": {
121 | "collapsed": false,
122 | "pycharm": {
123 | "name": "#%%\n"
124 | }
125 | }
126 | }
127 | ],
128 | "metadata": {
129 | "kernelspec": {
130 | "display_name": "Python 3",
131 | "language": "python",
132 | "name": "python3"
133 | },
134 | "language_info": {
135 | "codemirror_mode": {
136 | "name": "ipython",
137 | "version": 2
138 | },
139 | "file_extension": ".py",
140 | "mimetype": "text/x-python",
141 | "name": "python",
142 | "nbconvert_exporter": "python",
143 | "pygments_lexer": "ipython2",
144 | "version": "2.7.6"
145 | },
146 | "pycharm": {
147 | "stem_cell": {
148 | "cell_type": "raw",
149 | "source": [],
150 | "metadata": {
151 | "collapsed": false
152 | }
153 | }
154 | }
155 | },
156 | "nbformat": 4,
157 | "nbformat_minor": 0
158 | }
--------------------------------------------------------------------------------
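As a quick consistency check on the result printed above: in this deterministic A→B→C→A cycle only the transition into A pays reward 1, so the average reward is 1/3 and the learned differential values should be evenly spaced by 1/3 (their common offset is arbitrary). A small sketch, using the three numbers taken from the output above:

```python
import numpy as np

# differential values printed by the cell above, in the order [v(A), v(B), v(C)]
v = np.array([0.17260156, 0.50593489, 0.83926822])

# rewards are 0 into B and C and 1 into A, so the differential Bellman equations give
#   v(B) - v(A) = v(C) - v(B) = 1/3
print(np.diff(v))                                     # [0.33333333 0.33333333]
print(np.allclose(np.diff(v), 1.0 / 3.0, atol=1e-6))  # True
```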
/MASM/prototype.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true,
8 | "pycharm": {
9 | "is_executing": false
10 | }
11 | },
12 | "outputs": [],
13 | "source": [
14 | "import torch\n",
15 | "import numpy as np\n",
16 | "import matplotlib.pyplot as plt\n",
17 | "from torch.nn import Linear\n",
18 | "import torch.nn.functional as F\n",
19 | "from collections import deque\n",
20 | "import random\n",
21 | "import gym\n",
22 | "import matplotlib.pyplot as plt\n",
23 | "from torch.nn import Linear, ReLU\n",
24 | "from torch.autograd import Variable"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": null,
30 | "outputs": [],
31 | "source": [
32 | "class Seller_Env:\n",
33 | " def __init__(self, size_of_list = 10, initial_price = 1000):\n",
34 | " self.mu = np.random.randint(initial_price*0.7, initial_price*1.1)\n",
35 | " self.days = 0\n",
36 | " self.size_of_list = size_of_list\n",
37 | " self.item_list = [int(np.random.normal(self.mu, scale = self.mu/10)) for _ in range(self.size_of_list)]\n",
38 | "        self.baseline = np.mean(self.item_list)  # item_list is a plain list, so use np.mean\n",
39 | " \n",
40 | " def step(self, current_price):\n",
41 | " self.days += 1\n",
42 | " offer = [int(np.random.normal(self.mu*0.7, scale = self.mu/10)) \n",
43 | " for _ in range(np.random.randint(0, self.size_of_list//2))]\n",
44 | " for _ in range(self.size_of_list//2 - len(offer)):\n",
45 | " offer.append(0)\n",
46 | " if max(offer) > current_price:\n",
47 | " return [],True\n",
48 | " else:\n",
49 | " return offer,False\n",
50 | " \n",
51 | " def reset(self):\n",
52 | " self.days = 0\n",
53 | "        return self.item_list\n",
54 | " \n",
55 | "class Seller:\n",
56 | " def __init__(self, min_price):\n",
57 | " self.min_price = min_price\n",
58 | " def model(self):\n",
59 | "        pass  # model body not yet implemented in this prototype"
60 | ],
61 | "metadata": {
62 | "collapsed": false,
63 | "pycharm": {
64 | "name": "#%%\n"
65 | }
66 | }
67 | }
68 | ],
69 | "metadata": {
70 | "kernelspec": {
71 | "display_name": "Python 3",
72 | "language": "python",
73 | "name": "python3"
74 | },
75 | "language_info": {
76 | "codemirror_mode": {
77 | "name": "ipython",
78 | "version": 2
79 | },
80 | "file_extension": ".py",
81 | "mimetype": "text/x-python",
82 | "name": "python",
83 | "nbconvert_exporter": "python",
84 | "pygments_lexer": "ipython2",
85 | "version": "2.7.6"
86 | },
87 | "pycharm": {
88 | "stem_cell": {
89 | "cell_type": "raw",
90 | "source": [],
91 | "metadata": {
92 | "collapsed": false
93 | }
94 | }
95 | }
96 | },
97 | "nbformat": 4,
98 | "nbformat_minor": 0
99 | }
--------------------------------------------------------------------------------
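The `Seller_Env` prototype above is unfinished (the `Seller.model` body is still a stub), but a hypothetical driver for it might look like the sketch below. The price-decay policy, the 30-day horizon, and every name outside the class are illustrative assumptions, not part of the notebook.

```python
import numpy as np

# hypothetical driver for the Seller_Env class defined in the prototype above
env = Seller_Env(size_of_list=10, initial_price=1000)
items = env.reset()
asking_price = int(np.mean(items))           # naive opening ask: the mean listed price

for _ in range(30):                          # illustrative 30-day horizon
    offers, sold = env.step(asking_price)
    if sold:
        print(f'sold on day {env.days} at asking price {asking_price}')
        break
    asking_price = int(asking_price * 0.98)  # drop the ask a little after each unsold day
else:
    print('not sold within 30 days')
```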
/Off-Policy Policy Gradient/Experiment Log of failure of Off_policy_Actor_Critic:
--------------------------------------------------------------------------------
1 | ## Design
2 | The experiment was not successful, but I will record the details of my thinking here.
3 |
4 | Inspired by the off-policy policy gradient method, I intend to try training
5 | an actor-critic with an off-policy method.
6 |
7 | I understand that combining bootstrapping, function approximation, and off-policy learning is called the deadly triad in
8 | Sutton's RL book. I still want to give it a try.
9 |
10 | In this experiment, there is only one model, the Actor.
11 | It has two outputs: action probabilities and a V estimate.
12 |
13 | The behavior policy is an epsilon-Actor, the same formulation as in epsilon-greedy methods.
14 | While behaving, with probability 90% the policy chooses the action according to its probability output, as
15 | in the normal actor-critic method. With probability 10%, however, it chooses uniformly at random from the action
16 | space, i.e., {0, 1}.
17 |
18 | A buffer inside the class records the observed states,
19 | the reward (it is always 1 in this case, so I treat it as a constant), and the probability of the action taken.
20 |
21 | The target policy is the same actor without epsilon. When the buffer reaches the end of an episode, it
22 | triggers learning: each p is recomputed from the stored observations, the stored p and some simple algebra
23 | give the importance ratio rho (see the sketch below), and the model is updated. The buffer only keeps the latest 100 records.
24 |
25 | After training, the model is evaluated automatically.
26 |
27 | ## Result:
28 |
29 | Learning never takes place, although the loss still converges. It does not diverge the way REINFORCE often does;
30 | instead it settles naturally at a policy that is never the best one.
31 |
32 | ## Improvement:
33 |
34 | Delete the V part of the actor model and create an off-policy REINFORCE instead.
--------------------------------------------------------------------------------
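A minimal sketch of the importance weight described in the design above, assuming the 90%/10% epsilon-actor behavior policy and the two CartPole actions mentioned in the log; the function names and the example numbers are illustrative, not taken from the notebook.

```python
EPSILON = 0.1   # probability of acting uniformly at random while behaving
N_ACTIONS = 2   # CartPole action space {0, 1}

def behavior_prob(stored_pi_a):
    """Probability the epsilon-actor behavior policy gave to the action actually taken,
    reconstructed from the actor probability stored in the buffer."""
    return (1.0 - EPSILON) * stored_pi_a + EPSILON / N_ACTIONS

def importance_weight(current_pi_a, stored_pi_a):
    """rho = pi(a|s) under the current target policy / b(a|s) under the behavior policy."""
    return current_pi_a / behavior_prob(stored_pi_a)

# example: the target policy now puts 0.8 on an action the actor originally assigned 0.6
print(importance_weight(0.8, 0.6))  # 0.8 / (0.9*0.6 + 0.05) ≈ 1.356
```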
/RAINBOW/PyTorch_RAINBOW.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LyWangPX/Reinforcement_Learning_Coding_Examples/2f40f67f5709c9dc4ea3d9dd15b441b627b595a6/RAINBOW/PyTorch_RAINBOW.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Welcome
2 |
3 | This is my personal practice of implementing various RL algorithms from scratch.
4 |
5 | Most of them will be in Jupyter notebooks, and those involving multiprocessing
6 | will be plain Python files.
7 |
8 | The framework will always be PyTorch, as a personal practice too.
9 |
10 | Normally I use CartPole for the easier algorithms in this project and skip the
11 | visual-input part (which is quite trivial if you add a few conv layers).
12 |
13 | And for
14 | harder, vision-related algorithms I will pick various Atari games as my environment.
15 |
16 | Due to time constraints, I will not provide a systematic analysis of any particular algorithm.
17 | Also be aware that these are for personal use, so bugs do appear frequently.
18 |
19 | If the project is mature, I will accept open issues.
20 | For now, however, let me dive in. (I guess no one even reads this repo, though.)
21 |
22 | The project file structure will change continuously to match my needs.
23 |
24 | # PLAN:
25 | ## Model-Free RL
26 | ### Policy Gradient
27 | - [x] REINFORCE
28 | - [x] Off-Policy REINFORCE
29 | - [x] Basic Actor Critic
30 | - [x] Advantage Actor Critic using Huber loss and Entropy
31 | - [x] A3C
32 | - [x] A2C
33 | - [x] DDPG
34 | - [ ] D4PG
35 | - [ ] MADDPG
36 | - [ ] TRPO
37 | - [ ] PPO
38 | - [ ] ACER
39 | - [ ] ACKTR
40 | - [ ] SAC
41 | - [ ] SAC with AAT (Automatically Adjusted Temperature)
42 | - [ ] TD3
43 | - [ ] SVPG
44 | - [ ] IMPALA
45 | ### Deep Q Learning
46 | - [X] Dueling DDQN
47 | - [x] Dueling DDQN + PER
48 | - [ ] Rainbow DQN
49 | - [ ] Ape-X
50 | ### Distributional RL
51 | - [ ] C51
52 | - [ ] QR-DQN
53 | - [ ] IQN
54 | - [ ] Dopamine (DQN + C51 + IQN + Rainbow)
55 | ### Policy Gradient with Action-Dependent Baselines:
56 | - [ ] Q-prop
57 | - [ ] Stein Control Variates
58 | ### Path-Consistency Learning
59 | - [ ] PCL
60 | - [ ] Trust-PCL
61 | ### Q-learning + Policy Gradient:
62 | - [ ] PGQL
63 | - [ ] Reactor
64 | - [ ] IPG
65 | ### Evolutionary Algorithm
66 | ### Monte Carlo Tree (Alpha Zero)
67 | ## Exploration RL
68 | ### Intrinsic Motivation
69 | - [ ] VIME
70 | - [ ] CTS-based Pseudocounts
71 | - [ ] PixelCNN-based Pseudocounts
72 | - [ ] Hash-based Counts
73 | - [ ] EX2
74 | - [ ] ICM
75 | - [ ] RND
76 | ### Unsupervised RL
77 | - [ ] VIC
78 | - [ ] DIAYN
79 | - [ ] VALOR
80 | ## Hierarchical RL
81 | ## Memory RL
82 | ## Model-Based RL
83 | ## Meta-RL
84 | ## Scaling-RL
--------------------------------------------------------------------------------