├── .idea
│   ├── .gitignore
│   ├── RL.iml
│   ├── codeStyles
│   │   └── codeStyleConfig.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── other.xml
│   └── vcs.xml
├── A2C
│   ├── A2C Episodic Sync
│   │   ├── SharedAdam.py
│   │   ├── evaluate.py
│   │   ├── pyTorch_CartPole_A2C_ES.py
│   │   └── workers.py
│   ├── A2C Time Interval Sync
│   │   ├── SharedAdam.py
│   │   ├── evaluate.py
│   │   ├── pyTorch_CartPole_A2C_IS.py
│   │   └── workers.py
│   └── README About Vairations.md
├── A3C
│   ├── A3C Episodic Async
│   │   ├── SharedAdam.py
│   │   ├── evaluate.py
│   │   ├── pyTorch_CartPole_A3C_EA.py
│   │   └── workers_PlayGround.py
│   ├── A3C Time Interval Async
│   │   ├── SharedAdam.py
│   │   ├── evaluate.py
│   │   ├── pyTorch_CartPole_A3C_IA.py
│   │   └── workers.py
│   └── README About Vairations.md
├── Ape-X
│   └── PyTorch_Ape-X.py
├── D4PG
│   └── PyTorch_D4PG.py
├── DDPG
│   └── PyTorch_DDPG.py
├── Deuling Double DQN with PER
│   └── PyTorch_Deuling_DDQN_with_PER.py
├── Deuling Double DQN
│   └── PyTorch_Deuling_DDQN.py
├── Experiments
│   ├── Online TD and true Online TD.ipynb
│   └── Seijen2014_True_Online_TD.ipynb
├── MASM
│   ├── Differential_semi_gradient_Sarsa.ipynb
│   └── prototype.ipynb
├── Off-Policy Policy Gradient
│   ├── Experiment Log of failure of Off_policy_Actor_Critic
│   ├── pyTorch_CartPole_Off_Policy_Actor_Critic[not work].ipynb
│   └── pyTorch_CartPole_Off_Policy_REINFORCE.ipynb
├── Plain-Actor-Critic
│   ├── pyTorch_CartPole_Advantage_Actor_Critic_Entropy_Regularized.ipynb
│   ├── pyTorch_CartPole_Advantage_Episode_Wise_Actor_Critic_Huber.ipynb
│   └── pyTorch_CartPole_Step_Wise_Bootstrap_Actor_Critic.ipynb
├── RAINBOW
│   └── PyTorch_RAINBOW.py
├── README.md
└── REINFORCE
    └── pyTorch_CartPole_REINFORCE.ipynb
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /workspace.xml
--------------------------------------------------------------------------------
/A2C/A2C Episodic Sync/SharedAdam.py:
--------------------------------------------------------------------------------
1 | # Copied from https://github.com/ikostrikov/pytorch-a3c/blob/master/my_optim.py
2 | # A very nice
optimization of Adam method to make it receive shared states. 3 | import math 4 | import torch 5 | import torch.optim as optim 6 | 7 | 8 | class SharedAdam(optim.Adam): 9 | """Implements Adam algorithm with shared states. 10 | """ 11 | 12 | def __init__(self, 13 | params, 14 | lr=1e-3, 15 | betas=(0.9, 0.999), 16 | eps=1e-8, 17 | weight_decay=0): 18 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay) 19 | 20 | for group in self.param_groups: 21 | for p in group['params']: 22 | state = self.state[p] 23 | state['step'] = torch.zeros(1) 24 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_() 25 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_() 26 | 27 | def share_memory(self): 28 | for group in self.param_groups: 29 | for p in group['params']: 30 | state = self.state[p] 31 | state['step'].share_memory_() 32 | state['exp_avg'].share_memory_() 33 | state['exp_avg_sq'].share_memory_() 34 | 35 | def step(self, closure=None): 36 | """Performs a single optimization step. 37 | Arguments: 38 | closure (callable, optional): A closure that reevaluates the model 39 | and returns the loss. 40 | """ 41 | loss = None 42 | if closure is not None: 43 | loss = closure() 44 | 45 | for group in self.param_groups: 46 | for p in group['params']: 47 | if p.grad is None: 48 | continue 49 | grad = p.grad.data 50 | state = self.state[p] 51 | 52 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 53 | beta1, beta2 = group['betas'] 54 | 55 | state['step'] += 1 56 | 57 | if group['weight_decay'] != 0: 58 | grad = grad.add(group['weight_decay'], p.data) 59 | 60 | # Decay the first and second moment running average coefficient 61 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 62 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 63 | 64 | denom = exp_avg_sq.sqrt().add_(group['eps']) 65 | 66 | bias_correction1 = 1 - beta1 ** state['step'].item() 67 | bias_correction2 = 1 - beta2 ** state['step'].item() 68 | step_size = group['lr'] * math.sqrt( 69 | bias_correction2) / bias_correction1 70 | 71 | p.data.addcdiv_(-step_size, exp_avg, denom) 72 | 73 | return loss -------------------------------------------------------------------------------- /A2C/A2C Episodic Sync/evaluate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import gym 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.nn import Linear, ReLU 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | def evaluate(shared_model, q): 11 | class Actor(torch.nn.Module): 12 | def __init__(self): 13 | super(Actor, self).__init__() 14 | self.fc1 = Linear(4, 128) 15 | self.fc2 = Linear(128, 128) 16 | self.fc3 = Linear(128, 2) 17 | self.fc4 = Linear(128, 1) 18 | self.steps = [] 19 | 20 | def forward(self, x): 21 | x = F.relu(self.fc1(x)) 22 | x = F.relu(self.fc2(x)) 23 | action = F.log_softmax(self.fc3(x), dim=-1) 24 | V = F.relu(self.fc4(x)) 25 | return action, V 26 | 27 | device = 'cpu' 28 | # I do not recommend using GPU for this method. CPU is much faster. 
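# (Likely because the network here is a tiny 4-input MLP, so per-step host-to-GPU transfers would cost more than the forward pass itself.)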
29 | # Change this to cuda only if you have a poor CPU or on a cloud 30 | env = gym.make('CartPole-v0') 31 | obs = env.reset() 32 | actor = Actor() 33 | actor.to(device) 34 | for episode in range(1): 35 | action_log_history = [] 36 | for step in range(200): 37 | actor.load_state_dict(shared_model.state_dict()) 38 | # -----lines below are line-corresponding to the original algorithm---- 39 | obs = np.reshape(obs, [1, -1]) 40 | input_actor = Variable(torch.from_numpy(obs).float()).to(device) 41 | action_log_probability, V = actor(input_actor) 42 | p = np.exp(action_log_probability[0].detach().cpu()) 43 | action = np.random.choice(2, p=p.numpy()) 44 | action_log_history.append(action_log_probability[0][action]) 45 | obs, reward, done, info = env.step(action) 46 | if done: 47 | q.put(step) 48 | return -------------------------------------------------------------------------------- /A2C/A2C Episodic Sync/pyTorch_CartPole_A2C_ES.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.nn import Linear, ReLU 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import torch.multiprocessing as mp 9 | from workers import worker 10 | from evaluate import evaluate 11 | from SharedAdam import SharedAdam 12 | from time import perf_counter 13 | 14 | class Actor(torch.nn.Module): 15 | def __init__(self): 16 | super(Actor,self).__init__() 17 | self.fc1 = Linear(4, 128) 18 | self.fc2 = Linear(128, 128) 19 | self.fc3 = Linear(128, 2) 20 | self.fc4 = Linear(128, 1) 21 | self.steps = [] 22 | 23 | def forward(self, x): 24 | x = F.relu(self.fc1(x)) 25 | x = F.relu(self.fc2(x)) 26 | action = F.log_softmax(self.fc3(x), dim=-1) 27 | V = F.relu(self.fc4(x)) 28 | return action, V 29 | 30 | def draw(self, eval = False): 31 | plt.style.use('dark_background') 32 | plt.figure(figsize=(10, 10)) 33 | if eval: 34 | plt.title('Evaluation of trained A3C with Shared Adam Optimizer on CartPole_V0', fontsize='xx-large') 35 | plt.xlabel('Rewards', fontsize='xx-large') 36 | plt.ylabel('Frequency', fontsize='xx-large') 37 | plt.hist(self.steps, range=(0, 200)) 38 | plt.show() 39 | else: 40 | mid = [] 41 | interval = 3 42 | for i in range(len(self.steps) - interval): 43 | mid.append(np.mean(self.steps[i:i + interval + 1])) 44 | plt.title('Performance of A3C with Shared Adam Optimizer on CartPole_V0', fontsize='xx-large') 45 | plt.xlabel('Episodes', fontsize='xx-large') 46 | plt.ylabel('Rewards', fontsize='xx-large') 47 | x_fit = list(range(len(self.steps) - interval)) 48 | plt.plot(x_fit, self.steps[interval:], '-', c='gray', label='Episode-Wise data') 49 | plt.plot(mid, '-', c='green', linewidth=5, label='Moving Average') 50 | plt.legend(loc="best", prop={'size': 12}) 51 | plt.show() 52 | 53 | 54 | if __name__ == '__main__': 55 | device = 'cpu' 56 | mp.set_start_method('spawn') 57 | # Do not change this unless you have multiple GPU. 
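# Note: the 'spawn' start method above is required; the repository README reports that 'fork' does not work for this multiprocessing setup.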
58 | # update test 59 | q = mp.Queue() 60 | num_workers = 7 61 | processes = [] 62 | shared_model = Actor() 63 | shared_model.to(device) 64 | shared_model.share_memory() 65 | optimizer = SharedAdam(shared_model.parameters(), lr=0.003) 66 | for episode in range(10000): 67 | t1_start = perf_counter() 68 | p = mp.Process(target=evaluate, args=(shared_model, q)) 69 | processes.append(p) 70 | p.start() 71 | for worker_id in range(num_workers): 72 | p = mp.Process(target = worker, args = (shared_model, optimizer)) 73 | processes.append(p) 74 | p.start() 75 | for p in processes: 76 | p.join() 77 | shared_model.steps.append(q.get()) 78 | print(f'Training on episode {episode} Step {shared_model.steps[-1]}') 79 | if np.mean(shared_model.steps[-25:]) == 199: 80 | break 81 | t1_stop = perf_counter() 82 | print("Elapsed time during the whole program in seconds:", 83 | t1_stop - t1_start) 84 | shared_model.draw() 85 | shared_model.step = [] 86 | for episode in range(15): 87 | for worker_id in range(6): 88 | p = mp.Process(target=evaluate, args=(shared_model, q)) 89 | for p in processes: 90 | p.join() 91 | while not q.empty(): 92 | shared_model.steps.append(q.get()) 93 | shared_model.steps.sort() 94 | shared_model.draw(eval = True) 95 | 96 | 97 | -------------------------------------------------------------------------------- /A2C/A2C Episodic Sync/workers.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.nn import Linear, ReLU 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | def worker(shared_model, optimizer): 11 | class Actor(torch.nn.Module): 12 | def __init__(self): 13 | super(Actor, self).__init__() 14 | self.fc1 = Linear(4, 128) 15 | self.fc2 = Linear(128, 128) 16 | self.fc3 = Linear(128, 2) 17 | self.fc4 = Linear(128, 1) 18 | self.steps = [] 19 | 20 | def forward(self, x): 21 | x = F.relu(self.fc1(x)) 22 | x = F.relu(self.fc2(x)) 23 | action = F.log_softmax(self.fc3(x), dim=-1) 24 | V = F.relu(self.fc4(x)) 25 | return action, V 26 | 27 | device = 'cpu' 28 | # I do not recommend using GPU for this method. CPU is much faster. 
29 | # Change this to cuda only if you have a poor CPU or on a cloud 30 | env = gym.make('CartPole-v0') 31 | obs = env.reset() 32 | actor = Actor() 33 | actor.to(device) 34 | gamma = 0.99 35 | steps = [] 36 | eps = np.finfo(np.float32).eps.item() 37 | for episode in range(1): 38 | action_log_history = [] 39 | V_history = [] 40 | for step in range(200): 41 | actor.load_state_dict(shared_model.state_dict()) 42 | # -----lines below are line-corresponding to the original algorithm---- 43 | obs = np.reshape(obs, [1, -1]) 44 | input_actor = Variable(torch.from_numpy(obs).float()).to(device) 45 | action_log_probability, V = actor(input_actor) 46 | p = np.exp(action_log_probability[0].detach().cpu()) 47 | action = np.random.choice(2, p=p.numpy()) 48 | action_log_history.append(action_log_probability[0][action]) 49 | V_history.append(V) 50 | obs, reward, done, info = env.step(action) 51 | if done: 52 | if step == 199: 53 | break 54 | actor.zero_grad() 55 | steps.append(step) 56 | print(f'episode {episode}, step {step}', end='\r') 57 | obs = env.reset() 58 | reward_list = np.ones((step + 1,)) 59 | for i in range(len(reward_list) - 2, -1, -1): 60 | reward_list[i] += reward_list[i + 1] * gamma 61 | reward_list -= np.mean(reward_list) 62 | reward_list /= (np.std(reward_list) + eps) 63 | Critic_Loss = [] 64 | Delta = [] 65 | for monte_carlo_return, V in zip(reward_list, V_history): 66 | Critic_Loss.append(F.smooth_l1_loss(V, torch.tensor([[monte_carlo_return]]).to(device))) 67 | Delta.append(monte_carlo_return - V.detach()) 68 | Actor_Loss = [] 69 | entropy = 0 70 | for log_p in action_log_history: 71 | entropy -= log_p * torch.exp(log_p) 72 | for delta, log_prob in zip(Delta, action_log_history): 73 | Actor_Loss.append(-log_prob * delta.detach()) 74 | loss = torch.stack(Critic_Loss).sum() + torch.stack(Actor_Loss).sum() + entropy*0.01 75 | loss.backward() 76 | ensure_shared_grads(actor, shared_model) 77 | optimizer.step() 78 | break 79 | 80 | 81 | def ensure_shared_grads(model, shared_model): 82 | for param, shared_param in zip(model.parameters(), 83 | shared_model.parameters()): 84 | if shared_param.grad is not None: 85 | return 86 | shared_param._grad = param.grad 87 | -------------------------------------------------------------------------------- /A2C/A2C Time Interval Sync/SharedAdam.py: -------------------------------------------------------------------------------- 1 | # Copied from https://github.com/ikostrikov/pytorch-a3c/blob/master/my_optim.py 2 | # A very nice optimization of Adam method to make it receive shared states. 3 | import math 4 | import torch 5 | import torch.optim as optim 6 | 7 | 8 | class SharedAdam(optim.Adam): 9 | """Implements Adam algorithm with shared states. 10 | """ 11 | 12 | def __init__(self, 13 | params, 14 | lr=1e-3, 15 | betas=(0.9, 0.999), 16 | eps=1e-8, 17 | weight_decay=0): 18 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay) 19 | 20 | for group in self.param_groups: 21 | for p in group['params']: 22 | state = self.state[p] 23 | state['step'] = torch.zeros(1) 24 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_() 25 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_() 26 | 27 | def share_memory(self): 28 | for group in self.param_groups: 29 | for p in group['params']: 30 | state = self.state[p] 31 | state['step'].share_memory_() 32 | state['exp_avg'].share_memory_() 33 | state['exp_avg_sq'].share_memory_() 34 | 35 | def step(self, closure=None): 36 | """Performs a single optimization step. 
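All optimizer state (the step counter and both moment estimates) lives in shared memory once share_memory() is called, so every worker process reads and updates the same state.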
37 | Arguments: 38 | closure (callable, optional): A closure that reevaluates the model 39 | and returns the loss. 40 | """ 41 | loss = None 42 | if closure is not None: 43 | loss = closure() 44 | 45 | for group in self.param_groups: 46 | for p in group['params']: 47 | if p.grad is None: 48 | continue 49 | grad = p.grad.data 50 | state = self.state[p] 51 | 52 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 53 | beta1, beta2 = group['betas'] 54 | 55 | state['step'] += 1 56 | 57 | if group['weight_decay'] != 0: 58 | grad = grad.add(group['weight_decay'], p.data) 59 | 60 | # Decay the first and second moment running average coefficient 61 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 62 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 63 | 64 | denom = exp_avg_sq.sqrt().add_(group['eps']) 65 | 66 | bias_correction1 = 1 - beta1 ** state['step'].item() 67 | bias_correction2 = 1 - beta2 ** state['step'].item() 68 | step_size = group['lr'] * math.sqrt( 69 | bias_correction2) / bias_correction1 70 | 71 | p.data.addcdiv_(-step_size, exp_avg, denom) 72 | 73 | return loss 74 | -------------------------------------------------------------------------------- /A2C/A2C Time Interval Sync/evaluate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import gym 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.nn import Linear, ReLU 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | def evaluate(shared_model, q): 11 | class Actor(torch.nn.Module): 12 | def __init__(self): 13 | super(Actor, self).__init__() 14 | self.fc1 = Linear(4, 128) 15 | self.fc2 = Linear(128, 128) 16 | self.fc3 = Linear(128, 2) 17 | self.fc4 = Linear(128, 1) 18 | self.steps = [] 19 | 20 | def forward(self, x): 21 | x = F.relu(self.fc1(x)) 22 | x = F.relu(self.fc2(x)) 23 | action = F.log_softmax(self.fc3(x), dim=-1) 24 | V = F.relu(self.fc4(x)) 25 | return action, V 26 | 27 | device = 'cpu' 28 | # I do not recommend using GPU for this method. CPU is much faster. 
29 | # Change this to cuda only if you have a poor CPU or on a cloud 30 | env = gym.make('CartPole-v0') 31 | obs = env.reset() 32 | actor = Actor() 33 | actor.to(device) 34 | for episode in range(1): 35 | action_log_history = [] 36 | for step in range(200): 37 | actor.load_state_dict(shared_model.state_dict()) 38 | # -----lines below are line-corresponding to the original algorithm---- 39 | obs = np.reshape(obs, [1, -1]) 40 | input_actor = Variable(torch.from_numpy(obs).float()).to(device) 41 | action_log_probability, V = actor(input_actor) 42 | p = np.exp(action_log_probability[0].detach().cpu()) 43 | action = np.random.choice(2, p=p.numpy()) 44 | action_log_history.append(action_log_probability[0][action]) 45 | obs, reward, done, info = env.step(action) 46 | if done: 47 | q.put(step) 48 | return 49 | -------------------------------------------------------------------------------- /A2C/A2C Time Interval Sync/pyTorch_CartPole_A2C_IS.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.nn import Linear, ReLU 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import torch.multiprocessing as mp 9 | from workers import worker 10 | from evaluate import evaluate 11 | from SharedAdam import SharedAdam 12 | from time import perf_counter 13 | 14 | 15 | class Actor(torch.nn.Module): 16 | def __init__(self): 17 | super(Actor, self).__init__() 18 | self.fc1 = Linear(4, 128) 19 | self.fc2 = Linear(128, 128) 20 | self.fc3 = Linear(128, 2) 21 | self.fc4 = Linear(128, 1) 22 | self.steps = [] 23 | 24 | def forward(self, x): 25 | x = F.relu(self.fc1(x)) 26 | x = F.relu(self.fc2(x)) 27 | action = F.log_softmax(self.fc3(x), dim=-1) 28 | V = F.relu(self.fc4(x)) 29 | return action, V 30 | 31 | def draw(self, eval=False): 32 | plt.style.use('dark_background') 33 | plt.figure(figsize=(10, 10)) 34 | if eval: 35 | plt.title('Evaluation of trained A3C with Shared Adam Optimizer on CartPole_V0', fontsize='xx-large') 36 | plt.xlabel('Rewards', fontsize='xx-large') 37 | plt.ylabel('Frequency', fontsize='xx-large') 38 | plt.hist(self.steps, range=(0, 200)) 39 | plt.show() 40 | else: 41 | mid = [] 42 | interval = 3 43 | for i in range(len(self.steps) - interval): 44 | mid.append(np.mean(self.steps[i:i + interval + 1])) 45 | plt.title('Performance of A3C with Shared Adam Optimizer on CartPole_V0', fontsize='xx-large') 46 | plt.xlabel('Episodes', fontsize='xx-large') 47 | plt.ylabel('Rewards', fontsize='xx-large') 48 | x_fit = list(range(len(self.steps) - interval)) 49 | plt.plot(x_fit, self.steps[interval:], '-', c='gray', label='Episode-Wise data') 50 | plt.plot(mid, '-', c='green', linewidth=5, label='Moving Average') 51 | plt.legend(loc="best", prop={'size': 12}) 52 | plt.show() 53 | 54 | 55 | if __name__ == '__main__': 56 | device = 'cpu' 57 | mp.set_start_method('spawn') 58 | # Do not change this unless you have multiple GPU. 
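# T (set below) is the time-interval horizon: each worker pushes its gradients after at most T environment steps, or earlier at episode end.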
59 | # update test 60 | q = mp.Queue() 61 | num_workers = 7 62 | T = 150 63 | processes = [] 64 | shared_model = Actor() 65 | shared_model.to(device) 66 | shared_model.share_memory() 67 | optimizer = SharedAdam(shared_model.parameters(), lr=0.003) 68 | for episode in range(10000): 69 | t1_start = perf_counter() 70 | p = mp.Process(target=evaluate, args=(shared_model, q)) 71 | processes.append(p) 72 | p.start() 73 | for worker_id in range(num_workers): 74 | p = mp.Process(target=worker, args=(shared_model, optimizer, T)) 75 | processes.append(p) 76 | p.start() 77 | for p in processes: 78 | p.join() 79 | shared_model.steps.append(q.get()) 80 | print(f'Training on episode {episode} Step {shared_model.steps[-1]}') 81 | if np.mean(shared_model.steps[-25:]) == 199: 82 | break 83 | t1_stop = perf_counter() 84 | print("Elapsed time during the whole program in seconds:", 85 | t1_stop - t1_start) 86 | shared_model.draw() 87 | shared_model.step = [] 88 | for episode in range(15): 89 | for worker_id in range(6): 90 | p = mp.Process(target=evaluate, args=(shared_model, q)) 91 | for p in processes: 92 | p.join() 93 | while not q.empty(): 94 | shared_model.steps.append(q.get()) 95 | shared_model.steps.sort() 96 | shared_model.draw(eval=True) 97 | -------------------------------------------------------------------------------- /A2C/A2C Time Interval Sync/workers.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.nn import Linear, ReLU 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | def worker(shared_model, optimizer, T): 11 | class Actor(torch.nn.Module): 12 | def __init__(self): 13 | super(Actor, self).__init__() 14 | self.fc1 = Linear(4, 128) 15 | self.fc2 = Linear(128, 128) 16 | self.fc3 = Linear(128, 2) 17 | self.fc4 = Linear(128, 1) 18 | self.steps = [] 19 | 20 | def forward(self, x): 21 | x = F.relu(self.fc1(x)) 22 | x = F.relu(self.fc2(x)) 23 | action = F.log_softmax(self.fc3(x), dim=-1) 24 | V = F.relu(self.fc4(x)) 25 | return action, V 26 | 27 | device = 'cpu' 28 | # I do not recommend using GPU for this method. CPU is much faster. 
29 | # Change this to cuda only if you have a poor CPU or on a cloud 30 | env = gym.make('CartPole-v0') 31 | obs = env.reset() 32 | actor = Actor() 33 | actor.to(device) 34 | gamma = 0.99 35 | steps = [] 36 | eps = np.finfo(np.float32).eps.item() 37 | actor.load_state_dict(shared_model.state_dict()) 38 | t = 0 39 | for episode in range(1): 40 | action_log_history = [] 41 | V_history = [] 42 | for step in range(200): 43 | # -----lines below are line-corresponding to the original algorithm---- 44 | obs = np.reshape(obs, [1, -1]) 45 | input_actor = Variable(torch.from_numpy(obs).float()).to(device) 46 | action_log_probability, V = actor(input_actor) 47 | p = np.exp(action_log_probability[0].detach().cpu()) 48 | action = np.random.choice(2, p=p.numpy()) 49 | action_log_history.append(action_log_probability[0][action]) 50 | V_history.append(V) 51 | obs, reward, done, info = env.step(action) 52 | t += 1 53 | if done or t >= T: 54 | if step == 199: 55 | break 56 | actor.zero_grad() 57 | steps.append(step) 58 | if done: 59 | print(f'episode {episode}, step {step}', end='\r') 60 | obs = env.reset() 61 | reward_list = np.ones((step + 1,)) 62 | for i in range(len(reward_list) - 2, -1, -1): 63 | reward_list[i] += reward_list[i + 1] * gamma 64 | reward_list -= np.mean(reward_list) 65 | reward_list /= (np.std(reward_list) + eps) 66 | Critic_Loss = [] 67 | Delta = [] 68 | for monte_carlo_return, V in zip(reward_list, V_history): 69 | Critic_Loss.append(F.smooth_l1_loss(V, torch.tensor([[monte_carlo_return]]).to(device))) 70 | Delta.append(monte_carlo_return - V.detach()) 71 | Actor_Loss = [] 72 | entropy = 0 73 | for log_p in action_log_history: 74 | entropy -= log_p * torch.exp(log_p) 75 | for delta, log_prob in zip(Delta, action_log_history): 76 | Actor_Loss.append(-log_prob * delta.detach()) 77 | loss = torch.stack(Critic_Loss).sum() + torch.stack(Actor_Loss).sum() + entropy * 0.01 78 | loss.backward() 79 | ensure_shared_grads(actor, shared_model) 80 | optimizer.step() 81 | break 82 | 83 | 84 | def ensure_shared_grads(model, shared_model): 85 | for param, shared_param in zip(model.parameters(), 86 | shared_model.parameters()): 87 | if shared_param.grad is not None: 88 | return 89 | shared_param._grad = param.grad 90 | -------------------------------------------------------------------------------- /A2C/README About Vairations.md: -------------------------------------------------------------------------------- 1 | ## Review of A2C and A3C 2 | We all know the A3C is using a episode count to control the async 3 | process. In this procedure, child process will return the grad 4 | after given episodes or finished early. They will be sync in the next 5 | run. 6 | 7 | A2C however, waits for each child process to finish its segments. 8 | 9 | ## My Variation 10 | In this project, I change the sync or async method from A2C and A3C 11 | into episode wise. That is, the child process will only return 12 | the grad AFTER it completes the episode. 13 | 14 | In variation of A3C, child process will return the grad as soon 15 | as it finished the current episode, return the latest grad, do the backward 16 | then sync with the latest model parameters to go next. It will be put 17 | into an infinite loop. This method will never call join() method to process, 18 | instead, it will monitor a queue that is filled by each child process's game 19 | record. If the last 100 of them are maximum rewards, it will send terminate 20 | message to all child processes. 
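A minimal sketch of that monitoring loop, simplified from `pyTorch_CartPole_A3C_EA.py` (it assumes `q` is the shared `mp.Queue` the workers fill, `processes` holds the worker handles, and `numpy` is imported as `np`):

```python
steps = []
while True:                       # the parent never join()s the workers
    if not q.empty():
        steps.append(q.get())     # one entry per finished worker episode
    # stop once the recent episodes all record the CartPole-v0 cap (step 199)
    if len(steps) >= 100 and np.mean(steps[-100:]) == 199:
        for p in processes:
            p.terminate()
        break
```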
21 | 22 | In variation of A2C, child process will return the grad as soon 23 | as it finished the current episode but all processes will wait every one 24 | finished and sync with the updated model together. In this case we need 25 | a loop and call joint in each loop. If the queue fulfills the convergence requirement, 26 | loop will be ended. 27 | 28 | Additionally, I have set all mode do not learn at all if it reaches the maximum 29 | reward to facilitate converging. [need TESTED] 30 | 31 | ## More Variations: 32 | There isn't too much difference in each child processes. It could 33 | be a exploratory directions. And, how to measure the differences of each 34 | method need a large amount of time. 35 | 36 | ## Additional Warnings: 37 | when initialize multi-processing, fork does not work. If you are in Linux or Mac, change to spawn. 38 | Reasons unclear to me at this point. 39 | -------------------------------------------------------------------------------- /A3C/A3C Episodic Async/SharedAdam.py: -------------------------------------------------------------------------------- 1 | # Copied from https://github.com/ikostrikov/pytorch-a3c/blob/master/my_optim.py 2 | # A very nice optimization of Adam method to make it receive shared states. 3 | import math 4 | import torch 5 | import torch.optim as optim 6 | 7 | 8 | class SharedAdam(optim.Adam): 9 | """Implements Adam algorithm with shared states. 10 | """ 11 | 12 | def __init__(self, 13 | params, 14 | lr=1e-3, 15 | betas=(0.9, 0.999), 16 | eps=1e-8, 17 | weight_decay=0): 18 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay) 19 | 20 | for group in self.param_groups: 21 | for p in group['params']: 22 | state = self.state[p] 23 | state['step'] = torch.zeros(1) 24 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_() 25 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_() 26 | 27 | def share_memory(self): 28 | for group in self.param_groups: 29 | for p in group['params']: 30 | state = self.state[p] 31 | state['step'].share_memory_() 32 | state['exp_avg'].share_memory_() 33 | state['exp_avg_sq'].share_memory_() 34 | 35 | def step(self, closure=None): 36 | """Performs a single optimization step. 37 | Arguments: 38 | closure (callable, optional): A closure that reevaluates the model 39 | and returns the loss. 
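Note that state['step'] is a shared tensor, so the bias-correction terms are computed from the step count accumulated across all worker processes.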
40 | """ 41 | loss = None 42 | if closure is not None: 43 | loss = closure() 44 | 45 | for group in self.param_groups: 46 | for p in group['params']: 47 | if p.grad is None: 48 | continue 49 | grad = p.grad.data 50 | state = self.state[p] 51 | 52 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 53 | beta1, beta2 = group['betas'] 54 | 55 | state['step'] += 1 56 | 57 | if group['weight_decay'] != 0: 58 | grad = grad.add(group['weight_decay'], p.data) 59 | 60 | # Decay the first and second moment running average coefficient 61 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 62 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 63 | 64 | denom = exp_avg_sq.sqrt().add_(group['eps']) 65 | 66 | bias_correction1 = 1 - beta1 ** state['step'].item() 67 | bias_correction2 = 1 - beta2 ** state['step'].item() 68 | step_size = group['lr'] * math.sqrt( 69 | bias_correction2) / bias_correction1 70 | 71 | p.data.addcdiv_(-step_size, exp_avg, denom) 72 | 73 | return loss 74 | -------------------------------------------------------------------------------- /A3C/A3C Episodic Async/evaluate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import gym 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.nn import Linear, ReLU 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | def evaluate(shared_model, q): 11 | class Actor(torch.nn.Module): 12 | def __init__(self): 13 | super(Actor, self).__init__() 14 | self.fc1 = Linear(4, 128) 15 | self.fc2 = Linear(128, 128) 16 | self.fc3 = Linear(128, 2) 17 | self.fc4 = Linear(128, 1) 18 | self.steps = [] 19 | 20 | def forward(self, x): 21 | x = F.relu(self.fc1(x)) 22 | x = F.relu(self.fc2(x)) 23 | action = F.log_softmax(self.fc3(x), dim=-1) 24 | V = F.relu(self.fc4(x)) 25 | return action, V 26 | 27 | device = 'cpu' 28 | # I do not recommend using GPU for this method. CPU is much faster. 
29 | # Change this to cuda only if you have a poor CPU or on a cloud 30 | env = gym.make('CartPole-v0') 31 | obs = env.reset() 32 | actor = Actor() 33 | actor.to(device) 34 | for episode in range(1): 35 | action_log_history = [] 36 | for step in range(200): 37 | actor.load_state_dict(shared_model.state_dict()) 38 | # -----lines below are line-corresponding to the original algorithm---- 39 | obs = np.reshape(obs, [1, -1]) 40 | input_actor = Variable(torch.from_numpy(obs).float()).to(device) 41 | action_log_probability, V = actor(input_actor) 42 | p = np.exp(action_log_probability[0].detach().cpu()) 43 | action = np.random.choice(2, p=p.numpy()) 44 | action_log_history.append(action_log_probability[0][action]) 45 | obs, reward, done, info = env.step(action) 46 | if done: 47 | q.put(step) 48 | return 49 | -------------------------------------------------------------------------------- /A3C/A3C Episodic Async/pyTorch_CartPole_A3C_EA.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from torch.nn import Linear 5 | import torch.nn.functional as F 6 | import torch.multiprocessing as mp 7 | from workers_PlayGround import worker 8 | from evaluate import evaluate 9 | from SharedAdam import SharedAdam 10 | 11 | 12 | class Actor(torch.nn.Module): 13 | def __init__(self): 14 | super(Actor,self).__init__() 15 | self.fc1 = Linear(4, 128) 16 | self.fc2 = Linear(128, 128) 17 | self.fc3 = Linear(128, 2) 18 | self.fc4 = Linear(128, 1) 19 | self.steps = [] 20 | 21 | def forward(self, x): 22 | x = F.relu(self.fc1(x)) 23 | x = F.relu(self.fc2(x)) 24 | action = F.log_softmax(self.fc3(x), dim=-1) 25 | V = F.relu(self.fc4(x)) 26 | return action, V 27 | 28 | def draw(self, eval = False): 29 | plt.style.use('dark_background') 30 | plt.figure(figsize=(10, 10)) 31 | if eval: 32 | plt.title('Evaluation of trained A3C with Shared Adam Optimizer on CartPole_V0', fontsize='xx-large') 33 | plt.xlabel('Rewards', fontsize='xx-large') 34 | plt.ylabel('Frequency', fontsize='xx-large') 35 | plt.hist(self.steps, range=(0, 200)) 36 | plt.show() 37 | else: 38 | mid = [] 39 | interval = 3 40 | for i in range(len(self.steps) - interval): 41 | mid.append(np.mean(self.steps[i:i + interval + 1])) 42 | plt.title('Performance of True Episode-Wise A3C on CartPole_V0', fontsize='xx-large') 43 | plt.xlabel('Episodes', fontsize='xx-large') 44 | plt.ylabel('Rewards', fontsize='xx-large') 45 | x_fit = list(range(len(self.steps) - interval)) 46 | plt.plot(x_fit, self.steps[interval:], '-', c='gray', label='Episode-Wise data') 47 | plt.plot(mid, '-', c='green', linewidth=5, label='Moving Average') 48 | plt.legend(loc="best", prop={'size': 12}) 49 | plt.show() 50 | 51 | 52 | if __name__ == '__main__': 53 | device = 'cpu' 54 | mp.set_start_method('spawn') 55 | # Do not change this unless you have multiple GPU. 
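# Unlike the A2C scripts, this parent never join()s the workers: it keeps polling the result queue below and terminate()s all processes once recent episodes hit the reward cap.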
56 | # update test 57 | q = mp.Queue() 58 | num_workers = 7 59 | processes = [] 60 | shared_model = Actor() 61 | shared_model.to(device) 62 | shared_model.share_memory() 63 | optimizer = SharedAdam(shared_model.parameters(), lr=0.001) 64 | p = mp.Process(target=evaluate, args=(shared_model, q)) 65 | processes.append(p) 66 | p.start() 67 | for worker_id in range(num_workers): 68 | p = mp.Process(target = worker, args = (shared_model, optimizer, q)) 69 | processes.append(p) 70 | p.start() 71 | # for p in processes: 72 | # p.join() 73 | episode = 0 74 | 75 | while True: 76 | if not q.empty(): 77 | shared_model.steps.append(q.get()) 78 | print(f'Training on episode {episode} Step {shared_model.steps[-1]}') 79 | episode += 1 80 | if len(shared_model.steps) > 25: 81 | if np.mean(shared_model.steps[-100:]) == 199: 82 | for p in processes: 83 | p.terminate() 84 | while not q.empty(): 85 | shared_model.steps.append(q.get()) 86 | print(f'Training on episode {episode} Step {shared_model.steps[-1]}') 87 | episode += 1 88 | break 89 | shared_model.draw() 90 | # ----evaluation---- 91 | shared_model.step = [] 92 | for episode in range(15): 93 | for worker_id in range(6): 94 | p = mp.Process(target=evaluate, args=(shared_model, q)) 95 | for p in processes: 96 | p.join() 97 | while not q.empty(): 98 | shared_model.steps.append(q.get()) 99 | shared_model.steps.sort() 100 | shared_model.draw(eval = True) 101 | 102 | 103 | -------------------------------------------------------------------------------- /A3C/A3C Episodic Async/workers_PlayGround.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.nn import Linear, ReLU 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | def worker(shared_model, optimizer,q): 11 | class Actor(torch.nn.Module): 12 | def __init__(self): 13 | super(Actor, self).__init__() 14 | self.fc1 = Linear(4, 128) 15 | self.fc2 = Linear(128, 128) 16 | self.fc3 = Linear(128, 2) 17 | self.fc4 = Linear(128, 1) 18 | self.steps = [] 19 | 20 | def forward(self, x): 21 | x = F.relu(self.fc1(x)) 22 | x = F.relu(self.fc2(x)) 23 | action = F.log_softmax(self.fc3(x), dim=-1) 24 | V = F.relu(self.fc4(x)) 25 | return action, V 26 | 27 | device = 'cpu' 28 | # I do not recommend using GPU for this method. CPU is much faster. 
29 | # Change this to cuda only if you have a poor CPU or on a cloud 30 | env = gym.make('CartPole-v0') 31 | obs = env.reset() 32 | actor = Actor() 33 | actor.to(device) 34 | gamma = 0.99 35 | eps = np.finfo(np.float32).eps.item() 36 | while True: 37 | action_log_history = [] 38 | V_history = [] 39 | actor.load_state_dict(shared_model.state_dict()) 40 | for step in range(200): 41 | # -----lines below are line-corresponding to the original algorithm---- 42 | obs = np.reshape(obs, [1, -1]) 43 | input_actor = Variable(torch.from_numpy(obs).float()).to(device) 44 | action_log_probability, V = actor(input_actor) 45 | p = np.exp(action_log_probability[0].detach().cpu()) 46 | action = np.random.choice(2, p=p.numpy()) 47 | action_log_history.append(action_log_probability[0][action]) 48 | V_history.append(V) 49 | obs, reward, done, info = env.step(action) 50 | if done: 51 | q.put(step) 52 | actor.zero_grad() 53 | obs = env.reset() 54 | if step == 199: 55 | break 56 | reward_list = np.ones((step + 1,)) 57 | for i in range(len(reward_list) - 2, -1, -1): 58 | reward_list[i] += reward_list[i + 1] * gamma 59 | reward_list -= np.mean(reward_list) 60 | reward_list /= (np.std(reward_list) + eps) 61 | Critic_Loss = [] 62 | Delta = [] 63 | for monte_carlo_return, V in zip(reward_list, V_history): 64 | Critic_Loss.append(F.smooth_l1_loss(V, torch.tensor([[monte_carlo_return]]).to(device))) 65 | Delta.append(monte_carlo_return - V.detach()) 66 | Actor_Loss = [] 67 | entropy = 0 68 | for log_p in action_log_history: 69 | entropy -= log_p * torch.exp(log_p) 70 | for delta, log_prob in zip(Delta, action_log_history): 71 | Actor_Loss.append(-log_prob * delta.detach()) 72 | loss = torch.stack(Critic_Loss).sum() + torch.stack(Actor_Loss).sum() + entropy*0.01 73 | loss.backward() 74 | ensure_shared_grads(actor, shared_model) 75 | optimizer.step() 76 | break 77 | 78 | 79 | def ensure_shared_grads(model, shared_model): 80 | for param, shared_param in zip(model.parameters(), 81 | shared_model.parameters()): 82 | if shared_param.grad is not None: 83 | return 84 | shared_param._grad = param.grad 85 | -------------------------------------------------------------------------------- /A3C/A3C Time Interval Async/SharedAdam.py: -------------------------------------------------------------------------------- 1 | # Copied from https://github.com/ikostrikov/pytorch-a3c/blob/master/my_optim.py 2 | # A very nice optimization of Adam method to make it receive shared states. 3 | import math 4 | import torch 5 | import torch.optim as optim 6 | 7 | 8 | class SharedAdam(optim.Adam): 9 | """Implements Adam algorithm with shared states. 10 | """ 11 | 12 | def __init__(self, 13 | params, 14 | lr=1e-3, 15 | betas=(0.9, 0.999), 16 | eps=1e-8, 17 | weight_decay=0): 18 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay) 19 | 20 | for group in self.param_groups: 21 | for p in group['params']: 22 | state = self.state[p] 23 | state['step'] = torch.zeros(1) 24 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_() 25 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_() 26 | 27 | def share_memory(self): 28 | for group in self.param_groups: 29 | for p in group['params']: 30 | state = self.state[p] 31 | state['step'].share_memory_() 32 | state['exp_avg'].share_memory_() 33 | state['exp_avg_sq'].share_memory_() 34 | 35 | def step(self, closure=None): 36 | """Performs a single optimization step. 37 | Arguments: 38 | closure (callable, optional): A closure that reevaluates the model 39 | and returns the loss. 
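Note: the in-place add_/addcmul_/addcdiv_ calls below use the older positional (value, tensor) signature, which newer PyTorch releases deprecate in favor of the alpha=/value= keyword form.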
40 | """ 41 | loss = None 42 | if closure is not None: 43 | loss = closure() 44 | 45 | for group in self.param_groups: 46 | for p in group['params']: 47 | if p.grad is None: 48 | continue 49 | grad = p.grad.data 50 | state = self.state[p] 51 | 52 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 53 | beta1, beta2 = group['betas'] 54 | 55 | state['step'] += 1 56 | 57 | if group['weight_decay'] != 0: 58 | grad = grad.add(group['weight_decay'], p.data) 59 | 60 | # Decay the first and second moment running average coefficient 61 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 62 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 63 | 64 | denom = exp_avg_sq.sqrt().add_(group['eps']) 65 | 66 | bias_correction1 = 1 - beta1 ** state['step'].item() 67 | bias_correction2 = 1 - beta2 ** state['step'].item() 68 | step_size = group['lr'] * math.sqrt( 69 | bias_correction2) / bias_correction1 70 | 71 | p.data.addcdiv_(-step_size, exp_avg, denom) 72 | 73 | return loss 74 | -------------------------------------------------------------------------------- /A3C/A3C Time Interval Async/evaluate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import gym 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.nn import Linear, ReLU 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | def evaluate(shared_model, q): 11 | class Actor(torch.nn.Module): 12 | def __init__(self): 13 | super(Actor, self).__init__() 14 | self.fc1 = Linear(4, 128) 15 | self.fc2 = Linear(128, 128) 16 | self.fc3 = Linear(128, 2) 17 | self.fc4 = Linear(128, 1) 18 | self.steps = [] 19 | 20 | def forward(self, x): 21 | x = F.relu(self.fc1(x)) 22 | x = F.relu(self.fc2(x)) 23 | action = F.log_softmax(self.fc3(x), dim=-1) 24 | V = F.relu(self.fc4(x)) 25 | return action, V 26 | 27 | device = 'cpu' 28 | # I do not recommend using GPU for this method. CPU is much faster. 
29 | # Change this to cuda only if you have a poor CPU or on a cloud 30 | env = gym.make('CartPole-v0') 31 | obs = env.reset() 32 | actor = Actor() 33 | actor.to(device) 34 | for episode in range(1): 35 | action_log_history = [] 36 | for step in range(200): 37 | actor.load_state_dict(shared_model.state_dict()) 38 | # -----lines below are line-corresponding to the original algorithm---- 39 | obs = np.reshape(obs, [1, -1]) 40 | input_actor = Variable(torch.from_numpy(obs).float()).to(device) 41 | action_log_probability, V = actor(input_actor) 42 | p = np.exp(action_log_probability[0].detach().cpu()) 43 | action = np.random.choice(2, p=p.numpy()) 44 | action_log_history.append(action_log_probability[0][action]) 45 | obs, reward, done, info = env.step(action) 46 | if done: 47 | q.put(step) 48 | return 49 | -------------------------------------------------------------------------------- /A3C/A3C Time Interval Async/pyTorch_CartPole_A3C_IA.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from torch.nn import Linear 5 | import torch.nn.functional as F 6 | import torch.multiprocessing as mp 7 | from workers import worker 8 | from evaluate import evaluate 9 | from SharedAdam import SharedAdam 10 | 11 | 12 | class Actor(torch.nn.Module): 13 | def __init__(self): 14 | super(Actor, self).__init__() 15 | self.fc1 = Linear(4, 128) 16 | self.fc2 = Linear(128, 128) 17 | self.fc3 = Linear(128, 2) 18 | self.fc4 = Linear(128, 1) 19 | self.steps = [] 20 | 21 | def forward(self, x): 22 | x = F.relu(self.fc1(x)) 23 | x = F.relu(self.fc2(x)) 24 | action = F.log_softmax(self.fc3(x), dim=-1) 25 | V = F.relu(self.fc4(x)) 26 | return action, V 27 | 28 | def draw(self, eval=False): 29 | plt.style.use('dark_background') 30 | plt.figure(figsize=(10, 10)) 31 | if eval: 32 | plt.title('Evaluation of trained A3C with Shared Adam Optimizer on CartPole_V0', fontsize='xx-large') 33 | plt.xlabel('Rewards', fontsize='xx-large') 34 | plt.ylabel('Frequency', fontsize='xx-large') 35 | plt.hist(self.steps, range=(0, 200)) 36 | plt.show() 37 | else: 38 | mid = [] 39 | interval = 3 40 | for i in range(len(self.steps) - interval): 41 | mid.append(np.mean(self.steps[i:i + interval + 1])) 42 | plt.title('Performance of True Episode-Wise A3C on CartPole_V0', fontsize='xx-large') 43 | plt.xlabel('Episodes', fontsize='xx-large') 44 | plt.ylabel('Rewards', fontsize='xx-large') 45 | x_fit = list(range(len(self.steps) - interval)) 46 | plt.plot(x_fit, self.steps[interval:], '-', c='gray', label='Episode-Wise data') 47 | plt.plot(mid, '-', c='green', linewidth=5, label='Moving Average') 48 | plt.legend(loc="best", prop={'size': 12}) 49 | plt.show() 50 | 51 | 52 | if __name__ == '__main__': 53 | device = 'cpu' 54 | mp.set_start_method('spawn') 55 | # Do not change this unless you have multiple GPU. 
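# This async time-interval variant runs num_workers = 15 workers with horizon T = 300 (both set below): each worker pushes gradients at episode end or after at most T environment steps.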
56 | # update test 57 | q = mp.Queue() 58 | num_workers = 15 59 | processes = [] 60 | shared_model = Actor() 61 | shared_model.to(device) 62 | shared_model.share_memory() 63 | optimizer = SharedAdam(shared_model.parameters(), lr=0.001) 64 | p = mp.Process(target=evaluate, args=(shared_model, q)) 65 | processes.append(p) 66 | p.start() 67 | T = 300 68 | for worker_id in range(num_workers): 69 | p = mp.Process(target=worker, args=(shared_model, optimizer, q, T)) 70 | processes.append(p) 71 | p.start() 72 | # for p in processes: 73 | # p.join() 74 | episode = 0 75 | 76 | while True: 77 | if not q.empty(): 78 | shared_model.steps.append(q.get()) 79 | print(f'Training on episode {episode} Step {shared_model.steps[-1]}') 80 | episode += 1 81 | if len(shared_model.steps) > 25: 82 | if np.mean(shared_model.steps[-50:]) == 199: 83 | for p in processes: 84 | p.terminate() 85 | while not q.empty(): 86 | shared_model.steps.append(q.get()) 87 | print(f'Training on episode {episode} Step {shared_model.steps[-1]}') 88 | episode += 1 89 | break 90 | shared_model.draw() 91 | # ----evaluation---- 92 | shared_model.step = [] 93 | for episode in range(15): 94 | for worker_id in range(6): 95 | p = mp.Process(target=evaluate, args=(shared_model,q)) 96 | for p in processes: 97 | p.join() 98 | while not q.empty(): 99 | shared_model.steps.append(q.get()) 100 | shared_model.steps.sort() 101 | shared_model.draw(eval=True) 102 | -------------------------------------------------------------------------------- /A3C/A3C Time Interval Async/workers.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from torch.nn import Linear, ReLU 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | def worker(shared_model, optimizer, q, T): 11 | class Actor(torch.nn.Module): 12 | def __init__(self): 13 | super(Actor, self).__init__() 14 | self.fc1 = Linear(4, 128) 15 | self.fc2 = Linear(128, 128) 16 | self.fc3 = Linear(128, 2) 17 | self.fc4 = Linear(128, 1) 18 | self.steps = [] 19 | 20 | def forward(self, x): 21 | x = F.relu(self.fc1(x)) 22 | x = F.relu(self.fc2(x)) 23 | action = F.log_softmax(self.fc3(x), dim=-1) 24 | V = F.relu(self.fc4(x)) 25 | return action, V 26 | 27 | device = 'cpu' 28 | # I do not recommend using GPU for this method. CPU is much faster. 
29 | # Change this to cuda only if you have a poor CPU or on a cloud 30 | env = gym.make('CartPole-v0') 31 | obs = env.reset() 32 | actor = Actor() 33 | actor.to(device) 34 | gamma = 0.99 35 | eps = np.finfo(np.float32).eps.item() 36 | t = 0 37 | while True: 38 | action_log_history = [] 39 | V_history = [] 40 | for step in range(200): 41 | # -----lines below are line-corresponding to the original algorithm---- 42 | actor.load_state_dict(shared_model.state_dict()) 43 | obs = np.reshape(obs, [1, -1]) 44 | input_actor = Variable(torch.from_numpy(obs).float()).to(device) 45 | action_log_probability, V = actor(input_actor) 46 | p = np.exp(action_log_probability[0].detach().cpu()) 47 | action = np.random.choice(2, p=p.numpy()) 48 | action_log_history.append(action_log_probability[0][action]) 49 | V_history.append(V) 50 | obs, reward, done, info = env.step(action) 51 | t += 1 52 | if done or t >= T: 53 | if done: 54 | q.put(step) 55 | actor.zero_grad() 56 | if done: 57 | obs = env.reset() 58 | reward_list = np.ones((step + 1,)) 59 | for i in range(len(reward_list) - 2, -1, -1): 60 | reward_list[i] += reward_list[i + 1] * gamma 61 | reward_list -= np.mean(reward_list) 62 | reward_list /= (np.std(reward_list) + eps) 63 | Critic_Loss = [] 64 | Delta = [] 65 | for monte_carlo_return, V in zip(reward_list, V_history): 66 | Critic_Loss.append(F.smooth_l1_loss(V, torch.tensor([[monte_carlo_return]]).to(device))) 67 | Delta.append(monte_carlo_return - V.detach()) 68 | Actor_Loss = [] 69 | entropy = 0 70 | for log_p in action_log_history: 71 | entropy -= log_p * torch.exp(log_p) 72 | Delta = Delta[len(Delta) - len(action_log_history):] 73 | for delta, log_prob in zip(Delta, action_log_history): 74 | Actor_Loss.append(-log_prob * delta.detach()) 75 | loss = torch.stack(Critic_Loss).sum() + torch.stack(Actor_Loss).sum() + entropy * 0.01 76 | loss.backward() 77 | ensure_shared_grads(actor, shared_model) 78 | optimizer.step() 79 | action_log_history = [] 80 | V_history = [] 81 | actor.load_state_dict(shared_model.state_dict()) 82 | if done: 83 | t = 0 84 | break 85 | else: 86 | t = 0 87 | 88 | def ensure_shared_grads(model, shared_model): 89 | for param, shared_param in zip(model.parameters(), 90 | shared_model.parameters()): 91 | if shared_param.grad is not None: 92 | return 93 | shared_param._grad = param.grad 94 | -------------------------------------------------------------------------------- /A3C/README About Vairations.md: -------------------------------------------------------------------------------- 1 | ## Review of A2C and A3C 2 | We all know the A3C is using a episode count to control the async 3 | process. In this procedure, child process will return the grad 4 | after given episodes or finished early. They will be sync in the next 5 | run. 6 | 7 | A2C however, waits for each child process to finish its segments. 8 | 9 | ## My Variation 10 | In this project, I change the sync or async method from A2C and A3C 11 | into episode wise. That is, the child process will only return 12 | the grad AFTER it completes the episode. 13 | 14 | In variation of A3C, child process will return the grad as soon 15 | as it finished the current episode, return the latest grad, do the backward 16 | then sync with the latest model parameters to go next. It will be put 17 | into an infinite loop. This method will never call join() method to process, 18 | instead, it will monitor a queue that is filled by each child process's game 19 | record. 
If the last 100 of them are maximum rewards, it will send terminate 20 | message to all child processes. 21 | 22 | In variation of A2C, child process will return the grad as soon 23 | as it finished the current episode but all processes will wait every one 24 | finished and sync with the updated model together. In this case we need 25 | a loop and call joint in each loop. If the queue fulfills the convergence requirement, 26 | loop will be ended. 27 | 28 | Additionally, I have set all mode do not learn at all if it reaches the maximum 29 | reward to facilitate converging. [need TESTED] 30 | 31 | ## More Variations: 32 | There isn't too much difference in each child processes. It could 33 | be a exploratory directions. And, how to measure the differences of each 34 | method need a large amount of time. 35 | 36 | ## Additional Warnings: 37 | when initialize multi-processing, fork does not work. If you are in Linux or Mac, change to spawn. 38 | Reasons unclear to me at this point. 39 | -------------------------------------------------------------------------------- /Ape-X/PyTorch_Ape-X.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LyWangPX/Reinforcement_Learning_Coding_Examples/2f40f67f5709c9dc4ea3d9dd15b441b627b595a6/Ape-X/PyTorch_Ape-X.py -------------------------------------------------------------------------------- /D4PG/PyTorch_D4PG.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | -------------------------------------------------------------------------------- /DDPG/PyTorch_DDPG.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import torch.nn.functional as F 4 | from collections import deque 5 | import gym 6 | import matplotlib.pyplot as plt 7 | from torch.nn import Linear, ReLU 8 | 9 | """ 10 | This is a vanilla implementation of DDPG. 
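A deterministic actor maps states to actions and a critic estimates Q(s, a); exploration adds Ornstein-Uhlenbeck noise, target networks are updated softly with rate tau, and the replay buffer is sampled uniformly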
(without PER) 11 | """ 12 | 13 | 14 | class Actor(torch.nn.Module): 15 | def __init__(self, maxlen=100000): 16 | super(Actor, self).__init__() 17 | self.fc1 = Linear(3, 256) 18 | self.fc2 = Linear(256, 256) 19 | self.fc3 = Linear(256, 256) 20 | self.fc4 = Linear(256, 1) 21 | self.s_buffer = deque(maxlen=maxlen) 22 | self.a_buffer = deque(maxlen=maxlen) 23 | self.r_buffer = deque(maxlen=maxlen) 24 | self.next_s_buffer = deque(maxlen=maxlen) 25 | 26 | def forward(self, x): 27 | x = F.relu(self.fc1(x)) 28 | x = F.relu(self.fc2(x)) 29 | x = F.relu(self.fc3(x)) 30 | action = 2*torch.tanh(self.fc4(x)) 31 | return action 32 | 33 | def bufferin(self, s, a, r, next_s): 34 | self.s_buffer.append(s) 35 | self.a_buffer.append(a) 36 | self.r_buffer.append(r) 37 | self.next_s_buffer.append(next_s) 38 | 39 | def sample(self, batch_size=64): 40 | indices = np.random.choice(range(len(self.a_buffer)), size=min(len(self.a_buffer), batch_size), replace=False) 41 | s_buffer = [self.s_buffer[i] for i in indices] 42 | a_buffer = [self.a_buffer[i] for i in indices] 43 | r_buffer = [self.r_buffer[i] for i in indices] 44 | next_s_buffer = [self.next_s_buffer[i] for i in indices] 45 | return a_buffer, s_buffer, r_buffer, next_s_buffer 46 | 47 | 48 | class Critic(torch.nn.Module): 49 | def __init__(self): 50 | super(Critic, self).__init__() 51 | self.fc1 = Linear(4, 256) 52 | self.fc2 = Linear(256, 512) 53 | self.fc3 = Linear(512, 1) 54 | self.action = Linear(1, 256) 55 | 56 | def forward(self, x, a): 57 | x = torch.cat([x,a],1) 58 | x = F.relu(self.fc1(x)) 59 | x = F.relu(self.fc2(x)) 60 | Q = self.fc3(x) 61 | return Q 62 | 63 | 64 | def evaluate(target_policy, device, final=False): 65 | target_policy.eval() 66 | env = NormalizedEnv(gym.make('Pendulum-v0')) 67 | s = env.reset() 68 | if final: 69 | result = [] 70 | for episode in range(100): 71 | rewards = 0 72 | for step in range(200): 73 | action = target_policy.forward(torch.FloatTensor(s)) 74 | s, reward, done, _ = env.step([action.detach()]) 75 | rewards += reward 76 | if done: 77 | result.append(rewards) 78 | s = env.reset() 79 | return result 80 | else: 81 | result = [] 82 | for episode in range(1): 83 | rewards = 0 84 | for step in range(200): 85 | action = target_policy.forward(torch.FloatTensor(s)) 86 | s, reward, done, _ = env.step([float(action)]) 87 | rewards += reward 88 | if done: 89 | result.append(rewards) 90 | s = env.reset() 91 | return result 92 | 93 | 94 | def draw(steps, name): 95 | plt.style.use('dark_background') 96 | plt.figure(figsize=(10, 10)) 97 | mid = [] 98 | interval = 3 99 | for i in range(len(steps) - interval): 100 | mid.append(np.mean(steps[i:i + interval + 1])) 101 | plt.title(f'{name} DDPG on Pendulum_V0 ', fontsize='xx-large') 102 | plt.xlabel('Episodes', fontsize='xx-large') 103 | plt.ylabel(f'{name}', fontsize='xx-large') 104 | x_fit = list(range(len(steps) - interval)) 105 | plt.plot(x_fit, steps[interval:], '-', c='gray', label='Episode-Wise data') 106 | plt.plot(mid, '-', c='green', linewidth=5, label='Moving Average') 107 | plt.legend(loc="best", prop={'size': 12}) 108 | plt.show() 109 | 110 | 111 | # https://github.com/openai/gym/blob/master/gym/core.py 112 | class NormalizedEnv(gym.ActionWrapper): 113 | """ Wrap action """ 114 | 115 | def action(self, action): 116 | act_k = (self.action_space.high - self.action_space.low) / 2. 117 | act_b = (self.action_space.high + self.action_space.low) / 2. 118 | return act_k * action + act_b 119 | 120 | def reverse_action(self, action): 121 | act_k_inv = 2. 
/ (self.action_space.high - self.action_space.low) 122 | act_b = (self.action_space.high + self.action_space.low) / 2. 123 | return act_k_inv * (action - act_b) 124 | 125 | 126 | class Ornstein_Uhlenbeck_Process: 127 | def __init__(self, dt=0.3): 128 | self.theta = 0.15 129 | self.sigma = 0.2 130 | self.dt = dt 131 | self.x = 0 132 | 133 | def step(self): 134 | dW = self.dt ** 2 * np.random.normal() 135 | dx = -self.theta * self.x * self.dt + self.sigma * dW 136 | self.x += dx 137 | return self.x 138 | 139 | 140 | def main(): 141 | # create two identical model 142 | # ---hyper parameter--- 143 | gamma = 0.99 144 | tau = 0.01 145 | # ---hyper parameter--- 146 | steps = [] 147 | device = 'cpu' 148 | actor = Actor().to(device) 149 | actor_optimizer = torch.optim.Adam(actor.parameters(), lr=1e-4) 150 | target_actor = Actor().to(device) 151 | critic = Critic().to(device) 152 | critic_optimizer = torch.optim.Adam(critic.parameters(), lr=1e-3) 153 | target_critic = Critic().to(device) 154 | for target_param, param in zip(target_actor.parameters(), actor.parameters()): 155 | target_param.data.copy_(param.data) 156 | for target_param, param in zip(target_critic.parameters(), critic.parameters()): 157 | target_param.data.copy_(param.data) 158 | 159 | env = gym.make('Pendulum-v0') 160 | s = env.reset() 161 | A_loss = [] 162 | C_loss = [] 163 | actor.train() 164 | critic.train() 165 | for episode in range(100): 166 | rewards = 0 167 | random_process = Ornstein_Uhlenbeck_Process(dt=0.1) 168 | for step in range(250): 169 | 170 | # LINE 1 Select Action 171 | action = (actor.forward(torch.FloatTensor(s)) + random_process.step()) 172 | 173 | # LINE 2 Execute and Observe 174 | next_s, reward, done, _ = env.step(action.detach()) 175 | # LINE 3 Store 176 | actor.bufferin(s, action, reward, next_s) 177 | 178 | s = next_s 179 | rewards += reward 180 | if len(actor.a_buffer) > 180: 181 | # LINE 4 SAMPLE a minibatch 182 | a_buffer, s_buffer, r_buffer, next_s_buffer = actor.sample() 183 | a_buffer = torch.FloatTensor(a_buffer).view(-1,1) 184 | s_buffer = torch.FloatTensor(s_buffer).view(-1,3) 185 | r_buffer = torch.FloatTensor(r_buffer).view(-1,1) 186 | next_s_buffer = torch.FloatTensor(next_s_buffer).view(-1,3) 187 | 188 | # LINE 5 Set y = r + gamma next Q from target critic 189 | next_a = target_actor(next_s_buffer.to(device)) 190 | next_Q = target_critic(next_s_buffer.to(device), next_a.to(device)) 191 | y = r_buffer.to(device) + gamma * next_Q 192 | 193 | 194 | # LINE 7 Update the actor policy using sampled policy gradient 195 | true_a = actor(s_buffer.to(device)) 196 | actor_loss_total = critic.forward(s_buffer.to(device), true_a.to(device)) 197 | actor_loss = -actor_loss_total.mean() 198 | actor.zero_grad() 199 | actor_loss.backward() 200 | actor_optimizer.step() 201 | 202 | # LINE 6 Update critic by minimizing the mse. 
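# i.e. minimize mean((Q(s, a) - y)^2), where y = r + gamma * Q_target(s', mu_target(s')) was formed at LINE 5 above.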
203 | Q = critic(s_buffer.to(device), 204 | a_buffer.float().to(device)) 205 | critic_loss = torch.nn.functional.mse_loss(Q, y.detach()) 206 | critic_optimizer.zero_grad() 207 | critic_loss.backward() 208 | critic_optimizer.step() 209 | 210 | A_loss.append(actor_loss.item()) 211 | C_loss.append(critic_loss.item()) 212 | 213 | # LINE 8 Update the target network 214 | for target_param, param in zip(target_actor.parameters(), actor.parameters()): 215 | target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data) 216 | 217 | for target_param, param in zip(target_critic.parameters(), critic.parameters()): 218 | target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data) 219 | if done: 220 | s = env.reset() 221 | steps.append(rewards) 222 | print(f'episode {episode}, total rewards {steps[-1]}') 223 | break 224 | draw(steps, 'rewards') 225 | draw(A_loss, 'A_loss') 226 | draw(C_loss, 'C_loss') 227 | hist = evaluate(target_actor, device, final=True) 228 | draw(hist, 'eval') 229 | 230 | 231 | if __name__ == '__main__': 232 | main() 233 | -------------------------------------------------------------------------------- /Deuling Double DQN with PER/PyTorch_Deuling_DDQN_with_PER.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import gym 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import torch.nn.functional as F 6 | from torch.nn import Linear, ReLU 7 | from collections import deque 8 | 9 | """ 10 | This is a vanilla Deuling Double DQN with PER (softmax absolute delta) 11 | """ 12 | 13 | 14 | class Q_network(torch.nn.Module): 15 | def __init__(self, n_action=2): 16 | super(Q_network, self).__init__() 17 | self.input = Linear(4, 256) 18 | self.input_to_V = Linear(256, 64) 19 | self.input_to_A = Linear(256, 64) 20 | self.input_to_V2 = Linear(64, 1) 21 | self.input_to_A2 = Linear(64, n_action) 22 | self.a_buffer = deque(maxlen=8192) 23 | self.r_buffer = deque(maxlen=8192) 24 | self.s_buffer = deque(maxlen=8192) 25 | self.done_buffer = deque(maxlen=8192) 26 | self.next_s_buffer = deque(maxlen=8192) 27 | self.priority_buffer = deque(maxlen=8192) 28 | 29 | def forward(self, x): 30 | x = F.relu(self.input(x)) 31 | V_stream = F.relu(self.input_to_V(x)) 32 | V_stream = self.input_to_V2(V_stream) 33 | A_stream = F.relu(self.input_to_A(x)) 34 | A_stream = self.input_to_A2(A_stream) 35 | A_mean = torch.mean(A_stream, dim=1, keepdim=True) 36 | result = V_stream + A_stream - A_mean 37 | return result 38 | 39 | def bufferin(self, tuple_info): 40 | # expect tuple_info with content S, A, R, S' 41 | # ALL in TENSOR FORMAT 42 | state, action, reward, next_S, done, priority = tuple_info 43 | self.a_buffer.append(action) 44 | self.s_buffer.append(state) 45 | self.r_buffer.append(reward) 46 | self.next_s_buffer.append(next_S) 47 | self.done_buffer.append(done) 48 | self.priority_buffer.append(priority) 49 | 50 | def sample(self, size=64): 51 | with torch.no_grad(): 52 | prob = np.array(F.softmax(torch.stack(list(self.priority_buffer))).view(-1)) 53 | prob /= prob.sum() 54 | sample_indices = np.random.choice(range(len(self.a_buffer)), size=64, p=prob, replace=False) 55 | a_sample = [self.a_buffer[i] for i in sample_indices] 56 | r_sample = [self.r_buffer[i] for i in sample_indices] 57 | s_sample = [self.s_buffer[i] for i in sample_indices] 58 | next_s_sample = [self.next_s_buffer[i] for i in sample_indices] 59 | done_sample = [self.done_buffer[i] for i in sample_indices] 60 | 61 | a_sample = 
torch.Tensor(a_sample).view(-1, 1) 62 | r_sample = torch.Tensor(r_sample).view(-1, 1) 63 | s_sample = torch.stack(s_sample).view(-1, 4) 64 | next_s_sample = torch.stack(next_s_sample).view(-1, 4) 65 | done_sample = torch.Tensor(done_sample).view(-1, 1) 66 | 67 | return s_sample, a_sample, r_sample, next_s_sample, done_sample, sample_indices 68 | 69 | 70 | def main(): 71 | gamma = 0.99 72 | beta = 0.25 73 | env = gym.make('CartPole-v0') 74 | state = env.reset() 75 | state = torch.FloatTensor(state).view(-1, 4) 76 | Q_target = Q_network() 77 | optimizer = torch.optim.Adam(Q_target.parameters(), lr=0.0003) 78 | Q_copy = Q_network() 79 | for param_target, param_copy in zip(Q_target.parameters(), Q_copy.parameters()): 80 | param_copy.data.copy_(param_target.data) 81 | steps = [] 82 | for episode in range(10000): 83 | Q_mean = 0 84 | for step in range(200): 85 | with torch.no_grad(): 86 | Q_list = Q_target.forward(state) 87 | if np.random.random() > beta: 88 | action = np.argmax(Q_list.detach()) 89 | next_state, reward, done, _ = env.step(action.item()) 90 | else: 91 | action = np.random.randint(2) 92 | next_state, reward, done, _ = env.step(action) 93 | next_state = torch.FloatTensor(next_state).view(-1, 4) 94 | # PER: Calculate delta of this tuple 95 | Q = Q_list[0][action] 96 | Q_prime = Q_copy.forward(next_state) 97 | next_action = np.argmax(Q_prime.detach()) 98 | delta = abs(reward + gamma * Q_prime[0][next_action] - Q)  # |TD error| = |target - current estimate| 99 | delta = delta.view(1) 100 | tuple_info = (state, action, torch.Tensor([reward]), next_state, not done, delta) 101 | Q_target.bufferin(tuple_info) 102 | # Learning Part 103 | if len(Q_target.a_buffer) > 64: 104 | s_sample, a_sample, r_sample, next_s_sample, done_sample, ids = Q_target.sample() 105 | # Q values from recorded S and A 106 | Q = Q_target.forward(s_sample) 107 | Q_mean = Q.mean() 108 | Q = Q.gather(1, a_sample.long().view(-1, 1)) 109 | # Q' values from recorded S and A recalculated from Q 110 | next_Q = Q_target.forward(next_s_sample) 111 | Q_values, Q_actions = torch.max(next_Q.detach(), 1) 112 | Q_actions = Q_actions.view(-1, 1) 113 | Q_prime = Q_copy.forward(next_s_sample) 114 | Q_prime = Q_prime.gather(1, Q_actions.long().view(-1, 1)) 115 | y = r_sample + gamma * Q_prime * done_sample 116 | deltas = abs(Q - y.detach()) 117 | for delta, id in zip(deltas, ids): 118 | Q_target.priority_buffer[id] = delta.view(-1) 119 | loss = F.mse_loss(Q, y.detach()) 120 | Q_target.zero_grad() 121 | loss.backward() 122 | optimizer.step() 123 | 124 | # Loop reset Part 125 | if not done: 126 | state = next_state 127 | else: 128 | state = torch.FloatTensor(env.reset()).view(-1, 4) 129 | print(f'episode {episode}, step {step}, Q_average {Q_mean}') 130 | steps.append(step) 131 | break 132 | if episode % 3 == 0: 133 | for param_target, param_copy in zip(Q_target.parameters(), Q_copy.parameters()): 134 | param_copy.data.copy_(param_target.data) 135 | if episode > 40: 136 | beta = 5 / episode 137 | 138 | if np.mean(steps[-20:]) > 190: 139 | break 140 | 141 | plt.style.use('dark_background') 142 | plt.figure(figsize=(10, 10)) 143 | mid = [] 144 | interval = 3 145 | for i in range(len(steps) - interval): 146 | mid.append(np.mean(steps[i:i + interval + 1])) 147 | plt.title(f'Dueling DDQN on CartPole-v0 with PER', fontsize='xx-large') 148 | plt.xlabel('Episodes', fontsize='xx-large') 149 | plt.ylabel(f'Rewards', fontsize='xx-large') 150 | x_fit = list(range(len(steps) - interval)) 151 | plt.plot(x_fit, steps[interval:], '-', c='gray', label='Episode-Wise data') 152 | plt.plot(mid, '-',
c='green', linewidth=5, label='Moving Average') 153 | plt.legend(loc="best", prop={'size': 12}) 154 | plt.show() 155 | 156 | 157 | if __name__ == '__main__': 158 | main() 159 | -------------------------------------------------------------------------------- /Deuling Double DQN/PyTorch_Deuling_DDQN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import gym 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import torch.nn.functional as F 6 | from torch.nn import Linear 7 | from collections import deque 8 | 9 | """ 10 | This is a vanilla Dueling Double DQN without PER. 11 | """ 12 | 13 | class Q_network(torch.nn.Module): 14 | def __init__(self, n_action=2): 15 | super(Q_network, self).__init__() 16 | self.input = Linear(4, 256) 17 | self.input_to_V = Linear(256, 64) 18 | self.input_to_A = Linear(256, 64) 19 | self.input_to_V2 = Linear(64, 1) 20 | self.input_to_A2 = Linear(64, n_action) 21 | self.a_buffer = deque(maxlen=8192) 22 | self.r_buffer = deque(maxlen=8192) 23 | self.s_buffer = deque(maxlen=8192) 24 | self.done_buffer = deque(maxlen=8192) 25 | self.next_s_buffer = deque(maxlen=8192) 26 | 27 | def forward(self, x): 28 | x = F.relu(self.input(x)) 29 | V_stream = F.relu(self.input_to_V(x)) 30 | V_stream = self.input_to_V2(V_stream) 31 | A_stream = F.relu(self.input_to_A(x)) 32 | A_stream = self.input_to_A2(A_stream) 33 | A_mean = torch.mean(A_stream, dim=1, keepdim=True) 34 | result = V_stream + A_stream - A_mean 35 | return result 36 | 37 | def bufferin(self, tuple_info): 38 | # expect tuple_info with content S, A, R, S' 39 | # ALL in TENSOR FORMAT 40 | state, action, reward, next_S, done = tuple_info 41 | self.a_buffer.append(action) 42 | self.s_buffer.append(state) 43 | self.r_buffer.append(reward) 44 | self.next_s_buffer.append(next_S) 45 | self.done_buffer.append(done) 46 | 47 | def sample(self, size=64): 48 | sample_indices = np.random.choice(range(len(self.a_buffer)), 64, replace=False) 49 | a_sample = [self.a_buffer[i] for i in sample_indices] 50 | r_sample = [self.r_buffer[i] for i in sample_indices] 51 | s_sample = [self.s_buffer[i] for i in sample_indices] 52 | next_s_sample = [self.next_s_buffer[i] for i in sample_indices] 53 | done_sample = [self.done_buffer[i] for i in sample_indices] 54 | 55 | a_sample = torch.Tensor(a_sample).view(-1, 1) 56 | r_sample = torch.Tensor(r_sample).view(-1, 1) 57 | s_sample = torch.stack(s_sample).view(-1, 4) 58 | next_s_sample = torch.stack(next_s_sample).view(-1, 4) 59 | done_sample = torch.Tensor(done_sample).view(-1, 1) 60 | 61 | return s_sample, a_sample, r_sample, next_s_sample, done_sample 62 | 63 | 64 | def main(): 65 | gamma = 0.99 66 | beta = 0.25 67 | env = gym.make('CartPole-v0') 68 | state = env.reset() 69 | state = torch.FloatTensor(state).view(-1, 4) 70 | Q_target = Q_network() 71 | optimizer = torch.optim.Adam(Q_target.parameters(), lr=0.001) 72 | Q_copy = Q_network() 73 | for param_target, param_copy in zip(Q_target.parameters(), Q_copy.parameters()): 74 | param_copy.data.copy_(param_target.data) 75 | steps = [] 76 | for episode in range(10000): 77 | Q_mean = 0 78 | for step in range(200): 79 | Q_list = Q_target.forward(state) 80 | if np.random.random() > beta: 81 | action = np.argmax(Q_list.detach()) 82 | next_state, reward, done, _ = env.step(action.item()) 83 | else: 84 | action = np.random.randint(2) 85 | next_state, reward, done, _ = env.step(action) 86 | next_state = torch.FloatTensor(next_state).view(-1, 4) 87 | tuple_info = (state, action, 
torch.Tensor([reward]), next_state, not done) 88 | Q_target.bufferin(tuple_info) 89 | # Learning Part 90 | if len(Q_target.a_buffer) > 64: 91 | s_sample, a_sample, r_sample, next_s_sample, done_sample = Q_target.sample() 92 | # Q values from recorded S and A 93 | Q = Q_target.forward(s_sample) 94 | Q_mean = Q.mean() 95 | Q = Q.gather(1, a_sample.long().view(-1, 1)) 96 | # Q' values from recorded S and A recalculated from Q 97 | next_Q = Q_target.forward(next_s_sample) 98 | Q_values, Q_actions = torch.max(next_Q.detach(), 1) 99 | Q_actions = Q_actions.view(-1,1) 100 | Q_prime = Q_copy.forward(next_s_sample) 101 | Q_prime = Q_prime.gather(1, Q_actions.long().view(-1, 1)) 102 | y = r_sample + gamma * Q_prime * done_sample 103 | loss = F.mse_loss(Q, y.detach()) 104 | Q_target.zero_grad() 105 | loss.backward() 106 | optimizer.step() 107 | 108 | # Loop reset Part 109 | if not done: 110 | state = next_state 111 | else: 112 | state = torch.FloatTensor(env.reset()).view(-1, 4) 113 | print(f'episode {episode}, step {step}, Q_average {Q_mean}') 114 | steps.append(step) 115 | break 116 | if episode % 3 == 0: 117 | for param_target, param_copy in zip(Q_target.parameters(), Q_copy.parameters()): 118 | param_copy.data.copy_(param_target.data) 119 | if episode > 40: 120 | beta = 5/episode 121 | 122 | if np.mean(steps[-20:]) > 190: 123 | break 124 | 125 | 126 | plt.style.use('dark_background') 127 | plt.figure(figsize=(10, 10)) 128 | mid = [] 129 | interval = 3 130 | for i in range(len(steps) - interval): 131 | mid.append(np.mean(steps[i:i + interval + 1])) 132 | plt.title(f'Deuling DDQN on CartPole-v0', fontsize='xx-large') 133 | plt.xlabel('Episodes', fontsize='xx-large') 134 | plt.ylabel(f'Rewards', fontsize='xx-large') 135 | x_fit = list(range(len(steps) - interval)) 136 | plt.plot(x_fit, steps[interval:], '-', c='gray', label='Episode-Wise data') 137 | plt.plot(mid, '-', c='green', linewidth=5, label='Moving Average') 138 | plt.legend(loc="best", prop={'size': 12}) 139 | plt.show() 140 | 141 | 142 | if __name__ == '__main__': 143 | main() 144 | -------------------------------------------------------------------------------- /Experiments/Online TD and true Online TD.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "\n", 13 | "# Define state 1, 2, 3 ,4 ,5... 
19 as normal state with one-hot encoding\n", 14 | "# state 0 and state 20 share the same zero feature vectors.\n", 15 | "\n", 16 | "def feature_map(state):\n", 17 | " zero_model = [0]*19\n", 18 | " zero_model[state-1] = 1\n", 19 | " zero_model = np.array(zero_model)\n", 20 | " zero_model.resize((19,1))\n", 21 | " return np.array(zero_model)\n", 22 | "\n", 23 | "# create a hash table to quickly draw features\n", 24 | "feature_hash = {0: np.zeros((19,1)),\n", 25 | " 20: np.zeros((19,1))}\n", 26 | "for state in range(1,20):\n", 27 | " feature_hash[state] = feature_map(state)\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 65, 33 | "outputs": [], 34 | "source": [ 35 | "history = []\n", 36 | "for episode in range(2):\n", 37 | " local = [10]\n", 38 | " state = 10\n", 39 | " while True:\n", 40 | " if np.random.random() > 0.5:\n", 41 | " state += 1\n", 42 | " else:\n", 43 | " state -= 1\n", 44 | " local.append(state)\n", 45 | " if state == 0 or state == 20:\n", 46 | " history.append(local)\n", 47 | " break" 48 | ], 49 | "metadata": { 50 | "collapsed": false, 51 | "pycharm": { 52 | "name": "#%%\n" 53 | } 54 | } 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 66, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "processing episode 1 horizon 245\r" 65 | ] 66 | } 67 | ], 68 | "source": [ 69 | "# hand pick the hyper parameters\n", 70 | "alpha = 0.4\n", 71 | "gamma = 0.8\n", 72 | "_lambda = 0.9\n", 73 | "\n", 74 | "# set all ones as initialization\n", 75 | "w_last_episode = np.ones((19,1))\n", 76 | "w_last_round = np.ones((19,1))\n", 77 | "w_forward = {}\n", 78 | "def n_step_G(t, h, w, hist):\n", 79 | " if h == len(hist):\n", 80 | " # v(T) == 0; reward == 1\n", 81 | " if hist[-1] == 20:\n", 82 | " return gamma**(h-t-1)\n", 83 | " else:\n", 84 | " return 0\n", 85 | " else:\n", 86 | " # reward == 0; \n", 87 | " return gamma**(h-t)*(w.T@feature_hash[hist[h-1]])\n", 88 | " \n", 89 | "def lambda_G(t,h,hist):\n", 90 | " first_term = np.sum([_lambda**(n-1)*n_step_G(t,t+n,w_dict[n-1],hist) for n in range(1, h-t)])\n", 91 | " return (1-_lambda)*first_term + _lambda**(h-t-1)*n_step_G(t,h,w_dict[h-1],hist)\n", 92 | " \n", 93 | "for i,hist in enumerate(history):\n", 94 | " w_dict = {0:w_last_episode}\n", 95 | " for h in range(1, len(hist)+1):\n", 96 | " print(f'processing episode {i} horizon {h}', end = '\\r')\n", 97 | " w_old = w_last_episode\n", 98 | " for t in range(1,h+1):\n", 99 | " w = w_old + alpha*(lambda_G(t-1,h,hist) - w_old.T@feature_hash[hist[t-1]])*feature_hash[hist[t-1]]\n", 100 | " w_old = w\n", 101 | " else:\n", 102 | " w_dict[h] = w_old\n", 103 | " else:\n", 104 | " w_forward[i] = w_old\n", 105 | " w_last_episode = w_old\n" 106 | ], 107 | "metadata": { 108 | "collapsed": false, 109 | "pycharm": { 110 | "name": "#%%\n" 111 | } 112 | } 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 67, 117 | "outputs": [], 118 | "source": [ 119 | "w_online = {}\n", 120 | "w_2 = np.ones((19,1))\n", 121 | "for episode, hist in enumerate(history):\n", 122 | " z = np.zeros((19,1))\n", 123 | " V_old = 0\n", 124 | " for i, state in enumerate(hist):\n", 125 | " if i == len(hist)-2:\n", 126 | " if hist[i+1] == 20:\n", 127 | " R = 1\n", 128 | " else:\n", 129 | " R = 0\n", 130 | " done = True\n", 131 | " else:\n", 132 | " R = 0\n", 133 | " done = False\n", 134 | " V = w_2.T@feature_hash[state]\n", 135 | " V_prime = w_2.T@feature_hash[hist[i+1]]\n", 136 | " delta = R + gamma*V_prime - V\n", 137 | " z = 
_lambda*gamma*z + (1-alpha*gamma*_lambda*z.T@feature_hash[state])*feature_hash[state]\n", 138 | " w_2 = w_2 + alpha*(delta + V - V_old)*z - alpha*(V-V_old)*feature_hash[state]\n", 139 | " V_old = V_prime\n", 140 | " if done:\n", 141 | " w_online[episode] = w_2\n", 142 | " break" 143 | ], 144 | "metadata": { 145 | "collapsed": false, 146 | "pycharm": { 147 | "name": "#%%\n" 148 | } 149 | } 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 70, 154 | "outputs": [ 155 | { 156 | "data": { 157 | "text/plain": "array([[1. ],\n [1. ],\n [1. ],\n [1. ],\n [1. ],\n [1. ],\n [0.53231318],\n [0.32852723],\n [0.27689236],\n [0.28178901],\n [0.2804106 ],\n [0.27976137],\n [0.2788142 ],\n [0.27958496],\n [0.28473191],\n [0.29893919],\n [0.36824546],\n [0.55494604],\n [0.8125568 ]])" 158 | }, 159 | "execution_count": 70, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "w_forward[0]" 166 | ], 167 | "metadata": { 168 | "collapsed": false, 169 | "pycharm": { 170 | "name": "#%%\n" 171 | } 172 | } 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 71, 177 | "outputs": [ 178 | { 179 | "data": { 180 | "text/plain": "array([[1. ],\n [1. ],\n [1. ],\n [1. ],\n [1. ],\n [1. ],\n [0.4232541 ],\n [0.15123449],\n [0.02727636],\n [0.01142531],\n [0.00900136],\n [0.01016122],\n [0.01274722],\n [0.01743689],\n [0.04539897],\n [0.12147425],\n [0.28585104],\n [0.58749665],\n [0.90724135]])" 181 | }, 182 | "execution_count": 71, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "w_online[0]" 189 | ], 190 | "metadata": { 191 | "collapsed": false, 192 | "pycharm": { 193 | "name": "#%%\n" 194 | } 195 | } 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "outputs": [], 201 | "source": [ 202 | "\n" 203 | ], 204 | "metadata": { 205 | "collapsed": false, 206 | "pycharm": { 207 | "name": "#%%\n" 208 | } 209 | } 210 | } 211 | ], 212 | "metadata": { 213 | "kernelspec": { 214 | "display_name": "Python 3", 215 | "language": "python", 216 | "name": "python3" 217 | }, 218 | "language_info": { 219 | "codemirror_mode": { 220 | "name": "ipython", 221 | "version": 2 222 | }, 223 | "file_extension": ".py", 224 | "mimetype": "text/x-python", 225 | "name": "python", 226 | "nbconvert_exporter": "python", 227 | "pygments_lexer": "ipython2", 228 | "version": "2.7.6" 229 | } 230 | }, 231 | "nbformat": 4, 232 | "nbformat_minor": 0 233 | } -------------------------------------------------------------------------------- /Experiments/Seijen2014_True_Online_TD.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 12, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "\n", 13 | "state_map = {\n", 14 | " 'A':[('B', -1), ('C', 1), ('D',-1)],\n", 15 | " 'B':[('E', 1), ('T', 1)],\n", 16 | " 'C':[('A', -1), ('B', 1), ('D', 1), ('T', 1)],\n", 17 | " 'D':[('F', 1), ('T', 1)],\n", 18 | " 'E':[('T', 1)],\n", 19 | " 'F':[('T', 1)]\n", 20 | "}\n", 21 | "\n", 22 | "feature_map = {\n", 23 | " 'A':np.array([[1],[0],[0],[0],[0],[0]]),\n", 24 | " 'B':np.array([[0],[1],[0],[0],[0],[0]]),\n", 25 | " 'C':np.array([[0],[0],[1],[0],[0],[0]]),\n", 26 | " 'D':np.array([[0],[0],[0],[1],[0],[0]]),\n", 27 | " 'E':np.array([[0],[0],[0],[0],[1],[0]]),\n", 28 | " 'F':np.array([[0],[0],[0],[0],[0],[1]]),\n", 29 | " 
'T':np.array([[0],[0],[0],[0],[0],[0]]),\n", 30 | "}\n", 31 | "theta_2014 = np.array([[0],[0],[0],[0],[0],[0]])\n", 32 | "theta_2016 = np.array([[0],[0],[0],[0],[0],[0]])" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 14, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "-----episode 0-----\n", 44 | "theta_2014: [[-0.01930828]\n", 45 | " [ 0.04260991]\n", 46 | " [ 0.01048235]\n", 47 | " [ 0.06055846]\n", 48 | " [ 0.029701 ]\n", 49 | " [ 0.0199 ]]\n", 50 | "theta_2016: [[-0.01930828]\n", 51 | " [ 0.04260991]\n", 52 | " [ 0.01048235]\n", 53 | " [ 0.06055846]\n", 54 | " [ 0.029701 ]\n", 55 | " [ 0.0199 ]]\n", 56 | "error 1.0408340855860843e-17\n", 57 | "---------------------------\n", 58 | "-----episode 1-----\n", 59 | "theta_2014: [[-0.00789621]\n", 60 | " [ 0.05218381]\n", 61 | " [ 0.02174718]\n", 62 | " [ 0.06055846]\n", 63 | " [ 0.029701 ]\n", 64 | " [ 0.0199 ]]\n", 65 | "theta_2016: [[-0.00789621]\n", 66 | " [ 0.05218381]\n", 67 | " [ 0.02174718]\n", 68 | " [ 0.06055846]\n", 69 | " [ 0.029701 ]\n", 70 | " [ 0.0199 ]]\n", 71 | "error 8.673617379884035e-18\n", 72 | "---------------------------\n", 73 | "-----episode 2-----\n", 74 | "theta_2014: [[-0.01636229]\n", 75 | " [ 0.06166197]\n", 76 | " [ 0.02174718]\n", 77 | " [ 0.06055846]\n", 78 | " [ 0.029701 ]\n", 79 | " [ 0.0199 ]]\n", 80 | "theta_2016: [[-0.01636229]\n", 81 | " [ 0.06166197]\n", 82 | " [ 0.02174718]\n", 83 | " [ 0.06055846]\n", 84 | " [ 0.029701 ]\n", 85 | " [ 0.0199 ]]\n", 86 | "error 6.938893903907228e-18\n", 87 | "---------------------------\n", 88 | "-----episode 3-----\n", 89 | "theta_2014: [[-0.02466909]\n", 90 | " [ 0.06166197]\n", 91 | " [ 0.02174718]\n", 92 | " [ 0.06995287]\n", 93 | " [ 0.029701 ]\n", 94 | " [ 0.0199 ]]\n", 95 | "theta_2016: [[-0.02466909]\n", 96 | " [ 0.06166197]\n", 97 | " [ 0.02174718]\n", 98 | " [ 0.06995287]\n", 99 | " [ 0.029701 ]\n", 100 | " [ 0.0199 ]]\n", 101 | "error 6.938893903907228e-18\n", 102 | "---------------------------\n", 103 | "-----episode 4-----\n", 104 | "theta_2014: [[-0.03280912]\n", 105 | " [ 0.06166197]\n", 106 | " [ 0.02174718]\n", 107 | " [ 0.07925335]\n", 108 | " [ 0.029701 ]\n", 109 | " [ 0.0199 ]]\n", 110 | "theta_2016: [[-0.03280912]\n", 111 | " [ 0.06166197]\n", 112 | " [ 0.02174718]\n", 113 | " [ 0.07925335]\n", 114 | " [ 0.029701 ]\n", 115 | " [ 0.0199 ]]\n", 116 | "error 6.938893903907228e-18\n", 117 | "---------------------------\n", 118 | "-----episode 5-----\n", 119 | "theta_2014: [[-0.04094162]\n", 120 | " [ 0.07104535]\n", 121 | " [ 0.02174718]\n", 122 | " [ 0.07925335]\n", 123 | " [ 0.029701 ]\n", 124 | " [ 0.0199 ]]\n", 125 | "theta_2016: [[-0.04094162]\n", 126 | " [ 0.07104535]\n", 127 | " [ 0.02174718]\n", 128 | " [ 0.07925335]\n", 129 | " [ 0.029701 ]\n", 130 | " [ 0.0199 ]]\n", 131 | "error 6.938893903907228e-18\n", 132 | "---------------------------\n", 133 | "-----episode 6-----\n", 134 | "theta_2014: [[-0.04890919]\n", 135 | " [ 0.0803349 ]\n", 136 | " [ 0.02174718]\n", 137 | " [ 0.07925335]\n", 138 | " [ 0.029701 ]\n", 139 | " [ 0.0199 ]]\n", 140 | "theta_2016: [[-0.04890919]\n", 141 | " [ 0.0803349 ]\n", 142 | " [ 0.02174718]\n", 143 | " [ 0.07925335]\n", 144 | " [ 0.029701 ]\n", 145 | " [ 0.0199 ]]\n", 146 | "error 6.938893903907228e-18\n", 147 | "---------------------------\n", 148 | "-----episode 7-----\n", 149 | "theta_2014: [[-0.05660839]\n", 150 | " [ 0.0803349 ]\n", 151 | " [ 0.02174718]\n", 152 | " [ 0.08962812]\n", 153 | " [ 0.029701 ]\n", 154 | " [ 0.029701 
]]\n", 155 | "theta_2016: [[-0.05660839]\n", 156 | " [ 0.0803349 ]\n", 157 | " [ 0.02174718]\n", 158 | " [ 0.08962812]\n", 159 | " [ 0.029701 ]\n", 160 | " [ 0.029701 ]]\n", 161 | "error 6.938893903907228e-18\n", 162 | "---------------------------\n", 163 | "-----episode 8-----\n", 164 | "theta_2014: [[-0.06412951]\n", 165 | " [ 0.0803349 ]\n", 166 | " [ 0.02174718]\n", 167 | " [ 0.09998648]\n", 168 | " [ 0.029701 ]\n", 169 | " [ 0.03940399]]\n", 170 | "theta_2016: [[-0.06412951]\n", 171 | " [ 0.0803349 ]\n", 172 | " [ 0.02174718]\n", 173 | " [ 0.09998648]\n", 174 | " [ 0.029701 ]\n", 175 | " [ 0.03940399]]\n", 176 | "error 1.3877787807814457e-17\n", 177 | "---------------------------\n", 178 | "-----episode 9-----\n", 179 | "theta_2014: [[-0.07178243]\n", 180 | " [ 0.08953155]\n", 181 | " [ 0.02174718]\n", 182 | " [ 0.09998648]\n", 183 | " [ 0.029701 ]\n", 184 | " [ 0.03940399]]\n", 185 | "theta_2016: [[-0.07178243]\n", 186 | " [ 0.08953155]\n", 187 | " [ 0.02174718]\n", 188 | " [ 0.09998648]\n", 189 | " [ 0.029701 ]\n", 190 | " [ 0.03940399]]\n", 191 | "error 1.3877787807814457e-17\n", 192 | "---------------------------\n", 193 | "-----episode 10-----\n", 194 | "theta_2014: [[-0.06958732]\n", 195 | " [ 0.09863623]\n", 196 | " [ 0.01017452]\n", 197 | " [ 0.09998648]\n", 198 | " [ 0.029701 ]\n", 199 | " [ 0.03940399]]\n", 200 | "theta_2016: [[-0.06958732]\n", 201 | " [ 0.09863623]\n", 202 | " [ 0.01017452]\n", 203 | " [ 0.09998648]\n", 204 | " [ 0.029701 ]\n", 205 | " [ 0.03940399]]\n", 206 | "error 1.3877787807814457e-17\n", 207 | "---------------------------\n", 208 | "-----episode 11-----\n", 209 | "theta_2014: [[-0.07689839]\n", 210 | " [ 0.10890451]\n", 211 | " [ 0.01017452]\n", 212 | " [ 0.09998648]\n", 213 | " [ 0.03940399]\n", 214 | " [ 0.03940399]]\n", 215 | "theta_2016: [[-0.07689839]\n", 216 | " [ 0.10890451]\n", 217 | " [ 0.01017452]\n", 218 | " [ 0.09998648]\n", 219 | " [ 0.03940399]\n", 220 | " [ 0.03940399]]\n", 221 | "error 1.3877787807814457e-17\n", 222 | "---------------------------\n", 223 | "-----episode 12-----\n", 224 | "theta_2014: [[-0.07461403]\n", 225 | " [ 0.10890451]\n", 226 | " [-0.00131933]\n", 227 | " [ 0.10898661]\n", 228 | " [ 0.03940399]\n", 229 | " [ 0.03940399]]\n", 230 | "theta_2016: [[-0.07461403]\n", 231 | " [ 0.10890451]\n", 232 | " [-0.00131933]\n", 233 | " [ 0.10898661]\n", 234 | " [ 0.03940399]\n", 235 | " [ 0.03940399]]\n", 236 | "error 1.4094628242311558e-17\n", 237 | "---------------------------\n", 238 | "-----episode 13-----\n", 239 | "theta_2014: [[-0.08177405]\n", 240 | " [ 0.10890451]\n", 241 | " [-0.00131933]\n", 242 | " [ 0.11923783]\n", 243 | " [ 0.03940399]\n", 244 | " [ 0.04900995]]\n", 245 | "theta_2016: [[-0.08177405]\n", 246 | " [ 0.10890451]\n", 247 | " [-0.00131933]\n", 248 | " [ 0.11923783]\n", 249 | " [ 0.03940399]\n", 250 | " [ 0.04900995]]\n", 251 | "error 2.168404344971009e-19\n", 252 | "---------------------------\n", 253 | "-----episode 14-----\n", 254 | "theta_2014: [[-0.08876266]\n", 255 | " [ 0.10890451]\n", 256 | " [-0.00131933]\n", 257 | " [ 0.12947213]\n", 258 | " [ 0.03940399]\n", 259 | " [ 0.05851985]]\n", 260 | "theta_2016: [[-0.08876266]\n", 261 | " [ 0.10890451]\n", 262 | " [-0.00131933]\n", 263 | " [ 0.12947213]\n", 264 | " [ 0.03940399]\n", 265 | " [ 0.05851985]]\n", 266 | "error 2.7972416050126014e-17\n", 267 | "---------------------------\n", 268 | "-----episode 15-----\n", 269 | "theta_2014: [[-0.09591469]\n", 270 | " [ 0.11781546]\n", 271 | " [-0.00131933]\n", 272 | " [ 0.12947213]\n", 273 | " [ 
0.03940399]\n", 274 | " [ 0.05851985]]\n", 275 | "theta_2016: [[-0.09591469]\n", 276 | " [ 0.11781546]\n", 277 | " [-0.00131933]\n", 278 | " [ 0.12947213]\n", 279 | " [ 0.03940399]\n", 280 | " [ 0.05851985]]\n", 281 | "error 2.7972416050126014e-17\n", 282 | "---------------------------\n", 283 | "-----episode 16-----\n", 284 | "theta_2014: [[-0.10291581]\n", 285 | " [ 0.12663731]\n", 286 | " [-0.00131933]\n", 287 | " [ 0.12947213]\n", 288 | " [ 0.03940399]\n", 289 | " [ 0.05851985]]\n", 290 | "theta_2016: [[-0.10291581]\n", 291 | " [ 0.12663731]\n", 292 | " [-0.00131933]\n", 293 | " [ 0.12947213]\n", 294 | " [ 0.03940399]\n", 295 | " [ 0.05851985]]\n", 296 | "error 4.185020385794047e-17\n", 297 | "---------------------------\n", 298 | "-----episode 17-----\n", 299 | "theta_2014: [[-0.10959342]\n", 300 | " [ 0.12663731]\n", 301 | " [-0.00131933]\n", 302 | " [ 0.13968883]\n", 303 | " [ 0.03940399]\n", 304 | " [ 0.06793465]]\n", 305 | "theta_2016: [[-0.10959342]\n", 306 | " [ 0.12663731]\n", 307 | " [-0.00131933]\n", 308 | " [ 0.13968883]\n", 309 | " [ 0.03940399]\n", 310 | " [ 0.06793465]]\n", 311 | "error 4.185020385794047e-17\n", 312 | "---------------------------\n", 313 | "-----episode 18-----\n", 314 | "theta_2014: [[-0.11610493]\n", 315 | " [ 0.12663731]\n", 316 | " [-0.00131933]\n", 317 | " [ 0.14988723]\n", 318 | " [ 0.03940399]\n", 319 | " [ 0.07725531]]\n", 320 | "theta_2016: [[-0.11610493]\n", 321 | " [ 0.12663731]\n", 322 | " [-0.00131933]\n", 323 | " [ 0.14988723]\n", 324 | " [ 0.03940399]\n", 325 | " [ 0.07725531]]\n", 326 | "error 4.185020385794047e-17\n", 327 | "---------------------------\n", 328 | "-----episode 19-----\n", 329 | "theta_2014: [[-0.10375592]\n", 330 | " [ 0.13537093]\n", 331 | " [ 0.0108122 ]\n", 332 | " [ 0.14988723]\n", 333 | " [ 0.03940399]\n", 334 | " [ 0.07725531]]\n", 335 | "theta_2016: [[-0.10375592]\n", 336 | " [ 0.13537093]\n", 337 | " [ 0.0108122 ]\n", 338 | " [ 0.14988723]\n", 339 | " [ 0.03940399]\n", 340 | " [ 0.07725531]]\n", 341 | "error 4.336808689942018e-17\n", 342 | "---------------------------\n", 343 | "-----episode 20-----\n", 344 | "theta_2014: [[-0.10033798]\n", 345 | " [ 0.13537093]\n", 346 | " [-0.00086441]\n", 347 | " [ 0.16006671]\n", 348 | " [ 0.03940399]\n", 349 | " [ 0.08648275]]\n", 350 | "theta_2016: [[-0.10033798]\n", 351 | " [ 0.13537093]\n", 352 | " [-0.00086441]\n", 353 | " [ 0.16006671]\n", 354 | " [ 0.03940399]\n", 355 | " [ 0.08648275]]\n", 356 | "error 2.959871930885427e-17\n", 357 | "---------------------------\n", 358 | "-----episode 21-----\n", 359 | "theta_2014: [[-0.0881131 ]\n", 360 | " [ 0.13537093]\n", 361 | " [ 0.01156042]\n", 362 | " [ 0.16846604]\n", 363 | " [ 0.03940399]\n", 364 | " [ 0.08648275]]\n", 365 | "theta_2016: [[-0.0881131 ]\n", 366 | " [ 0.13537093]\n", 367 | " [ 0.01156042]\n", 368 | " [ 0.16846604]\n", 369 | " [ 0.03940399]\n", 370 | " [ 0.08648275]]\n", 371 | "error 2.949029909160572e-17\n", 372 | "---------------------------\n", 373 | "-----episode 22-----\n", 374 | "theta_2014: [[-0.09456664]\n", 375 | " [ 0.13537093]\n", 376 | " [ 0.01156042]\n", 377 | " [ 0.17854194]\n", 378 | " [ 0.03940399]\n", 379 | " [ 0.09561792]]\n", 380 | "theta_2016: [[-0.09456664]\n", 381 | " [ 0.13537093]\n", 382 | " [ 0.01156042]\n", 383 | " [ 0.17854194]\n", 384 | " [ 0.03940399]\n", 385 | " [ 0.09561792]]\n", 386 | "error 4.336808689942018e-17\n", 387 | "---------------------------\n", 388 | "-----episode 23-----\n", 389 | "theta_2014: [[-0.07240082]\n", 390 | " [ 0.14535831]\n", 391 | " [ 0.01412296]\n", 392 | 
" [ 0.17854194]\n", 393 | " [ 0.04900995]\n", 394 | " [ 0.09561792]]\n", 395 | "theta_2016: [[-0.07240082]\n", 396 | " [ 0.14535831]\n", 397 | " [ 0.01412296]\n", 398 | " [ 0.17854194]\n", 399 | " [ 0.04900995]\n", 400 | " [ 0.09561792]]\n", 401 | "error 4.683753385137379e-17\n", 402 | "---------------------------\n", 403 | "-----episode 24-----\n", 404 | "theta_2014: [[-0.06032076]\n", 405 | " [ 0.15533141]\n", 406 | " [ 0.02640811]\n", 407 | " [ 0.17854194]\n", 408 | " [ 0.05851985]\n", 409 | " [ 0.09561792]]\n", 410 | "theta_2016: [[-0.06032076]\n", 411 | " [ 0.15533141]\n", 412 | " [ 0.02640811]\n", 413 | " [ 0.17854194]\n", 414 | " [ 0.05851985]\n", 415 | " [ 0.09561792]]\n", 416 | "error 4.85722573273506e-17\n", 417 | "---------------------------\n", 418 | "-----episode 25-----\n", 419 | "theta_2014: [[-0.04849226]\n", 420 | " [ 0.15533141]\n", 421 | " [ 0.03614403]\n", 422 | " [ 0.17854194]\n", 423 | " [ 0.05851985]\n", 424 | " [ 0.09561792]]\n", 425 | "theta_2016: [[-0.04849226]\n", 426 | " [ 0.15533141]\n", 427 | " [ 0.03614403]\n", 428 | " [ 0.17854194]\n", 429 | " [ 0.05851985]\n", 430 | " [ 0.09561792]]\n", 431 | "error 4.85722573273506e-17\n", 432 | "---------------------------\n", 433 | "-----episode 26-----\n", 434 | "theta_2014: [[-0.0554837 ]\n", 435 | " [ 0.16528951]\n", 436 | " [ 0.03614403]\n", 437 | " [ 0.17854194]\n", 438 | " [ 0.06793465]\n", 439 | " [ 0.09561792]]\n", 440 | "theta_2016: [[-0.0554837 ]\n", 441 | " [ 0.16528951]\n", 442 | " [ 0.03614403]\n", 443 | " [ 0.17854194]\n", 444 | " [ 0.06793465]\n", 445 | " [ 0.09561792]]\n", 446 | "error 4.163336342344337e-17\n", 447 | "---------------------------\n", 448 | "-----episode 27-----\n", 449 | "theta_2014: [[-0.06234805]\n", 450 | " [ 0.16528951]\n", 451 | " [ 0.03614403]\n", 452 | " [ 0.18675652]\n", 453 | " [ 0.06793465]\n", 454 | " [ 0.09561792]]\n", 455 | "theta_2016: [[-0.06234805]\n", 456 | " [ 0.16528951]\n", 457 | " [ 0.03614403]\n", 458 | " [ 0.18675652]\n", 459 | " [ 0.06793465]\n", 460 | " [ 0.09561792]]\n", 461 | "error 4.163336342344337e-17\n", 462 | "---------------------------\n", 463 | "-----episode 28-----\n", 464 | "theta_2014: [[-0.06888822]\n", 465 | " [ 0.16528951]\n", 466 | " [ 0.03614403]\n", 467 | " [ 0.19673091]\n", 468 | " [ 0.06793465]\n", 469 | " [ 0.10466175]]\n", 470 | "theta_2016: [[-0.06888822]\n", 471 | " [ 0.16528951]\n", 472 | " [ 0.03614403]\n", 473 | " [ 0.19673091]\n", 474 | " [ 0.06793465]\n", 475 | " [ 0.10466175]]\n", 476 | "error 4.163336342344337e-17\n", 477 | "---------------------------\n", 478 | "-----episode 29-----\n", 479 | "theta_2014: [[-0.05662785]\n", 480 | " [ 0.17523191]\n", 481 | " [ 0.04840326]\n", 482 | " [ 0.19673091]\n", 483 | " [ 0.07725531]\n", 484 | " [ 0.10466175]]\n", 485 | "theta_2016: [[-0.05662785]\n", 486 | " [ 0.17523191]\n", 487 | " [ 0.04840326]\n", 488 | " [ 0.19673091]\n", 489 | " [ 0.07725531]\n", 490 | " [ 0.10466175]]\n", 491 | "error 4.85722573273506e-17\n", 492 | "---------------------------\n", 493 | "-----episode 30-----\n", 494 | "theta_2014: [[-0.0633441 ]\n", 495 | " [ 0.18515793]\n", 496 | " [ 0.04840326]\n", 497 | " [ 0.19673091]\n", 498 | " [ 0.08648275]\n", 499 | " [ 0.10466175]]\n", 500 | "theta_2016: [[-0.0633441 ]\n", 501 | " [ 0.18515793]\n", 502 | " [ 0.04840326]\n", 503 | " [ 0.19673091]\n", 504 | " [ 0.08648275]\n", 505 | " [ 0.10466175]]\n", 506 | "error 4.163336342344337e-17\n", 507 | "---------------------------\n", 508 | "-----episode 31-----\n", 509 | "theta_2014: [[-0.05128938]\n", 510 | " [ 0.18515793]\n", 511 | 
" [ 0.05791922]\n", 512 | " [ 0.19673091]\n", 513 | " [ 0.08648275]\n", 514 | " [ 0.10466175]]\n", 515 | "theta_2016: [[-0.05128938]\n", 516 | " [ 0.18515793]\n", 517 | " [ 0.05791922]\n", 518 | " [ 0.19673091]\n", 519 | " [ 0.08648275]\n", 520 | " [ 0.10466175]]\n", 521 | "error 4.85722573273506e-17\n", 522 | "---------------------------\n", 523 | "-----episode 32-----\n", 524 | "theta_2014: [[-0.03927043]\n", 525 | " [ 0.18515793]\n", 526 | " [ 0.06734003]\n", 527 | " [ 0.19673091]\n", 528 | " [ 0.08648275]\n", 529 | " [ 0.10466175]]\n", 530 | "theta_2016: [[-0.03927043]\n", 531 | " [ 0.18515793]\n", 532 | " [ 0.06734003]\n", 533 | " [ 0.19673091]\n", 534 | " [ 0.08648275]\n", 535 | " [ 0.10466175]]\n", 536 | "error 5.551115123125783e-17\n", 537 | "---------------------------\n", 538 | "-----episode 33-----\n", 539 | "theta_2014: [[-0.04594452]\n", 540 | " [ 0.18515793]\n", 541 | " [ 0.06734003]\n", 542 | " [ 0.20668614]\n", 543 | " [ 0.08648275]\n", 544 | " [ 0.11361513]]\n", 545 | "theta_2016: [[-0.04594452]\n", 546 | " [ 0.18515793]\n", 547 | " [ 0.06734003]\n", 548 | " [ 0.20668614]\n", 549 | " [ 0.08648275]\n", 550 | " [ 0.11361513]]\n", 551 | "error 5.551115123125783e-17\n", 552 | "---------------------------\n", 553 | "-----episode 34-----\n", 554 | "theta_2014: [[-0.05245527]\n", 555 | " [ 0.18515793]\n", 556 | " [ 0.06734003]\n", 557 | " [ 0.21662159]\n", 558 | " [ 0.08648275]\n", 559 | " [ 0.12247898]]\n", 560 | "theta_2016: [[-0.05245527]\n", 561 | " [ 0.18515793]\n", 562 | " [ 0.06734003]\n", 563 | " [ 0.21662159]\n", 564 | " [ 0.08648275]\n", 565 | " [ 0.12247898]]\n", 566 | "error 6.245004513516506e-17\n", 567 | "---------------------------\n", 568 | "-----episode 35-----\n", 569 | "theta_2014: [[-0.05901062]\n", 570 | " [ 0.18515793]\n", 571 | " [ 0.06734003]\n", 572 | " [ 0.22445537]\n", 573 | " [ 0.08648275]\n", 574 | " [ 0.12247898]]\n", 575 | "theta_2016: [[-0.05901062]\n", 576 | " [ 0.18515793]\n", 577 | " [ 0.06734003]\n", 578 | " [ 0.22445537]\n", 579 | " [ 0.08648275]\n", 580 | " [ 0.12247898]]\n", 581 | "error 6.245004513516506e-17\n", 582 | "---------------------------\n", 583 | "-----episode 36-----\n", 584 | "theta_2014: [[-0.05579577]\n", 585 | " [ 0.19330636]\n", 586 | " [ 0.05551248]\n", 587 | " [ 0.22445537]\n", 588 | " [ 0.08648275]\n", 589 | " [ 0.12247898]]\n", 590 | "theta_2016: [[-0.05579577]\n", 591 | " [ 0.19330636]\n", 592 | " [ 0.05551248]\n", 593 | " [ 0.22445537]\n", 594 | " [ 0.08648275]\n", 595 | " [ 0.12247898]]\n", 596 | "error 6.245004513516506e-17\n", 597 | "---------------------------\n", 598 | "-----episode 37-----\n", 599 | "theta_2014: [[-0.06235116]\n", 600 | " [ 0.20313385]\n", 601 | " [ 0.05551248]\n", 602 | " [ 0.22445537]\n", 603 | " [ 0.09561792]\n", 604 | " [ 0.12247898]]\n", 605 | "theta_2016: [[-0.06235116]\n", 606 | " [ 0.20313385]\n", 607 | " [ 0.05551248]\n", 608 | " [ 0.22445537]\n", 609 | " [ 0.09561792]\n", 610 | " [ 0.12247898]]\n", 611 | "error 6.938893903907228e-17\n", 612 | "---------------------------\n", 613 | "-----episode 38-----\n", 614 | "theta_2014: [[-0.06873775]\n", 615 | " [ 0.20313385]\n", 616 | " [ 0.05551248]\n", 617 | " [ 0.23221082]\n", 618 | " [ 0.09561792]\n", 619 | " [ 0.12247898]]\n", 620 | "theta_2016: [[-0.06873775]\n", 621 | " [ 0.20313385]\n", 622 | " [ 0.05551248]\n", 623 | " [ 0.23221082]\n", 624 | " [ 0.09561792]\n", 625 | " [ 0.12247898]]\n", 626 | "error 6.938893903907228e-17\n", 627 | "---------------------------\n", 628 | "-----episode 39-----\n", 629 | "theta_2014: [[-0.07478533]\n", 630 
| " [ 0.20313385]\n", 631 | " [ 0.05551248]\n", 632 | " [ 0.24197 ]\n", 633 | " [ 0.09561792]\n", 634 | " [ 0.13125419]]\n", 635 | "theta_2016: [[-0.07478533]\n", 636 | " [ 0.20313385]\n", 637 | " [ 0.05551248]\n", 638 | " [ 0.24197 ]\n", 639 | " [ 0.09561792]\n", 640 | " [ 0.13125419]]\n", 641 | "error 6.938893903907228e-17\n", 642 | "---------------------------\n", 643 | "-----episode 40-----\n", 644 | "theta_2014: [[-0.08067773]\n", 645 | " [ 0.20313385]\n", 646 | " [ 0.05551248]\n", 647 | " [ 0.25170977]\n", 648 | " [ 0.09561792]\n", 649 | " [ 0.13994165]]\n", 650 | "theta_2016: [[-0.08067773]\n", 651 | " [ 0.20313385]\n", 652 | " [ 0.05551248]\n", 653 | " [ 0.25170977]\n", 654 | " [ 0.09561792]\n", 655 | " [ 0.13994165]]\n", 656 | "error 4.163336342344337e-17\n", 657 | "---------------------------\n", 658 | "-----episode 41-----\n", 659 | "theta_2014: [[-0.08641677]\n", 660 | " [ 0.20313385]\n", 661 | " [ 0.05551248]\n", 662 | " [ 0.26142956]\n", 663 | " [ 0.09561792]\n", 664 | " [ 0.14854223]]\n", 665 | "theta_2016: [[-0.08641677]\n", 666 | " [ 0.20313385]\n", 667 | " [ 0.05551248]\n", 668 | " [ 0.26142956]\n", 669 | " [ 0.09561792]\n", 670 | " [ 0.14854223]]\n", 671 | "error 2.7755575615628914e-17\n", 672 | "---------------------------\n", 673 | "-----episode 42-----\n", 674 | "theta_2014: [[-0.09223327]\n", 675 | " [ 0.20313385]\n", 676 | " [ 0.05551248]\n", 677 | " [ 0.26881526]\n", 678 | " [ 0.09561792]\n", 679 | " [ 0.14854223]]\n", 680 | "theta_2016: [[-0.09223327]\n", 681 | " [ 0.20313385]\n", 682 | " [ 0.05551248]\n", 683 | " [ 0.26881526]\n", 684 | " [ 0.09561792]\n", 685 | " [ 0.14854223]]\n", 686 | "error 4.163336342344337e-17\n", 687 | "---------------------------\n", 688 | "-----episode 43-----\n", 689 | "theta_2014: [[-0.09832866]\n", 690 | " [ 0.21294447]\n", 691 | " [ 0.05551248]\n", 692 | " [ 0.26881526]\n", 693 | " [ 0.10466175]\n", 694 | " [ 0.14854223]]\n", 695 | "theta_2016: [[-0.09832866]\n", 696 | " [ 0.21294447]\n", 697 | " [ 0.05551248]\n", 698 | " [ 0.26881526]\n", 699 | " [ 0.10466175]\n", 700 | " [ 0.14854223]]\n", 701 | "error 2.7755575615628914e-17\n", 702 | "---------------------------\n", 703 | "-----episode 44-----\n", 704 | "theta_2014: [[-0.08557491]\n", 705 | " [ 0.22081503]\n", 706 | " [ 0.06784469]\n", 707 | " [ 0.26881526]\n", 708 | " [ 0.10466175]\n", 709 | " [ 0.14854223]]\n", 710 | "theta_2016: [[-0.08557491]\n", 711 | " [ 0.22081503]\n", 712 | " [ 0.06784469]\n", 713 | " [ 0.26881526]\n", 714 | " [ 0.10466175]\n", 715 | " [ 0.14854223]]\n", 716 | "error 1.3877787807814457e-17\n", 717 | "---------------------------\n", 718 | "-----episode 45-----\n", 719 | "theta_2014: [[-0.09110498]\n", 720 | " [ 0.22081503]\n", 721 | " [ 0.06784469]\n", 722 | " [ 0.27844062]\n", 723 | " [ 0.10466175]\n", 724 | " [ 0.15705681]]\n", 725 | "theta_2016: [[-0.09110498]\n", 726 | " [ 0.22081503]\n", 727 | " [ 0.06784469]\n", 728 | " [ 0.27844062]\n", 729 | " [ 0.10466175]\n", 730 | " [ 0.15705681]]\n", 731 | "error 2.7755575615628914e-17\n", 732 | "---------------------------\n", 733 | "-----episode 46-----\n", 734 | "theta_2014: [[-0.07859943]\n", 735 | " [ 0.22081503]\n", 736 | " [ 0.07716624]\n", 737 | " [ 0.27844062]\n", 738 | " [ 0.10466175]\n", 739 | " [ 0.15705681]]\n", 740 | "theta_2016: [[-0.07859943]\n", 741 | " [ 0.22081503]\n", 742 | " [ 0.07716624]\n", 743 | " [ 0.27844062]\n", 744 | " [ 0.10466175]\n", 745 | " [ 0.15705681]]\n", 746 | "error 2.7755575615628914e-17\n", 747 | "---------------------------\n", 748 | "-----episode 47-----\n", 749 | 
"theta_2014: [[-0.06579227]\n", 750 | " [ 0.22081503]\n", 751 | " [ 0.08986549]\n", 752 | " [ 0.28565621]\n", 753 | " [ 0.10466175]\n", 754 | " [ 0.15705681]]\n", 755 | "theta_2016: [[-0.06579227]\n", 756 | " [ 0.22081503]\n", 757 | " [ 0.08986549]\n", 758 | " [ 0.28565621]\n", 759 | " [ 0.10466175]\n", 760 | " [ 0.15705681]]\n", 761 | "error 2.7755575615628914e-17\n", 762 | "---------------------------\n", 763 | "-----episode 48-----\n", 764 | "theta_2014: [[-0.07198655]\n", 765 | " [ 0.23052941]\n", 766 | " [ 0.08986549]\n", 767 | " [ 0.28565621]\n", 768 | " [ 0.11361513]\n", 769 | " [ 0.15705681]]\n", 770 | "theta_2016: [[-0.07198655]\n", 771 | " [ 0.23052941]\n", 772 | " [ 0.08986549]\n", 773 | " [ 0.28565621]\n", 774 | " [ 0.11361513]\n", 775 | " [ 0.15705681]]\n", 776 | "error 2.7755575615628914e-17\n", 777 | "---------------------------\n", 778 | "-----episode 49-----\n", 779 | "theta_2014: [[-0.07749494]\n", 780 | " [ 0.23052941]\n", 781 | " [ 0.08986549]\n", 782 | " [ 0.29518903]\n", 783 | " [ 0.11361513]\n", 784 | " [ 0.16548624]]\n", 785 | "theta_2016: [[-0.07749494]\n", 786 | " [ 0.23052941]\n", 787 | " [ 0.08986549]\n", 788 | " [ 0.29518903]\n", 789 | " [ 0.11361513]\n", 790 | " [ 0.16548624]]\n", 791 | "error 2.7755575615628914e-17\n", 792 | "---------------------------\n", 793 | "-----episode 50-----\n", 794 | "theta_2014: [[-0.08367597]\n", 795 | " [ 0.23822412]\n", 796 | " [ 0.08986549]\n", 797 | " [ 0.29518903]\n", 798 | " [ 0.11361513]\n", 799 | " [ 0.16548624]]\n", 800 | "theta_2016: [[-0.08367597]\n", 801 | " [ 0.23822412]\n", 802 | " [ 0.08986549]\n", 803 | " [ 0.29518903]\n", 804 | " [ 0.11361513]\n", 805 | " [ 0.16548624]]\n", 806 | "error 2.7755575615628914e-17\n", 807 | "---------------------------\n", 808 | "-----episode 51-----\n", 809 | "theta_2014: [[-0.08952841]\n", 810 | " [ 0.24784419]\n", 811 | " [ 0.08986549]\n", 812 | " [ 0.29518903]\n", 813 | " [ 0.12247898]\n", 814 | " [ 0.16548624]]\n", 815 | "theta_2016: [[-0.08952841]\n", 816 | " [ 0.24784419]\n", 817 | " [ 0.08986549]\n", 818 | " [ 0.29518903]\n", 819 | " [ 0.12247898]\n", 820 | " [ 0.16548624]]\n", 821 | "error 2.7755575615628914e-17\n", 822 | "---------------------------\n", 823 | "-----episode 52-----\n", 824 | "theta_2014: [[-0.09501299]\n", 825 | " [ 0.24784419]\n", 826 | " [ 0.08986549]\n", 827 | " [ 0.30223714]\n", 828 | " [ 0.12247898]\n", 829 | " [ 0.16548624]]\n", 830 | "theta_2016: [[-0.09501299]\n", 831 | " [ 0.24784419]\n", 832 | " [ 0.08986549]\n", 833 | " [ 0.30223714]\n", 834 | " [ 0.12247898]\n", 835 | " [ 0.16548624]]\n", 836 | "error 2.7755575615628914e-17\n", 837 | "---------------------------\n", 838 | "-----episode 53-----\n", 839 | "theta_2014: [[-0.08227216]\n", 840 | " [ 0.24784419]\n", 841 | " [ 0.09896683]\n", 842 | " [ 0.30223714]\n", 843 | " [ 0.12247898]\n", 844 | " [ 0.16548624]]\n", 845 | "theta_2016: [[-0.08227216]\n", 846 | " [ 0.24784419]\n", 847 | " [ 0.09896683]\n", 848 | " [ 0.30223714]\n", 849 | " [ 0.12247898]\n", 850 | " [ 0.16548624]]\n", 851 | "error 2.7755575615628914e-17\n", 852 | "---------------------------\n", 853 | "-----episode 54-----\n", 854 | "theta_2014: [[-0.0877665 ]\n", 855 | " [ 0.24784419]\n", 856 | " [ 0.09896683]\n", 857 | " [ 0.30921476]\n", 858 | " [ 0.12247898]\n", 859 | " [ 0.16548624]]\n", 860 | "theta_2016: [[-0.0877665 ]\n", 861 | " [ 0.24784419]\n", 862 | " [ 0.09896683]\n", 863 | " [ 0.30921476]\n", 864 | " [ 0.12247898]\n", 865 | " [ 0.16548624]]\n", 866 | "error 1.3877787807814457e-17\n", 867 | "---------------------------\n", 
868 | "-----episode 55-----\n", 869 | "theta_2014: [[-0.09314374]\n", 870 | " [ 0.24784419]\n", 871 | " [ 0.09896683]\n", 872 | " [ 0.31612262]\n", 873 | " [ 0.12247898]\n", 874 | " [ 0.16548624]]\n", 875 | "theta_2016: [[-0.09314374]\n", 876 | " [ 0.24784419]\n", 877 | " [ 0.09896683]\n", 878 | " [ 0.31612262]\n", 879 | " [ 0.12247898]\n", 880 | " [ 0.16548624]]\n", 881 | "error 1.3877787807814457e-17\n", 882 | "---------------------------\n", 883 | "-----episode 56-----\n", 884 | "theta_2014: [[-0.09901401]\n", 885 | " [ 0.25536575]\n", 886 | " [ 0.09896683]\n", 887 | " [ 0.31612262]\n", 888 | " [ 0.12247898]\n", 889 | " [ 0.16548624]]\n", 890 | "theta_2016: [[-0.09901401]\n", 891 | " [ 0.25536575]\n", 892 | " [ 0.09896683]\n", 893 | " [ 0.31612262]\n", 894 | " [ 0.12247898]\n", 895 | " [ 0.16548624]]\n", 896 | "error 1.3877787807814457e-17\n", 897 | "---------------------------\n", 898 | "-----episode 57-----\n", 899 | "theta_2014: [[-0.10421721]\n", 900 | " [ 0.25536575]\n", 901 | " [ 0.09896683]\n", 902 | " [ 0.32296139]\n", 903 | " [ 0.12247898]\n", 904 | " [ 0.16548624]]\n", 905 | "theta_2016: [[-0.10421721]\n", 906 | " [ 0.25536575]\n", 907 | " [ 0.09896683]\n", 908 | " [ 0.32296139]\n", 909 | " [ 0.12247898]\n", 910 | " [ 0.16548624]]\n", 911 | "error 1.3877787807814457e-17\n", 912 | "---------------------------\n", 913 | "-----episode 58-----\n", 914 | "theta_2014: [[-0.09092036]\n", 915 | " [ 0.25536575]\n", 916 | " [ 0.11184475]\n", 917 | " [ 0.32973178]\n", 918 | " [ 0.12247898]\n", 919 | " [ 0.16548624]]\n", 920 | "theta_2016: [[-0.09092036]\n", 921 | " [ 0.25536575]\n", 922 | " [ 0.11184475]\n", 923 | " [ 0.32973178]\n", 924 | " [ 0.12247898]\n", 925 | " [ 0.16548624]]\n", 926 | "error 1.3877787807814457e-17\n", 927 | "---------------------------\n", 928 | "-----episode 59-----\n", 929 | "theta_2014: [[-0.07802462]\n", 930 | " [ 0.25536575]\n", 931 | " [ 0.1207263 ]\n", 932 | " [ 0.32973178]\n", 933 | " [ 0.12247898]\n", 934 | " [ 0.16548624]]\n", 935 | "theta_2016: [[-0.07802462]\n", 936 | " [ 0.25536575]\n", 937 | " [ 0.1207263 ]\n", 938 | " [ 0.32973178]\n", 939 | " [ 0.12247898]\n", 940 | " [ 0.16548624]]\n", 941 | "error 2.7755575615628914e-17\n", 942 | "---------------------------\n", 943 | "-----episode 60-----\n", 944 | "theta_2014: [[-0.08397906]\n", 945 | " [ 0.26281209]\n", 946 | " [ 0.1207263 ]\n", 947 | " [ 0.32973178]\n", 948 | " [ 0.12247898]\n", 949 | " [ 0.16548624]]\n", 950 | "theta_2016: [[-0.08397906]\n", 951 | " [ 0.26281209]\n", 952 | " [ 0.1207263 ]\n", 953 | " [ 0.32973178]\n", 954 | " [ 0.12247898]\n", 955 | " [ 0.16548624]]\n", 956 | "error 1.3877787807814457e-17\n", 957 | "---------------------------\n", 958 | "-----episode 61-----\n", 959 | "theta_2014: [[-0.08960157]\n", 960 | " [ 0.27226525]\n", 961 | " [ 0.1207263 ]\n", 962 | " [ 0.32973178]\n", 963 | " [ 0.13125419]\n", 964 | " [ 0.16548624]]\n", 965 | "theta_2016: [[-0.08960157]\n", 966 | " [ 0.27226525]\n", 967 | " [ 0.1207263 ]\n", 968 | " [ 0.32973178]\n", 969 | " [ 0.13125419]\n", 970 | " [ 0.16548624]]\n", 971 | "error 1.3877787807814457e-17\n", 972 | "---------------------------\n", 973 | "-----episode 62-----\n", 974 | "theta_2014: [[-0.09528967]\n", 975 | " [ 0.2795426 ]\n", 976 | " [ 0.1207263 ]\n", 977 | " [ 0.32973178]\n", 978 | " [ 0.13125419]\n", 979 | " [ 0.16548624]]\n", 980 | "theta_2016: [[-0.09528967]\n", 981 | " [ 0.2795426 ]\n", 982 | " [ 0.1207263 ]\n", 983 | " [ 0.32973178]\n", 984 | " [ 0.13125419]\n", 985 | " [ 0.16548624]]\n", 986 | "error 1.3877787807814457e-17\n", 
987 | "---------------------------\n", 988 | "-----episode 63-----\n", 989 | "theta_2014: [[-0.10040886]\n", 990 | " [ 0.2795426 ]\n", 991 | " [ 0.1207263 ]\n", 992 | " [ 0.33643446]\n", 993 | " [ 0.13125419]\n", 994 | " [ 0.16548624]]\n", 995 | "theta_2016: [[-0.10040886]\n", 996 | " [ 0.2795426 ]\n", 997 | " [ 0.1207263 ]\n", 998 | " [ 0.33643446]\n", 999 | " [ 0.13125419]\n", 1000 | " [ 0.16548624]]\n", 1001 | "error 1.3877787807814457e-17\n", 1002 | "---------------------------\n", 1003 | "-----episode 64-----\n", 1004 | "theta_2014: [[-0.09457365]\n", 1005 | " [ 0.2795426 ]\n", 1006 | " [ 0.10813787]\n", 1007 | " [ 0.34307011]\n", 1008 | " [ 0.13125419]\n", 1009 | " [ 0.16548624]]\n", 1010 | "theta_2016: [[-0.09457365]\n", 1011 | " [ 0.2795426 ]\n", 1012 | " [ 0.10813787]\n", 1013 | " [ 0.34307011]\n", 1014 | " [ 0.13125419]\n", 1015 | " [ 0.16548624]]\n", 1016 | "error 1.3877787807814457e-17\n", 1017 | "---------------------------\n", 1018 | "-----episode 65-----\n", 1019 | "theta_2014: [[-0.10014719]\n", 1020 | " [ 0.28674718]\n", 1021 | " [ 0.10813787]\n", 1022 | " [ 0.34307011]\n", 1023 | " [ 0.13125419]\n", 1024 | " [ 0.16548624]]\n", 1025 | "theta_2016: [[-0.10014719]\n", 1026 | " [ 0.28674718]\n", 1027 | " [ 0.10813787]\n", 1028 | " [ 0.34307011]\n", 1029 | " [ 0.13125419]\n", 1030 | " [ 0.16548624]]\n", 1031 | "error 2.7755575615628914e-17\n", 1032 | "---------------------------\n", 1033 | "-----episode 66-----\n", 1034 | "theta_2014: [[-0.08719221]\n", 1035 | " [ 0.28674718]\n", 1036 | " [ 0.11705649]\n", 1037 | " [ 0.34307011]\n", 1038 | " [ 0.13125419]\n", 1039 | " [ 0.16548624]]\n", 1040 | "theta_2016: [[-0.08719221]\n", 1041 | " [ 0.28674718]\n", 1042 | " [ 0.11705649]\n", 1043 | " [ 0.34307011]\n", 1044 | " [ 0.13125419]\n", 1045 | " [ 0.16548624]]\n", 1046 | "error 2.7755575615628914e-17\n", 1047 | "---------------------------\n", 1048 | "-----episode 67-----\n", 1049 | "theta_2014: [[-0.09256158]\n", 1050 | " [ 0.29603918]\n", 1051 | " [ 0.11705649]\n", 1052 | " [ 0.34307011]\n", 1053 | " [ 0.13994165]\n", 1054 | " [ 0.16548624]]\n", 1055 | "theta_2016: [[-0.09256158]\n", 1056 | " [ 0.29603918]\n", 1057 | " [ 0.11705649]\n", 1058 | " [ 0.34307011]\n", 1059 | " [ 0.13994165]\n", 1060 | " [ 0.16548624]]\n", 1061 | "error 4.163336342344337e-17\n", 1062 | "---------------------------\n", 1063 | "-----episode 68-----\n", 1064 | "theta_2014: [[-0.0977868 ]\n", 1065 | " [ 0.30531567]\n", 1066 | " [ 0.11705649]\n", 1067 | " [ 0.34307011]\n", 1068 | " [ 0.14854223]\n", 1069 | " [ 0.16548624]]\n", 1070 | "theta_2016: [[-0.0977868 ]\n", 1071 | " [ 0.30531567]\n", 1072 | " [ 0.11705649]\n", 1073 | " [ 0.34307011]\n", 1074 | " [ 0.14854223]\n", 1075 | " [ 0.16548624]]\n", 1076 | "error 2.7755575615628914e-17\n", 1077 | "---------------------------\n", 1078 | "-----episode 69-----\n", 1079 | "theta_2014: [[-0.10286954]\n", 1080 | " [ 0.31457602]\n", 1081 | " [ 0.11705649]\n", 1082 | " [ 0.34307011]\n", 1083 | " [ 0.15705681]\n", 1084 | " [ 0.16548624]]\n", 1085 | "theta_2016: [[-0.10286954]\n", 1086 | " [ 0.31457602]\n", 1087 | " [ 0.11705649]\n", 1088 | " [ 0.34307011]\n", 1089 | " [ 0.15705681]\n", 1090 | " [ 0.16548624]]\n", 1091 | "error 2.7755575615628914e-17\n", 1092 | "---------------------------\n", 1093 | "-----episode 70-----\n", 1094 | "theta_2014: [[-0.08980787]\n", 1095 | " [ 0.31457602]\n", 1096 | " [ 0.12588592]\n", 1097 | " [ 0.34307011]\n", 1098 | " [ 0.15705681]\n", 1099 | " [ 0.16548624]]\n", 1100 | "theta_2016: [[-0.08980787]\n", 1101 | " [ 0.31457602]\n", 1102 | 
" [ 0.12588592]\n", 1103 | " [ 0.34307011]\n", 1104 | " [ 0.15705681]\n", 1105 | " [ 0.16548624]]\n", 1106 | "error 2.7755575615628914e-17\n", 1107 | "---------------------------\n", 1108 | "-----episode 71-----\n", 1109 | "theta_2014: [[-0.07639752]\n", 1110 | " [ 0.31457602]\n", 1111 | " [ 0.13867382]\n", 1112 | " [ 0.34963941]\n", 1113 | " [ 0.15705681]\n", 1114 | " [ 0.16548624]]\n", 1115 | "theta_2016: [[-0.07639752]\n", 1116 | " [ 0.31457602]\n", 1117 | " [ 0.13867382]\n", 1118 | " [ 0.34963941]\n", 1119 | " [ 0.15705681]\n", 1120 | " [ 0.16548624]]\n", 1121 | "error 2.7755575615628914e-17\n", 1122 | "---------------------------\n", 1123 | "-----episode 72-----\n", 1124 | "theta_2014: [[-0.08160412]\n", 1125 | " [ 0.32381964]\n", 1126 | " [ 0.13867382]\n", 1127 | " [ 0.34963941]\n", 1128 | " [ 0.16548624]\n", 1129 | " [ 0.16548624]]\n", 1130 | "theta_2016: [[-0.08160412]\n", 1131 | " [ 0.32381964]\n", 1132 | " [ 0.13867382]\n", 1133 | " [ 0.34963941]\n", 1134 | " [ 0.16548624]\n", 1135 | " [ 0.16548624]]\n", 1136 | "error 2.7755575615628914e-17\n", 1137 | "---------------------------\n", 1138 | "-----episode 73-----\n", 1139 | "theta_2014: [[-0.08691285]\n", 1140 | " [ 0.33058144]\n", 1141 | " [ 0.13867382]\n", 1142 | " [ 0.34963941]\n", 1143 | " [ 0.16548624]\n", 1144 | " [ 0.16548624]]\n", 1145 | "theta_2016: [[-0.08691285]\n", 1146 | " [ 0.33058144]\n", 1147 | " [ 0.13867382]\n", 1148 | " [ 0.34963941]\n", 1149 | " [ 0.16548624]\n", 1150 | " [ 0.16548624]]\n", 1151 | "error 4.163336342344337e-17\n", 1152 | "---------------------------\n", 1153 | "-----episode 74-----\n", 1154 | "theta_2014: [[-0.09169445]\n", 1155 | " [ 0.33058144]\n", 1156 | " [ 0.13867382]\n", 1157 | " [ 0.3586075 ]\n", 1158 | " [ 0.16548624]\n", 1159 | " [ 0.17383138]]\n", 1160 | "theta_2016: [[-0.09169445]\n", 1161 | " [ 0.33058144]\n", 1162 | " [ 0.13867382]\n", 1163 | " [ 0.3586075 ]\n", 1164 | " [ 0.16548624]\n", 1165 | " [ 0.17383138]]\n", 1166 | "error 4.163336342344337e-17\n", 1167 | "---------------------------\n", 1168 | "-----episode 75-----\n", 1169 | "theta_2014: [[-0.09684202]\n", 1170 | " [ 0.33727563]\n", 1171 | " [ 0.13867382]\n", 1172 | " [ 0.3586075 ]\n", 1173 | " [ 0.16548624]\n", 1174 | " [ 0.17383138]]\n", 1175 | "theta_2016: [[-0.09684202]\n", 1176 | " [ 0.33727563]\n", 1177 | " [ 0.13867382]\n", 1178 | " [ 0.3586075 ]\n", 1179 | " [ 0.16548624]\n", 1180 | " [ 0.17383138]]\n", 1181 | "error 4.163336342344337e-17\n", 1182 | "---------------------------\n", 1183 | "-----episode 76-----\n", 1184 | "theta_2014: [[-0.10187848]\n", 1185 | " [ 0.34390287]\n", 1186 | " [ 0.13867382]\n", 1187 | " [ 0.3586075 ]\n", 1188 | " [ 0.16548624]\n", 1189 | " [ 0.17383138]]\n", 1190 | "theta_2016: [[-0.10187848]\n", 1191 | " [ 0.34390287]\n", 1192 | " [ 0.13867382]\n", 1193 | " [ 0.3586075 ]\n", 1194 | " [ 0.16548624]\n", 1195 | " [ 0.17383138]]\n", 1196 | "error 4.163336342344337e-17\n", 1197 | "---------------------------\n", 1198 | "-----episode 77-----\n", 1199 | "theta_2014: [[-0.10642315]\n", 1200 | " [ 0.34390287]\n", 1201 | " [ 0.13867382]\n", 1202 | " [ 0.36756026]\n", 1203 | " [ 0.16548624]\n", 1204 | " [ 0.18209306]]\n", 1205 | "theta_2016: [[-0.10642315]\n", 1206 | " [ 0.34390287]\n", 1207 | " [ 0.13867382]\n", 1208 | " [ 0.36756026]\n", 1209 | " [ 0.16548624]\n", 1210 | " [ 0.18209306]]\n", 1211 | "error 4.163336342344337e-17\n", 1212 | "---------------------------\n", 1213 | "-----episode 78-----\n", 1214 | "theta_2014: [[-0.11130475]\n", 1215 | " [ 0.35046384]\n", 1216 | " [ 0.13867382]\n", 
1217 | " [ 0.36756026]\n", 1218 | " [ 0.16548624]\n", 1219 | " [ 0.18209306]]\n", 1220 | "theta_2016: [[-0.11130475]\n", 1221 | " [ 0.35046384]\n", 1222 | " [ 0.13867382]\n", 1223 | " [ 0.36756026]\n", 1224 | " [ 0.16548624]\n", 1225 | " [ 0.18209306]]\n", 1226 | "error 4.163336342344337e-17\n", 1227 | "---------------------------\n", 1228 | "-----episode 79-----\n", 1229 | "theta_2014: [[-0.1156681 ]\n", 1230 | " [ 0.35046384]\n", 1231 | " [ 0.13867382]\n", 1232 | " [ 0.37649711]\n", 1233 | " [ 0.16548624]\n", 1234 | " [ 0.19027213]]\n", 1235 | "theta_2016: [[-0.1156681 ]\n", 1236 | " [ 0.35046384]\n", 1237 | " [ 0.13867382]\n", 1238 | " [ 0.37649711]\n", 1239 | " [ 0.16548624]\n", 1240 | " [ 0.19027213]]\n", 1241 | "error 4.163336342344337e-17\n", 1242 | "---------------------------\n", 1243 | "-----episode 80-----\n", 1244 | "theta_2014: [[-0.12039879]\n", 1245 | " [ 0.3569592 ]\n", 1246 | " [ 0.13867382]\n", 1247 | " [ 0.37649711]\n", 1248 | " [ 0.16548624]\n", 1249 | " [ 0.19027213]]\n", 1250 | "theta_2016: [[-0.12039879]\n", 1251 | " [ 0.3569592 ]\n", 1252 | " [ 0.13867382]\n", 1253 | " [ 0.37649711]\n", 1254 | " [ 0.16548624]\n", 1255 | " [ 0.19027213]]\n", 1256 | "error 2.7755575615628914e-17\n", 1257 | "---------------------------\n", 1258 | "-----episode 81-----\n", 1259 | "theta_2014: [[-0.12458437]\n", 1260 | " [ 0.3569592 ]\n", 1261 | " [ 0.13867382]\n", 1262 | " [ 0.38541746]\n", 1263 | " [ 0.16548624]\n", 1264 | " [ 0.19836941]]\n", 1265 | "theta_2016: [[-0.12458437]\n", 1266 | " [ 0.3569592 ]\n", 1267 | " [ 0.13867382]\n", 1268 | " [ 0.38541746]\n", 1269 | " [ 0.16548624]\n", 1270 | " [ 0.19836941]]\n", 1271 | "error 5.551115123125783e-17\n", 1272 | "---------------------------\n", 1273 | "-----episode 82-----\n", 1274 | "theta_2014: [[-0.12864146]\n", 1275 | " [ 0.3569592 ]\n", 1276 | " [ 0.13867382]\n", 1277 | " [ 0.39432076]\n", 1278 | " [ 0.16548624]\n", 1279 | " [ 0.20638572]]\n", 1280 | "theta_2016: [[-0.12864146]\n", 1281 | " [ 0.3569592 ]\n", 1282 | " [ 0.13867382]\n", 1283 | " [ 0.39432076]\n", 1284 | " [ 0.16548624]\n", 1285 | " [ 0.20638572]]\n", 1286 | "error 5.551115123125783e-17\n", 1287 | "---------------------------\n", 1288 | "-----episode 83-----\n", 1289 | "theta_2014: [[-0.13257159]\n", 1290 | " [ 0.3569592 ]\n", 1291 | " [ 0.13867382]\n", 1292 | " [ 0.40320645]\n", 1293 | " [ 0.16548624]\n", 1294 | " [ 0.21432186]]\n", 1295 | "theta_2016: [[-0.13257159]\n", 1296 | " [ 0.3569592 ]\n", 1297 | " [ 0.13867382]\n", 1298 | " [ 0.40320645]\n", 1299 | " [ 0.16548624]\n", 1300 | " [ 0.21432186]]\n", 1301 | "error 5.551115123125783e-17\n", 1302 | "---------------------------\n", 1303 | "-----episode 84-----\n", 1304 | "theta_2014: [[-0.13683138]\n", 1305 | " [ 0.36585409]\n", 1306 | " [ 0.13867382]\n", 1307 | " [ 0.40320645]\n", 1308 | " [ 0.17383138]\n", 1309 | " [ 0.21432186]]\n", 1310 | "theta_2016: [[-0.13683138]\n", 1311 | " [ 0.36585409]\n", 1312 | " [ 0.13867382]\n", 1313 | " [ 0.40320645]\n", 1314 | " [ 0.17383138]\n", 1315 | " [ 0.21432186]]\n", 1316 | "error 5.551115123125783e-17\n", 1317 | "---------------------------\n", 1318 | "-----episode 85-----\n", 1319 | "theta_2014: [[-0.12954367]\n", 1320 | " [ 0.36585409]\n", 1321 | " [ 0.12564511]\n", 1322 | " [ 0.40917439]\n", 1323 | " [ 0.17383138]\n", 1324 | " [ 0.21432186]]\n", 1325 | "theta_2016: [[-0.12954367]\n", 1326 | " [ 0.36585409]\n", 1327 | " [ 0.12564511]\n", 1328 | " [ 0.40917439]\n", 1329 | " [ 0.17383138]\n", 1330 | " [ 0.21432186]]\n", 1331 | "error 5.551115123125783e-17\n", 1332 | 
"---------------------------\n", 1333 | "-----episode 86-----\n", 1334 | "theta_2014: [[-0.13361249]\n", 1335 | " [ 0.36585409]\n", 1336 | " [ 0.12564511]\n", 1337 | " [ 0.41508264]\n", 1338 | " [ 0.17383138]\n", 1339 | " [ 0.21432186]]\n", 1340 | "theta_2016: [[-0.13361249]\n", 1341 | " [ 0.36585409]\n", 1342 | " [ 0.12564511]\n", 1343 | " [ 0.41508264]\n", 1344 | " [ 0.17383138]\n", 1345 | " [ 0.21432186]]\n", 1346 | "error 5.551115123125783e-17\n", 1347 | "---------------------------\n", 1348 | "-----episode 87-----\n", 1349 | "theta_2014: [[-0.11970272]\n", 1350 | " [ 0.36585409]\n", 1351 | " [ 0.13907705]\n", 1352 | " [ 0.42093182]\n", 1353 | " [ 0.17383138]\n", 1354 | " [ 0.21432186]]\n", 1355 | "theta_2016: [[-0.11970272]\n", 1356 | " [ 0.36585409]\n", 1357 | " [ 0.13907705]\n", 1358 | " [ 0.42093182]\n", 1359 | " [ 0.17383138]\n", 1360 | " [ 0.21432186]]\n", 1361 | "error 5.551115123125783e-17\n", 1362 | "---------------------------\n", 1363 | "-----episode 88-----\n", 1364 | "theta_2014: [[-0.1058072 ]\n", 1365 | " [ 0.36585409]\n", 1366 | " [ 0.15242678]\n", 1367 | " [ 0.4267225 ]\n", 1368 | " [ 0.17383138]\n", 1369 | " [ 0.21432186]]\n", 1370 | "theta_2016: [[-0.1058072 ]\n", 1371 | " [ 0.36585409]\n", 1372 | " [ 0.15242678]\n", 1373 | " [ 0.4267225 ]\n", 1374 | " [ 0.17383138]\n", 1375 | " [ 0.21432186]]\n", 1376 | "error 5.551115123125783e-17\n", 1377 | "---------------------------\n", 1378 | "-----episode 89-----\n", 1379 | "theta_2014: [[-0.09192659]\n", 1380 | " [ 0.36585409]\n", 1381 | " [ 0.16569461]\n", 1382 | " [ 0.43245527]\n", 1383 | " [ 0.17383138]\n", 1384 | " [ 0.21432186]]\n", 1385 | "theta_2016: [[-0.09192659]\n", 1386 | " [ 0.36585409]\n", 1387 | " [ 0.16569461]\n", 1388 | " [ 0.43245527]\n", 1389 | " [ 0.17383138]\n", 1390 | " [ 0.21432186]]\n", 1391 | "error 4.163336342344337e-17\n", 1392 | "---------------------------\n", 1393 | "-----episode 90-----\n", 1394 | "theta_2014: [[-0.09675757]\n", 1395 | " [ 0.37219555]\n", 1396 | " [ 0.16569461]\n", 1397 | " [ 0.43245527]\n", 1398 | " [ 0.17383138]\n", 1399 | " [ 0.21432186]]\n", 1400 | "theta_2016: [[-0.09675757]\n", 1401 | " [ 0.37219555]\n", 1402 | " [ 0.16569461]\n", 1403 | " [ 0.43245527]\n", 1404 | " [ 0.17383138]\n", 1405 | " [ 0.21432186]]\n", 1406 | "error 4.163336342344337e-17\n", 1407 | "---------------------------\n", 1408 | "-----episode 91-----\n", 1409 | "theta_2014: [[-0.10065975]\n", 1410 | " [ 0.37219555]\n", 1411 | " [ 0.16569461]\n", 1412 | " [ 0.44103033]\n", 1413 | " [ 0.17383138]\n", 1414 | " [ 0.22217864]]\n", 1415 | "theta_2016: [[-0.10065975]\n", 1416 | " [ 0.37219555]\n", 1417 | " [ 0.16569461]\n", 1418 | " [ 0.44103033]\n", 1419 | " [ 0.17383138]\n", 1420 | " [ 0.22217864]]\n", 1421 | "error 4.163336342344337e-17\n", 1422 | "---------------------------\n", 1423 | "-----episode 92-----\n", 1424 | "theta_2014: [[-0.10509555]\n", 1425 | " [ 0.38101244]\n", 1426 | " [ 0.16569461]\n", 1427 | " [ 0.44103033]\n", 1428 | " [ 0.18209306]\n", 1429 | " [ 0.22217864]]\n", 1430 | "theta_2016: [[-0.10509555]\n", 1431 | " [ 0.38101244]\n", 1432 | " [ 0.16569461]\n", 1433 | " [ 0.44103033]\n", 1434 | " [ 0.18209306]\n", 1435 | " [ 0.22217864]]\n", 1436 | "error 4.163336342344337e-17\n", 1437 | "---------------------------\n", 1438 | "-----episode 93-----\n", 1439 | "theta_2014: [[-0.10965977]\n", 1440 | " [ 0.38720231]\n", 1441 | " [ 0.16569461]\n", 1442 | " [ 0.44103033]\n", 1443 | " [ 0.18209306]\n", 1444 | " [ 0.22217864]]\n", 1445 | "theta_2016: [[-0.10965977]\n", 1446 | " [ 0.38720231]\n", 1447 | 
" [ 0.16569461]\n", 1448 | " [ 0.44103033]\n", 1449 | " [ 0.18209306]\n", 1450 | " [ 0.22217864]]\n", 1451 | "error 5.551115123125783e-17\n", 1452 | "---------------------------\n", 1453 | "-----episode 94-----\n", 1454 | "theta_2014: [[-0.1141232 ]\n", 1455 | " [ 0.39333029]\n", 1456 | " [ 0.16569461]\n", 1457 | " [ 0.44103033]\n", 1458 | " [ 0.18209306]\n", 1459 | " [ 0.22217864]]\n", 1460 | "theta_2016: [[-0.1141232 ]\n", 1461 | " [ 0.39333029]\n", 1462 | " [ 0.16569461]\n", 1463 | " [ 0.44103033]\n", 1464 | " [ 0.18209306]\n", 1465 | " [ 0.22217864]]\n", 1466 | "error 5.551115123125783e-17\n", 1467 | "---------------------------\n", 1468 | "-----episode 95-----\n", 1469 | "theta_2014: [[-0.11822876]\n", 1470 | " [ 0.40200943]\n", 1471 | " [ 0.16569461]\n", 1472 | " [ 0.44103033]\n", 1473 | " [ 0.19027213]\n", 1474 | " [ 0.22217864]]\n", 1475 | "theta_2016: [[-0.11822876]\n", 1476 | " [ 0.40200943]\n", 1477 | " [ 0.16569461]\n", 1478 | " [ 0.44103033]\n", 1479 | " [ 0.19027213]\n", 1480 | " [ 0.22217864]]\n", 1481 | "error 5.551115123125783e-17\n", 1482 | "---------------------------\n", 1483 | "-----episode 96-----\n", 1484 | "theta_2014: [[-0.12247457]\n", 1485 | " [ 0.40798934]\n", 1486 | " [ 0.16569461]\n", 1487 | " [ 0.44103033]\n", 1488 | " [ 0.19027213]\n", 1489 | " [ 0.22217864]]\n", 1490 | "theta_2016: [[-0.12247457]\n", 1491 | " [ 0.40798934]\n", 1492 | " [ 0.16569461]\n", 1493 | " [ 0.44103033]\n", 1494 | " [ 0.19027213]\n", 1495 | " [ 0.22217864]]\n", 1496 | "error 5.551115123125783e-17\n", 1497 | "---------------------------\n", 1498 | "-----episode 97-----\n", 1499 | "theta_2014: [[-0.10878349]\n", 1500 | " [ 0.40798934]\n", 1501 | " [ 0.17403766]\n", 1502 | " [ 0.44103033]\n", 1503 | " [ 0.19027213]\n", 1504 | " [ 0.22217864]]\n", 1505 | "theta_2016: [[-0.10878349]\n", 1506 | " [ 0.40798934]\n", 1507 | " [ 0.17403766]\n", 1508 | " [ 0.44103033]\n", 1509 | " [ 0.19027213]\n", 1510 | " [ 0.22217864]]\n", 1511 | "error 5.551115123125783e-17\n", 1512 | "---------------------------\n", 1513 | "-----episode 98-----\n", 1514 | "theta_2014: [[-0.11277607]\n", 1515 | " [ 0.40798934]\n", 1516 | " [ 0.17403766]\n", 1517 | " [ 0.44662002]\n", 1518 | " [ 0.19027213]\n", 1519 | " [ 0.22217864]]\n", 1520 | "theta_2016: [[-0.11277607]\n", 1521 | " [ 0.40798934]\n", 1522 | " [ 0.17403766]\n", 1523 | " [ 0.44662002]\n", 1524 | " [ 0.19027213]\n", 1525 | " [ 0.22217864]]\n", 1526 | "error 6.938893903907228e-17\n", 1527 | "---------------------------\n", 1528 | "-----episode 99-----\n", 1529 | "theta_2014: [[-0.09858656]\n", 1530 | " [ 0.40798934]\n", 1531 | " [ 0.18756066]\n", 1532 | " [ 0.45512344]\n", 1533 | " [ 0.19027213]\n", 1534 | " [ 0.22995685]]\n", 1535 | "theta_2016: [[-0.09858656]\n", 1536 | " [ 0.40798934]\n", 1537 | " [ 0.18756066]\n", 1538 | " [ 0.45512344]\n", 1539 | " [ 0.19027213]\n", 1540 | " [ 0.22995685]]\n", 1541 | "error 6.938893903907228e-17\n", 1542 | "---------------------------\n" 1543 | ] 1544 | } 1545 | ], 1546 | "source": [ 1547 | "EPISODES = 100\n", 1548 | "gamma = 0.99\n", 1549 | "alpha = 0.01\n", 1550 | "_lambda = 0.1\n", 1551 | "for episode in range(EPISODES):\n", 1552 | " e_2014 = np.array([[0],[0],[0],[0],[0],[0]])\n", 1553 | " e_2016 = np.array([[0],[0],[0],[0],[0],[0]])\n", 1554 | "\n", 1555 | " S = 'A'\n", 1556 | " # 2014\n", 1557 | " v_s = theta_2014.T@feature_map['A']\n", 1558 | " # 2016\n", 1559 | " V_old = 0\n", 1560 | " while True:\n", 1561 | " if S == 'T':\n", 1562 | " print(f'-----episode {episode}-----')\n", 1563 | " print(f'theta_2014: 
{theta_2014}')\n", 1564 | " print(f'theta_2016: {theta_2016}')\n", 1565 | " print('error ',np.sum(abs(theta_2014 - theta_2016)))\n", 1566 | " print(f'---------------------------')\n", 1567 | " break\n", 1568 | "\n", 1569 | " random_choice = np.random.choice(len(state_map[S]))\n", 1570 | " next_S, R = state_map[S][random_choice]\n", 1571 | " # 2014\n", 1572 | " v_next_s = theta_2014.T@feature_map[next_S]\n", 1573 | " delta_2014 = R + gamma*v_next_s - v_s\n", 1574 | " e_2014 = gamma*_lambda*e_2014 + alpha*(1-gamma*_lambda*e_2014.T@feature_map[S])*feature_map[S]\n", 1575 | " theta_2014 = theta_2014 + delta_2014*e_2014 + alpha*(v_s - theta_2014.T@feature_map[S])*feature_map[S]\n", 1576 | "\n", 1577 | " # 2016\n", 1578 | " V = theta_2016.T@feature_map[S]\n", 1579 | " V_prime = theta_2016.T@feature_map[next_S]\n", 1580 | " delta_2016 = R + gamma*V_prime - V\n", 1581 | " e_2016 = gamma*_lambda*e_2016 + feature_map[S] - alpha*gamma*_lambda * (e_2016.T@feature_map[S]) * feature_map[S]\n", 1582 | " theta_2016 = theta_2016 + alpha*(delta_2016 + V - V_old)*e_2016 - alpha*(V-V_old)*feature_map[S]\n", 1583 | " V_old = V_prime\n", 1584 | "\n", 1585 | " # 2014\n", 1586 | " v_s = v_next_s\n", 1587 | "\n", 1588 | " # 2014 & 2016\n", 1589 | " S = next_S\n" 1590 | ], 1591 | "metadata": { 1592 | "collapsed": false, 1593 | "pycharm": { 1594 | "name": "#%%\n" 1595 | } 1596 | } 1597 | }, 1598 | { 1599 | "cell_type": "code", 1600 | "execution_count": null, 1601 | "outputs": [], 1602 | "source": [ 1603 | "\n" 1604 | ], 1605 | "metadata": { 1606 | "collapsed": false, 1607 | "pycharm": { 1608 | "name": "#%%\n" 1609 | } 1610 | } 1611 | } 1612 | ], 1613 | "metadata": { 1614 | "kernelspec": { 1615 | "display_name": "Python 3", 1616 | "language": "python", 1617 | "name": "python3" 1618 | }, 1619 | "language_info": { 1620 | "codemirror_mode": { 1621 | "name": "ipython", 1622 | "version": 2 1623 | }, 1624 | "file_extension": ".py", 1625 | "mimetype": "text/x-python", 1626 | "name": "python", 1627 | "nbconvert_exporter": "python", 1628 | "pygments_lexer": "ipython2", 1629 | "version": "2.7.6" 1630 | } 1631 | }, 1632 | "nbformat": 4, 1633 | "nbformat_minor": 0 1634 | } -------------------------------------------------------------------------------- /MASM/Differential_semi_gradient_Sarsa.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 83, 6 | "metadata": { 7 | "collapsed": true, 8 | "pycharm": { 9 | "is_executing": false 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "import numpy as np\n", 15 | "class State:\n", 16 | " def __init__(self, name, value):\n", 17 | " self.name = name\n", 18 | " self.value = value\n", 19 | "A = State('A', np.array([[1,0,0]]))\n", 20 | "B = State('B',np.array([[0,1,0]]))\n", 21 | "C = State('C',np.array([[0,0,1]]))\n", 22 | "\n", 23 | "w = np.random.random((1,3))\n", 24 | "R = {'A':1, 'B':0, 'C':0}\n", 25 | "policy = {'A':B, 'B':C, 'C':A}" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 84, 31 | "outputs": [ 32 | { 33 | "name": "stdout", 34 | "text": [ 35 | "[[4.15223411e-14]]\n", 36 | "[[1.38777878e-14]]\n", 37 | "[[-5.54001289e-14]]\n", 38 | "[[4.15223411e-14]]\n", 39 | "[[1.38777878e-14]]\n", 40 | "[[-5.54001289e-14]]\n", 41 | "[[4.15223411e-14]]\n", 42 | "[[1.38777878e-14]]\n", 43 | "[[-5.54001289e-14]]\n", 44 | "[[4.15223411e-14]]\n", 45 | "[[1.38777878e-14]]\n", 46 | "[[-5.54001289e-14]]\n", 47 | "[[4.15223411e-14]]\n", 48 | 
"[[1.38777878e-14]]\n", 49 | "[[-5.54001289e-14]]\n", 50 | "[[4.15223411e-14]]\n", 51 | "[[1.38777878e-14]]\n", 52 | "[[-5.54001289e-14]]\n", 53 | "[[4.15223411e-14]]\n", 54 | "[[1.38777878e-14]]\n", 55 | "[[-5.54001289e-14]]\n", 56 | "[[4.15223411e-14]]\n", 57 | "[[1.38777878e-14]]\n", 58 | "[[-5.54001289e-14]]\n", 59 | "[[4.15223411e-14]]\n", 60 | "[[1.38777878e-14]]\n", 61 | "[[-5.54001289e-14]]\n", 62 | "[[4.15223411e-14]]\n", 63 | "[[1.38777878e-14]]\n" 64 | ], 65 | "output_type": "stream" 66 | } 67 | ], 68 | "source": [ 69 | "S = A\n", 70 | "R_bar = 0\n", 71 | "limit = 100000\n", 72 | "for step in range(limit):\n", 73 | " S_prime = policy[S.name]\n", 74 | " delta = R[S_prime.name] - R_bar + S_prime.value@w.T - S.value@w.T\n", 75 | " R_bar += 0.001*delta\n", 76 | " w += 0.001*delta*S.value\n", 77 | " S = S_prime\n", 78 | " if step > limit -30:\n", 79 | " print(delta)" 80 | ], 81 | "metadata": { 82 | "collapsed": false, 83 | "pycharm": { 84 | "name": "#%%\n", 85 | "is_executing": false 86 | } 87 | } 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 85, 92 | "outputs": [ 93 | { 94 | "name": "stdout", 95 | "text": [ 96 | "[array([[0.17260156]]), array([[0.50593489]]), array([[0.83926822]])]\n" 97 | ], 98 | "output_type": "stream" 99 | } 100 | ], 101 | "source": [ 102 | "result = [S.value@w.T for S in [A,B,C]]\n", 103 | "print(result)" 104 | ], 105 | "metadata": { 106 | "collapsed": false, 107 | "pycharm": { 108 | "name": "#%%\n", 109 | "is_executing": false 110 | } 111 | } 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "outputs": [], 117 | "source": [ 118 | "\n" 119 | ], 120 | "metadata": { 121 | "collapsed": false, 122 | "pycharm": { 123 | "name": "#%%\n" 124 | } 125 | } 126 | } 127 | ], 128 | "metadata": { 129 | "kernelspec": { 130 | "display_name": "Python 3", 131 | "language": "python", 132 | "name": "python3" 133 | }, 134 | "language_info": { 135 | "codemirror_mode": { 136 | "name": "ipython", 137 | "version": 2 138 | }, 139 | "file_extension": ".py", 140 | "mimetype": "text/x-python", 141 | "name": "python", 142 | "nbconvert_exporter": "python", 143 | "pygments_lexer": "ipython2", 144 | "version": "2.7.6" 145 | }, 146 | "pycharm": { 147 | "stem_cell": { 148 | "cell_type": "raw", 149 | "source": [], 150 | "metadata": { 151 | "collapsed": false 152 | } 153 | } 154 | } 155 | }, 156 | "nbformat": 4, 157 | "nbformat_minor": 0 158 | } -------------------------------------------------------------------------------- /MASM/prototype.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true, 8 | "pycharm": { 9 | "is_executing": false 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "import torch\n", 15 | "import numpy as np\n", 16 | "import matplotlib.pyplot as plt\n", 17 | "from torch.nn import Linear\n", 18 | "import torch.nn.functional as F\n", 19 | "from collections import deque\n", 20 | "import random\n", 21 | "import gym\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "from torch.nn import Linear, ReLU\n", 24 | "from torch.autograd import Variable" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "outputs": [], 31 | "source": [ 32 | "class Seller_Env:\n", 33 | " def __init__(self, size_of_list = 10, initial_price = 1000):\n", 34 | " self.mu = np.random.randint(initial_price*0.7, initial_price*1.1)\n", 35 | " self.days = 0\n", 36 | " 
self.size_of_list = size_of_list\n", 37 | " self.item_list = [int(np.random.normal(self.mu, scale = self.mu/10)) for _ in range(self.size_of_list)]\n", 38 | " self.baseline = np.mean(self.item_list)\n", 39 | " \n", 40 | " def step(self, current_price):\n", 41 | " self.days += 1\n", 42 | " offer = [int(np.random.normal(self.mu*0.7, scale = self.mu/10)) \n", 43 | " for _ in range(np.random.randint(0, self.size_of_list//2))]\n", 44 | " for _ in range(self.size_of_list//2 - len(offer)):\n", 45 | " offer.append(0)\n", 46 | " if max(offer) > current_price:\n", 47 | " return [],True\n", 48 | " else:\n", 49 | " return offer,False\n", 50 | " \n", 51 | " def reset(self):\n", 52 | " self.days = 0\n", 53 | " return self.item_list\n", 54 | " \n", 55 | "class Seller:\n", 56 | " def __init__(self, min_price):\n", 57 | " self.min_price = min_price\n", 58 | " def model(self):\n", 59 | " pass" 60 | ], 61 | "metadata": { 62 | "collapsed": false, 63 | "pycharm": { 64 | "name": "#%%\n" 65 | } 66 | } 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "Python 3", 72 | "language": "python", 73 | "name": "python3" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 2 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython2", 85 | "version": "2.7.6" 86 | }, 87 | "pycharm": { 88 | "stem_cell": { 89 | "cell_type": "raw", 90 | "source": [], 91 | "metadata": { 92 | "collapsed": false 93 | } 94 | } 95 | } 96 | }, 97 | "nbformat": 4, 98 | "nbformat_minor": 0 99 | } -------------------------------------------------------------------------------- /Off-Policy Policy Gradient/Experiment Log of failure of Off_policy_Actor_Critic: -------------------------------------------------------------------------------- 1 | ## Design 2 | The experiment was not successful, but I will record the details of my thinking. 3 | 4 | Inspired by off-policy policy gradient methods, I wanted to try training an actor-critic 5 | with an off-policy method. 6 | 7 | I understand that combining bootstrapping, function approximation, and off-policy learning is called the deadly triad in 8 | Sutton's RL book. I still wanted to give it a try. 9 | 10 | In this experiment there is only one model, the Actor. 11 | It has two outputs: the action probabilities and a V estimate. 12 | 13 | The behavior policy is epsilon-Actor, the same formulation as in epsilon-greedy methods. 14 | While acting, with probability 90% the policy chooses its action according to the probability output, as 15 | in a normal actor-critic method. With probability 10%, however, it chooses uniformly at random from the action 16 | space, i.e. {0,1}. 17 | 18 | A buffer inside the class records the observed states, 19 | the reward (since it is always 1 in this case, I treat it as a constant), and the probability of the action taken. 20 | 21 | The target policy is the same actor without the epsilon. When the buffer reaches the end of an episode, it 22 | makes the model learn: it recomputes each p from the stored observations, uses the stored p and simple algebra 23 | to form rho, and updates the model. The buffer also keeps only the latest 100 records. 24 | 25 | After training, the model automatically evaluates itself. 26 | 27 | ## Result: 28 | 29 | Learning never takes place, although the loss still converges. It is not the divergence usually met in REINFORCE, 30 | but a natural behavior of converging to a solution that is never the best policy.
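For reference, here is a minimal sketch of the rho computation described in the Design section above. This is not the notebook's actual code: the `actor` module, its two-headed output, and the batching are assumptions for illustration, and it assumes the stored probability is the behavior policy's probability of the chosen action.

```python
import torch

EPS = 0.1        # exploration weight of the epsilon-Actor behavior policy
N_ACTIONS = 2    # CartPole action space {0, 1}

def behavior_prob(pi_a):
    # epsilon-Actor: follow the actor with probability 1 - EPS,
    # otherwise pick uniformly among the N_ACTIONS actions
    return (1.0 - EPS) * pi_a + EPS / N_ACTIONS

def importance_ratios(actor, states, actions, stored_behavior_probs):
    # Recompute pi(a|s) under the current target policy (the actor without epsilon),
    # then rho = pi / b, where b was recorded by the buffer at acting time.
    probs, _v = actor(states)                                # (batch, N_ACTIONS), (batch, 1)
    pi_a = probs.gather(1, actions.view(-1, 1)).squeeze(1)   # pi(a_t | s_t)
    return pi_a / stored_behavior_probs
```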
31 | 32 | ## Improvement: 33 | 34 | Delete the V part of the actor model and build an off-policy REINFORCE instead. -------------------------------------------------------------------------------- /RAINBOW/PyTorch_RAINBOW.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LyWangPX/Reinforcement_Learning_Coding_Examples/2f40f67f5709c9dc4ea3d9dd15b441b627b595a6/RAINBOW/PyTorch_RAINBOW.py -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Welcome 2 | 3 | This is my personal practice of implementing various RL algorithms from scratch. 4 | 5 | Most of them are in Jupyter notebooks; the ones that involve multiprocessing 6 | are in plain Python files. 7 | 8 | The framework is always PyTorch, also as personal practice. 9 | 10 | Normally I use CartPole for the easier algorithms in this project and skip the 11 | visual-input part (which is fairly trivial once you add a few conv layers). 12 | 13 | For the 14 | harder, vision-based algorithms I pick various Atari games as the environment. 15 | 16 | Due to time limits, I will not provide a systematic analysis of any particular algorithm. 17 | Also be aware that this is for personal use, so bugs do appear frequently. 18 | 19 | Once the project is mature, I will accept open issues. 20 | For now, however, let me dive in. (I guess no one even reads this repo, though.) 21 | 22 | The project file structure will keep changing to match my needs. 23 | 24 | # PLAN: 25 | ## Model-Free RL 26 | ### Policy Gradient 27 | - [x] REINFORCE 28 | - [x] Off-Policy REINFORCE 29 | - [x] Basic Actor Critic 30 | - [x] Advantage Actor Critic using Huber loss and Entropy 31 | - [x] A3C 32 | - [x] A2C 33 | - [x] DDPG 34 | - [ ] D4PG 35 | - [ ] MADDPG 36 | - [ ] TRPO 37 | - [ ] PPO 38 | - [ ] ACER 39 | - [ ] ACKTR 40 | - [ ] SAC 41 | - [ ] SAC with AAT (Automatically Adjusted Temperature) 42 | - [ ] TD3 43 | - [ ] SVPG 44 | - [ ] IMPALA 45 | ### Deep Q Learning 46 | - [X] Dueling DDQN 47 | - [x] Dueling DDQN + PER 48 | - [ ] Rainbow DQN 49 | - [ ] Ape-X 50 | ### Distributional RL 51 | - [ ] C51 52 | - [ ] QR-DQN 53 | - [ ] IQN 54 | - [ ] Dopamine (DQN + C51 + IQN + Rainbow) 55 | ### Policy Gradient with Action-Dependent Baselines: 56 | - [ ] Q-Prop 57 | - [ ] Stein Control Variates 58 | ### Path-Consistency Learning 59 | - [ ] PCL 60 | - [ ] Trust-PCL 61 | ### Q-learning + Policy Gradient: 62 | - [ ] PGQL 63 | - [ ] Reactor 64 | - [ ] IPG 65 | ### Evolutionary Algorithms 66 | ### Monte Carlo Tree Search (AlphaZero) 67 | ## Exploration RL 68 | ### Intrinsic Motivation 69 | - [ ] VIME 70 | - [ ] CTS-based Pseudocounts 71 | - [ ] PixelCNN-based Pseudocounts 72 | - [ ] Hash-based Counts 73 | - [ ] EX2 74 | - [ ] ICM 75 | - [ ] RND 76 | ### Unsupervised RL 77 | - [ ] VIC 78 | - [ ] DIAYN 79 | - [ ] VALOR 80 | ## Hierarchical RL 81 | ## Memory RL 82 | ## Model-Based RL 83 | ## Meta-RL 84 | ## Scaling-RL --------------------------------------------------------------------------------
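As a footnote to the plan above and to the improvement suggested in the off-policy experiment log, here is a minimal, illustrative sketch of an importance-weighted REINFORCE update of the kind the "Off-Policy REINFORCE" entry refers to. It is not the repository's implementation; it uses one common simplified form that weights each log-probability by the product of importance ratios up to that step, and all tensor names are assumptions.

```python
import torch

def off_policy_reinforce_loss(log_pi, behavior_probs, returns):
    # log_pi:          log pi(a_t | s_t) under the target policy, with grad, shape (T,)
    # behavior_probs:  pi_b(a_t | s_t) recorded while acting, shape (T,)
    # returns:         discounted returns G_t, shape (T,)
    step_ratio = torch.exp(log_pi.detach()) / behavior_probs   # rho_t = pi / b
    rho = torch.cumprod(step_ratio, dim=0)                     # product of ratios up to step t
    return -(rho * returns * log_pi).mean()                    # importance-weighted policy gradient
```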