├── .gitignore ├── Code ├── RL.sln ├── RL.pyproj ├── REINFORCE.py ├── SARSA.py ├── A2C_NStepReturns.py ├── DQN.py ├── A2C_GAE.py ├── DoubleDQN.py ├── PPO.py └── DoubleDQN_PER.py ├── LICENSE ├── LICENSE - SLM-LAB └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | env/ 2 | swigwin-4.0.2/ 3 | __pycache__/ 4 | .vs/ 5 | -------------------------------------------------------------------------------- /Code/RL.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 16 4 | VisualStudioVersion = 16.0.30621.155 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "RL", "RL.pyproj", "{0A36CCDE-1FF8-4FDA-94AA-372C6F453B9F}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|Any CPU = Debug|Any CPU 11 | Release|Any CPU = Release|Any CPU 12 | EndGlobalSection 13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 14 | {0A36CCDE-1FF8-4FDA-94AA-372C6F453B9F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 15 | {0A36CCDE-1FF8-4FDA-94AA-372C6F453B9F}.Release|Any CPU.ActiveCfg = Release|Any CPU 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | GlobalSection(ExtensibilityGlobals) = postSolution 21 | SolutionGuid = {F9323F88-115B-4719-BFDE-FC2A8BE74F1D} 22 | EndGlobalSection 23 | EndGlobal 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 YoungWook Yang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /LICENSE - SLM-LAB: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Wah Loon Keng, Laura Graesser 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Code/RL.pyproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | Debug 4 | 2.0 5 | 0a36ccde-1ff8-4fda-94aa-372c6f453b9f 6 | 7 | 8 | PPO.py 9 | 10 | 11 | . 12 | . 13 | RL 14 | RL 15 | MSBuild|env|$(MSBuildProjectFullPath) 16 | 17 | 18 | true 19 | false 20 | 21 | 22 | true 23 | false 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | Code 34 | 35 | 36 | 37 | 38 | 39 | env 40 | 3.7 41 | env (Python 3.7 (64-bit)) 42 | Scripts\python.exe 43 | Scripts\pythonw.exe 44 | PYTHONPATH 45 | X64 46 | 47 | 48 | 49 | 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RL 2 | Reinforcement Learning Algorithms 3 | 4 | # Background 5 | I've implemented all the RL algorithms introduced in this book - ["Foundations Of Deep Reinforcement Learning"](https://www.amazon.com/gp/product/0135172381). They are REINFORCE, SARSA, DQN, A2C and PPO. 6 | 7 | ![alt text](https://images-na.ssl-images-amazon.com/images/I/41HraVa1zgS._SX218_BO1,204,203,200_QL40_FMwebp_.jpg) 8 | 9 | While the book is really awesome, its code examples are not easy to read as they are implemented as a part of this big RL framework called [SLM-Lab](https://slm-lab.gitbook.io/slm-lab/) except for the very first algorithm, REINFORCE. Therefore, I've decided to write simple and easy to understand code that shows each algorithm's core element clearly. Most of the code lines here are copied from https://github.com/kengz/SLM-Lab and modified so that each file contains a complete single algorithm without relying on any other files. I've also add some more comments on parts that I was confused. I hope it helps anyone studying the book or those RL algorithms. :) 10 | 11 | # Install 12 | Please install the below two packages to run. 
13 | 14 | - PyTorch 15 | https://pytorch.org/get-started/locally/ 16 | 17 | ``` 18 | pip3 install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio===0.9.0 -f https://download.pytorch.org/whl/torch_stable.html 19 | ``` 20 | 21 | - Gym 22 | https://gym.openai.com/docs/ 23 | 24 | ``` 25 | pip install gym 26 | ``` 27 | # Run 28 | You should be able to run each python file representing one RL algorithm. I used [CartPole-v0](https://gym.openai.com/envs/CartPole-v0/) as a problem to solve and verified that all the algorithms can solve it by reaching 200 as a total reward. Here is an example output when running [DoubleDQN_PER.py](Code/DoubleDQN_PER.py) 29 | ``` 30 | Episode done: cur_frame=556 current_training_step=4192 total_reward=10.0 31 | Episode done: cur_frame=570 current_training_step=4320 total_reward=14.0 32 | Episode done: cur_frame=605 current_training_step=4608 total_reward=35.0 33 | Episode done: cur_frame=805 current_training_step=6208 total_reward=200.0 34 | Episode done: cur_frame=911 current_training_step=7040 total_reward=106.0 35 | Episode done: cur_frame=1051 current_training_step=8160 total_reward=140.0 36 | Episode done: cur_frame=1160 current_training_step=9024 total_reward=109.0 37 | Episode done: cur_frame=1322 current_training_step=10336 total_reward=162.0 38 | Episode done: cur_frame=1404 current_training_step=10976 total_reward=82.0 39 | Episode done: cur_frame=1466 current_training_step=11488 total_reward=62.0 40 | Episode done: cur_frame=1666 current_training_step=13088 total_reward=200.0 41 | ``` 42 | 43 | -------------------------------------------------------------------------------- /Code/REINFORCE.py: -------------------------------------------------------------------------------- 1 | from torch.distributions import Categorical 2 | import gym 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | 8 | ''' 9 | Chapter 2. REINFORCE 10 | Most code here is copied from SLM-Lab first and then modified to show a plain torch implementation. 11 | ''' 12 | 13 | gamma = 0.99 14 | 15 | # Policy Pi 16 | class Pi(nn.Module): 17 | def __init__(self, in_dim, out_dim): 18 | super(Pi, self).__init__() 19 | 20 | layers = [ 21 | nn.Linear(in_dim, 64), 22 | nn.ReLU(), 23 | nn.Linear(64, out_dim), 24 | ] 25 | 26 | self.model = nn.Sequential(*layers) 27 | self.onpolicy_reset() 28 | self.train() 29 | 30 | def onpolicy_reset(self): 31 | self.log_probs = [] 32 | self.rewards = [] 33 | 34 | def forward(self, x): 35 | pdparam = self.model(x) 36 | return pdparam 37 | 38 | def act(self, state): 39 | 40 | """ 41 | - Action probability distribution = Policy(state) : NN(state) generates probabilities for all actions. 42 | - They are actually just logits which are not normalized, unlike probabilities that sum up to 1. 43 | - Categorical() will sample action based on these logits by using Softmax. 44 | - Softmax - https://miro.medium.com/max/875/1*ReYpdIZ3ZSAPb2W8cJpkBg.jpeg 45 | - Categorical() also provides log(action_probability) that we need for calculating loss. 
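        - Illustrative example (made-up numbers, not from a real run): for logits [2.0, 0.0], Softmax gives probabilities of roughly [0.88, 0.12], so action 0 is sampled about 88% of the time, and log_prob(action=0) = log(0.88), about -0.13, is the value appended to self.log_probs for the loss.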
46 | """ 47 | 48 | x = torch.from_numpy(state.astype(np.float32)) # to tensor 49 | pdparam = self.forward(x) # forward pass 50 | 51 | pd = Categorical(logits=pdparam) # probability distribution 52 | action = pd.sample() # pi(a|s) in action via pd 53 | log_prob = pd.log_prob(action) # log_prob prob of pi(a|s) 54 | self.log_probs.append(log_prob) 55 | return action.item() 56 | 57 | def train(pi, optimizer): 58 | # Inner gradient-ascent loop of REINFORCE algorithm 59 | T = len(pi.rewards) 60 | rets = np.empty(T, dtype=np.float32) # the returns 61 | future_ret = 0.0 62 | 63 | # Compute the discounted returns efficiently in a reversed order. 64 | for t in reversed(range(T)): 65 | future_ret = pi.rewards[t] + gamma * future_ret 66 | rets[t] = future_ret 67 | 68 | # Compute loss (which is really opposite of reward) 69 | rets = torch.tensor(rets) 70 | log_probs = torch.stack(pi.log_probs) 71 | loss = - log_probs * rets # gradient term: Negative for maximizing reward 72 | loss = torch.sum(loss) 73 | 74 | # Backpropagation 75 | optimizer.zero_grad() 76 | loss.backward() # backpropagate, compute gradients that will be stored by the tensors (parameters) 77 | optimizer.step() # gradient-ascent, update the weights 78 | 79 | return loss 80 | 81 | def main(): 82 | env = gym.make("CartPole-v0") 83 | in_dim = env.observation_space.shape[0] # 4 84 | out_dim = env.action_space.n # 2 85 | pi = Pi(in_dim, out_dim) 86 | optimizer = optim.Adam(pi.parameters(), lr=0.01) 87 | 88 | for epi in range(300): 89 | state = env.reset() 90 | 91 | for t in range(200): # cartpole max timestep is 200 92 | action = pi.act(state) 93 | state, reward, done, _ = env.step(action) 94 | pi.rewards.append(reward) 95 | # env.render() 96 | 97 | if done: 98 | break 99 | 100 | loss = train(pi, optimizer) 101 | total_reward = sum(pi.rewards) 102 | solved = total_reward > 195.0 103 | pi.onpolicy_reset() 104 | print(f"Episode {epi}, loss: {loss}, total_reward: {total_reward}, solved: {solved}") 105 | 106 | if __name__ == '__main__': 107 | main() 108 | -------------------------------------------------------------------------------- /Code/SARSA.py: -------------------------------------------------------------------------------- 1 | 2 | import gym 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | from torch import distributions 8 | 9 | ''' 10 | Chapter 3. SARSA 11 | Most code here is copied from SLM-Lab first and then modified to show a plain torch implementation. 12 | ''' 13 | 14 | # This is a modified Categorical distribution class to implement greedy policy. 15 | class Argmax(distributions.Categorical): 16 | ''' 17 | Special distribution class for argmax sampling, where probability is always 1 for the argmax. 18 | NOTE although argmax is not a sampling distribution, this implementation is for API consistency. 
19 | ''' 20 | def __init__(self, probs=None, logits=None, validate_args=None): 21 | if probs is not None: 22 | new_probs = torch.zeros_like(probs, dtype=torch.float) 23 | new_probs[probs == probs.max(dim=-1, keepdim=True)[0]] = 1.0 24 | probs = new_probs 25 | elif logits is not None: 26 | new_logits = torch.full_like(logits, -1e8, dtype=torch.float) 27 | new_logits[logits == logits.max(dim=-1, keepdim=True)[0]] = 1.0 28 | logits = new_logits 29 | 30 | super().__init__(probs=probs, logits=logits, validate_args=validate_args) 31 | 32 | class SARSA(nn.Module): 33 | def __init__(self, env): 34 | super(SARSA, self).__init__() 35 | 36 | self.env = env 37 | in_dim = env.observation_space.shape[0] # 4 for CartPole 38 | out_dim = env.action_space.n # 2 for CardPole 39 | 40 | # Initialize the neural network used to learn the Q function 41 | # Let's use a single hidden layer with 64 units and SELU as an activation function. 42 | layers = [ 43 | nn.Linear(in_dim, 64), 44 | nn.SELU(), 45 | nn.Linear(64, out_dim), 46 | ] 47 | self.model = nn.Sequential(*layers) 48 | self.train() 49 | 50 | # Optimizer - RMSprop 51 | self.optim = torch.optim.RMSprop(self.model.parameters(), lr=0.01) 52 | 53 | # Gamma 54 | self.gamma = 0.99 55 | 56 | # Memory for batch 57 | self.data_keys = ['states', 'actions', 'rewards', 'next_states', 'dones'] 58 | self.memory = {k: [] for k in self.data_keys} 59 | 60 | # Epsilon greedy policy 61 | self.epsilon_start = 1.0 62 | self.epsilon_end = 0.05 63 | self.epsilon_max_steps = 10000 64 | self.epsilon = self.epsilon_start 65 | self.current_step = 0 66 | 67 | # etc. 68 | self.to_train = 0 69 | self.training_frequency = 32 # number of experiences to collect before training. 70 | 71 | def act(self, state): 72 | state = torch.from_numpy(state.astype(np.float32)) 73 | 74 | action = None 75 | 76 | # Epsilon greedy to balance between exploring and exploiting. 77 | if self.epsilon > np.random.rand(): 78 | action = self.random_policy() 79 | else: 80 | action = self.greedy_policy(state) 81 | 82 | return action.item() 83 | 84 | def random_policy(self): 85 | action = [self.env.action_space.sample()] 86 | return torch.tensor(action) 87 | 88 | def greedy_policy(self, state): 89 | pdparam = self.model(state) 90 | action_pd = Argmax(logits=pdparam) 91 | return action_pd.sample() 92 | 93 | def sample(self): 94 | # Create batch 95 | batch = {k: self.memory[k] for k in self.data_keys} 96 | 97 | # 'next_actions' is copied from 'actions' from index 1 and its last element will be always 0. 98 | # This is safe for next_action at done since the calculated act_next_q_preds will be multiplied by (1 - batch['dones']) 99 | batch['next_actions'] = np.zeros_like(batch['actions']) 100 | batch['next_actions'][:-1] = batch['actions'][1:] 101 | 102 | for k in batch: 103 | batch[k] = np.array(batch[k]) 104 | batch[k] = torch.from_numpy(batch[k].astype(np.float32)) 105 | 106 | return batch 107 | 108 | def calc_q_loss(self, batch): 109 | states = batch['states'] 110 | next_states = batch['next_states'] 111 | 112 | q_preds = self.model(states) 113 | with torch.no_grad(): 114 | next_q_preds = self.model(next_states) 115 | 116 | """ 117 | This is the gut of SARSA implementation. 118 | : Q-value = Q(state, action) : NN(state) generates Q-values for all actions. 119 | 120 | We treat that our NN (self.model) generates Q-values for each action. For example, 121 | q_preds are just logits from NN and we assume logits[0] is the Q-value for action 0. 
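        (Illustrative, made-up numbers: for one CartPole state, q_preds might be [0.42, -0.17], read as Q(s, a=0) = 0.42 and Q(s, a=1) = -0.17.)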
122 | Therefore, act_q_preds should be the Q-value (logit) of the action selected for the state. 123 | We can do this by selecting one of logits(q_preds) by using 'action' as an index. 124 | torch.gather(torch.tensor.gather) exactly does that. 125 | """ 126 | 127 | act_q_preds = q_preds.gather(-1, batch['actions'].long().unsqueeze(-1)).squeeze(-1) 128 | act_next_q_preds = next_q_preds.gather(-1, batch['next_actions'].long().unsqueeze(-1)).squeeze(-1) 129 | act_q_targets = batch['rewards'] + self.gamma * (1 - batch['dones']) * act_next_q_preds 130 | 131 | #print(f'act_q_preds: {act_q_preds}\nact_q_targets: {act_q_targets}') 132 | 133 | """ 134 | You can now easily understand that SARSA is on-policy RL algorithm that cannot reuse 135 | experience produced by a different policy (i.e NN whose parameters are different). 136 | This is because the loss is only meaningful to update NN when act_q_preds and act_q_targets 137 | are from the same NN. It makes no sense to update NN parameters to reduce this difference 138 | when act_q_preds and act_q_targets are came from different NNs. 139 | """ 140 | 141 | # Let's use mean-squared-error loss function. 142 | loss = nn.MSELoss() 143 | q_loss = loss(act_q_preds, act_q_targets) 144 | return q_loss 145 | 146 | def check_train(self): 147 | if self.to_train == 1: 148 | 149 | # Compute loss for the batch. 150 | batch = self.sample() 151 | loss = self.calc_q_loss(batch) 152 | 153 | # Compute gradients with backpropagation. 154 | self.optim.zero_grad() 155 | loss.backward() 156 | 157 | # Update NN parameters. 158 | self.optim.step() 159 | 160 | # Reset 161 | self.to_train = 0 162 | self.memory = {k: [] for k in self.data_keys} 163 | 164 | def update_memory(self, state, action, reward, next_state, done): 165 | # Add this exp to memory. 166 | most_recent = (state, action, reward, next_state, done) 167 | for idx, k in enumerate(self.data_keys): 168 | self.memory[k].append(most_recent[idx]) 169 | 170 | # If it has collected the desired number of experiences, it is ready to train. 
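        # (With training_frequency = 32, this triggers one training round per 32 freshly collected transitions; check_train() then clears the memory because SARSA is on-policy and cannot reuse them.)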
171 | if len(self.memory['states']) == self.training_frequency: 172 | self.to_train = 1 173 | 174 | def update_epsilon(self): 175 | # Simple linear decay 176 | if self.epsilon_max_steps <= self.current_step: 177 | self.epsilon = self.epsilon_end 178 | return 179 | 180 | slope = (self.epsilon_end - self.epsilon_start) / (self.epsilon_max_steps - self.current_step) 181 | self.epsilon = max(slope*self.current_step + self.epsilon_start, self.epsilon_end) 182 | 183 | def update(self, state, action, reward, next_state, done): 184 | self.current_step += 1 185 | self.update_memory(state, action, reward, next_state, done) 186 | self.check_train() 187 | self.update_epsilon() 188 | 189 | def run_rl(sarsa, env, max_frame): 190 | state = env.reset() 191 | done = False 192 | cur_frame = 0 193 | total_reward = 0 194 | while True: 195 | if done: # before starting another episode 196 | print(f'Episode done: cur_frame={cur_frame} total_reward={total_reward}') 197 | total_reward = 0 198 | 199 | if cur_frame < max_frame: # reset and continue 200 | state = env.reset() 201 | done = False 202 | 203 | if cur_frame >= max_frame: # finish 204 | break 205 | 206 | cur_frame += 1 207 | 208 | action = sarsa.act(state) 209 | next_state, reward, done, info = env.step(action) 210 | sarsa.update(state, action, reward, next_state, done) 211 | state = next_state 212 | total_reward += reward 213 | 214 | def main(): 215 | env = gym.make("CartPole-v0") 216 | sarsa = SARSA(env) 217 | 218 | run_rl(sarsa, env, max_frame=100000) 219 | 220 | if __name__ == '__main__': 221 | main() 222 | 223 | 224 | 225 | -------------------------------------------------------------------------------- /Code/A2C_NStepReturns.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.distributions import Categorical 3 | import gym 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | from torch import distributions 9 | import random 10 | 11 | ''' 12 | Chapter 6. Advantage Actor-Critic (A2C) 13 | Most code here is copied from SLM-Lab first and then modified to show a plain torch implementation. 14 | 15 | This exmaple will create two seperate networks for actor and critic. For advantage estimation, 16 | it will use N-step returns method. 17 | ''' 18 | 19 | class A2C(nn.Module): 20 | def __init__(self, env): 21 | super(A2C, self).__init__() 22 | 23 | self.env = env 24 | in_dim = env.observation_space.shape[0] # 4 for CartPole 25 | out_dim = env.action_space.n # 2 for CardPole 26 | 27 | # Initialize the neural networks for Actor and Critic 28 | # We do not share NN between actor and critic for this example. 
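        # (A2C_GAE.py in this repo shows the alternative design: a single shared trunk with separate actor and critic heads and one optimizer.)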
29 | 30 | # Actor 31 | actor_layers = [ 32 | nn.Linear(in_dim, 64), 33 | nn.SELU(), 34 | nn.Linear(64, out_dim), 35 | ] 36 | self.actor_model = nn.Sequential(*actor_layers) 37 | self.actor_optim = torch.optim.RMSprop(self.actor_model.parameters(), lr=0.01) 38 | self.actor_policy_loss_coef = 1.0 39 | self.actor_entropy_coef = 0.001 40 | 41 | # Critic 42 | critic_layers = [ 43 | nn.Linear(in_dim, 64), 44 | nn.SELU(), 45 | nn.Linear(64, 1), 46 | ] 47 | self.critic_model = nn.Sequential(*critic_layers) 48 | self.critic_optim = torch.optim.RMSprop(self.critic_model.parameters(), lr=0.01) 49 | self.critic_val_loss_coef = 1.0 50 | 51 | self.train() 52 | 53 | # Gamma 54 | self.gamma = 0.99 55 | 56 | # Memory for batch 57 | self.data_keys = ['states', 'actions', 'rewards', 'next_states', 'dones'] 58 | self.memory = {k: [] for k in self.data_keys} 59 | 60 | # Training 61 | self.to_train = 0 62 | self.num_step_returns = 32 63 | self.training_frequency = self.num_step_returns 64 | 65 | def act(self, state): 66 | """ 67 | - Probability distribution = Policy(state) : NN(state) generates v value for all actions given a state. 68 | - They are actually just logits which are not normalized, unlike probabilities that sum up to 1. 69 | - Categorical() will sample action based on these logits by using Softmax. 70 | - Softmax - https://miro.medium.com/max/875/1*ReYpdIZ3ZSAPb2W8cJpkBg.jpeg 71 | - Categorical() also provides log(action_probability) that we need for calculating loss. 72 | """ 73 | x = torch.from_numpy(state.astype(np.float32)) # to tensor 74 | pdparam = self.actor_model(x) # forward pass 75 | 76 | pd = Categorical(logits=pdparam) # probability distribution 77 | action = pd.sample() # pi(a|s) in action via pd 78 | return action.item() 79 | 80 | def sample(self): 81 | # Create batch 82 | batch = {k: self.memory[k] for k in self.data_keys} 83 | 84 | for k in batch: 85 | batch[k] = np.array(batch[k]) 86 | batch[k] = torch.from_numpy(batch[k].astype(np.float32)) 87 | 88 | return batch 89 | 90 | def update_memory(self, state, action, reward, next_state, done): 91 | # Add this exp to memory. 92 | most_recent = (state, action, reward, next_state, done) 93 | for idx, k in enumerate(self.data_keys): 94 | self.memory[k].append(most_recent[idx]) 95 | 96 | # If it has collected the desired number of experiences, it is ready to train. 97 | if len(self.memory['states']) == self.training_frequency: 98 | self.to_train = 1 99 | 100 | def calc_v(self, x): 101 | ''' 102 | Forward-pass to calculate the predicted state-value from critic_net. 103 | ''' 104 | return self.critic_model(x).view(-1) 105 | 106 | def calc_pdparam_v(self, batch): 107 | '''Efficiently forward to get pdparam and v by batch for loss computation''' 108 | states = batch['states'] 109 | pdparam = self.actor_model(states) 110 | v_pred = self.calc_v(states) 111 | return pdparam, v_pred 112 | 113 | def calc_nstep_returns(self, batch, next_v_pred): 114 | ''' 115 | Estimate the advantages using n-step returns. Ref: http://www-anw.cs.umass.edu/~barto/courses/cs687/Chapter%207.pdf 116 | Also see Algorithm S3 from A3C paper https://arxiv.org/pdf/1602.01783.pdf for the calculation used below 117 | R^(n)_t = r_{t} + gamma r_{t+1} + ... + gamma^(n-1) r_{t+n-1} + gamma^(n) V(s_{t+n}) 118 | This is how we estimate q value (s,a) with rewards and v value (s). 
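        Illustrative example with n = 2 and gamma = 0.99: R^(2)_t = r_t + 0.99 * r_{t+1} + 0.99^2 * V(s_{t+2}). The loop below builds this backwards over the whole batch (num_step_returns = 32 here), zeroing the bootstrapped term whenever dones[t] is 1 so returns do not leak across episode boundaries.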
119 | ''' 120 | rewards = batch['rewards'] 121 | dones = batch['dones'] 122 | rets = torch.zeros_like(rewards) 123 | future_ret = next_v_pred 124 | not_dones = 1 - dones 125 | 126 | for t in reversed(range(self.num_step_returns)): 127 | rets[t] = future_ret = rewards[t] + self.gamma * future_ret * not_dones[t] 128 | 129 | return rets 130 | 131 | def calc_nstep_advs_v_targets(self, batch, v_preds): 132 | ''' 133 | Calculate N-step returns, and advs = nstep_rets - v_preds, v_targets = nstep_rets 134 | See n-step advantage under http://rail.eecs.berkeley.edu/deeprlcourse-fa17/f17docs/lecture_5_actor_critic_pdf.pdf 135 | ''' 136 | next_states = batch['next_states'][-1] 137 | next_states = next_states.unsqueeze(dim=0) 138 | 139 | with torch.no_grad(): 140 | next_v_pred = self.calc_v(next_states) 141 | 142 | v_preds = v_preds.detach() # adv does not accumulate grad 143 | nstep_rets = self.calc_nstep_returns(batch, next_v_pred) 144 | advs = nstep_rets - v_preds 145 | v_targets = nstep_rets 146 | 147 | #print(f'advs: {advs}\nv_targets: {v_targets}') 148 | 149 | return advs, v_targets 150 | 151 | def calc_policy_loss(self, batch, pdparams, advs): 152 | '''Calculate the actor's policy loss''' 153 | action_pd = Categorical(logits=pdparams) # probability distribution 154 | actions = batch['actions'] 155 | log_probs = action_pd.log_prob(actions) 156 | policy_loss = -self.actor_policy_loss_coef * (log_probs * advs).mean() 157 | 158 | # Entropy Regularization 159 | entropy = action_pd.entropy().mean() 160 | policy_loss += (-self.actor_entropy_coef * entropy) 161 | 162 | #print(f'Actor policy loss: {policy_loss:g}') 163 | return policy_loss 164 | 165 | def calc_val_loss(self, v_preds, v_targets): 166 | '''Calculate the critic's value loss''' 167 | assert v_preds.shape == v_targets.shape, f'{v_preds.shape} != {v_targets.shape}' 168 | 169 | # Let's use mean-squared-error loss function. 170 | loss = nn.MSELoss() 171 | val_loss = self.critic_val_loss_coef * loss(v_preds, v_targets) 172 | #print(f'Critic value loss: {val_loss:g}') 173 | return val_loss 174 | 175 | def check_train(self): 176 | if self.to_train == 1: 177 | batch = self.sample() 178 | pdparams, v_preds = self.calc_pdparam_v(batch) 179 | advs, v_targets = self.calc_nstep_advs_v_targets(batch, v_preds) 180 | policy_loss = self.calc_policy_loss(batch, pdparams, advs) # from actor 181 | val_loss = self.calc_val_loss(v_preds, v_targets) # from critic 182 | 183 | # actor update 184 | self.actor_optim.zero_grad() 185 | policy_loss.backward() 186 | self.actor_optim.step() 187 | 188 | # critic update 189 | self.critic_optim.zero_grad() 190 | val_loss.backward() 191 | self.critic_optim.step() 192 | 193 | # Reset- A2C is an on-policy algorithm so we cannot reuse data for next training. 
194 | self.to_train = 0 195 | self.memory = {k: [] for k in self.data_keys} 196 | 197 | 198 | def update(self, state, action, reward, next_state, done): 199 | self.update_memory(state, action, reward, next_state, done) 200 | self.check_train() 201 | 202 | 203 | def run_rl(a2c, env, max_frame): 204 | state = env.reset() 205 | done = False 206 | total_reward = 0 207 | current_frame = 0 208 | total_reward = 0 209 | 210 | while True: 211 | if done: # before starting another episode 212 | print(f'Episode done: cur_frame={current_frame} total_reward={total_reward}') 213 | total_reward = 0 214 | 215 | if current_frame < max_frame: # reset and continue 216 | state = env.reset() 217 | done = False 218 | 219 | if current_frame >= max_frame: # finish 220 | break 221 | 222 | action = a2c.act(state) 223 | next_state, reward, done, info = env.step(action) 224 | a2c.update(state, action, reward, next_state, done) 225 | state = next_state 226 | total_reward += reward 227 | 228 | def main(): 229 | env = gym.make("CartPole-v0") 230 | a2c = A2C(env) 231 | run_rl(a2c, env, max_frame=1000) 232 | 233 | if __name__ == '__main__': 234 | main() -------------------------------------------------------------------------------- /Code/DQN.py: -------------------------------------------------------------------------------- 1 | 2 | import gym 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | from torch import distributions 8 | 9 | ''' 10 | Chapter 4. DQN 11 | Most code here is copied from SLM-Lab first and then modified to show a plain torch implementation. 12 | ''' 13 | 14 | class DQN(nn.Module): 15 | def __init__(self, env): 16 | super(DQN, self).__init__() 17 | 18 | self.env = env 19 | in_dim = env.observation_space.shape[0] # 4 for CartPole 20 | out_dim = env.action_space.n # 2 for CardPole 21 | 22 | # Initialize the neural network used to learn the Q function 23 | layers = [ 24 | nn.Linear(in_dim, 64), 25 | nn.SELU(), 26 | nn.Linear(64, 32), 27 | nn.SELU(), 28 | nn.Linear(32, out_dim), 29 | ] 30 | self.model = nn.Sequential(*layers) 31 | self.train() 32 | 33 | # Optimizer - Adam with learning rate linear decay. 
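        # (LambdaLR multiplies the base lr by the lambda's return value on every lr_scheduler.step() call, so the effective learning rate decays linearly from 0.01 towards zero over learning_rate_max_steps scheduler steps and is then held at 0.01 / learning_rate_max_steps.)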
34 | self.learning_rate = 0.01 35 | self.learning_rate_max_steps = 10000 36 | self.optim = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate) 37 | self.lr_scheduler = torch.optim.lr_scheduler.LambdaLR(self.optim, 38 | lr_lambda=lambda x: 1 - x/self.learning_rate_max_steps if x < self.learning_rate_max_steps else 1/self.learning_rate_max_steps) 39 | # Gamma 40 | self.gamma = 0.99 41 | 42 | # Memory for batch 43 | self.data_keys = ['states', 'actions', 'rewards', 'next_states', 'dones'] 44 | self.memory_batch_size = 32 45 | self.memory_max_size = 10000 46 | self.memory_cur_size = 0 47 | self.memory_seen_size = 0 48 | self.memory_head = -1 49 | self.memory = {k: [None] * self.memory_max_size for k in self.data_keys} 50 | 51 | # Boltzmann policy 52 | self.boltzmann_tau_start = 5.0 53 | self.boltzmann_tau_end = 0.5 54 | self.boltzmann_tau_max_steps = 10000 55 | self.boltzmann_tau = self.boltzmann_tau_start 56 | 57 | # Training with Replay Experiences 58 | self.to_train = 0 59 | self.training_batch_iter = 8 60 | self.training_iter = 4 61 | self.training_frequency = 4 62 | self.training_start_step = 32 63 | self.current_training_step = 0 64 | 65 | # Frame 66 | self.current_frame = 0 67 | self.max_frame = 10000 68 | 69 | def act(self, state): 70 | state = torch.from_numpy(state.astype(np.float32)) 71 | action = self.boltzmann_policy(state) 72 | return action.item() 73 | 74 | def boltzmann_policy(self, state): 75 | ''' 76 | Boltzmann policy: adjust pdparam with temperature tau; 77 | the higher the more randomness/noise in action. 78 | ''' 79 | pdparam = self.model(state) 80 | pdparam /= self.boltzmann_tau 81 | action_pd = distributions.Categorical(logits=pdparam) 82 | return action_pd.sample() 83 | 84 | def sample(self): 85 | # Batch indices a sampled random uniformly among experiences in memory. 86 | batch_idxs = np.random.randint(self.memory_cur_size, size=self.memory_batch_size) 87 | 88 | # Create batch. 89 | batch = {k: [] for k in self.data_keys} 90 | for index in batch_idxs: 91 | for k in self.data_keys: 92 | batch[k].append(self.memory[k][index]) 93 | 94 | for k in batch: 95 | batch[k] = np.array(batch[k]) 96 | batch[k] = torch.from_numpy(batch[k].astype(np.float32)) 97 | 98 | return batch 99 | 100 | def calc_q_loss(self, batch): 101 | states = batch['states'] 102 | next_states = batch['next_states'] 103 | 104 | q_preds = self.model(states) 105 | with torch.no_grad(): 106 | next_q_preds = self.model(next_states) 107 | 108 | """ 109 | This is the gut of DQN implementation. 110 | : Q-value = Q(state, action) : NN(state) generates Q-values for all actions. 111 | 112 | We treat that our NN (self.model) generates Q-values for each action. For example, 113 | q_preds are just logits from NN and we assume logits[0] is the Q-value for action 0. 114 | Therefore, act_q_preds should be the Q-value (logit) of the action selected for the state. 115 | We can do this by selecting one of logits(q_preds) by using 'action' as an index. 116 | torch.gather(torch.tensor.gather) exactly does that. 117 | """ 118 | 119 | act_q_preds = q_preds.gather(-1, batch['actions'].long().unsqueeze(-1)).squeeze(-1) 120 | 121 | """ 122 | For SARSA, we calculate Q-values for the next action taken during the episode like the below. 123 | act_next_q_preds = next_q_preds.gather(-1, batch['next_actions'].long().unsqueeze(-1)).squeeze(-1) 124 | 125 | For DQN, it assumes there is a perfect policy and the next action should be always the best. 
126 | Thus, instead of taking the Q value of the next action, it just takes the maximum Q-value for 127 | the next state. This is why DQN is off-policy RL algorithm since it does not rely on the current 128 | policy (that is used to choose next action) while training. 129 | """ 130 | 131 | max_next_q_preds, _ = next_q_preds.max(dim=-1, keepdim=False) 132 | act_q_targets = batch['rewards'] + self.gamma * (1 - batch['dones']) * max_next_q_preds 133 | 134 | #print(f'act_q_preds: {act_q_preds}\nmax_next_q_preds: {max_next_q_preds}') 135 | 136 | # Let's use mean-squared-error loss function. 137 | loss = nn.MSELoss() 138 | q_loss = loss(act_q_preds, act_q_targets) 139 | return q_loss 140 | 141 | def check_train(self): 142 | if self.to_train == 1: 143 | 144 | for _ in range(self.training_iter): 145 | batch = self.sample() 146 | 147 | for _ in range(self.training_batch_iter): 148 | # Compute loss for the batch. 149 | loss = self.calc_q_loss(batch) 150 | 151 | # Compute gradients with backpropagation. 152 | self.optim.zero_grad() 153 | loss.backward() 154 | 155 | # Update NN parameters. 156 | self.optim.step() 157 | 158 | self.current_training_step += 1 159 | 160 | # Reset 161 | self.to_train = 0 162 | 163 | 164 | def update_memory(self, state, action, reward, next_state, done): 165 | """ 166 | Add this exp to memory. Since DQN is off-policy algorithm, we can reuse 167 | any experiences generated during training regardless of which policy (NN) 168 | is used. We will discard the oldest exp if there is no space to add new one. 169 | """ 170 | 171 | most_recent = (state, action, reward, next_state, done) 172 | self.memory_head = (self.memory_head + 1) % self.memory_max_size 173 | 174 | for idx, k in enumerate(self.data_keys): 175 | self.memory[k][self.memory_head] = most_recent[idx] 176 | 177 | if self.memory_cur_size < self.memory_max_size: 178 | self.memory_cur_size += 1 179 | 180 | self.memory_seen_size += 1 181 | 182 | self.to_train = self.memory_seen_size > self.training_start_step and self.memory_head % self.training_frequency == 0; 183 | 184 | def update_tau(self): 185 | # Simple linear decay 186 | if self.boltzmann_tau_max_steps <= self.current_frame: 187 | self.boltzmann_tau = self.boltzmann_tau_end 188 | return 189 | 190 | slope = (self.boltzmann_tau_end - self.boltzmann_tau_start) / (self.boltzmann_tau_max_steps - self.current_frame) 191 | self.boltzmann_tau = max(slope*self.current_frame + self.boltzmann_tau_start, self.boltzmann_tau_end) 192 | 193 | def update(self, state, action, reward, next_state, done): 194 | self.current_frame += 1 195 | self.update_memory(state, action, reward, next_state, done) 196 | self.check_train() 197 | self.update_tau() 198 | if (self.memory_seen_size > self.training_start_step): 199 | self.lr_scheduler.step() 200 | 201 | def run_rl(dqn, env): 202 | state = env.reset() 203 | done = False 204 | total_reward = 0 205 | while True: 206 | if done: # before starting another episode 207 | print(f'Episode done: cur_frame={dqn.current_frame} current_training_step={dqn.current_training_step} total_reward={total_reward}') 208 | total_reward = 0 209 | 210 | if dqn.current_frame < dqn.max_frame: # reset and continue 211 | state = env.reset() 212 | done = False 213 | 214 | if dqn.current_frame >= dqn.max_frame: # finish 215 | break 216 | 217 | action = dqn.act(state) 218 | next_state, reward, done, info = env.step(action) 219 | dqn.update(state, action, reward, next_state, done) 220 | state = next_state 221 | total_reward += reward 222 | 223 | def main(): 224 | env = 
gym.make("CartPole-v0") 225 | dqn = DQN(env) 226 | run_rl(dqn, env) 227 | 228 | if __name__ == '__main__': 229 | main() -------------------------------------------------------------------------------- /Code/A2C_GAE.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.distributions import Categorical 3 | import gym 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | from torch import distributions 9 | import random 10 | 11 | ''' 12 | Chapter 6. Advantage Actor-Critic (A2C) 13 | Most code here is copied from SLM-Lab first and then modified to show a plain torch implementation. 14 | 15 | This exmaple will create one shared network for actor and critic. For advantage estimation, 16 | it will use GAE (Generalized Advantage Estimation) method. 17 | ''' 18 | 19 | class A2C(nn.Module): 20 | def __init__(self, env): 21 | super(A2C, self).__init__() 22 | 23 | self.env = env 24 | in_dim = env.observation_space.shape[0] # 4 for CartPole 25 | out_dim = env.action_space.n # 2 for CardPole 26 | 27 | # Initialize the neural networks between Actor and Critic. 28 | # We will use a shared NN between actor and critic for this example. 29 | 30 | # Shared NN 31 | shared_layres = [ 32 | nn.Linear(in_dim, 64), 33 | nn.ReLU(), 34 | ] 35 | self.shared_model = nn.Sequential(*shared_layres) 36 | 37 | # Actor 38 | actor_layers = [ 39 | nn.Linear(64, out_dim), 40 | ] 41 | self.actor_model = nn.Sequential(*actor_layers) 42 | self.actor_policy_loss_coef = 1.0 43 | self.actor_entropy_coef = 0.001 44 | 45 | # Critic 46 | critic_layers = [ 47 | nn.Linear(64, 1), 48 | ] 49 | self.critic_model = nn.Sequential(*critic_layers) 50 | self.critic_val_loss_coef = 0.5 51 | 52 | # Optimizer for all three models 53 | params = [] 54 | params += self.shared_model.parameters() 55 | params += self.actor_model.parameters() 56 | params += self.critic_model.parameters() 57 | self.optim = torch.optim.RMSprop(params, lr=0.01) 58 | 59 | self.train() 60 | 61 | # Gamma 62 | self.gamma = 0.99 63 | 64 | # Memory for batch 65 | self.data_keys = ['states', 'actions', 'rewards', 'next_states', 'dones'] 66 | self.memory = {k: [] for k in self.data_keys} 67 | 68 | # GAE 69 | self.gae_lambda = 0.95 70 | 71 | # Training 72 | self.to_train = 0 73 | self.training_frequency = 32 74 | 75 | def act(self, state): 76 | """ 77 | - Probability distribution = Policy(state) : NN(state) generates v value for all actions given a state. 78 | - They are actually just logits which are not normalized, unlike probabilities that sum up to 1. 79 | - Categorical() will sample action based on these logits by using Softmax. 80 | - Softmax - https://miro.medium.com/max/875/1*ReYpdIZ3ZSAPb2W8cJpkBg.jpeg 81 | - Categorical() also provides log(action_probability) that we need for calculating loss. 82 | """ 83 | x = torch.from_numpy(state.astype(np.float32)) # to tensor 84 | x = self.shared_model(x) 85 | pdparam = self.actor_model(x) # forward pass 86 | 87 | pd = Categorical(logits=pdparam) # probability distribution 88 | action = pd.sample() # pi(a|s) in action via pd 89 | return action.item() 90 | 91 | def sample(self): 92 | # Create batch 93 | batch = {k: self.memory[k] for k in self.data_keys} 94 | 95 | for k in batch: 96 | batch[k] = np.array(batch[k]) 97 | batch[k] = torch.from_numpy(batch[k].astype(np.float32)) 98 | 99 | return batch 100 | 101 | def update_memory(self, state, action, reward, next_state, done): 102 | # Add this exp to memory. 
103 | most_recent = (state, action, reward, next_state, done) 104 | for idx, k in enumerate(self.data_keys): 105 | self.memory[k].append(most_recent[idx]) 106 | 107 | # If it has collected the desired number of experiences, it is ready to train. 108 | if len(self.memory['states']) == self.training_frequency: 109 | self.to_train = 1 110 | 111 | def calc_v(self, states): 112 | ''' 113 | Forward-pass to calculate the predicted state-value from critic_net. 114 | ''' 115 | x = self.shared_model(states) 116 | return self.critic_model(x).view(-1) 117 | 118 | def calc_pdparam_v(self, batch): 119 | '''Efficiently forward to get pdparam and v by batch for loss computation''' 120 | states = batch['states'] 121 | 122 | x = self.shared_model(states) 123 | pdparam = self.actor_model(x) 124 | 125 | v_pred = self.calc_v(states) 126 | return pdparam, v_pred 127 | 128 | def calc_gaes(self, rewards, dones, v_preds): 129 | ''' 130 | Estimate the advantages using GAE from Schulman et al. https://arxiv.org/pdf/1506.02438.pdf 131 | v_preds are values predicted for current states, with one last element as the final next_state 132 | delta is defined as r + gamma * V(s') - V(s) in eqn 10 133 | GAE is defined in eqn 16 134 | This method computes in torch tensor to prevent unnecessary moves between devices (e.g. GPU tensor to CPU numpy) 135 | NOTE any standardization is done outside of this method 136 | ''' 137 | T = len(rewards) 138 | assert T + 1 == len(v_preds), f'T+1: {T+1} v.s. v_preds.shape: {v_preds.shape}' # v_preds runs into t+1 139 | gaes = torch.zeros_like(rewards) 140 | future_gae = torch.tensor(0.0, dtype=rewards.dtype) 141 | not_dones = 1 - dones # to reset at episode boundary by multiplying 0 142 | deltas = rewards + self.gamma * v_preds[1:] * not_dones - v_preds[:-1] 143 | coef = self.gamma * self.gae_lambda 144 | for t in reversed(range(T)): 145 | gaes[t] = future_gae = deltas[t] + coef * not_dones[t] * future_gae 146 | return gaes 147 | 148 | def calc_gae_advs_v_targets(self, batch, v_preds): 149 | ''' 150 | Calculate GAE, and advs = GAE, v_targets = advs + v_preds 151 | See GAE from Schulman et al. 
https://arxiv.org/pdf/1506.02438.pdf 152 | ''' 153 | next_states = batch['next_states'][-1] 154 | next_states = next_states.unsqueeze(dim=0) 155 | 156 | with torch.no_grad(): 157 | next_v_pred = self.calc_v(next_states) 158 | 159 | v_preds = v_preds.detach() # adv does not accumulate grad 160 | v_preds_all = torch.cat((v_preds, next_v_pred), dim=0) 161 | advs = self.calc_gaes(batch['rewards'], batch['dones'], v_preds_all) 162 | v_targets = advs + v_preds 163 | advs = (advs - advs.mean()) / (advs.std() + 1e-08) # standardize only for advs, not v_targets 164 | 165 | #print(f'advs: {advs}\nv_targets: {v_targets}') 166 | return advs, v_targets 167 | 168 | 169 | def calc_policy_loss(self, batch, pdparams, advs): 170 | '''Calculate the actor's policy loss''' 171 | action_pd = Categorical(logits=pdparams) # probability distribution 172 | actions = batch['actions'] 173 | log_probs = action_pd.log_prob(actions) 174 | policy_loss = -self.actor_policy_loss_coef * (log_probs * advs).mean() 175 | 176 | # Entropy Regularization 177 | entropy = action_pd.entropy().mean() 178 | policy_loss += (-self.actor_entropy_coef * entropy) 179 | 180 | #print(f'Actor policy loss: {policy_loss:g}') 181 | return policy_loss 182 | 183 | def calc_val_loss(self, v_preds, v_targets): 184 | '''Calculate the critic's value loss''' 185 | assert v_preds.shape == v_targets.shape, f'{v_preds.shape} != {v_targets.shape}' 186 | 187 | # Let's use mean-squared-error loss function. 188 | loss = nn.MSELoss() 189 | val_loss = self.critic_val_loss_coef * loss(v_preds, v_targets) 190 | #print(f'Critic value loss: {val_loss:g}') 191 | return val_loss 192 | 193 | def check_train(self): 194 | if self.to_train == 1: 195 | batch = self.sample() 196 | pdparams, v_preds = self.calc_pdparam_v(batch) 197 | advs, v_targets = self.calc_gae_advs_v_targets(batch, v_preds) 198 | policy_loss = self.calc_policy_loss(batch, pdparams, advs) # from actor 199 | val_loss = self.calc_val_loss(v_preds, v_targets) # from critic 200 | 201 | loss = policy_loss + val_loss 202 | 203 | self.optim.zero_grad() 204 | loss.backward() 205 | self.optim.step() 206 | 207 | # Reset- A2C is an on-policy algorithm so we cannot reuse data for next training. 
208 | self.to_train = 0 209 | self.memory = {k: [] for k in self.data_keys} 210 | 211 | def update(self, state, action, reward, next_state, done): 212 | self.update_memory(state, action, reward, next_state, done) 213 | self.check_train() 214 | 215 | 216 | def run_rl(a2c, env, max_frame): 217 | state = env.reset() 218 | done = False 219 | total_reward = 0 220 | current_frame = 0 221 | total_reward = 0 222 | 223 | while True: 224 | if done: # before starting another episode 225 | print(f'Episode done: cur_frame={current_frame} total_reward={total_reward}') 226 | total_reward = 0 227 | 228 | if current_frame < max_frame: # reset and continue 229 | state = env.reset() 230 | done = False 231 | 232 | if current_frame >= max_frame: # finish 233 | break 234 | 235 | action = a2c.act(state) 236 | next_state, reward, done, info = env.step(action) 237 | a2c.update(state, action, reward, next_state, done) 238 | state = next_state 239 | total_reward += reward 240 | 241 | def main(): 242 | env = gym.make("CartPole-v0") 243 | a2c = A2C(env) 244 | run_rl(a2c, env, max_frame=1000) 245 | 246 | if __name__ == '__main__': 247 | main() -------------------------------------------------------------------------------- /Code/DoubleDQN.py: -------------------------------------------------------------------------------- 1 | 2 | import gym 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | from torch import distributions 8 | 9 | ''' 10 | Chapter 5. Improving DQN 11 | Most code here is copied from SLM-Lab first and then modified to show a plain torch implementation. 12 | 13 | This is for using two different networs to solve the original DQN's issues: overestimation of Q and moving target. 14 | ''' 15 | 16 | class DoubleDQN(nn.Module): 17 | def __init__(self, env): 18 | super(DoubleDQN, self).__init__() 19 | 20 | self.env = env 21 | in_dim = env.observation_space.shape[0] # 4 for CartPole 22 | out_dim = env.action_space.n # 2 for CardPole 23 | 24 | # Initialize the neural network used to learn the Q function 25 | layers = [ 26 | nn.Linear(in_dim, 64), 27 | nn.SELU(), 28 | nn.Linear(64, 32), 29 | nn.SELU(), 30 | nn.Linear(32, out_dim), 31 | ] 32 | self.model = nn.Sequential(*layers) 33 | self.target_model = nn.Sequential(*layers) 34 | self.model_update_frequency = 1000 35 | self.train() 36 | 37 | # Optimizer - Adam with learning rate linear decay. 
38 | self.learning_rate = 0.01 39 | self.learning_rate_max_steps = 10000 40 | self.optim = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate) 41 | self.lr_scheduler = torch.optim.lr_scheduler.LambdaLR(self.optim, 42 | lr_lambda=lambda x: 1 - x/self.learning_rate_max_steps if x < self.learning_rate_max_steps else 1/self.learning_rate_max_steps) 43 | # Gamma 44 | self.gamma = 0.99 45 | 46 | # Memory for batch 47 | self.data_keys = ['states', 'actions', 'rewards', 'next_states', 'dones'] 48 | self.memory_batch_size = 32 49 | self.memory_max_size = 10000 50 | self.memory_cur_size = 0 51 | self.memory_seen_size = 0 52 | self.memory_head = -1 53 | self.memory = {k: [None] * self.memory_max_size for k in self.data_keys} 54 | 55 | # Boltzmann policy 56 | self.boltzmann_tau_start = 5.0 57 | self.boltzmann_tau_end = 0.5 58 | self.boltzmann_tau_max_steps = 10000 59 | self.boltzmann_tau = self.boltzmann_tau_start 60 | 61 | # Training with Replay Experiences 62 | self.to_train = 0 63 | self.training_batch_iter = 8 64 | self.training_iter = 4 65 | self.training_frequency = 4 66 | self.training_start_step = 32 67 | self.current_training_step = 0 68 | 69 | # Frame 70 | self.current_frame = 0 71 | self.max_frame = 10000 72 | 73 | def act(self, state): 74 | state = torch.from_numpy(state.astype(np.float32)) 75 | action = self.boltzmann_policy(state) 76 | return action.item() 77 | 78 | def boltzmann_policy(self, state): 79 | ''' 80 | Boltzmann policy: adjust pdparam with temperature tau; 81 | the higher the more randomness/noise in action. 82 | ''' 83 | pdparam = self.model(state) 84 | pdparam /= self.boltzmann_tau 85 | action_pd = distributions.Categorical(logits=pdparam) 86 | return action_pd.sample() 87 | 88 | def sample(self): 89 | # Batch indices a sampled random uniformly among experiences in memory. 90 | batch_idxs = np.random.randint(self.memory_cur_size, size=self.memory_batch_size) 91 | 92 | # Create batch. 93 | batch = {k: [] for k in self.data_keys} 94 | for index in batch_idxs: 95 | for k in self.data_keys: 96 | batch[k].append(self.memory[k][index]) 97 | 98 | for k in batch: 99 | batch[k] = np.array(batch[k]) 100 | batch[k] = torch.from_numpy(batch[k].astype(np.float32)) 101 | 102 | return batch 103 | 104 | def calc_q_loss(self, batch): 105 | states = batch['states'] 106 | next_states = batch['next_states'] 107 | 108 | q_preds = self.model(states) 109 | 110 | """ 111 | This is the gut of Double DQN implementation. 112 | : We use two different models (networks) to reduce the overestimation of Q-value. 113 | : One for selecting the Q-maximizing action, a`, and another for the Q value of a` given next state, s`. 114 | : self.model is representing the first model and self.target_model is the second model here. 115 | : 116 | : The second model also acts as a target network that helps with stabilizing learning by reducing 117 | : a moving target issue (which is Q_target that keeps changing if a single model (network) is used). 118 | """ 119 | 120 | with torch.no_grad(): 121 | online_next_q_preds = self.model(next_states) 122 | next_q_preds = self.target_model(next_states) 123 | 124 | act_q_preds = q_preds.gather(-1, batch['actions'].long().unsqueeze(-1)).squeeze(-1) 125 | 126 | """ 127 | For SARSA, we calculate Q-values for the next action taken during the episode like the below. 
128 | 129 | act_next_q_preds = next_q_preds.gather(-1, batch['next_actions'].long().unsqueeze(-1)).squeeze(-1) 130 | 131 | For DQN, it assumes there is a perfect policy and the next action should be always the best. 132 | Thus, instead of taking the Q value of the next action, it just takes the maximum Q-value for 133 | the next state. This is why DQN is off-policy RL algorithm since it does not rely on the current 134 | policy (that is used to choose next action) while training. 135 | 136 | max_next_q_preds, _ = next_q_preds.max(dim=-1, keepdim=False) 137 | 138 | For Double DQN, we use the first model (self.model) to choose the Q-maximizing action (online_action). 139 | Then use the second model (self.target_model) to get the Q value of a` and s`. 140 | """ 141 | 142 | online_actions = online_next_q_preds.argmax(dim=-1, keepdim=True) 143 | max_next_q_preds = next_q_preds.gather(-1, online_actions).squeeze(-1) 144 | 145 | act_q_targets = batch['rewards'] + self.gamma * (1 - batch['dones']) * max_next_q_preds 146 | 147 | #print(f'act_q_preds: {act_q_preds}\nmax_next_q_preds: {max_next_q_preds}') 148 | 149 | # Let's use mean-squared-error loss function. 150 | loss = nn.MSELoss() 151 | q_loss = loss(act_q_preds, act_q_targets) 152 | return q_loss 153 | 154 | def check_train(self): 155 | if self.to_train == 1: 156 | 157 | for _ in range(self.training_iter): 158 | batch = self.sample() 159 | 160 | for _ in range(self.training_batch_iter): 161 | # Compute loss for the batch. 162 | loss = self.calc_q_loss(batch) 163 | 164 | # Compute gradients with backpropagation. 165 | self.optim.zero_grad() 166 | loss.backward() 167 | 168 | # Update NN parameters. 169 | self.optim.step() 170 | 171 | self.current_training_step += 1 172 | 173 | # Reset 174 | self.to_train = 0 175 | 176 | 177 | def update_memory(self, state, action, reward, next_state, done): 178 | """ 179 | Add this exp to memory. Since DQN is off-policy algorithm, we can reuse 180 | any experiences generated during training regardless of which policy (NN) 181 | is used. We will discard the oldest exp if there is no space to add new one. 
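        Illustrative: with memory_max_size = 10000, once the buffer is full the next experience wraps around (memory_head returns to slot 0 via the modulo below) and overwrites the oldest entry, giving a simple circular replay buffer.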
182 | """ 183 | 184 | most_recent = (state, action, reward, next_state, done) 185 | self.memory_head = (self.memory_head + 1) % self.memory_max_size 186 | 187 | for idx, k in enumerate(self.data_keys): 188 | self.memory[k][self.memory_head] = most_recent[idx] 189 | 190 | if self.memory_cur_size < self.memory_max_size: 191 | self.memory_cur_size += 1 192 | 193 | self.memory_seen_size += 1 194 | 195 | self.to_train = self.memory_seen_size > self.training_start_step and self.memory_head % self.training_frequency == 0; 196 | 197 | def update_tau(self): 198 | # Simple linear decay 199 | if self.boltzmann_tau_max_steps <= self.current_frame: 200 | self.boltzmann_tau = self.boltzmann_tau_end 201 | return 202 | 203 | slope = (self.boltzmann_tau_end - self.boltzmann_tau_start) / (self.boltzmann_tau_max_steps - self.current_frame) 204 | self.boltzmann_tau = max(slope*self.current_frame + self.boltzmann_tau_start, self.boltzmann_tau_end) 205 | 206 | def update_models(self): 207 | if self.current_frame % self.model_update_frequency == 0: 208 | # replace 209 | self.target_model.load_state_dict(self.model.state_dict()) 210 | 211 | def update(self, state, action, reward, next_state, done): 212 | self.current_frame += 1 213 | self.update_memory(state, action, reward, next_state, done) 214 | self.check_train() 215 | self.update_tau() 216 | self.update_models() 217 | if (self.memory_seen_size > self.training_start_step): 218 | self.lr_scheduler.step() 219 | 220 | def run_rl(dqn, env): 221 | state = env.reset() 222 | done = False 223 | total_reward = 0 224 | while True: 225 | if done: # before starting another episode 226 | print(f'Episode done: cur_frame={dqn.current_frame} current_training_step={dqn.current_training_step} total_reward={total_reward}') 227 | total_reward = 0 228 | 229 | if dqn.current_frame < dqn.max_frame: # reset and continue 230 | state = env.reset() 231 | done = False 232 | 233 | if dqn.current_frame >= dqn.max_frame: # finish 234 | break 235 | 236 | action = dqn.act(state) 237 | next_state, reward, done, info = env.step(action) 238 | dqn.update(state, action, reward, next_state, done) 239 | state = next_state 240 | total_reward += reward 241 | 242 | def main(): 243 | env = gym.make("CartPole-v0") 244 | dqn = DoubleDQN(env) 245 | run_rl(dqn, env) 246 | 247 | if __name__ == '__main__': 248 | main() -------------------------------------------------------------------------------- /Code/PPO.py: -------------------------------------------------------------------------------- 1 | 2 | from copy import deepcopy 3 | from torch.distributions import Categorical 4 | import gym 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | import torch.optim as optim 9 | from torch import distributions 10 | import random 11 | 12 | ''' 13 | Chapter 7. Proximal Policy Optimization (PPO) 14 | Most code here is copied from SLM-Lab first and then modified to show a plain torch implementation. 15 | 16 | For this example, we are implementing PPO with clipped surrogate objective with two seperate networks for actor and critic. 17 | ''' 18 | 19 | class PPO(nn.Module): 20 | def __init__(self, env): 21 | super(PPO, self).__init__() 22 | 23 | self.env = env 24 | in_dim = env.observation_space.shape[0] # 4 for CartPole 25 | out_dim = env.action_space.n # 2 for CardPole 26 | 27 | # Initialize the neural networks for Actor and Critic 28 | # We do not share NN between actor and critic for this example. 
29 | 30 | # Actor 31 | actor_layers = [ 32 | nn.Linear(in_dim, 64), 33 | nn.ReLU(), 34 | nn.Linear(64, out_dim), 35 | ] 36 | self.actor_model = nn.Sequential(*actor_layers) 37 | self.actor_optim = torch.optim.Adam(self.actor_model.parameters(), lr=0.01) 38 | self.actor_policy_loss_coef = 1.0 39 | self.actor_entropy_coef = 0.001 40 | 41 | # Critic 42 | critic_layers = [ 43 | nn.Linear(in_dim, 64), 44 | nn.ReLU(), 45 | nn.Linear(64, 1), 46 | ] 47 | self.critic_model = nn.Sequential(*critic_layers) 48 | self.critic_optim = torch.optim.Adam(self.critic_model.parameters(), lr=0.01) 49 | self.critic_val_loss_coef = 1.0 50 | 51 | # Create old net to calculate ratio 52 | self.old_actor_model = deepcopy(self.actor_model) 53 | 54 | self.train() 55 | 56 | # Gamma 57 | self.gamma = 0.99 58 | 59 | # Memory for batch 60 | self.data_keys = ['states', 'actions', 'rewards', 'next_states', 'dones'] 61 | self.memory = {k: [] for k in self.data_keys} 62 | 63 | # PPO uses GAE 64 | self.gae_lambda = 0.95 65 | self.clip_eps = 0.2 66 | 67 | # Training 68 | self.to_train = 0 69 | self.training_frequency = 32 70 | self.training_epoch = 4 71 | 72 | def act(self, state): 73 | """ 74 | - Probability distribution = Policy(state) : NN(state) generates v value for all actions given a state. 75 | - They are actually just logits which are not normalized, unlike probabilities that sum up to 1. 76 | - Categorical() will sample action based on these logits by using Softmax. 77 | - Softmax - https://miro.medium.com/max/875/1*ReYpdIZ3ZSAPb2W8cJpkBg.jpeg 78 | - Categorical() also provides log(action_probability) that we need for calculating loss. 79 | """ 80 | x = torch.from_numpy(state.astype(np.float32)) # to tensor 81 | pdparam = self.actor_model(x) # forward pass 82 | 83 | pd = Categorical(logits=pdparam) # probability distribution 84 | action = pd.sample() # pi(a|s) in action via pd 85 | return action.item() 86 | 87 | def sample(self): 88 | # Create batch 89 | batch = {k: self.memory[k] for k in self.data_keys} 90 | 91 | for k in batch: 92 | batch[k] = np.array(batch[k]) 93 | batch[k] = torch.from_numpy(batch[k].astype(np.float32)) 94 | 95 | return batch 96 | 97 | def update_memory(self, state, action, reward, next_state, done): 98 | # Add this exp to memory. 99 | most_recent = (state, action, reward, next_state, done) 100 | for idx, k in enumerate(self.data_keys): 101 | self.memory[k].append(most_recent[idx]) 102 | 103 | # If it has collected the desired number of experiences, it is ready to train. 104 | if len(self.memory['states']) == self.training_frequency: 105 | self.to_train = 1 106 | 107 | def calc_v(self, x): 108 | ''' 109 | Forward-pass to calculate the predicted state-value from critic_net. 110 | ''' 111 | return self.critic_model(x).view(-1) 112 | 113 | def calc_pdparam_v(self, batch): 114 | '''Efficiently forward to get pdparam and v by batch for loss computation''' 115 | states = batch['states'] 116 | pdparam = self.actor_model(states) 117 | v_pred = self.calc_v(states) 118 | return pdparam, v_pred 119 | 120 | 121 | def calc_gaes(self, rewards, dones, v_preds): 122 | ''' 123 | Estimate the advantages using GAE from Schulman et al. https://arxiv.org/pdf/1506.02438.pdf 124 | v_preds are values predicted for current states, with one last element as the final next_state 125 | delta is defined as r + gamma * V(s') - V(s) in eqn 10 126 | GAE is defined in eqn 16 127 | This method computes in torch tensor to prevent unnecessary moves between devices (e.g. 
GPU tensor to CPU numpy) 128 | NOTE any standardization is done outside of this method 129 | ''' 130 | T = len(rewards) 131 | assert T + 1 == len(v_preds), f'T+1: {T+1} v.s. v_preds.shape: {v_preds.shape}' # v_preds runs into t+1 132 | gaes = torch.zeros_like(rewards) 133 | future_gae = torch.tensor(0.0, dtype=rewards.dtype) 134 | not_dones = 1 - dones # to reset at episode boundary by multiplying 0 135 | deltas = rewards + self.gamma * v_preds[1:] * not_dones - v_preds[:-1] 136 | coef = self.gamma * self.gae_lambda 137 | for t in reversed(range(T)): 138 | gaes[t] = future_gae = deltas[t] + coef * not_dones[t] * future_gae 139 | return gaes 140 | 141 | def calc_gae_advs_v_targets(self, batch, v_preds): 142 | ''' 143 | Calculate GAE, and advs = GAE, v_targets = advs + v_preds 144 | See GAE from Schulman et al. https://arxiv.org/pdf/1506.02438.pdf 145 | ''' 146 | next_states = batch['next_states'][-1] 147 | next_states = next_states.unsqueeze(dim=0) 148 | 149 | with torch.no_grad(): 150 | next_v_pred = self.calc_v(next_states) 151 | 152 | v_preds = v_preds.detach() # adv does not accumulate grad 153 | v_preds_all = torch.cat((v_preds, next_v_pred), dim=0) 154 | advs = self.calc_gaes(batch['rewards'], batch['dones'], v_preds_all) 155 | v_targets = advs + v_preds 156 | advs = (advs - advs.mean()) / (advs.std() + 1e-08) # standardize only for advs, not v_targets 157 | 158 | #print(f'advs: {advs}\nv_targets: {v_targets}') 159 | return advs, v_targets 160 | 161 | def calc_policy_loss_a2c(self, batch, pdparams, advs): 162 | '''Calculate the actor's plain A2C policy loss. Not used during PPO training; kept for comparison with the clipped PPO loss defined below.''' 163 | action_pd = Categorical(logits=pdparams) # probability distribution 164 | actions = batch['actions'] 165 | log_probs = action_pd.log_prob(actions) 166 | policy_loss = -self.actor_policy_loss_coef * (log_probs * advs).mean() 167 | 168 | # Entropy Regularization 169 | entropy = action_pd.entropy().mean() 170 | policy_loss += (-self.actor_entropy_coef * entropy) 171 | 172 | #print(f'Actor policy loss: {policy_loss:g}') 173 | return policy_loss 174 | 175 | def calc_policy_loss(self, batch, pdparams, advs): 176 | ''' 177 | The PPO loss function (subscript t is omitted) 178 | L^{CLIP+VF+S} = E[ L^CLIP - c1 * L^VF + c2 * H[pi](s) ] 179 | 180 | Breaking it down piecewise, 181 | 1. L^CLIP = E[ min(ratio * A, clip(ratio, 1-eps, 1+eps) * A) ] 182 | where ratio = pi(a|s) / pi_old(a|s) 183 | 184 | 2. L^VF = E[ mse(V(s_t), V^target) ] 185 | 186 | 3. 
H = E[ entropy ] 187 | ''' 188 | 189 | action_pd = Categorical(logits=pdparams) # probability distribution 190 | states = batch['states'] 191 | actions = batch['actions'] 192 | 193 | # L^CLIP 194 | log_probs = action_pd.log_prob(actions) 195 | with torch.no_grad(): 196 | old_pdparams = self.old_actor_model(states) 197 | old_action_pd = Categorical(logits=old_pdparams) # probability distribution 198 | old_log_probs = old_action_pd.log_prob(actions) 199 | assert log_probs.shape == old_log_probs.shape 200 | ratios = torch.exp(log_probs - old_log_probs) 201 | #print(f'ratios: {ratios}') 202 | sur_1 = ratios * advs 203 | sur_2 = torch.clamp(ratios, 1.0 - self.clip_eps, 1.0 + self.clip_eps) * advs 204 | # flip sign because need to maximize 205 | clip_loss = -torch.min(sur_1, sur_2).mean() 206 | #print(f'clip_loss: {clip_loss}') 207 | 208 | # L^VF is computed separately in calc_val_loss() and optimized by the critic 209 | 210 | # H entropy regularization 211 | entropy = action_pd.entropy().mean() 212 | ent_penalty = -self.actor_entropy_coef * entropy 213 | #print(f'ent_penalty: {ent_penalty}') 214 | 215 | policy_loss = clip_loss + ent_penalty 216 | #print(f'PPO Actor policy loss: {policy_loss:g}') 217 | return policy_loss 218 | 219 | 220 | def calc_val_loss(self, v_preds, v_targets): 221 | '''Calculate the critic's value loss''' 222 | assert v_preds.shape == v_targets.shape, f'{v_preds.shape} != {v_targets.shape}' 223 | 224 | # Let's use mean-squared-error loss function. 225 | loss = nn.MSELoss() 226 | val_loss = self.critic_val_loss_coef * loss(v_preds, v_targets) 227 | #print(f'Critic value loss: {val_loss:g}') 228 | return val_loss 229 | 230 | def check_train(self): 231 | if self.to_train == 1: 232 | # update old net 233 | self.old_actor_model.load_state_dict(self.actor_model.state_dict()) 234 | 235 | batch = self.sample() 236 | pdparams, v_preds = self.calc_pdparam_v(batch) 237 | advs, v_targets = self.calc_gae_advs_v_targets(batch, v_preds) 238 | 239 | for _ in range(self.training_epoch): 240 | pdparams, v_preds = self.calc_pdparam_v(batch) 241 | policy_loss = self.calc_policy_loss(batch, pdparams, advs) # from actor 242 | val_loss = self.calc_val_loss(v_preds, v_targets) # from critic 243 | 244 | # actor update 245 | self.actor_optim.zero_grad() 246 | policy_loss.backward() 247 | self.actor_optim.step() 248 | 249 | # critic update 250 | self.critic_optim.zero_grad() 251 | val_loss.backward() 252 | self.critic_optim.step() 253 | 254 | # reset 255 | self.to_train = 0 256 | self.memory = {k: [] for k in self.data_keys} 257 | 258 | def update(self, state, action, reward, next_state, done): 259 | self.update_memory(state, action, reward, next_state, done) 260 | self.check_train() 261 | 262 | 263 | def run_rl(ppo, env, max_frame): 264 | state = env.reset() 265 | done = False 266 | total_reward = 0 267 | current_frame = 0 268 | 269 | 270 | while True: 271 | if done: # before starting another episode 272 | print(f'Episode done: cur_frame={current_frame} total_reward={total_reward}') 273 | total_reward = 0 274 | 275 | if current_frame < max_frame: # reset and continue 276 | state = env.reset() 277 | done = False 278 | 279 | if current_frame >= max_frame: # finish 280 | break 281 | current_frame += 1 # count frames so the loop can terminate at max_frame 282 | action = ppo.act(state) 283 | next_state, reward, done, info = env.step(action) 284 | ppo.update(state, action, reward, next_state, done) 285 | state = next_state 286 | total_reward += reward 287 | 288 | def main(): 289 | env = gym.make("CartPole-v0") 290 | ppo = PPO(env) 291 | run_rl(ppo, env, max_frame=1000) 292 | 293 | if __name__ == 
'__main__': 294 | main() -------------------------------------------------------------------------------- /Code/DoubleDQN_PER.py: -------------------------------------------------------------------------------- 1 | 2 | import gym 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | from torch import distributions 8 | import random 9 | from copy import deepcopy 10 | ''' 11 | Chapter 5. Improving DQN 12 | Most code here is copied from SLM-Lab first and then modified to show a plain torch implementation. 13 | 14 | This introduces PER (Prioritized Experience Replay) to choose the most beneficial samples to train on. 15 | ''' 16 | 17 | class SumTree: 18 | ''' 19 | Helper class for PrioritizedReplay 20 | 21 | This implementation is, with minor adaptations, Jaromír Janisch's. The license is reproduced below. 22 | For more information see his excellent blog series "Let's make a DQN" https://jaromiru.com/2016/09/27/lets-make-a-dqn-theory/ 23 | 24 | MIT License 25 | 26 | Copyright (c) 2018 Jaromír Janisch 27 | 28 | Permission is hereby granted, free of charge, to any person obtaining a copy 29 | of this software and associated documentation files (the "Software"), to deal 30 | in the Software without restriction, including without limitation the rights 31 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 32 | copies of the Software, and to permit persons to whom the Software is 33 | furnished to do so, subject to the following conditions: 34 | ''' 35 | write = 0 36 | 37 | def __init__(self, capacity): 38 | self.capacity = capacity 39 | self.tree = np.zeros(2 * capacity - 1) # Stores the priorities and sums of priorities 40 | self.indices = np.zeros(capacity) # Stores the indices of the experiences 41 | 42 | def _propagate(self, idx, change): 43 | parent = (idx - 1) // 2 44 | 45 | self.tree[parent] += change 46 | 47 | if parent != 0: 48 | self._propagate(parent, change) 49 | 50 | def _retrieve(self, idx, s): 51 | left = 2 * idx + 1 52 | right = left + 1 53 | 54 | if left >= len(self.tree): 55 | return idx 56 | 57 | if s <= self.tree[left]: 58 | return self._retrieve(left, s) 59 | else: 60 | return self._retrieve(right, s - self.tree[left]) 61 | 62 | def total(self): 63 | return self.tree[0] 64 | 65 | def add(self, p, index): 66 | idx = self.write + self.capacity - 1 67 | 68 | self.indices[self.write] = index 69 | self.update(idx, p) 70 | 71 | self.write += 1 72 | if self.write >= self.capacity: 73 | self.write = 0 74 | 75 | def update(self, idx, p): 76 | change = p - self.tree[idx] 77 | 78 | self.tree[idx] = p 79 | self._propagate(idx, change) 80 | 81 | def get(self, s): 82 | assert s <= self.total() 83 | idx = self._retrieve(0, s) 84 | indexIdx = idx - self.capacity + 1 85 | 86 | return (idx, self.tree[idx], self.indices[indexIdx]) 87 | 88 | def print_tree(self): 89 | for i in range(len(self.indices)): 90 | j = i + self.capacity - 1 91 | print(f'Idx: {i}, Data idx: {self.indices[i]}, Prio: {self.tree[j]}') 92 | 93 | 94 | class DoubleDQN_PER(nn.Module): 95 | def __init__(self, env): 96 | super(DoubleDQN_PER, self).__init__() 97 | 98 | self.env = env 99 | in_dim = env.observation_space.shape[0] # 4 for CartPole 100 | out_dim = env.action_space.n # 2 for CartPole 101 | 102 | # Initialize the neural network used to learn the Q function 103 | layers = [ 104 | nn.Linear(in_dim, 64), 105 | nn.SELU(), 106 | nn.Linear(64, 32), 107 | nn.SELU(), 108 | nn.Linear(32, out_dim), 109 | ] 110 | self.model = nn.Sequential(*layers) 111 | self.target_model = deepcopy(self.model) # independent copy so the target network can lag the online network; was: 
nn.Sequential(*layers) 112 | self.model_update_frequency = 1000 113 | self.train() 114 | 115 | # Optimizer - Adam with learning rate linear decay. 116 | self.learning_rate = 0.01 117 | self.learning_rate_max_steps = 10000 118 | self.optim = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate) 119 | self.lr_scheduler = torch.optim.lr_scheduler.LambdaLR(self.optim, 120 | lr_lambda=lambda x: 1 - x/self.learning_rate_max_steps if x < self.learning_rate_max_steps else 1/self.learning_rate_max_steps) 121 | # Gamma 122 | self.gamma = 0.99 123 | 124 | # Memory for batch 125 | # adds a 'priorities' scalar to the data_keys for PER. 126 | self.data_keys = ['states', 'actions', 'rewards', 'next_states', 'dones', 'priorities'] 127 | self.memory_batch_size = 32 128 | self.memory_max_size = 10000 129 | self.memory_cur_size = 0 130 | self.memory_seen_size = 0 131 | self.memory_head = -1 132 | self.memory = {k: [None] * self.memory_max_size for k in self.data_keys} 133 | 134 | # PER 135 | self.batch_idxs = None 136 | self.tree_idxs = None 137 | self.alpha = 0.6 # 138 | self.epsilon = 0.0001 # a small positive number that prevent sexperiences from never being sampled. 139 | self.tree = SumTree(self.memory_max_size) 140 | 141 | # Boltzmann policy 142 | self.boltzmann_tau_start = 5.0 143 | self.boltzmann_tau_end = 0.5 144 | self.boltzmann_tau_max_steps = 10000 145 | self.boltzmann_tau = self.boltzmann_tau_start 146 | 147 | # Training with Replay Experiences 148 | self.to_train = 0 149 | self.training_batch_iter = 8 150 | self.training_iter = 4 151 | self.training_frequency = 4 152 | self.training_start_step = 32 153 | self.current_training_step = 0 154 | 155 | # Frame 156 | self.current_frame = 0 157 | self.max_frame = 10000 158 | 159 | def act(self, state): 160 | state = torch.from_numpy(state.astype(np.float32)) 161 | action = self.boltzmann_policy(state) 162 | return action.item() 163 | 164 | def boltzmann_policy(self, state): 165 | ''' 166 | Boltzmann policy: adjust pdparam with temperature tau; 167 | the higher the more randomness/noise in action. 168 | ''' 169 | pdparam = self.model(state) 170 | pdparam /= self.boltzmann_tau 171 | action_pd = distributions.Categorical(logits=pdparam) 172 | return action_pd.sample() 173 | 174 | def sample_idxs(self): 175 | '''Samples batch_size indices from memory in proportional to their priority.''' 176 | batch_idxs = np.zeros(self.memory_batch_size) 177 | tree_idxs = np.zeros(self.memory_batch_size, dtype=int) 178 | 179 | for i in range(self.memory_batch_size): 180 | s = random.uniform(0, self.tree.total()) 181 | (tree_idx, p, idx) = self.tree.get(s) 182 | batch_idxs[i] = idx 183 | tree_idxs[i] = tree_idx 184 | 185 | self.batch_idxs = np.asarray(batch_idxs).astype(int) 186 | self.tree_idxs = tree_idxs 187 | 188 | def sample(self): 189 | 190 | self.sample_idxs() 191 | 192 | # Create batch. 193 | batch = {k: [] for k in self.data_keys} 194 | for index in self.batch_idxs: 195 | for k in self.data_keys: 196 | batch[k].append(self.memory[k][index]) 197 | 198 | for k in batch: 199 | batch[k] = np.array(batch[k]) 200 | batch[k] = torch.from_numpy(batch[k].astype(np.float32)) 201 | 202 | return batch 203 | 204 | def calc_q_loss(self, batch): 205 | states = batch['states'] 206 | next_states = batch['next_states'] 207 | 208 | q_preds = self.model(states) 209 | 210 | """ 211 | This is the gut of Double DQN implementation. 212 | : We use two different models (networks) to reduce the overestimation of Q-value. 
213 | : One for selecting the Q-maximizing action, a`, and another for the Q value of a` given next state, s`. 214 | : self.model is representing the first model and self.target_model is the second model here. 215 | : 216 | : The second model also acts as a target network that helps with stabilizing learning by reducing 217 | : a moving target issue (which is Q_target that keeps changing if a single model (network) is used). 218 | """ 219 | 220 | with torch.no_grad(): 221 | online_next_q_preds = self.model(next_states) 222 | next_q_preds = self.target_model(next_states) 223 | 224 | act_q_preds = q_preds.gather(-1, batch['actions'].long().unsqueeze(-1)).squeeze(-1) 225 | 226 | """ 227 | For SARSA, we calculate Q-values for the next action taken during the episode like the below. 228 | 229 | act_next_q_preds = next_q_preds.gather(-1, batch['next_actions'].long().unsqueeze(-1)).squeeze(-1) 230 | 231 | For DQN, it assumes there is a perfect policy and the next action should be always the best. 232 | Thus, instead of taking the Q value of the next action, it just takes the maximum Q-value for 233 | the next state. This is why DQN is off-policy RL algorithm since it does not rely on the current 234 | policy (that is used to choose next action) while training. 235 | 236 | max_next_q_preds, _ = next_q_preds.max(dim=-1, keepdim=False) 237 | 238 | For Double DQN, we use the first model (self.model) to choose the Q-maximizing action (online_action). 239 | Then use the second model (self.target_model) to get the Q value of a` and s`. 240 | """ 241 | 242 | online_actions = online_next_q_preds.argmax(dim=-1, keepdim=True) 243 | max_next_q_preds = next_q_preds.gather(-1, online_actions).squeeze(-1) 244 | 245 | act_q_targets = batch['rewards'] + self.gamma * (1 - batch['dones']) * max_next_q_preds 246 | 247 | #print(f'act_q_preds: {act_q_preds}\nmax_next_q_preds: {max_next_q_preds}') 248 | 249 | # Let's use mean-squared-error loss function. 250 | loss = nn.MSELoss() 251 | q_loss = loss(act_q_preds, act_q_targets) 252 | 253 | # PER: Update priorities of these batch samples with q estimation differences. 254 | errors = (act_q_targets - act_q_preds.detach()).abs().cpu().numpy() 255 | self.update_priorities(errors) 256 | 257 | return q_loss 258 | 259 | def check_train(self): 260 | if self.to_train == 1: 261 | 262 | for _ in range(self.training_iter): 263 | batch = self.sample() 264 | 265 | for _ in range(self.training_batch_iter): 266 | # Compute loss for the batch. 267 | loss = self.calc_q_loss(batch) 268 | 269 | # Compute gradients with backpropagation. 270 | self.optim.zero_grad() 271 | loss.backward() 272 | 273 | # Update NN parameters. 
274 | self.optim.step() 275 | 276 | self.current_training_step += 1 277 | 278 | # Reset 279 | self.to_train = 0 280 | 281 | def get_priority(self, error): 282 | '''Takes in the error of one or more examples and returns the proportional priority''' 283 | return np.power(error + self.epsilon, self.alpha).squeeze() 284 | 285 | def update_priorities(self, errors): 286 | ''' 287 | Updates the priorities from the most recent batch 288 | Assumes the relevant batch indices are stored in self.batch_idxs 289 | ''' 290 | priorities = self.get_priority(errors) 291 | assert len(priorities) == self.batch_idxs.size 292 | for idx, p in zip(self.batch_idxs, priorities): 293 | self.memory['priorities'][idx] = p 294 | for p, i in zip(priorities, self.tree_idxs): 295 | self.tree.update(i, p) 296 | 297 | def update_memory(self, state, action, reward, next_state, done, error=100000): 298 | """ 299 | Add this exp to memory. Since DQN is off-policy algorithm, we can reuse 300 | any experiences generated during training regardless of which policy (NN) 301 | is used. We will discard the oldest exp if there is no space to add new one. 302 | """ 303 | 304 | priority = self.get_priority(error) 305 | most_recent = (state, action, reward, next_state, done, priority) 306 | self.memory_head = (self.memory_head + 1) % self.memory_max_size 307 | 308 | for idx, k in enumerate(self.data_keys): 309 | self.memory[k][self.memory_head] = most_recent[idx] 310 | 311 | self.tree.add(priority, self.memory_head) 312 | 313 | if self.memory_cur_size < self.memory_max_size: 314 | self.memory_cur_size += 1 315 | 316 | self.memory_seen_size += 1 317 | 318 | self.to_train = self.memory_seen_size > self.training_start_step and self.memory_head % self.training_frequency == 0; 319 | 320 | def update_tau(self): 321 | # Simple linear decay 322 | if self.boltzmann_tau_max_steps <= self.current_frame: 323 | self.boltzmann_tau = self.boltzmann_tau_end 324 | return 325 | 326 | slope = (self.boltzmann_tau_end - self.boltzmann_tau_start) / (self.boltzmann_tau_max_steps - self.current_frame) 327 | self.boltzmann_tau = max(slope*self.current_frame + self.boltzmann_tau_start, self.boltzmann_tau_end) 328 | 329 | def update_models(self): 330 | if self.current_frame % self.model_update_frequency == 0: 331 | # replace 332 | self.target_model.load_state_dict(self.model.state_dict()) 333 | 334 | def update(self, state, action, reward, next_state, done): 335 | self.current_frame += 1 336 | self.update_memory(state, action, reward, next_state, done) 337 | self.check_train() 338 | self.update_tau() 339 | self.update_models() 340 | if (self.memory_seen_size > self.training_start_step): 341 | self.lr_scheduler.step() 342 | 343 | def run_rl(dqn, env): 344 | state = env.reset() 345 | done = False 346 | total_reward = 0 347 | while True: 348 | if done: # before starting another episode 349 | print(f'Episode done: cur_frame={dqn.current_frame} current_training_step={dqn.current_training_step} total_reward={total_reward}') 350 | total_reward = 0 351 | 352 | if dqn.current_frame < dqn.max_frame: # reset and continue 353 | state = env.reset() 354 | done = False 355 | 356 | if dqn.current_frame >= dqn.max_frame: # finish 357 | break 358 | 359 | action = dqn.act(state) 360 | next_state, reward, done, info = env.step(action) 361 | dqn.update(state, action, reward, next_state, done) 362 | state = next_state 363 | total_reward += reward 364 | 365 | def main(): 366 | env = gym.make("CartPole-v0") 367 | dqn = DoubleDQN_PER(env) 368 | run_rl(dqn, env) 369 | 370 | if __name__ == 
'__main__': 371 | main() --------------------------------------------------------------------------------
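
The clipped surrogate term documented in Code/PPO.py, L^CLIP = E[ min(ratio * A, clip(ratio, 1-eps, 1+eps) * A) ] with ratio = pi(a|s) / pi_old(a|s), can be exercised in isolation. The following is a minimal standalone sketch of that term on toy tensors; it is not one of the project files, and the function name and the numbers are illustrative only.

import torch

def clipped_surrogate_loss(log_probs, old_log_probs, advs, clip_eps=0.2):
    # ratio = pi(a|s) / pi_old(a|s), computed in log space for numerical stability
    ratios = torch.exp(log_probs - old_log_probs)
    sur_1 = ratios * advs
    sur_2 = torch.clamp(ratios, 1.0 - clip_eps, 1.0 + clip_eps) * advs
    # negate because optimizers minimize, while the surrogate objective is maximized
    return -torch.min(sur_1, sur_2).mean()

if __name__ == '__main__':
    # The new policy slightly favors the sampled actions compared to the old policy.
    log_probs = torch.log(torch.tensor([0.60, 0.30, 0.55]))
    old_log_probs = torch.log(torch.tensor([0.50, 0.50, 0.50]))
    advs = torch.tensor([1.0, -1.0, 2.0])
    print(clipped_surrogate_loss(log_probs, old_log_probs, advs))

Since the advantages are detached and standardized before they reach this term (as in calc_gae_advs_v_targets), only the actor parameters receive gradients from it.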
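
Similarly, the proportional prioritization in Code/DoubleDQN_PER.py reduces to priority = (|TD error| + epsilon)^alpha, with batch indices drawn in proportion to priority; the SumTree only makes that draw O(log n). The sketch below is a standalone numpy illustration of the same arithmetic with a plain O(n) draw; the names and numbers are hypothetical and not part of the repository.

import numpy as np

def proportional_priorities(td_errors, epsilon=1e-4, alpha=0.6):
    # Mirrors get_priority(): epsilon keeps zero-error samples reachable,
    # alpha < 1 flattens the distribution toward uniform sampling.
    return (np.abs(td_errors) + epsilon) ** alpha

td_errors = np.array([0.05, 1.2, 0.0, 3.4])
priorities = proportional_priorities(td_errors)
probs = priorities / priorities.sum()
batch_idxs = np.random.choice(len(priorities), size=2, p=probs)  # sample in proportion to priority
print(probs, batch_idxs)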