├── LICENSE
├── README.md
├── gif
│   ├── 1.gif
│   ├── gif 2.gif
│   └── gif3.gif
├── model.py
├── preTrained
│   ├── LunarLander_0.02_0.9_0.999.pth
│   ├── LunarLander_FOUR.pth
│   ├── LunarLander_ONE.pth
│   ├── LunarLander_THREE.pth
│   └── LunarLander_TWO.pth
├── test.py
└── train.py

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD 3-Clause License

Copyright (c) 2018,
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Actor-Critic

Solution for the LunarLander-v2 environment of OpenAI Gym.
The algorithm used is actor-critic (vanilla policy gradient with a learned value-function baseline).

More info: http://rail.eecs.berkeley.edu/deeprlcourse-fa17/f17docs/lecture_5_actor_critic_pdf.pdf
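
Concretely, the per-step loss implemented in model.py is

    loss_t = -log pi(a_t | s_t) * (R_t - V(s_t)) + SmoothL1(V(s_t), R_t)

where R_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ... is the discounted return, the returns are normalized over the episode, and the episode loss is the sum of loss_t over all steps.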

-> Dependencies:

OpenAI Gym

PyTorch 0.4.1

PIL


-> Hyperparameters can be changed by editing them in the respective files

-> To train: run train.py

-> Converges within 1500 episodes

-> To test a pretrained model: run test.py


![alt-text](https://github.com/nikhilbarhate99/Actor-Critic/blob/master/gif/gif3.gif)
--------------------------------------------------------------------------------
/gif/1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikhilbarhate99/Actor-Critic-PyTorch/01c833e83006be5762151a29f0719cc9c03c204d/gif/1.gif
--------------------------------------------------------------------------------
/gif/gif 2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikhilbarhate99/Actor-Critic-PyTorch/01c833e83006be5762151a29f0719cc9c03c204d/gif/gif 2.gif
--------------------------------------------------------------------------------
/gif/gif3.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikhilbarhate99/Actor-Critic-PyTorch/01c833e83006be5762151a29f0719cc9c03c204d/gif/gif3.gif
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical

class ActorCritic(nn.Module):
    def __init__(self):
        super(ActorCritic, self).__init__()
        # shared feature layer, followed by separate actor and critic heads
        self.affine = nn.Linear(8, 128)

        self.action_layer = nn.Linear(128, 4)
        self.value_layer = nn.Linear(128, 1)

        # per-episode buffers, filled during rollouts and consumed by calculateLoss()
        self.logprobs = []
        self.state_values = []
        self.rewards = []

    def forward(self, state):
        state = torch.from_numpy(state).float()
        state = F.relu(self.affine(state))

        state_value = self.value_layer(state)

        # action probabilities over the 4 discrete actions
        action_probs = F.softmax(self.action_layer(state), dim=-1)
        action_distribution = Categorical(action_probs)
        action = action_distribution.sample()

        self.logprobs.append(action_distribution.log_prob(action))
        self.state_values.append(state_value)

        return action.item()

    def calculateLoss(self, gamma=0.99):

        # calculating discounted rewards:
        rewards = []
        dis_reward = 0
        for reward in self.rewards[::-1]:
            dis_reward = reward + gamma * dis_reward
            rewards.insert(0, dis_reward)

        # normalizing the rewards:
        rewards = torch.tensor(rewards)
        rewards = (rewards - rewards.mean()) / (rewards.std())

        loss = 0
        for logprob, value, reward in zip(self.logprobs, self.state_values, rewards):
            advantage = reward - value.item()
            action_loss = -logprob * advantage
            value_loss = F.smooth_l1_loss(value, reward)
            loss += (action_loss + value_loss)
        return loss

    def clearMemory(self):
        del self.logprobs[:]
        del self.state_values[:]
        del self.rewards[:]
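

if __name__ == '__main__':
    # Minimal smoke test (an addition to the original file; it only runs when
    # model.py is executed directly). It feeds random 8-dimensional "observations"
    # through the policy, records fake rewards, and checks that calculateLoss()
    # returns a differentiable scalar. The state size (8) and action count (4)
    # match LunarLander-v2; the numbers themselves are arbitrary.
    import numpy as np

    policy = ActorCritic()
    for _ in range(5):
        state = np.random.randn(8).astype(np.float32)   # fake observation
        policy(state)                                    # samples an action, caches log-prob and V(s)
        policy.rewards.append(1.0)                       # fake per-step reward

    loss = policy.calculateLoss(gamma=0.99)
    loss.backward()                                      # gradients flow to both the actor and critic heads
    print('smoke-test loss:', loss.item())
    policy.clearMemory()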
--------------------------------------------------------------------------------
/preTrained/LunarLander_0.02_0.9_0.999.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikhilbarhate99/Actor-Critic-PyTorch/01c833e83006be5762151a29f0719cc9c03c204d/preTrained/LunarLander_0.02_0.9_0.999.pth
--------------------------------------------------------------------------------
/preTrained/LunarLander_FOUR.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikhilbarhate99/Actor-Critic-PyTorch/01c833e83006be5762151a29f0719cc9c03c204d/preTrained/LunarLander_FOUR.pth
--------------------------------------------------------------------------------
/preTrained/LunarLander_ONE.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikhilbarhate99/Actor-Critic-PyTorch/01c833e83006be5762151a29f0719cc9c03c204d/preTrained/LunarLander_ONE.pth
--------------------------------------------------------------------------------
/preTrained/LunarLander_THREE.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikhilbarhate99/Actor-Critic-PyTorch/01c833e83006be5762151a29f0719cc9c03c204d/preTrained/LunarLander_THREE.pth
--------------------------------------------------------------------------------
/preTrained/LunarLander_TWO.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikhilbarhate99/Actor-Critic-PyTorch/01c833e83006be5762151a29f0719cc9c03c204d/preTrained/LunarLander_TWO.pth
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
from model import ActorCritic
import torch
import gym
from PIL import Image

def test(n_episodes=5, name='LunarLander_TWO.pth'):
    env = gym.make('LunarLander-v2')
    policy = ActorCritic()

    policy.load_state_dict(torch.load('./preTrained/{}'.format(name)))

    render = True
    save_gif = False

    for i_episode in range(1, n_episodes+1):
        state = env.reset()
        running_reward = 0
        for t in range(10000):
            action = policy(state)
            state, reward, done, _ = env.step(action)
            running_reward += reward
            if render:
                env.render()
            if save_gif:
                img = env.render(mode='rgb_array')
                img = Image.fromarray(img)
                img.save('./gif/{}.jpg'.format(t))
            if done:
                break
        print('Episode {}\tReward: {}'.format(i_episode, running_reward))
    env.close()
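

def save_frames_as_gif(n_frames, out_path='./gif/episode.gif', duration_ms=40):
    # Helper added as a sketch (not part of the original script): test() writes
    # individual frames to ./gif/<t>.jpg when save_gif is True but never
    # assembles them, so this stitches the first n_frames of those files into an
    # animated GIF with Pillow. The output path and frame duration are arbitrary.
    frames = [Image.open('./gif/{}.jpg'.format(t)) for t in range(n_frames)]
    frames[0].save(out_path, save_all=True, append_images=frames[1:],
                   duration=duration_ms, loop=0)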

if __name__ == '__main__':
    test()
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
from test import test
from model import ActorCritic
import torch
import torch.optim as optim
import gym

def train():
    # Default parameters:
    # gamma = 0.99
    # lr = 0.02
    # betas = (0.9, 0.999)
    # random_seed = 543

    render = False
    gamma = 0.99
    lr = 0.02
    betas = (0.9, 0.999)
    random_seed = 543

    torch.manual_seed(random_seed)

    env = gym.make('LunarLander-v2')
    env.seed(random_seed)

    policy = ActorCritic()
    optimizer = optim.Adam(policy.parameters(), lr=lr, betas=betas)
    print(lr, betas)

    running_reward = 0
    for i_episode in range(0, 10000):
        state = env.reset()
        for t in range(10000):
            action = policy(state)
            state, reward, done, _ = env.step(action)
            policy.rewards.append(reward)
            running_reward += reward
            if render and i_episode > 1000:
                env.render()
            if done:
                break

        # Updating the policy:
        optimizer.zero_grad()
        loss = policy.calculateLoss(gamma)
        loss.backward()
        optimizer.step()
        policy.clearMemory()

        # saving the model if episodes > 999 OR avg reward > 200
        # (running_reward accumulates over a 20-episode window, so > 4000
        #  corresponds to an average reward above 200 per episode)
        #if i_episode > 999:
        #    torch.save(policy.state_dict(), './preTrained/LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))

        if running_reward > 4000:
            torch.save(policy.state_dict(), './preTrained/LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))
            print("########## Solved! ##########")
            test(name='LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))
            break

        if i_episode % 20 == 0:
            running_reward = running_reward/20
            print('Episode {}\tlength: {}\treward: {}'.format(i_episode, t, running_reward))
            running_reward = 0

if __name__ == '__main__':
    train()
--------------------------------------------------------------------------------
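
A note on the stopping rule in train.py: running_reward is accumulated over the 20-episode logging window, so the running_reward > 4000 check corresponds to an average reward above 200 per episode, the usual "solved" threshold for LunarLander-v2. The sketch below (not part of the repository; the function name and episode count are arbitrary) loads one of the checkpoints from preTrained/ and estimates its average reward without rendering:

import gym
import torch
from model import ActorCritic

def evaluate(name='LunarLander_TWO.pth', episodes=20):
    env = gym.make('LunarLander-v2')
    policy = ActorCritic()
    policy.load_state_dict(torch.load('./preTrained/{}'.format(name)))
    total = 0.0
    for _ in range(episodes):
        state = env.reset()
        done = False
        while not done:
            state, reward, done, _ = env.step(policy(state))
            total += reward
        policy.clearMemory()   # drop the log-probs/values cached by forward()
    env.close()
    return total / episodes

print(evaluate())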