├── LICENSE
├── README.md
├── gif
│   ├── 1.gif
│   ├── gif 2.gif
│   └── gif3.gif
├── model.py
├── preTrained
│   ├── LunarLander_0.02_0.9_0.999.pth
│   ├── LunarLander_FOUR.pth
│   ├── LunarLander_ONE.pth
│   ├── LunarLander_THREE.pth
│   └── LunarLander_TWO.pth
├── test.py
└── train.py

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD 3-Clause License

Copyright (c) 2018,
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Actor-Critic

Solution for the LunarLander-v2 environment of OpenAI Gym.
The algorithm used is actor-critic (vanilla policy gradient with a learned value-function baseline).

More info: http://rail.eecs.berkeley.edu/deeprlcourse-fa17/f17docs/lecture_5_actor_critic_pdf.pdf
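
Concretely, the per-step loss implemented in model.py is

    loss_t = -log pi(a_t | s_t) * (R_t - V(s_t)) + SmoothL1(V(s_t), R_t)

where R_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ... is the discounted return, the returns are normalized over the episode, and the episode loss is the sum of loss_t over all steps.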

-> Dependencies:

OpenAI Gym

PyTorch 0.4.1

PIL


-> Hyperparameters can be changed by editing them in the respective files

-> To train: run train.py

-> Converges within 1500 episodes

-> To test a pretrained model: run test.py


![alt-text](https://github.com/nikhilbarhate99/Actor-Critic/blob/master/gif/gif3.gif)
--------------------------------------------------------------------------------
/gif/1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikhilbarhate99/Actor-Critic-PyTorch/01c833e83006be5762151a29f0719cc9c03c204d/gif/1.gif
--------------------------------------------------------------------------------
/gif/gif 2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikhilbarhate99/Actor-Critic-PyTorch/01c833e83006be5762151a29f0719cc9c03c204d/gif/gif 2.gif
--------------------------------------------------------------------------------
/gif/gif3.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikhilbarhate99/Actor-Critic-PyTorch/01c833e83006be5762151a29f0719cc9c03c204d/gif/gif3.gif
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical

class ActorCritic(nn.Module):
    def __init__(self):
        super(ActorCritic, self).__init__()
        # shared feature layer, followed by separate actor and critic heads
        self.affine = nn.Linear(8, 128)

        self.action_layer = nn.Linear(128, 4)
        self.value_layer = nn.Linear(128, 1)

        # per-episode buffers, filled during rollouts and consumed by calculateLoss()
        self.logprobs = []
        self.state_values = []
        self.rewards = []

    def forward(self, state):
        state = torch.from_numpy(state).float()
        state = F.relu(self.affine(state))

        state_value = self.value_layer(state)

        # action probabilities over the 4 discrete actions
        action_probs = F.softmax(self.action_layer(state), dim=-1)
        action_distribution = Categorical(action_probs)
        action = action_distribution.sample()

        self.logprobs.append(action_distribution.log_prob(action))
        self.state_values.append(state_value)

        return action.item()

    def calculateLoss(self, gamma=0.99):

        # calculating discounted rewards:
        rewards = []
        dis_reward = 0
        for reward in self.rewards[::-1]:
            dis_reward = reward + gamma * dis_reward
            rewards.insert(0, dis_reward)

        # normalizing the rewards:
        rewards = torch.tensor(rewards)
        rewards = (rewards - rewards.mean()) / (rewards.std())

        loss = 0
        for logprob, value, reward in zip(self.logprobs, self.state_values, rewards):
            advantage = reward - value.item()
            action_loss = -logprob * advantage
            value_loss = F.smooth_l1_loss(value, reward)
            loss += (action_loss + value_loss)
        return loss

    def clearMemory(self):
        del self.logprobs[:]
        del self.state_values[:]
        del self.rewards[:]
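

if __name__ == '__main__':
    # Minimal smoke test (an addition to the original file; it only runs when
    # model.py is executed directly). It feeds random 8-dimensional "observations"
    # through the policy, records fake rewards, and checks that calculateLoss()
    # returns a differentiable scalar. The state size (8) and action count (4)
    # match LunarLander-v2; the numbers themselves are arbitrary.
    import numpy as np

    policy = ActorCritic()
    for _ in range(5):
        state = np.random.randn(8).astype(np.float32)   # fake observation
        policy(state)                                    # samples an action, caches log-prob and V(s)
        policy.rewards.append(1.0)                       # fake per-step reward

    loss = policy.calculateLoss(gamma=0.99)
    loss.backward()                                      # gradients flow to both the actor and critic heads
    print('smoke-test loss:', loss.item())
    policy.clearMemory()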
--------------------------------------------------------------------------------
/preTrained/LunarLander_0.02_0.9_0.999.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikhilbarhate99/Actor-Critic-PyTorch/01c833e83006be5762151a29f0719cc9c03c204d/preTrained/LunarLander_0.02_0.9_0.999.pth
--------------------------------------------------------------------------------
/preTrained/LunarLander_FOUR.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikhilbarhate99/Actor-Critic-PyTorch/01c833e83006be5762151a29f0719cc9c03c204d/preTrained/LunarLander_FOUR.pth
--------------------------------------------------------------------------------
/preTrained/LunarLander_ONE.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikhilbarhate99/Actor-Critic-PyTorch/01c833e83006be5762151a29f0719cc9c03c204d/preTrained/LunarLander_ONE.pth
--------------------------------------------------------------------------------
/preTrained/LunarLander_THREE.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikhilbarhate99/Actor-Critic-PyTorch/01c833e83006be5762151a29f0719cc9c03c204d/preTrained/LunarLander_THREE.pth
--------------------------------------------------------------------------------
/preTrained/LunarLander_TWO.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikhilbarhate99/Actor-Critic-PyTorch/01c833e83006be5762151a29f0719cc9c03c204d/preTrained/LunarLander_TWO.pth
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
from model import ActorCritic
import torch
import gym
from PIL import Image

def test(n_episodes=5, name='LunarLander_TWO.pth'):
    env = gym.make('LunarLander-v2')
    policy = ActorCritic()

    policy.load_state_dict(torch.load('./preTrained/{}'.format(name)))

    render = True
    save_gif = False

    for i_episode in range(1, n_episodes+1):
        state = env.reset()
        running_reward = 0
        for t in range(10000):
            action = policy(state)
            state, reward, done, _ = env.step(action)
            running_reward += reward
            if render:
                env.render()
            if save_gif:
                img = env.render(mode='rgb_array')
                img = Image.fromarray(img)
                img.save('./gif/{}.jpg'.format(t))
            if done:
                break
        print('Episode {}\tReward: {}'.format(i_episode, running_reward))
    env.close()
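

def save_frames_as_gif(n_frames, out_path='./gif/episode.gif', duration_ms=40):
    # Helper added as a sketch (not part of the original script): test() writes
    # individual frames to ./gif/<t>.jpg when save_gif is True but never
    # assembles them, so this stitches the first n_frames of those files into an
    # animated GIF with Pillow. The output path and frame duration are arbitrary.
    frames = [Image.open('./gif/{}.jpg'.format(t)) for t in range(n_frames)]
    frames[0].save(out_path, save_all=True, append_images=frames[1:],
                   duration=duration_ms, loop=0)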

if __name__ == '__main__':
    test()
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
from test import test
from model import ActorCritic
import torch
import torch.optim as optim
import gym

def train():
    # Default parameters:
    # gamma = 0.99
    # lr = 0.02
    # betas = (0.9, 0.999)
    # random_seed = 543

    render = False
    gamma = 0.99
    lr = 0.02
    betas = (0.9, 0.999)
    random_seed = 543

    torch.manual_seed(random_seed)

    env = gym.make('LunarLander-v2')
    env.seed(random_seed)

    policy = ActorCritic()
    optimizer = optim.Adam(policy.parameters(), lr=lr, betas=betas)
    print(lr, betas)

    running_reward = 0
    for i_episode in range(0, 10000):
        state = env.reset()
        for t in range(10000):
            action = policy(state)
            state, reward, done, _ = env.step(action)
            policy.rewards.append(reward)
            running_reward += reward
            if render and i_episode > 1000:
                env.render()
            if done:
                break

        # Updating the policy:
        optimizer.zero_grad()
        loss = policy.calculateLoss(gamma)
        loss.backward()
        optimizer.step()
        policy.clearMemory()

        # saving the model if episodes > 999 OR avg reward > 200
        # (running_reward accumulates over a 20-episode window, so > 4000
        #  corresponds to an average reward above 200 per episode)
        #if i_episode > 999:
        #    torch.save(policy.state_dict(), './preTrained/LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))

        if running_reward > 4000:
            torch.save(policy.state_dict(), './preTrained/LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))
            print("########## Solved! ##########")
            test(name='LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))
            break

        if i_episode % 20 == 0:
            running_reward = running_reward/20
            print('Episode {}\tlength: {}\treward: {}'.format(i_episode, t, running_reward))
            running_reward = 0

if __name__ == '__main__':
    train()
--------------------------------------------------------------------------------
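
A note on the stopping rule in train.py: running_reward is accumulated over the 20-episode logging window, so the running_reward > 4000 check corresponds to an average reward above 200 per episode, the usual "solved" threshold for LunarLander-v2. The sketch below (not part of the repository; the function name and episode count are arbitrary) loads one of the checkpoints from preTrained/ and estimates its average reward without rendering:

import gym
import torch
from model import ActorCritic

def evaluate(name='LunarLander_TWO.pth', episodes=20):
    env = gym.make('LunarLander-v2')
    policy = ActorCritic()
    policy.load_state_dict(torch.load('./preTrained/{}'.format(name)))
    total = 0.0
    for _ in range(episodes):
        state = env.reset()
        done = False
        while not done:
            state, reward, done, _ = env.step(policy(state))
            total += reward
        policy.clearMemory()   # drop the log-probs/values cached by forward()
    env.close()
    return total / episodes

print(evaluate())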