├── README.md
└── a2c_cartpole.py

/README.md:
--------------------------------------------------------------------------------
## A2C CartPole
--------------------------------------------------------------------------------

/a2c_cartpole.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import gym

# Note: this script targets the classic Gym API (reset() returns an observation,
# step() returns a 4-tuple) and the CartPole-v0 task.

# Hyperparameters
STATE_DIM = 4      # CartPole observation size
ACTION_DIM = 2     # push left / push right
STEP = 2000        # number of training iterations
SAMPLE_NUMS = 30   # max environment steps per rollout


class ActorNetwork(nn.Module):
    """Policy network: maps a state to log-probabilities over actions."""

    def __init__(self, input_size, hidden_size, action_size):
        super(ActorNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        out = F.relu(self.fc1(x))
        out = F.relu(self.fc2(out))
        out = F.log_softmax(self.fc3(out), dim=-1)
        return out


class ValueNetwork(nn.Module):
    """Value network: maps a state to a scalar state-value estimate."""

    def __init__(self, input_size, hidden_size, output_size):
        super(ValueNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out = F.relu(self.fc1(x))
        out = F.relu(self.fc2(out))
        out = self.fc3(out)
        return out


def roll_out(actor_network, task, sample_nums, value_network, init_state):
    """Collect up to sample_nums transitions by sampling from the current policy.

    Returns the visited states, one-hot actions, rewards, the bootstrap value of
    the final state (0 if the episode terminated), and the state to resume from.
    """
    states = []
    actions = []
    rewards = []
    is_done = False
    final_r = 0
    state = init_state

    for j in range(sample_nums):
        states.append(state)
        log_softmax_action = actor_network(torch.FloatTensor([state]))
        softmax_action = torch.exp(log_softmax_action)
        action = np.random.choice(ACTION_DIM, p=softmax_action.detach().cpu().numpy()[0])
        one_hot_action = [int(k == action) for k in range(ACTION_DIM)]
        next_state, reward, done, _ = task.step(action)
        # fix_reward = -10 if done else 1
        actions.append(one_hot_action)
        rewards.append(reward)
        final_state = next_state
        state = next_state
        if done:
            is_done = True
            state = task.reset()
            break
    if not is_done:
        # Episode did not terminate: bootstrap with the critic's estimate of the last state
        final_r = value_network(torch.FloatTensor([final_state])).detach().cpu().numpy()[0][0]

    return states, actions, rewards, final_r, state


def discount_reward(r, gamma, final_r):
    """Compute discounted returns, seeded with the bootstrap value final_r."""
    discounted_r = np.zeros_like(r)
    running_add = final_r
    for t in reversed(range(0, len(r))):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r


def main():
    # init a task generator for data fetching
    task = gym.make("CartPole-v0")
    init_state = task.reset()

    # init value network
    value_network = ValueNetwork(input_size=STATE_DIM, hidden_size=40, output_size=1)
    value_network_optim = torch.optim.Adam(value_network.parameters(), lr=0.01)

    # init actor network
    actor_network = ActorNetwork(STATE_DIM, 40, ACTION_DIM)
    actor_network_optim = torch.optim.Adam(actor_network.parameters(), lr=0.01)

    steps = []
    test_results = []

    for step in range(STEP):
        states, actions, rewards, final_r, current_state = roll_out(actor_network, task, SAMPLE_NUMS, value_network, init_state)
        init_state = current_state
        actions_var = torch.FloatTensor(actions).view(-1, ACTION_DIM)
        states_var = torch.FloatTensor(states).view(-1, STATE_DIM)

        # train actor network
        actor_network_optim.zero_grad()
        log_softmax_actions = actor_network(states_var)
        vs = value_network(states_var).detach().view(-1)
        # bootstrapped discounted returns
        qs = torch.FloatTensor(discount_reward(rewards, 0.99, final_r))

        advantages = qs - vs
        actor_network_loss = -torch.mean(torch.sum(log_softmax_actions * actions_var, 1) * advantages)
        actor_network_loss.backward()
        torch.nn.utils.clip_grad_norm_(actor_network.parameters(), 0.5)
        actor_network_optim.step()

        # train value network
        value_network_optim.zero_grad()
        target_values = qs
        values = value_network(states_var).view(-1)
        criterion = nn.MSELoss()
        value_network_loss = criterion(values, target_values)
        value_network_loss.backward()
        torch.nn.utils.clip_grad_norm_(value_network.parameters(), 0.5)
        value_network_optim.step()

        # Testing: every 50 training steps, run 10 greedy episodes and report the average return
        if (step + 1) % 50 == 0:
            result = 0
            test_task = gym.make("CartPole-v0")
            for test_epi in range(10):
                state = test_task.reset()
                for test_step in range(200):
                    softmax_action = torch.exp(actor_network(torch.FloatTensor([state])))
                    action = np.argmax(softmax_action.detach().numpy()[0])
                    next_state, reward, done, _ = test_task.step(action)
                    result += reward
                    state = next_state
                    if done:
                        break
            print("step:", step + 1, "test result:", result / 10.0)
            steps.append(step + 1)
            test_results.append(result / 10.0)


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
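
For reference, a minimal sketch of how the bootstrapped n-step return computed by `discount_reward` behaves. The function name and signature match the script above; the sample rewards (CartPole's per-step reward of 1.0) and the bootstrap value of 0.5 are made-up values for illustration only.

```python
# Safe to import: main() in a2c_cartpole.py is guarded by `if __name__ == '__main__'`.
from a2c_cartpole import discount_reward

# Hypothetical example: three rewards of 1.0 and an assumed critic estimate of 0.5
# for the state reached after the last sampled step.
qs = discount_reward([1.0, 1.0, 1.0], gamma=0.99, final_r=0.5)

# Each entry is r_t + gamma * q_{t+1}, seeded with final_r:
# q_2 = 1.0 + 0.99 * 0.5    ≈ 1.495
# q_1 = 1.0 + 0.99 * 1.495  ≈ 2.480
# q_0 = 1.0 + 0.99 * 2.480  ≈ 3.455
print(qs)  # approximately [3.455, 2.480, 1.495]
```

These returns are the `qs` targets in the training loop; subtracting the critic's value estimates from them yields the advantages that weight the policy-gradient term.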