├── reward per iter.txt
├── Report_PPO_Humanoid.pdf
├── hparams.py
├── test_env.py
├── README.md
├── model.py
├── running_state.py
├── utils.py
├── vanila_pg.py
├── ppo.py
├── main.py
├── npg.py
└── trpo.py

--------------------------------------------------------------------------------
/reward per iter.txt:
--------------------------------------------------------------------------------
2157,355.4177571612188

--------------------------------------------------------------------------------
/Report_PPO_Humanoid.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EgOrlukha/MuJoCo-PyTorch/HEAD/Report_PPO_Humanoid.pdf

--------------------------------------------------------------------------------
/hparams.py:
--------------------------------------------------------------------------------
class HyperParams:
    gamma = 0.99
    lamda = 0.98
    hidden = 64
    critic_lr = 0.0003
    actor_lr = 0.0003
    batch_size = 64
    l2_rate = 0.001
    max_kl = 0.01
    clip_param = 0.2

--------------------------------------------------------------------------------
/test_env.py:
--------------------------------------------------------------------------------
import gym

# You can choose other environments.
# Possible environments: Ant-v2, HalfCheetah-v2, Hopper-v2, Humanoid-v2,
# HumanoidStandup-v2, InvertedPendulum-v2, Reacher-v2, Swimmer-v2, Walker2d-v2
env = gym.make("Humanoid-v2")

num_inputs = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]

print('state size:', num_inputs)
print('action size:', num_actions)

env.reset()
for _ in range(1000):
    env.render()
    state, reward, done, _ = env.step(env.action_space.sample())
    # print('state:', state)

    # reward = forward velocity - sum(action^2) + live_bonus
    print('reward:', reward)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# pytorch-trpo
PyTorch implementation of Vanilla Policy Gradient, Truncated Natural Policy Gradient, Trust Region Policy Optimization, and Proximal Policy Optimization.
### An analysis of the system's performance is given in the Report_PPO_Humanoid.pdf file.

# Train
* **algorithm**: PG, NPG, TRPO, PPO
* **env**: Ant-v2, HalfCheetah-v2, Hopper-v2, Humanoid-v2, HumanoidStandup-v2, InvertedPendulum-v2, Reacher-v2, Swimmer-v2, Walker2d-v2
* The system trains the Humanoid-v2 environment with the PPO algorithm by default. To use a different environment or algorithm, run:
~~~
python main.py --algorithm "algorithm name" --env "environment name"
~~~
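For example, to train Hopper-v2 with TRPO:
~~~
python main.py --algorithm TRPO --env Hopper-v2
~~~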

# Reference
This code is a modified version of the following:
* [OpenAI Baselines](https://github.com/openai/baselines/tree/master/baselines/trpo_mpi)
* [PyTorch implementation of TRPO](https://github.com/ikostrikov/pytorch-trpo)
* [PyTorch implementation of all these algorithms, which this repository modifies](https://github.com/dnddnjs/mujoco-pg)

--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
from hparams import HyperParams as hp


class Actor(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        self.num_inputs = num_inputs
        self.num_outputs = num_outputs
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(num_inputs, hp.hidden)
        self.fc2 = nn.Linear(hp.hidden, hp.hidden)
        self.fc3 = nn.Linear(hp.hidden, num_outputs)
        self.fc3.weight.data.mul_(0.1)
        self.fc3.bias.data.mul_(0.0)

    def forward(self, x):
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        mu = self.fc3(x)
        # logstd is fixed at zero, so the policy's standard deviation is
        # constant at 1.0 rather than a learned parameter.
        logstd = torch.zeros_like(mu)
        std = torch.exp(logstd)
        return mu, std, logstd


class Critic(nn.Module):
    def __init__(self, num_inputs):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(num_inputs, hp.hidden)
        self.fc2 = nn.Linear(hp.hidden, hp.hidden)
        self.fc3 = nn.Linear(hp.hidden, 1)
        self.fc3.weight.data.mul_(0.1)
        self.fc3.bias.data.mul_(0.0)

    def forward(self, x):
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        v = self.fc3(x)
        return v
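A minimal shape check for the two networks above (an illustrative sketch, not part of the repository; the sizes 8 and 2 are arbitrary example values, run from the repository root):

import torch

from model import Actor, Critic

# Example sizes only; a real run would use env.observation_space / action_space.
actor = Actor(num_inputs=8, num_outputs=2)
critic = Critic(num_inputs=8)

states = torch.randn(5, 8)                 # batch of 5 fake states
mu, std, logstd = actor(states)
values = critic(states)

print(mu.shape, std.shape, values.shape)   # (5, 2), (5, 2), (5, 1)
print(std)                                 # all ones, since logstd is fixed at zero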

--------------------------------------------------------------------------------
/running_state.py:
--------------------------------------------------------------------------------
import numpy as np


# from https://github.com/joschu/modular_rl
# http://www.johndcook.com/blog/standard_deviation/
class RunningStat(object):
    def __init__(self, shape):
        self._n = 0
        self._M = np.zeros(shape)
        self._S = np.zeros(shape)

    def push(self, x):
        x = np.asarray(x)
        assert x.shape == self._M.shape
        self._n += 1
        if self._n == 1:
            self._M[...] = x
        else:
            oldM = self._M.copy()
            self._M[...] = oldM + (x - oldM) / self._n
            self._S[...] = self._S + (x - oldM) * (x - self._M)

    @property
    def n(self):
        return self._n

    @property
    def mean(self):
        return self._M

    @property
    def var(self):
        return self._S / (self._n - 1) if self._n > 1 else np.square(self._M)

    @property
    def std(self):
        return np.sqrt(self.var)

    @property
    def shape(self):
        return self._M.shape


class ZFilter:
    """
    y = (x - mean) / std
    using running estimates of mean and std
    """

    def __init__(self, shape, demean=True, destd=True, clip=10.0):
        self.demean = demean
        self.destd = destd
        self.clip = clip

        self.rs = RunningStat(shape)

    def __call__(self, x, update=True):
        if update:
            self.rs.push(x)
        if self.demean:
            x = x - self.rs.mean
        if self.destd:
            x = x / (self.rs.std + 1e-8)
        if self.clip:
            x = np.clip(x, -self.clip, self.clip)
        return x

    def output_shape(self, input_space):
        return input_space.shape

--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import torch
import math


def get_action(mu, std):
    action = torch.normal(mu, std)
    action = action.data.numpy()
    return action


def log_density(x, mu, std, logstd):
    var = std.pow(2)
    log_density = -(x - mu).pow(2) / (2 * var) \
                  - 0.5 * math.log(2 * math.pi) - logstd
    return log_density.sum(1, keepdim=True)


def flat_grad(grads):
    grad_flatten = []
    for grad in grads:
        grad_flatten.append(grad.view(-1))
    grad_flatten = torch.cat(grad_flatten)
    return grad_flatten


def flat_hessian(hessians):
    hessians_flatten = []
    for hessian in hessians:
        hessians_flatten.append(hessian.contiguous().view(-1))
    hessians_flatten = torch.cat(hessians_flatten).data
    return hessians_flatten


def flat_params(model):
    params = []
    for param in model.parameters():
        params.append(param.data.view(-1))
    params_flatten = torch.cat(params)
    return params_flatten


def update_model(model, new_params):
    index = 0
    for params in model.parameters():
        params_length = len(params.view(-1))
        new_param = new_params[index: index + params_length]
        new_param = new_param.view(params.size())
        params.data.copy_(new_param)
        index += params_length


def kl_divergence(new_actor, old_actor, states):
    mu, std, logstd = new_actor(torch.Tensor(states))
    mu_old, std_old, logstd_old = old_actor(torch.Tensor(states))
    mu_old = mu_old.detach()
    std_old = std_old.detach()
    logstd_old = logstd_old.detach()

    # KL divergence between the old policy and the new policy: D( pi_old || pi_new )
    # pi_old -> mu_old, logstd_old, std_old / pi_new -> mu, logstd, std
    # Be careful when computing the KL divergence: it is not a symmetric metric.
    kl = logstd_old - logstd + (std_old.pow(2) + (mu_old - mu).pow(2)) / \
         (2.0 * std.pow(2)) - 0.5
    return kl.sum(1, keepdim=True)
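As an illustrative aside (not part of the repository), log_density should agree with PyTorch's built-in Normal distribution, since it is the diagonal-Gaussian log-likelihood -(x - mu)^2 / (2 * std^2) - log(std) - 0.5 * log(2 * pi), summed over action dimensions:

import torch
from torch.distributions import Normal

from utils import log_density

mu = torch.zeros(4, 3)
logstd = torch.zeros(4, 3)          # std = 1, as produced by the Actor above
std = torch.exp(logstd)
x = torch.randn(4, 3)

ours = log_density(x, mu, std, logstd)                        # shape (4, 1)
reference = Normal(mu, std).log_prob(x).sum(1, keepdim=True)

print(torch.allclose(ours, reference))                        # True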

--------------------------------------------------------------------------------
/vanila_pg.py:
--------------------------------------------------------------------------------
import numpy as np
import torch
from hparams import HyperParams as hp
from utils import log_density


def get_returns(rewards, masks):
    rewards = torch.Tensor(rewards)
    masks = torch.Tensor(masks)
    returns = torch.zeros_like(rewards)

    running_returns = 0

    for t in reversed(range(0, len(rewards))):
        running_returns = rewards[t] + hp.gamma * running_returns * masks[t]
        returns[t] = running_returns

    returns = (returns - returns.mean()) / returns.std()
    return returns


def get_loss(actor, returns, states, actions):
    mu, std, logstd = actor(torch.Tensor(states))
    log_policy = log_density(torch.Tensor(actions), mu, std, logstd)
    returns = returns.unsqueeze(1)

    objective = returns * log_policy
    objective = objective.mean()
    return -objective


def train_critic(critic, states, returns, critic_optim):
    criterion = torch.nn.MSELoss()
    n = len(states)
    arr = np.arange(n)

    for epoch in range(5):
        np.random.shuffle(arr)

        for i in range(n // hp.batch_size):
            batch_index = arr[hp.batch_size * i: hp.batch_size * (i + 1)]
            batch_index = torch.LongTensor(batch_index)
            inputs = torch.Tensor(states)[batch_index]
            target = returns.unsqueeze(1)[batch_index]

            values = critic(inputs)
            loss = criterion(values, target)
            critic_optim.zero_grad()
            loss.backward()
            critic_optim.step()


def train_actor(actor, returns, states, actions, actor_optim):
    loss = get_loss(actor, returns, states, actions)
    actor_optim.zero_grad()
    loss.backward()
    actor_optim.step()


def train_model(actor, critic, memory, actor_optim, critic_optim):
    memory = np.array(memory)
    states = np.vstack(memory[:, 0])
    actions = list(memory[:, 1])
    rewards = list(memory[:, 2])
    masks = list(memory[:, 3])

    returns = get_returns(rewards, masks)
    train_critic(critic, states, returns, critic_optim)
    train_actor(actor, returns, states, actions, actor_optim)
    return returns

--------------------------------------------------------------------------------
/ppo.py:
--------------------------------------------------------------------------------
import numpy as np
from utils import *
from hparams import HyperParams as hp


def get_gae(rewards, masks, values):
    rewards = torch.Tensor(rewards)
    masks = torch.Tensor(masks)
    returns = torch.zeros_like(rewards)
    advants = torch.zeros_like(rewards)

    running_returns = 0
    previous_value = 0
    running_advants = 0

    for t in reversed(range(0, len(rewards))):
        running_returns = rewards[t] + hp.gamma * running_returns * masks[t]
        running_tderror = rewards[t] + hp.gamma * previous_value * masks[t] - \
                          values.data[t]
        running_advants = running_tderror + hp.gamma * hp.lamda * \
                          running_advants * masks[t]

        returns[t] = running_returns
        previous_value = values.data[t]
        advants[t] = running_advants

    advants = (advants - advants.mean()) / advants.std()
    return returns, advants
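# Note (explanatory comment added here, not in the original file): get_gae
# computes GAE(gamma, lambda) advantages. With the TD error
#     delta_t = r_t + gamma * V(s_{t+1}) - V(s_t),
# the backward recursion above accumulates
#     A_t = delta_t + gamma * lambda * A_{t+1},
# i.e. A_t = sum_l (gamma * lambda)^l * delta_{t+l}, with `masks` zeroing the
# bootstrap term across episode boundaries. `returns` are the discounted
# returns used as critic regression targets; the advantages are standardized
# before being returned.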

def surrogate_loss(actor, advants, states, old_policy, actions, index):
    mu, std, logstd = actor(torch.Tensor(states))
    new_policy = log_density(actions, mu, std, logstd)
    old_policy = old_policy[index]

    ratio = torch.exp(new_policy - old_policy)
    surrogate = ratio * advants
    return surrogate, ratio


def train_model(actor, critic, memory, actor_optim, critic_optim):
    memory = np.array(memory)
    states = np.vstack(memory[:, 0])
    actions = list(memory[:, 1])
    rewards = list(memory[:, 2])
    masks = list(memory[:, 3])
    values = critic(torch.Tensor(states))

    # ----------------------------
    # step 1: get returns and GAEs and log probability of old policy
    returns, advants = get_gae(rewards, masks, values)
    mu, std, logstd = actor(torch.Tensor(states))
    old_policy = log_density(torch.Tensor(actions), mu, std, logstd)

    criterion = torch.nn.MSELoss()
    n = len(states)
    arr = np.arange(n)

    # ----------------------------
    # step 2: get value loss and actor loss and update actor & critic
    for epoch in range(10):
        np.random.shuffle(arr)

        for i in range(n // hp.batch_size):
            batch_index = arr[hp.batch_size * i: hp.batch_size * (i + 1)]
            batch_index = torch.LongTensor(batch_index)
            inputs = torch.Tensor(states)[batch_index]
            returns_samples = returns.unsqueeze(1)[batch_index]
            advants_samples = advants.unsqueeze(1)[batch_index]
            actions_samples = torch.Tensor(actions)[batch_index]

            loss, ratio = surrogate_loss(actor, advants_samples, inputs,
                                         old_policy.detach(), actions_samples,
                                         batch_index)

            values = critic(inputs)
            critic_loss = criterion(values, returns_samples)
            critic_optim.zero_grad()
            critic_loss.backward()
            critic_optim.step()

            clipped_ratio = torch.clamp(ratio,
                                        1.0 - hp.clip_param,
                                        1.0 + hp.clip_param)
            clipped_loss = clipped_ratio * advants_samples
            actor_loss = -torch.min(loss, clipped_loss).mean()

            actor_optim.zero_grad()
            actor_loss.backward()
            actor_optim.step()
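An illustrative aside (not part of the repository): with a positive advantage, the clipped objective above caps how much a probability ratio outside [1 - clip_param, 1 + clip_param] can improve the surrogate. A minimal sketch with hand-picked ratios and clip_param = 0.2 as in hparams.py:

import torch

clip_param = 0.2                                 # same value as HyperParams.clip_param
ratio = torch.tensor([0.5, 1.0, 1.5])            # exp(new_log_prob - old_log_prob)
advantage = torch.tensor([1.0, 1.0, 1.0])

unclipped = ratio * advantage
clipped = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage
objective = torch.min(unclipped, clipped)

print(objective)   # tensor([0.5000, 1.0000, 1.2000]) -> gains are capped at 1 + clip_param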

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import gym
import torch
import argparse
import numpy as np
import torch.optim as optim
from model import Actor, Critic
from utils import get_action
from collections import deque
from running_state import ZFilter
from hparams import HyperParams as hp
import matplotlib.pyplot as plt


parser = argparse.ArgumentParser()
parser.add_argument('--algorithm', type=str, default='PPO',
                    help='select one of the algorithms: PG, NPG, TRPO, PPO')
parser.add_argument('--env', type=str, default="Humanoid-v2",
                    help='name of the MuJoCo environment')
parser.add_argument('--render', default=False)
args = parser.parse_args()

if args.algorithm == "PG":
    from vanila_pg import train_model
elif args.algorithm == "NPG":
    from npg import train_model
elif args.algorithm == "TRPO":
    from trpo import train_model
elif args.algorithm == "PPO":
    from ppo import train_model


if __name__ == "__main__":
    # you can choose other environments.
    # possible environments: Ant-v2, HalfCheetah-v2, Hopper-v2, Humanoid-v2,
    # HumanoidStandup-v2, InvertedPendulum-v2, Reacher-v2, Swimmer-v2, Walker2d-v2
    env = gym.make(args.env)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions)
    critic = Critic(num_inputs)

    actor_optim = optim.Adam(actor.parameters(), lr=hp.actor_lr)
    critic_optim = optim.Adam(critic.parameters(), lr=hp.critic_lr,
                              weight_decay=hp.l2_rate)

    running_state = ZFilter((num_inputs,), clip=5)
    episodes = 0
    xar = []
    yar = []
    for iter in range(50):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []
        while steps < 2048:
            episodes += 1
            state = env.reset()
            state = running_state(state)
            score = 0
            for _ in range(10000):
                if episodes % 50 == 0:
                    env.render()

                steps += 1
                mu, std, _ = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                next_state = running_state(next_state)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, reward, mask])

                score += reward
                state = next_state

                if done:
                    break
            scores.append(score)

        score_avg = np.mean(scores)
        print('{} episode score is {:.2f}'.format(episodes, score_avg))

        # open in append mode so every iteration is logged, not only the last one;
        # the `with` block closes the file automatically
        with open('reward per iter.txt', 'a') as file:
            file.write(str(episodes) + "," + str(score_avg))
            file.write("\n")
        xar.append(int(episodes))
        yar.append(int(score_avg))

        actor.train(), critic.train()
        train_model(actor, critic, memory, actor_optim, critic_optim)

    def plotting():
        plt.plot(xar, yar, linewidth=3)
        plt.title("Avg score/Episodes", fontsize=19)
        plt.xlabel("Episodes", fontsize=10)
        plt.ylabel("Avg score", fontsize=10)
        plt.tick_params(axis='both', labelsize=9)
        plt.show()

    plotting()
    print(xar, '\n', yar)
    # env.render()

--------------------------------------------------------------------------------
/npg.py:
--------------------------------------------------------------------------------
import numpy as np
from utils import *
from hparams import HyperParams as hp


def get_returns(rewards, masks):
    rewards = torch.Tensor(rewards)
    masks = torch.Tensor(masks)
    returns = torch.zeros_like(rewards)

    running_returns = 0

    for t in reversed(range(0, len(rewards))):
        running_returns = rewards[t] + hp.gamma * running_returns * masks[t]
        returns[t] = running_returns

    returns = (returns - returns.mean()) / returns.std()
    return returns


def get_loss(actor, returns, states, actions):
    mu, std, logstd = actor(torch.Tensor(states))
    log_policy = log_density(torch.Tensor(actions), mu, std, logstd)
    returns = returns.unsqueeze(1)

    objective = returns * log_policy
    objective = objective.mean()
    return objective


def train_critic(critic, states, returns, critic_optim):
    criterion = torch.nn.MSELoss()
    n = len(states)
    arr = np.arange(n)

    for epoch in range(5):
        np.random.shuffle(arr)

        for i in range(n // hp.batch_size):
            batch_index = arr[hp.batch_size * i: hp.batch_size * (i + 1)]
            batch_index = torch.LongTensor(batch_index)
            inputs = torch.Tensor(states)[batch_index]
            target = returns.unsqueeze(1)[batch_index]

            values = critic(inputs)
            loss = criterion(values, target)
            critic_optim.zero_grad()
            loss.backward()
            critic_optim.step()


def fisher_vector_product(actor, states, p):
    p = p.detach()
    kl = kl_divergence(new_actor=actor, old_actor=actor, states=states)
    kl = kl.mean()
    kl_grad = torch.autograd.grad(kl, actor.parameters(), create_graph=True)
    kl_grad = flat_grad(kl_grad)  # check kl_grad == 0

    kl_grad_p = (kl_grad * p).sum()
    kl_hessian_p = torch.autograd.grad(kl_grad_p, actor.parameters())
    kl_hessian_p = flat_hessian(kl_hessian_p)

    return kl_hessian_p + 0.1 * p


# from the OpenAI Baselines code
# https://github.com/openai/baselines/blob/master/baselines/common/cg.py
def conjugate_gradient(actor, states, b, nsteps, residual_tol=1e-10):
    x = torch.zeros(b.size())
    r = b.clone()
    p = b.clone()
    rdotr = torch.dot(r, r)
    for i in range(nsteps):
        _Avp = fisher_vector_product(actor, states, p)
        alpha = rdotr / torch.dot(p, _Avp)
        x += alpha * p
        r -= alpha * _Avp
        new_rdotr = torch.dot(r, r)
        beta = new_rdotr / rdotr
        p = r + beta * p
        rdotr = new_rdotr
        if rdotr < residual_tol:
            break
    return x


def train_model(actor, critic, memory, actor_optim, critic_optim):
    memory = np.array(memory)
    states = np.vstack(memory[:, 0])
    actions = list(memory[:, 1])
    rewards = list(memory[:, 2])
    masks = list(memory[:, 3])

    # ----------------------------
    # step 1: get returns
    returns = get_returns(rewards, masks)

    # ----------------------------
    # step 2: train critic several steps with respect to returns
    train_critic(critic, states, returns, critic_optim)

    # ----------------------------
    # step 3: get gradient of loss and hessian of kl
    loss = get_loss(actor, returns, states, actions)
    loss_grad = torch.autograd.grad(loss, actor.parameters())
    loss_grad = flat_grad(loss_grad)
    step_dir = conjugate_gradient(actor, states, loss_grad.data, nsteps=10)

    # ----------------------------
    # step 4: get step direction and step size and update actor
    params = flat_params(actor)
    new_params = params + 0.5 * step_dir
    update_model(actor, new_params)
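An illustrative aside (not part of the repository): conjugate_gradient approximately solves H x = g using only matrix-vector products, so the Fisher matrix H never has to be formed. The same iteration, sketched standalone with an explicit 2x2 symmetric positive-definite matrix standing in for fisher_vector_product:

import torch

A = torch.tensor([[4.0, 1.0], [1.0, 3.0]])    # stand-in for the damped Fisher matrix
b = torch.tensor([1.0, 2.0])                  # stand-in for the policy gradient

x = torch.zeros(2)
r = b.clone()
p = b.clone()
rdotr = torch.dot(r, r)
for _ in range(10):
    Avp = A @ p       # in npg.py / trpo.py this is fisher_vector_product(actor, states, p)
    alpha = rdotr / torch.dot(p, Avp)
    x += alpha * p
    r -= alpha * Avp
    new_rdotr = torch.dot(r, r)
    p = r + (new_rdotr / rdotr) * p
    rdotr = new_rdotr
    if rdotr < 1e-10:
        break

print(x)                       # ~ tensor([0.0909, 0.6364])
print(torch.inverse(A) @ b)    # matches the direct solve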

--------------------------------------------------------------------------------
/trpo.py:
--------------------------------------------------------------------------------
import numpy as np
from utils import *
from hparams import HyperParams as hp
from model import Actor


def get_gae(rewards, masks, values):
    rewards = torch.Tensor(rewards)
    masks = torch.Tensor(masks)
    returns = torch.zeros_like(rewards)
    advants = torch.zeros_like(rewards)

    running_returns = 0
    previous_value = 0
    running_advants = 0

    for t in reversed(range(0, len(rewards))):
        running_returns = rewards[t] + hp.gamma * running_returns * masks[t]
        running_tderror = rewards[t] + hp.gamma * previous_value * masks[t] - \
                          values.data[t]
        running_advants = running_tderror + hp.gamma * hp.lamda * \
                          running_advants * masks[t]

        returns[t] = running_returns
        previous_value = values.data[t]
        advants[t] = running_advants

    advants = (advants - advants.mean()) / advants.std()
    return returns, advants


def surrogate_loss(actor, advants, states, old_policy, actions):
    mu, std, logstd = actor(torch.Tensor(states))
    new_policy = log_density(torch.Tensor(actions), mu, std, logstd)
    advants = advants.unsqueeze(1)

    surrogate = advants * torch.exp(new_policy - old_policy)
    surrogate = surrogate.mean()
    return surrogate


def train_critic(critic, states, returns, advants, critic_optim):
    criterion = torch.nn.MSELoss()
    n = len(states)
    arr = np.arange(n)

    for epoch in range(5):
        np.random.shuffle(arr)

        for i in range(n // hp.batch_size):
            batch_index = arr[hp.batch_size * i: hp.batch_size * (i + 1)]
            batch_index = torch.LongTensor(batch_index)
            inputs = torch.Tensor(states)[batch_index]
            target1 = returns.unsqueeze(1)[batch_index]
            target2 = advants.unsqueeze(1)[batch_index]

            values = critic(inputs)
            loss = criterion(values, target1 + target2)
            critic_optim.zero_grad()
            loss.backward()
            critic_optim.step()


def fisher_vector_product(actor, states, p):
    p = p.detach()
    kl = kl_divergence(new_actor=actor, old_actor=actor, states=states)
    kl = kl.mean()
    kl_grad = torch.autograd.grad(kl, actor.parameters(), create_graph=True)
    kl_grad = flat_grad(kl_grad)  # check kl_grad == 0

    kl_grad_p = (kl_grad * p).sum()
    kl_hessian_p = torch.autograd.grad(kl_grad_p, actor.parameters())
    kl_hessian_p = flat_hessian(kl_hessian_p)

    return kl_hessian_p + 0.1 * p


# from the OpenAI Baselines code
# https://github.com/openai/baselines/blob/master/baselines/common/cg.py
def conjugate_gradient(actor, states, b, nsteps, residual_tol=1e-10):
    x = torch.zeros(b.size())
    r = b.clone()
    p = b.clone()
    rdotr = torch.dot(r, r)
    for i in range(nsteps):
        _Avp = fisher_vector_product(actor, states, p)
        alpha = rdotr / torch.dot(p, _Avp)
        x += alpha * p
        r -= alpha * _Avp
        new_rdotr = torch.dot(r, r)
        beta = new_rdotr / rdotr
        p = r + beta * p
        rdotr = new_rdotr
        if rdotr < residual_tol:
            break
    return x


def train_model(actor, critic, memory, actor_optim, critic_optim):
    memory = np.array(memory)
    states = np.vstack(memory[:, 0])
    actions = list(memory[:, 1])
    rewards = list(memory[:, 2])
    masks = list(memory[:, 3])
    values = critic(torch.Tensor(states))

    # ----------------------------
    # step 1: get returns and GAEs
    returns, advants = get_gae(rewards, masks, values)

    # ----------------------------
    # step 2: train critic several steps with respect to returns
    train_critic(critic, states, returns, advants, critic_optim)

    # ----------------------------
    # step 3: get gradient of loss and hessian of kl
    mu, std, logstd = actor(torch.Tensor(states))
    old_policy = log_density(torch.Tensor(actions), mu, std, logstd)

    loss = surrogate_loss(actor, advants, states, old_policy.detach(), actions)
    loss_grad = torch.autograd.grad(loss, actor.parameters())
    loss_grad = flat_grad(loss_grad)
    step_dir = conjugate_gradient(actor, states, loss_grad.data, nsteps=10)
    loss = loss.data.numpy()

    # ----------------------------
    # step 4: get step direction and step size and full step
    params = flat_params(actor)
    shs = 0.5 * (step_dir * fisher_vector_product(actor, states, step_dir)
                 ).sum(0, keepdim=True)
    step_size = 1 / torch.sqrt(shs / hp.max_kl)[0]
    full_step = step_size * step_dir
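    # Note (explanatory comment added here, not in the original file): with
    # shs = 0.5 * s^T H s for the search direction s, the step size above is
    #     1 / sqrt(shs / max_kl) = sqrt(max_kl / shs) = sqrt(2 * max_kl / (s^T H s)),
    # i.e. the largest scaling beta of s for which the quadratic KL estimate
    # 0.5 * (beta * s)^T H (beta * s) stays within hp.max_kl.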

    # ----------------------------
    # step 5: do backtracking line search n times
    old_actor = Actor(actor.num_inputs, actor.num_outputs)
    update_model(old_actor, params)
    expected_improve = (loss_grad * full_step).sum(0, keepdim=True)
    expected_improve = expected_improve.data.numpy()

    flag = False
    fraction = 1.0
    for i in range(10):
        new_params = params + fraction * full_step
        update_model(actor, new_params)
        new_loss = surrogate_loss(actor, advants, states, old_policy.detach(),
                                  actions)
        new_loss = new_loss.data.numpy()
        loss_improve = new_loss - loss
        expected_improve *= fraction
        kl = kl_divergence(new_actor=actor, old_actor=old_actor, states=states)
        kl = kl.mean()

        print('kl: {:.4f} loss improve: {:.4f} expected improve: {:.4f} '
              'number of line search: {}'
              .format(kl.data.numpy(), loss_improve, expected_improve[0], i))

        # see https://en.wikipedia.org/wiki/Backtracking_line_search
        if kl < hp.max_kl and (loss_improve / expected_improve) > 0.5:
            flag = True
            break

        fraction *= 0.5

    if not flag:
        params = flat_params(old_actor)
        update_model(actor, params)
        print('policy update does not improve the surrogate')

--------------------------------------------------------------------------------