├── .vscode ├── .ropeproject │ ├── config.py │ └── objectdb └── settings.json ├── A2C └── advantage_actor_critic.py ├── A3C ├── SharedAdam.py ├── __pycache__ │ ├── SharedAdam.cpython-37.pyc │ └── utils.cpython-37.pyc ├── a3c_cartpole.py └── utils.py ├── AC └── actor_critic.py ├── ACER └── acer_cartpole.py ├── DDPG └── ddpg.py ├── DSAC └── distributional_sac_discrete.py ├── ICM_PPO └── icm.py ├── PPO_CLIP ├── gae_ppo_cartpole.py ├── ppo_cartpole.py └── ppo_pendulum.py ├── REINFORCE └── reinforce.py ├── RND_PPO └── rnd.py ├── Readme.md ├── SAC ├── sac.py └── sac_discrete.py ├── TD3 └── td3.py └── TRPO └── trpo_gae.py /.vscode/.ropeproject/config.py: -------------------------------------------------------------------------------- 1 | # The default ``config.py`` 2 | # flake8: noqa 3 | 4 | 5 | def set_prefs(prefs): 6 | """This function is called before opening the project""" 7 | 8 | # Specify which files and folders to ignore in the project. 9 | # Changes to ignored resources are not added to the history and 10 | # VCSs. Also they are not returned in `Project.get_files()`. 11 | # Note that ``?`` and ``*`` match all characters but slashes. 12 | # '*.pyc': matches 'test.pyc' and 'pkg/test.pyc' 13 | # 'mod*.pyc': matches 'test/mod1.pyc' but not 'mod/1.pyc' 14 | # '.svn': matches 'pkg/.svn' and all of its children 15 | # 'build/*.o': matches 'build/lib.o' but not 'build/sub/lib.o' 16 | # 'build//*.o': matches 'build/lib.o' and 'build/sub/lib.o' 17 | prefs['ignored_resources'] = ['*.pyc', '*~', '.ropeproject', 18 | '.hg', '.svn', '_svn', '.git', '.tox'] 19 | 20 | # Specifies which files should be considered python files. It is 21 | # useful when you have scripts inside your project. Only files 22 | # ending with ``.py`` are considered to be python files by 23 | # default. 24 | # prefs['python_files'] = ['*.py'] 25 | 26 | # Custom source folders: By default rope searches the project 27 | # for finding source folders (folders that should be searched 28 | # for finding modules). You can add paths to that list. Note 29 | # that rope guesses project source folders correctly most of the 30 | # time; use this if you have any problems. 31 | # The folders should be relative to project root and use '/' for 32 | # separating folders regardless of the platform rope is running on. 33 | # 'src/my_source_folder' for instance. 34 | # prefs.add('source_folders', 'src') 35 | 36 | # You can extend python path for looking up modules 37 | # prefs.add('python_path', '~/python/') 38 | 39 | # Should rope save object information or not. 40 | prefs['save_objectdb'] = True 41 | prefs['compress_objectdb'] = False 42 | 43 | # If `True`, rope analyzes each module when it is being saved. 44 | prefs['automatic_soa'] = True 45 | # The depth of calls to follow in static object analysis 46 | prefs['soa_followed_calls'] = 0 47 | 48 | # If `False` when running modules or unit tests "dynamic object 49 | # analysis" is turned off. This makes them much faster. 50 | prefs['perform_doa'] = True 51 | 52 | # Rope can check the validity of its object DB when running. 53 | prefs['validate_objectdb'] = True 54 | 55 | # How many undos to hold? 56 | prefs['max_history_items'] = 32 57 | 58 | # Shows whether to save history across sessions. 59 | prefs['save_history'] = True 60 | prefs['compress_history'] = False 61 | 62 | # Set the number spaces used for indenting. According to 63 | # :PEP:`8`, it is best to use 4 spaces. Since most of rope's 64 | # unit-tests use 4 spaces it is more reliable, too. 
65 | prefs['indent_size'] = 4 66 | 67 | # Builtin and c-extension modules that are allowed to be imported 68 | # and inspected by rope. 69 | prefs['extension_modules'] = [] 70 | 71 | # Add all standard c-extensions to extension_modules list. 72 | prefs['import_dynload_stdmods'] = True 73 | 74 | # If `True`, modules with syntax errors are considered to be empty. 75 | # The default value is `False`; when `False`, syntax errors raise a 76 | # `rope.base.exceptions.ModuleSyntaxError` exception. 77 | prefs['ignore_syntax_errors'] = False 78 | 79 | # If `True`, rope ignores unresolvable imports. Otherwise, they 80 | # appear in the importing namespace. 81 | prefs['ignore_bad_imports'] = False 82 | 83 | # If `True`, rope will insert new module imports as 84 | # `from <package> import <module>` by default. 85 | prefs['prefer_module_from_imports'] = False 86 | 87 | # If `True`, rope will transform a comma list of imports into 88 | # multiple separate import statements when organizing 89 | # imports. 90 | prefs['split_imports'] = False 91 | 92 | # If `True`, rope will remove all top-level import statements and 93 | # reinsert them at the top of the module when making changes. 94 | prefs['pull_imports_to_top'] = True 95 | 96 | # If `True`, rope will sort imports alphabetically by module name instead 97 | # of alphabetically by import statement, with from imports after normal 98 | # imports. 99 | prefs['sort_imports_alphabetically'] = False 100 | 101 | # Location of implementation of 102 | # rope.base.oi.type_hinting.interfaces.ITypeHintingFactory. In the general 103 | # case, you don't have to change this value, unless you're a rope expert. 104 | # Change this value to inject your own implementations of interfaces 105 | # listed in module rope.base.oi.type_hinting.providers.interfaces 106 | # For example, you can add your own providers for Django Models, or disable 107 | # the search type-hinting in a class hierarchy, etc. 108 | prefs['type_hinting_factory'] = ( 109 | 'rope.base.oi.type_hinting.factory.default_type_hinting_factory') 110 | 111 | 112 | def project_opened(project): 113 | """This function is called after opening the project""" 114 | # Do whatever you like here! 
115 | -------------------------------------------------------------------------------- /.vscode/.ropeproject/objectdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deligentfool/policy_based_RL/d3332680fc8f09e864b113616faa8954705c6aea/.vscode/.ropeproject/objectdb -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "E:\\Anaconda3\\python.exe" 3 | } -------------------------------------------------------------------------------- /A2C/advantage_actor_critic.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import random 5 | import numpy as np 6 | import gym 7 | from torch.distributions import Categorical 8 | from torch.utils.tensorboard import SummaryWriter 9 | 10 | 11 | class policy_net(nn.Module): 12 | def __init__(self, input_dim, output_dim): 13 | super(policy_net, self).__init__() 14 | self.input_dim = input_dim 15 | self.output_dim = output_dim 16 | 17 | self.fc1 = nn.Linear(self.input_dim, 128) 18 | self.fc2 = nn.Linear(128, 128) 19 | self.fc3 = nn.Linear(128, self.output_dim) 20 | 21 | self.rewards = [] 22 | self.log_probs = [] 23 | 24 | def forward(self, input): 25 | x = self.fc1(input) 26 | x = F.relu(x) 27 | x = self.fc2(x) 28 | x = F.relu(x) 29 | x = self.fc3(x) 30 | return F.softmax(x, 1) 31 | 32 | def act(self, input): 33 | probs = self.forward(input) 34 | dist = Categorical(probs=probs) 35 | action = dist.sample() 36 | log_prob = dist.log_prob(action) 37 | self.log_probs.append(log_prob) 38 | return action[0].item() 39 | 40 | 41 | class value_net(nn.Module): 42 | def __init__(self, input_dim, output_dim): 43 | super(value_net, self).__init__() 44 | self.input_dim = input_dim 45 | self.output_dim = output_dim 46 | 47 | self.fc1 = nn.Linear(self.input_dim, 128) 48 | self.fc2 = nn.Linear(128, 128) 49 | self.fc3 = nn.Linear(128, self.output_dim) 50 | 51 | def forward(self, input): 52 | x = self.fc1(input) 53 | x = F.relu(x) 54 | x = self.fc2(x) 55 | x = F.relu(x) 56 | x = self.fc3(x) 57 | return x 58 | 59 | 60 | class advantage_actor_critic(object): 61 | def __init__(self, env, gamma, learning_rate, episode, render): 62 | self.env = env 63 | self.observation_dim = self.env.observation_space.shape[0] 64 | self.action_dim = self.env.action_space.n 65 | self.gamma = gamma 66 | self.learning_rate = learning_rate 67 | self.episode = episode 68 | self.render = render 69 | self.policy_net = policy_net(self.observation_dim, self.action_dim) 70 | self.value_net = value_net(self.observation_dim, 1) 71 | self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate) 72 | self.value_optimizer = torch.optim.Adam(self.value_net.parameters(), lr=self.learning_rate) 73 | self.total_returns = [] 74 | self.values_buffer = [] 75 | self.writer = SummaryWriter('runs/a2c') 76 | self.weight_reward = None 77 | self.count = 0 78 | 79 | def train(self, ): 80 | total_returns = torch.FloatTensor(self.total_returns).unsqueeze(1).detach() 81 | values = torch.cat(self.values_buffer, 0) 82 | delta = (total_returns - values).squeeze(1) 83 | log_probs = torch.cat(self.policy_net.log_probs, 0) 84 | 85 | policy_loss = (- log_probs * delta.detach()) 86 | policy_loss = policy_loss.sum() 87 | self.writer.add_scalar('policy_loss', policy_loss, 
self.count) 88 | self.policy_optimizer.zero_grad() 89 | policy_loss.backward(retain_graph=True) 90 | torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 0.1) 91 | self.policy_optimizer.step() 92 | 93 | value_loss = delta.pow(2).sum() 94 | self.writer.add_scalar('value_loss', value_loss, self.count) 95 | self.value_optimizer.zero_grad() 96 | value_loss.backward(retain_graph=True) 97 | torch.nn.utils.clip_grad_norm_(self.value_net.parameters(), 0.1) 98 | self.value_optimizer.step() 99 | 100 | def run(self, ): 101 | for i in range(self.episode): 102 | obs = self.env.reset() 103 | total_reward = 0 104 | if self.render: 105 | self.env.render() 106 | while True: 107 | self.values_buffer.append(self.value_net.forward(torch.FloatTensor(np.expand_dims(obs, 0)))) 108 | action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0))) 109 | next_obs, reward, done, info = self.env.step(action) 110 | self.policy_net.rewards.append(reward) 111 | self.count += 1 112 | total_reward += reward 113 | if self.render: 114 | self.env.render() 115 | obs = next_obs 116 | if done: 117 | R = 0 118 | if self.weight_reward: 119 | self.weight_reward = 0.99 * self.weight_reward + 0.01 * total_reward 120 | else: 121 | self.weight_reward = total_reward 122 | for r in reversed(self.policy_net.rewards): 123 | R = R * self.gamma + r 124 | self.total_returns.append(R) 125 | self.total_returns = list(reversed(self.total_returns)) 126 | self.train() 127 | del self.policy_net.rewards[:] 128 | del self.policy_net.log_probs[:] 129 | del self.total_returns[:] 130 | del self.values_buffer[:] 131 | print('episode: {} reward: {:.1f} weight_reward: {:.2f}'.format(i+1, total_reward, self.weight_reward)) 132 | break 133 | 134 | 135 | if __name__ == '__main__': 136 | env = gym.make('CartPole-v0') 137 | env = env.unwrapped 138 | test = advantage_actor_critic(env, gamma=0.99, learning_rate=1e-3, episode=100000, render=False) 139 | test.run() -------------------------------------------------------------------------------- /A3C/SharedAdam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class SharedAdam(torch.optim.Adam): 5 | def __init__(self, 6 | params, 7 | lr=1e-3, 8 | betas=(0.9, 0.9), 9 | eps=1e-8, 10 | weight_decay=0): 11 | super(SharedAdam, self).__init__(params, 12 | lr=lr, 13 | betas=betas, 14 | eps=eps, 15 | weight_decay=weight_decay) 16 | for group in self.param_groups: 17 | for p in group['params']: 18 | state = self.state[p] 19 | state['step'] = 0 20 | state['exp_avg'] = torch.zeros_like(p.data) 21 | state['exp_avg_sq'] = torch.zeros_like(p.data) 22 | 23 | state['exp_avg'].share_memory_() 24 | state['exp_avg_sq'].share_memory_() 25 | -------------------------------------------------------------------------------- /A3C/__pycache__/SharedAdam.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deligentfool/policy_based_RL/d3332680fc8f09e864b113616faa8954705c6aea/A3C/__pycache__/SharedAdam.cpython-37.pyc -------------------------------------------------------------------------------- /A3C/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deligentfool/policy_based_RL/d3332680fc8f09e864b113616faa8954705c6aea/A3C/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /A3C/a3c_cartpole.py: 
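# --- illustrative sketch (not part of the repository) ------------------------
# A minimal, self-contained version of what A2C/advantage_actor_critic.py
# above does at the end of each episode: discounted Monte-Carlo returns are
# accumulated backwards through the reward list, and the advantage is the gap
# between those returns and the critic's value estimates.  The reward and
# value numbers below are dummies chosen only for illustration.
import torch

gamma = 0.99
rewards = [1.0, 1.0, 1.0, 1.0]                  # one episode of rewards
values = torch.tensor([3.4, 2.7, 1.9, 1.0])     # stand-in critic outputs V(s_t)

returns, R = [], 0.0
for r in reversed(rewards):                     # accumulate from the last step
    R = r + gamma * R
    returns.append(R)
returns.reverse()                               # restore time order
returns = torch.tensor(returns)

advantage = returns - values                    # scales the -log pi(a|s) terms
print(returns, advantage)
# --- end of sketch ------------------------------------------------------------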
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import gym 6 | import torch.multiprocessing as mp 7 | import os 8 | from utils import pull_and_push, record, set_init 9 | from SharedAdam import SharedAdam 10 | os.environ["OMP_NUM_THREADS"] = "1" 11 | 12 | 13 | class Net(nn.Module): 14 | def __init__(self, action_d, observation_d): 15 | super(Net, self).__init__() 16 | self.action_d = action_d 17 | self.observation_d = observation_d 18 | self.policy_layer_1 = nn.Linear(self.observation_d, 256) 19 | self.policy_layer_2 = nn.Linear(256, self.action_d) 20 | self.value_layer_1 = nn.Linear(self.observation_d, 256) 21 | self.value_layer_2 = nn.Linear(256, 1) 22 | set_init([ 23 | self.policy_layer_1, self.policy_layer_2, self.value_layer_1, 24 | self.value_layer_2 25 | ]) 26 | self.distribution = torch.distributions.Categorical 27 | 28 | def forward(self, x): 29 | pl_1 = F.relu6(self.policy_layer_1(x)) 30 | policy = F.softmax(self.policy_layer_2(pl_1), dim=1) 31 | vl_1 = F.relu6(self.value_layer_1(x)) 32 | value = self.value_layer_2(vl_1) 33 | return policy, value 34 | 35 | def choose_action(self, s): 36 | self.eval() 37 | prob, _ = self.forward(s) 38 | prob = prob.data 39 | m = self.distribution(prob) 40 | return m.sample().numpy()[0] 41 | 42 | def loss_func(self, s, a, value_target): 43 | self.train() 44 | prob, value = self.forward(s) 45 | td_error = value_target - value 46 | critic_loss = td_error.pow(2) 47 | 48 | m = self.distribution(prob) 49 | log_pob = m.log_prob(a) 50 | exp_v = log_pob * td_error.detach().squeeze() 51 | actor_loss = -exp_v 52 | loss = (critic_loss + actor_loss).mean() 53 | return loss 54 | 55 | 56 | class Worker(mp.Process): 57 | def __init__(self, global_net, optimizer, global_episode_counter, 58 | global_reward, res_queue, name, max_episode, 59 | update_global_iteration, gamma): 60 | super(Worker, self).__init__() 61 | self.name = 'w' + name 62 | self.global_episode_counter = global_episode_counter 63 | self.global_reward = global_reward 64 | self.res_queue = res_queue 65 | self.global_net = global_net 66 | self.optimizer = optimizer 67 | self.max_episode = max_episode 68 | self.update_global_iteration = update_global_iteration 69 | self.gamma = gamma 70 | self.env = gym.make('CartPole-v0') 71 | self.env = self.env.unwrapped 72 | self.action_d = env.action_space.n 73 | self.observation_d = env.observation_space.shape[0] 74 | self.local_net = Net(self.action_d, self.observation_d) 75 | 76 | def run(self): 77 | total_step = 1 78 | while self.global_episode_counter.value < self.max_episode: 79 | s = self.env.reset() 80 | buffer_s, buffer_a, buffer_r = [], [], [] 81 | episode_reward = 0 82 | while True: 83 | if self.name == 'w0': 84 | self.env.render() 85 | a = self.local_net.choose_action( 86 | torch.Tensor(s).view(-1, self.observation_d)) 87 | s_, r, done, _ = self.env.step(a) 88 | if done: 89 | r = -1 90 | episode_reward += r 91 | buffer_a.append(a) 92 | buffer_r.append(r) 93 | buffer_s.append(s) 94 | if total_step % self.update_global_iteration == 0 or done: 95 | # sync 96 | pull_and_push(self.optimizer, self.local_net, 97 | self.global_net, done, s_, buffer_s, 98 | buffer_a, buffer_r, self.gamma) 99 | buffer_s, buffer_a, buffer_r = [], [], [] 100 | if done: 101 | # record 102 | record(self.global_episode_counter, self.global_reward, 103 | episode_reward, self.res_queue, self.name) 104 | break 105 | s = s_ 106 | total_step += 1 107 | 
self.res_queue.put(None) 108 | 109 | 110 | if __name__ == '__main__': 111 | env = gym.make('CartPole-v0') 112 | action_d = env.action_space.n 113 | observation_d = env.observation_space.shape[0] 114 | global_net = Net(action_d, observation_d) 115 | optimizer = SharedAdam(global_net.parameters(), lr=0.0001) 116 | global_episode_counter, global_reward, res_queue = mp.Value( 117 | 'i', 0), mp.Value('d', 0.), mp.Queue() 118 | workers = [ 119 | Worker(global_net, 120 | optimizer, global_episode_counter, global_reward, res_queue, 121 | str(i), 10000, 10, 0.9) for i in range(mp.cpu_count()) 122 | ] 123 | [worker.start() for worker in workers] 124 | res = [] 125 | while True: 126 | r = res_queue.get() 127 | if r is not None: 128 | res.append(r) 129 | else: 130 | break 131 | [worker.join() for worker in workers] 132 | 133 | 134 | import matplotlib.pyplot as plt 135 | plt.plot(res) 136 | plt.ylabel('Moving average ep reward') 137 | plt.xlabel('Step') 138 | plt.show() -------------------------------------------------------------------------------- /A3C/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import torch.nn as nn 4 | 5 | 6 | def pull_and_push(opt, local_net, global_net, done, s_, b_s, b_a, b_r, gamma): 7 | if done: 8 | v_s_ = 0 9 | else: 10 | _, v_s_ = local_net.forward(torch.Tensor([s_])) 11 | v_s_ = v_s_.data.numpy()[0, 0] 12 | buffer_v_target = [] 13 | for r in reversed(b_r): 14 | v_s_ = r + gamma * v_s_ 15 | buffer_v_target.append(v_s_) 16 | buffer_v_target.reverse() 17 | 18 | loss = local_net.loss_func(torch.Tensor(b_s), 19 | torch.Tensor(b_a).view(-1, 1), 20 | torch.Tensor(buffer_v_target).view(-1, 1)) 21 | opt.zero_grad() 22 | loss.backward() 23 | for l_p, g_p in zip(local_net.parameters(), global_net.parameters()): 24 | g_p._grad = l_p.grad 25 | opt.step() 26 | local_net.load_state_dict(global_net.state_dict()) 27 | 28 | 29 | def record(global_ep, global_ep_r, ep_r, res_queue, name): 30 | with global_ep.get_lock(): 31 | global_ep.value += 1 32 | with global_ep_r.get_lock(): 33 | if global_ep_r.value == 0.: 34 | global_ep_r.value = ep_r 35 | else: 36 | global_ep_r.value = global_ep_r.value * 0.99 + ep_r * 0.01 37 | res_queue.put(global_ep_r.value) 38 | print( 39 | name, 40 | "Ep:", 41 | global_ep.value, 42 | "| Ep_r: %.0f" % global_ep_r.value, 43 | ) 44 | 45 | 46 | def set_init(layers): 47 | for layer in layers: 48 | nn.init.normal_(layer.weight, mean=0., std=0.1) 49 | nn.init.constant_(layer.bias, 0.) 
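# --- illustrative sketch (not part of the repository) ------------------------
# The core A3C mechanic in pull_and_push() above: gradients are computed on a
# worker's local copy of the network, copied onto the shared global network,
# applied there, and the fresh global weights are pulled back.  The tiny
# nn.Linear models and the random-input loss below are placeholders, not the
# repo's Net class or loss_func().
import torch
import torch.nn as nn

global_net = nn.Linear(4, 2)
global_net.share_memory()                          # shared across processes
local_net = nn.Linear(4, 2)
local_net.load_state_dict(global_net.state_dict())
opt = torch.optim.Adam(global_net.parameters(), lr=1e-3)  # SharedAdam in the repo

loss = local_net(torch.randn(8, 4)).pow(2).mean()  # stand-in for the A3C loss
opt.zero_grad()
loss.backward()                                    # grads live on local_net
for l_p, g_p in zip(local_net.parameters(), global_net.parameters()):
    g_p._grad = l_p.grad                           # push grads to the global net
opt.step()                                         # update the global parameters
local_net.load_state_dict(global_net.state_dict())  # pull the new weights back
# --- end of sketch ------------------------------------------------------------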
-------------------------------------------------------------------------------- /AC/actor_critic.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.distributions import Categorical 5 | import random 6 | import numpy as np 7 | import gym 8 | from torch.utils.tensorboard import SummaryWriter 9 | 10 | 11 | class policy_net(nn.Module): 12 | def __init__(self, input_dim, output_dim): 13 | super(policy_net, self).__init__() 14 | self.input_dim = input_dim 15 | self.output_dim = output_dim 16 | 17 | self.fc1 = nn.Linear(self.input_dim, 128) 18 | self.fc2 = nn.Linear(128, 128) 19 | self.fc3 = nn.Linear(128, self.output_dim) 20 | 21 | self.log_probs = [] 22 | self.rewards = [] 23 | 24 | def forward(self, input): 25 | x = F.relu(self.fc1(input)) 26 | x = F.relu(self.fc2(x)) 27 | x = self.fc3(x) 28 | return F.softmax(x, 1) 29 | 30 | def act(self, input): 31 | prob = self.forward(input) 32 | dist = Categorical(prob) 33 | action = dist.sample() 34 | log_prob = dist.log_prob(action) 35 | self.log_probs.append(log_prob) 36 | return action.detach().item() 37 | 38 | 39 | class q_value_net(nn.Module): 40 | # * different with A2C, this is a q value network that the input is observation and action 41 | def __init__(self, input1_dim, input2_dim, output_dim): 42 | super(q_value_net, self).__init__() 43 | self.input1_dim = input1_dim 44 | self.input2_dim = input2_dim 45 | self.output_dim = output_dim 46 | 47 | self.fc1 = nn.Linear(self.input1_dim + self.input2_dim, 128) 48 | self.fc2 = nn.Linear(128, 128) 49 | self.fc3 = nn.Linear(128, self.output_dim) 50 | 51 | def forward(self, input1, input2): 52 | x = torch.cat([input1, input2], 1) 53 | x = self.fc1(x) 54 | x = F.relu(x) 55 | x = self.fc2(x) 56 | x = F.relu(x) 57 | x = self.fc3(x) 58 | return x 59 | 60 | 61 | class actor_critic(object): 62 | def __init__(self, env, learning_rate, episode, render): 63 | self.env = env 64 | self.observation_dim = self.env.observation_space.shape[0] 65 | self.action_dim = self.env.action_space.n 66 | self.learning_rate = learning_rate 67 | self.episode = episode 68 | self.render = render 69 | self.policy_net = policy_net(self.observation_dim, self.action_dim) 70 | self.q_value_net = q_value_net(self.observation_dim, self.action_dim, 1) 71 | self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate) 72 | self.value_optimizer = torch.optim.Adam(self.q_value_net.parameters(), lr=self.learning_rate) 73 | self.values_buffer = [] 74 | self.next_observation_buffer = [] 75 | self.writer = SummaryWriter('runs/actor_critic') 76 | self.weight_reward = None 77 | self.count = 0 78 | 79 | def train(self, ): 80 | values = torch.cat(self.values_buffer, 0) 81 | log_probs = torch.cat(self.policy_net.log_probs, 0).unsqueeze(1) 82 | rewards = torch.FloatTensor(self.policy_net.rewards).unsqueeze(1) 83 | next_observation = torch.FloatTensor(self.next_observation_buffer) 84 | 85 | policy_loss = (- log_probs * values.detach()) 86 | policy_loss = policy_loss.sum() 87 | self.writer.add_scalar('policy_loss', policy_loss, self.count) 88 | self.policy_optimizer.zero_grad() 89 | policy_loss.backward(retain_graph=True) 90 | torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 0.1) 91 | self.policy_optimizer.step() 92 | 93 | # * find the max value in all actions 94 | q_stack = None 95 | for action in range(self.action_dim): 96 | action = self.one_hot(action) 97 | action = 
torch.FloatTensor(action) 98 | action = action.expand(values.size(0), 2) 99 | tmp = self.q_value_net.forward(next_observation, action) 100 | if q_stack is None: 101 | q_stack = tmp 102 | else: 103 | q_stack = torch.cat([q_stack, tmp], 1) 104 | q_max = q_stack.max(1)[0].unsqueeze(1) 105 | value_loss = (rewards + q_max - values).pow(2).sum() 106 | self.writer.add_scalar('value_loss', value_loss, self.count) 107 | self.value_optimizer.zero_grad() 108 | value_loss.backward(retain_graph=True) 109 | torch.nn.utils.clip_grad_norm_(self.q_value_net.parameters(), 0.1) 110 | self.value_optimizer.step() 111 | 112 | def one_hot(self, action): 113 | one_hot_action = np.zeros(self.action_dim) 114 | one_hot_action[action] = 1 115 | return one_hot_action 116 | 117 | def run(self, ): 118 | for i in range(self.episode): 119 | obs = self.env.reset() 120 | total_reward = 0 121 | if self.render: 122 | self.env.render() 123 | while True: 124 | action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0))) 125 | self.values_buffer.append(self.q_value_net.forward(torch.FloatTensor(np.expand_dims(obs, 0)), torch.FloatTensor(np.expand_dims(self.one_hot(action), 0)))) 126 | next_obs, reward, done, info = self.env.step(action) 127 | self.policy_net.rewards.append(reward) 128 | self.next_observation_buffer.append(next_obs) 129 | self.count += 1 130 | total_reward += reward 131 | if self.render: 132 | self.env.render() 133 | obs = next_obs 134 | if done: 135 | if self.weight_reward: 136 | self.weight_reward = 0.99 * self.weight_reward + 0.01 * total_reward 137 | else: 138 | self.weight_reward = total_reward 139 | self.train() 140 | del self.policy_net.rewards[:] 141 | del self.policy_net.log_probs[:] 142 | del self.values_buffer[:] 143 | del self.next_observation_buffer[:] 144 | print('episode: {} reward: {:.1f} weight_reward: {:.2f}'.format(i+1, total_reward, self.weight_reward)) 145 | break 146 | 147 | 148 | if __name__ == '__main__': 149 | env = gym.make('CartPole-v0') 150 | env = env.unwrapped 151 | test = actor_critic(env, learning_rate=1e-3, episode=100000, render=False) 152 | test.run() -------------------------------------------------------------------------------- /ACER/acer_cartpole.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import gym 5 | import numpy as np 6 | import random 7 | from collections import deque 8 | from torch.utils.tensorboard import SummaryWriter 9 | 10 | 11 | class replay_buffer(object): 12 | # * a different implement of replay buffer 13 | def __init__(self, capacity): 14 | self.capacity = capacity 15 | self.memory = deque(maxlen=self.capacity) 16 | self.memory.append([]) 17 | 18 | def store(self, observation, action, reward, policy, done): 19 | observation = np.expand_dims(observation, 0) 20 | self.memory[-1].append([observation, action, reward, policy, done]) 21 | 22 | def sample(self, batch_size=None): 23 | if not batch_size: 24 | batch = self.memory[-1] 25 | else: 26 | batch_list = random.sample(list(self.memory)[: -1], batch_size) 27 | batch = [] 28 | for i in batch_list: 29 | batch.extend(i) 30 | 31 | observation, action, reward, policy, done = zip(* batch) 32 | return np.concatenate(observation, 0), action, reward, np.concatenate(policy, 0), done 33 | 34 | def create(self): 35 | self.memory.append([]) 36 | 37 | def __len__(self): 38 | return len(self.memory) 39 | 40 | 41 | class policy_value_net(nn.Module): 42 | # * a network for the discrete case 43 | 
def __init__(self, observation_dim, action_dim): 44 | super(policy_value_net, self).__init__() 45 | self.observation_dim = observation_dim 46 | self.action_dim = action_dim 47 | 48 | self.policy_fc1 = nn.Linear(self.observation_dim, 128) 49 | self.policy_fc2 = nn.Linear(128, 128) 50 | self.policy_fc3 = nn.Linear(128, self.action_dim) 51 | 52 | self.value_fc1 = nn.Linear(self.observation_dim, 128) 53 | self.value_fc2 = nn.Linear(128, 128) 54 | self.value_fc3 = nn.Linear(128, self.action_dim) 55 | 56 | def forward(self, observation): 57 | policy_x = F.tanh(self.policy_fc1(observation)) 58 | policy_x = F.tanh(self.policy_fc2(policy_x)) 59 | policy_x = self.policy_fc3(policy_x) 60 | policy = F.softmax(policy_x, 1).clamp(max=1-1e-20) 61 | 62 | q_value_x = F.tanh(self.value_fc1(observation)) 63 | q_value_x = F.tanh(self.value_fc2(q_value_x)) 64 | q_value = self.value_fc3(q_value_x) 65 | 66 | value = (policy * q_value).sum(1, keepdim=True) 67 | return policy, q_value, value 68 | 69 | 70 | class acer(object): 71 | # * without trust region policy optimization 72 | def __init__(self, env, episode, capacity, learning_rate, exploration, c, gamma, batch_size, entropy_weight, replay_ratio, render, log): 73 | self.env = env 74 | self.episode = episode 75 | self.capacity = capacity 76 | self.learning_rate = learning_rate 77 | self.exploration = exploration 78 | self.c = c 79 | self.gamma = gamma 80 | self.batch_size = batch_size 81 | self.entropy_weight = entropy_weight 82 | self.replay_ratio = replay_ratio 83 | self.render = render 84 | self.log = log 85 | 86 | self.observation_dim = self.env.observation_space.shape[0] 87 | self.action_dim = self.env.action_space.n 88 | self.net = policy_value_net(self.observation_dim, self.action_dim) 89 | self.buffer = replay_buffer(self.capacity) 90 | self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate) 91 | self.weight_reward = None 92 | self.writer = SummaryWriter('runs/acer_cartpole') 93 | self.train_count = 0 94 | 95 | def compute_loss(self, policies, q_values, values, actions, rewards, retrace, dones, behavior_policies): 96 | loss = 0 97 | for i in reversed(range(policies.size(0))): 98 | rho = (policies[i] / behavior_policies[i]).detach() 99 | 100 | retrace = rewards[i] + self.gamma * retrace * (1. 
- dones[i]) 101 | advantage = retrace - values[i].squeeze() 102 | 103 | log_policy_action = policies[i].gather(0, actions[i]).log() 104 | rho_action = rho.gather(0, actions[i]) 105 | actor_loss = -torch.clamp(rho_action, max=self.c).detach() * log_policy_action * advantage.detach() 106 | rho_correction = torch.clamp(1 - self.c / rho, min=0.).detach() 107 | actor_loss -= (rho_correction * policies[i].log() * (q_values[i] - values[i]).detach()).sum() 108 | 109 | entropy = self.entropy_weight * -(policies[i] * policies[i].log()).sum() 110 | critic_loss = (retrace - q_values[i].gather(0, actions[i])).pow(2).sum() 111 | 112 | loss += (critic_loss + actor_loss - entropy) 113 | 114 | retrace = torch.clamp(rho_action, max=self.c).detach() * (retrace - q_values[i].gather(0, actions[i])) + values[i] 115 | retrace = retrace.squeeze().detach() 116 | 117 | self.optimizer.zero_grad() 118 | loss.backward() 119 | self.optimizer.step() 120 | return loss.item() 121 | 122 | def on_policy_train(self, next_observation): 123 | observations, actions, rewards, behavior_policies, dones = self.buffer.sample() 124 | 125 | observations = torch.FloatTensor(observations) 126 | actions = torch.LongTensor(actions) 127 | rewards = torch.FloatTensor(rewards) 128 | behavior_policies = torch.FloatTensor(behavior_policies) 129 | dones = torch.FloatTensor(dones) 130 | 131 | policies, q_values, values = self.net.forward(observations) 132 | 133 | _, _, retrace = self.net.forward(torch.FloatTensor(np.expand_dims(next_observation, 0))) 134 | retrace = retrace.squeeze().detach() 135 | loss = self.compute_loss(policies, q_values, values, actions, rewards, retrace, dones, behavior_policies) 136 | if self.log: 137 | self.writer.add_scalar('on_policy_loss', loss, self.train_count) 138 | 139 | def off_policy_train(self): 140 | loss_list = [] 141 | for _ in range(np.random.poisson(self.replay_ratio)): 142 | observations, actions, rewards, behavior_policies, dones = self.buffer.sample(self.batch_size) 143 | 144 | observations = torch.FloatTensor(observations) 145 | actions = torch.LongTensor(actions) 146 | rewards = torch.FloatTensor(rewards) 147 | behavior_policies = torch.FloatTensor(behavior_policies) 148 | dones = torch.FloatTensor(dones) 149 | 150 | policies, q_values, values = self.net.forward(observations) 151 | 152 | _, _, retrace = self.net.forward(observations[-1].unsqueeze(0)) 153 | retrace = retrace.squeeze().detach() 154 | loss = self.compute_loss(policies, q_values, values, actions, rewards, retrace, dones, behavior_policies) 155 | loss_list.append(loss) 156 | if self.log: 157 | self.writer.add_scalar('off_policy_loss', np.mean(loss_list), self.train_count) 158 | 159 | def run(self): 160 | for i in range(self.episode): 161 | obs = self.env.reset() 162 | total_reward = 0 163 | if self.render: 164 | self.env.render() 165 | while True: 166 | policy, _, _ = self.net.forward(torch.FloatTensor(np.expand_dims(obs, 0))) 167 | action = policy.multinomial(1).item() 168 | next_obs, reward, done, info = self.env.step(action) 169 | total_reward += reward 170 | if self.render: 171 | self.env.render() 172 | policy = policy.detach().numpy() 173 | self.buffer.store(obs, action, reward / 10., policy, done) 174 | obs = next_obs 175 | 176 | if done: 177 | if len(self.buffer) > self.exploration: 178 | self.on_policy_train(next_obs) 179 | self.off_policy_train() 180 | self.train_count += 1 181 | if not self.weight_reward: 182 | self.weight_reward = total_reward 183 | else: 184 | self.weight_reward = 0.99 * self.weight_reward + 0.01 * total_reward 
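# --- illustrative sketch (not part of the repository) ------------------------
# The backward recursion that compute_loss() above runs once per time step,
# written out for a single short trajectory with scalar dummy numbers.
# rho is the truncated importance weight pi(a|s)/mu(a|s), c the truncation
# level; Q_sa and V stand in for the network's q_value and value outputs.
gamma, c = 0.99, 1.0
rewards = [1.0, 1.0, 1.0]
dones = [0.0, 0.0, 1.0]
Q_sa = [2.5, 1.8, 1.0]          # Q(s_t, a_t) estimates (dummy)
V = [2.4, 1.7, 0.9]             # V(s_t) = sum_a pi(a|s_t) Q(s_t, a) (dummy)
rho = [1.1, 0.9, 1.0]           # importance ratios of the taken actions (dummy)

retrace = 0.0                   # Q_ret bootstrap; 0 here because the episode ended
critic_loss = 0.0
for t in reversed(range(len(rewards))):
    retrace = rewards[t] + gamma * retrace * (1.0 - dones[t])
    advantage = retrace - V[t]                   # scales the actor term
    critic_loss += (retrace - Q_sa[t]) ** 2      # critic regression target
    retrace = min(rho[t], c) * (retrace - Q_sa[t]) + V[t]  # Retrace correction
print(critic_loss)
# --- end of sketch ------------------------------------------------------------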
185 | self.buffer.create() 186 | if self.log: 187 | self.writer.add_scalar('reward', total_reward, i + 1) 188 | self.writer.add_scalar('weight_reward', self.weight_reward, i + 1) 189 | print('episode: {} reward: {} weight_reward: {:.2f}'.format(i + 1, total_reward, self.weight_reward)) 190 | break 191 | 192 | 193 | if __name__ == '__main__': 194 | env = gym.make('CartPole-v0') 195 | env = env.unwrapped 196 | test = acer(env=env, 197 | episode=10000, 198 | capacity=10000, 199 | learning_rate=1e-3, 200 | exploration=1000, 201 | c=1., 202 | gamma=0.99, 203 | batch_size=16, 204 | entropy_weight=1e-4, 205 | replay_ratio=2, 206 | render=False, 207 | log=False) 208 | test.run() -------------------------------------------------------------------------------- /DDPG/ddpg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.distributions import Normal 5 | import random 6 | import numpy as np 7 | from collections import deque 8 | import gym 9 | import math 10 | from torch.utils.tensorboard import SummaryWriter 11 | 12 | class replay_buffer(object): 13 | def __init__(self, capacity): 14 | self.capacity = capacity 15 | self.memory = deque(maxlen=self.capacity) 16 | 17 | def store(self, observation, action, reward, next_observation, done): 18 | observation = np.expand_dims(observation, 0) 19 | next_observation = np.expand_dims(next_observation, 0) 20 | self.memory.append([observation, action, reward, next_observation, done]) 21 | 22 | def sample(self, batch_size): 23 | batch = random.sample(self.memory, batch_size) 24 | observation, action, reward, next_observation, done = zip(* batch) 25 | return np.concatenate(observation, 0), action, reward, np.concatenate(next_observation, 0), done 26 | 27 | def __len__(self): 28 | return len(self.memory) 29 | 30 | 31 | class policy_net(nn.Module): 32 | # * deterministic actor network, output a deterministic value as the selected action 33 | def __init__(self, input_dim, output_dim): 34 | super(policy_net, self).__init__() 35 | self.input_dim = input_dim 36 | self.output_dim = output_dim 37 | 38 | self.fc1 = nn.Linear(self.input_dim, 128) 39 | self.fc2 = nn.Linear(128, 128) 40 | self.fc3 = nn.Linear(128, self.output_dim) 41 | 42 | def forward(self, input): 43 | x = F.relu(self.fc1(input)) 44 | x = F.relu(self.fc2(x)) 45 | x = self.fc3(x) 46 | return x 47 | 48 | def act(self, input): 49 | action = self.forward(input).detach().item() 50 | return action 51 | 52 | 53 | 54 | class value_net(nn.Module): 55 | def __init__(self, input1_dim, input2_dim, output_dim): 56 | super(value_net, self).__init__() 57 | self.input1_dim = input1_dim 58 | self.input2_dim = input2_dim 59 | self.output_dim = output_dim 60 | 61 | self.fc1 = nn.Linear(self.input1_dim + self.input2_dim, 128) 62 | self.fc2 = nn.Linear(128, 128) 63 | self.fc3 = nn.Linear(128, self.output_dim) 64 | 65 | def forward(self, input1, input2): 66 | # * concatentate the observation and action as the input 67 | x = torch.cat([input1, input2], 1) 68 | x = F.relu(self.fc1(x)) 69 | x = F.relu(self.fc2(x)) 70 | x = self.fc3(x) 71 | return x 72 | 73 | 74 | class ddpg(object): 75 | def __init__(self, env, episode, learning_rate, gamma, capacity, batch_size, value_iter, policy_iter, epsilon_init, decay, epsilon_min, rho, max_a, min_a, render, log): 76 | self.env = env 77 | self.episode = episode 78 | self.learning_rate = learning_rate 79 | self.gamma = gamma 80 | self.capacity = capacity 81 | 
self.batch_size = batch_size 82 | self.value_iter = value_iter 83 | self.policy_iter = policy_iter 84 | self.epsilon_init = epsilon_init 85 | self.decay = decay 86 | self.epsilon_min = epsilon_min 87 | self.rho = rho 88 | self.max_a = max_a 89 | self.min_a = min_a 90 | self.render = render 91 | self.log = log 92 | 93 | self.observation_dim = self.env.observation_space.shape[0] 94 | self.action_dim = self.env.action_space.shape[0] 95 | self.policy_net = policy_net(self.observation_dim, self.action_dim) 96 | self.target_policy_net = policy_net(self.observation_dim, self.action_dim) 97 | self.value_net = value_net(self.observation_dim, self.action_dim, 1) 98 | self.target_value_net = value_net(self.observation_dim, self.action_dim, 1) 99 | self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate) 100 | self.value_optimizer = torch.optim.Adam(self.value_net.parameters(), lr=self.learning_rate) 101 | self.target_policy_net.load_state_dict(self.policy_net.state_dict()) 102 | self.target_value_net.load_state_dict(self.value_net.state_dict()) 103 | self.buffer = replay_buffer(self.capacity) 104 | self.writer = SummaryWriter('runs/ddpg') 105 | self.count = 0 106 | self.train_count = 0 107 | self.weight_reward = 0 108 | self.epsilon = lambda x: self.epsilon_min + (self.epsilon_init - self.epsilon_min) * math.exp(- x / self.decay) 109 | 110 | def soft_update(self): 111 | for param, target_param in zip(self.value_net.parameters(), self.target_value_net.parameters()): 112 | target_param.detach().copy_(self.rho * target_param.detach() + (1. - self.rho) * param.detach()) 113 | for param, target_param in zip(self.policy_net.parameters(), self.target_policy_net.parameters()): 114 | target_param.detach().copy_(self.rho * target_param.detach() + (1. 
- self.rho) * param.detach()) 115 | 116 | def train(self): 117 | observation, action, reward, next_observation, done = self.buffer.sample(self.batch_size) 118 | 119 | observation = torch.FloatTensor(observation) 120 | action = torch.FloatTensor(action).unsqueeze(1) 121 | reward = torch.FloatTensor(reward).unsqueeze(1) 122 | next_observation = torch.FloatTensor(next_observation) 123 | done = torch.FloatTensor(done).unsqueeze(1) 124 | 125 | value_loss_buffer = [] 126 | for _ in range(self.value_iter): 127 | target_next_action = self.target_policy_net.forward(next_observation) 128 | target_next_value = self.target_value_net.forward(next_observation, target_next_action) 129 | q_target = reward + self.gamma * (1 - done) * target_next_value 130 | q_target = q_target.detach() 131 | q = self.value_net.forward(observation, action) 132 | value_loss = (q - q_target).pow(2).mean() 133 | value_loss_buffer.append(value_loss.detach().item()) 134 | 135 | self.value_optimizer.zero_grad() 136 | value_loss.backward() 137 | torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.0) 138 | self.value_optimizer.step() 139 | 140 | policy_loss_buffer = [] 141 | for _ in range(self.policy_iter): 142 | current_action = self.policy_net.forward(observation) 143 | policy_loss = (- self.value_net.forward(observation, current_action)).mean() 144 | policy_loss_buffer.append(policy_loss.detach().item()) 145 | 146 | self.policy_optimizer.zero_grad() 147 | policy_loss.backward() 148 | torch.nn.utils.clip_grad_norm_(self.value_net.parameters(), 0.5) 149 | self.policy_optimizer.step() 150 | 151 | if self.log: 152 | self.writer.add_scalar('value_loss', np.mean(value_loss_buffer), self.train_count) 153 | self.writer.add_scalar('policy_loss', np.mean(policy_loss_buffer), self.train_count) 154 | 155 | self.soft_update() 156 | 157 | def run(self): 158 | for i in range(self.episode): 159 | obs = self.env.reset() 160 | total_reward = 0 161 | if self.render: 162 | self.env.render() 163 | while True: 164 | action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0))) 165 | action = action + np.random.randn() * self.epsilon(self.count) 166 | action = np.clip(action, self.min_a, self.max_a) 167 | next_obs, reward, done, _ = self.env.step([action]) 168 | if self.render: 169 | self.env.render() 170 | self.buffer.store(obs, action, reward, next_obs, done) 171 | self.count += 1 172 | total_reward += reward 173 | obs = next_obs 174 | 175 | if done: 176 | self.train_count += 1 177 | self.train() 178 | if not self.weight_reward: 179 | self.weight_reward = total_reward 180 | else: 181 | self.weight_reward = self.weight_reward * 0.99 + total_reward * 0.01 182 | if self.log: 183 | self.writer.add_scalar('reward', total_reward, i + 1) 184 | self.writer.add_scalar('weight_reward', self.weight_reward, i + 1) 185 | print('episode: {} reward: {:.2f} weight_reward: {:.2f}'.format(i + 1, total_reward, self.weight_reward)) 186 | break 187 | 188 | 189 | if __name__ == '__main__': 190 | env = gym.make('Pendulum-v0') 191 | test = ddpg(env=env, 192 | episode=10000, 193 | learning_rate=1e-3, 194 | gamma=0.99, 195 | capacity=10000, 196 | batch_size=64, 197 | value_iter=10, 198 | policy_iter=10, 199 | epsilon_init=1., 200 | decay=10000, 201 | epsilon_min=0.01, 202 | rho=0.995, 203 | max_a=2., 204 | min_a=-2., 205 | render=False, 206 | log=False) 207 | test.run() -------------------------------------------------------------------------------- /DSAC/distributional_sac_discrete.py: 
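# --- illustrative sketch (not part of the repository) ------------------------
# What ddpg.soft_update() in DDPG/ddpg.py above amounts to, shown on two toy
# networks: the target network tracks the online network with a Polyak average
#     theta_target <- rho * theta_target + (1 - rho) * theta
# so the bootstrapped TD target changes slowly and training stays stable.
# The nn.Linear layers and the rho value are placeholders for illustration.
import torch
import torch.nn as nn

rho = 0.995
net = nn.Linear(3, 1)
target_net = nn.Linear(3, 1)
target_net.load_state_dict(net.state_dict())

with torch.no_grad():                              # plain in-place copy, no grad
    for p, tp in zip(net.parameters(), target_net.parameters()):
        tp.copy_(rho * tp + (1.0 - rho) * p)
# --- end of sketch ------------------------------------------------------------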
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import gym 5 | from collections import deque 6 | import random 7 | from torch.utils.tensorboard import SummaryWriter 8 | import numpy as np 9 | 10 | 11 | class replay_buffer(object): 12 | def __init__(self, capacity): 13 | self.capacity = capacity 14 | self.memory = deque(maxlen=self.capacity) 15 | 16 | def store(self, observation, action, reward, next_observation, done): 17 | observation = np.expand_dims(observation, 0) 18 | next_observation = np.expand_dims(next_observation, 0) 19 | self.memory.append([observation, action, reward, next_observation, done]) 20 | 21 | def sample(self, batch_size): 22 | batch = random.sample(self.memory, batch_size) 23 | observation, action, reward, next_observation, done = zip(* batch) 24 | return np.concatenate(observation, 0), action, reward, np.concatenate(next_observation, 0), done 25 | 26 | def __len__(self): 27 | return len(self.memory) 28 | 29 | 30 | class value_net(nn.Module): 31 | def __init__(self, observation_dim, action_dim, quant_num, cosine_num): 32 | super(value_net, self).__init__() 33 | self.observation_dim = observation_dim 34 | self.action_dim = action_dim 35 | self.quant_num = quant_num 36 | self.cosine_num = cosine_num 37 | 38 | self.feature_layer = nn.Sequential( 39 | nn.Linear(self.observation_dim, 128), 40 | nn.ReLU(), 41 | nn.Linear(128, 128) 42 | ) 43 | 44 | self.cosine_layer = nn.Sequential( 45 | nn.Linear(self.cosine_num, 128), 46 | nn.ReLU() 47 | ) 48 | 49 | self.psi_layer = nn.Sequential( 50 | nn.Linear(128, 128), 51 | nn.ReLU(), 52 | nn.Linear(128, self.action_dim) 53 | ) 54 | 55 | self.quantile_fraction_layer = nn.Sequential( 56 | nn.Linear(128, self.quant_num), 57 | nn.Softmax(dim=-1) 58 | ) 59 | 60 | def calc_state_embedding(self, observation): 61 | return self.feature_layer(observation) 62 | 63 | def calc_quantile_fraction(self, state_embedding): 64 | assert not state_embedding.requires_grad 65 | q = self.quantile_fraction_layer(state_embedding.detach()) 66 | tau_0 = torch.zeros(q.size(0), 1) 67 | tau = torch.cat([tau_0, q], dim=-1) 68 | tau = torch.cumsum(tau, dim=-1) 69 | entropy = torch.distributions.Categorical(probs=q).entropy() 70 | tau_hat = ((tau[:, :-1] + tau[:, 1:]) / 2.).detach() 71 | return tau, tau_hat, entropy 72 | 73 | def calc_quantile_value(self, tau, state_embedding): 74 | assert not tau.requires_grad 75 | quants = torch.arange(0, self.cosine_num, 1.0).unsqueeze(0).unsqueeze(0) 76 | cos_trans = torch.cos(quants * tau.unsqueeze(-1).detach() * np.pi) 77 | # * cos_trans: [batch_size, quant_num, cosine_num] 78 | rand_feat = self.cosine_layer(cos_trans) 79 | # * rand_feat: [batch_size, quant_num, 128] 80 | x = state_embedding.unsqueeze(1) 81 | # * x: [batch_size, 1, 128] 82 | x = x * rand_feat 83 | # * x: [batch_size, quant_num, 128] 84 | value = self.psi_layer(x).transpose(1, 2) 85 | # * value: [batch_size, action_dim, quant_num] 86 | return value 87 | 88 | def act(self, observation, epsilon): 89 | if random.random() > epsilon: 90 | state_embedding = self.calc_state_embedding(observation) 91 | tau, tau_hat, _ = self.calc_quantile_fraction(state_embedding.detach()) 92 | q_value = self.calc_q_value(state_embedding, tau, tau_hat) 93 | action = q_value.max(1)[1].detach().item() 94 | else: 95 | action = random.choice(list(range(self.action_dim))) 96 | return action 97 | 98 | def calc_sa_quantile_value(self, state_embedding, action, tau): 99 | 
sa_quantile_value = self.calc_quantile_value(tau.detach(), state_embedding) 100 | sa_quantile_value = sa_quantile_value.gather(1, action.unsqueeze(-1).expand(sa_quantile_value.size(0), 1, sa_quantile_value.size(-1))).squeeze(1) 101 | return sa_quantile_value 102 | 103 | def calc_q_value(self, state_embedding, tau, tau_hat): 104 | tau_delta = tau[:, 1:] - tau[:, :-1] 105 | tau_hat_value = self.calc_quantile_value(tau_hat.detach(), state_embedding) 106 | q_value = (tau_delta.unsqueeze(1) * tau_hat_value).sum(-1).detach() 107 | return q_value 108 | 109 | 110 | class policy_net(nn.Module): 111 | def __init__(self, input_dim, output_dim): 112 | super(policy_net, self).__init__() 113 | self.input_dim = input_dim 114 | self.output_dim = output_dim 115 | 116 | self.fc1 = nn.Linear(self.input_dim, 128) 117 | self.fc2 = nn.Linear(128, 128) 118 | self.fc3 = nn.Linear(128, self.output_dim) 119 | 120 | def forward(self, input): 121 | x = F.relu(self.fc1(input)) 122 | x = F.relu(self.fc2(x)) 123 | policy = F.softmax(self.fc3(x), dim=-1) 124 | return policy 125 | 126 | def act(self, input): 127 | policy = self.forward(input) 128 | dist = torch.distributions.Categorical(policy) 129 | action = dist.sample() 130 | return action[0].item() 131 | 132 | 133 | class sac_discrete(object): 134 | def __init__(self, env, batch_size, value_learning_rate, policy_learning_rate, quantile_learning_rate, quant_num, cosine_num, exploration, episode, gamma, alpha, auto_entropy_tuning, capacity, rho, update_iter, update_every, render, log, k=1.): 135 | self.env = env 136 | self.batch_size = batch_size 137 | self.value_learning_rate = value_learning_rate 138 | self.policy_learning_rate = policy_learning_rate 139 | self.quantile_learning_rate = quantile_learning_rate 140 | self.exploration = exploration 141 | self.episode = episode 142 | self.gamma = gamma 143 | self.auto_entropy_tuning = auto_entropy_tuning 144 | if not self.auto_entropy_tuning: 145 | self.alpha = alpha 146 | else: 147 | self.log_alpha = torch.zeros(1, requires_grad=True) 148 | self.alpha = self.log_alpha.exp() 149 | # * set the max possible entropy as the target entropy 150 | self.target_entropy = -np.log((1. 
/ self.env.action_space.n)) * 0.98 151 | self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=self.value_learning_rate, eps=1e-4) 152 | self.capacity = capacity 153 | self.rho = rho 154 | self.update_iter = update_iter 155 | self.update_every = update_every 156 | self.render = render 157 | self.log = log 158 | self.quant_num = quant_num 159 | self.cosine_num = cosine_num 160 | self.k = k 161 | 162 | self.observation_dim = self.env.observation_space.shape[0] 163 | self.action_num = self.env.action_space.n 164 | 165 | self.value_net1 = value_net(self.observation_dim, self.action_num, self.quant_num, self.cosine_num) 166 | self.value_net2 = value_net(self.observation_dim, self.action_num, self.quant_num, self.cosine_num) 167 | self.target_value_net1 = value_net(self.observation_dim, self.action_num, self.quant_num, self.cosine_num) 168 | self.target_value_net2 = value_net(self.observation_dim, self.action_num, self.quant_num, self.cosine_num) 169 | self.policy_net = policy_net(self.observation_dim, self.action_num) 170 | self.target_value_net1.load_state_dict(self.value_net1.state_dict()) 171 | self.target_value_net2.load_state_dict(self.value_net2.state_dict()) 172 | 173 | self.buffer = replay_buffer(capacity=self.capacity) 174 | 175 | self.value_net1_params = list(self.value_net1.feature_layer.parameters()) + list(self.value_net1.cosine_layer.parameters()) + list(self.value_net1.psi_layer.parameters()) 176 | self.value_net2_params = list(self.value_net2.feature_layer.parameters()) + list(self.value_net2.cosine_layer.parameters()) + list(self.value_net2.psi_layer.parameters()) 177 | self.value_optimizer1 = torch.optim.Adam(self.value_net1_params, lr=self.value_learning_rate) 178 | self.value_optimizer2 = torch.optim.Adam(self.value_net2_params, lr=self.value_learning_rate) 179 | self.quantile_optimizer1 = torch.optim.RMSprop(self.value_net1.quantile_fraction_layer.parameters(), lr=self.quantile_learning_rate) 180 | self.quantile_optimizer2 = torch.optim.RMSprop(self.value_net2.quantile_fraction_layer.parameters(), lr=self.quantile_learning_rate) 181 | self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.policy_learning_rate) 182 | 183 | self.weight_reward = None 184 | self.count = 0 185 | self.train_count = 0 186 | self.writer = SummaryWriter('run/dsac_discrete') 187 | 188 | def soft_update(self): 189 | for param, target_param in zip(self.value_net1.parameters(), self.target_value_net1.parameters()): 190 | target_param.detach().copy_(param.detach() * (1 - self.rho) + target_param.detach() * self.rho) 191 | for param, target_param in zip(self.value_net2.parameters(), self.target_value_net2.parameters()): 192 | target_param.detach().copy_(param.detach() * (1 - self.rho) + target_param.detach() * self.rho) 193 | 194 | def calc_quantile_value_loss(self, tau, value, target_value): 195 | # * calculate quantile value loss 196 | # * get the quantile huber loss 197 | assert not tau.requires_grad 198 | u = target_value.unsqueeze(-2) - value.unsqueeze(-1) 199 | huber_loss = 0.5 * u.abs().clamp(min=0., max=self.k).pow(2) 200 | huber_loss = huber_loss + self.k * (u.abs() - u.abs().clamp(min=0., max=self.k) - 0.5 * self.k) 201 | quantile_loss = (tau.unsqueeze(-1) - (u < 0).float()).abs() * huber_loss 202 | loss = quantile_loss.mean() 203 | return loss 204 | 205 | def calc_quantile_fraction_loss(self, net, embedding, actions, tau, tau_hat): 206 | # * calculate quantile fraction loss 207 | assert not tau_hat.requires_grad 208 | sa_quantile_hat = 
net.calc_sa_quantile_value(embedding, actions, tau_hat).detach() 209 | sa_quantile = net.calc_sa_quantile_value(embedding, actions, tau[:, 1:-1]).detach() 210 | gradient_tau = 2 * sa_quantile - sa_quantile_hat[:, :-1] - sa_quantile_hat[:, 1:] 211 | return (gradient_tau.detach() * tau[:, 1: -1]).sum(1).mean() 212 | 213 | def train(self): 214 | observation, action, reward, next_observation, done = self.buffer.sample(self.batch_size) 215 | 216 | observation = torch.FloatTensor(observation) 217 | action = torch.LongTensor(action).unsqueeze(1) 218 | reward = torch.FloatTensor(reward).unsqueeze(1) 219 | next_observation = torch.FloatTensor(next_observation) 220 | done = torch.FloatTensor(done).unsqueeze(1) 221 | 222 | value_loss1_buffer = [] 223 | value_loss2_buffer = [] 224 | policy_loss_buffer = [] 225 | for _ in range(self.update_iter): 226 | policy = self.policy_net.forward(next_observation) 227 | 228 | state_embedding1 = self.value_net1.calc_state_embedding(observation) 229 | tau1, tau_hat1, entropy1 = self.value_net1.calc_quantile_fraction(state_embedding1.detach()) 230 | dist1 = self.value_net1.calc_quantile_value(tau_hat1.detach(), state_embedding1) 231 | dist1 = dist1.gather(1, action.unsqueeze(-1).expand(self.batch_size, 1, dist1.size(2))).squeeze() 232 | # * use tau_hat to calculate the quantile value 233 | target_next_state_embedding1 = self.target_value_net1.calc_state_embedding(next_observation) 234 | # * double q 235 | eval_next_state_embedding1 = self.value_net1.calc_state_embedding(next_observation) 236 | next_tau1, next_tau_hat1, _ = self.value_net1.calc_quantile_fraction(eval_next_state_embedding1.detach()) 237 | target_action1 = self.value_net1.calc_q_value(eval_next_state_embedding1, next_tau1, next_tau_hat1).max(1)[1].detach() 238 | target_dist1 = self.target_value_net1.calc_quantile_value(tau_hat1.detach(), target_next_state_embedding1) 239 | target_dist1 = target_dist1.gather(1, target_action1.unsqueeze(-1).unsqueeze(-1).expand(self.batch_size, 1, target_dist1.size(2))).squeeze() 240 | target_dist1 = reward + self.gamma * target_dist1 * (1. - done) 241 | target_q_value1 = self.target_value_net1.calc_q_value(target_next_state_embedding1, tau1, tau_hat1) 242 | target_q_value1 = reward + self.gamma * target_q_value1 * (1. 
- done) 243 | #value = target_dist1.gather(1, action.unsqueeze(-1).expand(self.batch_size, 1, target_dist1.size(2))).squeeze() 244 | 245 | state_embedding2 = self.value_net2.calc_state_embedding(observation) 246 | tau2, tau_hat2, entropy2 = self.value_net2.calc_quantile_fraction(state_embedding2.detach()) 247 | dist2 = self.value_net2.calc_quantile_value(tau_hat2.detach(), state_embedding2) 248 | dist2 = dist2.gather(1, action.unsqueeze(-1).expand(self.batch_size, 1, dist2.size(2))).squeeze() 249 | # * use tau_hat to calculate the quantile value 250 | target_next_state_embedding2 = self.target_value_net2.calc_state_embedding(next_observation) 251 | eval_next_state_embedding2 = self.value_net2.calc_state_embedding(next_observation) 252 | next_tau2, next_tau_hat2, _ = self.value_net2.calc_quantile_fraction(eval_next_state_embedding2.detach()) 253 | target_action2 = self.value_net2.calc_q_value(eval_next_state_embedding2, next_tau2, next_tau_hat2).max(1)[1].detach() 254 | target_dist2 = self.target_value_net2.calc_quantile_value(tau_hat2.detach(), target_next_state_embedding2) 255 | target_dist2 = target_dist2.gather(1, target_action2.unsqueeze(-1).unsqueeze(-1).expand(self.batch_size, 1, target_dist2.size(2))).squeeze() 256 | target_dist2 = reward + self.gamma * target_dist2 * (1. - done) 257 | target_q_value2 = self.target_value_net2.calc_q_value(target_next_state_embedding2, tau2, tau_hat2) 258 | target_q_value2 = reward + self.gamma * target_q_value2 * (1. - done) 259 | # * calculate the expectation directly 260 | 261 | value_loss1 = self.calc_quantile_value_loss(tau_hat1.detach(), dist1, target_dist1) 262 | value_loss2 = self.calc_quantile_value_loss(tau_hat2.detach(), dist2, target_dist2) 263 | value_loss1_buffer.append(value_loss1.detach().item()) 264 | value_loss2_buffer.append(value_loss2.detach().item()) 265 | 266 | quantile_loss1 = self.calc_quantile_fraction_loss(self.value_net1, state_embedding1, action, tau1, tau_hat1) 267 | quantile_loss2 = self.calc_quantile_fraction_loss(self.value_net2, state_embedding2, action, tau2, tau_hat2) 268 | 269 | self.quantile_optimizer1.zero_grad() 270 | quantile_loss1.backward(retain_graph=True) 271 | self.quantile_optimizer1.step() 272 | 273 | self.quantile_optimizer2.zero_grad() 274 | quantile_loss2.backward(retain_graph=True) 275 | self.quantile_optimizer2.step() 276 | 277 | self.value_optimizer1.zero_grad() 278 | value_loss1.backward() 279 | nn.utils.clip_grad_norm_(self.value_net1_params, 10) 280 | self.value_optimizer1.step() 281 | 282 | self.value_optimizer2.zero_grad() 283 | value_loss2.backward() 284 | nn.utils.clip_grad_norm_(self.value_net2_params, 10) 285 | self.value_optimizer2.step() 286 | 287 | # * calculate the expectation directly 288 | policy_loss = policy * (self.alpha * policy.log() - torch.min(target_q_value1, target_q_value2).detach()) 289 | policy_loss = policy_loss.mean() 290 | 291 | self.policy_optimizer.zero_grad() 292 | policy_loss.backward() 293 | nn.utils.clip_grad_norm_(self.policy_net.parameters(), 10) 294 | self.policy_optimizer.step() 295 | 296 | if self.auto_entropy_tuning: 297 | self.alpha_optimizer.zero_grad() 298 | entropy_loss = -(self.log_alpha * (policy.log() + self.target_entropy).detach()).mean() 299 | entropy_loss.backward() 300 | nn.utils.clip_grad_norm_([self.log_alpha], 0.2) 301 | self.alpha_optimizer.step() 302 | 303 | self.alpha = self.log_alpha.exp() 304 | 305 | self.soft_update() 306 | if self.log: 307 | self.writer.add_scalar('value_loss1', np.mean(value_loss1_buffer), self.train_count) 308 | 
self.writer.add_scalar('value_loss2', np.mean(value_loss2_buffer), self.train_count) 309 | self.writer.add_scalar('policy_loss', np.mean(policy_loss_buffer), self.train_count) 310 | 311 | def run(self): 312 | for i in range(self.episode): 313 | obs = self.env.reset() 314 | total_reward = 0 315 | if self.render: 316 | self.env.render() 317 | while True: 318 | if i >= self.exploration: 319 | action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0))) 320 | else: 321 | action = random.choice(list(range(self.action_num))) 322 | next_obs, reward, done, _ = self.env.step(action) 323 | if self.render: 324 | self.env.render() 325 | self.buffer.store(obs, action, reward, next_obs, done) 326 | self.count += 1 327 | total_reward += reward 328 | obs = next_obs 329 | 330 | if (self.count % self.update_every) == 0 and i >= self.exploration: 331 | self.train_count += 1 332 | self.train() 333 | if done: 334 | if not self.weight_reward: 335 | self.weight_reward = total_reward 336 | else: 337 | self.weight_reward = self.weight_reward * 0.99 + total_reward * 0.01 338 | if self.log: 339 | self.writer.add_scalar('reward', total_reward, i + 1) 340 | self.writer.add_scalar('weight_reward', self.weight_reward, i + 1) 341 | print('episode: {} reward: {:.2f} weight_reward: {:.2f}'.format(i + 1, total_reward, self.weight_reward)) 342 | break 343 | 344 | if __name__ == '__main__': 345 | env = gym.make('CartPole-v1').unwrapped 346 | test = sac_discrete( 347 | env=env, 348 | batch_size=64, 349 | value_learning_rate=3e-4, 350 | policy_learning_rate=3e-4, 351 | quantile_learning_rate=2.5e-9, 352 | quant_num=32, 353 | cosine_num=32, 354 | exploration=3000, 355 | episode=10000, 356 | gamma=0.99, 357 | alpha=None, 358 | auto_entropy_tuning=True, 359 | capacity=100000, 360 | rho=0.995, 361 | update_iter=3, 362 | update_every=5, 363 | render=False, 364 | log=False 365 | ) 366 | test.run() 367 | -------------------------------------------------------------------------------- /ICM_PPO/icm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import random 5 | import numpy as np 6 | import gym 7 | from collections import deque 8 | from torch.distributions import Categorical 9 | from torch.utils.tensorboard import SummaryWriter 10 | 11 | 12 | class gae_trajectory_buffer(object): 13 | def __init__(self, capacity, gamma, lam): 14 | self.capacity = capacity 15 | self.gamma = gamma 16 | self.lam = lam 17 | self.memory = deque(maxlen=self.capacity) 18 | # * [obs, act, next_obs, rew, don, val, ret, adv] 19 | 20 | def store(self, obs, act, rew, don, val, next_obs): 21 | obs = np.expand_dims(obs, 0) 22 | next_obs = np.expand_dims(next_obs, 0) 23 | self.memory.append([obs, act, next_obs, rew, don, val]) 24 | 25 | def process(self): 26 | R = 0 27 | Adv = 0 28 | Value_previous = 0 29 | for traj in reversed(list(self.memory)): 30 | R = self.gamma * R * (1 - traj[4]) + traj[5] 31 | traj.append(R) 32 | # * the generalized advantage estimator(GAE) 33 | delta = traj[3] + Value_previous * self.gamma * (1 - traj[4]) - traj[5] 34 | Adv = delta + (1 - traj[4]) * Adv * self.gamma * self.lam 35 | traj.append(Adv) 36 | Value_previous = traj[5] 37 | 38 | def get(self): 39 | obs, act, next_obs, rew, don, val, ret, adv = zip(* self.memory) 40 | act = np.expand_dims(act, 1) 41 | rew = np.expand_dims(rew, 1) 42 | don = np.expand_dims(don, 1) 43 | val = np.expand_dims(val, 1) 44 | ret = np.expand_dims(ret, 1) 45 | adv = np.array(adv) 
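# --- illustrative sketch (not part of the repository) ------------------------
# The generalized advantage estimator that process() above accumulates,
# written out for one short dummy trajectory:
#     delta_t = r_t + gamma * (1 - done_t) * V(s_{t+1}) - V(s_t)
#     A_t     = delta_t + gamma * lam * (1 - done_t) * A_{t+1}
# The reward/value numbers are made up purely for illustration.
import numpy as np

gamma, lam = 0.99, 0.95
r = np.array([1.0, 1.0, 1.0])
d = np.array([0.0, 0.0, 1.0])         # done flags
v = np.array([2.9, 2.0, 1.0])         # V(s_t) from the critic (dummy)

advantages = np.zeros_like(r)
next_adv, next_val = 0.0, 0.0         # nothing to bootstrap after the last step
for t in reversed(range(len(r))):
    delta = r[t] + gamma * (1.0 - d[t]) * next_val - v[t]
    next_adv = delta + gamma * lam * (1.0 - d[t]) * next_adv
    advantages[t] = next_adv
    next_val = v[t]
print(advantages)                     # get() normalises adv to zero mean / unit
                                      # std right after this point
# --- end of sketch ------------------------------------------------------------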
46 | adv = (adv - adv.mean()) / adv.std() 47 | adv = np.expand_dims(adv, 1) 48 | return np.concatenate(obs, 0), act, np.concatenate(next_obs, 0), rew, don, val, ret, adv 49 | 50 | def __len__(self): 51 | return len(self.memory) 52 | 53 | def clear(self): 54 | self.memory.clear() 55 | 56 | 57 | class policy_net(nn.Module): 58 | def __init__(self, input_dim, output_dim): 59 | super(policy_net, self).__init__() 60 | self.input_dim = input_dim 61 | self.output_dim = output_dim 62 | self.fc1 = nn.Linear(self.input_dim, 128) 63 | self.fc2 = nn.Linear(128, 128) 64 | self.fc3 = nn.Linear(128, self.output_dim) 65 | 66 | def forward(self, input): 67 | x = F.relu(self.fc1(input)) 68 | x = F.relu(self.fc2(x)) 69 | x = self.fc3(x) 70 | return F.softmax(x, 1) 71 | 72 | def act(self, input): 73 | probs = self.forward(input) 74 | dist = Categorical(probs) 75 | action = dist.sample() 76 | action = action.detach().item() 77 | return action 78 | 79 | 80 | class value_net(nn.Module): 81 | def __init__(self, input_dim, output_dim): 82 | super(value_net, self).__init__() 83 | self.input_dim = input_dim 84 | self.output_dim = output_dim 85 | 86 | self.fc1 = nn.Linear(self.input_dim, 128) 87 | self.fc2 = nn.Linear(128, 128) 88 | self.fc3 = nn.Linear(128, self.output_dim) 89 | 90 | def forward(self, input): 91 | x = F.relu(self.fc1(input)) 92 | x = F.relu(self.fc2(x)) 93 | x = self.fc3(x) 94 | return x 95 | 96 | 97 | class icm(nn.Module): 98 | def __init__(self, observation_dim, action_dim, state_dim, reset_time): 99 | super(icm, self).__init__() 100 | self.observation_dim = observation_dim 101 | self.action_dim = action_dim 102 | self.state_dim = state_dim 103 | self.reset_time = reset_time 104 | 105 | self.feature = nn.Sequential( 106 | nn.Linear(self.observation_dim, 256), 107 | nn.ReLU(), 108 | nn.Linear(256, 256), 109 | nn.ReLU(), 110 | nn.Linear(256, self.state_dim) 111 | ) 112 | 113 | self.inverse_net = nn.Sequential( 114 | nn.Linear(2 * self.state_dim, 256), 115 | nn.ReLU(), 116 | nn.Linear(256, 256), 117 | nn.ReLU(), 118 | nn.Linear(256, self.action_dim) 119 | ) 120 | 121 | self.forward_net_1 = nn.Sequential( 122 | nn.Linear(self.state_dim + self.action_dim, 256), 123 | nn.ReLU(), 124 | nn.Linear(256, 256) 125 | ) 126 | 127 | self.reset_net = [ 128 | nn.Sequential( 129 | nn.Linear(256, 256), 130 | nn.ReLU(), 131 | nn.Linear(256, 256), 132 | nn.ReLU() 133 | ) 134 | ] * 2 * self.reset_time 135 | 136 | self.forward_net_2 = nn.Sequential( 137 | nn.Linear(256, 256), 138 | nn.ReLU(), 139 | nn.Linear(256, self.state_dim) 140 | ) 141 | 142 | def forward(self, observation, action, next_observation): 143 | state = self.feature(observation) 144 | next_state = self.feature(next_observation) 145 | cat_state = torch.cat([state, next_state], 1) 146 | pred_action = self.inverse_net(cat_state) 147 | pred_action = torch.softmax(pred_action, 1) 148 | pred_state = self.forward_net_1(torch.cat([state, action], 1)) 149 | for i in range(self.reset_time): 150 | pred_state_tmp = self.reset_net[2 * i](pred_state) 151 | pred_state = self.reset_net[2 * i + 1](pred_state_tmp) + pred_state 152 | pred_state = self.forward_net_2(pred_state) 153 | return pred_action, pred_state, next_state 154 | 155 | def intrinsic_reward(self, observation, action, next_observation): 156 | state = self.feature(observation) 157 | next_state = self.feature(next_observation) 158 | pred_state = self.forward_net_1(torch.cat([state, action], 1)) 159 | for i in range(self.reset_time): 160 | pred_state_tmp = self.reset_net[2 * i](pred_state) 161 | pred_state = 
self.reset_net[2 * i + 1](pred_state_tmp) + pred_state 162 | pred_state = self.forward_net_2(pred_state) 163 | r_i = (pred_state - next_state).pow(2).sum() 164 | return r_i.detach().item() 165 | 166 | 167 | class icm_ppo(object): 168 | def __init__(self, env, episode, learning_rate, gamma, lam, epsilon, capacity, render, log, value_update_iter, policy_update_iter, state_dim, reset_time, intrinsic_weight): 169 | super(icm_ppo, self).__init__() 170 | self.env = env 171 | self.episode = episode 172 | self.learning_rate = learning_rate 173 | self.gamma = gamma 174 | self.lam = lam 175 | self.epsilon = epsilon 176 | self.capacity = capacity 177 | self.render = render 178 | self.log = log 179 | self.value_update_iter = value_update_iter 180 | self.policy_update_iter = policy_update_iter 181 | self.state_dim = state_dim 182 | self.reset_time = reset_time 183 | self.intrinsic_weight = intrinsic_weight 184 | 185 | self.observation_dim = self.env.observation_space.shape[0] 186 | self.action_dim = self.env.action_space.n 187 | self.policy_net = policy_net(self.observation_dim, self.action_dim) 188 | self.value_net = value_net(self.observation_dim, 1) 189 | self.icm_net = icm(self.observation_dim, self.action_dim, self.state_dim, self.reset_time) 190 | self.value_optimizer = torch.optim.Adam(self.value_net.parameters(), lr=self.learning_rate) 191 | self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate) 192 | self.icm_optimizer = torch.optim.Adam(self.icm_net.parameters(), lr=self.learning_rate) 193 | self.mse_func = torch.nn.MSELoss() 194 | self.buffer = gae_trajectory_buffer(capacity=self.capacity, gamma=self.gamma, lam=self.lam) 195 | self.count = 0 196 | self.train_count = 0 197 | self.weight_reward = None 198 | self.writer = SummaryWriter('runs/icm_ppo_cartpole') 199 | 200 | def train(self): 201 | obs, act, next_obs, rew, don, val, ret, adv = self.buffer.get() 202 | 203 | obs = torch.FloatTensor(obs) 204 | act = torch.LongTensor(act) 205 | next_obs = torch.FloatTensor(next_obs) 206 | rew = torch.FloatTensor(rew) 207 | don = torch.FloatTensor(don) 208 | val = torch.FloatTensor(val) 209 | ret = torch.FloatTensor(ret) 210 | adv = torch.FloatTensor(adv).squeeze(1) 211 | act_one_hot = torch.zeros(act.size(0), self.action_dim).scatter(1, act, 1) 212 | 213 | old_probs = self.policy_net.forward(obs) 214 | old_probs = old_probs.gather(1, act).squeeze(1).detach() 215 | value_loss_buffer = [] 216 | for _ in range(self.value_update_iter): 217 | value = self.value_net.forward(obs) 218 | td_target = rew + self.gamma * self.value_net.forward(next_obs) * (1 - don) 219 | value_loss = F.smooth_l1_loss(td_target.detach(), value) 220 | value_loss_buffer.append(value_loss.item()) 221 | self.value_optimizer.zero_grad() 222 | value_loss.backward() 223 | self.value_optimizer.step() 224 | if self.log: 225 | self.writer.add_scalar('value_loss', np.mean(value_loss_buffer), self.train_count) 226 | 227 | policy_loss_buffer = [] 228 | for _ in range(self.policy_update_iter): 229 | probs = self.policy_net.forward(obs) 230 | probs = probs.gather(1, act).squeeze(1) 231 | ratio = probs / old_probs 232 | surr1 = ratio * adv 233 | surr2 = torch.clamp(ratio, 1. - self.epsilon, 1. 
+ self.epsilon) * adv 234 | policy_loss = - torch.min(surr1, surr2).mean() 235 | policy_loss_buffer.append(policy_loss.item()) 236 | self.policy_optimizer.zero_grad() 237 | policy_loss.backward() 238 | self.policy_optimizer.step() 239 | if self.log: 240 | self.writer.add_scalar('policy_loss', np.mean(policy_loss_buffer), self.train_count) 241 | 242 | pred_action, pred_state, next_state = self.icm_net.forward(obs, act_one_hot, next_obs) 243 | forward_loss = self.mse_func(pred_state, next_state.detach()) 244 | inverse_loss = self.mse_func(pred_action, act_one_hot) 245 | icm_loss = forward_loss + inverse_loss 246 | self.icm_optimizer.zero_grad() 247 | icm_loss.backward() 248 | self.icm_optimizer.step() 249 | 250 | def run(self): 251 | for i in range(self.episode): 252 | obs = self.env.reset() 253 | total_reward = 0 254 | if self.render: 255 | self.env.render() 256 | while True: 257 | action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0))) 258 | next_obs, reward, done, _ = self.env.step(action) 259 | if self.render: 260 | self.env.render() 261 | value = self.value_net.forward(torch.FloatTensor(np.expand_dims(obs, 0))).detach().item() 262 | action_one_hot = np.zeros([1, self.action_dim]) 263 | action_one_hot[0, action] = 1 264 | action_one_hot = torch.FloatTensor(action_one_hot) 265 | intrinsic_reward = self.intrinsic_weight * self.icm_net.intrinsic_reward(torch.FloatTensor(np.expand_dims(obs, 0)), action_one_hot, torch.FloatTensor(np.expand_dims(next_obs, 0))) 266 | reward = max(intrinsic_reward, 0.1) + reward 267 | self.buffer.store(obs, action, reward / 100., done, value, next_obs) 268 | self.count += 1 269 | total_reward += reward 270 | obs = next_obs 271 | #if self.count % self.capacity == 0: 272 | if done: 273 | self.buffer.process() 274 | self.train_count += 1 275 | self.train() 276 | self.buffer.clear() 277 | if done: 278 | if not self.weight_reward: 279 | self.weight_reward = total_reward 280 | else: 281 | self.weight_reward = self.weight_reward * 0.99 + total_reward * 0.01 282 | if self.log: 283 | self.writer.add_scalar('weight_reward', self.weight_reward, i+1) 284 | self.writer.add_scalar('reward', total_reward, i+1) 285 | print('episode: {} reward: {:.2f} weight_reward: {:.2f} train_step: {}'.format(i+1, total_reward, self.weight_reward, self.train_count)) 286 | break 287 | 288 | 289 | if __name__ == '__main__': 290 | env = gym.make('CartPole-v0').unwrapped 291 | test = icm_ppo( 292 | env=env, 293 | episode=10000, 294 | learning_rate=1e-3, 295 | gamma=0.99, 296 | lam=0.97, 297 | epsilon=0.2, 298 | capacity=20000, 299 | render=False, 300 | log=False, 301 | value_update_iter=3, 302 | policy_update_iter=3, 303 | state_dim=256, 304 | reset_time=2, 305 | intrinsic_weight=1e-6 306 | ) 307 | test.run() 308 | -------------------------------------------------------------------------------- /PPO_CLIP/gae_ppo_cartpole.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import random 5 | import numpy as np 6 | import gym 7 | from collections import deque 8 | from torch.distributions import Categorical 9 | from torch.utils.tensorboard import SummaryWriter 10 | 11 | 12 | class gae_trajectory_buffer(object): 13 | def __init__(self, capacity, gamma, lam): 14 | self.capacity = capacity 15 | self.gamma = gamma 16 | self.lam = lam 17 | self.memory = deque(maxlen=self.capacity) 18 | # * [obs, next_obs, act, rew, don, val, ret, adv] 19 | 20 | def store(self, obs, next_obs, 
act, rew, don, val): 21 | obs = np.expand_dims(obs, 0) 22 | next_obs = np.expand_dims(next_obs, 0) 23 | self.memory.append([obs, next_obs, act, rew, don, val]) 24 | 25 | def process(self): 26 | R = 0 27 | Adv = 0 28 | Value_previous = 0 29 | for traj in reversed(list(self.memory)): 30 | R = self.gamma * R * (1 - traj[4]) + traj[5] 31 | traj.append(R) 32 | # * the generalized advantage estimator(GAE) 33 | delta = traj[3] + Value_previous * self.gamma * (1 - traj[4]) - traj[5] 34 | Adv = delta + (1 - traj[4]) * Adv * self.gamma * self.lam 35 | traj.append(Adv) 36 | Value_previous = traj[5] 37 | 38 | def get(self): 39 | obs, next_obs, act, rew, don, val, ret, adv = zip(* self.memory) 40 | act = np.expand_dims(act, 1) 41 | rew = np.expand_dims(rew, 1) 42 | don = np.expand_dims(don, 1) 43 | val = np.expand_dims(val, 1) 44 | ret = np.expand_dims(ret, 1) 45 | adv = np.array(adv) 46 | adv = (adv - adv.mean()) / adv.std() 47 | adv = np.expand_dims(adv, 1) 48 | return np.concatenate(obs, 0), np.concatenate(next_obs, 0), act, rew, don, val, ret, adv 49 | 50 | def __len__(self): 51 | return len(self.memory) 52 | 53 | def clear(self): 54 | self.memory.clear() 55 | 56 | 57 | class policy_net(nn.Module): 58 | def __init__(self, input_dim, output_dim): 59 | super(policy_net, self).__init__() 60 | self.input_dim = input_dim 61 | self.output_dim = output_dim 62 | self.fc1 = nn.Linear(self.input_dim, 128) 63 | self.fc2 = nn.Linear(128, 128) 64 | self.fc3 = nn.Linear(128, self.output_dim) 65 | 66 | def forward(self, input): 67 | x = F.relu(self.fc1(input)) 68 | x = F.relu(self.fc2(x)) 69 | x = self.fc3(x) 70 | return F.softmax(x, 1) 71 | 72 | def act(self, input): 73 | probs = self.forward(input) 74 | dist = Categorical(probs) 75 | action = dist.sample() 76 | action = action.detach().item() 77 | return action 78 | 79 | 80 | class value_net(nn.Module): 81 | def __init__(self, input_dim, output_dim): 82 | super(value_net, self).__init__() 83 | self.input_dim = input_dim 84 | self.output_dim = output_dim 85 | 86 | self.fc1 = nn.Linear(self.input_dim, 128) 87 | self.fc2 = nn.Linear(128, 128) 88 | self.fc3 = nn.Linear(128, self.output_dim) 89 | 90 | def forward(self, input): 91 | x = F.relu(self.fc1(input)) 92 | x = F.relu(self.fc2(x)) 93 | x = self.fc3(x) 94 | return x 95 | 96 | 97 | class ppo_clip(object): 98 | def __init__(self, env, episode, learning_rate, gamma, lam, epsilon, capacity, render, log, value_update_iter, policy_update_iter): 99 | super(ppo_clip, self).__init__() 100 | self.env = env 101 | self.episode = episode 102 | self.learning_rate = learning_rate 103 | self.gamma = gamma 104 | self.lam = lam 105 | self.epsilon = epsilon 106 | self.capacity = capacity 107 | self.render = render 108 | self.log = log 109 | self.value_update_iter = value_update_iter 110 | self.policy_update_iter = policy_update_iter 111 | 112 | self.observation_dim = self.env.observation_space.shape[0] 113 | self.action_dim = self.env.action_space.n 114 | self.policy_net = policy_net(self.observation_dim, self.action_dim) 115 | self.value_net = value_net(self.observation_dim, 1) 116 | self.value_optimizer = torch.optim.Adam(self.value_net.parameters(), lr=self.learning_rate) 117 | self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate) 118 | self.buffer = gae_trajectory_buffer(capacity=self.capacity, gamma=self.gamma, lam=self.lam) 119 | self.count = 0 120 | self.train_count = 0 121 | self.weight_reward = None 122 | self.writer = SummaryWriter('runs/ppo_clip_cartpole') 123 | 124 | def 
train(self): 125 | obs, next_obs, act, rew, don, val, ret, adv = self.buffer.get() 126 | 127 | obs = torch.FloatTensor(obs) 128 | next_obs = torch.FloatTensor(next_obs) 129 | act = torch.LongTensor(act) 130 | rew = torch.FloatTensor(rew) 131 | don = torch.FloatTensor(don) 132 | val = torch.FloatTensor(val) 133 | ret = torch.FloatTensor(ret) 134 | adv = torch.FloatTensor(adv).squeeze(1) 135 | 136 | old_probs = self.policy_net.forward(obs) 137 | old_probs = old_probs.gather(1, act).squeeze(1).detach() 138 | value_loss_buffer = [] 139 | policy_loss_buffer = [] 140 | for _ in range(self.value_update_iter): 141 | value = self.value_net.forward(obs) 142 | td_target = rew + self.gamma * self.value_net.forward(next_obs) * (1 - don) 143 | #value_loss = (ret - value).pow(2).mean() 144 | value_loss = F.smooth_l1_loss(td_target.detach(), value) 145 | value_loss_buffer.append(value_loss.item()) 146 | self.value_optimizer.zero_grad() 147 | value_loss.backward() 148 | self.value_optimizer.step() 149 | if self.log: 150 | self.writer.add_scalar('value_loss', np.mean(value_loss_buffer), self.train_count) 151 | 152 | probs = self.policy_net.forward(obs) 153 | probs = probs.gather(1, act).squeeze(1) 154 | ratio = probs / old_probs 155 | surr1 = ratio * adv 156 | surr2 = torch.clamp(ratio, 1. - self.epsilon, 1. + self.epsilon) * adv 157 | policy_loss = - torch.min(surr1, surr2).mean() 158 | policy_loss_buffer.append(policy_loss.item()) 159 | self.policy_optimizer.zero_grad() 160 | policy_loss.backward() 161 | self.policy_optimizer.step() 162 | if self.log: 163 | self.writer.add_scalar('policy_loss', np.mean(policy_loss_buffer), self.train_count) 164 | 165 | def run(self): 166 | for i in range(self.episode): 167 | obs = self.env.reset() 168 | total_reward = 0 169 | if self.render: 170 | self.env.render() 171 | while True: 172 | action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0))) 173 | next_obs, reward, done, _ = self.env.step(action) 174 | if self.render: 175 | self.env.render() 176 | value = self.value_net.forward(torch.FloatTensor(np.expand_dims(obs, 0))).detach().item() 177 | self.buffer.store(obs, next_obs, action, reward / 100., done, value) 178 | self.count += 1 179 | total_reward += reward 180 | obs = next_obs 181 | if self.count % self.capacity == 0: 182 | self.buffer.process() 183 | self.train_count += 1 184 | self.train() 185 | self.buffer.clear() 186 | if done: 187 | if not self.weight_reward: 188 | self.weight_reward = total_reward 189 | else: 190 | self.weight_reward = self.weight_reward * 0.99 + total_reward * 0.01 191 | if self.log: 192 | self.writer.add_scalar('weight_reward', self.weight_reward, i+1) 193 | self.writer.add_scalar('reward', total_reward, i+1) 194 | print('episode: {} reward: {:.2f} weight_reward: {:.2f} train_step: {}'.format(i+1, total_reward, self.weight_reward, self.train_count)) 195 | break 196 | 197 | 198 | if __name__ == '__main__': 199 | env = gym.make('CartPole-v1').unwrapped 200 | test = ppo_clip(env=env, 201 | episode=10000, 202 | learning_rate=1e-3, 203 | gamma=0.99, 204 | lam=0.97, 205 | epsilon=0.2, 206 | capacity=20, 207 | render=False, 208 | log=False, 209 | value_update_iter=10, 210 | policy_update_iter=10) 211 | test.run() 212 | -------------------------------------------------------------------------------- /PPO_CLIP/ppo_cartpole.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import random 5 | import numpy as np 6 | import 
gym 7 | from collections import deque 8 | from torch.distributions import Categorical 9 | from torch.utils.tensorboard import SummaryWriter 10 | 11 | 12 | class trajectory_buffer(object): 13 | def __init__(self, capacity): 14 | self.capacity = capacity 15 | self.memory = deque(maxlen=self.capacity) 16 | # * [obs, next_obs, act, rew, don, val] 17 | 18 | def store(self, obs, next_obs, act, rew, don, val): 19 | obs = np.expand_dims(obs, 0) 20 | next_obs = np.expand_dims(next_obs, 0) 21 | self.memory.append([obs, next_obs, act, rew, don, val]) 22 | 23 | def get(self): 24 | obs, next_obs, act, rew, don, val = zip(* self.memory) 25 | act = np.expand_dims(act, 1) 26 | rew = np.expand_dims(rew, 1) 27 | don = np.expand_dims(don, 1) 28 | val = np.expand_dims(val, 1) 29 | return np.concatenate(obs, 0), np.concatenate(next_obs, 0), act, rew, don, val 30 | 31 | def __len__(self): 32 | return len(self.memory) 33 | 34 | def clear(self): 35 | self.memory.clear() 36 | 37 | 38 | class policy_net(nn.Module): 39 | def __init__(self, input_dim, output_dim): 40 | super(policy_net, self).__init__() 41 | self.input_dim = input_dim 42 | self.output_dim = output_dim 43 | self.fc1 = nn.Linear(self.input_dim, 128) 44 | self.fc2 = nn.Linear(128, 128) 45 | self.fc3 = nn.Linear(128, self.output_dim) 46 | 47 | def forward(self, input): 48 | x = F.relu(self.fc1(input)) 49 | x = F.relu(self.fc2(x)) 50 | x = self.fc3(x) 51 | return F.softmax(x, 1) 52 | 53 | def act(self, input): 54 | probs = self.forward(input) 55 | dist = Categorical(probs) 56 | action = dist.sample() 57 | action = action.detach().item() 58 | return action 59 | 60 | 61 | class value_net(nn.Module): 62 | def __init__(self, input_dim, output_dim): 63 | super(value_net, self).__init__() 64 | self.input_dim = input_dim 65 | self.output_dim = output_dim 66 | 67 | self.fc1 = nn.Linear(self.input_dim, 128) 68 | self.fc2 = nn.Linear(128, 128) 69 | self.fc3 = nn.Linear(128, self.output_dim) 70 | 71 | def forward(self, input): 72 | x = F.relu(self.fc1(input)) 73 | x = F.relu(self.fc2(x)) 74 | x = self.fc3(x) 75 | return x 76 | 77 | 78 | class ppo_clip(object): 79 | def __init__(self, env, episode, learning_rate, gamma, lam, epsilon, capacity, render, log, value_update_iter, policy_update_iter): 80 | super(ppo_clip, self).__init__() 81 | self.env = env 82 | self.episode = episode 83 | self.learning_rate = learning_rate 84 | self.gamma = gamma 85 | self.lam = lam 86 | self.epsilon = epsilon 87 | self.capacity = capacity 88 | self.render = render 89 | self.log = log 90 | self.value_update_iter = value_update_iter 91 | self.policy_update_iter = policy_update_iter 92 | 93 | self.observation_dim = self.env.observation_space.shape[0] 94 | self.action_dim = self.env.action_space.n 95 | self.policy_net = policy_net(self.observation_dim, self.action_dim) 96 | self.value_net = value_net(self.observation_dim, 1) 97 | self.value_optimizer = torch.optim.Adam(self.value_net.parameters(), lr=self.learning_rate) 98 | self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate) 99 | self.buffer = trajectory_buffer(capacity=self.capacity) 100 | self.count = 0 101 | self.train_count = 0 102 | self.weight_reward = None 103 | self.writer = SummaryWriter('runs/ppo_clip_cartpole') 104 | 105 | def train(self): 106 | obs, next_obs, act, rew, don, val = self.buffer.get() 107 | 108 | obs = torch.FloatTensor(obs) 109 | next_obs = torch.FloatTensor(next_obs) 110 | act = torch.LongTensor(act) 111 | rew = torch.FloatTensor(rew) 112 | don = torch.FloatTensor(don) 
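# NOTE: unlike gae_ppo_cartpole.py, the loop below recomputes the TD targets and the
# lambda-weighted advantages from the current value_net on every iteration, so the
# advantages seen by each policy update shift as the critic is trained inside train().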
113 | val = torch.FloatTensor(val) 114 | 115 | old_probs = self.policy_net.forward(obs) 116 | old_probs = old_probs.gather(1, act).squeeze(1).detach() 117 | value_loss_buffer = [] 118 | policy_loss_buffer = [] 119 | for _ in range(self.value_update_iter): 120 | td_target = rew + self.gamma * self.value_net.forward(next_obs) * (1 - don) 121 | delta = td_target - self.value_net.forward(obs) 122 | delta = delta.detach().numpy() 123 | 124 | advantage_lst = [] 125 | advantage = 0.0 126 | for delta_t in delta[::-1]: 127 | advantage = self.gamma * self.lam * advantage + delta_t[0] 128 | advantage_lst.append([advantage]) 129 | 130 | advantage_lst.reverse() 131 | advantage = torch.FloatTensor(advantage_lst) 132 | 133 | value = self.value_net.forward(obs) 134 | #value_loss = (ret - value).pow(2).mean() 135 | value_loss = F.smooth_l1_loss(td_target.detach(), value) 136 | value_loss_buffer.append(value_loss.item()) 137 | self.value_optimizer.zero_grad() 138 | value_loss.backward() 139 | self.value_optimizer.step() 140 | if self.log: 141 | self.writer.add_scalar('value_loss', np.mean(value_loss_buffer), self.train_count) 142 | 143 | probs = self.policy_net.forward(obs) 144 | probs = probs.gather(1, act).squeeze(1) 145 | ratio = probs / old_probs 146 | surr1 = ratio * advantage 147 | surr2 = torch.clamp(ratio, 1. - self.epsilon, 1. + self.epsilon) * advantage 148 | policy_loss = - torch.min(surr1, surr2).mean() 149 | policy_loss_buffer.append(policy_loss.item()) 150 | self.policy_optimizer.zero_grad() 151 | policy_loss.backward() 152 | self.policy_optimizer.step() 153 | if self.log: 154 | self.writer.add_scalar('policy_loss', np.mean(policy_loss_buffer), self.train_count) 155 | 156 | def run(self): 157 | for i in range(self.episode): 158 | obs = self.env.reset() 159 | total_reward = 0 160 | if self.render: 161 | self.env.render() 162 | while True: 163 | action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0))) 164 | next_obs, reward, done, _ = self.env.step(action) 165 | if self.render: 166 | self.env.render() 167 | value = self.value_net.forward(torch.FloatTensor(np.expand_dims(obs, 0))).detach().item() 168 | self.buffer.store(obs, next_obs, action, reward, done, value) 169 | self.count += 1 170 | total_reward += reward 171 | obs = next_obs 172 | if self.count % 20 == 0: 173 | self.train_count += 1 174 | self.train() 175 | self.buffer.clear() 176 | if done: 177 | if not self.weight_reward: 178 | self.weight_reward = total_reward 179 | else: 180 | self.weight_reward = self.weight_reward * 0.99 + total_reward * 0.01 181 | if self.log: 182 | self.writer.add_scalar('weight_reward', self.weight_reward, i+1) 183 | self.writer.add_scalar('reward', total_reward, i+1) 184 | print('episode: {} reward: {:.2f} weight_reward: {:.2f} train_step: {}'.format(i+1, total_reward, self.weight_reward, self.train_count)) 185 | break 186 | 187 | 188 | if __name__ == '__main__': 189 | env = gym.make('CartPole-v1').unwrapped 190 | test = ppo_clip(env=env, 191 | episode=10000, 192 | learning_rate=1e-3, 193 | gamma=0.99, 194 | lam=0.95, 195 | epsilon=0.1, 196 | capacity=2000, 197 | render=False, 198 | log=False, 199 | value_update_iter=3, 200 | policy_update_iter=3) 201 | test.run() 202 | -------------------------------------------------------------------------------- /PPO_CLIP/ppo_pendulum.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import random 5 | import numpy as np 6 | import gym 7 | from 
collections import deque 8 | from torch.distributions import Normal 9 | from torch.utils.tensorboard import SummaryWriter 10 | 11 | 12 | class gae_trajectory_buffer(object): 13 | def __init__(self, capacity, gamma, lam): 14 | self.capacity = capacity 15 | self.gamma = gamma 16 | self.lam = lam 17 | self.memory = deque(maxlen=self.capacity) 18 | # * [obs, act, rew, don, val, ret, adv] 19 | 20 | def store(self, obs, act, rew, don, val): 21 | obs = np.expand_dims(obs, 0) 22 | self.memory.append([obs, act, rew, don, val]) 23 | 24 | def process(self): 25 | R = 0 26 | Adv = 0 27 | Value_previous = 0 28 | for traj in reversed(list(self.memory)): 29 | R = self.gamma * R * (1 - traj[3]) + traj[4] 30 | traj.append(R) 31 | # * the generalized advantage estimator(GAE) 32 | delta = traj[2] + Value_previous * self.gamma * (1 - traj[3]) - traj[4] 33 | Adv = delta + (1 - traj[3]) * Adv * self.gamma * self.lam 34 | traj.append(Adv) 35 | Value_previous = traj[4] 36 | 37 | def get(self): 38 | obs, act, rew, don, val, ret, adv = zip(* self.memory) 39 | act = np.expand_dims(act, 1) 40 | rew = np.expand_dims(rew, 1) 41 | don = np.expand_dims(don, 1) 42 | val = np.expand_dims(val, 1) 43 | ret = np.expand_dims(ret, 1) 44 | adv = np.array(adv) 45 | adv = (adv - adv.mean()) / adv.std() 46 | adv = np.expand_dims(adv, 1) 47 | return np.concatenate(obs, 0), act, rew, don, val, ret, adv 48 | 49 | def __len__(self): 50 | return len(self.memory) 51 | 52 | 53 | class policy_net(nn.Module): 54 | def __init__(self, input_dim, output_dim): 55 | super(policy_net, self).__init__() 56 | self.input_dim = input_dim 57 | self.output_dim = output_dim 58 | self.fc1 = nn.Linear(self.input_dim, 128) 59 | self.fc2 = nn.Linear(128, 128) 60 | self.fc3 = nn.Linear(128, self.output_dim) 61 | 62 | def forward(self, input): 63 | x = torch.tanh(self.fc1(input)) 64 | x = torch.tanh(self.fc2(x)) 65 | mu = self.fc3(x) 66 | return mu 67 | 68 | def act(self, input): 69 | mu = self.forward(input) 70 | sigma = torch.ones_like(mu) 71 | dist = Normal(mu, sigma) 72 | action = dist.sample().detach().item() 73 | return action 74 | 75 | def get_distribution(self, input): 76 | mu = self.forward(input) 77 | sigma = torch.ones_like(mu) 78 | dist = Normal(mu, sigma) 79 | return dist 80 | 81 | 82 | class value_net(nn.Module): 83 | def __init__(self, input_dim, output_dim): 84 | super(value_net, self).__init__() 85 | self.input_dim = input_dim 86 | self.output_dim = output_dim 87 | 88 | self.fc1 = nn.Linear(self.input_dim, 128) 89 | self.fc2 = nn.Linear(128, 128) 90 | self.fc3 = nn.Linear(128, self.output_dim) 91 | 92 | def forward(self, input): 93 | x = torch.tanh(self.fc1(input)) 94 | x = torch.tanh(self.fc2(x)) 95 | x = self.fc3(x) 96 | return x 97 | 98 | 99 | class ppo_clip(object): 100 | def __init__(self, env, episode, learning_rate, gamma, lam, epsilon, capacity, render, log, value_update_iter, policy_update_iter): 101 | super(ppo_clip, self).__init__() 102 | self.env = env 103 | self.episode = episode 104 | self.learning_rate = learning_rate 105 | self.gamma = gamma 106 | self.lam = lam 107 | self.epsilon = epsilon 108 | self.capacity = capacity 109 | self.render = render 110 | self.log = log 111 | self.value_update_iter = value_update_iter 112 | self.policy_update_iter = policy_update_iter 113 | 114 | self.observation_dim = self.env.observation_space.shape[0] 115 | self.action_dim = self.env.action_space.shape[0] 116 | self.policy_net = policy_net(self.observation_dim, self.action_dim) 117 | self.value_net = value_net(self.observation_dim, 1) 118 | 
self.value_optimizer = torch.optim.Adam(self.value_net.parameters(), lr=self.learning_rate) 119 | self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate) 120 | self.buffer = gae_trajectory_buffer(capacity=self.capacity, gamma=self.gamma, lam=self.lam) 121 | self.count = 0 122 | self.train_count = 0 123 | self.weight_reward = None 124 | self.writer = SummaryWriter('runs/ppo_clip_pendulum') 125 | 126 | def train(self): 127 | obs, act, rew, don, val, ret, adv = self.buffer.get() 128 | 129 | obs = torch.FloatTensor(obs) 130 | act = torch.FloatTensor(act) 131 | rew = torch.FloatTensor(rew) 132 | don = torch.FloatTensor(don) 133 | val = torch.FloatTensor(val) 134 | ret = torch.FloatTensor(ret) 135 | adv = torch.FloatTensor(adv) 136 | 137 | old_dist = self.policy_net.get_distribution(obs) 138 | old_log_probs = old_dist.log_prob(act).detach() 139 | value_loss_buffer = [] 140 | for _ in range(self.value_update_iter): 141 | value = self.value_net.forward(obs) 142 | value_loss = (ret - value).pow(2).mean() 143 | value_loss_buffer.append(value_loss.item()) 144 | self.value_optimizer.zero_grad() 145 | value_loss.backward() 146 | self.value_optimizer.step() 147 | if self.log: 148 | self.writer.add_scalar('value_loss', np.mean(value_loss_buffer), self.train_count) 149 | 150 | policy_loss_buffer = [] 151 | for _ in range(self.policy_update_iter): 152 | dist = self.policy_net.get_distribution(obs) 153 | log_probs = dist.log_prob(act) 154 | ratio = torch.exp(log_probs - old_log_probs) 155 | surr1 = ratio * adv 156 | surr2 = torch.clamp(ratio, 1. - self.epsilon, 1. + self.epsilon) * adv 157 | policy_loss = - torch.min(surr1, surr2).mean() 158 | policy_loss_buffer.append(policy_loss.item()) 159 | self.policy_optimizer.zero_grad() 160 | policy_loss.backward() 161 | self.policy_optimizer.step() 162 | if self.log: 163 | self.writer.add_scalar('policy_loss', np.mean(policy_loss_buffer), self.train_count) 164 | 165 | def run(self): 166 | for i in range(self.episode): 167 | obs = self.env.reset() 168 | total_reward = 0 169 | if self.render: 170 | self.env.render() 171 | while True: 172 | action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0))) 173 | next_obs, reward, done, _ = self.env.step([action]) 174 | if self.render: 175 | self.env.render() 176 | value = self.value_net.forward(torch.FloatTensor(np.expand_dims(obs, 0))).detach().item() 177 | self.buffer.store(obs, action, reward, done, value) 178 | self.count += 1 179 | total_reward += reward 180 | obs = next_obs 181 | if self.count % self.capacity == 0: 182 | self.buffer.process() 183 | self.train_count += 1 184 | self.train() 185 | if done: 186 | if not self.weight_reward: 187 | self.weight_reward = total_reward 188 | else: 189 | self.weight_reward = self.weight_reward * 0.9 + total_reward * 0.1 190 | if self.log: 191 | self.writer.add_scalar('weight_reward', self.weight_reward, i+1) 192 | self.writer.add_scalar('reward', total_reward, i+1) 193 | print('episode: {} reward: {:.2f} weight_reward: {:.2f} train_step: {}'.format(i+1, total_reward, self.weight_reward, self.train_count)) 194 | break 195 | 196 | 197 | if __name__ == '__main__': 198 | env = gym.make('Pendulum-v0') 199 | test = ppo_clip(env=env, 200 | episode=10000, 201 | learning_rate=1e-3, 202 | gamma=0.99, 203 | lam=0.97, 204 | epsilon=0.2, 205 | capacity=2000, 206 | render=False, 207 | log=False, 208 | value_update_iter=10, 209 | policy_update_iter=10) 210 | test.run() 211 | 
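All three PPO variants above reduce to the same reverse-time recursion for the discounted return and the generalized advantage estimate that gae_trajectory_buffer.process() walks through. For reference, a minimal standalone sketch of that computation follows; the function name compute_gae, the last_value bootstrap argument, and the NumPy-only interface are illustrative choices, not part of this repository. Note also that process() accumulates the running return R from the stored value entry of each transition, whereas the textbook recursion below accumulates it from the reward.

import numpy as np

def compute_gae(rewards, values, dones, last_value=0.0, gamma=0.99, lam=0.97):
    # rewards, values, dones: 1-D arrays over one rollout (oldest first);
    # last_value: critic estimate for the state after the final transition.
    T = len(rewards)
    returns = np.zeros(T, dtype=np.float32)
    advantages = np.zeros(T, dtype=np.float32)
    next_value = last_value
    running_return = last_value
    running_adv = 0.0
    for t in reversed(range(T)):
        mask = 1.0 - dones[t]                      # zero out across episode ends
        running_return = rewards[t] + gamma * mask * running_return
        returns[t] = running_return
        # GAE: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), accumulated with gamma * lam
        delta = rewards[t] + gamma * mask * next_value - values[t]
        running_adv = delta + gamma * lam * mask * running_adv
        advantages[t] = running_adv
        next_value = values[t]
    # normalize, as the buffers above do before returning the batch
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return returns, advantages

With the advantages in hand, the clipped surrogate in the train() methods is unchanged: ratio = pi_new / pi_old, and the loss is -min(ratio * adv, clamp(ratio, 1 - eps, 1 + eps) * adv) averaged over the batch.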
-------------------------------------------------------------------------------- /REINFORCE/reinforce.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import random 5 | from torch.distributions import Categorical 6 | import gym 7 | import numpy as np 8 | from torch.utils.tensorboard import SummaryWriter 9 | 10 | class net(nn.Module): 11 | def __init__(self, input_dim, output_dim): 12 | super(net, self).__init__() 13 | self.input_dim = input_dim 14 | self.output_dim = output_dim 15 | 16 | self.fc1 = nn.Linear(self.input_dim, 256) 17 | self.fc2 = nn.Linear(256, self.output_dim) 18 | 19 | self.rewards = [] 20 | self.log_probs = [] 21 | 22 | def forward(self, input): 23 | x = self.fc1(input) 24 | x = F.relu(x) 25 | x = self.fc2(x) 26 | x = F.softmax(x, 1) 27 | return x 28 | 29 | def act(self, input): 30 | probs = self.forward(input) 31 | dist = Categorical(probs) 32 | action = dist.sample() 33 | self.log_probs.append(dist.log_prob(action)) 34 | return action.item() 35 | 36 | 37 | class reinforce(object): 38 | def __init__(self, env, gamma, learning_rate, episode, render): 39 | self.env = env 40 | self.observation_dim = self.env.observation_space.shape[0] 41 | self.action_dim = self.env.action_space.n 42 | self.gamma = gamma 43 | self.learning_rate = learning_rate 44 | self.episode = episode 45 | self.render = render 46 | self.net = net(self.observation_dim, self.action_dim) 47 | self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate) 48 | self.total_returns = [] 49 | self.weight_reward = None 50 | self.writer = SummaryWriter('runs/reinforce') 51 | self.count = 0 52 | 53 | def train(self, ): 54 | total_returns = torch.FloatTensor(self.total_returns) 55 | eps = np.finfo(np.float32).eps.item() 56 | total_returns = (total_returns - total_returns.mean()) / (total_returns.std() + eps) 57 | log_probs = torch.cat(self.net.log_probs, 0) 58 | loss = (- log_probs * total_returns.detach()) 59 | loss = loss.sum() 60 | self.writer.add_scalar('loss', loss, self.count) 61 | self.optimizer.zero_grad() 62 | loss.backward() 63 | torch.nn.utils.clip_grad_norm_(self.net.parameters(), 0.1) 64 | self.optimizer.step() 65 | 66 | def run(self, ): 67 | for i in range(self.episode): 68 | obs = self.env.reset() 69 | total_reward = 0 70 | if self.render: 71 | self.env.render() 72 | while True: 73 | action = self.net.act(torch.FloatTensor(np.expand_dims(obs, 0))) 74 | next_obs, reward, done, info = self.env.step(action) 75 | self.net.rewards.append(reward) 76 | total_reward += reward 77 | self.count += 1 78 | if self.render: 79 | self.env.render() 80 | obs = next_obs 81 | if done: 82 | R = 0 83 | if self.weight_reward: 84 | self.weight_reward = 0.99 * self.weight_reward + 0.01 * total_reward 85 | else: 86 | self.weight_reward = total_reward 87 | for r in reversed(self.net.rewards): 88 | R = R * self.gamma + r 89 | self.total_returns.append(R) 90 | self.total_returns = list(reversed(self.total_returns)) 91 | self.train() 92 | del self.net.rewards[:] 93 | del self.net.log_probs[:] 94 | del self.total_returns[:] 95 | print('episode: {} reward: {:.1f} weight_reward: {:.2f}'.format(i+1, total_reward, self.weight_reward)) 96 | self.writer.add_scalar('reward', total_reward, i) 97 | self.writer.add_scalar('weight_reward', self.weight_reward, i) 98 | break 99 | 100 | 101 | if __name__ == '__main__': 102 | env = gym.make('CartPole-v0') 103 | env = env.unwrapped 104 | test = reinforce(env, 
gamma=0.99, learning_rate=1e-3, episode=100000, render=False) 105 | test.run() -------------------------------------------------------------------------------- /RND_PPO/rnd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import random 5 | import numpy as np 6 | import gym 7 | from collections import deque 8 | from torch.distributions import Categorical 9 | from torch.utils.tensorboard import SummaryWriter 10 | 11 | 12 | class gae_trajectory_buffer(object): 13 | def __init__(self, capacity, gamma, lam): 14 | self.capacity = capacity 15 | self.gamma = gamma 16 | self.lam = lam 17 | self.memory = deque(maxlen=self.capacity) 18 | # * [obs, next_obs, act, rew, don, val, ret, adv] 19 | 20 | def store(self, obs, next_obs, act, rew, don, val): 21 | obs = np.expand_dims(obs, 0) 22 | next_obs = np.expand_dims(next_obs, 0) 23 | self.memory.append([obs, next_obs, act, rew, don, val]) 24 | 25 | def process(self): 26 | R = 0 27 | Adv = 0 28 | Value_previous = 0 29 | for traj in reversed(list(self.memory)): 30 | R = self.gamma * R * (1 - traj[4]) + traj[5] 31 | traj.append(R) 32 | # * the generalized advantage estimator(GAE) 33 | delta = traj[3] + Value_previous * self.gamma * (1 - traj[4]) - traj[5] 34 | Adv = delta + (1 - traj[4]) * Adv * self.gamma * self.lam 35 | traj.append(Adv) 36 | Value_previous = traj[5] 37 | 38 | def get(self): 39 | obs, next_obs, act, rew, don, val, ret, adv = zip(* self.memory) 40 | act = np.expand_dims(act, 1) 41 | rew = np.expand_dims(rew, 1) 42 | don = np.expand_dims(don, 1) 43 | val = np.expand_dims(val, 1) 44 | ret = np.expand_dims(ret, 1) 45 | adv = np.array(adv) 46 | adv = (adv - adv.mean()) / adv.std() 47 | adv = np.expand_dims(adv, 1) 48 | return np.concatenate(obs, 0), np.concatenate(next_obs, 0), act, rew, don, val, ret, adv 49 | 50 | def __len__(self): 51 | return len(self.memory) 52 | 53 | def clear(self): 54 | self.memory.clear() 55 | 56 | 57 | class policy_net(nn.Module): 58 | def __init__(self, input_dim, output_dim): 59 | super(policy_net, self).__init__() 60 | self.input_dim = input_dim 61 | self.output_dim = output_dim 62 | self.fc1 = nn.Linear(self.input_dim, 128) 63 | self.fc2 = nn.Linear(128, 128) 64 | self.fc3 = nn.Linear(128, self.output_dim) 65 | 66 | def forward(self, input): 67 | x = F.relu(self.fc1(input)) 68 | x = F.relu(self.fc2(x)) 69 | x = self.fc3(x) 70 | return F.softmax(x, 1) 71 | 72 | def act(self, input): 73 | probs = self.forward(input) 74 | dist = Categorical(probs) 75 | action = dist.sample() 76 | action = action.detach().item() 77 | return action 78 | 79 | 80 | class value_net(nn.Module): 81 | def __init__(self, input_dim, output_dim): 82 | super(value_net, self).__init__() 83 | self.input_dim = input_dim 84 | self.output_dim = output_dim 85 | 86 | self.fc1 = nn.Linear(self.input_dim, 128) 87 | self.fc2 = nn.Linear(128, 128) 88 | self.int_layer = nn.Linear(128, self.output_dim) 89 | self.ext_layer = nn.Linear(128, self.output_dim) 90 | 91 | def forward(self, input): 92 | x = F.relu(self.fc1(input)) 93 | x = F.relu(self.fc2(x)) 94 | value_int = self.int_layer(x) 95 | value_ext = self.ext_layer(x) 96 | return value_int, value_ext 97 | 98 | 99 | class rnd(nn.Module): 100 | def __init__(self, input_dim): 101 | super(rnd, self).__init__() 102 | self.input_dim = input_dim 103 | 104 | self.predictor = nn.Sequential( 105 | nn.Linear(self.input_dim, 128), 106 | nn.ReLU(), 107 | nn.Linear(128, 256), 108 | nn.ReLU(), 109 | 
nn.Linear(256, 256), 110 | nn.ReLU(), 111 | nn.Linear(256, 256) 112 | ) 113 | 114 | self.target = nn.Sequential( 115 | nn.Linear(self.input_dim, 128), 116 | nn.ReLU(), 117 | nn.Linear(128, 256), 118 | nn.ReLU(), 119 | nn.Linear(256, 256) 120 | ) 121 | 122 | for param in self.target.parameters(): 123 | param.requires_grad = False 124 | 125 | def forward(self, input): 126 | pre_feature = self.predictor(input) 127 | tar_feature = self.target(input) 128 | return pre_feature, tar_feature 129 | 130 | def calc_int_reward(self, input): 131 | pre_feature, tar_feature = self.forward(input) 132 | int_reward = 0.5 * (pre_feature - tar_feature).pow(2).sum(-1) 133 | return int_reward.detach().numpy() 134 | 135 | class ppo_clip(object): 136 | def __init__(self, env, episode, learning_rate, gamma, lam, epsilon, capacity, render, log, value_update_iter, policy_update_iter, int_coef, ext_coef, rnd_update_prop): 137 | super(ppo_clip, self).__init__() 138 | self.env = env 139 | self.episode = episode 140 | self.learning_rate = learning_rate 141 | self.gamma = gamma 142 | self.lam = lam 143 | self.epsilon = epsilon 144 | self.capacity = capacity 145 | self.render = render 146 | self.log = log 147 | self.value_update_iter = value_update_iter 148 | self.policy_update_iter = policy_update_iter 149 | self.int_coef = int_coef 150 | self.ext_coef = ext_coef 151 | self.rnd_update_prop = rnd_update_prop 152 | 153 | self.observation_dim = self.env.observation_space.shape[0] 154 | self.action_dim = self.env.action_space.n 155 | 156 | self.policy_net = policy_net(self.observation_dim, self.action_dim) 157 | self.value_net = value_net(self.observation_dim, 1) 158 | self.rnd = rnd(self.observation_dim) 159 | self.value_optimizer = torch.optim.Adam(self.value_net.parameters(), lr=self.learning_rate) 160 | self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate) 161 | self.rnd_optimizer = torch.optim.Adam(self.rnd.predictor.parameters(), lr=self.learning_rate) 162 | self.int_buffer = gae_trajectory_buffer(capacity=self.capacity, gamma=self.gamma, lam=self.lam) 163 | self.ext_buffer = gae_trajectory_buffer(capacity=self.capacity, gamma=self.gamma, lam=self.lam) 164 | 165 | self.count = 0 166 | self.train_count = 0 167 | self.weight_reward = None 168 | self.writer = SummaryWriter('runs/ppo_clip_rnd') 169 | 170 | def train(self): 171 | obs, next_obs, act, int_rew, don, _, _, int_adv = self.int_buffer.get() 172 | _, _, _, ext_rew, _, _, _, ext_adv = self.int_buffer.get() 173 | 174 | obs = torch.FloatTensor(obs) 175 | next_obs = torch.FloatTensor(next_obs) 176 | act = torch.LongTensor(act) 177 | int_rew = torch.FloatTensor(int_rew) 178 | ext_rew = torch.FloatTensor(ext_rew) 179 | don = torch.FloatTensor(don) 180 | int_adv = torch.FloatTensor(int_adv).squeeze(1) 181 | ext_adv = torch.FloatTensor(ext_adv).squeeze(1) 182 | adv = self.int_coef * int_adv + self.ext_coef * ext_adv 183 | 184 | old_probs = self.policy_net.forward(obs) 185 | old_probs = old_probs.gather(1, act).squeeze(1).detach() 186 | value_loss_buffer = [] 187 | policy_loss_buffer = [] 188 | rnd_loss_buffer = [] 189 | for _ in range(self.value_update_iter): 190 | value_int, value_ext = self.value_net.forward(obs) 191 | next_value_int, next_value_ext = self.value_net.forward(next_obs) 192 | # * intrinsic value net 193 | int_td_target = int_rew + self.gamma * next_value_int * (1 - don) 194 | int_value_loss = F.mse_loss(int_td_target.detach(), value_int) 195 | # * external value net 196 | ext_td_target = ext_rew + self.gamma * 
next_value_ext * (1 - don) 197 | ext_value_loss = F.mse_loss(ext_td_target.detach(), value_ext) 198 | value_loss = 0.5 * (int_value_loss + ext_value_loss) 199 | 200 | value_loss_buffer.append(value_loss.item()) 201 | self.value_optimizer.zero_grad() 202 | value_loss.backward() 203 | self.value_optimizer.step() 204 | 205 | probs = self.policy_net.forward(obs) 206 | probs = probs.gather(1, act).squeeze(1) 207 | ratio = probs / old_probs 208 | surr1 = ratio * adv 209 | surr2 = torch.clamp(ratio, 1. - self.epsilon, 1. + self.epsilon) * adv 210 | policy_loss = - torch.min(surr1, surr2).mean() 211 | policy_loss_buffer.append(policy_loss.item()) 212 | self.policy_optimizer.zero_grad() 213 | policy_loss.backward() 214 | self.policy_optimizer.step() 215 | 216 | pre_feature, tar_feature = self.rnd.forward(obs) 217 | rnd_loss = (pre_feature - tar_feature.detach()).pow(2).mean(-1) 218 | mask = torch.rand(len(rnd_loss)) 219 | mask = torch.FloatTensor((mask < self.rnd_update_prop).float()) 220 | rnd_loss = (rnd_loss * mask).sum() / torch.max(mask.sum(), torch.FloatTensor([1.])) 221 | rnd_loss_buffer.append(rnd_loss) 222 | self.rnd_optimizer.zero_grad() 223 | rnd_loss.backward() 224 | self.rnd_optimizer.step() 225 | if self.log: 226 | self.writer.add_scalar('rnd_loss', np.mean(policy_loss_buffer), self.train_count) 227 | self.writer.add_scalar('policy_loss', np.mean(policy_loss_buffer), self.train_count) 228 | self.writer.add_scalar('value_loss', np.mean(value_loss_buffer), self.train_count) 229 | 230 | def run(self): 231 | for i in range(self.episode): 232 | obs = self.env.reset() 233 | total_reward = 0 234 | if self.render: 235 | self.env.render() 236 | while True: 237 | action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0))) 238 | next_obs, ext_reward, done, _ = self.env.step(action) 239 | int_reward = self.rnd.calc_int_reward(torch.FloatTensor(np.expand_dims(obs, 0)))[0] 240 | if self.render: 241 | self.env.render() 242 | value_int, value_ext = self.value_net.forward(torch.FloatTensor(np.expand_dims(obs, 0))) 243 | value_int = value_int.detach().item() 244 | value_ext = value_ext.detach().item() 245 | self.ext_buffer.store(obs, next_obs, action, ext_reward, done, value_ext) 246 | self.int_buffer.store(obs, next_obs, action, int_reward, done, value_int) 247 | self.count += 1 248 | total_reward += ext_reward 249 | obs = next_obs 250 | if self.count % self.capacity == 0: 251 | self.int_buffer.process() 252 | self.ext_buffer.process() 253 | self.train_count += 1 254 | self.train() 255 | self.int_buffer.clear() 256 | self.ext_buffer.clear() 257 | if done: 258 | if not self.weight_reward: 259 | self.weight_reward = total_reward 260 | else: 261 | self.weight_reward = self.weight_reward * 0.99 + total_reward * 0.01 262 | if self.log: 263 | self.writer.add_scalar('weight_reward', self.weight_reward, i+1) 264 | self.writer.add_scalar('reward', total_reward, i+1) 265 | print('episode: {} reward: {:.2f} weight_reward: {:.2f} train_step: {}'.format(i+1, total_reward, self.weight_reward, self.train_count)) 266 | break 267 | 268 | 269 | if __name__ == '__main__': 270 | env = gym.make('CartPole-v1').unwrapped 271 | test = ppo_clip( 272 | env=env, 273 | episode=10000, 274 | learning_rate=1e-3, 275 | gamma=0.99, 276 | lam=0.97, 277 | epsilon=0.2, 278 | capacity=20, 279 | render=False, 280 | log=False, 281 | value_update_iter=10, 282 | policy_update_iter=10, 283 | int_coef=1., 284 | ext_coef=2., 285 | rnd_update_prop=0.25 286 | ) 287 | test.run() 288 | 
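The exploration signal in rnd.py comes from random network distillation: a frozen, randomly initialized target network embeds each observation, a predictor network is regressed onto that embedding, and the prediction error is paid out as intrinsic reward, so rarely visited states earn a larger bonus. A minimal sketch of just that mechanism is shown below; the class name rnd_bonus and the feature_dim argument are illustrative and not part of this repository. Two details in ppo_clip.train() above are worth a second look: the second unpack takes the extrinsic batch from self.int_buffer where self.ext_buffer appears to be intended, and the 'rnd_loss' scalar is logged from policy_loss_buffer rather than rnd_loss_buffer.

import torch
import torch.nn as nn

class rnd_bonus(nn.Module):
    def __init__(self, obs_dim, feature_dim=64):
        super(rnd_bonus, self).__init__()
        self.target = nn.Sequential(nn.Linear(obs_dim, 128), nn.ReLU(), nn.Linear(128, feature_dim))
        self.predictor = nn.Sequential(nn.Linear(obs_dim, 128), nn.ReLU(), nn.Linear(128, feature_dim))
        for param in self.target.parameters():
            param.requires_grad = False        # the target stays at its random initialization

    def loss(self, obs):
        # distillation loss: only the predictor receives gradients
        return (self.predictor(obs) - self.target(obs)).pow(2).mean()

    def intrinsic_reward(self, obs):
        # per-state novelty bonus, detached so it acts purely as a reward signal
        with torch.no_grad():
            err = 0.5 * (self.predictor(obs) - self.target(obs)).pow(2).sum(dim=-1)
        return err.numpy()

# usage sketch: bonus = rnd_bonus(obs_dim=4); r_int = bonus.intrinsic_reward(torch.randn(32, 4))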
-------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # Related Papers 2 | 3 | * A3C 4 | 5 | [Asynchronous Methods for Deep Reinforcement Learning](http://arxiv.org/abs/1602.01783) 6 | 7 | * ACER 8 | 9 | [Sample Efficient Actor-Critic with Experience Replay](http://arxiv.org/abs/1611.01224) 10 | 11 | * TRPO 12 | 13 | [Trust Region Policy Optimization](http://arxiv.org/abs/1502.05477) 14 | 15 | * PPO 16 | 17 | [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347v2) 18 | 19 | * ICM 20 | 21 | [Curiosity-driven Exploration by Self-supervised Prediction](http://arxiv.org/abs/1705.05363) 22 | 23 | * RND 24 | 25 | [Exploration by Random Network Distillation](http://arxiv.org/abs/1810.12894) 26 | 27 | * DDPG 28 | 29 | [Continuous control with deep reinforcement learning](http://arxiv.org/abs/1509.02971) 30 | 31 | * TD3 32 | 33 | [Addressing Function Approximation Error in Actor-Critic Methods](http://arxiv.org/abs/1802.09477) 34 | 35 | * SAC 36 | 37 | [Soft Actor-Critic Algorithms and Applications](http://arxiv.org/abs/1812.05905) 38 | 39 | [Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor](http://arxiv.org/abs/1801.01290) 40 | 41 | [Soft Actor-Critic for Discrete Action Settings](http://arxiv.org/abs/1910.07207) 42 | 43 | * DSAC 44 | 45 | [DSAC: Distributional Soft Actor Critic for Risk-Sensitive Reinforcement Learning](http://arxiv.org/abs/2004.14547) 46 | 47 | -------------------------------------------------------------------------------- /SAC/sac.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.distributions import Normal 5 | import random 6 | from collections import deque 7 | import numpy as np 8 | import gym 9 | import math 10 | from torch.utils.tensorboard import SummaryWriter 11 | 12 | class normallized_action_wrapper(gym.ActionWrapper): 13 | # * because the tanh value range is [-1, 1], so change the env action range 14 | def action(self, action): 15 | # * change action range from [-1, 1] to [env.low, env.high] 16 | low = self.action_space.low 17 | high = self.action_space.high 18 | 19 | action = (action + 1) / 2 * (high - low) - 2 20 | action = np.clip(action, low, high) 21 | return action 22 | 23 | def reverse_action(self, action): 24 | # * change action range from [env.low, env.high] to [-1, 1] 25 | low = self.action_space.low 26 | high = self.action_space.high 27 | 28 | action = (action - low) / ((high - low) / 2) - 1 29 | action = np.clip(action, -1, 1) 30 | return action 31 | 32 | 33 | class replay_buffer(object): 34 | def __init__(self, capacity): 35 | self.capacity = capacity 36 | self.memory = deque(maxlen=self.capacity) 37 | 38 | def store(self, observation, action, reward, next_observation, done): 39 | observation = np.expand_dims(observation, 0) 40 | next_observation = np.expand_dims(next_observation, 0) 41 | self.memory.append([observation, action, reward, next_observation, done]) 42 | 43 | def sample(self, batch_size): 44 | batch = random.sample(self.memory, batch_size) 45 | observation, action, reward, next_observation, done = zip(* batch) 46 | return np.concatenate(observation, 0), action, reward, np.concatenate(next_observation, 0), done 47 | 48 | def __len__(self): 49 | return len(self.memory) 50 | 51 | 52 | class policy_net(nn.Module): 53 | # * SAC 
trains a stochastic policy, not a deterministic policy which like TD3 and DDPG 54 | def __init__(self, input_dim, output_dim, min_log_sigma=-20., max_log_sigma=2.): 55 | super(policy_net, self).__init__() 56 | self.input_dim = input_dim 57 | self.output_dim = output_dim 58 | self.min_log_sigma = min_log_sigma 59 | self.max_log_sigma = max_log_sigma 60 | 61 | self.fc1 = nn.Linear(self.input_dim, 128) 62 | self.fc2 = nn.Linear(128, 128) 63 | self.fc_mu = nn.Linear(128, self.output_dim) 64 | self.fc_sigma = nn.Linear(128, self.output_dim) 65 | 66 | def forward(self, input): 67 | x = F.relu(self.fc1(input)) 68 | x = F.relu(self.fc2(x)) 69 | mu = self.fc_mu(x) 70 | # * standard deviations are parameterized, the way not same as VPG, PPO and TRPO 71 | log_sigma = self.fc_sigma(x) 72 | log_sigma = torch.clamp(log_sigma, self.min_log_sigma, self.max_log_sigma) 73 | return mu, log_sigma 74 | 75 | def act(self, input): 76 | mu, log_sigma = self.forward(input) 77 | sigma = torch.exp(log_sigma) 78 | dist = Normal(mu, sigma) 79 | # * reparameterization trick: recognize the difference of sample() and rsample() 80 | action = dist.rsample() 81 | tanh_action = torch.tanh(action) 82 | # * the log-probabilities of actions can be calculated in closed forms 83 | log_prob = dist.log_prob(action) 84 | log_prob = log_prob - torch.log(1 - torch.tanh(action).pow(2)).sum(1, keepdim=True) 85 | return tanh_action, log_prob 86 | 87 | 88 | class value_net(nn.Module): 89 | def __init__(self, input1_dim, input2_dim, output_dim): 90 | super(value_net, self).__init__() 91 | self.input1_dim = input1_dim 92 | self.input2_dim = input2_dim 93 | self.output_dim = output_dim 94 | 95 | self.fc1 = nn.Linear(self.input1_dim + self.input2_dim, 128) 96 | self.fc2 = nn.Linear(128, 128) 97 | self.fc3 = nn.Linear(128, self.output_dim) 98 | 99 | def forward(self, input1, input2): 100 | x = torch.cat([input1, input2], 1) 101 | x = F.relu(self.fc1(x)) 102 | x = F.relu(self.fc2(x)) 103 | x = self.fc3(x) 104 | return x 105 | 106 | 107 | class sac(object): 108 | def __init__(self, env, batch_size, learning_rate, exploration, episode, gamma, alpha, auto_entropy_tuning, capacity, rho, update_iter, update_every, render, log): 109 | self.env = env 110 | self.batch_size = batch_size 111 | self.learning_rate = learning_rate 112 | self.exploration = exploration 113 | self.episode = episode 114 | self.gamma = gamma 115 | self.auto_entropy_tuning = auto_entropy_tuning 116 | if not self.auto_entropy_tuning: 117 | self.alpha = alpha 118 | else: 119 | # * the automatic temperature alpha tuning mechanism 120 | self.log_alpha = torch.zeros(1, requires_grad=True) 121 | self.alpha = self.log_alpha.exp() 122 | self.target_entropy = - torch.prod(torch.FloatTensor(self.env.action_space.shape)).item() 123 | self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=self.learning_rate, eps=1e-4) 124 | self.capacity = capacity 125 | self.rho = rho 126 | self.update_iter = update_iter 127 | self.update_every = update_every 128 | self.render = render 129 | self.log = log 130 | 131 | self.observation_dim = self.env.observation_space.shape[0] 132 | self.action_dim = self.env.action_space.shape[0] 133 | 134 | self.value_net1 = value_net(self.observation_dim, self.action_dim, 1) 135 | self.value_net2 = value_net(self.observation_dim, self.action_dim, 1) 136 | self.target_value_net1 = value_net(self.observation_dim, self.action_dim, 1) 137 | self.target_value_net2 = value_net(self.observation_dim, self.action_dim, 1) 138 | self.policy_net = 
policy_net(self.observation_dim, self.action_dim) 139 | self.target_value_net1.load_state_dict(self.value_net1.state_dict()) 140 | self.target_value_net2.load_state_dict(self.value_net2.state_dict()) 141 | 142 | self.buffer = replay_buffer(capacity=self.capacity) 143 | 144 | self.value_optimizer1 = torch.optim.Adam(self.value_net1.parameters(), lr=self.learning_rate) 145 | self.value_optimizer2 = torch.optim.Adam(self.value_net2.parameters(), lr=self.learning_rate) 146 | self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate) 147 | 148 | self.weight_reward = None 149 | self.count = 0 150 | self.train_count = 0 151 | self.writer = SummaryWriter('runs/sac') 152 | 153 | def soft_update(self): 154 | for param, target_param in zip(self.value_net1.parameters(), self.target_value_net1.parameters()): 155 | target_param.detach().copy_(param.detach() * (1 - self.rho) + target_param.detach() * self.rho) 156 | for param, target_param in zip(self.value_net2.parameters(), self.target_value_net2.parameters()): 157 | target_param.detach().copy_(param.detach() * (1 - self.rho) + target_param.detach() * self.rho) 158 | 159 | def train(self): 160 | observation, action, reward, next_observation, done = self.buffer.sample(self.batch_size) 161 | 162 | observation = torch.FloatTensor(observation) 163 | action = torch.FloatTensor(action).unsqueeze(1) 164 | reward = torch.FloatTensor(reward).unsqueeze(1) 165 | next_observation = torch.FloatTensor(next_observation) 166 | done = torch.FloatTensor(done).unsqueeze(1) 167 | 168 | value_loss1_buffer = [] 169 | value_loss2_buffer = [] 170 | policy_loss_buffer = [] 171 | for _ in range(self.update_iter): 172 | next_action, log_prob = self.policy_net.act(next_observation) 173 | target_q_value1 = self.target_value_net1.forward(next_observation, next_action) 174 | target_q_value2 = self.target_value_net2.forward(next_observation, next_action) 175 | target_q = reward + (1 - done) * self.gamma * (torch.min(target_q_value1, target_q_value2) - self.alpha * log_prob) 176 | target_q = target_q.detach() 177 | 178 | q1 = self.value_net1.forward(observation, action) 179 | q2 = self.value_net2.forward(observation, action) 180 | value_loss1 = (q1 - target_q).pow(2).mean() 181 | value_loss2 = (q2 - target_q).pow(2).mean() 182 | value_loss1_buffer.append(value_loss1.detach().item()) 183 | value_loss2_buffer.append(value_loss2.detach().item()) 184 | 185 | self.value_optimizer1.zero_grad() 186 | value_loss1.backward() 187 | nn.utils.clip_grad_norm_(self.value_net1.parameters(), 0.5) 188 | self.value_optimizer1.step() 189 | 190 | self.value_optimizer2.zero_grad() 191 | value_loss2.backward() 192 | nn.utils.clip_grad_norm_(self.value_net2.parameters(), 0.5) 193 | self.value_optimizer2.step() 194 | 195 | sample_action, sample_log_prob = self.policy_net.act(observation) 196 | sample_q1 = self.value_net1.forward(observation, sample_action) 197 | sample_q2 = self.value_net2.forward(observation, sample_action) 198 | policy_loss = - (torch.min(sample_q1, sample_q2) - self.alpha * sample_log_prob) 199 | policy_loss = policy_loss.mean() 200 | policy_loss_buffer.append(policy_loss.detach().item()) 201 | 202 | self.policy_optimizer.zero_grad() 203 | policy_loss.backward() 204 | nn.utils.clip_grad_norm_(self.policy_net.parameters(), 0.5) 205 | self.policy_optimizer.step() 206 | 207 | if self.auto_entropy_tuning: 208 | self.alpha_optimizer.zero_grad() 209 | entropy_loss = -(self.log_alpha * (log_prob + self.target_entropy).detach()).mean() 210 | 
entropy_loss.backward() 211 | self.alpha_optimizer.step() 212 | 213 | self.alpha = self.log_alpha.exp() 214 | 215 | self.soft_update() 216 | if self.log: 217 | self.writer.add_scalar('value_loss1', np.mean(value_loss1_buffer), self.train_count) 218 | self.writer.add_scalar('value_loss2', np.mean(value_loss2_buffer), self.train_count) 219 | self.writer.add_scalar('policy_loss', np.mean(policy_loss_buffer), self.train_count) 220 | 221 | def run(self): 222 | for i in range(self.episode): 223 | obs = self.env.reset() 224 | total_reward = 0 225 | if self.render: 226 | self.env.render() 227 | while True: 228 | if i >= self.exploration: 229 | action, _ = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0))) 230 | action = action.detach().item() 231 | else: 232 | action = np.random.uniform(-1., 1.) 233 | next_obs, reward, done, _ = self.env.step(action) 234 | if self.render: 235 | self.env.render() 236 | self.buffer.store(obs, action, reward, next_obs, done) 237 | self.count += 1 238 | total_reward += reward 239 | obs = next_obs 240 | 241 | if (self.count % self.update_every) == 0 and i >= self.exploration: 242 | self.train_count += 1 243 | self.train() 244 | if done: 245 | if not self.weight_reward: 246 | self.weight_reward = total_reward 247 | else: 248 | self.weight_reward = self.weight_reward * 0.99 + total_reward * 0.01 249 | if self.log: 250 | self.writer.add_scalar('reward', total_reward, i + 1) 251 | self.writer.add_scalar('weight_reward', self.weight_reward, i + 1) 252 | print('episode: {} reward: {:.2f} weight_reward: {:.2f}'.format(i + 1, total_reward, self.weight_reward)) 253 | break 254 | 255 | 256 | if __name__ == '__main__': 257 | env = normallized_action_wrapper(gym.make('Pendulum-v0')) 258 | test = sac(env=env, 259 | batch_size=100, 260 | learning_rate=1e-3, 261 | exploration=300, 262 | episode=10000, 263 | gamma=0.99, 264 | alpha=None, 265 | auto_entropy_tuning=True, 266 | capacity=1000000, 267 | rho=0.995, 268 | update_iter=10, 269 | update_every=50, 270 | render=False, 271 | log=False) 272 | test.run() -------------------------------------------------------------------------------- /SAC/sac_discrete.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import gym 5 | from collections import deque 6 | import random 7 | from torch.utils.tensorboard import SummaryWriter 8 | import numpy as np 9 | 10 | 11 | class replay_buffer(object): 12 | def __init__(self, capacity): 13 | self.capacity = capacity 14 | self.memory = deque(maxlen=self.capacity) 15 | 16 | def store(self, observation, action, reward, next_observation, done): 17 | observation = np.expand_dims(observation, 0) 18 | next_observation = np.expand_dims(next_observation, 0) 19 | self.memory.append([observation, action, reward, next_observation, done]) 20 | 21 | def sample(self, batch_size): 22 | batch = random.sample(self.memory, batch_size) 23 | observation, action, reward, next_observation, done = zip(* batch) 24 | return np.concatenate(observation, 0), action, reward, np.concatenate(next_observation, 0), done 25 | 26 | def __len__(self): 27 | return len(self.memory) 28 | 29 | 30 | class value_net(nn.Module): 31 | def __init__(self, input_dim, output_dim): 32 | super(value_net, self).__init__() 33 | self.input_dim = input_dim 34 | self.output_dim = output_dim 35 | 36 | self.fc1 = nn.Linear(self.input_dim, 128) 37 | self.fc2 = nn.Linear(128, 128) 38 | self.fc3 = nn.Linear(128, self.output_dim) 39 | 40 
| def forward(self, input): 41 | x = F.relu(self.fc1(input)) 42 | x = F.relu(self.fc2(x)) 43 | x = self.fc3(x) 44 | return x 45 | 46 | 47 | class policy_net(nn.Module): 48 | def __init__(self, input_dim, output_dim): 49 | super(policy_net, self).__init__() 50 | self.input_dim = input_dim 51 | self.output_dim = output_dim 52 | 53 | self.fc1 = nn.Linear(self.input_dim, 128) 54 | self.fc2 = nn.Linear(128, 128) 55 | self.fc3 = nn.Linear(128, self.output_dim) 56 | 57 | def forward(self, input): 58 | x = F.relu(self.fc1(input)) 59 | x = F.relu(self.fc2(x)) 60 | policy = F.softmax(self.fc3(x), dim=-1) 61 | return policy 62 | 63 | def act(self, input): 64 | policy = self.forward(input) 65 | dist = torch.distributions.Categorical(policy) 66 | action = dist.sample() 67 | return action[0].item() 68 | 69 | 70 | class sac_discrete(object): 71 | def __init__(self, env, batch_size, learning_rate, exploration, episode, gamma, alpha, auto_entropy_tuning, capacity, rho, update_iter, update_every, render, log): 72 | self.env = env 73 | self.batch_size = batch_size 74 | self.learning_rate = learning_rate 75 | self.exploration = exploration 76 | self.episode = episode 77 | self.gamma = gamma 78 | self.auto_entropy_tuning = auto_entropy_tuning 79 | if not self.auto_entropy_tuning: 80 | self.alpha = alpha 81 | else: 82 | self.log_alpha = torch.zeros(1, requires_grad=True) 83 | self.alpha = self.log_alpha.exp() 84 | # * set the max possible entropy as the target entropy 85 | self.target_entropy = -np.log((1. / self.env.action_space.n)) * 0.98 86 | self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=self.learning_rate, eps=1e-4) 87 | self.capacity = capacity 88 | self.rho = rho 89 | self.update_iter = update_iter 90 | self.update_every = update_every 91 | self.render = render 92 | self.log = log 93 | 94 | self.observation_dim = self.env.observation_space.shape[0] 95 | self.action_num = self.env.action_space.n 96 | 97 | self.value_net1 = value_net(self.observation_dim, self.action_num) 98 | self.value_net2 = value_net(self.observation_dim, self.action_num) 99 | self.target_value_net1 = value_net(self.observation_dim, self.action_num) 100 | self.target_value_net2 = value_net(self.observation_dim, self.action_num) 101 | self.policy_net = policy_net(self.observation_dim, self.action_num) 102 | self.target_value_net1.load_state_dict(self.value_net1.state_dict()) 103 | self.target_value_net2.load_state_dict(self.value_net2.state_dict()) 104 | 105 | self.buffer = replay_buffer(capacity=self.capacity) 106 | 107 | self.value_optimizer1 = torch.optim.Adam(self.value_net1.parameters(), lr=self.learning_rate) 108 | self.value_optimizer2 = torch.optim.Adam(self.value_net2.parameters(), lr=self.learning_rate) 109 | self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate) 110 | 111 | self.weight_reward = None 112 | self.count = 0 113 | self.train_count = 0 114 | self.writer = SummaryWriter('runs/sac_discrete') 115 | 116 | def soft_update(self): 117 | for param, target_param in zip(self.value_net1.parameters(), self.target_value_net1.parameters()): 118 | target_param.detach().copy_(param.detach() * (1 - self.rho) + target_param.detach() * self.rho) 119 | for param, target_param in zip(self.value_net2.parameters(), self.target_value_net2.parameters()): 120 | target_param.detach().copy_(param.detach() * (1 - self.rho) + target_param.detach() * self.rho) 121 | 122 | def train(self): 123 | observation, action, reward, next_observation, done = self.buffer.sample(self.batch_size) 124 | 125 
| observation = torch.FloatTensor(observation)
126 | action = torch.LongTensor(action).unsqueeze(1)
127 | reward = torch.FloatTensor(reward).unsqueeze(1)
128 | next_observation = torch.FloatTensor(next_observation)
129 | done = torch.FloatTensor(done).unsqueeze(1)
130 |
131 | value_loss1_buffer = []
132 | value_loss2_buffer = []
133 | policy_loss_buffer = []
134 | for _ in range(self.update_iter):
135 | policy = self.policy_net.forward(next_observation)
136 | target_q_value1 = self.target_value_net1.forward(next_observation)
137 | target_q_value2 = self.target_value_net2.forward(next_observation)
138 | # * calculate the expectation over actions directly: sum_a pi(a) * (min(Q1(a), Q2(a)) - alpha * log pi(a))
139 | target_q = reward + (1 - done) * self.gamma * (policy * (torch.min(target_q_value1, target_q_value2) - self.alpha * policy.log())).sum(dim=1).unsqueeze(-1)
140 | target_q = target_q.detach()
141 |
142 | q1 = self.value_net1.forward(observation).gather(dim=1, index=action)
143 | q2 = self.value_net2.forward(observation).gather(dim=1, index=action)
144 | value_loss1 = (q1 - target_q).pow(2).mean()
145 | value_loss2 = (q2 - target_q).pow(2).mean()
146 | value_loss1_buffer.append(value_loss1.detach().item())
147 | value_loss2_buffer.append(value_loss2.detach().item())
148 |
149 | self.value_optimizer1.zero_grad()
150 | value_loss1.backward()
151 | nn.utils.clip_grad_norm_(self.value_net1.parameters(), 0.5)
152 | self.value_optimizer1.step()
153 |
154 | self.value_optimizer2.zero_grad()
155 | value_loss2.backward()
156 | nn.utils.clip_grad_norm_(self.value_net2.parameters(), 0.5)
157 | self.value_optimizer2.step()
158 |
159 | # * calculate the expectation over actions directly, then average over the batch
160 | policy_loss = policy * (self.alpha * policy.log() - torch.min(target_q_value1, target_q_value2).detach())
161 | policy_loss = policy_loss.sum(dim=1).mean()
162 | policy_loss_buffer.append(policy_loss.detach().item())
163 | self.policy_optimizer.zero_grad()
164 | policy_loss.backward()
165 | nn.utils.clip_grad_norm_(self.policy_net.parameters(), 0.5)
166 | self.policy_optimizer.step()
167 |
168 | if self.auto_entropy_tuning:
169 | self.alpha_optimizer.zero_grad()
170 | entropy_loss = -(self.log_alpha * (policy.log() + self.target_entropy).detach()).mean()
171 | entropy_loss.backward()
172 | nn.utils.clip_grad_norm_([self.log_alpha], 0.2)
173 | self.alpha_optimizer.step()
174 |
175 | self.alpha = self.log_alpha.exp()
176 |
177 | self.soft_update()
178 | if self.log:
179 | self.writer.add_scalar('value_loss1', np.mean(value_loss1_buffer), self.train_count)
180 | self.writer.add_scalar('value_loss2', np.mean(value_loss2_buffer), self.train_count)
181 | self.writer.add_scalar('policy_loss', np.mean(policy_loss_buffer), self.train_count)
182 |
183 | def run(self):
184 | for i in range(self.episode):
185 | obs = self.env.reset()
186 | total_reward = 0
187 | if self.render:
188 | self.env.render()
189 | while True:
190 | if i >= self.exploration:
191 | action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0)))
192 | else:
193 | action = random.choice(list(range(self.action_num)))
194 | next_obs, reward, done, _ = self.env.step(action)
195 | if self.render:
196 | self.env.render()
197 | self.buffer.store(obs, action, reward, next_obs, done)
198 | self.count += 1
199 | total_reward += reward
200 | obs = next_obs
201 |
202 | if (self.count % self.update_every) == 0 and i >= self.exploration:
203 | self.train_count += 1
204 | self.train()
205 | if done:
206 | if not self.weight_reward:
207 | self.weight_reward = total_reward
208 | else:
209 | self.weight_reward = self.weight_reward * 0.99 + total_reward * 0.01
210 | if self.log:
211 |
self.writer.add_scalar('reward', total_reward, i + 1) 212 | self.writer.add_scalar('weight_reward', self.weight_reward, i + 1) 213 | print('episode: {} reward: {:.2f} weight_reward: {:.2f}'.format(i + 1, total_reward, self.weight_reward)) 214 | break 215 | 216 | if __name__ == '__main__': 217 | env = gym.make('CartPole-v1').unwrapped 218 | test = sac_discrete( 219 | env=env, 220 | batch_size=64, 221 | learning_rate=3e-4, 222 | exploration=3000, 223 | episode=10000, 224 | gamma=0.99, 225 | alpha=None, 226 | auto_entropy_tuning=True, 227 | capacity=100000, 228 | rho=0.995, 229 | update_iter=3, 230 | update_every=5, 231 | render=False, 232 | log=False 233 | ) 234 | test.run() 235 | -------------------------------------------------------------------------------- /TD3/td3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.distributions import Normal 5 | import random 6 | from collections import deque 7 | import numpy as np 8 | import gym 9 | import math 10 | from torch.utils.tensorboard import SummaryWriter 11 | 12 | 13 | class replay_buffer(object): 14 | def __init__(self, capacity): 15 | self.capacity = capacity 16 | self.memory = deque(maxlen=self.capacity) 17 | 18 | def store(self, observation, action, reward, next_observation, done): 19 | observation = np.expand_dims(observation, 0) 20 | next_observation = np.expand_dims(next_observation, 0) 21 | self.memory.append([observation, action, reward, next_observation, done]) 22 | 23 | def sample(self, batch_size): 24 | batch = random.sample(self.memory, batch_size) 25 | observation, action, reward, next_observation, done = zip(* batch) 26 | return np.concatenate(observation, 0), action, reward, np.concatenate(next_observation, 0), done 27 | 28 | def __len__(self): 29 | return len(self.memory) 30 | 31 | 32 | class policy_net(nn.Module): 33 | # * deterministic actor network, output a deterministic value as the selected action 34 | def __init__(self, input_dim, output_dim): 35 | super(policy_net, self).__init__() 36 | self.input_dim = input_dim 37 | self.output_dim = output_dim 38 | 39 | self.fc1 = nn.Linear(self.input_dim, 128) 40 | self.fc2 = nn.Linear(128, 128) 41 | self.fc3 = nn.Linear(128, self.output_dim) 42 | 43 | def forward(self, input): 44 | x = F.relu(self.fc1(input)) 45 | x = F.relu(self.fc2(x)) 46 | x = self.fc3(x) 47 | return x 48 | 49 | def act(self, input): 50 | action = self.forward(input).detach().item() 51 | return action 52 | 53 | 54 | class value_net(nn.Module): 55 | def __init__(self, input1_dim, input2_dim, output_dim): 56 | super(value_net, self).__init__() 57 | self.input1_dim = input1_dim 58 | self.input2_dim = input2_dim 59 | self.output_dim = output_dim 60 | 61 | self.fc1 = nn.Linear(self.input1_dim + self.input2_dim, 128) 62 | self.fc2 = nn.Linear(128, 128) 63 | self.fc3 = nn.Linear(128, self.output_dim) 64 | 65 | def forward(self, input1, input2): 66 | x = torch.cat([input1, input2], 1) 67 | x = F.relu(self.fc1(x)) 68 | x = F.relu(self.fc2(x)) 69 | x = self.fc3(x) 70 | return x 71 | 72 | 73 | class td3(object): 74 | def __init__(self, env, batch_size, learning_rate, exploration, episode, gamma, capacity, rho, update_iter, policy_delay, epsilon_init, decay, epsilon_min, max_a, min_a, noisy_range, render, log): 75 | self.env = env 76 | self.batch_size = batch_size 77 | self.learning_rate = learning_rate 78 | self.exploration = exploration 79 | self.episode = episode 80 | self.gamma = gamma 81 | 
self.capacity = capacity 82 | self.rho = rho 83 | self.update_iter = update_iter 84 | self.policy_delay = policy_delay 85 | self.epsilon_init = epsilon_init 86 | self.decay = decay 87 | self.epsilon_min = epsilon_min 88 | self.max_a = max_a 89 | self.min_a = min_a 90 | self.noisy_range = noisy_range 91 | self.render = render 92 | self.log = log 93 | 94 | self.observation_dim = self.env.observation_space.shape[0] 95 | self.action_dim = self.env.action_space.shape[0] 96 | 97 | self.value_net1 = value_net(self.observation_dim, self.action_dim, 1) 98 | self.value_net2 = value_net(self.observation_dim, self.action_dim, 1) 99 | self.target_value_net1 = value_net(self.observation_dim, self.action_dim, 1) 100 | self.target_value_net2 = value_net(self.observation_dim, self.action_dim, 1) 101 | self.policy_net = policy_net(self.observation_dim, self.action_dim) 102 | self.target_policy_net = policy_net(self.observation_dim, self.action_dim) 103 | self.target_value_net1.load_state_dict(self.value_net1.state_dict()) 104 | self.target_value_net2.load_state_dict(self.value_net2.state_dict()) 105 | self.target_policy_net.load_state_dict(self.policy_net.state_dict()) 106 | 107 | self.buffer = replay_buffer(capacity=self.capacity) 108 | 109 | self.value_optimizer1 = torch.optim.Adam(self.value_net1.parameters(), lr=self.learning_rate) 110 | self.value_optimizer2 = torch.optim.Adam(self.value_net2.parameters(), lr=self.learning_rate) 111 | self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate) 112 | 113 | self.weight_reward = None 114 | self.count = 0 115 | self.train_count = 0 116 | self.epsilon = lambda x: self.epsilon_min + (self.epsilon_init - self.epsilon_min) * math.exp(- x / self.decay) 117 | self.writer = SummaryWriter('runs/td3') 118 | 119 | def soft_update(self): 120 | for param, target_param in zip(self.value_net1.parameters(), self.target_value_net1.parameters()): 121 | target_param.detach().copy_(param.detach() * (1 - self.rho) + target_param.detach() * self.rho) 122 | for param, target_param in zip(self.value_net2.parameters(), self.target_value_net2.parameters()): 123 | target_param.detach().copy_(param.detach() * (1 - self.rho) + target_param.detach() * self.rho) 124 | for param, target_param in zip(self.policy_net.parameters(), self.target_policy_net.parameters()): 125 | target_param.detach().copy_(param.detach() * (1 - self.rho) + target_param.detach() * self.rho) 126 | 127 | def train(self): 128 | value1_loss_buffer = [] 129 | value2_loss_buffer = [] 130 | policy_loss_buffer = [] 131 | for iter in range(self.update_iter): 132 | observation, action, reward, next_observation, done = self.buffer.sample(self.batch_size) 133 | 134 | observation = torch.FloatTensor(observation) 135 | action = torch.FloatTensor(action).unsqueeze(1) 136 | reward = torch.FloatTensor(reward).unsqueeze(1) 137 | next_observation = torch.FloatTensor(next_observation) 138 | done = torch.FloatTensor(done).unsqueeze(1) 139 | 140 | target_next_action = self.target_policy_net.forward(next_observation) 141 | target_next_action = target_next_action + np.clip(np.random.randn() * self.epsilon(self.count), - self.noisy_range, self.noisy_range) 142 | target_next_action = torch.clamp(target_next_action, self.min_a, self.max_a).detach() 143 | 144 | q_min = torch.min(self.target_value_net1.forward(next_observation, target_next_action), self.target_value_net2.forward(next_observation, target_next_action)) 145 | target_q = reward + (1 - done) * self.gamma * q_min.detach() 146 | q1 = 
self.value_net1.forward(observation, action) 147 | q2 = self.value_net2.forward(observation, action) 148 | value_loss1 = (q1 - target_q).pow(2).mean() 149 | value_loss2 = (q2 - target_q).pow(2).mean() 150 | value1_loss_buffer.append(value_loss1.detach().item()) 151 | value2_loss_buffer.append(value_loss2.detach().item()) 152 | 153 | self.value_optimizer1.zero_grad() 154 | value_loss1.backward() 155 | torch.nn.utils.clip_grad_norm_(self.value_net1.parameters(), 0.5) 156 | self.value_optimizer1.step() 157 | 158 | self.value_optimizer2.zero_grad() 159 | value_loss2.backward() 160 | torch.nn.utils.clip_grad_norm_(self.value_net2.parameters(), 0.5) 161 | self.value_optimizer2.step() 162 | 163 | if (iter + 1) % self.policy_delay == 0: 164 | current_action = self.policy_net.forward(observation) 165 | policy_loss = (- self.value_net1.forward(observation, current_action)).mean() 166 | policy_loss_buffer.append(policy_loss.detach().item()) 167 | 168 | self.policy_optimizer.zero_grad() 169 | policy_loss.backward() 170 | torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.) 171 | self.policy_optimizer.step() 172 | 173 | self.soft_update() 174 | if self.log: 175 | self.writer.add_scalar('value1_loss', np.mean(value1_loss_buffer), self.train_count) 176 | self.writer.add_scalar('value2_loss', np.mean(value2_loss_buffer), self.train_count) 177 | self.writer.add_scalar('policy_loss', np.mean(policy_loss_buffer), self.train_count) 178 | 179 | def run(self): 180 | for i in range(self.episode): 181 | obs = self.env.reset() 182 | total_reward = 0 183 | if self.render: 184 | self.env.render() 185 | while True: 186 | action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0))) 187 | action = action + np.random.randn() * self.epsilon(self.count) 188 | action = np.clip(action, self.min_a, self.max_a) 189 | next_obs, reward, done, _ = self.env.step([action]) 190 | if self.render: 191 | self.env.render() 192 | self.buffer.store(obs, action, reward, next_obs, done) 193 | self.count += 1 194 | total_reward += reward 195 | obs = next_obs 196 | 197 | if done: 198 | if i > self.exploration: 199 | self.train_count += 1 200 | self.train() 201 | if not self.weight_reward: 202 | self.weight_reward = total_reward 203 | else: 204 | self.weight_reward = self.weight_reward * 0.99 + total_reward * 0.01 205 | if self.log: 206 | self.writer.add_scalar('reward', total_reward, i + 1) 207 | self.writer.add_scalar('weight_reward', self.weight_reward, i + 1) 208 | print('episode: {} reward: {:.2f} weight_reward: {:.2f}'.format(i + 1, total_reward, self.weight_reward)) 209 | break 210 | 211 | 212 | if __name__ == '__main__': 213 | env = gym.make('Pendulum-v0') 214 | test = td3(env=env, 215 | batch_size=100, 216 | learning_rate=1e-3, 217 | exploration=300, 218 | episode=10000, 219 | gamma=0.99, 220 | capacity=10000, 221 | rho=0.995, 222 | update_iter=10, 223 | policy_delay=2, 224 | epsilon_init=1., 225 | decay=10000, 226 | epsilon_min=0.01, 227 | max_a=2., 228 | min_a=-2., 229 | noisy_range=0.5, 230 | render=False, 231 | log=False) 232 | test.run() -------------------------------------------------------------------------------- /TRPO/trpo_gae.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.distributions import Normal 5 | import numpy as np 6 | import gym 7 | from collections import deque 8 | import random 9 | from torch.utils.tensorboard import SummaryWriter 10 | 11 | 12 | class 
gae_trajectory_buffer(object): 13 | def __init__(self, capacity, gamma, lam): 14 | self.capacity = capacity 15 | self.memory = deque(maxlen=self.capacity) 16 | # * [observation, action, reward, done, value, return, advantage] 17 | self.gamma = gamma 18 | self.lam = lam 19 | 20 | def store(self, observation, action, reward, done, value): 21 | observation = np.expand_dims(observation, 0) 22 | self.memory.append([observation, action, reward, done, value]) 23 | 24 | def process(self): 25 | R = 0 26 | Adv = 0 27 | Value_previous = 0 28 | for traj in reversed(list(self.memory)): 29 | R = R * self.gamma * (1 - traj[3]) + traj[2] 30 | traj.append(R) 31 | # * the generalized advantage estimator(GAE) 32 | delta = traj[2] + self.gamma * (1 - traj[3]) * Value_previous - traj[4] 33 | Adv = delta + self.gamma * self.lam * Adv * (1 - traj[3]) 34 | Value_previous = traj[4] 35 | traj.append(Adv) 36 | 37 | def get(self): 38 | observation, action, reward, done, value, ret, advantage = zip(* list(self.memory)) 39 | observation = np.concatenate(observation, 0) 40 | action = np.expand_dims(action, 1) 41 | reward = np.expand_dims(reward, 1) 42 | done = np.expand_dims(done, 1) 43 | value = np.expand_dims(value, 1) 44 | ret = np.expand_dims(ret, 1) 45 | advantage = np.array(advantage) 46 | advantage = (advantage - advantage.mean()) / advantage.std() 47 | advantage = np.expand_dims(advantage, 1) 48 | return observation, action, reward, done, value, ret, advantage 49 | 50 | def clear(self): 51 | self.memory.clear() 52 | 53 | def __len__(self): 54 | return len(self.memory) 55 | 56 | 57 | class gaussian_policy_net(nn.Module): 58 | def __init__(self, input_dim, output_dim): 59 | super(gaussian_policy_net, self).__init__() 60 | self.input_dim = input_dim 61 | self.output_dim = output_dim 62 | 63 | self.fc1 = nn.Linear(self.input_dim, 128) 64 | self.fc2 = nn.Linear(128, 128) 65 | self.fc3 = nn.Linear(128, self.output_dim) 66 | 67 | def forward(self, input): 68 | x = F.tanh(self.fc1(input)) 69 | x = F.tanh(self.fc2(x)) 70 | mu = self.fc3(x) 71 | sigma = torch.ones_like(mu) 72 | #log_sigma = torch.zeros_like(mu) 73 | #sigma = torch.exp(log_sigma) 74 | return mu, sigma 75 | 76 | def act(self, input): 77 | mu, sigma = self.forward(input) 78 | dist = Normal(mu, sigma) 79 | action = dist.sample().detach().item() 80 | return action 81 | 82 | def distribute(self, input): 83 | mu, sigma = self.forward(input) 84 | dist = Normal(mu, sigma) 85 | return dist 86 | 87 | class value_net(nn.Module): 88 | def __init__(self, input_dim, output_dim): 89 | super(value_net, self).__init__() 90 | self.input_dim = input_dim 91 | self.output_dim = output_dim 92 | 93 | self.fc1 = nn.Linear(self.input_dim, 128) 94 | self.fc2 = nn.Linear(128, 128) 95 | self.fc3 = nn.Linear(128, self.output_dim) 96 | 97 | def forward(self, input): 98 | x = F.tanh(self.fc1(input)) 99 | x = F.tanh(self.fc2(x)) 100 | x = self.fc3(x) 101 | return x 102 | 103 | 104 | class trpo(object): 105 | def __init__(self, env, capacity, gamma, learning_rate, render, sample_size, episode, lam, delta, value_train_iter, policy_train_iter, method, backtrack_coeff, backtrack_alpha, training, log): 106 | self.env = env 107 | self.gamma = gamma 108 | self.lam = lam 109 | self.delta = delta 110 | self.capacity = capacity 111 | self.learning_rate = learning_rate 112 | self.render = render 113 | self.sample_size = sample_size 114 | self.episode = episode 115 | self.value_train_iter = value_train_iter 116 | self.policy_train_iter = policy_train_iter 117 | self.method = method 118 | 
self.backtrack_coeff = backtrack_coeff 119 | self.backtrack_alpha = backtrack_alpha 120 | self.training = training 121 | 122 | self.observation_dim = self.env.observation_space.shape[0] 123 | self.action_dim = self.env.action_space.shape[0] 124 | self.policy_net = gaussian_policy_net(self.observation_dim, self.action_dim) 125 | self.old_policy_net = gaussian_policy_net(self.observation_dim, self.action_dim) 126 | self.value_net = value_net(self.observation_dim, 1) 127 | self.buffer = gae_trajectory_buffer(capacity=self.capacity, gamma=self.gamma, lam=self.lam) 128 | self.value_optimizer = torch.optim.Adam(self.value_net.parameters(), lr=self.learning_rate) 129 | self.old_policy_optimizer = torch.optim.Adam(self.old_policy_net.parameters(), lr=self.learning_rate) 130 | self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate) 131 | self.count = 0 132 | self.train_count = 0 133 | self.weight_reward = None 134 | self.writer = SummaryWriter('runs/trpo_gae') 135 | self.log = log 136 | 137 | def guassian_kl(self, old_policy, policy, obs): 138 | # * calculate the guassian distribution kl 139 | mu_old, sigma_old = old_policy.forward(obs) 140 | mu_old, sigma_old = mu_old.detach(), sigma_old.detach() 141 | 142 | mu, sigma = policy.forward(obs) 143 | 144 | kl = torch.log(sigma / sigma_old) + (sigma_old.pow(2) + (mu_old - mu).pow(2)) / (2. * sigma.pow(2)) - 0.5 145 | return kl.sum(-1, keepdim=True).mean() 146 | 147 | def flatten_grad(self, grads, hessian=False): 148 | grad_flat = [] 149 | if hessian == False: 150 | for grad in grads: 151 | grad_flat.append(grad.view(-1)) 152 | grad_flat = torch.cat(grad_flat, 0) 153 | else: 154 | for grad in grads: 155 | grad_flat.append(grad.contiguous().view(-1)) 156 | grad_flat = torch.cat(grad_flat, 0).detach() 157 | return grad_flat 158 | 159 | def flatten_param(self, params): 160 | param_flat = [] 161 | for param in params: 162 | param_flat.append(param.view(-1)) 163 | return torch.cat(param_flat, 0).detach() 164 | 165 | def hessian_vector_product(self, obs, p, damping_coeff=0.1): 166 | # * calculate the production of hessian matrix with a vector 167 | # * obs : observation 168 | # * p : a vector 169 | kl = self.guassian_kl(self.old_policy_net, self.policy_net, obs) 170 | kl_grad = torch.autograd.grad(kl, self.policy_net.parameters(), create_graph=True) 171 | kl_grad = self.flatten_grad(kl_grad) 172 | 173 | kl_grad_p = (kl_grad * p).sum() 174 | kl_hessian = torch.autograd.grad(kl_grad_p, self.policy_net.parameters()) 175 | kl_hessian = self.flatten_grad(kl_hessian, hessian=True) 176 | return kl_hessian + p * damping_coeff 177 | 178 | def conjugate_gradient(self, obs, b, cg_iters=10, eps=1e-8, residual_tol=1e-10): 179 | # * calculate the search direction with conjugate gradient method, find the x that makes hx = g 180 | # * obs : observation 181 | # * b : gradient 182 | x = torch.zeros_like(b) 183 | r = b.clone() 184 | p = r.clone() 185 | rTr = torch.dot(r, r) 186 | 187 | for _ in range(cg_iters): 188 | Ap = self.hessian_vector_product(obs, p) 189 | alpha = rTr / (torch.dot(p, Ap) + eps) 190 | x = x + alpha * p 191 | r = r - alpha * Ap 192 | 193 | new_rTr = torch.dot(r, r) 194 | beta = new_rTr / rTr 195 | p = r + beta * p 196 | rTr = new_rTr 197 | 198 | if rTr < residual_tol: 199 | break 200 | return x 201 | 202 | def update_model(self, model, params): 203 | index = 0 204 | for param in model.parameters(): 205 | param_length = param.view(-1).size(0) 206 | new_param = params[index: index + param_length] 207 | new_param = 
new_param.view(param.size())
208 | param.detach().copy_(new_param)
209 | index += param_length
210 |
211 | def train(self):
212 | self.train_count += 1
213 | obs, act, rew, do, val, ret, adv = self.buffer.get()
214 |
215 | obs = torch.FloatTensor(obs)
216 | act = torch.FloatTensor(act)
217 | rew = torch.FloatTensor(rew)
218 | do = torch.FloatTensor(do)
219 | val = torch.FloatTensor(val)
220 | ret = torch.FloatTensor(ret)
221 | adv = torch.FloatTensor(adv)
222 |
223 | dist_old = self.policy_net.distribute(obs)
224 | log_prob_old = dist_old.log_prob(act).detach()
225 | dist = self.policy_net.distribute(obs)
226 | log_prob = dist.log_prob(act)
227 | value = self.value_net.forward(obs)
228 |
229 | ratio_old = torch.exp(log_prob - log_prob_old)
230 | policy_loss_old = (ratio_old * adv).mean()
231 | value_loss = (value - ret).pow(2).mean()
232 | self.writer.add_scalar('value_loss', value_loss, self.train_count)
233 | self.writer.add_scalar('policy_loss_old', policy_loss_old, self.train_count)
234 |
235 | for _ in range(self.value_train_iter):
236 | value_loss = (self.value_net.forward(obs) - ret).pow(2).mean()  # recompute the regression loss with the current value parameters
237 | self.value_optimizer.zero_grad()
238 | value_loss.backward()
239 | self.value_optimizer.step()
240 | gradient = torch.autograd.grad(policy_loss_old, self.policy_net.parameters())
241 | gradient = self.flatten_grad(gradient)
242 |
243 | search_dir = self.conjugate_gradient(obs, gradient)
244 | # * search_dir is the step direction x that solves Hx = g in the TRPO paper
245 | xhx = torch.dot(self.hessian_vector_product(obs, search_dir), search_dir)
246 | step_size = torch.sqrt((2. * self.delta) / xhx)
247 | old_params = self.flatten_param(self.policy_net.parameters())
248 | self.update_model(self.old_policy_net, old_params)
249 |
250 | if self.method == 'npg':
251 | params = old_params + step_size * search_dir
252 | self.update_model(self.policy_net, params)
253 |
254 | elif self.method == 'trpo':
255 | full_improve = (gradient * step_size * search_dir).sum(0, keepdim=True)
256 | dist_old = self.old_policy_net.distribute(obs)
257 |
258 | for i in range(self.policy_train_iter):
259 | params = old_params + self.backtrack_coeff * step_size * search_dir
260 | self.update_model(self.policy_net, params)
261 |
262 | dist = self.policy_net.distribute(obs)
263 | log_prob = dist.log_prob(act)
264 | ratio = torch.exp(log_prob - log_prob_old)
265 | policy_loss = (ratio * adv).mean()
266 | loss_improve = policy_loss - policy_loss_old
267 | full_improve = full_improve * self.backtrack_coeff
268 | improve_condition = loss_improve / full_improve
269 |
270 | kl = self.guassian_kl(self.old_policy_net, self.policy_net, obs)
271 |
272 | if kl < self.delta and improve_condition > self.backtrack_alpha:
273 | self.writer.add_scalar('improve_condition', improve_condition, self.train_count)
274 | self.writer.add_scalar('kl', kl, self.train_count)
275 | self.writer.add_scalar('backtrack_coeff', self.backtrack_coeff, self.train_count)
276 | break
277 | else:
278 | if i == self.policy_train_iter - 1:
279 | params = self.flatten_param(self.old_policy_net.parameters())
280 | self.update_model(self.policy_net, params)
281 | self.writer.add_scalar('improve_condition', improve_condition, self.train_count)
282 | self.writer.add_scalar('kl', kl, self.train_count)
283 | self.writer.add_scalar('backtrack_coeff', 0., self.train_count)
284 | self.backtrack_coeff = self.backtrack_coeff * 0.5
285 | self.backtrack_coeff = 1.  # reset the backtracking coefficient so the next update starts the line search from 1
286 | 287 | def run(self): 288 | for i in range(self.episode): 289 | total_reward = 0 290 | obs = self.env.reset() 291 | if self.render: 292 | self.env.render() 293 | while True: 294 | self.count += 1 295 | if self.training: 296 | action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0))) 297 | next_obs, reward, done, _ = self.env.step([action]) 298 | val = self.value_net.forward(torch.FloatTensor(np.expand_dims(obs, 0))).detach().item() 299 | self.buffer.store(obs, action, reward, done, val) 300 | if self.count % self.capacity == 0: 301 | self.buffer.process() 302 | self.train() 303 | else: 304 | action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs))) 305 | next_obs, reward, done, _ = self.env.step(action) 306 | 307 | total_reward += reward 308 | obs = next_obs 309 | if done: 310 | if not self.weight_reward: 311 | self.weight_reward = total_reward 312 | else: 313 | self.weight_reward = self.weight_reward * 0.99 + total_reward * 0.01 314 | if self.log: 315 | self.writer.add_scalar('weight_reward', self.weight_reward, i + 1) 316 | self.writer.add_scalar('reward', total_reward, i + 1) 317 | print('episode: {} reward: {:.2f} weight_reward: {:.2f} train_step: {}'.format(i + 1, total_reward, self.weight_reward, self.train_count)) 318 | break 319 | 320 | 321 | if __name__ == '__main__': 322 | env = gym.make('Pendulum-v0') 323 | test = trpo(env=env, 324 | capacity=2000, 325 | gamma=0.99, 326 | learning_rate=1e-3, 327 | render=False, 328 | sample_size=64, 329 | episode=5000, 330 | lam=0.97, 331 | delta=1e-2, 332 | value_train_iter=80, 333 | policy_train_iter=10, 334 | method='trpo', 335 | backtrack_coeff=1., 336 | backtrack_alpha=0.5, 337 | training=True, 338 | log=False) 339 | test.run() --------------------------------------------------------------------------------
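For reference, below is a minimal, self-contained sketch of the return and advantage recursion implemented by gae_trajectory_buffer.process() and get() in TRPO/trpo_gae.py. It is illustrative only: the function name compute_gae and the 1e-8 normalization guard are additions for this sketch and do not appear in the repository.

import numpy as np

def compute_gae(rewards, dones, values, gamma=0.99, lam=0.97):
    # Reverse scan over a finished trajectory:
    #   R_t     = r_t + gamma * (1 - done_t) * R_{t+1}
    #   delta_t = r_t + gamma * (1 - done_t) * V(s_{t+1}) - V(s_t)
    #   A_t     = delta_t + gamma * lam * (1 - done_t) * A_{t+1}
    T = len(rewards)
    returns = np.zeros(T)
    advantages = np.zeros(T)
    running_return = 0.0
    running_adv = 0.0
    next_value = 0.0  # V(s_{t+1}); zero beyond the last stored step
    for t in reversed(range(T)):
        mask = 1.0 - dones[t]
        running_return = rewards[t] + gamma * mask * running_return
        delta = rewards[t] + gamma * mask * next_value - values[t]
        running_adv = delta + gamma * lam * mask * running_adv
        returns[t] = running_return
        advantages[t] = running_adv
        next_value = values[t]
    # normalize advantages as in gae_trajectory_buffer.get(); the 1e-8 guard is an addition here
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return returns, advantages

In the training code, the discounted returns serve as the regression target for value_net (the ret tensor), while the normalized advantages weight the surrogate policy loss used by the natural-gradient and TRPO updates.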