├── .vscode
    ├── .ropeproject
    │   ├── config.py
    │   └── objectdb
    └── settings.json
├── A2C
    └── advantage_actor_critic.py
├── A3C
    ├── SharedAdam.py
    ├── __pycache__
    │   ├── SharedAdam.cpython-37.pyc
    │   └── utils.cpython-37.pyc
    ├── a3c_cartpole.py
    └── utils.py
├── AC
    └── actor_critic.py
├── ACER
    └── acer_cartpole.py
├── DDPG
    └── ddpg.py
├── DSAC
    └── distributional_sac_discrete.py
├── ICM_PPO
    └── icm.py
├── PPO_CLIP
    ├── gae_ppo_cartpole.py
    ├── ppo_cartpole.py
    └── ppo_pendulum.py
├── REINFORCE
    └── reinforce.py
├── RND_PPO
    └── rnd.py
├── Readme.md
├── SAC
    ├── sac.py
    └── sac_discrete.py
├── TD3
    └── td3.py
└── TRPO
    └── trpo_gae.py


/.vscode/.ropeproject/config.py:
--------------------------------------------------------------------------------
  1 | # The default ``config.py``
  2 | # flake8: noqa
  3 | 
  4 | 
  5 | def set_prefs(prefs):
  6 |     """This function is called before opening the project"""
  7 | 
  8 |     # Specify which files and folders to ignore in the project.
  9 |     # Changes to ignored resources are not added to the history and
 10 |     # VCSs.  Also they are not returned in `Project.get_files()`.
 11 |     # Note that ``?`` and ``*`` match all characters but slashes.
 12 |     # '*.pyc': matches 'test.pyc' and 'pkg/test.pyc'
 13 |     # 'mod*.pyc': matches 'test/mod1.pyc' but not 'mod/1.pyc'
 14 |     # '.svn': matches 'pkg/.svn' and all of its children
 15 |     # 'build/*.o': matches 'build/lib.o' but not 'build/sub/lib.o'
 16 |     # 'build//*.o': matches 'build/lib.o' and 'build/sub/lib.o'
 17 |     prefs['ignored_resources'] = ['*.pyc', '*~', '.ropeproject',
 18 |                                   '.hg', '.svn', '_svn', '.git', '.tox']
 19 | 
 20 |     # Specifies which files should be considered python files.  It is
 21 |     # useful when you have scripts inside your project.  Only files
 22 |     # ending with ``.py`` are considered to be python files by
 23 |     # default.
 24 |     # prefs['python_files'] = ['*.py']
 25 | 
 26 |     # Custom source folders:  By default rope searches the project
 27 |     # for finding source folders (folders that should be searched
 28 |     # for finding modules).  You can add paths to that list.  Note
 29 |     # that rope guesses project source folders correctly most of the
 30 |     # time; use this if you have any problems.
 31 |     # The folders should be relative to project root and use '/' for
 32 |     # separating folders regardless of the platform rope is running on.
 33 |     # 'src/my_source_folder' for instance.
 34 |     # prefs.add('source_folders', 'src')
 35 | 
 36 |     # You can extend python path for looking up modules
 37 |     # prefs.add('python_path', '~/python/')
 38 | 
 39 |     # Should rope save object information or not.
 40 |     prefs['save_objectdb'] = True
 41 |     prefs['compress_objectdb'] = False
 42 | 
 43 |     # If `True`, rope analyzes each module when it is being saved.
 44 |     prefs['automatic_soa'] = True
 45 |     # The depth of calls to follow in static object analysis
 46 |     prefs['soa_followed_calls'] = 0
 47 | 
 48 |     # If `False` when running modules or unit tests "dynamic object
 49 |     # analysis" is turned off.  This makes them much faster.
 50 |     prefs['perform_doa'] = True
 51 | 
 52 |     # Rope can check the validity of its object DB when running.
 53 |     prefs['validate_objectdb'] = True
 54 | 
 55 |     # How many undos to hold?
 56 |     prefs['max_history_items'] = 32
 57 | 
 58 |     # Shows whether to save history across sessions.
 59 |     prefs['save_history'] = True
 60 |     prefs['compress_history'] = False
 61 | 
 62 |     # Set the number spaces used for indenting.  According to
 63 |     # :PEP:`8`, it is best to use 4 spaces.  Since most of rope's
 64 |     # unit-tests use 4 spaces it is more reliable, too.
 65 |     prefs['indent_size'] = 4
 66 | 
 67 |     # Builtin and c-extension modules that are allowed to be imported
 68 |     # and inspected by rope.
 69 |     prefs['extension_modules'] = []
 70 | 
 71 |     # Add all standard c-extensions to extension_modules list.
 72 |     prefs['import_dynload_stdmods'] = True
 73 | 
 74 |     # If `True` modules with syntax errors are considered to be empty.
 75 |     # The default value is `False`; When `False` syntax errors raise
 76 |     # `rope.base.exceptions.ModuleSyntaxError` exception.
 77 |     prefs['ignore_syntax_errors'] = False
 78 | 
 79 |     # If `True`, rope ignores unresolvable imports.  Otherwise, they
 80 |     # appear in the importing namespace.
 81 |     prefs['ignore_bad_imports'] = False
 82 | 
 83 |     # If `True`, rope will insert new module imports as
 84 |     # `from <package> import <module>` by default.
 85 |     prefs['prefer_module_from_imports'] = False
 86 | 
 87 |     # If `True`, rope will transform a comma list of imports into
 88 |     # multiple separate import statements when organizing
 89 |     # imports.
 90 |     prefs['split_imports'] = False
 91 | 
 92 |     # If `True`, rope will remove all top-level import statements and
 93 |     # reinsert them at the top of the module when making changes.
 94 |     prefs['pull_imports_to_top'] = True
 95 | 
 96 |     # If `True`, rope will sort imports alphabetically by module name instead
 97 |     # of alphabetically by import statement, with from imports after normal
 98 |     # imports.
 99 |     prefs['sort_imports_alphabetically'] = False
100 | 
101 |     # Location of implementation of
102 |     # rope.base.oi.type_hinting.interfaces.ITypeHintingFactory In general
103 |     # case, you don't have to change this value, unless you're an rope expert.
104 |     # Change this value to inject you own implementations of interfaces
105 |     # listed in module rope.base.oi.type_hinting.providers.interfaces
106 |     # For example, you can add you own providers for Django Models, or disable
107 |     # the search type-hinting in a class hierarchy, etc.
108 |     prefs['type_hinting_factory'] = (
109 |         'rope.base.oi.type_hinting.factory.default_type_hinting_factory')
110 | 
111 | 
112 | def project_opened(project):
113 |     """This function is called after opening the project"""
114 |     # Do whatever you like here!
115 | 


--------------------------------------------------------------------------------
/.vscode/.ropeproject/objectdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deligentfool/policy_based_RL/d3332680fc8f09e864b113616faa8954705c6aea/.vscode/.ropeproject/objectdb


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |     "python.pythonPath": "E:\\Anaconda3\\python.exe"
3 | }


--------------------------------------------------------------------------------
/A2C/advantage_actor_critic.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | import random
  5 | import numpy as np
  6 | import gym
  7 | from torch.distributions import Categorical
  8 | from torch.utils.tensorboard import SummaryWriter
  9 | 
 10 | 
 11 | class policy_net(nn.Module):
 12 |     def __init__(self, input_dim, output_dim):
 13 |         super(policy_net, self).__init__()
 14 |         self.input_dim = input_dim
 15 |         self.output_dim = output_dim
 16 | 
 17 |         self.fc1 = nn.Linear(self.input_dim, 128)
 18 |         self.fc2 = nn.Linear(128, 128)
 19 |         self.fc3 = nn.Linear(128, self.output_dim)
 20 | 
 21 |         self.rewards = []
 22 |         self.log_probs = []
 23 | 
 24 |     def forward(self, input):
 25 |         x = self.fc1(input)
 26 |         x = F.relu(x)
 27 |         x = self.fc2(x)
 28 |         x = F.relu(x)
 29 |         x = self.fc3(x)
 30 |         return F.softmax(x, 1)
 31 | 
 32 |     def act(self, input):
 33 |         probs = self.forward(input)
 34 |         dist = Categorical(probs=probs)
 35 |         action = dist.sample()
 36 |         log_prob = dist.log_prob(action)
 37 |         self.log_probs.append(log_prob)
 38 |         return action[0].item()
 39 | 
 40 | 
 41 | class value_net(nn.Module):
 42 |     def __init__(self, input_dim, output_dim):
 43 |         super(value_net, self).__init__()
 44 |         self.input_dim = input_dim
 45 |         self.output_dim = output_dim
 46 | 
 47 |         self.fc1 = nn.Linear(self.input_dim, 128)
 48 |         self.fc2 = nn.Linear(128, 128)
 49 |         self.fc3 = nn.Linear(128, self.output_dim)
 50 | 
 51 |     def forward(self, input):
 52 |         x = self.fc1(input)
 53 |         x = F.relu(x)
 54 |         x = self.fc2(x)
 55 |         x = F.relu(x)
 56 |         x = self.fc3(x)
 57 |         return x
 58 | 
 59 | 
 60 | class advantage_actor_critic(object):
 61 |     def __init__(self, env, gamma, learning_rate, episode, render):
 62 |         self.env = env
 63 |         self.observation_dim = self.env.observation_space.shape[0]
 64 |         self.action_dim = self.env.action_space.n
 65 |         self.gamma = gamma
 66 |         self.learning_rate = learning_rate
 67 |         self.episode = episode
 68 |         self.render = render
 69 |         self.policy_net = policy_net(self.observation_dim, self.action_dim)
 70 |         self.value_net = value_net(self.observation_dim, 1)
 71 |         self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
 72 |         self.value_optimizer = torch.optim.Adam(self.value_net.parameters(), lr=self.learning_rate)
 73 |         self.total_returns = []
 74 |         self.values_buffer = []
 75 |         self.writer = SummaryWriter('runs/a2c')
 76 |         self.weight_reward = None
 77 |         self.count = 0
 78 | 
 79 |     def train(self, ):
 80 |         total_returns = torch.FloatTensor(self.total_returns).unsqueeze(1).detach()
 81 |         values = torch.cat(self.values_buffer, 0)
 82 |         delta = (total_returns - values).squeeze(1)
 83 |         log_probs = torch.cat(self.policy_net.log_probs, 0)
 84 | 
 85 |         policy_loss = (- log_probs * delta.detach())
 86 |         policy_loss = policy_loss.sum()
 87 |         self.writer.add_scalar('policy_loss', policy_loss, self.count)
 88 |         self.policy_optimizer.zero_grad()
 89 |         policy_loss.backward(retain_graph=True)
 90 |         torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 0.1)
 91 |         self.policy_optimizer.step()
 92 | 
 93 |         value_loss = delta.pow(2).sum()
 94 |         self.writer.add_scalar('value_loss', value_loss, self.count)
 95 |         self.value_optimizer.zero_grad()
 96 |         value_loss.backward(retain_graph=True)
 97 |         torch.nn.utils.clip_grad_norm_(self.value_net.parameters(), 0.1)
 98 |         self.value_optimizer.step()
 99 | 
100 |     def run(self, ):
101 |         for i in range(self.episode):
102 |             obs = self.env.reset()
103 |             total_reward = 0
104 |             if self.render:
105 |                 self.env.render()
106 |             while True:
107 |                 self.values_buffer.append(self.value_net.forward(torch.FloatTensor(np.expand_dims(obs, 0))))
108 |                 action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0)))
109 |                 next_obs, reward, done, info = self.env.step(action)
110 |                 self.policy_net.rewards.append(reward)
111 |                 self.count += 1
112 |                 total_reward += reward
113 |                 if self.render:
114 |                     self.env.render()
115 |                 obs = next_obs
116 |                 if done:
117 |                     R = 0
118 |                     if self.weight_reward:
119 |                         self.weight_reward = 0.99 * self.weight_reward + 0.01 * total_reward
120 |                     else:
121 |                         self.weight_reward = total_reward
122 |                     for r in reversed(self.policy_net.rewards):
123 |                         R = R * self.gamma + r
124 |                         self.total_returns.append(R)
125 |                     self.total_returns = list(reversed(self.total_returns))
126 |                     self.train()
127 |                     del self.policy_net.rewards[:]
128 |                     del self.policy_net.log_probs[:]
129 |                     del self.total_returns[:]
130 |                     del self.values_buffer[:]
131 |                     print('episode: {}  reward: {:.1f}  weight_reward: {:.2f}'.format(i+1, total_reward, self.weight_reward))
132 |                     break
133 | 
134 | 
135 | if __name__ == '__main__':
136 |     env = gym.make('CartPole-v0')
137 |     env = env.unwrapped
138 |     test = advantage_actor_critic(env, gamma=0.99, learning_rate=1e-3, episode=100000, render=False)
139 |     test.run()


--------------------------------------------------------------------------------
/A3C/SharedAdam.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | 
 4 | class SharedAdam(torch.optim.Adam):
 5 |     def __init__(self,
 6 |                  params,
 7 |                  lr=1e-3,
 8 |                  betas=(0.9, 0.9),
 9 |                  eps=1e-8,
10 |                  weight_decay=0):
11 |         super(SharedAdam, self).__init__(params,
12 |                                          lr=lr,
13 |                                          betas=betas,
14 |                                          eps=eps,
15 |                                          weight_decay=weight_decay)
16 |         for group in self.param_groups:
17 |             for p in group['params']:
18 |                 state = self.state[p]
19 |                 state['step'] = 0
20 |                 state['exp_avg'] = torch.zeros_like(p.data)
21 |                 state['exp_avg_sq'] = torch.zeros_like(p.data)
22 | 
23 |                 state['exp_avg'].share_memory_()
24 |                 state['exp_avg_sq'].share_memory_()
25 | 


--------------------------------------------------------------------------------
/A3C/__pycache__/SharedAdam.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deligentfool/policy_based_RL/d3332680fc8f09e864b113616faa8954705c6aea/A3C/__pycache__/SharedAdam.cpython-37.pyc


--------------------------------------------------------------------------------
/A3C/__pycache__/utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deligentfool/policy_based_RL/d3332680fc8f09e864b113616faa8954705c6aea/A3C/__pycache__/utils.cpython-37.pyc


--------------------------------------------------------------------------------
/A3C/a3c_cartpole.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | import numpy as np
  5 | import gym
  6 | import torch.multiprocessing as mp
  7 | import os
  8 | from utils import pull_and_push, record, set_init
  9 | from SharedAdam import SharedAdam
 10 | os.environ["OMP_NUM_THREADS"] = "1"
 11 | 
 12 | 
 13 | class Net(nn.Module):
 14 |     def __init__(self, action_d, observation_d):
 15 |         super(Net, self).__init__()
 16 |         self.action_d = action_d
 17 |         self.observation_d = observation_d
 18 |         self.policy_layer_1 = nn.Linear(self.observation_d, 256)
 19 |         self.policy_layer_2 = nn.Linear(256, self.action_d)
 20 |         self.value_layer_1 = nn.Linear(self.observation_d, 256)
 21 |         self.value_layer_2 = nn.Linear(256, 1)
 22 |         set_init([
 23 |             self.policy_layer_1, self.policy_layer_2, self.value_layer_1,
 24 |             self.value_layer_2
 25 |         ])
 26 |         self.distribution = torch.distributions.Categorical
 27 | 
 28 |     def forward(self, x):
 29 |         pl_1 = F.relu6(self.policy_layer_1(x))
 30 |         policy = F.softmax(self.policy_layer_2(pl_1), dim=1)
 31 |         vl_1 = F.relu6(self.value_layer_1(x))
 32 |         value = self.value_layer_2(vl_1)
 33 |         return policy, value
 34 | 
 35 |     def choose_action(self, s):
 36 |         self.eval()
 37 |         prob, _ = self.forward(s)
 38 |         prob = prob.data
 39 |         m = self.distribution(prob)
 40 |         return m.sample().numpy()[0]
 41 | 
 42 |     def loss_func(self, s, a, value_target):
 43 |         self.train()
 44 |         prob, value = self.forward(s)
 45 |         td_error = value_target - value
 46 |         critic_loss = td_error.pow(2)
 47 | 
 48 |         m = self.distribution(prob)
 49 |         log_pob = m.log_prob(a)
 50 |         exp_v = log_pob * td_error.detach().squeeze()
 51 |         actor_loss = -exp_v
 52 |         loss = (critic_loss + actor_loss).mean()
 53 |         return loss
 54 | 
 55 | 
 56 | class Worker(mp.Process):
 57 |     def __init__(self, global_net, optimizer, global_episode_counter,
 58 |                  global_reward, res_queue, name, max_episode,
 59 |                  update_global_iteration, gamma):
 60 |         super(Worker, self).__init__()
 61 |         self.name = 'w' + name
 62 |         self.global_episode_counter = global_episode_counter
 63 |         self.global_reward = global_reward
 64 |         self.res_queue = res_queue
 65 |         self.global_net = global_net
 66 |         self.optimizer = optimizer
 67 |         self.max_episode = max_episode
 68 |         self.update_global_iteration = update_global_iteration
 69 |         self.gamma = gamma
 70 |         self.env = gym.make('CartPole-v0')
 71 |         self.env = self.env.unwrapped
 72 |         self.action_d = env.action_space.n
 73 |         self.observation_d = env.observation_space.shape[0]
 74 |         self.local_net = Net(self.action_d, self.observation_d)
 75 | 
 76 |     def run(self):
 77 |         total_step = 1
 78 |         while self.global_episode_counter.value < self.max_episode:
 79 |             s = self.env.reset()
 80 |             buffer_s, buffer_a, buffer_r = [], [], []
 81 |             episode_reward = 0
 82 |             while True:
 83 |                 if self.name == 'w0':
 84 |                     self.env.render()
 85 |                 a = self.local_net.choose_action(
 86 |                     torch.Tensor(s).view(-1, self.observation_d))
 87 |                 s_, r, done, _ = self.env.step(a)
 88 |                 if done:
 89 |                     r = -1
 90 |                 episode_reward += r
 91 |                 buffer_a.append(a)
 92 |                 buffer_r.append(r)
 93 |                 buffer_s.append(s)
 94 |                 if total_step % self.update_global_iteration == 0 or done:
 95 |                     # sync
 96 |                     pull_and_push(self.optimizer, self.local_net,
 97 |                                   self.global_net, done, s_, buffer_s,
 98 |                                   buffer_a, buffer_r, self.gamma)
 99 |                     buffer_s, buffer_a, buffer_r = [], [], []
100 |                     if done:
101 |                         # record
102 |                         record(self.global_episode_counter, self.global_reward,
103 |                                episode_reward, self.res_queue, self.name)
104 |                         break
105 |                 s = s_
106 |                 total_step += 1
107 |         self.res_queue.put(None)
108 | 
109 | 
110 | if __name__ == '__main__':
111 |     env = gym.make('CartPole-v0')
112 |     action_d = env.action_space.n
113 |     observation_d = env.observation_space.shape[0]
114 |     global_net = Net(action_d, observation_d)
115 |     optimizer = SharedAdam(global_net.parameters(), lr=0.0001)
116 |     global_episode_counter, global_reward, res_queue = mp.Value(
117 |         'i', 0), mp.Value('d', 0.), mp.Queue()
118 |     workers = [
119 |         Worker(global_net,
120 |                optimizer, global_episode_counter, global_reward, res_queue,
121 |                str(i), 10000, 10, 0.9) for i in range(mp.cpu_count())
122 |     ]
123 |     [worker.start() for worker in workers]
124 |     res = []
125 |     while True:
126 |         r = res_queue.get()
127 |         if r is not None:
128 |             res.append(r)
129 |         else:
130 |             break
131 |     [worker.join() for worker in workers]
132 | 
133 | 
134 |     import matplotlib.pyplot as plt
135 |     plt.plot(res)
136 |     plt.ylabel('Moving average ep reward')
137 |     plt.xlabel('Step')
138 |     plt.show()


--------------------------------------------------------------------------------
/A3C/utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import numpy as np
 3 | import torch.nn as nn
 4 | 
 5 | 
 6 | def pull_and_push(opt, local_net, global_net, done, s_, b_s, b_a, b_r, gamma):
 7 |     if done:
 8 |         v_s_ = 0
 9 |     else:
10 |         _, v_s_ = local_net.forward(torch.Tensor([s_]))
11 |         v_s_ = v_s_.data.numpy()[0, 0]
12 |     buffer_v_target = []
13 |     for r in reversed(b_r):
14 |         v_s_ = r + gamma * v_s_
15 |         buffer_v_target.append(v_s_)
16 |     buffer_v_target.reverse()
17 | 
18 |     loss = local_net.loss_func(torch.Tensor(b_s),
19 |                                torch.Tensor(b_a).view(-1, 1),
20 |                                torch.Tensor(buffer_v_target).view(-1, 1))
21 |     opt.zero_grad()
22 |     loss.backward()
23 |     for l_p, g_p in zip(local_net.parameters(), global_net.parameters()):
24 |         g_p._grad = l_p.grad
25 |     opt.step()
26 |     local_net.load_state_dict(global_net.state_dict())
27 | 
28 | 
29 | def record(global_ep, global_ep_r, ep_r, res_queue, name):
30 |     with global_ep.get_lock():
31 |         global_ep.value += 1
32 |     with global_ep_r.get_lock():
33 |         if global_ep_r.value == 0.:
34 |             global_ep_r.value = ep_r
35 |         else:
36 |             global_ep_r.value = global_ep_r.value * 0.99 + ep_r * 0.01
37 |     res_queue.put(global_ep_r.value)
38 |     print(
39 |         name,
40 |         "Ep:",
41 |         global_ep.value,
42 |         "| Ep_r: %.0f" % global_ep_r.value,
43 |     )
44 | 
45 | 
46 | def set_init(layers):
47 |     for layer in layers:
48 |         nn.init.normal_(layer.weight, mean=0., std=0.1)
49 |         nn.init.constant_(layer.bias, 0.)


--------------------------------------------------------------------------------
/AC/actor_critic.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | from torch.distributions import Categorical
  5 | import random
  6 | import numpy as np
  7 | import gym
  8 | from torch.utils.tensorboard import SummaryWriter
  9 | 
 10 | 
 11 | class policy_net(nn.Module):
 12 |     def __init__(self, input_dim, output_dim):
 13 |         super(policy_net, self).__init__()
 14 |         self.input_dim = input_dim
 15 |         self.output_dim = output_dim
 16 | 
 17 |         self.fc1 = nn.Linear(self.input_dim, 128)
 18 |         self.fc2 = nn.Linear(128, 128)
 19 |         self.fc3 = nn.Linear(128, self.output_dim)
 20 | 
 21 |         self.log_probs = []
 22 |         self.rewards = []
 23 | 
 24 |     def forward(self, input):
 25 |         x = F.relu(self.fc1(input))
 26 |         x = F.relu(self.fc2(x))
 27 |         x = self.fc3(x)
 28 |         return F.softmax(x, 1)
 29 | 
 30 |     def act(self, input):
 31 |         prob = self.forward(input)
 32 |         dist = Categorical(prob)
 33 |         action = dist.sample()
 34 |         log_prob = dist.log_prob(action)
 35 |         self.log_probs.append(log_prob)
 36 |         return action.detach().item()
 37 | 
 38 | 
 39 | class q_value_net(nn.Module):
 40 |     # * different with A2C, this is a q value network that the input is observation and action
 41 |     def __init__(self, input1_dim, input2_dim, output_dim):
 42 |         super(q_value_net, self).__init__()
 43 |         self.input1_dim = input1_dim
 44 |         self.input2_dim = input2_dim
 45 |         self.output_dim = output_dim
 46 | 
 47 |         self.fc1 = nn.Linear(self.input1_dim + self.input2_dim, 128)
 48 |         self.fc2 = nn.Linear(128, 128)
 49 |         self.fc3 = nn.Linear(128, self.output_dim)
 50 | 
 51 |     def forward(self, input1, input2):
 52 |         x = torch.cat([input1, input2], 1)
 53 |         x = self.fc1(x)
 54 |         x = F.relu(x)
 55 |         x = self.fc2(x)
 56 |         x = F.relu(x)
 57 |         x = self.fc3(x)
 58 |         return x
 59 | 
 60 | 
 61 | class actor_critic(object):
 62 |     def __init__(self, env, learning_rate, episode, render):
 63 |         self.env = env
 64 |         self.observation_dim = self.env.observation_space.shape[0]
 65 |         self.action_dim = self.env.action_space.n
 66 |         self.learning_rate = learning_rate
 67 |         self.episode = episode
 68 |         self.render = render
 69 |         self.policy_net = policy_net(self.observation_dim, self.action_dim)
 70 |         self.q_value_net = q_value_net(self.observation_dim, self.action_dim, 1)
 71 |         self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
 72 |         self.value_optimizer = torch.optim.Adam(self.q_value_net.parameters(), lr=self.learning_rate)
 73 |         self.values_buffer = []
 74 |         self.next_observation_buffer = []
 75 |         self.writer = SummaryWriter('runs/actor_critic')
 76 |         self.weight_reward = None
 77 |         self.count = 0
 78 | 
 79 |     def train(self, ):
 80 |         values = torch.cat(self.values_buffer, 0)
 81 |         log_probs = torch.cat(self.policy_net.log_probs, 0).unsqueeze(1)
 82 |         rewards = torch.FloatTensor(self.policy_net.rewards).unsqueeze(1)
 83 |         next_observation = torch.FloatTensor(self.next_observation_buffer)
 84 | 
 85 |         policy_loss = (- log_probs * values.detach())
 86 |         policy_loss = policy_loss.sum()
 87 |         self.writer.add_scalar('policy_loss', policy_loss, self.count)
 88 |         self.policy_optimizer.zero_grad()
 89 |         policy_loss.backward(retain_graph=True)
 90 |         torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 0.1)
 91 |         self.policy_optimizer.step()
 92 | 
 93 |         # * find the max value in all actions
 94 |         q_stack = None
 95 |         for action in range(self.action_dim):
 96 |             action = self.one_hot(action)
 97 |             action = torch.FloatTensor(action)
 98 |             action = action.expand(values.size(0), 2)
 99 |             tmp = self.q_value_net.forward(next_observation, action)
100 |             if q_stack is None:
101 |                 q_stack = tmp
102 |             else:
103 |                 q_stack = torch.cat([q_stack, tmp], 1)
104 |         q_max = q_stack.max(1)[0].unsqueeze(1)
105 |         value_loss = (rewards + q_max - values).pow(2).sum()
106 |         self.writer.add_scalar('value_loss', value_loss, self.count)
107 |         self.value_optimizer.zero_grad()
108 |         value_loss.backward(retain_graph=True)
109 |         torch.nn.utils.clip_grad_norm_(self.q_value_net.parameters(), 0.1)
110 |         self.value_optimizer.step()
111 | 
112 |     def one_hot(self, action):
113 |         one_hot_action = np.zeros(self.action_dim)
114 |         one_hot_action[action] = 1
115 |         return one_hot_action
116 | 
117 |     def run(self, ):
118 |         for i in range(self.episode):
119 |             obs = self.env.reset()
120 |             total_reward = 0
121 |             if self.render:
122 |                 self.env.render()
123 |             while True:
124 |                 action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0)))
125 |                 self.values_buffer.append(self.q_value_net.forward(torch.FloatTensor(np.expand_dims(obs, 0)), torch.FloatTensor(np.expand_dims(self.one_hot(action), 0))))
126 |                 next_obs, reward, done, info = self.env.step(action)
127 |                 self.policy_net.rewards.append(reward)
128 |                 self.next_observation_buffer.append(next_obs)
129 |                 self.count += 1
130 |                 total_reward += reward
131 |                 if self.render:
132 |                     self.env.render()
133 |                 obs = next_obs
134 |                 if done:
135 |                     if self.weight_reward:
136 |                         self.weight_reward = 0.99 * self.weight_reward + 0.01 * total_reward
137 |                     else:
138 |                         self.weight_reward = total_reward
139 |                     self.train()
140 |                     del self.policy_net.rewards[:]
141 |                     del self.policy_net.log_probs[:]
142 |                     del self.values_buffer[:]
143 |                     del self.next_observation_buffer[:]
144 |                     print('episode: {}  reward: {:.1f}  weight_reward: {:.2f}'.format(i+1, total_reward, self.weight_reward))
145 |                     break
146 | 
147 | 
148 | if __name__ == '__main__':
149 |     env = gym.make('CartPole-v0')
150 |     env = env.unwrapped
151 |     test = actor_critic(env, learning_rate=1e-3, episode=100000, render=False)
152 |     test.run()


--------------------------------------------------------------------------------
/ACER/acer_cartpole.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | import gym
  5 | import numpy as np
  6 | import random
  7 | from collections import deque
  8 | from torch.utils.tensorboard import SummaryWriter
  9 | 
 10 | 
 11 | class replay_buffer(object):
 12 |     # * a different implement of replay buffer
 13 |     def __init__(self, capacity):
 14 |         self.capacity = capacity
 15 |         self.memory = deque(maxlen=self.capacity)
 16 |         self.memory.append([])
 17 | 
 18 |     def store(self, observation, action, reward, policy, done):
 19 |         observation = np.expand_dims(observation, 0)
 20 |         self.memory[-1].append([observation, action, reward, policy, done])
 21 | 
 22 |     def sample(self, batch_size=None):
 23 |         if not batch_size:
 24 |             batch = self.memory[-1]
 25 |         else:
 26 |             batch_list = random.sample(list(self.memory)[: -1], batch_size)
 27 |             batch = []
 28 |             for i in batch_list:
 29 |                 batch.extend(i)
 30 | 
 31 |         observation, action, reward, policy, done = zip(* batch)
 32 |         return np.concatenate(observation, 0), action, reward, np.concatenate(policy, 0), done
 33 | 
 34 |     def create(self):
 35 |         self.memory.append([])
 36 | 
 37 |     def __len__(self):
 38 |         return len(self.memory)
 39 | 
 40 | 
 41 | class policy_value_net(nn.Module):
 42 |     # * a network for the discrete case
 43 |     def __init__(self, observation_dim, action_dim):
 44 |         super(policy_value_net, self).__init__()
 45 |         self.observation_dim = observation_dim
 46 |         self.action_dim = action_dim
 47 | 
 48 |         self.policy_fc1 = nn.Linear(self.observation_dim, 128)
 49 |         self.policy_fc2 = nn.Linear(128, 128)
 50 |         self.policy_fc3 = nn.Linear(128, self.action_dim)
 51 | 
 52 |         self.value_fc1 = nn.Linear(self.observation_dim, 128)
 53 |         self.value_fc2 = nn.Linear(128, 128)
 54 |         self.value_fc3 = nn.Linear(128, self.action_dim)
 55 | 
 56 |     def forward(self, observation):
 57 |         policy_x = F.tanh(self.policy_fc1(observation))
 58 |         policy_x = F.tanh(self.policy_fc2(policy_x))
 59 |         policy_x = self.policy_fc3(policy_x)
 60 |         policy = F.softmax(policy_x, 1).clamp(max=1-1e-20)
 61 | 
 62 |         q_value_x = F.tanh(self.value_fc1(observation))
 63 |         q_value_x = F.tanh(self.value_fc2(q_value_x))
 64 |         q_value = self.value_fc3(q_value_x)
 65 | 
 66 |         value = (policy * q_value).sum(1, keepdim=True)
 67 |         return policy, q_value, value
 68 | 
 69 | 
 70 | class acer(object):
 71 |     # * without trust region policy optimization
 72 |     def __init__(self, env, episode, capacity, learning_rate, exploration, c, gamma, batch_size, entropy_weight, replay_ratio, render, log):
 73 |         self.env = env
 74 |         self.episode = episode
 75 |         self.capacity = capacity
 76 |         self.learning_rate = learning_rate
 77 |         self.exploration = exploration
 78 |         self.c = c
 79 |         self.gamma = gamma
 80 |         self.batch_size = batch_size
 81 |         self.entropy_weight = entropy_weight
 82 |         self.replay_ratio = replay_ratio
 83 |         self.render = render
 84 |         self.log = log
 85 | 
 86 |         self.observation_dim = self.env.observation_space.shape[0]
 87 |         self.action_dim = self.env.action_space.n
 88 |         self.net = policy_value_net(self.observation_dim, self.action_dim)
 89 |         self.buffer = replay_buffer(self.capacity)
 90 |         self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate)
 91 |         self.weight_reward = None
 92 |         self.writer = SummaryWriter('runs/acer_cartpole')
 93 |         self.train_count = 0
 94 | 
 95 |     def compute_loss(self, policies, q_values, values, actions, rewards, retrace, dones, behavior_policies):
 96 |         loss = 0
 97 |         for i in reversed(range(policies.size(0))):
 98 |             rho = (policies[i] / behavior_policies[i]).detach()
 99 | 
100 |             retrace = rewards[i] + self.gamma * retrace * (1. - dones[i])
101 |             advantage = retrace - values[i].squeeze()
102 | 
103 |             log_policy_action = policies[i].gather(0, actions[i]).log()
104 |             rho_action = rho.gather(0, actions[i])
105 |             actor_loss = -torch.clamp(rho_action, max=self.c).detach() * log_policy_action * advantage.detach()
106 |             rho_correction = torch.clamp(1 - self.c / rho, min=0.).detach()
107 |             actor_loss -= (rho_correction * policies[i].log() * (q_values[i] - values[i]).detach()).sum()
108 | 
109 |             entropy = self.entropy_weight * -(policies[i] * policies[i].log()).sum()
110 |             critic_loss = (retrace - q_values[i].gather(0, actions[i])).pow(2).sum()
111 | 
112 |             loss += (critic_loss + actor_loss - entropy)
113 | 
114 |             retrace = torch.clamp(rho_action, max=self.c).detach() * (retrace - q_values[i].gather(0, actions[i])) + values[i]
115 |             retrace = retrace.squeeze().detach()
116 | 
117 |         self.optimizer.zero_grad()
118 |         loss.backward()
119 |         self.optimizer.step()
120 |         return loss.item()
121 | 
122 |     def on_policy_train(self, next_observation):
123 |         observations, actions, rewards, behavior_policies, dones = self.buffer.sample()
124 | 
125 |         observations = torch.FloatTensor(observations)
126 |         actions = torch.LongTensor(actions)
127 |         rewards = torch.FloatTensor(rewards)
128 |         behavior_policies = torch.FloatTensor(behavior_policies)
129 |         dones = torch.FloatTensor(dones)
130 | 
131 |         policies, q_values, values = self.net.forward(observations)
132 | 
133 |         _, _, retrace = self.net.forward(torch.FloatTensor(np.expand_dims(next_observation, 0)))
134 |         retrace = retrace.squeeze().detach()
135 |         loss = self.compute_loss(policies, q_values, values, actions, rewards, retrace, dones, behavior_policies)
136 |         if self.log:
137 |             self.writer.add_scalar('on_policy_loss', loss, self.train_count)
138 | 
139 |     def off_policy_train(self):
140 |         loss_list = []
141 |         for _ in range(np.random.poisson(self.replay_ratio)):
142 |             observations, actions, rewards, behavior_policies, dones = self.buffer.sample(self.batch_size)
143 | 
144 |             observations = torch.FloatTensor(observations)
145 |             actions = torch.LongTensor(actions)
146 |             rewards = torch.FloatTensor(rewards)
147 |             behavior_policies = torch.FloatTensor(behavior_policies)
148 |             dones = torch.FloatTensor(dones)
149 | 
150 |             policies, q_values, values = self.net.forward(observations)
151 | 
152 |             _, _, retrace = self.net.forward(observations[-1].unsqueeze(0))
153 |             retrace = retrace.squeeze().detach()
154 |             loss = self.compute_loss(policies, q_values, values, actions, rewards, retrace, dones, behavior_policies)
155 |             loss_list.append(loss)
156 |         if self.log:
157 |             self.writer.add_scalar('off_policy_loss', np.mean(loss_list), self.train_count)
158 | 
159 |     def run(self):
160 |         for i in range(self.episode):
161 |             obs = self.env.reset()
162 |             total_reward = 0
163 |             if self.render:
164 |                 self.env.render()
165 |             while True:
166 |                 policy, _, _ = self.net.forward(torch.FloatTensor(np.expand_dims(obs, 0)))
167 |                 action = policy.multinomial(1).item()
168 |                 next_obs, reward, done, info = self.env.step(action)
169 |                 total_reward += reward
170 |                 if self.render:
171 |                     self.env.render()
172 |                 policy = policy.detach().numpy()
173 |                 self.buffer.store(obs, action, reward / 10., policy, done)
174 |                 obs = next_obs
175 | 
176 |                 if done:
177 |                     if len(self.buffer) > self.exploration:
178 |                         self.on_policy_train(next_obs)
179 |                         self.off_policy_train()
180 |                         self.train_count += 1
181 |                     if not self.weight_reward:
182 |                         self.weight_reward = total_reward
183 |                     else:
184 |                         self.weight_reward = 0.99 * self.weight_reward + 0.01 * total_reward
185 |                     self.buffer.create()
186 |                     if self.log:
187 |                         self.writer.add_scalar('reward', total_reward, i + 1)
188 |                         self.writer.add_scalar('weight_reward', self.weight_reward, i + 1)
189 |                     print('episode: {}  reward: {}  weight_reward: {:.2f}'.format(i + 1, total_reward, self.weight_reward))
190 |                     break
191 | 
192 | 
193 | if __name__ == '__main__':
194 |     env = gym.make('CartPole-v0')
195 |     env = env.unwrapped
196 |     test = acer(env=env,
197 |                 episode=10000,
198 |                 capacity=10000,
199 |                 learning_rate=1e-3,
200 |                 exploration=1000,
201 |                 c=1.,
202 |                 gamma=0.99,
203 |                 batch_size=16,
204 |                 entropy_weight=1e-4,
205 |                 replay_ratio=2,
206 |                 render=False,
207 |                 log=False)
208 |     test.run()


--------------------------------------------------------------------------------
/DDPG/ddpg.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | from torch.distributions import Normal
  5 | import random
  6 | import numpy as np
  7 | from collections import deque
  8 | import gym
  9 | import math
 10 | from torch.utils.tensorboard import SummaryWriter
 11 | 
 12 | class replay_buffer(object):
 13 |     def __init__(self, capacity):
 14 |         self.capacity = capacity
 15 |         self.memory = deque(maxlen=self.capacity)
 16 | 
 17 |     def store(self, observation, action, reward, next_observation, done):
 18 |         observation = np.expand_dims(observation, 0)
 19 |         next_observation = np.expand_dims(next_observation, 0)
 20 |         self.memory.append([observation, action, reward, next_observation, done])
 21 | 
 22 |     def sample(self, batch_size):
 23 |         batch = random.sample(self.memory, batch_size)
 24 |         observation, action, reward, next_observation, done = zip(* batch)
 25 |         return np.concatenate(observation, 0), action, reward, np.concatenate(next_observation, 0), done
 26 | 
 27 |     def __len__(self):
 28 |         return len(self.memory)
 29 | 
 30 | 
 31 | class policy_net(nn.Module):
 32 |     # * deterministic actor network, output a deterministic value as the selected action
 33 |     def __init__(self, input_dim, output_dim):
 34 |         super(policy_net, self).__init__()
 35 |         self.input_dim = input_dim
 36 |         self.output_dim = output_dim
 37 | 
 38 |         self.fc1 = nn.Linear(self.input_dim, 128)
 39 |         self.fc2 = nn.Linear(128, 128)
 40 |         self.fc3 = nn.Linear(128, self.output_dim)
 41 | 
 42 |     def forward(self, input):
 43 |         x = F.relu(self.fc1(input))
 44 |         x = F.relu(self.fc2(x))
 45 |         x = self.fc3(x)
 46 |         return x
 47 | 
 48 |     def act(self, input):
 49 |         action = self.forward(input).detach().item()
 50 |         return action
 51 | 
 52 | 
 53 | 
 54 | class value_net(nn.Module):
 55 |     def __init__(self, input1_dim, input2_dim, output_dim):
 56 |         super(value_net, self).__init__()
 57 |         self.input1_dim = input1_dim
 58 |         self.input2_dim = input2_dim
 59 |         self.output_dim = output_dim
 60 | 
 61 |         self.fc1 = nn.Linear(self.input1_dim + self.input2_dim, 128)
 62 |         self.fc2 = nn.Linear(128, 128)
 63 |         self.fc3 = nn.Linear(128, self.output_dim)
 64 | 
 65 |     def forward(self, input1, input2):
 66 |         # * concatentate the observation and action as the input
 67 |         x = torch.cat([input1, input2], 1)
 68 |         x = F.relu(self.fc1(x))
 69 |         x = F.relu(self.fc2(x))
 70 |         x = self.fc3(x)
 71 |         return x
 72 | 
 73 | 
 74 | class ddpg(object):
 75 |     def __init__(self, env, episode, learning_rate, gamma, capacity, batch_size, value_iter, policy_iter, epsilon_init, decay, epsilon_min, rho, max_a, min_a, render, log):
 76 |         self.env = env
 77 |         self.episode = episode
 78 |         self.learning_rate = learning_rate
 79 |         self.gamma = gamma
 80 |         self.capacity = capacity
 81 |         self.batch_size = batch_size
 82 |         self.value_iter = value_iter
 83 |         self.policy_iter = policy_iter
 84 |         self.epsilon_init = epsilon_init
 85 |         self.decay = decay
 86 |         self.epsilon_min = epsilon_min
 87 |         self.rho = rho
 88 |         self.max_a = max_a
 89 |         self.min_a = min_a
 90 |         self.render = render
 91 |         self.log = log
 92 | 
 93 |         self.observation_dim = self.env.observation_space.shape[0]
 94 |         self.action_dim = self.env.action_space.shape[0]
 95 |         self.policy_net = policy_net(self.observation_dim, self.action_dim)
 96 |         self.target_policy_net = policy_net(self.observation_dim, self.action_dim)
 97 |         self.value_net = value_net(self.observation_dim, self.action_dim, 1)
 98 |         self.target_value_net = value_net(self.observation_dim, self.action_dim, 1)
 99 |         self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
100 |         self.value_optimizer = torch.optim.Adam(self.value_net.parameters(), lr=self.learning_rate)
101 |         self.target_policy_net.load_state_dict(self.policy_net.state_dict())
102 |         self.target_value_net.load_state_dict(self.value_net.state_dict())
103 |         self.buffer = replay_buffer(self.capacity)
104 |         self.writer = SummaryWriter('runs/ddpg')
105 |         self.count = 0
106 |         self.train_count = 0
107 |         self.weight_reward = 0
108 |         self.epsilon = lambda x: self.epsilon_min + (self.epsilon_init - self.epsilon_min) * math.exp(- x / self.decay)
109 | 
110 |     def soft_update(self):
111 |         for param, target_param in zip(self.value_net.parameters(), self.target_value_net.parameters()):
112 |             target_param.detach().copy_(self.rho * target_param.detach() + (1. - self.rho) * param.detach())
113 |         for param, target_param in zip(self.policy_net.parameters(), self.target_policy_net.parameters()):
114 |             target_param.detach().copy_(self.rho * target_param.detach() + (1. - self.rho) * param.detach())
115 | 
116 |     def train(self):
117 |         observation, action, reward, next_observation, done = self.buffer.sample(self.batch_size)
118 | 
119 |         observation = torch.FloatTensor(observation)
120 |         action = torch.FloatTensor(action).unsqueeze(1)
121 |         reward = torch.FloatTensor(reward).unsqueeze(1)
122 |         next_observation = torch.FloatTensor(next_observation)
123 |         done = torch.FloatTensor(done).unsqueeze(1)
124 | 
125 |         value_loss_buffer = []
126 |         for _ in range(self.value_iter):
127 |             target_next_action = self.target_policy_net.forward(next_observation)
128 |             target_next_value = self.target_value_net.forward(next_observation, target_next_action)
129 |             q_target = reward + self.gamma * (1 - done) * target_next_value
130 |             q_target = q_target.detach()
131 |             q = self.value_net.forward(observation, action)
132 |             value_loss = (q - q_target).pow(2).mean()
133 |             value_loss_buffer.append(value_loss.detach().item())
134 | 
135 |             self.value_optimizer.zero_grad()
136 |             value_loss.backward()
137 |             torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.0)
138 |             self.value_optimizer.step()
139 | 
140 |         policy_loss_buffer = []
141 |         for _ in range(self.policy_iter):
142 |             current_action = self.policy_net.forward(observation)
143 |             policy_loss = (- self.value_net.forward(observation, current_action)).mean()
144 |             policy_loss_buffer.append(policy_loss.detach().item())
145 | 
146 |             self.policy_optimizer.zero_grad()
147 |             policy_loss.backward()
148 |             torch.nn.utils.clip_grad_norm_(self.value_net.parameters(), 0.5)
149 |             self.policy_optimizer.step()
150 | 
151 |         if self.log:
152 |             self.writer.add_scalar('value_loss', np.mean(value_loss_buffer), self.train_count)
153 |             self.writer.add_scalar('policy_loss', np.mean(policy_loss_buffer), self.train_count)
154 | 
155 |         self.soft_update()
156 | 
157 |     def run(self):
158 |         for i in range(self.episode):
159 |             obs = self.env.reset()
160 |             total_reward = 0
161 |             if self.render:
162 |                 self.env.render()
163 |             while True:
164 |                 action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0)))
165 |                 action = action + np.random.randn() * self.epsilon(self.count)
166 |                 action = np.clip(action, self.min_a, self.max_a)
167 |                 next_obs, reward, done, _ = self.env.step([action])
168 |                 if self.render:
169 |                     self.env.render()
170 |                 self.buffer.store(obs, action, reward, next_obs, done)
171 |                 self.count += 1
172 |                 total_reward += reward
173 |                 obs = next_obs
174 | 
175 |                 if done:
176 |                     self.train_count += 1
177 |                     self.train()
178 |                     if not self.weight_reward:
179 |                         self.weight_reward = total_reward
180 |                     else:
181 |                         self.weight_reward = self.weight_reward * 0.99 + total_reward * 0.01
182 |                     if self.log:
183 |                         self.writer.add_scalar('reward', total_reward, i + 1)
184 |                         self.writer.add_scalar('weight_reward', self.weight_reward, i + 1)
185 |                     print('episode: {}  reward: {:.2f}  weight_reward: {:.2f}'.format(i + 1, total_reward, self.weight_reward))
186 |                     break
187 | 
188 | 
189 | if __name__ == '__main__':
190 |     env = gym.make('Pendulum-v0')
191 |     test = ddpg(env=env,
192 |                 episode=10000,
193 |                 learning_rate=1e-3,
194 |                 gamma=0.99,
195 |                 capacity=10000,
196 |                 batch_size=64,
197 |                 value_iter=10,
198 |                 policy_iter=10,
199 |                 epsilon_init=1.,
200 |                 decay=10000,
201 |                 epsilon_min=0.01,
202 |                 rho=0.995,
203 |                 max_a=2.,
204 |                 min_a=-2.,
205 |                 render=False,
206 |                 log=False)
207 |     test.run()


--------------------------------------------------------------------------------
/DSAC/distributional_sac_discrete.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | import gym
  5 | from collections import deque
  6 | import random
  7 | from torch.utils.tensorboard import SummaryWriter
  8 | import numpy as np
  9 | 
 10 | 
 11 | class replay_buffer(object):
 12 |     def __init__(self, capacity):
 13 |         self.capacity = capacity
 14 |         self.memory = deque(maxlen=self.capacity)
 15 | 
 16 |     def store(self, observation, action, reward, next_observation, done):
 17 |         observation = np.expand_dims(observation, 0)
 18 |         next_observation = np.expand_dims(next_observation, 0)
 19 |         self.memory.append([observation, action, reward, next_observation, done])
 20 | 
 21 |     def sample(self, batch_size):
 22 |         batch = random.sample(self.memory, batch_size)
 23 |         observation, action, reward, next_observation, done = zip(* batch)
 24 |         return np.concatenate(observation, 0), action, reward, np.concatenate(next_observation, 0), done
 25 | 
 26 |     def __len__(self):
 27 |         return len(self.memory)
 28 | 
 29 | 
 30 | class value_net(nn.Module):
 31 |     def __init__(self, observation_dim, action_dim, quant_num, cosine_num):
 32 |         super(value_net, self).__init__()
 33 |         self.observation_dim = observation_dim
 34 |         self.action_dim = action_dim
 35 |         self.quant_num = quant_num
 36 |         self.cosine_num = cosine_num
 37 | 
 38 |         self.feature_layer = nn.Sequential(
 39 |             nn.Linear(self.observation_dim, 128),
 40 |             nn.ReLU(),
 41 |             nn.Linear(128, 128)
 42 |         )
 43 | 
 44 |         self.cosine_layer = nn.Sequential(
 45 |             nn.Linear(self.cosine_num, 128),
 46 |             nn.ReLU()
 47 |         )
 48 | 
 49 |         self.psi_layer = nn.Sequential(
 50 |             nn.Linear(128, 128),
 51 |             nn.ReLU(),
 52 |             nn.Linear(128, self.action_dim)
 53 |         )
 54 | 
 55 |         self.quantile_fraction_layer = nn.Sequential(
 56 |             nn.Linear(128, self.quant_num),
 57 |             nn.Softmax(dim=-1)
 58 |         )
 59 | 
 60 |     def calc_state_embedding(self, observation):
 61 |         return self.feature_layer(observation)
 62 | 
 63 |     def calc_quantile_fraction(self, state_embedding):
 64 |         assert not state_embedding.requires_grad
 65 |         q = self.quantile_fraction_layer(state_embedding.detach())
 66 |         tau_0 = torch.zeros(q.size(0), 1)
 67 |         tau = torch.cat([tau_0, q], dim=-1)
 68 |         tau = torch.cumsum(tau, dim=-1)
 69 |         entropy = torch.distributions.Categorical(probs=q).entropy()
 70 |         tau_hat = ((tau[:, :-1] + tau[:, 1:]) / 2.).detach()
 71 |         return tau, tau_hat, entropy
 72 | 
 73 |     def calc_quantile_value(self, tau, state_embedding):
 74 |         assert not tau.requires_grad
 75 |         quants = torch.arange(0, self.cosine_num, 1.0).unsqueeze(0).unsqueeze(0)
 76 |         cos_trans = torch.cos(quants * tau.unsqueeze(-1).detach() * np.pi)
 77 |         # * cos_trans: [batch_size, quant_num, cosine_num]
 78 |         rand_feat = self.cosine_layer(cos_trans)
 79 |         # * rand_feat: [batch_size, quant_num, 128]
 80 |         x = state_embedding.unsqueeze(1)
 81 |         # * x: [batch_size, 1, 128]
 82 |         x = x * rand_feat
 83 |         # * x: [batch_size, quant_num, 128]
 84 |         value = self.psi_layer(x).transpose(1, 2)
 85 |         # * value: [batch_size, action_dim, quant_num]
 86 |         return value
 87 | 
 88 |     def act(self, observation, epsilon):
 89 |         if random.random() > epsilon:
 90 |             state_embedding = self.calc_state_embedding(observation)
 91 |             tau, tau_hat, _ = self.calc_quantile_fraction(state_embedding.detach())
 92 |             q_value = self.calc_q_value(state_embedding, tau, tau_hat)
 93 |             action = q_value.max(1)[1].detach().item()
 94 |         else:
 95 |             action = random.choice(list(range(self.action_dim)))
 96 |         return action
 97 | 
 98 |     def calc_sa_quantile_value(self, state_embedding, action, tau):
 99 |         sa_quantile_value = self.calc_quantile_value(tau.detach(), state_embedding)
100 |         sa_quantile_value = sa_quantile_value.gather(1, action.unsqueeze(-1).expand(sa_quantile_value.size(0), 1, sa_quantile_value.size(-1))).squeeze(1)
101 |         return sa_quantile_value
102 | 
103 |     def calc_q_value(self, state_embedding, tau, tau_hat):
104 |         tau_delta = tau[:, 1:] - tau[:, :-1]
105 |         tau_hat_value = self.calc_quantile_value(tau_hat.detach(), state_embedding)
106 |         q_value = (tau_delta.unsqueeze(1) * tau_hat_value).sum(-1).detach()
107 |         return q_value
108 | 
109 | 
110 | class policy_net(nn.Module):
111 |     def __init__(self, input_dim, output_dim):
112 |         super(policy_net, self).__init__()
113 |         self.input_dim = input_dim
114 |         self.output_dim = output_dim
115 | 
116 |         self.fc1 = nn.Linear(self.input_dim, 128)
117 |         self.fc2 = nn.Linear(128, 128)
118 |         self.fc3 = nn.Linear(128, self.output_dim)
119 | 
120 |     def forward(self, input):
121 |         x = F.relu(self.fc1(input))
122 |         x = F.relu(self.fc2(x))
123 |         policy = F.softmax(self.fc3(x), dim=-1)
124 |         return policy
125 | 
126 |     def act(self, input):
127 |         policy = self.forward(input)
128 |         dist = torch.distributions.Categorical(policy)
129 |         action = dist.sample()
130 |         return action[0].item()
131 | 
132 | 
133 | class sac_discrete(object):
134 |     def __init__(self, env, batch_size, value_learning_rate, policy_learning_rate, quantile_learning_rate, quant_num, cosine_num, exploration, episode, gamma, alpha, auto_entropy_tuning, capacity, rho, update_iter, update_every, render, log, k=1.):
135 |         self.env = env
136 |         self.batch_size = batch_size
137 |         self.value_learning_rate = value_learning_rate
138 |         self.policy_learning_rate = policy_learning_rate
139 |         self.quantile_learning_rate = quantile_learning_rate
140 |         self.exploration = exploration
141 |         self.episode = episode
142 |         self.gamma = gamma
143 |         self.auto_entropy_tuning = auto_entropy_tuning
144 |         if not self.auto_entropy_tuning:
145 |             self.alpha = alpha
146 |         else:
147 |             self.log_alpha = torch.zeros(1, requires_grad=True)
148 |             self.alpha = self.log_alpha.exp()
149 |             # * set the max possible entropy as the target entropy
150 |             self.target_entropy = -np.log((1. / self.env.action_space.n)) * 0.98
151 |             self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=self.value_learning_rate, eps=1e-4)
152 |         self.capacity = capacity
153 |         self.rho = rho
154 |         self.update_iter = update_iter
155 |         self.update_every = update_every
156 |         self.render = render
157 |         self.log = log
158 |         self.quant_num = quant_num
159 |         self.cosine_num = cosine_num
160 |         self.k = k
161 | 
162 |         self.observation_dim = self.env.observation_space.shape[0]
163 |         self.action_num = self.env.action_space.n
164 | 
165 |         self.value_net1 = value_net(self.observation_dim, self.action_num, self.quant_num, self.cosine_num)
166 |         self.value_net2 = value_net(self.observation_dim, self.action_num, self.quant_num, self.cosine_num)
167 |         self.target_value_net1 = value_net(self.observation_dim, self.action_num, self.quant_num, self.cosine_num)
168 |         self.target_value_net2 = value_net(self.observation_dim, self.action_num, self.quant_num, self.cosine_num)
169 |         self.policy_net = policy_net(self.observation_dim, self.action_num)
170 |         self.target_value_net1.load_state_dict(self.value_net1.state_dict())
171 |         self.target_value_net2.load_state_dict(self.value_net2.state_dict())
172 | 
173 |         self.buffer = replay_buffer(capacity=self.capacity)
174 | 
175 |         self.value_net1_params = list(self.value_net1.feature_layer.parameters()) + list(self.value_net1.cosine_layer.parameters()) + list(self.value_net1.psi_layer.parameters())
176 |         self.value_net2_params = list(self.value_net2.feature_layer.parameters()) + list(self.value_net2.cosine_layer.parameters()) + list(self.value_net2.psi_layer.parameters())
177 |         self.value_optimizer1 = torch.optim.Adam(self.value_net1_params, lr=self.value_learning_rate)
178 |         self.value_optimizer2 = torch.optim.Adam(self.value_net2_params, lr=self.value_learning_rate)
179 |         self.quantile_optimizer1 = torch.optim.RMSprop(self.value_net1.quantile_fraction_layer.parameters(), lr=self.quantile_learning_rate)
180 |         self.quantile_optimizer2 = torch.optim.RMSprop(self.value_net2.quantile_fraction_layer.parameters(), lr=self.quantile_learning_rate)
181 |         self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.policy_learning_rate)
182 | 
183 |         self.weight_reward = None
184 |         self.count = 0
185 |         self.train_count = 0
186 |         self.writer = SummaryWriter('run/dsac_discrete')
187 | 
188 |     def soft_update(self):
189 |         for param, target_param in zip(self.value_net1.parameters(), self.target_value_net1.parameters()):
190 |             target_param.detach().copy_(param.detach() * (1 - self.rho) + target_param.detach() * self.rho)
191 |         for param, target_param in zip(self.value_net2.parameters(), self.target_value_net2.parameters()):
192 |             target_param.detach().copy_(param.detach() * (1 - self.rho) + target_param.detach() * self.rho)
193 | 
194 |     def calc_quantile_value_loss(self, tau, value, target_value):
195 |         # * calculate quantile value loss
196 |         # * get the quantile huber loss
197 |         assert not tau.requires_grad
198 |         u = target_value.unsqueeze(-2) - value.unsqueeze(-1)
199 |         huber_loss = 0.5 * u.abs().clamp(min=0., max=self.k).pow(2)
200 |         huber_loss = huber_loss + self.k * (u.abs() - u.abs().clamp(min=0., max=self.k) - 0.5 * self.k)
201 |         quantile_loss = (tau.unsqueeze(-1) - (u < 0).float()).abs() * huber_loss
202 |         loss = quantile_loss.mean()
203 |         return loss
204 | 
205 |     def calc_quantile_fraction_loss(self, net, embedding, actions, tau, tau_hat):
206 |         # * calculate quantile fraction loss
207 |         assert not tau_hat.requires_grad
208 |         sa_quantile_hat = net.calc_sa_quantile_value(embedding, actions, tau_hat).detach()
209 |         sa_quantile = net.calc_sa_quantile_value(embedding, actions, tau[:, 1:-1]).detach()
210 |         gradient_tau = 2 * sa_quantile - sa_quantile_hat[:, :-1] - sa_quantile_hat[:, 1:]
211 |         return (gradient_tau.detach() * tau[:, 1: -1]).sum(1).mean()
212 | 
213 |     def train(self):
214 |         observation, action, reward, next_observation, done = self.buffer.sample(self.batch_size)
215 | 
216 |         observation = torch.FloatTensor(observation)
217 |         action = torch.LongTensor(action).unsqueeze(1)
218 |         reward = torch.FloatTensor(reward).unsqueeze(1)
219 |         next_observation = torch.FloatTensor(next_observation)
220 |         done = torch.FloatTensor(done).unsqueeze(1)
221 | 
222 |         value_loss1_buffer = []
223 |         value_loss2_buffer = []
224 |         policy_loss_buffer = []
225 |         for _ in range(self.update_iter):
226 |             policy = self.policy_net.forward(next_observation)
227 | 
228 |             state_embedding1 = self.value_net1.calc_state_embedding(observation)
229 |             tau1, tau_hat1, entropy1 = self.value_net1.calc_quantile_fraction(state_embedding1.detach())
230 |             dist1 = self.value_net1.calc_quantile_value(tau_hat1.detach(), state_embedding1)
231 |             dist1 = dist1.gather(1, action.unsqueeze(-1).expand(self.batch_size, 1, dist1.size(2))).squeeze()
232 |             # * use tau_hat to calculate the quantile value
233 |             target_next_state_embedding1 = self.target_value_net1.calc_state_embedding(next_observation)
234 |             # * double q
235 |             eval_next_state_embedding1 = self.value_net1.calc_state_embedding(next_observation)
236 |             next_tau1, next_tau_hat1, _ = self.value_net1.calc_quantile_fraction(eval_next_state_embedding1.detach())
237 |             target_action1 = self.value_net1.calc_q_value(eval_next_state_embedding1, next_tau1, next_tau_hat1).max(1)[1].detach()
238 |             target_dist1 = self.target_value_net1.calc_quantile_value(tau_hat1.detach(), target_next_state_embedding1)
239 |             target_dist1 = target_dist1.gather(1, target_action1.unsqueeze(-1).unsqueeze(-1).expand(self.batch_size, 1, target_dist1.size(2))).squeeze()
240 |             target_dist1 = reward + self.gamma * target_dist1 * (1. - done)
241 |             target_q_value1 = self.target_value_net1.calc_q_value(target_next_state_embedding1, tau1, tau_hat1)
242 |             target_q_value1 = reward + self.gamma * target_q_value1 * (1. - done)
243 |             #value = target_dist1.gather(1, action.unsqueeze(-1).expand(self.batch_size, 1, target_dist1.size(2))).squeeze()
244 | 
245 |             state_embedding2 = self.value_net2.calc_state_embedding(observation)
246 |             tau2, tau_hat2, entropy2 = self.value_net2.calc_quantile_fraction(state_embedding2.detach())
247 |             dist2 = self.value_net2.calc_quantile_value(tau_hat2.detach(), state_embedding2)
248 |             dist2 = dist2.gather(1, action.unsqueeze(-1).expand(self.batch_size, 1, dist2.size(2))).squeeze()
249 |             # * use tau_hat to calculate the quantile value
250 |             target_next_state_embedding2 = self.target_value_net2.calc_state_embedding(next_observation)
251 |             eval_next_state_embedding2 = self.value_net2.calc_state_embedding(next_observation)
252 |             next_tau2, next_tau_hat2, _ = self.value_net2.calc_quantile_fraction(eval_next_state_embedding2.detach())
253 |             target_action2 = self.value_net2.calc_q_value(eval_next_state_embedding2, next_tau2, next_tau_hat2).max(1)[1].detach()
254 |             target_dist2 = self.target_value_net2.calc_quantile_value(tau_hat2.detach(), target_next_state_embedding2)
255 |             target_dist2 = target_dist2.gather(1, target_action2.unsqueeze(-1).unsqueeze(-1).expand(self.batch_size, 1, target_dist2.size(2))).squeeze()
256 |             target_dist2 = reward + self.gamma * target_dist2 * (1. - done)
257 |             target_q_value2 = self.target_value_net2.calc_q_value(target_next_state_embedding2, tau2, tau_hat2)
258 |             target_q_value2 = reward + self.gamma * target_q_value2 * (1. - done)
259 |             # * calculate the expectation directly
260 | 
261 |             value_loss1 = self.calc_quantile_value_loss(tau_hat1.detach(), dist1, target_dist1)
262 |             value_loss2 = self.calc_quantile_value_loss(tau_hat2.detach(), dist2, target_dist2)
263 |             value_loss1_buffer.append(value_loss1.detach().item())
264 |             value_loss2_buffer.append(value_loss2.detach().item())
265 | 
266 |             quantile_loss1 = self.calc_quantile_fraction_loss(self.value_net1, state_embedding1, action, tau1, tau_hat1)
267 |             quantile_loss2 = self.calc_quantile_fraction_loss(self.value_net2, state_embedding2, action, tau2, tau_hat2)
268 | 
269 |             self.quantile_optimizer1.zero_grad()
270 |             quantile_loss1.backward(retain_graph=True)
271 |             self.quantile_optimizer1.step()
272 | 
273 |             self.quantile_optimizer2.zero_grad()
274 |             quantile_loss2.backward(retain_graph=True)
275 |             self.quantile_optimizer2.step()
276 | 
277 |             self.value_optimizer1.zero_grad()
278 |             value_loss1.backward()
279 |             nn.utils.clip_grad_norm_(self.value_net1_params, 10)
280 |             self.value_optimizer1.step()
281 | 
282 |             self.value_optimizer2.zero_grad()
283 |             value_loss2.backward()
284 |             nn.utils.clip_grad_norm_(self.value_net2_params, 10)
285 |             self.value_optimizer2.step()
286 | 
287 |             # * calculate the expectation directly
288 |             policy_loss = policy * (self.alpha * policy.log() - torch.min(target_q_value1, target_q_value2).detach())
289 |             policy_loss = policy_loss.mean()
290 | 
291 |             self.policy_optimizer.zero_grad()
292 |             policy_loss.backward()
293 |             nn.utils.clip_grad_norm_(self.policy_net.parameters(), 10)
294 |             self.policy_optimizer.step()
295 | 
296 |             if self.auto_entropy_tuning:
297 |                 self.alpha_optimizer.zero_grad()
298 |                 entropy_loss = -(self.log_alpha * (policy.log() + self.target_entropy).detach()).mean()
299 |                 entropy_loss.backward()
300 |                 nn.utils.clip_grad_norm_([self.log_alpha], 0.2)
301 |                 self.alpha_optimizer.step()
302 | 
303 |                 self.alpha = self.log_alpha.exp()
304 | 
305 |             self.soft_update()
306 |         if self.log:
307 |             self.writer.add_scalar('value_loss1', np.mean(value_loss1_buffer), self.train_count)
308 |             self.writer.add_scalar('value_loss2', np.mean(value_loss2_buffer), self.train_count)
309 |             self.writer.add_scalar('policy_loss', np.mean(policy_loss_buffer), self.train_count)
310 | 
311 |     def run(self):
312 |         for i in range(self.episode):
313 |             obs = self.env.reset()
314 |             total_reward = 0
315 |             if self.render:
316 |                 self.env.render()
317 |             while True:
318 |                 if i >= self.exploration:
319 |                     action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0)))
320 |                 else:
321 |                     action = random.choice(list(range(self.action_num)))
322 |                 next_obs, reward, done, _ = self.env.step(action)
323 |                 if self.render:
324 |                     self.env.render()
325 |                 self.buffer.store(obs, action, reward, next_obs, done)
326 |                 self.count += 1
327 |                 total_reward += reward
328 |                 obs = next_obs
329 | 
330 |                 if (self.count % self.update_every) == 0 and i >= self.exploration:
331 |                     self.train_count += 1
332 |                     self.train()
333 |                 if done:
334 |                     if not self.weight_reward:
335 |                         self.weight_reward = total_reward
336 |                     else:
337 |                         self.weight_reward = self.weight_reward * 0.99 + total_reward * 0.01
338 |                     if self.log:
339 |                         self.writer.add_scalar('reward', total_reward, i + 1)
340 |                         self.writer.add_scalar('weight_reward', self.weight_reward, i + 1)
341 |                     print('episode: {}  reward: {:.2f}  weight_reward: {:.2f}'.format(i + 1, total_reward, self.weight_reward))
342 |                     break
343 | 
344 | if __name__ == '__main__':
345 |     env = gym.make('CartPole-v1').unwrapped
346 |     test = sac_discrete(
347 |         env=env,
348 |         batch_size=64,
349 |         value_learning_rate=3e-4,
350 |         policy_learning_rate=3e-4,
351 |         quantile_learning_rate=2.5e-9,
352 |         quant_num=32,
353 |         cosine_num=32,
354 |         exploration=3000,
355 |         episode=10000,
356 |         gamma=0.99,
357 |         alpha=None,
358 |         auto_entropy_tuning=True,
359 |         capacity=100000,
360 |         rho=0.995,
361 |         update_iter=3,
362 |         update_every=5,
363 |         render=False,
364 |         log=False
365 |     )
366 |     test.run()
367 | 


--------------------------------------------------------------------------------
/ICM_PPO/icm.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | import random
  5 | import numpy as np
  6 | import gym
  7 | from collections import deque
  8 | from torch.distributions import Categorical
  9 | from torch.utils.tensorboard import SummaryWriter
 10 | 
 11 | 
 12 | class gae_trajectory_buffer(object):
 13 |     def __init__(self, capacity, gamma, lam):
 14 |         self.capacity = capacity
 15 |         self.gamma = gamma
 16 |         self.lam = lam
 17 |         self.memory = deque(maxlen=self.capacity)
 18 |         # * [obs, act, next_obs, rew, don, val, ret, adv]
 19 | 
 20 |     def store(self, obs, act, rew, don, val, next_obs):
 21 |         obs = np.expand_dims(obs, 0)
 22 |         next_obs = np.expand_dims(next_obs, 0)
 23 |         self.memory.append([obs, act, next_obs, rew, don, val])
 24 | 
 25 |     def process(self):
 26 |         R = 0
 27 |         Adv = 0
 28 |         Value_previous = 0
 29 |         for traj in reversed(list(self.memory)):
 30 |             R = self.gamma * R * (1 - traj[4]) + traj[5]
 31 |             traj.append(R)
 32 |             # * the generalized advantage estimator(GAE)
 33 |             delta = traj[3] + Value_previous * self.gamma * (1 - traj[4]) - traj[5]
 34 |             Adv = delta + (1 - traj[4]) * Adv * self.gamma * self.lam
 35 |             traj.append(Adv)
 36 |             Value_previous = traj[5]
 37 | 
 38 |     def get(self):
 39 |         obs, act, next_obs, rew, don, val, ret, adv = zip(* self.memory)
 40 |         act = np.expand_dims(act, 1)
 41 |         rew = np.expand_dims(rew, 1)
 42 |         don = np.expand_dims(don, 1)
 43 |         val = np.expand_dims(val, 1)
 44 |         ret = np.expand_dims(ret, 1)
 45 |         adv = np.array(adv)
 46 |         adv = (adv - adv.mean()) / adv.std()
 47 |         adv = np.expand_dims(adv, 1)
 48 |         return np.concatenate(obs, 0), act, np.concatenate(next_obs, 0), rew, don, val, ret, adv
 49 | 
 50 |     def __len__(self):
 51 |         return len(self.memory)
 52 | 
 53 |     def clear(self):
 54 |         self.memory.clear()
 55 | 
 56 | 
 57 | class policy_net(nn.Module):
 58 |     def __init__(self, input_dim, output_dim):
 59 |         super(policy_net, self).__init__()
 60 |         self.input_dim = input_dim
 61 |         self.output_dim = output_dim
 62 |         self.fc1 = nn.Linear(self.input_dim, 128)
 63 |         self.fc2 = nn.Linear(128, 128)
 64 |         self.fc3 = nn.Linear(128, self.output_dim)
 65 | 
 66 |     def forward(self, input):
 67 |         x = F.relu(self.fc1(input))
 68 |         x = F.relu(self.fc2(x))
 69 |         x = self.fc3(x)
 70 |         return F.softmax(x, 1)
 71 | 
 72 |     def act(self, input):
 73 |         probs = self.forward(input)
 74 |         dist = Categorical(probs)
 75 |         action = dist.sample()
 76 |         action = action.detach().item()
 77 |         return action
 78 | 
 79 | 
 80 | class value_net(nn.Module):
 81 |     def __init__(self, input_dim, output_dim):
 82 |         super(value_net, self).__init__()
 83 |         self.input_dim = input_dim
 84 |         self.output_dim = output_dim
 85 | 
 86 |         self.fc1 = nn.Linear(self.input_dim, 128)
 87 |         self.fc2 = nn.Linear(128, 128)
 88 |         self.fc3 = nn.Linear(128, self.output_dim)
 89 | 
 90 |     def forward(self, input):
 91 |         x = F.relu(self.fc1(input))
 92 |         x = F.relu(self.fc2(x))
 93 |         x = self.fc3(x)
 94 |         return x
 95 | 
 96 | 
 97 | class icm(nn.Module):
 98 |     def __init__(self, observation_dim, action_dim, state_dim, reset_time):
 99 |         super(icm, self).__init__()
100 |         self.observation_dim = observation_dim
101 |         self.action_dim = action_dim
102 |         self.state_dim = state_dim
103 |         self.reset_time = reset_time
104 | 
105 |         self.feature = nn.Sequential(
106 |             nn.Linear(self.observation_dim, 256),
107 |             nn.ReLU(),
108 |             nn.Linear(256, 256),
109 |             nn.ReLU(),
110 |             nn.Linear(256, self.state_dim)
111 |         )
112 | 
113 |         self.inverse_net = nn.Sequential(
114 |             nn.Linear(2 * self.state_dim, 256),
115 |             nn.ReLU(),
116 |             nn.Linear(256, 256),
117 |             nn.ReLU(),
118 |             nn.Linear(256, self.action_dim)
119 |         )
120 | 
121 |         self.forward_net_1 = nn.Sequential(
122 |             nn.Linear(self.state_dim + self.action_dim, 256),
123 |             nn.ReLU(),
124 |             nn.Linear(256, 256)
125 |         )
126 | 
127 |         self.reset_net = [
128 |             nn.Sequential(
129 |                 nn.Linear(256, 256),
130 |                 nn.ReLU(),
131 |                 nn.Linear(256, 256),
132 |                 nn.ReLU()
133 |             )
134 |         ] * 2 * self.reset_time
135 | 
136 |         self.forward_net_2 = nn.Sequential(
137 |             nn.Linear(256, 256),
138 |             nn.ReLU(),
139 |             nn.Linear(256, self.state_dim)
140 |         )
141 | 
142 |     def forward(self, observation, action, next_observation):
143 |         state = self.feature(observation)
144 |         next_state = self.feature(next_observation)
145 |         cat_state = torch.cat([state, next_state], 1)
146 |         pred_action = self.inverse_net(cat_state)
147 |         pred_action = torch.softmax(pred_action, 1)
148 |         pred_state = self.forward_net_1(torch.cat([state, action], 1))
149 |         for i in range(self.reset_time):
150 |             pred_state_tmp = self.reset_net[2 * i](pred_state)
151 |             pred_state = self.reset_net[2 * i + 1](pred_state_tmp) + pred_state
152 |         pred_state = self.forward_net_2(pred_state)
153 |         return pred_action, pred_state, next_state
154 | 
155 |     def intrinsic_reward(self, observation, action, next_observation):
156 |         state = self.feature(observation)
157 |         next_state = self.feature(next_observation)
158 |         pred_state = self.forward_net_1(torch.cat([state, action], 1))
159 |         for i in range(self.reset_time):
160 |             pred_state_tmp = self.reset_net[2 * i](pred_state)
161 |             pred_state = self.reset_net[2 * i + 1](pred_state_tmp) + pred_state
162 |         pred_state = self.forward_net_2(pred_state)
163 |         r_i = (pred_state - next_state).pow(2).sum()
164 |         return r_i.detach().item()
165 | 
166 | 
167 | class icm_ppo(object):
168 |     def __init__(self, env, episode, learning_rate, gamma, lam, epsilon, capacity, render, log, value_update_iter, policy_update_iter,  state_dim, reset_time, intrinsic_weight):
169 |         super(icm_ppo, self).__init__()
170 |         self.env = env
171 |         self.episode = episode
172 |         self.learning_rate = learning_rate
173 |         self.gamma = gamma
174 |         self.lam = lam
175 |         self.epsilon = epsilon
176 |         self.capacity = capacity
177 |         self.render = render
178 |         self.log = log
179 |         self.value_update_iter = value_update_iter
180 |         self.policy_update_iter = policy_update_iter
181 |         self.state_dim = state_dim
182 |         self.reset_time = reset_time
183 |         self.intrinsic_weight = intrinsic_weight
184 | 
185 |         self.observation_dim = self.env.observation_space.shape[0]
186 |         self.action_dim = self.env.action_space.n
187 |         self.policy_net = policy_net(self.observation_dim, self.action_dim)
188 |         self.value_net = value_net(self.observation_dim, 1)
189 |         self.icm_net = icm(self.observation_dim, self.action_dim, self.state_dim, self.reset_time)
190 |         self.value_optimizer = torch.optim.Adam(self.value_net.parameters(), lr=self.learning_rate)
191 |         self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
192 |         self.icm_optimizer = torch.optim.Adam(self.icm_net.parameters(), lr=self.learning_rate)
193 |         self.mse_func = torch.nn.MSELoss()
194 |         self.buffer = gae_trajectory_buffer(capacity=self.capacity, gamma=self.gamma, lam=self.lam)
195 |         self.count = 0
196 |         self.train_count = 0
197 |         self.weight_reward = None
198 |         self.writer = SummaryWriter('runs/icm_ppo_cartpole')
199 | 
200 |     def train(self):
201 |         obs, act, next_obs, rew, don, val, ret, adv = self.buffer.get()
202 | 
203 |         obs = torch.FloatTensor(obs)
204 |         act = torch.LongTensor(act)
205 |         next_obs = torch.FloatTensor(next_obs)
206 |         rew = torch.FloatTensor(rew)
207 |         don = torch.FloatTensor(don)
208 |         val = torch.FloatTensor(val)
209 |         ret = torch.FloatTensor(ret)
210 |         adv = torch.FloatTensor(adv).squeeze(1)
211 |         act_one_hot = torch.zeros(act.size(0), self.action_dim).scatter(1, act, 1)
212 | 
213 |         old_probs = self.policy_net.forward(obs)
214 |         old_probs = old_probs.gather(1, act).squeeze(1).detach()
215 |         value_loss_buffer = []
216 |         for _ in range(self.value_update_iter):
217 |             value = self.value_net.forward(obs)
218 |             td_target = rew + self.gamma * self.value_net.forward(next_obs) * (1 - don)
219 |             value_loss = F.smooth_l1_loss(td_target.detach(), value)
220 |             value_loss_buffer.append(value_loss.item())
221 |             self.value_optimizer.zero_grad()
222 |             value_loss.backward()
223 |             self.value_optimizer.step()
224 |             if self.log:
225 |                 self.writer.add_scalar('value_loss', np.mean(value_loss_buffer), self.train_count)
226 | 
227 |         policy_loss_buffer = []
228 |         for _ in range(self.policy_update_iter):
229 |             probs = self.policy_net.forward(obs)
230 |             probs = probs.gather(1, act).squeeze(1)
231 |             ratio = probs / old_probs
232 |             surr1 = ratio * adv
233 |             surr2 = torch.clamp(ratio, 1. - self.epsilon, 1. + self.epsilon) * adv
234 |             policy_loss = - torch.min(surr1, surr2).mean()
235 |             policy_loss_buffer.append(policy_loss.item())
236 |             self.policy_optimizer.zero_grad()
237 |             policy_loss.backward()
238 |             self.policy_optimizer.step()
239 |             if self.log:
240 |                 self.writer.add_scalar('policy_loss', np.mean(policy_loss_buffer), self.train_count)
241 | 
242 |         pred_action, pred_state, next_state = self.icm_net.forward(obs, act_one_hot, next_obs)
243 |         forward_loss = self.mse_func(pred_state, next_state.detach())
244 |         inverse_loss = self.mse_func(pred_action, act_one_hot)
245 |         icm_loss = forward_loss + inverse_loss
246 |         self.icm_optimizer.zero_grad()
247 |         icm_loss.backward()
248 |         self.icm_optimizer.step()
249 | 
250 |     def run(self):
251 |         for i in range(self.episode):
252 |             obs = self.env.reset()
253 |             total_reward = 0
254 |             if self.render:
255 |                 self.env.render()
256 |             while True:
257 |                 action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0)))
258 |                 next_obs, reward, done, _ = self.env.step(action)
259 |                 if self.render:
260 |                     self.env.render()
261 |                 value = self.value_net.forward(torch.FloatTensor(np.expand_dims(obs, 0))).detach().item()
262 |                 action_one_hot = np.zeros([1, self.action_dim])
263 |                 action_one_hot[0, action] = 1
264 |                 action_one_hot = torch.FloatTensor(action_one_hot)
265 |                 intrinsic_reward = self.intrinsic_weight * self.icm_net.intrinsic_reward(torch.FloatTensor(np.expand_dims(obs, 0)), action_one_hot, torch.FloatTensor(np.expand_dims(next_obs, 0)))
266 |                 reward = max(intrinsic_reward, 0.1) + reward
267 |                 self.buffer.store(obs, action, reward / 100., done, value, next_obs)
268 |                 self.count += 1
269 |                 total_reward += reward
270 |                 obs = next_obs
271 |                 #if self.count % self.capacity == 0:
272 |                 if done:
273 |                     self.buffer.process()
274 |                     self.train_count += 1
275 |                     self.train()
276 |                     self.buffer.clear()
277 |                 if done:
278 |                     if not self.weight_reward:
279 |                         self.weight_reward = total_reward
280 |                     else:
281 |                         self.weight_reward = self.weight_reward * 0.99 + total_reward * 0.01
282 |                     if self.log:
283 |                         self.writer.add_scalar('weight_reward', self.weight_reward, i+1)
284 |                         self.writer.add_scalar('reward', total_reward, i+1)
285 |                     print('episode: {}  reward: {:.2f}  weight_reward: {:.2f}  train_step: {}'.format(i+1, total_reward, self.weight_reward, self.train_count))
286 |                     break
287 | 
288 | 
289 | if __name__ == '__main__':
290 |     env = gym.make('CartPole-v0').unwrapped
291 |     test = icm_ppo(
292 |         env=env,
293 |         episode=10000,
294 |         learning_rate=1e-3,
295 |         gamma=0.99,
296 |         lam=0.97,
297 |         epsilon=0.2,
298 |         capacity=20000,
299 |         render=False,
300 |         log=False,
301 |         value_update_iter=3,
302 |         policy_update_iter=3,
303 |         state_dim=256,
304 |         reset_time=2,
305 |         intrinsic_weight=1e-6
306 |     )
307 |     test.run()
308 | 


--------------------------------------------------------------------------------
/PPO_CLIP/gae_ppo_cartpole.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | import random
  5 | import numpy as np
  6 | import gym
  7 | from collections import deque
  8 | from torch.distributions import Categorical
  9 | from torch.utils.tensorboard import SummaryWriter
 10 | 
 11 | 
 12 | class gae_trajectory_buffer(object):
 13 |     def __init__(self, capacity, gamma, lam):
 14 |         self.capacity = capacity
 15 |         self.gamma = gamma
 16 |         self.lam = lam
 17 |         self.memory = deque(maxlen=self.capacity)
 18 |         # * [obs, next_obs, act, rew, don, val, ret, adv]
 19 | 
 20 |     def store(self, obs, next_obs, act, rew, don, val):
 21 |         obs = np.expand_dims(obs, 0)
 22 |         next_obs = np.expand_dims(next_obs, 0)
 23 |         self.memory.append([obs, next_obs, act, rew, don, val])
 24 | 
 25 |     def process(self):
 26 |         R = 0
 27 |         Adv = 0
 28 |         Value_previous = 0
 29 |         for traj in reversed(list(self.memory)):
 30 |             R = self.gamma * R * (1 - traj[4]) + traj[5]
 31 |             traj.append(R)
 32 |             # * the generalized advantage estimator(GAE)
 33 |             delta = traj[3] + Value_previous * self.gamma * (1 - traj[4]) - traj[5]
 34 |             Adv = delta + (1 - traj[4]) * Adv * self.gamma * self.lam
 35 |             traj.append(Adv)
 36 |             Value_previous = traj[5]
 37 | 
 38 |     def get(self):
 39 |         obs, next_obs, act, rew, don, val, ret, adv = zip(* self.memory)
 40 |         act = np.expand_dims(act, 1)
 41 |         rew = np.expand_dims(rew, 1)
 42 |         don = np.expand_dims(don, 1)
 43 |         val = np.expand_dims(val, 1)
 44 |         ret = np.expand_dims(ret, 1)
 45 |         adv = np.array(adv)
 46 |         adv = (adv - adv.mean()) / adv.std()
 47 |         adv = np.expand_dims(adv, 1)
 48 |         return np.concatenate(obs, 0), np.concatenate(next_obs, 0), act, rew, don, val, ret, adv
 49 | 
 50 |     def __len__(self):
 51 |         return len(self.memory)
 52 | 
 53 |     def clear(self):
 54 |         self.memory.clear()
 55 | 
 56 | 
 57 | class policy_net(nn.Module):
 58 |     def __init__(self, input_dim, output_dim):
 59 |         super(policy_net, self).__init__()
 60 |         self.input_dim = input_dim
 61 |         self.output_dim = output_dim
 62 |         self.fc1 = nn.Linear(self.input_dim, 128)
 63 |         self.fc2 = nn.Linear(128, 128)
 64 |         self.fc3 = nn.Linear(128, self.output_dim)
 65 | 
 66 |     def forward(self, input):
 67 |         x = F.relu(self.fc1(input))
 68 |         x = F.relu(self.fc2(x))
 69 |         x = self.fc3(x)
 70 |         return F.softmax(x, 1)
 71 | 
 72 |     def act(self, input):
 73 |         probs = self.forward(input)
 74 |         dist = Categorical(probs)
 75 |         action = dist.sample()
 76 |         action = action.detach().item()
 77 |         return action
 78 | 
 79 | 
 80 | class value_net(nn.Module):
 81 |     def __init__(self, input_dim, output_dim):
 82 |         super(value_net, self).__init__()
 83 |         self.input_dim = input_dim
 84 |         self.output_dim = output_dim
 85 | 
 86 |         self.fc1 = nn.Linear(self.input_dim, 128)
 87 |         self.fc2 = nn.Linear(128, 128)
 88 |         self.fc3 = nn.Linear(128, self.output_dim)
 89 | 
 90 |     def forward(self, input):
 91 |         x = F.relu(self.fc1(input))
 92 |         x = F.relu(self.fc2(x))
 93 |         x = self.fc3(x)
 94 |         return x
 95 | 
 96 | 
 97 | class ppo_clip(object):
 98 |     def __init__(self, env, episode, learning_rate, gamma, lam, epsilon, capacity, render, log, value_update_iter, policy_update_iter):
 99 |         super(ppo_clip, self).__init__()
100 |         self.env = env
101 |         self.episode = episode
102 |         self.learning_rate = learning_rate
103 |         self.gamma = gamma
104 |         self.lam = lam
105 |         self.epsilon = epsilon
106 |         self.capacity = capacity
107 |         self.render = render
108 |         self.log = log
109 |         self.value_update_iter = value_update_iter
110 |         self.policy_update_iter = policy_update_iter
111 | 
112 |         self.observation_dim = self.env.observation_space.shape[0]
113 |         self.action_dim = self.env.action_space.n
114 |         self.policy_net = policy_net(self.observation_dim, self.action_dim)
115 |         self.value_net = value_net(self.observation_dim, 1)
116 |         self.value_optimizer = torch.optim.Adam(self.value_net.parameters(), lr=self.learning_rate)
117 |         self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
118 |         self.buffer = gae_trajectory_buffer(capacity=self.capacity, gamma=self.gamma, lam=self.lam)
119 |         self.count = 0
120 |         self.train_count = 0
121 |         self.weight_reward = None
122 |         self.writer = SummaryWriter('runs/ppo_clip_cartpole')
123 | 
124 |     def train(self):
125 |         obs, next_obs, act, rew, don, val, ret, adv = self.buffer.get()
126 | 
127 |         obs = torch.FloatTensor(obs)
128 |         next_obs = torch.FloatTensor(next_obs)
129 |         act = torch.LongTensor(act)
130 |         rew = torch.FloatTensor(rew)
131 |         don = torch.FloatTensor(don)
132 |         val = torch.FloatTensor(val)
133 |         ret = torch.FloatTensor(ret)
134 |         adv = torch.FloatTensor(adv).squeeze(1)
135 | 
136 |         old_probs = self.policy_net.forward(obs)
137 |         old_probs = old_probs.gather(1, act).squeeze(1).detach()
138 |         value_loss_buffer = []
139 |         policy_loss_buffer = []
140 |         for _ in range(self.value_update_iter):
141 |             value = self.value_net.forward(obs)
142 |             td_target = rew + self.gamma * self.value_net.forward(next_obs) * (1 - don)
143 |             #value_loss = (ret - value).pow(2).mean()
144 |             value_loss = F.smooth_l1_loss(td_target.detach(), value)
145 |             value_loss_buffer.append(value_loss.item())
146 |             self.value_optimizer.zero_grad()
147 |             value_loss.backward()
148 |             self.value_optimizer.step()
149 |             if self.log:
150 |                 self.writer.add_scalar('value_loss', np.mean(value_loss_buffer), self.train_count)
151 | 
152 |             probs = self.policy_net.forward(obs)
153 |             probs = probs.gather(1, act).squeeze(1)
154 |             ratio = probs / old_probs
155 |             surr1 = ratio * adv
156 |             surr2 = torch.clamp(ratio, 1. - self.epsilon, 1. + self.epsilon) * adv
157 |             policy_loss = - torch.min(surr1, surr2).mean()
158 |             policy_loss_buffer.append(policy_loss.item())
159 |             self.policy_optimizer.zero_grad()
160 |             policy_loss.backward()
161 |             self.policy_optimizer.step()
162 |             if self.log:
163 |                 self.writer.add_scalar('policy_loss', np.mean(policy_loss_buffer), self.train_count)
164 | 
165 |     def run(self):
166 |         for i in range(self.episode):
167 |             obs = self.env.reset()
168 |             total_reward = 0
169 |             if self.render:
170 |                 self.env.render()
171 |             while True:
172 |                 action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0)))
173 |                 next_obs, reward, done, _ = self.env.step(action)
174 |                 if self.render:
175 |                     self.env.render()
176 |                 value = self.value_net.forward(torch.FloatTensor(np.expand_dims(obs, 0))).detach().item()
177 |                 self.buffer.store(obs, next_obs, action, reward / 100., done, value)
178 |                 self.count += 1
179 |                 total_reward += reward
180 |                 obs = next_obs
181 |                 if self.count % self.capacity == 0:
182 |                     self.buffer.process()
183 |                     self.train_count += 1
184 |                     self.train()
185 |                     self.buffer.clear()
186 |                 if done:
187 |                     if not self.weight_reward:
188 |                         self.weight_reward = total_reward
189 |                     else:
190 |                         self.weight_reward = self.weight_reward * 0.99 + total_reward * 0.01
191 |                     if self.log:
192 |                         self.writer.add_scalar('weight_reward', self.weight_reward, i+1)
193 |                         self.writer.add_scalar('reward', total_reward, i+1)
194 |                     print('episode: {}  reward: {:.2f}  weight_reward: {:.2f}  train_step: {}'.format(i+1, total_reward, self.weight_reward, self.train_count))
195 |                     break
196 | 
197 | 
198 | if __name__ == '__main__':
199 |     env = gym.make('CartPole-v1').unwrapped
200 |     test = ppo_clip(env=env,
201 |                     episode=10000,
202 |                     learning_rate=1e-3,
203 |                     gamma=0.99,
204 |                     lam=0.97,
205 |                     epsilon=0.2,
206 |                     capacity=20,
207 |                     render=False,
208 |                     log=False,
209 |                     value_update_iter=10,
210 |                     policy_update_iter=10)
211 |     test.run()
212 | 


--------------------------------------------------------------------------------
/PPO_CLIP/ppo_cartpole.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | import random
  5 | import numpy as np
  6 | import gym
  7 | from collections import deque
  8 | from torch.distributions import Categorical
  9 | from torch.utils.tensorboard import SummaryWriter
 10 | 
 11 | 
 12 | class trajectory_buffer(object):
 13 |     def __init__(self, capacity):
 14 |         self.capacity = capacity
 15 |         self.memory = deque(maxlen=self.capacity)
 16 |         # * [obs, next_obs, act, rew, don, val]
 17 | 
 18 |     def store(self, obs, next_obs, act, rew, don, val):
 19 |         obs = np.expand_dims(obs, 0)
 20 |         next_obs = np.expand_dims(next_obs, 0)
 21 |         self.memory.append([obs, next_obs, act, rew, don, val])
 22 | 
 23 |     def get(self):
 24 |         obs, next_obs, act, rew, don, val = zip(* self.memory)
 25 |         act = np.expand_dims(act, 1)
 26 |         rew = np.expand_dims(rew, 1)
 27 |         don = np.expand_dims(don, 1)
 28 |         val = np.expand_dims(val, 1)
 29 |         return np.concatenate(obs, 0), np.concatenate(next_obs, 0), act, rew, don, val
 30 | 
 31 |     def __len__(self):
 32 |         return len(self.memory)
 33 | 
 34 |     def clear(self):
 35 |         self.memory.clear()
 36 | 
 37 | 
 38 | class policy_net(nn.Module):
 39 |     def __init__(self, input_dim, output_dim):
 40 |         super(policy_net, self).__init__()
 41 |         self.input_dim = input_dim
 42 |         self.output_dim = output_dim
 43 |         self.fc1 = nn.Linear(self.input_dim, 128)
 44 |         self.fc2 = nn.Linear(128, 128)
 45 |         self.fc3 = nn.Linear(128, self.output_dim)
 46 | 
 47 |     def forward(self, input):
 48 |         x = F.relu(self.fc1(input))
 49 |         x = F.relu(self.fc2(x))
 50 |         x = self.fc3(x)
 51 |         return F.softmax(x, 1)
 52 | 
 53 |     def act(self, input):
 54 |         probs = self.forward(input)
 55 |         dist = Categorical(probs)
 56 |         action = dist.sample()
 57 |         action = action.detach().item()
 58 |         return action
 59 | 
 60 | 
 61 | class value_net(nn.Module):
 62 |     def __init__(self, input_dim, output_dim):
 63 |         super(value_net, self).__init__()
 64 |         self.input_dim = input_dim
 65 |         self.output_dim = output_dim
 66 | 
 67 |         self.fc1 = nn.Linear(self.input_dim, 128)
 68 |         self.fc2 = nn.Linear(128, 128)
 69 |         self.fc3 = nn.Linear(128, self.output_dim)
 70 | 
 71 |     def forward(self, input):
 72 |         x = F.relu(self.fc1(input))
 73 |         x = F.relu(self.fc2(x))
 74 |         x = self.fc3(x)
 75 |         return x
 76 | 
 77 | 
 78 | class ppo_clip(object):
 79 |     def __init__(self, env, episode, learning_rate, gamma, lam, epsilon, capacity, render, log, value_update_iter, policy_update_iter):
 80 |         super(ppo_clip, self).__init__()
 81 |         self.env = env
 82 |         self.episode = episode
 83 |         self.learning_rate = learning_rate
 84 |         self.gamma = gamma
 85 |         self.lam = lam
 86 |         self.epsilon = epsilon
 87 |         self.capacity = capacity
 88 |         self.render = render
 89 |         self.log = log
 90 |         self.value_update_iter = value_update_iter
 91 |         self.policy_update_iter = policy_update_iter
 92 | 
 93 |         self.observation_dim = self.env.observation_space.shape[0]
 94 |         self.action_dim = self.env.action_space.n
 95 |         self.policy_net = policy_net(self.observation_dim, self.action_dim)
 96 |         self.value_net = value_net(self.observation_dim, 1)
 97 |         self.value_optimizer = torch.optim.Adam(self.value_net.parameters(), lr=self.learning_rate)
 98 |         self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
 99 |         self.buffer = trajectory_buffer(capacity=self.capacity)
100 |         self.count = 0
101 |         self.train_count = 0
102 |         self.weight_reward = None
103 |         self.writer = SummaryWriter('runs/ppo_clip_cartpole')
104 | 
105 |     def train(self):
106 |         obs, next_obs, act, rew, don, val = self.buffer.get()
107 | 
108 |         obs = torch.FloatTensor(obs)
109 |         next_obs = torch.FloatTensor(next_obs)
110 |         act = torch.LongTensor(act)
111 |         rew = torch.FloatTensor(rew)
112 |         don = torch.FloatTensor(don)
113 |         val = torch.FloatTensor(val)
114 | 
115 |         old_probs = self.policy_net.forward(obs)
116 |         old_probs = old_probs.gather(1, act).squeeze(1).detach()
117 |         value_loss_buffer = []
118 |         policy_loss_buffer = []
119 |         for _ in range(self.value_update_iter):
120 |             td_target = rew + self.gamma * self.value_net.forward(next_obs) * (1 - don)
121 |             delta = td_target - self.value_net.forward(obs)
122 |             delta = delta.detach().numpy()
123 | 
124 |             advantage_lst = []
125 |             advantage = 0.0
126 |             for delta_t in delta[::-1]:
127 |                 advantage = self.gamma * self.lam * advantage + delta_t[0]
128 |                 advantage_lst.append([advantage])
129 | 
130 |             advantage_lst.reverse()
131 |             advantage = torch.FloatTensor(advantage_lst)
132 | 
133 |             value = self.value_net.forward(obs)
134 |             #value_loss = (ret - value).pow(2).mean()
135 |             value_loss = F.smooth_l1_loss(td_target.detach(), value)
136 |             value_loss_buffer.append(value_loss.item())
137 |             self.value_optimizer.zero_grad()
138 |             value_loss.backward()
139 |             self.value_optimizer.step()
140 |             if self.log:
141 |                 self.writer.add_scalar('value_loss', np.mean(value_loss_buffer), self.train_count)
142 | 
143 |             probs = self.policy_net.forward(obs)
144 |             probs = probs.gather(1, act).squeeze(1)
145 |             ratio = probs / old_probs
146 |             surr1 = ratio * advantage
147 |             surr2 = torch.clamp(ratio, 1. - self.epsilon, 1. + self.epsilon) * advantage
148 |             policy_loss = - torch.min(surr1, surr2).mean()
149 |             policy_loss_buffer.append(policy_loss.item())
150 |             self.policy_optimizer.zero_grad()
151 |             policy_loss.backward()
152 |             self.policy_optimizer.step()
153 |             if self.log:
154 |                 self.writer.add_scalar('policy_loss', np.mean(policy_loss_buffer), self.train_count)
155 | 
156 |     def run(self):
157 |         for i in range(self.episode):
158 |             obs = self.env.reset()
159 |             total_reward = 0
160 |             if self.render:
161 |                 self.env.render()
162 |             while True:
163 |                 action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0)))
164 |                 next_obs, reward, done, _ = self.env.step(action)
165 |                 if self.render:
166 |                     self.env.render()
167 |                 value = self.value_net.forward(torch.FloatTensor(np.expand_dims(obs, 0))).detach().item()
168 |                 self.buffer.store(obs, next_obs, action, reward, done, value)
169 |                 self.count += 1
170 |                 total_reward += reward
171 |                 obs = next_obs
172 |                 if self.count % 20 == 0:
173 |                     self.train_count += 1
174 |                     self.train()
175 |                     self.buffer.clear()
176 |                 if done:
177 |                     if not self.weight_reward:
178 |                         self.weight_reward = total_reward
179 |                     else:
180 |                         self.weight_reward = self.weight_reward * 0.99 + total_reward * 0.01
181 |                     if self.log:
182 |                         self.writer.add_scalar('weight_reward', self.weight_reward, i+1)
183 |                         self.writer.add_scalar('reward', total_reward, i+1)
184 |                     print('episode: {}  reward: {:.2f}  weight_reward: {:.2f}  train_step: {}'.format(i+1, total_reward, self.weight_reward, self.train_count))
185 |                     break
186 | 
187 | 
188 | if __name__ == '__main__':
189 |     env = gym.make('CartPole-v1').unwrapped
190 |     test = ppo_clip(env=env,
191 |                     episode=10000,
192 |                     learning_rate=1e-3,
193 |                     gamma=0.99,
194 |                     lam=0.95,
195 |                     epsilon=0.1,
196 |                     capacity=2000,
197 |                     render=False,
198 |                     log=False,
199 |                     value_update_iter=3,
200 |                     policy_update_iter=3)
201 |     test.run()
202 | 


--------------------------------------------------------------------------------
/PPO_CLIP/ppo_pendulum.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | import random
  5 | import numpy as np
  6 | import gym
  7 | from collections import deque
  8 | from torch.distributions import Normal
  9 | from torch.utils.tensorboard import SummaryWriter
 10 | 
 11 | 
 12 | class gae_trajectory_buffer(object):
 13 |     def __init__(self, capacity, gamma, lam):
 14 |         self.capacity = capacity
 15 |         self.gamma = gamma
 16 |         self.lam = lam
 17 |         self.memory = deque(maxlen=self.capacity)
 18 |         # * [obs, act, rew, don, val, ret, adv]
 19 | 
 20 |     def store(self, obs, act, rew, don, val):
 21 |         obs = np.expand_dims(obs, 0)
 22 |         self.memory.append([obs, act, rew, don, val])
 23 | 
 24 |     def process(self):
 25 |         R = 0
 26 |         Adv = 0
 27 |         Value_previous = 0
 28 |         for traj in reversed(list(self.memory)):
 29 |             R = self.gamma * R * (1 - traj[3]) + traj[4]
 30 |             traj.append(R)
 31 |             # * the generalized advantage estimator(GAE)
 32 |             delta = traj[2] + Value_previous * self.gamma * (1 - traj[3]) - traj[4]
 33 |             Adv = delta + (1 - traj[3]) * Adv * self.gamma * self.lam
 34 |             traj.append(Adv)
 35 |             Value_previous = traj[4]
 36 | 
 37 |     def get(self):
 38 |         obs, act, rew, don, val, ret, adv = zip(* self.memory)
 39 |         act = np.expand_dims(act, 1)
 40 |         rew = np.expand_dims(rew, 1)
 41 |         don = np.expand_dims(don, 1)
 42 |         val = np.expand_dims(val, 1)
 43 |         ret = np.expand_dims(ret, 1)
 44 |         adv = np.array(adv)
 45 |         adv = (adv - adv.mean()) / adv.std()
 46 |         adv = np.expand_dims(adv, 1)
 47 |         return np.concatenate(obs, 0), act, rew, don, val, ret, adv
 48 | 
 49 |     def __len__(self):
 50 |         return len(self.memory)
 51 | 
 52 | 
 53 | class policy_net(nn.Module):
 54 |     def __init__(self, input_dim, output_dim):
 55 |         super(policy_net, self).__init__()
 56 |         self.input_dim = input_dim
 57 |         self.output_dim = output_dim
 58 |         self.fc1 = nn.Linear(self.input_dim, 128)
 59 |         self.fc2 = nn.Linear(128, 128)
 60 |         self.fc3 = nn.Linear(128, self.output_dim)
 61 | 
 62 |     def forward(self, input):
 63 |         x = torch.tanh(self.fc1(input))
 64 |         x = torch.tanh(self.fc2(x))
 65 |         mu = self.fc3(x)
 66 |         return mu
 67 | 
 68 |     def act(self, input):
 69 |         mu = self.forward(input)
 70 |         sigma = torch.ones_like(mu)
 71 |         dist = Normal(mu, sigma)
 72 |         action = dist.sample().detach().item()
 73 |         return action
 74 | 
 75 |     def get_distribution(self, input):
 76 |         mu = self.forward(input)
 77 |         sigma = torch.ones_like(mu)
 78 |         dist = Normal(mu, sigma)
 79 |         return dist
 80 | 
 81 | 
 82 | class value_net(nn.Module):
 83 |     def __init__(self, input_dim, output_dim):
 84 |         super(value_net, self).__init__()
 85 |         self.input_dim = input_dim
 86 |         self.output_dim = output_dim
 87 | 
 88 |         self.fc1 = nn.Linear(self.input_dim, 128)
 89 |         self.fc2 = nn.Linear(128, 128)
 90 |         self.fc3 = nn.Linear(128, self.output_dim)
 91 | 
 92 |     def forward(self, input):
 93 |         x = torch.tanh(self.fc1(input))
 94 |         x = torch.tanh(self.fc2(x))
 95 |         x = self.fc3(x)
 96 |         return x
 97 | 
 98 | 
 99 | class ppo_clip(object):
100 |     def __init__(self, env, episode, learning_rate, gamma, lam, epsilon, capacity, render, log, value_update_iter, policy_update_iter):
101 |         super(ppo_clip, self).__init__()
102 |         self.env = env
103 |         self.episode = episode
104 |         self.learning_rate = learning_rate
105 |         self.gamma = gamma
106 |         self.lam = lam
107 |         self.epsilon = epsilon
108 |         self.capacity = capacity
109 |         self.render = render
110 |         self.log = log
111 |         self.value_update_iter = value_update_iter
112 |         self.policy_update_iter = policy_update_iter
113 | 
114 |         self.observation_dim = self.env.observation_space.shape[0]
115 |         self.action_dim = self.env.action_space.shape[0]
116 |         self.policy_net = policy_net(self.observation_dim, self.action_dim)
117 |         self.value_net = value_net(self.observation_dim, 1)
118 |         self.value_optimizer = torch.optim.Adam(self.value_net.parameters(), lr=self.learning_rate)
119 |         self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
120 |         self.buffer = gae_trajectory_buffer(capacity=self.capacity, gamma=self.gamma, lam=self.lam)
121 |         self.count = 0
122 |         self.train_count = 0
123 |         self.weight_reward = None
124 |         self.writer = SummaryWriter('runs/ppo_clip_pendulum')
125 | 
126 |     def train(self):
127 |         obs, act, rew, don, val, ret, adv = self.buffer.get()
128 | 
129 |         obs = torch.FloatTensor(obs)
130 |         act = torch.FloatTensor(act)
131 |         rew = torch.FloatTensor(rew)
132 |         don = torch.FloatTensor(don)
133 |         val = torch.FloatTensor(val)
134 |         ret = torch.FloatTensor(ret)
135 |         adv = torch.FloatTensor(adv)
136 | 
137 |         old_dist = self.policy_net.get_distribution(obs)
138 |         old_log_probs = old_dist.log_prob(act).detach()
139 |         value_loss_buffer = []
140 |         for _ in range(self.value_update_iter):
141 |             value = self.value_net.forward(obs)
142 |             value_loss = (ret - value).pow(2).mean()
143 |             value_loss_buffer.append(value_loss.item())
144 |             self.value_optimizer.zero_grad()
145 |             value_loss.backward()
146 |             self.value_optimizer.step()
147 |             if self.log:
148 |                 self.writer.add_scalar('value_loss', np.mean(value_loss_buffer), self.train_count)
149 | 
150 |         policy_loss_buffer = []
151 |         for _ in range(self.policy_update_iter):
152 |             dist = self.policy_net.get_distribution(obs)
153 |             log_probs = dist.log_prob(act)
154 |             ratio = torch.exp(log_probs - old_log_probs)
155 |             surr1 = ratio * adv
156 |             surr2 = torch.clamp(ratio, 1. - self.epsilon, 1. + self.epsilon) * adv
157 |             policy_loss = - torch.min(surr1, surr2).mean()
158 |             policy_loss_buffer.append(policy_loss.item())
159 |             self.policy_optimizer.zero_grad()
160 |             policy_loss.backward()
161 |             self.policy_optimizer.step()
162 |             if self.log:
163 |                 self.writer.add_scalar('policy_loss', np.mean(policy_loss_buffer), self.train_count)
164 | 
165 |     def run(self):
166 |         for i in range(self.episode):
167 |             obs = self.env.reset()
168 |             total_reward = 0
169 |             if self.render:
170 |                 self.env.render()
171 |             while True:
172 |                 action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0)))
173 |                 next_obs, reward, done, _ = self.env.step([action])
174 |                 if self.render:
175 |                     self.env.render()
176 |                 value = self.value_net.forward(torch.FloatTensor(np.expand_dims(obs, 0))).detach().item()
177 |                 self.buffer.store(obs, action, reward, done, value)
178 |                 self.count += 1
179 |                 total_reward += reward
180 |                 obs = next_obs
181 |                 if self.count % self.capacity == 0:
182 |                     self.buffer.process()
183 |                     self.train_count += 1
184 |                     self.train()
185 |                 if done:
186 |                     if not self.weight_reward:
187 |                         self.weight_reward = total_reward
188 |                     else:
189 |                         self.weight_reward = self.weight_reward * 0.9 + total_reward * 0.1
190 |                     if self.log:
191 |                         self.writer.add_scalar('weight_reward', self.weight_reward, i+1)
192 |                         self.writer.add_scalar('reward', total_reward, i+1)
193 |                     print('episode: {}  reward: {:.2f}  weight_reward: {:.2f}  train_step: {}'.format(i+1, total_reward, self.weight_reward, self.train_count))
194 |                     break
195 | 
196 | 
197 | if __name__ == '__main__':
198 |     env = gym.make('Pendulum-v0')
199 |     test = ppo_clip(env=env,
200 |                     episode=10000,
201 |                     learning_rate=1e-3,
202 |                     gamma=0.99,
203 |                     lam=0.97,
204 |                     epsilon=0.2,
205 |                     capacity=2000,
206 |                     render=False,
207 |                     log=False,
208 |                     value_update_iter=10,
209 |                     policy_update_iter=10)
210 |     test.run()
211 | 


--------------------------------------------------------------------------------
/REINFORCE/reinforce.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | import random
  5 | from torch.distributions import Categorical
  6 | import gym
  7 | import numpy as np
  8 | from torch.utils.tensorboard import SummaryWriter
  9 | 
 10 | class net(nn.Module):
 11 |     def __init__(self, input_dim, output_dim):
 12 |         super(net, self).__init__()
 13 |         self.input_dim = input_dim
 14 |         self.output_dim = output_dim
 15 | 
 16 |         self.fc1 = nn.Linear(self.input_dim, 256)
 17 |         self.fc2 = nn.Linear(256, self.output_dim)
 18 | 
 19 |         self.rewards = []
 20 |         self.log_probs = []
 21 | 
 22 |     def forward(self, input):
 23 |         x = self.fc1(input)
 24 |         x = F.relu(x)
 25 |         x = self.fc2(x)
 26 |         x = F.softmax(x, 1)
 27 |         return x
 28 | 
 29 |     def act(self, input):
 30 |         probs = self.forward(input)
 31 |         dist = Categorical(probs)
 32 |         action = dist.sample()
 33 |         self.log_probs.append(dist.log_prob(action))
 34 |         return action.item()
 35 | 
 36 | 
 37 | class reinforce(object):
 38 |     def __init__(self, env, gamma, learning_rate, episode, render):
 39 |         self.env = env
 40 |         self.observation_dim = self.env.observation_space.shape[0]
 41 |         self.action_dim = self.env.action_space.n
 42 |         self.gamma = gamma
 43 |         self.learning_rate = learning_rate
 44 |         self.episode = episode
 45 |         self.render = render
 46 |         self.net = net(self.observation_dim, self.action_dim)
 47 |         self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate)
 48 |         self.total_returns = []
 49 |         self.weight_reward = None
 50 |         self.writer = SummaryWriter('runs/reinforce')
 51 |         self.count = 0
 52 | 
 53 |     def train(self, ):
 54 |         total_returns = torch.FloatTensor(self.total_returns)
 55 |         eps = np.finfo(np.float32).eps.item()
 56 |         total_returns = (total_returns - total_returns.mean()) / (total_returns.std() + eps)
 57 |         log_probs = torch.cat(self.net.log_probs, 0)
 58 |         loss = (- log_probs * total_returns.detach())
 59 |         loss = loss.sum()
 60 |         self.writer.add_scalar('loss', loss, self.count)
 61 |         self.optimizer.zero_grad()
 62 |         loss.backward()
 63 |         torch.nn.utils.clip_grad_norm_(self.net.parameters(), 0.1)
 64 |         self.optimizer.step()
 65 | 
 66 |     def run(self, ):
 67 |         for i in range(self.episode):
 68 |             obs = self.env.reset()
 69 |             total_reward = 0
 70 |             if self.render:
 71 |                 self.env.render()
 72 |             while True:
 73 |                 action = self.net.act(torch.FloatTensor(np.expand_dims(obs, 0)))
 74 |                 next_obs, reward, done, info = self.env.step(action)
 75 |                 self.net.rewards.append(reward)
 76 |                 total_reward += reward
 77 |                 self.count += 1
 78 |                 if self.render:
 79 |                     self.env.render()
 80 |                 obs = next_obs
 81 |                 if done:
 82 |                     R = 0
 83 |                     if self.weight_reward:
 84 |                         self.weight_reward = 0.99 * self.weight_reward + 0.01 * total_reward
 85 |                     else:
 86 |                         self.weight_reward = total_reward
 87 |                     for r in reversed(self.net.rewards):
 88 |                         R = R * self.gamma + r
 89 |                         self.total_returns.append(R)
 90 |                     self.total_returns = list(reversed(self.total_returns))
 91 |                     self.train()
 92 |                     del self.net.rewards[:]
 93 |                     del self.net.log_probs[:]
 94 |                     del self.total_returns[:]
 95 |                     print('episode: {}  reward: {:.1f}  weight_reward: {:.2f}'.format(i+1, total_reward, self.weight_reward))
 96 |                     self.writer.add_scalar('reward', total_reward, i)
 97 |                     self.writer.add_scalar('weight_reward', self.weight_reward, i)
 98 |                     break
 99 | 
100 | 
101 | if __name__ == '__main__':
102 |     env = gym.make('CartPole-v0')
103 |     env = env.unwrapped
104 |     test = reinforce(env, gamma=0.99, learning_rate=1e-3, episode=100000, render=False)
105 |     test.run()


--------------------------------------------------------------------------------
/RND_PPO/rnd.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | import random
  5 | import numpy as np
  6 | import gym
  7 | from collections import deque
  8 | from torch.distributions import Categorical
  9 | from torch.utils.tensorboard import SummaryWriter
 10 | 
 11 | 
 12 | class gae_trajectory_buffer(object):
 13 |     def __init__(self, capacity, gamma, lam):
 14 |         self.capacity = capacity
 15 |         self.gamma = gamma
 16 |         self.lam = lam
 17 |         self.memory = deque(maxlen=self.capacity)
 18 |         # * [obs, next_obs, act, rew, don, val, ret, adv]
 19 | 
 20 |     def store(self, obs, next_obs, act, rew, don, val):
 21 |         obs = np.expand_dims(obs, 0)
 22 |         next_obs = np.expand_dims(next_obs, 0)
 23 |         self.memory.append([obs, next_obs, act, rew, don, val])
 24 | 
 25 |     def process(self):
 26 |         R = 0
 27 |         Adv = 0
 28 |         Value_previous = 0
 29 |         for traj in reversed(list(self.memory)):
 30 |             R = self.gamma * R * (1 - traj[4]) + traj[5]
 31 |             traj.append(R)
 32 |             # * the generalized advantage estimator(GAE)
 33 |             delta = traj[3] + Value_previous * self.gamma * (1 - traj[4]) - traj[5]
 34 |             Adv = delta + (1 - traj[4]) * Adv * self.gamma * self.lam
 35 |             traj.append(Adv)
 36 |             Value_previous = traj[5]
 37 | 
 38 |     def get(self):
 39 |         obs, next_obs, act, rew, don, val, ret, adv = zip(* self.memory)
 40 |         act = np.expand_dims(act, 1)
 41 |         rew = np.expand_dims(rew, 1)
 42 |         don = np.expand_dims(don, 1)
 43 |         val = np.expand_dims(val, 1)
 44 |         ret = np.expand_dims(ret, 1)
 45 |         adv = np.array(adv)
 46 |         adv = (adv - adv.mean()) / adv.std()
 47 |         adv = np.expand_dims(adv, 1)
 48 |         return np.concatenate(obs, 0), np.concatenate(next_obs, 0), act, rew, don, val, ret, adv
 49 | 
 50 |     def __len__(self):
 51 |         return len(self.memory)
 52 | 
 53 |     def clear(self):
 54 |         self.memory.clear()
 55 | 
 56 | 
 57 | class policy_net(nn.Module):
 58 |     def __init__(self, input_dim, output_dim):
 59 |         super(policy_net, self).__init__()
 60 |         self.input_dim = input_dim
 61 |         self.output_dim = output_dim
 62 |         self.fc1 = nn.Linear(self.input_dim, 128)
 63 |         self.fc2 = nn.Linear(128, 128)
 64 |         self.fc3 = nn.Linear(128, self.output_dim)
 65 | 
 66 |     def forward(self, input):
 67 |         x = F.relu(self.fc1(input))
 68 |         x = F.relu(self.fc2(x))
 69 |         x = self.fc3(x)
 70 |         return F.softmax(x, 1)
 71 | 
 72 |     def act(self, input):
 73 |         probs = self.forward(input)
 74 |         dist = Categorical(probs)
 75 |         action = dist.sample()
 76 |         action = action.detach().item()
 77 |         return action
 78 | 
 79 | 
 80 | class value_net(nn.Module):
 81 |     def __init__(self, input_dim, output_dim):
 82 |         super(value_net, self).__init__()
 83 |         self.input_dim = input_dim
 84 |         self.output_dim = output_dim
 85 | 
 86 |         self.fc1 = nn.Linear(self.input_dim, 128)
 87 |         self.fc2 = nn.Linear(128, 128)
 88 |         self.int_layer = nn.Linear(128, self.output_dim)
 89 |         self.ext_layer = nn.Linear(128, self.output_dim)
 90 | 
 91 |     def forward(self, input):
 92 |         x = F.relu(self.fc1(input))
 93 |         x = F.relu(self.fc2(x))
 94 |         value_int = self.int_layer(x)
 95 |         value_ext = self.ext_layer(x)
 96 |         return value_int, value_ext
 97 | 
 98 | 
 99 | class rnd(nn.Module):
100 |     def __init__(self, input_dim):
101 |         super(rnd, self).__init__()
102 |         self.input_dim = input_dim
103 | 
104 |         self.predictor = nn.Sequential(
105 |             nn.Linear(self.input_dim, 128),
106 |             nn.ReLU(),
107 |             nn.Linear(128, 256),
108 |             nn.ReLU(),
109 |             nn.Linear(256, 256),
110 |             nn.ReLU(),
111 |             nn.Linear(256, 256)
112 |         )
113 | 
114 |         self.target = nn.Sequential(
115 |             nn.Linear(self.input_dim, 128),
116 |             nn.ReLU(),
117 |             nn.Linear(128, 256),
118 |             nn.ReLU(),
119 |             nn.Linear(256, 256)
120 |         )
121 | 
122 |         for param in self.target.parameters():
123 |             param.requires_grad = False
124 | 
125 |     def forward(self, input):
126 |         pre_feature = self.predictor(input)
127 |         tar_feature = self.target(input)
128 |         return pre_feature, tar_feature
129 | 
130 |     def calc_int_reward(self, input):
131 |         pre_feature, tar_feature = self.forward(input)
132 |         int_reward = 0.5 * (pre_feature - tar_feature).pow(2).sum(-1)
133 |         return int_reward.detach().numpy()
134 | 
135 | class ppo_clip(object):
136 |     def __init__(self, env, episode, learning_rate, gamma, lam, epsilon, capacity, render, log, value_update_iter, policy_update_iter, int_coef, ext_coef, rnd_update_prop):
137 |         super(ppo_clip, self).__init__()
138 |         self.env = env
139 |         self.episode = episode
140 |         self.learning_rate = learning_rate
141 |         self.gamma = gamma
142 |         self.lam = lam
143 |         self.epsilon = epsilon
144 |         self.capacity = capacity
145 |         self.render = render
146 |         self.log = log
147 |         self.value_update_iter = value_update_iter
148 |         self.policy_update_iter = policy_update_iter
149 |         self.int_coef = int_coef
150 |         self.ext_coef = ext_coef
151 |         self.rnd_update_prop = rnd_update_prop
152 | 
153 |         self.observation_dim = self.env.observation_space.shape[0]
154 |         self.action_dim = self.env.action_space.n
155 | 
156 |         self.policy_net = policy_net(self.observation_dim, self.action_dim)
157 |         self.value_net = value_net(self.observation_dim, 1)
158 |         self.rnd = rnd(self.observation_dim)
159 |         self.value_optimizer = torch.optim.Adam(self.value_net.parameters(), lr=self.learning_rate)
160 |         self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
161 |         self.rnd_optimizer = torch.optim.Adam(self.rnd.predictor.parameters(), lr=self.learning_rate)
162 |         self.int_buffer = gae_trajectory_buffer(capacity=self.capacity, gamma=self.gamma, lam=self.lam)
163 |         self.ext_buffer = gae_trajectory_buffer(capacity=self.capacity, gamma=self.gamma, lam=self.lam)
164 | 
165 |         self.count = 0
166 |         self.train_count = 0
167 |         self.weight_reward = None
168 |         self.writer = SummaryWriter('runs/ppo_clip_rnd')
169 | 
170 |     def train(self):
171 |         obs, next_obs, act, int_rew, don, _, _, int_adv = self.int_buffer.get()
172 |         _, _, _, ext_rew, _, _, _, ext_adv = self.int_buffer.get()
173 | 
174 |         obs = torch.FloatTensor(obs)
175 |         next_obs = torch.FloatTensor(next_obs)
176 |         act = torch.LongTensor(act)
177 |         int_rew = torch.FloatTensor(int_rew)
178 |         ext_rew = torch.FloatTensor(ext_rew)
179 |         don = torch.FloatTensor(don)
180 |         int_adv = torch.FloatTensor(int_adv).squeeze(1)
181 |         ext_adv = torch.FloatTensor(ext_adv).squeeze(1)
182 |         adv = self.int_coef * int_adv + self.ext_coef * ext_adv
183 | 
184 |         old_probs = self.policy_net.forward(obs)
185 |         old_probs = old_probs.gather(1, act).squeeze(1).detach()
186 |         value_loss_buffer = []
187 |         policy_loss_buffer = []
188 |         rnd_loss_buffer = []
189 |         for _ in range(self.value_update_iter):
190 |             value_int, value_ext = self.value_net.forward(obs)
191 |             next_value_int, next_value_ext = self.value_net.forward(next_obs)
192 |             # * intrinsic value net
193 |             int_td_target = int_rew + self.gamma * next_value_int * (1 - don)
194 |             int_value_loss = F.mse_loss(int_td_target.detach(), value_int)
195 |             # * external value net
196 |             ext_td_target = ext_rew + self.gamma * next_value_ext * (1 - don)
197 |             ext_value_loss = F.mse_loss(ext_td_target.detach(), value_ext)
198 |             value_loss = 0.5 * (int_value_loss + ext_value_loss)
199 | 
200 |             value_loss_buffer.append(value_loss.item())
201 |             self.value_optimizer.zero_grad()
202 |             value_loss.backward()
203 |             self.value_optimizer.step()
204 | 
205 |             probs = self.policy_net.forward(obs)
206 |             probs = probs.gather(1, act).squeeze(1)
207 |             ratio = probs / old_probs
208 |             surr1 = ratio * adv
209 |             surr2 = torch.clamp(ratio, 1. - self.epsilon, 1. + self.epsilon) * adv
210 |             policy_loss = - torch.min(surr1, surr2).mean()
211 |             policy_loss_buffer.append(policy_loss.item())
212 |             self.policy_optimizer.zero_grad()
213 |             policy_loss.backward()
214 |             self.policy_optimizer.step()
215 | 
216 |             pre_feature, tar_feature = self.rnd.forward(obs)
217 |             rnd_loss = (pre_feature - tar_feature.detach()).pow(2).mean(-1)
218 |             mask = torch.rand(len(rnd_loss))
219 |             mask = torch.FloatTensor((mask < self.rnd_update_prop).float())
220 |             rnd_loss = (rnd_loss * mask).sum() / torch.max(mask.sum(), torch.FloatTensor([1.]))
221 |             rnd_loss_buffer.append(rnd_loss)
222 |             self.rnd_optimizer.zero_grad()
223 |             rnd_loss.backward()
224 |             self.rnd_optimizer.step()
225 |         if self.log:
226 |             self.writer.add_scalar('rnd_loss', np.mean(policy_loss_buffer), self.train_count)
227 |             self.writer.add_scalar('policy_loss', np.mean(policy_loss_buffer), self.train_count)
228 |             self.writer.add_scalar('value_loss', np.mean(value_loss_buffer), self.train_count)
229 | 
230 |     def run(self):
231 |         for i in range(self.episode):
232 |             obs = self.env.reset()
233 |             total_reward = 0
234 |             if self.render:
235 |                 self.env.render()
236 |             while True:
237 |                 action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0)))
238 |                 next_obs, ext_reward, done, _ = self.env.step(action)
239 |                 int_reward = self.rnd.calc_int_reward(torch.FloatTensor(np.expand_dims(obs, 0)))[0]
240 |                 if self.render:
241 |                     self.env.render()
242 |                 value_int, value_ext = self.value_net.forward(torch.FloatTensor(np.expand_dims(obs, 0)))
243 |                 value_int = value_int.detach().item()
244 |                 value_ext = value_ext.detach().item()
245 |                 self.ext_buffer.store(obs, next_obs, action, ext_reward, done, value_ext)
246 |                 self.int_buffer.store(obs, next_obs, action, int_reward, done, value_int)
247 |                 self.count += 1
248 |                 total_reward += ext_reward
249 |                 obs = next_obs
250 |                 if self.count % self.capacity == 0:
251 |                     self.int_buffer.process()
252 |                     self.ext_buffer.process()
253 |                     self.train_count += 1
254 |                     self.train()
255 |                     self.int_buffer.clear()
256 |                     self.ext_buffer.clear()
257 |                 if done:
258 |                     if not self.weight_reward:
259 |                         self.weight_reward = total_reward
260 |                     else:
261 |                         self.weight_reward = self.weight_reward * 0.99 + total_reward * 0.01
262 |                     if self.log:
263 |                         self.writer.add_scalar('weight_reward', self.weight_reward, i+1)
264 |                         self.writer.add_scalar('reward', total_reward, i+1)
265 |                     print('episode: {}  reward: {:.2f}  weight_reward: {:.2f}  train_step: {}'.format(i+1, total_reward, self.weight_reward, self.train_count))
266 |                     break
267 | 
268 | 
269 | if __name__ == '__main__':
270 |     env = gym.make('CartPole-v1').unwrapped
271 |     test = ppo_clip(
272 |         env=env,
273 |         episode=10000,
274 |         learning_rate=1e-3,
275 |         gamma=0.99,
276 |         lam=0.97,
277 |         epsilon=0.2,
278 |         capacity=20,
279 |         render=False,
280 |         log=False,
281 |         value_update_iter=10,
282 |         policy_update_iter=10,
283 |         int_coef=1.,
284 |         ext_coef=2.,
285 |         rnd_update_prop=0.25
286 |     )
287 |     test.run()
288 | 


--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
 1 | # Related Papers
 2 | 
 3 | * A3C
 4 | 
 5 |   [Asynchronous Methods for Deep Reinforcement Learning](http://arxiv.org/abs/1602.01783)
 6 | 
 7 | * ACER
 8 | 
 9 |   [Sample Efficient Actor-Critic with Experience Replay](http://arxiv.org/abs/1611.01224)
10 | 
11 | * TRPO
12 | 
13 |   [Trust Region Policy Optimization](http://arxiv.org/abs/1502.05477)
14 | 
15 | * PPO
16 | 
17 |   [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347v2)
18 | 
19 | * ICM
20 | 
21 |   [Curiosity-driven Exploration by Self-supervised Prediction](http://arxiv.org/abs/1705.05363)
22 | 
23 | * RND
24 | 
25 |   [Exploration by Random Network Distillation](http://arxiv.org/abs/1810.12894)
26 | 
27 | * DDPG
28 | 
29 |   [Continuous control with deep reinforcement learning](http://arxiv.org/abs/1509.02971)
30 | 
31 | * TD3
32 | 
33 |   [Addressing Function Approximation Error in Actor-Critic Methods](http://arxiv.org/abs/1802.09477)
34 | 
35 | * SAC
36 | 
37 |   [Soft Actor-Critic Algorithms and Applications](http://arxiv.org/abs/1812.05905)
38 | 
39 |   [Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor](http://arxiv.org/abs/1801.01290)
40 | 
41 |   [Soft Actor-Critic for Discrete Action Settings](http://arxiv.org/abs/1910.07207)
42 | 
43 | * DSAC
44 | 
45 |   [DSAC: Distributional Soft Actor Critic for Risk-Sensitive Reinforcement Learning](http://arxiv.org/abs/2004.14547)
46 | 
47 | 


--------------------------------------------------------------------------------
/SAC/sac.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | from torch.distributions import Normal
  5 | import random
  6 | from collections import deque
  7 | import numpy as np
  8 | import gym
  9 | import math
 10 | from torch.utils.tensorboard import SummaryWriter
 11 | 
 12 | class normallized_action_wrapper(gym.ActionWrapper):
 13 |     # * because the tanh value range is [-1, 1], so change the env action range
 14 |     def action(self, action):
 15 |         # * change action range from [-1, 1] to [env.low, env.high]
 16 |         low = self.action_space.low
 17 |         high = self.action_space.high
 18 | 
 19 |         action = (action + 1) / 2 * (high - low) - 2
 20 |         action = np.clip(action, low, high)
 21 |         return action
 22 | 
 23 |     def reverse_action(self, action):
 24 |         # * change action range from [env.low, env.high] to [-1, 1]
 25 |         low = self.action_space.low
 26 |         high = self.action_space.high
 27 | 
 28 |         action = (action - low) / ((high - low) / 2) - 1
 29 |         action = np.clip(action, -1, 1)
 30 |         return action
 31 | 
 32 | 
 33 | class replay_buffer(object):
 34 |     def __init__(self, capacity):
 35 |         self.capacity = capacity
 36 |         self.memory = deque(maxlen=self.capacity)
 37 | 
 38 |     def store(self, observation, action, reward, next_observation, done):
 39 |         observation = np.expand_dims(observation, 0)
 40 |         next_observation = np.expand_dims(next_observation, 0)
 41 |         self.memory.append([observation, action, reward, next_observation, done])
 42 | 
 43 |     def sample(self, batch_size):
 44 |         batch = random.sample(self.memory, batch_size)
 45 |         observation, action, reward, next_observation, done = zip(* batch)
 46 |         return np.concatenate(observation, 0), action, reward, np.concatenate(next_observation, 0), done
 47 | 
 48 |     def __len__(self):
 49 |         return len(self.memory)
 50 | 
 51 | 
 52 | class policy_net(nn.Module):
 53 |     # * SAC trains a stochastic policy, not a deterministic policy which like TD3 and DDPG
 54 |     def __init__(self, input_dim, output_dim, min_log_sigma=-20., max_log_sigma=2.):
 55 |         super(policy_net, self).__init__()
 56 |         self.input_dim = input_dim
 57 |         self.output_dim = output_dim
 58 |         self.min_log_sigma = min_log_sigma
 59 |         self.max_log_sigma = max_log_sigma
 60 | 
 61 |         self.fc1 = nn.Linear(self.input_dim, 128)
 62 |         self.fc2 = nn.Linear(128, 128)
 63 |         self.fc_mu = nn.Linear(128, self.output_dim)
 64 |         self.fc_sigma = nn.Linear(128, self.output_dim)
 65 | 
 66 |     def forward(self, input):
 67 |         x = F.relu(self.fc1(input))
 68 |         x = F.relu(self.fc2(x))
 69 |         mu = self.fc_mu(x)
 70 |         # * standard deviations are parameterized, the way not same as VPG, PPO and TRPO
 71 |         log_sigma = self.fc_sigma(x)
 72 |         log_sigma = torch.clamp(log_sigma, self.min_log_sigma, self.max_log_sigma)
 73 |         return mu, log_sigma
 74 | 
 75 |     def act(self, input):
 76 |         mu, log_sigma = self.forward(input)
 77 |         sigma = torch.exp(log_sigma)
 78 |         dist = Normal(mu, sigma)
 79 |         # * reparameterization trick: recognize the difference of sample() and rsample()
 80 |         action = dist.rsample()
 81 |         tanh_action = torch.tanh(action)
 82 |         # * the log-probabilities of actions can be calculated in closed forms
 83 |         log_prob = dist.log_prob(action)
 84 |         log_prob = log_prob - torch.log(1 - torch.tanh(action).pow(2)).sum(1, keepdim=True)
 85 |         return tanh_action, log_prob
 86 | 
 87 | 
 88 | class value_net(nn.Module):
 89 |     def __init__(self, input1_dim, input2_dim, output_dim):
 90 |         super(value_net, self).__init__()
 91 |         self.input1_dim = input1_dim
 92 |         self.input2_dim = input2_dim
 93 |         self.output_dim = output_dim
 94 | 
 95 |         self.fc1 = nn.Linear(self.input1_dim + self.input2_dim, 128)
 96 |         self.fc2 = nn.Linear(128, 128)
 97 |         self.fc3 = nn.Linear(128, self.output_dim)
 98 | 
 99 |     def forward(self, input1, input2):
100 |         x = torch.cat([input1, input2], 1)
101 |         x = F.relu(self.fc1(x))
102 |         x = F.relu(self.fc2(x))
103 |         x = self.fc3(x)
104 |         return x
105 | 
106 | 
107 | class sac(object):
108 |     def __init__(self, env, batch_size, learning_rate, exploration, episode, gamma, alpha, auto_entropy_tuning, capacity, rho, update_iter, update_every, render, log):
109 |         self.env = env
110 |         self.batch_size = batch_size
111 |         self.learning_rate = learning_rate
112 |         self.exploration = exploration
113 |         self.episode = episode
114 |         self.gamma = gamma
115 |         self.auto_entropy_tuning = auto_entropy_tuning
116 |         if not self.auto_entropy_tuning:
117 |             self.alpha = alpha
118 |         else:
119 |             # * the automatic temperature alpha tuning mechanism
120 |             self.log_alpha = torch.zeros(1, requires_grad=True)
121 |             self.alpha = self.log_alpha.exp()
122 |             self.target_entropy = - torch.prod(torch.FloatTensor(self.env.action_space.shape)).item()
123 |             self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=self.learning_rate, eps=1e-4)
124 |         self.capacity = capacity
125 |         self.rho = rho
126 |         self.update_iter = update_iter
127 |         self.update_every = update_every
128 |         self.render = render
129 |         self.log = log
130 | 
131 |         self.observation_dim = self.env.observation_space.shape[0]
132 |         self.action_dim = self.env.action_space.shape[0]
133 | 
134 |         self.value_net1 = value_net(self.observation_dim, self.action_dim, 1)
135 |         self.value_net2 = value_net(self.observation_dim, self.action_dim, 1)
136 |         self.target_value_net1 = value_net(self.observation_dim, self.action_dim, 1)
137 |         self.target_value_net2 = value_net(self.observation_dim, self.action_dim, 1)
138 |         self.policy_net = policy_net(self.observation_dim, self.action_dim)
139 |         self.target_value_net1.load_state_dict(self.value_net1.state_dict())
140 |         self.target_value_net2.load_state_dict(self.value_net2.state_dict())
141 | 
142 |         self.buffer = replay_buffer(capacity=self.capacity)
143 | 
144 |         self.value_optimizer1 = torch.optim.Adam(self.value_net1.parameters(), lr=self.learning_rate)
145 |         self.value_optimizer2 = torch.optim.Adam(self.value_net2.parameters(), lr=self.learning_rate)
146 |         self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
147 | 
148 |         self.weight_reward = None
149 |         self.count = 0
150 |         self.train_count = 0
151 |         self.writer = SummaryWriter('runs/sac')
152 | 
153 |     def soft_update(self):
154 |         for param, target_param in zip(self.value_net1.parameters(), self.target_value_net1.parameters()):
155 |             target_param.detach().copy_(param.detach() * (1 - self.rho) + target_param.detach() * self.rho)
156 |         for param, target_param in zip(self.value_net2.parameters(), self.target_value_net2.parameters()):
157 |             target_param.detach().copy_(param.detach() * (1 - self.rho) + target_param.detach() * self.rho)
158 | 
159 |     def train(self):
160 |         observation, action, reward, next_observation, done = self.buffer.sample(self.batch_size)
161 | 
162 |         observation = torch.FloatTensor(observation)
163 |         action = torch.FloatTensor(action).unsqueeze(1)
164 |         reward = torch.FloatTensor(reward).unsqueeze(1)
165 |         next_observation = torch.FloatTensor(next_observation)
166 |         done = torch.FloatTensor(done).unsqueeze(1)
167 | 
168 |         value_loss1_buffer = []
169 |         value_loss2_buffer = []
170 |         policy_loss_buffer = []
171 |         for _ in range(self.update_iter):
172 |             next_action, log_prob = self.policy_net.act(next_observation)
173 |             target_q_value1 = self.target_value_net1.forward(next_observation, next_action)
174 |             target_q_value2 = self.target_value_net2.forward(next_observation, next_action)
175 |             target_q = reward + (1 - done) * self.gamma * (torch.min(target_q_value1, target_q_value2) - self.alpha * log_prob)
176 |             target_q = target_q.detach()
177 | 
178 |             q1 = self.value_net1.forward(observation, action)
179 |             q2 = self.value_net2.forward(observation, action)
180 |             value_loss1 = (q1 - target_q).pow(2).mean()
181 |             value_loss2 = (q2 - target_q).pow(2).mean()
182 |             value_loss1_buffer.append(value_loss1.detach().item())
183 |             value_loss2_buffer.append(value_loss2.detach().item())
184 | 
185 |             self.value_optimizer1.zero_grad()
186 |             value_loss1.backward()
187 |             nn.utils.clip_grad_norm_(self.value_net1.parameters(), 0.5)
188 |             self.value_optimizer1.step()
189 | 
190 |             self.value_optimizer2.zero_grad()
191 |             value_loss2.backward()
192 |             nn.utils.clip_grad_norm_(self.value_net2.parameters(), 0.5)
193 |             self.value_optimizer2.step()
194 | 
195 |             sample_action, sample_log_prob = self.policy_net.act(observation)
196 |             sample_q1 = self.value_net1.forward(observation, sample_action)
197 |             sample_q2 = self.value_net2.forward(observation, sample_action)
198 |             policy_loss = - (torch.min(sample_q1, sample_q2) - self.alpha * sample_log_prob)
199 |             policy_loss = policy_loss.mean()
200 |             policy_loss_buffer.append(policy_loss.detach().item())
201 | 
202 |             self.policy_optimizer.zero_grad()
203 |             policy_loss.backward()
204 |             nn.utils.clip_grad_norm_(self.policy_net.parameters(), 0.5)
205 |             self.policy_optimizer.step()
206 | 
207 |             if self.auto_entropy_tuning:
208 |                 self.alpha_optimizer.zero_grad()
209 |                 entropy_loss = -(self.log_alpha * (log_prob + self.target_entropy).detach()).mean()
210 |                 entropy_loss.backward()
211 |                 self.alpha_optimizer.step()
212 | 
213 |                 self.alpha = self.log_alpha.exp()
214 | 
215 |             self.soft_update()
216 |         if self.log:
217 |             self.writer.add_scalar('value_loss1', np.mean(value_loss1_buffer), self.train_count)
218 |             self.writer.add_scalar('value_loss2', np.mean(value_loss2_buffer), self.train_count)
219 |             self.writer.add_scalar('policy_loss', np.mean(policy_loss_buffer), self.train_count)
220 | 
221 |     def run(self):
222 |         for i in range(self.episode):
223 |             obs = self.env.reset()
224 |             total_reward = 0
225 |             if self.render:
226 |                 self.env.render()
227 |             while True:
228 |                 if i >= self.exploration:
229 |                     action, _ = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0)))
230 |                     action = action.detach().item()
231 |                 else:
232 |                     action = np.random.uniform(-1., 1.)
233 |                 next_obs, reward, done, _ = self.env.step(action)
234 |                 if self.render:
235 |                     self.env.render()
236 |                 self.buffer.store(obs, action, reward, next_obs, done)
237 |                 self.count += 1
238 |                 total_reward += reward
239 |                 obs = next_obs
240 | 
241 |                 if (self.count % self.update_every) == 0 and i >= self.exploration:
242 |                     self.train_count += 1
243 |                     self.train()
244 |                 if done:
245 |                     if not self.weight_reward:
246 |                         self.weight_reward = total_reward
247 |                     else:
248 |                         self.weight_reward = self.weight_reward * 0.99 + total_reward * 0.01
249 |                     if self.log:
250 |                         self.writer.add_scalar('reward', total_reward, i + 1)
251 |                         self.writer.add_scalar('weight_reward', self.weight_reward, i + 1)
252 |                     print('episode: {}  reward: {:.2f}  weight_reward: {:.2f}'.format(i + 1, total_reward, self.weight_reward))
253 |                     break
254 | 
255 | 
256 | if __name__ == '__main__':
257 |     env = normallized_action_wrapper(gym.make('Pendulum-v0'))
258 |     test = sac(env=env,
259 |                batch_size=100,
260 |                learning_rate=1e-3,
261 |                exploration=300,
262 |                episode=10000,
263 |                gamma=0.99,
264 |                alpha=None,
265 |                auto_entropy_tuning=True,
266 |                capacity=1000000,
267 |                rho=0.995,
268 |                update_iter=10,
269 |                update_every=50,
270 |                render=False,
271 |                log=False)
272 |     test.run()


--------------------------------------------------------------------------------
/SAC/sac_discrete.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | import gym
  5 | from collections import deque
  6 | import random
  7 | from torch.utils.tensorboard import SummaryWriter
  8 | import numpy as np
  9 | 
 10 | 
 11 | class replay_buffer(object):
 12 |     def __init__(self, capacity):
 13 |         self.capacity = capacity
 14 |         self.memory = deque(maxlen=self.capacity)
 15 | 
 16 |     def store(self, observation, action, reward, next_observation, done):
 17 |         observation = np.expand_dims(observation, 0)
 18 |         next_observation = np.expand_dims(next_observation, 0)
 19 |         self.memory.append([observation, action, reward, next_observation, done])
 20 | 
 21 |     def sample(self, batch_size):
 22 |         batch = random.sample(self.memory, batch_size)
 23 |         observation, action, reward, next_observation, done = zip(* batch)
 24 |         return np.concatenate(observation, 0), action, reward, np.concatenate(next_observation, 0), done
 25 | 
 26 |     def __len__(self):
 27 |         return len(self.memory)
 28 | 
 29 | 
 30 | class value_net(nn.Module):
 31 |     def __init__(self, input_dim, output_dim):
 32 |         super(value_net, self).__init__()
 33 |         self.input_dim = input_dim
 34 |         self.output_dim = output_dim
 35 | 
 36 |         self.fc1 = nn.Linear(self.input_dim, 128)
 37 |         self.fc2 = nn.Linear(128, 128)
 38 |         self.fc3 = nn.Linear(128, self.output_dim)
 39 | 
 40 |     def forward(self, input):
 41 |         x = F.relu(self.fc1(input))
 42 |         x = F.relu(self.fc2(x))
 43 |         x = self.fc3(x)
 44 |         return x
 45 | 
 46 | 
 47 | class policy_net(nn.Module):
 48 |     def __init__(self, input_dim, output_dim):
 49 |         super(policy_net, self).__init__()
 50 |         self.input_dim = input_dim
 51 |         self.output_dim = output_dim
 52 | 
 53 |         self.fc1 = nn.Linear(self.input_dim, 128)
 54 |         self.fc2 = nn.Linear(128, 128)
 55 |         self.fc3 = nn.Linear(128, self.output_dim)
 56 | 
 57 |     def forward(self, input):
 58 |         x = F.relu(self.fc1(input))
 59 |         x = F.relu(self.fc2(x))
 60 |         policy = F.softmax(self.fc3(x), dim=-1)
 61 |         return policy
 62 | 
 63 |     def act(self, input):
 64 |         policy = self.forward(input)
 65 |         dist = torch.distributions.Categorical(policy)
 66 |         action = dist.sample()
 67 |         return action[0].item()
 68 | 
 69 | 
 70 | class sac_discrete(object):
 71 |     def __init__(self, env, batch_size, learning_rate, exploration, episode, gamma, alpha, auto_entropy_tuning, capacity, rho, update_iter, update_every, render, log):
 72 |         self.env = env
 73 |         self.batch_size = batch_size
 74 |         self.learning_rate = learning_rate
 75 |         self.exploration = exploration
 76 |         self.episode = episode
 77 |         self.gamma = gamma
 78 |         self.auto_entropy_tuning = auto_entropy_tuning
 79 |         if not self.auto_entropy_tuning:
 80 |             self.alpha = alpha
 81 |         else:
 82 |             self.log_alpha = torch.zeros(1, requires_grad=True)
 83 |             self.alpha = self.log_alpha.exp()
 84 |             # * set the max possible entropy as the target entropy
 85 |             self.target_entropy = -np.log((1. / self.env.action_space.n)) * 0.98
 86 |             self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=self.learning_rate, eps=1e-4)
 87 |         self.capacity = capacity
 88 |         self.rho = rho
 89 |         self.update_iter = update_iter
 90 |         self.update_every = update_every
 91 |         self.render = render
 92 |         self.log = log
 93 | 
 94 |         self.observation_dim = self.env.observation_space.shape[0]
 95 |         self.action_num = self.env.action_space.n
 96 | 
 97 |         self.value_net1 = value_net(self.observation_dim, self.action_num)
 98 |         self.value_net2 = value_net(self.observation_dim, self.action_num)
 99 |         self.target_value_net1 = value_net(self.observation_dim, self.action_num)
100 |         self.target_value_net2 = value_net(self.observation_dim, self.action_num)
101 |         self.policy_net = policy_net(self.observation_dim, self.action_num)
102 |         self.target_value_net1.load_state_dict(self.value_net1.state_dict())
103 |         self.target_value_net2.load_state_dict(self.value_net2.state_dict())
104 | 
105 |         self.buffer = replay_buffer(capacity=self.capacity)
106 | 
107 |         self.value_optimizer1 = torch.optim.Adam(self.value_net1.parameters(), lr=self.learning_rate)
108 |         self.value_optimizer2 = torch.optim.Adam(self.value_net2.parameters(), lr=self.learning_rate)
109 |         self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
110 | 
111 |         self.weight_reward = None
112 |         self.count = 0
113 |         self.train_count = 0
114 |         self.writer = SummaryWriter('runs/sac_discrete')
115 | 
116 |     def soft_update(self):
117 |         for param, target_param in zip(self.value_net1.parameters(), self.target_value_net1.parameters()):
118 |             target_param.detach().copy_(param.detach() * (1 - self.rho) + target_param.detach() * self.rho)
119 |         for param, target_param in zip(self.value_net2.parameters(), self.target_value_net2.parameters()):
120 |             target_param.detach().copy_(param.detach() * (1 - self.rho) + target_param.detach() * self.rho)
121 | 
122 |     def train(self):
123 |         observation, action, reward, next_observation, done = self.buffer.sample(self.batch_size)
124 | 
125 |         observation = torch.FloatTensor(observation)
126 |         action = torch.LongTensor(action).unsqueeze(1)
127 |         reward = torch.FloatTensor(reward).unsqueeze(1)
128 |         next_observation = torch.FloatTensor(next_observation)
129 |         done = torch.FloatTensor(done).unsqueeze(1)
130 | 
131 |         value_loss1_buffer = []
132 |         value_loss2_buffer = []
133 |         policy_loss_buffer = []
134 |         for _ in range(self.update_iter):
135 |             policy = self.policy_net.forward(next_observation)
136 |             target_q_value1 = self.target_value_net1.forward(next_observation)
137 |             target_q_value2 = self.target_value_net2.forward(next_observation)
138 |             # * calculate the expectation directly
139 |             target_q = reward + (1 - done) * self.gamma * (policy * (torch.min(target_q_value1, target_q_value2) - self.alpha * policy.log())).mean(dim=1).unsqueeze(-1)
140 |             target_q = target_q.detach()
141 | 
142 |             q1 = self.value_net1.forward(observation).gather(dim=1, index=action)
143 |             q2 = self.value_net2.forward(observation).gather(dim=1, index=action)
144 |             value_loss1 = (q1 - target_q).pow(2).mean()
145 |             value_loss2 = (q2 - target_q).pow(2).mean()
146 |             value_loss1_buffer.append(value_loss1.detach().item())
147 |             value_loss2_buffer.append(value_loss2.detach().item())
148 | 
149 |             self.value_optimizer1.zero_grad()
150 |             value_loss1.backward()
151 |             nn.utils.clip_grad_norm_(self.value_net1.parameters(), 0.5)
152 |             self.value_optimizer1.step()
153 | 
154 |             self.value_optimizer2.zero_grad()
155 |             value_loss2.backward()
156 |             nn.utils.clip_grad_norm_(self.value_net2.parameters(), 0.5)
157 |             self.value_optimizer2.step()
158 | 
159 |             # * calculate the expectation directly
160 |             policy_loss = policy * (self.alpha * policy.log() - torch.min(target_q_value1, target_q_value2).detach())
161 |             policy_loss = policy_loss.mean()
162 | 
163 |             self.policy_optimizer.zero_grad()
164 |             policy_loss.backward()
165 |             nn.utils.clip_grad_norm_(self.policy_net.parameters(), 0.5)
166 |             self.policy_optimizer.step()
167 | 
168 |             if self.auto_entropy_tuning:
169 |                 self.alpha_optimizer.zero_grad()
170 |                 entropy_loss = -(self.log_alpha * (policy.log() + self.target_entropy).detach()).mean()
171 |                 entropy_loss.backward()
172 |                 nn.utils.clip_grad_norm_([self.log_alpha], 0.2)
173 |                 self.alpha_optimizer.step()
174 | 
175 |                 self.alpha = self.log_alpha.exp()
176 | 
177 |             self.soft_update()
178 |         if self.log:
179 |             self.writer.add_scalar('value_loss1', np.mean(value_loss1_buffer), self.train_count)
180 |             self.writer.add_scalar('value_loss2', np.mean(value_loss2_buffer), self.train_count)
181 |             self.writer.add_scalar('policy_loss', np.mean(policy_loss_buffer), self.train_count)
182 | 
183 |     def run(self):
184 |         for i in range(self.episode):
185 |             obs = self.env.reset()
186 |             total_reward = 0
187 |             if self.render:
188 |                 self.env.render()
189 |             while True:
190 |                 if i >= self.exploration:
191 |                     action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0)))
192 |                 else:
193 |                     action = random.choice(list(range(self.action_num)))
194 |                 next_obs, reward, done, _ = self.env.step(action)
195 |                 if self.render:
196 |                     self.env.render()
197 |                 self.buffer.store(obs, action, reward, next_obs, done)
198 |                 self.count += 1
199 |                 total_reward += reward
200 |                 obs = next_obs
201 | 
202 |                 if (self.count % self.update_every) == 0 and i >= self.exploration:
203 |                     self.train_count += 1
204 |                     self.train()
205 |                 if done:
206 |                     if not self.weight_reward:
207 |                         self.weight_reward = total_reward
208 |                     else:
209 |                         self.weight_reward = self.weight_reward * 0.99 + total_reward * 0.01
210 |                     if self.log:
211 |                         self.writer.add_scalar('reward', total_reward, i + 1)
212 |                         self.writer.add_scalar('weight_reward', self.weight_reward, i + 1)
213 |                     print('episode: {}  reward: {:.2f}  weight_reward: {:.2f}'.format(i + 1, total_reward, self.weight_reward))
214 |                     break
215 | 
216 | if __name__ == '__main__':
217 |     env = gym.make('CartPole-v1').unwrapped
218 |     test = sac_discrete(
219 |         env=env,
220 |         batch_size=64,
221 |         learning_rate=3e-4,
222 |         exploration=3000,
223 |         episode=10000,
224 |         gamma=0.99,
225 |         alpha=None,
226 |         auto_entropy_tuning=True,
227 |         capacity=100000,
228 |         rho=0.995,
229 |         update_iter=3,
230 |         update_every=5,
231 |         render=False,
232 |         log=False
233 |     )
234 |     test.run()
235 | 


--------------------------------------------------------------------------------
/TD3/td3.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | from torch.distributions import Normal
  5 | import random
  6 | from collections import deque
  7 | import numpy as np
  8 | import gym
  9 | import math
 10 | from torch.utils.tensorboard import SummaryWriter
 11 | 
 12 | 
 13 | class replay_buffer(object):
 14 |     def __init__(self, capacity):
 15 |         self.capacity = capacity
 16 |         self.memory = deque(maxlen=self.capacity)
 17 | 
 18 |     def store(self, observation, action, reward, next_observation, done):
 19 |         observation = np.expand_dims(observation, 0)
 20 |         next_observation = np.expand_dims(next_observation, 0)
 21 |         self.memory.append([observation, action, reward, next_observation, done])
 22 | 
 23 |     def sample(self, batch_size):
 24 |         batch = random.sample(self.memory, batch_size)
 25 |         observation, action, reward, next_observation, done = zip(* batch)
 26 |         return np.concatenate(observation, 0), action, reward, np.concatenate(next_observation, 0), done
 27 | 
 28 |     def __len__(self):
 29 |         return len(self.memory)
 30 | 
 31 | 
 32 | class policy_net(nn.Module):
 33 |     # * deterministic actor network, output a deterministic value as the selected action
 34 |     def __init__(self, input_dim, output_dim):
 35 |         super(policy_net, self).__init__()
 36 |         self.input_dim = input_dim
 37 |         self.output_dim = output_dim
 38 | 
 39 |         self.fc1 = nn.Linear(self.input_dim, 128)
 40 |         self.fc2 = nn.Linear(128, 128)
 41 |         self.fc3 = nn.Linear(128, self.output_dim)
 42 | 
 43 |     def forward(self, input):
 44 |         x = F.relu(self.fc1(input))
 45 |         x = F.relu(self.fc2(x))
 46 |         x = self.fc3(x)
 47 |         return x
 48 | 
 49 |     def act(self, input):
 50 |         action = self.forward(input).detach().item()
 51 |         return action
 52 | 
 53 | 
 54 | class value_net(nn.Module):
 55 |     def __init__(self, input1_dim, input2_dim, output_dim):
 56 |         super(value_net, self).__init__()
 57 |         self.input1_dim = input1_dim
 58 |         self.input2_dim = input2_dim
 59 |         self.output_dim = output_dim
 60 | 
 61 |         self.fc1 = nn.Linear(self.input1_dim + self.input2_dim, 128)
 62 |         self.fc2 = nn.Linear(128, 128)
 63 |         self.fc3 = nn.Linear(128, self.output_dim)
 64 | 
 65 |     def forward(self, input1, input2):
 66 |         x = torch.cat([input1, input2], 1)
 67 |         x = F.relu(self.fc1(x))
 68 |         x = F.relu(self.fc2(x))
 69 |         x = self.fc3(x)
 70 |         return x
 71 | 
 72 | 
 73 | class td3(object):
 74 |     def __init__(self, env, batch_size, learning_rate, exploration, episode, gamma, capacity, rho, update_iter, policy_delay, epsilon_init, decay, epsilon_min, max_a, min_a, noisy_range, render, log):
 75 |         self.env = env
 76 |         self.batch_size = batch_size
 77 |         self.learning_rate = learning_rate
 78 |         self.exploration = exploration
 79 |         self.episode = episode
 80 |         self.gamma = gamma
 81 |         self.capacity = capacity
 82 |         self.rho = rho
 83 |         self.update_iter = update_iter
 84 |         self.policy_delay = policy_delay
 85 |         self.epsilon_init = epsilon_init
 86 |         self.decay = decay
 87 |         self.epsilon_min = epsilon_min
 88 |         self.max_a = max_a
 89 |         self.min_a = min_a
 90 |         self.noisy_range = noisy_range
 91 |         self.render = render
 92 |         self.log = log
 93 | 
 94 |         self.observation_dim = self.env.observation_space.shape[0]
 95 |         self.action_dim = self.env.action_space.shape[0]
 96 | 
 97 |         self.value_net1 = value_net(self.observation_dim, self.action_dim, 1)
 98 |         self.value_net2 = value_net(self.observation_dim, self.action_dim, 1)
 99 |         self.target_value_net1 = value_net(self.observation_dim, self.action_dim, 1)
100 |         self.target_value_net2 = value_net(self.observation_dim, self.action_dim, 1)
101 |         self.policy_net = policy_net(self.observation_dim, self.action_dim)
102 |         self.target_policy_net = policy_net(self.observation_dim, self.action_dim)
103 |         self.target_value_net1.load_state_dict(self.value_net1.state_dict())
104 |         self.target_value_net2.load_state_dict(self.value_net2.state_dict())
105 |         self.target_policy_net.load_state_dict(self.policy_net.state_dict())
106 | 
107 |         self.buffer = replay_buffer(capacity=self.capacity)
108 | 
109 |         self.value_optimizer1 = torch.optim.Adam(self.value_net1.parameters(), lr=self.learning_rate)
110 |         self.value_optimizer2 = torch.optim.Adam(self.value_net2.parameters(), lr=self.learning_rate)
111 |         self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
112 | 
113 |         self.weight_reward = None
114 |         self.count = 0
115 |         self.train_count = 0
116 |         self.epsilon = lambda x: self.epsilon_min + (self.epsilon_init - self.epsilon_min) * math.exp(- x / self.decay)
117 |         self.writer = SummaryWriter('runs/td3')
118 | 
119 |     def soft_update(self):
120 |         for param, target_param in zip(self.value_net1.parameters(), self.target_value_net1.parameters()):
121 |             target_param.detach().copy_(param.detach() * (1 - self.rho) + target_param.detach() * self.rho)
122 |         for param, target_param in zip(self.value_net2.parameters(), self.target_value_net2.parameters()):
123 |             target_param.detach().copy_(param.detach() * (1 - self.rho) + target_param.detach() * self.rho)
124 |         for param, target_param in zip(self.policy_net.parameters(), self.target_policy_net.parameters()):
125 |             target_param.detach().copy_(param.detach() * (1 - self.rho) + target_param.detach() * self.rho)
126 | 
127 |     def train(self):
128 |         value1_loss_buffer = []
129 |         value2_loss_buffer = []
130 |         policy_loss_buffer = []
131 |         for iter in range(self.update_iter):
132 |             observation, action, reward, next_observation, done = self.buffer.sample(self.batch_size)
133 | 
134 |             observation = torch.FloatTensor(observation)
135 |             action = torch.FloatTensor(action).unsqueeze(1)
136 |             reward = torch.FloatTensor(reward).unsqueeze(1)
137 |             next_observation = torch.FloatTensor(next_observation)
138 |             done = torch.FloatTensor(done).unsqueeze(1)
139 | 
140 |             target_next_action = self.target_policy_net.forward(next_observation)
141 |             target_next_action = target_next_action + np.clip(np.random.randn() * self.epsilon(self.count), - self.noisy_range, self.noisy_range)
142 |             target_next_action = torch.clamp(target_next_action, self.min_a, self.max_a).detach()
143 | 
144 |             q_min = torch.min(self.target_value_net1.forward(next_observation, target_next_action), self.target_value_net2.forward(next_observation, target_next_action))
145 |             target_q = reward + (1 - done) * self.gamma * q_min.detach()
146 |             q1 = self.value_net1.forward(observation, action)
147 |             q2 = self.value_net2.forward(observation, action)
148 |             value_loss1 = (q1 - target_q).pow(2).mean()
149 |             value_loss2 = (q2 - target_q).pow(2).mean()
150 |             value1_loss_buffer.append(value_loss1.detach().item())
151 |             value2_loss_buffer.append(value_loss2.detach().item())
152 | 
153 |             self.value_optimizer1.zero_grad()
154 |             value_loss1.backward()
155 |             torch.nn.utils.clip_grad_norm_(self.value_net1.parameters(), 0.5)
156 |             self.value_optimizer1.step()
157 | 
158 |             self.value_optimizer2.zero_grad()
159 |             value_loss2.backward()
160 |             torch.nn.utils.clip_grad_norm_(self.value_net2.parameters(), 0.5)
161 |             self.value_optimizer2.step()
162 | 
163 |             if (iter + 1) % self.policy_delay == 0:
164 |                 current_action = self.policy_net.forward(observation)
165 |                 policy_loss = (- self.value_net1.forward(observation, current_action)).mean()
166 |                 policy_loss_buffer.append(policy_loss.detach().item())
167 | 
168 |                 self.policy_optimizer.zero_grad()
169 |                 policy_loss.backward()
170 |                 torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.)
171 |                 self.policy_optimizer.step()
172 | 
173 |             self.soft_update()
174 |         if self.log:
175 |             self.writer.add_scalar('value1_loss', np.mean(value1_loss_buffer), self.train_count)
176 |             self.writer.add_scalar('value2_loss', np.mean(value2_loss_buffer), self.train_count)
177 |             self.writer.add_scalar('policy_loss', np.mean(policy_loss_buffer), self.train_count)
178 | 
179 |     def run(self):
180 |         for i in range(self.episode):
181 |             obs = self.env.reset()
182 |             total_reward = 0
183 |             if self.render:
184 |                 self.env.render()
185 |             while True:
186 |                 action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0)))
187 |                 action = action + np.random.randn() * self.epsilon(self.count)
188 |                 action = np.clip(action, self.min_a, self.max_a)
189 |                 next_obs, reward, done, _ = self.env.step([action])
190 |                 if self.render:
191 |                     self.env.render()
192 |                 self.buffer.store(obs, action, reward, next_obs, done)
193 |                 self.count += 1
194 |                 total_reward += reward
195 |                 obs = next_obs
196 | 
197 |                 if done:
198 |                     if i > self.exploration:
199 |                         self.train_count += 1
200 |                         self.train()
201 |                     if not self.weight_reward:
202 |                         self.weight_reward = total_reward
203 |                     else:
204 |                         self.weight_reward = self.weight_reward * 0.99 + total_reward * 0.01
205 |                     if self.log:
206 |                         self.writer.add_scalar('reward', total_reward, i + 1)
207 |                         self.writer.add_scalar('weight_reward', self.weight_reward, i + 1)
208 |                     print('episode: {}  reward: {:.2f}  weight_reward: {:.2f}'.format(i + 1, total_reward, self.weight_reward))
209 |                     break
210 | 
211 | 
212 | if __name__ == '__main__':
213 |     env = gym.make('Pendulum-v0')
214 |     test = td3(env=env,
215 |                batch_size=100,
216 |                learning_rate=1e-3,
217 |                exploration=300,
218 |                episode=10000,
219 |                gamma=0.99,
220 |                capacity=10000,
221 |                rho=0.995,
222 |                update_iter=10,
223 |                policy_delay=2,
224 |                epsilon_init=1.,
225 |                decay=10000,
226 |                epsilon_min=0.01,
227 |                max_a=2.,
228 |                min_a=-2.,
229 |                noisy_range=0.5,
230 |                render=False,
231 |                log=False)
232 |     test.run()


--------------------------------------------------------------------------------
/TRPO/trpo_gae.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | from torch.distributions import Normal
  5 | import numpy as np
  6 | import gym
  7 | from collections import deque
  8 | import random
  9 | from torch.utils.tensorboard import SummaryWriter
 10 | 
 11 | 
 12 | class gae_trajectory_buffer(object):
 13 |     def __init__(self, capacity, gamma, lam):
 14 |         self.capacity = capacity
 15 |         self.memory = deque(maxlen=self.capacity)
 16 |         # * [observation, action, reward, done, value, return, advantage]
 17 |         self.gamma = gamma
 18 |         self.lam = lam
 19 | 
 20 |     def store(self, observation, action, reward, done, value):
 21 |         observation = np.expand_dims(observation, 0)
 22 |         self.memory.append([observation, action, reward, done, value])
 23 | 
 24 |     def process(self):
 25 |         R = 0
 26 |         Adv = 0
 27 |         Value_previous = 0
 28 |         for traj in reversed(list(self.memory)):
 29 |             R = R * self.gamma * (1 - traj[3]) + traj[2]
 30 |             traj.append(R)
 31 |             # * the generalized advantage estimator(GAE)
 32 |             delta = traj[2] + self.gamma * (1 - traj[3]) * Value_previous - traj[4]
 33 |             Adv = delta + self.gamma * self.lam * Adv * (1 - traj[3])
 34 |             Value_previous = traj[4]
 35 |             traj.append(Adv)
 36 | 
 37 |     def get(self):
 38 |         observation, action, reward, done, value, ret, advantage = zip(* list(self.memory))
 39 |         observation = np.concatenate(observation, 0)
 40 |         action = np.expand_dims(action, 1)
 41 |         reward = np.expand_dims(reward, 1)
 42 |         done = np.expand_dims(done, 1)
 43 |         value = np.expand_dims(value, 1)
 44 |         ret = np.expand_dims(ret, 1)
 45 |         advantage = np.array(advantage)
 46 |         advantage = (advantage - advantage.mean()) / advantage.std()
 47 |         advantage = np.expand_dims(advantage, 1)
 48 |         return observation, action, reward, done, value, ret, advantage
 49 | 
 50 |     def clear(self):
 51 |         self.memory.clear()
 52 | 
 53 |     def __len__(self):
 54 |         return len(self.memory)
 55 | 
 56 | 
 57 | class gaussian_policy_net(nn.Module):
 58 |     def __init__(self, input_dim, output_dim):
 59 |         super(gaussian_policy_net, self).__init__()
 60 |         self.input_dim = input_dim
 61 |         self.output_dim = output_dim
 62 | 
 63 |         self.fc1 = nn.Linear(self.input_dim, 128)
 64 |         self.fc2 = nn.Linear(128, 128)
 65 |         self.fc3 = nn.Linear(128, self.output_dim)
 66 | 
 67 |     def forward(self, input):
 68 |         x = F.tanh(self.fc1(input))
 69 |         x = F.tanh(self.fc2(x))
 70 |         mu = self.fc3(x)
 71 |         sigma = torch.ones_like(mu)
 72 |         #log_sigma = torch.zeros_like(mu)
 73 |         #sigma = torch.exp(log_sigma)
 74 |         return mu, sigma
 75 | 
 76 |     def act(self, input):
 77 |         mu, sigma = self.forward(input)
 78 |         dist = Normal(mu, sigma)
 79 |         action = dist.sample().detach().item()
 80 |         return action
 81 | 
 82 |     def distribute(self, input):
 83 |         mu, sigma = self.forward(input)
 84 |         dist = Normal(mu, sigma)
 85 |         return dist
 86 | 
 87 | class value_net(nn.Module):
 88 |     def __init__(self, input_dim, output_dim):
 89 |         super(value_net, self).__init__()
 90 |         self.input_dim = input_dim
 91 |         self.output_dim = output_dim
 92 | 
 93 |         self.fc1 = nn.Linear(self.input_dim, 128)
 94 |         self.fc2 = nn.Linear(128, 128)
 95 |         self.fc3 = nn.Linear(128, self.output_dim)
 96 | 
 97 |     def forward(self, input):
 98 |         x = F.tanh(self.fc1(input))
 99 |         x = F.tanh(self.fc2(x))
100 |         x = self.fc3(x)
101 |         return x
102 | 
103 | 
104 | class trpo(object):
105 |     def __init__(self, env, capacity, gamma, learning_rate, render, sample_size, episode, lam, delta, value_train_iter, policy_train_iter, method, backtrack_coeff, backtrack_alpha, training, log):
106 |         self.env = env
107 |         self.gamma = gamma
108 |         self.lam = lam
109 |         self.delta = delta
110 |         self.capacity = capacity
111 |         self.learning_rate = learning_rate
112 |         self.render = render
113 |         self.sample_size = sample_size
114 |         self.episode = episode
115 |         self.value_train_iter = value_train_iter
116 |         self.policy_train_iter = policy_train_iter
117 |         self.method = method
118 |         self.backtrack_coeff = backtrack_coeff
119 |         self.backtrack_alpha = backtrack_alpha
120 |         self.training = training
121 | 
122 |         self.observation_dim = self.env.observation_space.shape[0]
123 |         self.action_dim = self.env.action_space.shape[0]
124 |         self.policy_net = gaussian_policy_net(self.observation_dim, self.action_dim)
125 |         self.old_policy_net = gaussian_policy_net(self.observation_dim, self.action_dim)
126 |         self.value_net = value_net(self.observation_dim, 1)
127 |         self.buffer = gae_trajectory_buffer(capacity=self.capacity, gamma=self.gamma, lam=self.lam)
128 |         self.value_optimizer = torch.optim.Adam(self.value_net.parameters(), lr=self.learning_rate)
129 |         self.old_policy_optimizer = torch.optim.Adam(self.old_policy_net.parameters(), lr=self.learning_rate)
130 |         self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
131 |         self.count = 0
132 |         self.train_count = 0
133 |         self.weight_reward = None
134 |         self.writer = SummaryWriter('runs/trpo_gae')
135 |         self.log = log
136 | 
137 |     def guassian_kl(self, old_policy, policy, obs):
138 |         # * calculate the guassian distribution kl
139 |         mu_old, sigma_old = old_policy.forward(obs)
140 |         mu_old, sigma_old = mu_old.detach(), sigma_old.detach()
141 | 
142 |         mu, sigma = policy.forward(obs)
143 | 
144 |         kl = torch.log(sigma / sigma_old) + (sigma_old.pow(2) + (mu_old - mu).pow(2)) / (2. * sigma.pow(2)) - 0.5
145 |         return kl.sum(-1, keepdim=True).mean()
146 | 
147 |     def flatten_grad(self, grads, hessian=False):
148 |         grad_flat = []
149 |         if hessian == False:
150 |             for grad in grads:
151 |                 grad_flat.append(grad.view(-1))
152 |             grad_flat = torch.cat(grad_flat, 0)
153 |         else:
154 |             for grad in grads:
155 |                 grad_flat.append(grad.contiguous().view(-1))
156 |             grad_flat = torch.cat(grad_flat, 0).detach()
157 |         return grad_flat
158 | 
159 |     def flatten_param(self, params):
160 |         param_flat = []
161 |         for param in params:
162 |             param_flat.append(param.view(-1))
163 |         return torch.cat(param_flat, 0).detach()
164 | 
165 |     def hessian_vector_product(self, obs, p, damping_coeff=0.1):
166 |         # * calculate the production of hessian matrix with a vector
167 |         # * obs : observation
168 |         # * p : a vector
169 |         kl = self.guassian_kl(self.old_policy_net, self.policy_net, obs)
170 |         kl_grad = torch.autograd.grad(kl, self.policy_net.parameters(), create_graph=True)
171 |         kl_grad = self.flatten_grad(kl_grad)
172 | 
173 |         kl_grad_p = (kl_grad * p).sum()
174 |         kl_hessian = torch.autograd.grad(kl_grad_p, self.policy_net.parameters())
175 |         kl_hessian = self.flatten_grad(kl_hessian, hessian=True)
176 |         return kl_hessian + p * damping_coeff
177 | 
178 |     def conjugate_gradient(self, obs, b, cg_iters=10, eps=1e-8, residual_tol=1e-10):
179 |         # * calculate the search direction with conjugate gradient method, find the x that makes hx = g
180 |         # * obs : observation
181 |         # * b : gradient
182 |         x = torch.zeros_like(b)
183 |         r = b.clone()
184 |         p = r.clone()
185 |         rTr = torch.dot(r, r)
186 | 
187 |         for _ in range(cg_iters):
188 |             Ap = self.hessian_vector_product(obs, p)
189 |             alpha = rTr / (torch.dot(p, Ap) + eps)
190 |             x = x + alpha * p
191 |             r = r - alpha * Ap
192 | 
193 |             new_rTr = torch.dot(r, r)
194 |             beta = new_rTr / rTr
195 |             p = r + beta * p
196 |             rTr = new_rTr
197 | 
198 |             if rTr < residual_tol:
199 |                 break
200 |         return x
201 | 
202 |     def update_model(self, model, params):
203 |         index = 0
204 |         for param in model.parameters():
205 |             param_length = param.view(-1).size(0)
206 |             new_param = params[index: index + param_length]
207 |             new_param = new_param.view(param.size())
208 |             param.detach().copy_(new_param)
209 |             index += param_length
210 | 
211 |     def train(self):
212 |         self.train_count += 1
213 |         obs, act, rew, do, val, ret, adv = self.buffer.get()
214 | 
215 |         obs = torch.FloatTensor(obs)
216 |         act = torch.FloatTensor(act)
217 |         rew = torch.FloatTensor(rew)
218 |         do = torch.FloatTensor(do)
219 |         val = torch.FloatTensor(val)
220 |         ret = torch.FloatTensor(ret)
221 |         adv = torch.FloatTensor(adv)
222 | 
223 |         dist_old = self.policy_net.distribute(obs)
224 |         log_prob_old = dist_old.log_prob(act).detach()
225 |         dist = self.policy_net.distribute(obs)
226 |         log_prob = dist.log_prob(act)
227 |         value = self.value_net.forward(obs)
228 | 
229 |         ratio_old = torch.exp(log_prob - log_prob_old)
230 |         policy_loss_old = (ratio_old * adv).mean()
231 |         value_loss = (value - ret).pow(2).mean()
232 |         self.writer.add_scalar('value_loss', value_loss, self.train_count)
233 |         self.writer.add_scalar('policy_loss_old', policy_loss_old, self.train_count)
234 | 
235 |         for _ in range(self.value_train_iter):
236 |             self.value_optimizer.zero_grad()
237 |             value_loss.backward(retain_graph=True)
238 |             self.value_optimizer.step()
239 | 
240 |         gradient = torch.autograd.grad(policy_loss_old, self.policy_net.parameters())
241 |         gradient = self.flatten_grad(gradient)
242 | 
243 |         search_dir = self.conjugate_gradient(obs, gradient)
244 |         # * search_dir is x in paper
245 |         xhx = torch.dot(self.hessian_vector_product(obs, search_dir), search_dir)
246 |         step_size = torch.sqrt((2. * self.delta) / xhx)
247 |         old_params = self.flatten_param(self.policy_net.parameters())
248 |         self.update_model(self.old_policy_net, old_params)
249 | 
250 |         if self.method == 'npg':
251 |             params = old_params + step_size * search_dir
252 |             self.update_model(self.policy_net, params)
253 | 
254 |         elif self.method == 'trpo':
255 |             full_improve = (gradient * step_size * search_dir).sum(0, keepdim=True)
256 |             dist_old = self.old_policy_net.distribute(obs)
257 | 
258 |             for i in range(self.policy_train_iter):
259 |                 params = old_params + self.backtrack_coeff * step_size * search_dir
260 |                 self.update_model(self.policy_net, params)
261 | 
262 |                 dist = self.policy_net.distribute(obs)
263 |                 log_prob = dist.log_prob(act)
264 |                 ratio = torch.exp(log_prob - log_prob_old)
265 |                 policy_loss = (ratio * adv).mean()
266 |                 loss_improve = policy_loss - policy_loss_old
267 |                 full_improve = full_improve * self.backtrack_coeff
268 |                 improve_condition = loss_improve / full_improve
269 | 
270 |                 kl = self.guassian_kl(self.old_policy_net, self.policy_net, obs)
271 | 
272 |                 if kl < self.delta and improve_condition > self.backtrack_alpha:
273 |                     self.writer.add_scalar('improve_condition', improve_condition, self.train_count)
274 |                     self.writer.add_scalar('kl', kl, self.train_count)
275 |                     self.writer.add_scalar('backtrack_coeff', self.backtrack_coeff, self.train_count)
276 |                     break
277 |                 else:
278 |                     if i == self.policy_train_iter - 1:
279 |                         params = self.flatten_param(self.old_policy_net.parameters())
280 |                         self.update_model(self.policy_net, params)
281 |                         self.writer.add_scalar('improve_condition', improve_condition, self.train_count)
282 |                         self.writer.add_scalar('kl', kl, self.train_count)
283 |                         self.writer.add_scalar('backtrack_coeff', 0., self.train_count)
284 |                 self.backtrack_coeff = self.backtrack_coeff * 0.5
285 |             self.backtrack_coeff = 1.
286 | 
287 |     def run(self):
288 |         for i in range(self.episode):
289 |             total_reward = 0
290 |             obs = self.env.reset()
291 |             if self.render:
292 |                 self.env.render()
293 |             while True:
294 |                 self.count += 1
295 |                 if self.training:
296 |                     action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs, 0)))
297 |                     next_obs, reward, done, _ = self.env.step([action])
298 |                     val = self.value_net.forward(torch.FloatTensor(np.expand_dims(obs, 0))).detach().item()
299 |                     self.buffer.store(obs, action, reward, done, val)
300 |                     if self.count % self.capacity == 0:
301 |                         self.buffer.process()
302 |                         self.train()
303 |                 else:
304 |                     action = self.policy_net.act(torch.FloatTensor(np.expand_dims(obs)))
305 |                     next_obs, reward, done, _ = self.env.step(action)
306 | 
307 |                 total_reward += reward
308 |                 obs = next_obs
309 |                 if done:
310 |                     if not self.weight_reward:
311 |                         self.weight_reward = total_reward
312 |                     else:
313 |                         self.weight_reward = self.weight_reward * 0.99 + total_reward * 0.01
314 |                     if self.log:
315 |                         self.writer.add_scalar('weight_reward', self.weight_reward, i + 1)
316 |                         self.writer.add_scalar('reward', total_reward, i + 1)
317 |                     print('episode: {}  reward: {:.2f}  weight_reward: {:.2f}  train_step: {}'.format(i + 1, total_reward, self.weight_reward, self.train_count))
318 |                     break
319 | 
320 | 
321 | if __name__ == '__main__':
322 |     env = gym.make('Pendulum-v0')
323 |     test = trpo(env=env,
324 |                 capacity=2000,
325 |                 gamma=0.99,
326 |                 learning_rate=1e-3,
327 |                 render=False,
328 |                 sample_size=64,
329 |                 episode=5000,
330 |                 lam=0.97,
331 |                 delta=1e-2,
332 |                 value_train_iter=80,
333 |                 policy_train_iter=10,
334 |                 method='trpo',
335 |                 backtrack_coeff=1.,
336 |                 backtrack_alpha=0.5,
337 |                 training=True,
338 |                 log=False)
339 |     test.run()


--------------------------------------------------------------------------------